From e90fdb1000236b306af0692d6cf5066d76388400 Mon Sep 17 00:00:00 2001 From: digger yu Date: Wed, 24 May 2023 09:53:21 +0800 Subject: [PATCH 01/26] fix typo docs/ --- docs/README.md | 2 +- docs/REFERENCE.md | 2 +- docs/source/en/advanced_tutorials/add_your_parallel.md | 2 +- .../integrate_mixture_of_experts_into_your_model.md | 2 +- docs/source/en/advanced_tutorials/opt_service.md | 2 +- .../parallelize_your_training_like_Megatron.md | 2 +- .../train_vit_using_pipeline_parallelism.md | 2 +- .../train_vit_with_hybrid_parallelism.md | 10 +++++----- docs/source/en/basics/booster_api.md | 4 ++-- docs/source/en/basics/colotensor_concept.md | 2 +- docs/source/en/features/3D_tensor_parallel.md | 2 +- .../en/features/gradient_clipping_with_booster.md | 2 +- docs/source/en/features/nvme_offload.md | 2 +- docs/source/en/features/pipeline_parallel.md | 2 +- docs/source/en/features/zero_with_chunk.md | 2 +- .../zh-Hans/advanced_tutorials/add_your_parallel.md | 2 +- .../integrate_mixture_of_experts_into_your_model.md | 2 +- docs/source/zh-Hans/advanced_tutorials/meet_gemini.md | 4 ++-- docs/source/zh-Hans/advanced_tutorials/opt_service.md | 2 +- .../train_vit_with_hybrid_parallelism.md | 4 ++-- docs/source/zh-Hans/basics/colotensor_concept.md | 2 +- .../zh-Hans/features/mixed_precision_training.md | 2 +- docs/source/zh-Hans/features/nvme_offload.md | 2 +- 23 files changed, 30 insertions(+), 30 deletions(-) diff --git a/docs/README.md b/docs/README.md index f520608d5..f0cb50ffe 100644 --- a/docs/README.md +++ b/docs/README.md @@ -98,7 +98,7 @@ Lastly, if you want to skip some code, you just need to add the following annota ``` -If you have any dependency required, please add it to `requriements-doc-test.txt` for pip and `conda-doc-test-deps.yml` for Conda. +If you have any dependency required, please add it to `requirements-doc-test.txt` for pip and `conda-doc-test-deps.yml` for Conda. ### 💉 Auto Documentation diff --git a/docs/REFERENCE.md b/docs/REFERENCE.md index 268119819..0984b2dc3 100644 --- a/docs/REFERENCE.md +++ b/docs/REFERENCE.md @@ -1,6 +1,6 @@ # References -The Colossal-AI project aims to provide a wide array of parallelism techniques for the machine learning community in the big-model era. This project is inspired by quite a few reserach works, some are conducted by some of our developers and the others are research projects open-sourced by other organizations. We would like to credit these amazing projects below in the IEEE citation format. +The Colossal-AI project aims to provide a wide array of parallelism techniques for the machine learning community in the big-model era. This project is inspired by quite a few research works, some are conducted by some of our developers and the others are research projects open-sourced by other organizations. We would like to credit these amazing projects below in the IEEE citation format. ## By Our Team diff --git a/docs/source/en/advanced_tutorials/add_your_parallel.md b/docs/source/en/advanced_tutorials/add_your_parallel.md index be7284a7a..1caf58c87 100644 --- a/docs/source/en/advanced_tutorials/add_your_parallel.md +++ b/docs/source/en/advanced_tutorials/add_your_parallel.md @@ -56,7 +56,7 @@ follow the steps below to create a new distributed initialization. 
world_size: int, config: Config, data_parallel_size: int, - pipeline_parlalel_size: int, + pipeline_parallel_size: int, tensor_parallel_size: int, arg1, arg2): diff --git a/docs/source/en/advanced_tutorials/integrate_mixture_of_experts_into_your_model.md b/docs/source/en/advanced_tutorials/integrate_mixture_of_experts_into_your_model.md index e01caf76d..d5edd135c 100644 --- a/docs/source/en/advanced_tutorials/integrate_mixture_of_experts_into_your_model.md +++ b/docs/source/en/advanced_tutorials/integrate_mixture_of_experts_into_your_model.md @@ -121,7 +121,7 @@ Inside the initialization of Experts, the local expert number of each GPU will b ## Train Your Model -Do not to forget to use `colossalai.initialize` function in `colosalai` to add gradient handler for the engine. +Do not to forget to use `colossalai.initialize` function in `colossalai` to add gradient handler for the engine. We handle the back-propagation of MoE models for you. In `colossalai.initialize`, we will automatically create a `MoeGradientHandler` object to process gradients. You can find more information about the handler `MoeGradientHandler` in colossal directory. diff --git a/docs/source/en/advanced_tutorials/opt_service.md b/docs/source/en/advanced_tutorials/opt_service.md index a43ec7fdd..eccfa12f9 100644 --- a/docs/source/en/advanced_tutorials/opt_service.md +++ b/docs/source/en/advanced_tutorials/opt_service.md @@ -53,7 +53,7 @@ export CHECKPOINT_DIR="your_opt_checkpoint_path" # the ${CONFIG_DIR} must contain a server.sh file as the entry of service export CONFIG_DIR="config_file_path" -docker run --gpus all --rm -it -p 8020:8020 -v ${CHECKPOINT_DIR}:/model_checkpoint -v ${CONFIG_DIR}:/config --ipc=host energonai:lastest +docker run --gpus all --rm -it -p 8020:8020 -v ${CHECKPOINT_DIR}:/model_checkpoint -v ${CONFIG_DIR}:/config --ipc=host energonai:latest ``` Then open `https://[IP-ADDRESS]:8020/docs#` in your browser to try out! diff --git a/docs/source/en/advanced_tutorials/parallelize_your_training_like_Megatron.md b/docs/source/en/advanced_tutorials/parallelize_your_training_like_Megatron.md index e7698e5e9..1a7ab9a65 100644 --- a/docs/source/en/advanced_tutorials/parallelize_your_training_like_Megatron.md +++ b/docs/source/en/advanced_tutorials/parallelize_your_training_like_Megatron.md @@ -69,7 +69,7 @@ After the forward operation of the embedding module, each word in all sequences
The embedding module
-Each transformer layer contains two blocks. The self-attention operation is called in the first block and a two-layer percepton is located in the second block.
+Each transformer layer contains two blocks. The self-attention operation is called in the first block and a two-layer perceptron is located in the second block.
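To make the structure described above concrete, here is a minimal sketch of such a layer in plain PyTorch. It is only an illustration of "one self-attention block followed by a two-layer perceptron": the hidden size, head count and normalization placement are placeholder choices, not the Megatron or Colossal-AI implementation.

```python
import torch
import torch.nn as nn


class TransformerLayer(nn.Module):
    """One self-attention block followed by a two-layer perceptron (MLP)."""

    def __init__(self, hidden_size: int = 768, num_heads: int = 12):
        super().__init__()
        self.norm1 = nn.LayerNorm(hidden_size)
        self.attn = nn.MultiheadAttention(hidden_size, num_heads, batch_first=True)
        self.norm2 = nn.LayerNorm(hidden_size)
        # second block: the two-layer perceptron
        self.mlp = nn.Sequential(
            nn.Linear(hidden_size, 4 * hidden_size),
            nn.GELU(),
            nn.Linear(4 * hidden_size, hidden_size),
        )

    def forward(self, x):
        # first block: self-attention with a residual connection
        h = self.norm1(x)
        x = x + self.attn(h, h, h, need_weights=False)[0]
        # second block: MLP with a residual connection
        x = x + self.mlp(self.norm2(x))
        return x


# quick shape check: (batch, sequence, hidden)
layer = TransformerLayer()
out = layer(torch.randn(2, 16, 768))
print(out.shape)  # torch.Size([2, 16, 768])
```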
diff --git a/docs/source/en/advanced_tutorials/train_vit_using_pipeline_parallelism.md b/docs/source/en/advanced_tutorials/train_vit_using_pipeline_parallelism.md index b26599740..6adfe4f11 100644 --- a/docs/source/en/advanced_tutorials/train_vit_using_pipeline_parallelism.md +++ b/docs/source/en/advanced_tutorials/train_vit_using_pipeline_parallelism.md @@ -195,7 +195,7 @@ def build_cifar(batch_size): ## Training ViT using pipeline -You can set the size of pipeline parallel and number of microbatches in config. `NUM_CHUNKS` is useful when using interleved-pipeline (for more details see [Efficient Large-Scale Language Model Training on GPU Clusters Using Megatron-LM](https://arxiv.org/abs/2104.04473) ). The original batch will be split into `num_microbatches`, and each stage will load a micro batch each time. Then we will generate an approriate schedule for you to execute the pipeline training. If you don't need the output and label of model, you can set `return_output_label` to `False` when calling `trainer.fit()` which can further reduce GPU memory usage. +You can set the size of pipeline parallel and number of microbatches in config. `NUM_CHUNKS` is useful when using interleaved-pipeline (for more details see [Efficient Large-Scale Language Model Training on GPU Clusters Using Megatron-LM](https://arxiv.org/abs/2104.04473) ). The original batch will be split into `num_microbatches`, and each stage will load a micro batch each time. Then we will generate an appropriate schedule for you to execute the pipeline training. If you don't need the output and label of model, you can set `return_output_label` to `False` when calling `trainer.fit()` which can further reduce GPU memory usage. You should `export DATA=/path/to/cifar`. diff --git a/docs/source/en/advanced_tutorials/train_vit_with_hybrid_parallelism.md b/docs/source/en/advanced_tutorials/train_vit_with_hybrid_parallelism.md index b2438a1cf..a2deaeb88 100644 --- a/docs/source/en/advanced_tutorials/train_vit_with_hybrid_parallelism.md +++ b/docs/source/en/advanced_tutorials/train_vit_with_hybrid_parallelism.md @@ -16,14 +16,14 @@ In this example for ViT model, Colossal-AI provides three different parallelism We will show you how to train ViT on CIFAR-10 dataset with these parallelism techniques. To run this example, you will need 2-4 GPUs. -## Tabel of Contents +## Table of Contents 1. Colossal-AI installation 2. Steps to train ViT with data parallelism 3. Steps to train ViT with pipeline parallelism 4. Steps to train ViT with tensor parallelism or hybrid parallelism ## Colossal-AI Installation -You can install Colossal-AI pacakage and its dependencies with PyPI. +You can install Colossal-AI package and its dependencies with PyPI. ```bash pip install colossalai ``` @@ -31,7 +31,7 @@ pip install colossalai ## Data Parallelism -Data parallism is one basic way to accelerate model training process. You can apply data parallelism to training by only two steps: +Data parallelism is one basic way to accelerate model training process. You can apply data parallelism to training by only two steps: 1. Define a configuration file 2. Change a few lines of code in train script @@ -94,7 +94,7 @@ from torchvision import transforms from torchvision.datasets import CIFAR10 ``` -#### Lauch Colossal-AI +#### Launch Colossal-AI In train script, you need to initialize the distributed environment for Colossal-AI after your config file is prepared. We call this process `launch`. 
In Colossal-AI, we provided several launch methods to initialize the distributed backend. In most cases, you can use `colossalai.launch` and `colossalai.get_default_parser` to pass the parameters via command line. Besides, Colossal-AI can utilize the existing launch tool provided by PyTorch as many users are familiar with by using `colossalai.launch_from_torch`. For more details, you can view the related [documents](https://www.colossalai.org/docs/basics/launch_colossalai). @@ -613,7 +613,7 @@ NUM_MICRO_BATCHES = parallel['pipeline'] TENSOR_SHAPE = (BATCH_SIZE // NUM_MICRO_BATCHES, SEQ_LENGTH, HIDDEN_SIZE) ``` -Ohter configs: +Other configs: ```python # hyper parameters # BATCH_SIZE is as per GPU diff --git a/docs/source/en/basics/booster_api.md b/docs/source/en/basics/booster_api.md index cafcb6d43..a446ff31b 100644 --- a/docs/source/en/basics/booster_api.md +++ b/docs/source/en/basics/booster_api.md @@ -14,9 +14,9 @@ In our new design, `colossalai.booster` replaces the role of `colossalai.initial ### Plugin Plugin is an important component that manages parallel configuration (eg: The gemini plugin encapsulates the gemini acceleration solution). Currently supported plugins are as follows: -***GeminiPlugin:*** This plugin wrapps the Gemini acceleration solution, that ZeRO with chunk-based memory management. +***GeminiPlugin:*** This plugin wraps the Gemini acceleration solution, that ZeRO with chunk-based memory management. -***TorchDDPPlugin:*** This plugin wrapps the DDP acceleration solution, it implements data parallelism at the module level which can run across multiple machines. +***TorchDDPPlugin:*** This plugin wraps the DDP acceleration solution, it implements data parallelism at the module level which can run across multiple machines. ***LowLevelZeroPlugin:*** This plugin wraps the 1/2 stage of Zero Redundancy Optimizer. Stage 1 : Shards optimizer states across data parallel workers/GPUs. Stage 2 : Shards optimizer states + gradients across data parallel workers/GPUs. diff --git a/docs/source/en/basics/colotensor_concept.md b/docs/source/en/basics/colotensor_concept.md index 050f2ef9f..abe470fe0 100644 --- a/docs/source/en/basics/colotensor_concept.md +++ b/docs/source/en/basics/colotensor_concept.md @@ -52,7 +52,7 @@ An instance of class [ComputeSpec](https://colossalai.readthedocs.io/en/latest/c ## Example -Let's see an example. A ColoTensor is initialized and sharded on 8 GPUs using tp_degree=4, dp_dgree=2. And then the tensor is sharded along the last dim among the TP process groups. Finally, we reshard it along the first dim (0 dim) among the TP process groups. We encourage users to run the code and observe the shape of each tensor. +Let's see an example. A ColoTensor is initialized and sharded on 8 GPUs using tp_degree=4, dp_degree=2. And then the tensor is sharded along the last dim among the TP process groups. Finally, we reshard it along the first dim (0 dim) among the TP process groups. We encourage users to run the code and observe the shape of each tensor. ```python diff --git a/docs/source/en/features/3D_tensor_parallel.md b/docs/source/en/features/3D_tensor_parallel.md index b9e98eac9..0e28f08b2 100644 --- a/docs/source/en/features/3D_tensor_parallel.md +++ b/docs/source/en/features/3D_tensor_parallel.md @@ -67,7 +67,7 @@ Given $P=q \times q \times q$ processors, we present the theoretical computation ## Usage -To enable 3D tensor parallelism for our model, e.g. on 8 GPUs, we need to configure the parallism setting as below. 
+To enable 3D tensor parallelism for our model, e.g. on 8 GPUs, we need to configure the parallelism setting as below. ```python CONFIG = dict(parallel=dict( data=1, diff --git a/docs/source/en/features/gradient_clipping_with_booster.md b/docs/source/en/features/gradient_clipping_with_booster.md index 8686eb06f..341a608a5 100644 --- a/docs/source/en/features/gradient_clipping_with_booster.md +++ b/docs/source/en/features/gradient_clipping_with_booster.md @@ -75,7 +75,7 @@ Build your model, optimizer, loss function, lr scheduler and dataloaders. Note t NUM_EPOCHS = 200 BATCH_SIZE = 128 GRADIENT_CLIPPING = 0.1 -# build resnetå +# build resnet model = resnet34(num_classes=10) # build dataloaders train_dataset = CIFAR10(root=Path(os.environ.get('DATA', './data')), diff --git a/docs/source/en/features/nvme_offload.md b/docs/source/en/features/nvme_offload.md index 4374da3c9..d940fd5ec 100644 --- a/docs/source/en/features/nvme_offload.md +++ b/docs/source/en/features/nvme_offload.md @@ -53,7 +53,7 @@ It's compatible with all parallel methods in ColossalAI. > ⚠ It only offloads optimizer states on CPU. This means it only affects CPU training or Zero/Gemini with offloading. -## Exampls +## Examples Let's start from two simple examples -- training GPT with different methods. These examples relies on `transformers`. diff --git a/docs/source/en/features/pipeline_parallel.md b/docs/source/en/features/pipeline_parallel.md index ac49863b3..30654b0b0 100644 --- a/docs/source/en/features/pipeline_parallel.md +++ b/docs/source/en/features/pipeline_parallel.md @@ -156,4 +156,4 @@ trainer.fit(train_dataloader=train_dataloader, display_progress=True) ``` -We use `2` pipeline stages and the batch will be splitted into `4` micro batches. +We use `2` pipeline stages and the batch will be split into `4` micro batches. diff --git a/docs/source/en/features/zero_with_chunk.md b/docs/source/en/features/zero_with_chunk.md index a105831a5..8448c52ac 100644 --- a/docs/source/en/features/zero_with_chunk.md +++ b/docs/source/en/features/zero_with_chunk.md @@ -72,7 +72,7 @@ chunk_manager = init_chunk_manager(model=module, gemini_manager = GeminiManager(placement_policy, chunk_manager) ``` -`hidden_dim` is the hidden dimension of DNN. Users can provide this argument to speed up searching. If users do not know this argument before training, it is ok. We will use a default value 1024. `min_chunk_size_mb` is the the minimum chunk size in MegaByte. If the aggregate size of parameters is still samller than the minimum chunk size, all parameters will be compacted into one small chunk. +`hidden_dim` is the hidden dimension of DNN. Users can provide this argument to speed up searching. If users do not know this argument before training, it is ok. We will use a default value 1024. `min_chunk_size_mb` is the the minimum chunk size in MegaByte. If the aggregate size of parameters is still smaller than the minimum chunk size, all parameters will be compacted into one small chunk. Initialization of the optimizer. 
```python diff --git a/docs/source/zh-Hans/advanced_tutorials/add_your_parallel.md b/docs/source/zh-Hans/advanced_tutorials/add_your_parallel.md index 4825a6fa1..059eb014a 100644 --- a/docs/source/zh-Hans/advanced_tutorials/add_your_parallel.md +++ b/docs/source/zh-Hans/advanced_tutorials/add_your_parallel.md @@ -48,7 +48,7 @@ Colossal-AI 为用户提供了一个全局 context,使他们能够轻松地管 world_size: int, config: Config, data_parallel_size: int, - pipeline_parlalel_size: int, + pipeline_parallel_size: int, tensor_parallel_size: int, arg1, arg2): diff --git a/docs/source/zh-Hans/advanced_tutorials/integrate_mixture_of_experts_into_your_model.md b/docs/source/zh-Hans/advanced_tutorials/integrate_mixture_of_experts_into_your_model.md index 456878caa..276fcc261 100644 --- a/docs/source/zh-Hans/advanced_tutorials/integrate_mixture_of_experts_into_your_model.md +++ b/docs/source/zh-Hans/advanced_tutorials/integrate_mixture_of_experts_into_your_model.md @@ -122,7 +122,7 @@ Inside the initialization of Experts, the local expert number of each GPU will b ## Train Your Model -Do not to forget to use `colossalai.initialize` function in `colosalai` to add gradient handler for the engine. +Do not to forget to use `colossalai.initialize` function in `colossalai` to add gradient handler for the engine. We handle the back-propagation of MoE models for you. In `colossalai.initialize`, we will automatically create a `MoeGradientHandler` object to process gradients. You can find more information about the handler `MoeGradientHandler` in colossal directory. diff --git a/docs/source/zh-Hans/advanced_tutorials/meet_gemini.md b/docs/source/zh-Hans/advanced_tutorials/meet_gemini.md index 2bf0a9c98..a52bc6ac7 100644 --- a/docs/source/zh-Hans/advanced_tutorials/meet_gemini.md +++ b/docs/source/zh-Hans/advanced_tutorials/meet_gemini.md @@ -48,7 +48,7 @@ zero = dict(
-ColossalAI设计了Gemini,就像双子星一样,它管理CPU和GPU二者内存空间。它可以让张量在训练过程中动态分布在CPU-GPU的存储空间内,从而让模型训练突破GPU的内存墙。内存管理器由两部分组成,分别是MemStatsCollector(MSC)和StatefuleTensorMgr(STM)。 +ColossalAI设计了Gemini,就像双子星一样,它管理CPU和GPU二者内存空间。它可以让张量在训练过程中动态分布在CPU-GPU的存储空间内,从而让模型训练突破GPU的内存墙。内存管理器由两部分组成,分别是MemStatsCollector(MSC)和StatefulTensorMgr(STM)。 我们利用了深度学习网络训练过程的迭代特性。我们将迭代分为warmup和non-warmup两个阶段,开始时的一个或若干迭代步属于预热阶段,其余的迭代步属于正式阶段。在warmup阶段我们为MSC收集信息,而在non-warmup阶段STM入去MSC收集的信息来移动tensor,以达到最小化CPU-GPU数据移动volume的目的。 @@ -75,7 +75,7 @@ STM管理所有model data tensor的信息。在模型的构造过程中,Coloss 我们在算子的开始和结束计算时,触发内存采样操作,我们称这个时间点为**采样时刻(sampling moment)**,两个采样时刻之间的时间我们称为**period**。计算过程是一个黑盒,由于可能分配临时buffer,内存使用情况很复杂。但是,我们可以较准确的获取period的系统最大内存使用。非模型数据的使用可以通过两个统计时刻之间系统最大内存使用-模型内存使用获得。 -我们如何设计采样时刻呢。我们选择preOp的model data layout adjust之前。如下图所示。我们采样获得上一个period的system memory used,和下一个period的model data memoy used。并行策略会给MSC的工作造成障碍。如图所示,比如对于ZeRO或者Tensor Parallel,由于Op计算前需要gather模型数据,会带来额外的内存需求。因此,我们要求在模型数据变化前进行采样系统内存,这样在一个period内,MSC会把preOp的模型变化内存捕捉。比如在period 2-3内,我们考虑的tensor gather和shard带来的内存变化。 +我们如何设计采样时刻呢。我们选择preOp的model data layout adjust之前。如下图所示。我们采样获得上一个period的system memory used,和下一个period的model data memory used。并行策略会给MSC的工作造成障碍。如图所示,比如对于ZeRO或者Tensor Parallel,由于Op计算前需要gather模型数据,会带来额外的内存需求。因此,我们要求在模型数据变化前进行采样系统内存,这样在一个period内,MSC会把preOp的模型变化内存捕捉。比如在period 2-3内,我们考虑的tensor gather和shard带来的内存变化。 尽管可以将采样时刻放在其他位置,比如排除gather buffer的变动新信息,但是会给造成麻烦。不同并行方式Op的实现有差异,比如对于Linear Op,Tensor Parallel中gather buffer的分配在Op中。而对于ZeRO,gather buffer的分配是在PreOp中。将放在PreOp开始时采样有利于将两种情况统一。 diff --git a/docs/source/zh-Hans/advanced_tutorials/opt_service.md b/docs/source/zh-Hans/advanced_tutorials/opt_service.md index a213584fd..1f8324a53 100644 --- a/docs/source/zh-Hans/advanced_tutorials/opt_service.md +++ b/docs/source/zh-Hans/advanced_tutorials/opt_service.md @@ -52,7 +52,7 @@ export CHECKPOINT_DIR="your_opt_checkpoint_path" # the ${CONFIG_DIR} must contain a server.sh file as the entry of service export CONFIG_DIR="config_file_path" -docker run --gpus all --rm -it -p 8020:8020 -v ${CHECKPOINT_DIR}:/model_checkpoint -v ${CONFIG_DIR}:/config --ipc=host energonai:lastest +docker run --gpus all --rm -it -p 8020:8020 -v ${CHECKPOINT_DIR}:/model_checkpoint -v ${CONFIG_DIR}:/config --ipc=host energonai:latest ``` 接下来,您就可以在您的浏览器中打开 `https://[IP-ADDRESS]:8020/docs#` 进行测试。 diff --git a/docs/source/zh-Hans/advanced_tutorials/train_vit_with_hybrid_parallelism.md b/docs/source/zh-Hans/advanced_tutorials/train_vit_with_hybrid_parallelism.md index 6dc5eccf4..e2f2c90a3 100644 --- a/docs/source/zh-Hans/advanced_tutorials/train_vit_with_hybrid_parallelism.md +++ b/docs/source/zh-Hans/advanced_tutorials/train_vit_with_hybrid_parallelism.md @@ -477,7 +477,7 @@ def build_cifar(batch_size): return train_dataloader, test_dataloader -# craete dataloaders +# create dataloaders train_dataloader , test_dataloader = build_cifar() # create loss function criterion = CrossEntropyLoss(label_smoothing=0.1) @@ -492,7 +492,7 @@ lr_scheduler = CosineAnnealingWarmupLR(optimizer=optimizer, #### 启动 Colossal-AI 引擎 ```python -# intiailize +# initialize engine, train_dataloader, test_dataloader, _ = colossalai.initialize(model=model, optimizer=optimizer, criterion=criterion, diff --git a/docs/source/zh-Hans/basics/colotensor_concept.md b/docs/source/zh-Hans/basics/colotensor_concept.md index b725e48a7..ab2413e99 100644 --- a/docs/source/zh-Hans/basics/colotensor_concept.md +++ b/docs/source/zh-Hans/basics/colotensor_concept.md @@ -53,7 +53,7 @@ ColoTensor 包含额外的属性[ColoTensorSpec](https://colossalai.readthedocs. 
## Example -让我们看一个例子。 使用 tp_degree=4, dp_dgree=2 在 8 个 GPU 上初始化并Shard一个ColoTensor。 然后tensor被沿着 TP 进程组中的最后一个维度进行分片。 最后,我们沿着 TP 进程组中的第一个维度(dim 0)对其进行重新Shard。 我们鼓励用户运行代码并观察每个张量的形状。 +让我们看一个例子。 使用 tp_degree=4, dp_degree=2 在 8 个 GPU 上初始化并Shard一个ColoTensor。 然后tensor被沿着 TP 进程组中的最后一个维度进行分片。 最后,我们沿着 TP 进程组中的第一个维度(dim 0)对其进行重新Shard。 我们鼓励用户运行代码并观察每个张量的形状。 ```python diff --git a/docs/source/zh-Hans/features/mixed_precision_training.md b/docs/source/zh-Hans/features/mixed_precision_training.md index c4df6271b..4628b09cd 100644 --- a/docs/source/zh-Hans/features/mixed_precision_training.md +++ b/docs/source/zh-Hans/features/mixed_precision_training.md @@ -203,7 +203,7 @@ Naive AMP 的默认参数: - initial_scale(int): gradient scaler 的初始值 - growth_factor(int): loss scale 的增长率 - backoff_factor(float): loss scale 的下降率 -- hysterisis(int): 动态 loss scaling 的延迟偏移 +- hysteresis(int): 动态 loss scaling 的延迟偏移 - max_scale(int): loss scale 的最大允许值 - verbose(bool): 如果被设为`True`,将打印调试信息 diff --git a/docs/source/zh-Hans/features/nvme_offload.md b/docs/source/zh-Hans/features/nvme_offload.md index fd75ed1f5..db5f10184 100644 --- a/docs/source/zh-Hans/features/nvme_offload.md +++ b/docs/source/zh-Hans/features/nvme_offload.md @@ -53,7 +53,7 @@ optimizer = HybridAdam(model.parameters(), lr=1e-3, nvme_offload_fraction=1.0, n > ⚠ 它只会卸载在 CPU 上的优化器状态。这意味着它只会影响 CPU 训练或者使用卸载的 Zero/Gemini。 -## Exampls +## Examples Let's start from two simple examples -- training GPT with different methods. These examples relies on `transformers`. 首先让我们从两个简单的例子开始 -- 用不同的方法训练 GPT。这些例子依赖`transformers`。 From 518b31c059e11d7acce2e229d1579af594688561 Mon Sep 17 00:00:00 2001 From: digger yu Date: Wed, 24 May 2023 14:51:49 +0800 Subject: [PATCH 02/26] [docs] change placememt_policy to placement_policy (#3829) * fix typo colossalai/autochunk auto_parallel amp * fix typo colossalai/auto_parallel nn utils etc. * fix typo colossalai/auto_parallel autochunk fx/passes etc. * fix typo docs/ * change placememt_policy to placement_policy in docs/ and examples/ --- .../parallelize_your_training_like_Megatron.md | 4 ++-- docs/source/en/features/zero_with_chunk.md | 8 ++++---- .../parallelize_your_training_like_Megatron.md | 4 ++-- docs/source/zh-Hans/features/zero_with_chunk.md | 8 ++++---- examples/images/dreambooth/train_dreambooth_colossalai.py | 4 ++-- .../images/dreambooth/train_dreambooth_colossalai_lora.py | 4 ++-- examples/language/palm/train.py | 8 ++++---- 7 files changed, 20 insertions(+), 20 deletions(-) diff --git a/docs/source/en/advanced_tutorials/parallelize_your_training_like_Megatron.md b/docs/source/en/advanced_tutorials/parallelize_your_training_like_Megatron.md index 1a7ab9a65..22d52fb3c 100644 --- a/docs/source/en/advanced_tutorials/parallelize_your_training_like_Megatron.md +++ b/docs/source/en/advanced_tutorials/parallelize_your_training_like_Megatron.md @@ -175,11 +175,11 @@ In this way, users can train their models as usual. In our latest example, a Gemini + ZeRO DDP model is also defined to reduce overhead and improve efficiency.For the details of this part, please refer to [ZeRO](../features/zero_with_chunk.md). 
You can combine these two parts to understand our entire training process: ```python -def gemini_zero_dpp(model: torch.nn.Module, pg: ProcessGroup, placememt_policy: str = "auto"): +def gemini_zero_dpp(model: torch.nn.Module, pg: ProcessGroup, placement_policy: str = "auto"): from colossalai.nn.parallel import GeminiDDP model = GeminiDDP(model, device=get_current_device(), - placement_policy=placememt_policy, + placement_policy=placement_policy, pin_memory=True, search_range_mb=32) return model diff --git a/docs/source/en/features/zero_with_chunk.md b/docs/source/en/features/zero_with_chunk.md index 8448c52ac..d7a99f2fb 100644 --- a/docs/source/en/features/zero_with_chunk.md +++ b/docs/source/en/features/zero_with_chunk.md @@ -185,23 +185,23 @@ def split_param_col_tp1d(param: ColoParameter, pg: ProcessGroup): Define a model which uses Gemini + ZeRO DDP: ```python -def gemini_zero_dpp(model: torch.nn.Module, pg: ProcessGroup, placememt_policy: str = "auto"): +def gemini_zero_dpp(model: torch.nn.Module, pg: ProcessGroup, placement_policy: str = "auto"): cai_version = colossalai.__version__ if version.parse(cai_version) > version.parse("0.1.10"): from colossalai.nn.parallel import GeminiDDP model = GeminiDDP(model, device=get_current_device(), - placement_policy=placememt_policy, + placement_policy=placement_policy, pin_memory=True, search_range_mb=32) elif version.parse(cai_version) <= version.parse("0.1.10") and version.parse(cai_version) >= version.parse("0.1.9"): from colossalai.gemini import ChunkManager, GeminiManager chunk_size = ChunkManager.search_chunk_size(model, 64 * 1024**2, 32) - gemini_manager = GeminiManager(placememt_policy, chunk_manager) + gemini_manager = GeminiManager(placement_policy, chunk_manager) chunk_manager = ChunkManager(chunk_size, pg, enable_distributed_storage=True, - init_device=GeminiManager.get_default_device(placememt_policy)) + init_device=GeminiManager.get_default_device(placement_policy)) model = ZeroDDP(model, gemini_manager) else: raise NotImplemented(f"CAI version {cai_version} is not supported") diff --git a/docs/source/zh-Hans/advanced_tutorials/parallelize_your_training_like_Megatron.md b/docs/source/zh-Hans/advanced_tutorials/parallelize_your_training_like_Megatron.md index f3c6247c3..c4131e593 100644 --- a/docs/source/zh-Hans/advanced_tutorials/parallelize_your_training_like_Megatron.md +++ b/docs/source/zh-Hans/advanced_tutorials/parallelize_your_training_like_Megatron.md @@ -159,11 +159,11 @@ for mn, module in model.named_modules(): 在我们最新示例中还定义了一个Gemini + ZeRO DDP 的模型从而减小开销,提升效率。这一部分的详细内容可以参考[ZeRO](../features/zero_with_chunk.md),你可以将这两部分内容结合起来看从而理解我们整个训练流程: ```python -def gemini_zero_dpp(model: torch.nn.Module, pg: ProcessGroup, placememt_policy: str = "auto"): +def gemini_zero_dpp(model: torch.nn.Module, pg: ProcessGroup, placement_policy: str = "auto"): from colossalai.nn.parallel import GeminiDDP model = GeminiDDP(model, device=get_current_device(), - placement_policy=placememt_policy, + placement_policy=placement_policy, pin_memory=True, search_range_mb=32) return model diff --git a/docs/source/zh-Hans/features/zero_with_chunk.md b/docs/source/zh-Hans/features/zero_with_chunk.md index 72403bf61..ba57ba4e8 100644 --- a/docs/source/zh-Hans/features/zero_with_chunk.md +++ b/docs/source/zh-Hans/features/zero_with_chunk.md @@ -185,23 +185,23 @@ def split_param_col_tp1d(param: ColoParameter, pg: ProcessGroup): 定义一个使用 Gemini + ZeRO DDP 的模型: ```python -def gemini_zero_dpp(model: torch.nn.Module, pg: ProcessGroup, placememt_policy: str = "auto"): 
+def gemini_zero_dpp(model: torch.nn.Module, pg: ProcessGroup, placement_policy: str = "auto"): cai_version = colossalai.__version__ if version.parse(cai_version) > version.parse("0.1.10"): from colossalai.nn.parallel import GeminiDDP model = GeminiDDP(model, device=get_current_device(), - placement_policy=placememt_policy, + placement_policy=placement_policy, pin_memory=True, search_range_mb=32) elif version.parse(cai_version) <= version.parse("0.1.10") and version.parse(cai_version) >= version.parse("0.1.9"): from colossalai.gemini import ChunkManager, GeminiManager chunk_size = ChunkManager.search_chunk_size(model, 64 * 1024**2, 32) - gemini_manager = GeminiManager(placememt_policy, chunk_manager) + gemini_manager = GeminiManager(placement_policy, chunk_manager) chunk_manager = ChunkManager(chunk_size, pg, enable_distributed_storage=True, - init_device=GeminiManager.get_default_device(placememt_policy)) + init_device=GeminiManager.get_default_device(placement_policy)) model = ZeroDDP(model, gemini_manager) else: raise NotImplemented(f"CAI version {cai_version} is not supported") diff --git a/examples/images/dreambooth/train_dreambooth_colossalai.py b/examples/images/dreambooth/train_dreambooth_colossalai.py index e6159e105..d07febea0 100644 --- a/examples/images/dreambooth/train_dreambooth_colossalai.py +++ b/examples/images/dreambooth/train_dreambooth_colossalai.py @@ -340,12 +340,12 @@ def get_full_repo_name(model_id: str, organization: Optional[str] = None, token: # Gemini + ZeRO DDP -def gemini_zero_dpp(model: torch.nn.Module, placememt_policy: str = "auto"): +def gemini_zero_dpp(model: torch.nn.Module, placement_policy: str = "auto"): from colossalai.nn.parallel import GeminiDDP model = GeminiDDP(model, device=get_current_device(), - placement_policy=placememt_policy, + placement_policy=placement_policy, pin_memory=True, search_range_mb=64) return model diff --git a/examples/images/dreambooth/train_dreambooth_colossalai_lora.py b/examples/images/dreambooth/train_dreambooth_colossalai_lora.py index 1b2fc778d..6715b473a 100644 --- a/examples/images/dreambooth/train_dreambooth_colossalai_lora.py +++ b/examples/images/dreambooth/train_dreambooth_colossalai_lora.py @@ -342,12 +342,12 @@ def get_full_repo_name(model_id: str, organization: Optional[str] = None, token: # Gemini + ZeRO DDP -def gemini_zero_dpp(model: torch.nn.Module, placememt_policy: str = "auto"): +def gemini_zero_dpp(model: torch.nn.Module, placement_policy: str = "auto"): from colossalai.nn.parallel import GeminiDDP model = GeminiDDP(model, device=get_current_device(), - placement_policy=placememt_policy, + placement_policy=placement_policy, pin_memory=True, search_range_mb=64) return model diff --git a/examples/language/palm/train.py b/examples/language/palm/train.py index 7923e4fc8..b16da1c77 100644 --- a/examples/language/palm/train.py +++ b/examples/language/palm/train.py @@ -102,23 +102,23 @@ def get_model_size(model: nn.Module): # Gemini + ZeRO DDP -def gemini_zero_dpp(model: torch.nn.Module, pg: ProcessGroup, placememt_policy: str = "auto"): +def gemini_zero_dpp(model: torch.nn.Module, pg: ProcessGroup, placement_policy: str = "auto"): cai_version = colossalai.__version__ if version.parse(cai_version) > version.parse("0.1.10"): from colossalai.nn.parallel import GeminiDDP model = GeminiDDP(model, device=get_current_device(), - placement_policy=placememt_policy, + placement_policy=placement_policy, pin_memory=True, search_range_mb=32) elif version.parse(cai_version) <= version.parse("0.1.10") and 
version.parse(cai_version) >= version.parse("0.1.9"): from colossalai.gemini import ChunkManager, GeminiManager chunk_size = ChunkManager.search_chunk_size(model, 64 * 1024**2, 32) - gemini_manager = GeminiManager(placememt_policy, chunk_manager) + gemini_manager = GeminiManager(placement_policy, chunk_manager) chunk_manager = ChunkManager(chunk_size, pg, enable_distributed_storage=True, - init_device=GeminiManager.get_default_device(placememt_policy)) + init_device=GeminiManager.get_default_device(placement_policy)) model = ZeroDDP(model, gemini_manager) else: raise NotImplemented(f"CAI version {cai_version} is not supported") From 84500b7799232974d3f230f13c2c018fc8424392 Mon Sep 17 00:00:00 2001 From: Frank Lee Date: Wed, 24 May 2023 14:59:40 +0800 Subject: [PATCH 03/26] [workflow] fixed testmon cache in build CI (#3806) * [workflow] fixed testmon cache in build CI * polish code --- .github/workflows/build_on_pr.yml | 4 ++-- .gitignore | 4 ++++ requirements/requirements-test.txt | 1 + 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build_on_pr.yml b/.github/workflows/build_on_pr.yml index 53355f560..a5a17d176 100644 --- a/.github/workflows/build_on_pr.yml +++ b/.github/workflows/build_on_pr.yml @@ -136,7 +136,7 @@ jobs: - name: Restore Testmon Cache run: | if [ -d /github/home/testmon_cache ]; then - [ ! -z "$(ls -A /github/home/testmon_cache)" ] && cp -p -r /github/home/testmon_cache/.testmondata /__w/ColossalAI/ColossalAI/ + [ ! -z "$(ls -A /github/home/testmon_cache)" ] && cp -p -r /github/home/testmon_cache/.testmondata* /__w/ColossalAI/ColossalAI/ fi - name: Execute Unit Testing @@ -150,7 +150,7 @@ jobs: - name: Store Testmon Cache run: | [ -d /github/home/testmon_cache ] || mkdir /github/home/testmon_cache - cp -p -r /__w/ColossalAI/ColossalAI/.testmondata /github/home/testmon_cache/ + cp -p -r /__w/ColossalAI/ColossalAI/.testmondata* /github/home/testmon_cache/ - name: Collate artifact env: diff --git a/.gitignore b/.gitignore index bf74a7538..81113fa99 100644 --- a/.gitignore +++ b/.gitignore @@ -155,3 +155,7 @@ colossalai/version.py # ignore coverage test file coverage.lcov coverage.xml + +# ignore testmon and coverage files +.coverage +.testmondata* diff --git a/requirements/requirements-test.txt b/requirements/requirements-test.txt index 55edb1b6a..6895113bc 100644 --- a/requirements/requirements-test.txt +++ b/requirements/requirements-test.txt @@ -1,6 +1,7 @@ diffusers fbgemm-gpu==0.2.0 pytest +coverage==7.2.3 git+https://github.com/hpcaitech/pytest-testmon torchvision transformers From 3229f93e3081332f0734545436894151aae2cfc4 Mon Sep 17 00:00:00 2001 From: wukong1992 Date: Thu, 25 May 2023 14:00:02 +0800 Subject: [PATCH 04/26] [booster] add warning for torch fsdp plugin doc (#3833) --- colossalai/booster/plugin/torch_fsdp_plugin.py | 7 ++++++- docs/source/en/basics/booster_plugins.md | 3 +++ docs/source/zh-Hans/basics/booster_plugins.md | 3 +++ 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/colossalai/booster/plugin/torch_fsdp_plugin.py b/colossalai/booster/plugin/torch_fsdp_plugin.py index 340555dc6..8d534ea4c 100644 --- a/colossalai/booster/plugin/torch_fsdp_plugin.py +++ b/colossalai/booster/plugin/torch_fsdp_plugin.py @@ -3,10 +3,10 @@ from typing import Callable, Iterable, Iterator, List, Optional, Tuple, Union import torch import torch.nn as nn +import warnings from packaging import version from torch.distributed import ProcessGroup - if version.parse(torch.__version__) >= version.parse('1.12.0'): from 
torch.distributed.fsdp import FullStateDictConfig from torch.distributed.fsdp import FullyShardedDataParallel as FSDP @@ -202,6 +202,11 @@ class TorchFSDPPlugin(DPPluginBase): # wrap the model with PyTorch FSDP fsdp_model = TorchFSDPModel(model, device_id=torch.cuda.current_device(), **self.fsdp_kwargs) + + if len(optimizer.param_groups) > 1: + warnings.warn( + 'TorchFSDPPlugin does not support optimizer that use multi param groups. The results may not be as expected if used.' + ) optimizer.__init__(fsdp_model.parameters(), **optimizer.defaults) if not isinstance(optimizer, FSDPOptimizerWrapper): diff --git a/docs/source/en/basics/booster_plugins.md b/docs/source/en/basics/booster_plugins.md index 6ed49bfa7..5e2586b83 100644 --- a/docs/source/en/basics/booster_plugins.md +++ b/docs/source/en/basics/booster_plugins.md @@ -62,8 +62,11 @@ More details can be found in [Pytorch Docs](https://pytorch.org/docs/main/genera ### Torch FSDP Plugin > ⚠ This plugin is not available when torch version is lower than 1.12.0. + > ⚠ This plugin does not support save/load sharded model checkpoint now. +> ⚠ This plugin does not support optimizer that use multi params group. + More details can be found in [Pytorch Docs](https://pytorch.org/docs/main/fsdp.html). {{ autodoc:colossalai.booster.plugin.TorchFSDPPlugin }} diff --git a/docs/source/zh-Hans/basics/booster_plugins.md b/docs/source/zh-Hans/basics/booster_plugins.md index 00e7d91e3..5bd88b679 100644 --- a/docs/source/zh-Hans/basics/booster_plugins.md +++ b/docs/source/zh-Hans/basics/booster_plugins.md @@ -62,8 +62,11 @@ Zero-2 不支持局部梯度累积。如果您坚持使用,虽然可以积累 ### Torch FSDP 插件 > ⚠ 如果 torch 版本低于 1.12.0,此插件将不可用。 + > ⚠ 该插件现在还不支持保存/加载分片的模型 checkpoint。 +> ⚠ 该插件现在还不支持使用了multi params group的optimizer。 + 更多详细信息,请参阅 [Pytorch 文档](https://pytorch.org/docs/main/fsdp.html). 
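For readers unsure what "multiple parameter groups" means here, the short, hypothetical PyTorch snippet below shows the difference that the new warning checks for (`len(optimizer.param_groups) > 1`). The model and learning rates are placeholders and not part of Colossal-AI.

```python
import torch
import torch.nn as nn

model = nn.Sequential(nn.Linear(8, 8), nn.Linear(8, 2))

# A single parameter group: supported by TorchFSDPPlugin.
single_group = torch.optim.SGD(model.parameters(), lr=1e-2)
print(len(single_group.param_groups))  # 1

# Multiple parameter groups (e.g. a different learning rate per layer):
# this is the case the plugin warns about and may not handle as expected.
multi_group = torch.optim.SGD(
    [
        {"params": model[0].parameters()},
        {"params": model[1].parameters(), "lr": 1e-3},
    ],
    lr=1e-2,
)
print(len(multi_group.param_groups))  # 2
```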
{{ autodoc:colossalai.booster.plugin.TorchFSDPPlugin }} From 54e97ed7ea0cff0563c066c53675d8a428973ec9 Mon Sep 17 00:00:00 2001 From: Frank Lee Date: Thu, 25 May 2023 14:14:34 +0800 Subject: [PATCH 05/26] [workflow] supported test on CUDA 10.2 (#3841) --- .../compatiblity_test_on_dispatch.yml | 47 ++++++++++++------- .github/workflows/compatiblity_test_on_pr.yml | 16 ++++++- README.md | 16 +++++++ docs/source/en/get_started/installation.md | 15 ++++++ .../zh-Hans/get_started/installation.md | 16 +++++++ 5 files changed, 90 insertions(+), 20 deletions(-) diff --git a/.github/workflows/compatiblity_test_on_dispatch.yml b/.github/workflows/compatiblity_test_on_dispatch.yml index 717cf729b..3dcc4dfd1 100644 --- a/.github/workflows/compatiblity_test_on_dispatch.yml +++ b/.github/workflows/compatiblity_test_on_dispatch.yml @@ -19,26 +19,26 @@ jobs: outputs: matrix: ${{ steps.set-matrix.outputs.matrix }} steps: - - id: set-matrix - env: - TORCH_VERSIONS: ${{ inputs.torch_version }} - CUDA_VERSIONS: ${{ inputs.cuda_version }} - run: | - IFS=',' - DOCKER_IMAGE=() + - id: set-matrix + env: + TORCH_VERSIONS: ${{ inputs.torch_version }} + CUDA_VERSIONS: ${{ inputs.cuda_version }} + run: | + IFS=',' + DOCKER_IMAGE=() - for tv in $TORCH_VERSIONS - do - for cv in $CUDA_VERSIONS - do - DOCKER_IMAGE+=("\"hpcaitech/pytorch-cuda:${tv}-${cv}\"") - done - done + for tv in $TORCH_VERSIONS + do + for cv in $CUDA_VERSIONS + do + DOCKER_IMAGE+=("\"hpcaitech/pytorch-cuda:${tv}-${cv}\"") + done + done - container=$( IFS=',' ; echo "${DOCKER_IMAGE[*]}" ) - container="[${container}]" - echo "$container" - echo "::set-output name=matrix::{\"container\":$(echo "$container")}" + container=$( IFS=',' ; echo "${DOCKER_IMAGE[*]}" ) + container="[${container}]" + echo "$container" + echo "::set-output name=matrix::{\"container\":$(echo "$container")}" build: name: Test for PyTorch Compatibility @@ -70,6 +70,17 @@ jobs: - uses: actions/checkout@v2 with: ssh-key: ${{ secrets.SSH_KEY_FOR_CI }} + - name: Download cub for CUDA 10.2 + run: | + CUDA_VERSION=$(cat $CUDA_HOME/version.txt | grep "CUDA Version" | awk '{print $NF}' | cut -d. -f1,2) + + # check if it is CUDA 10.2 + # download cub + if [ "$CUDA_VERSION" = "10.2" ]; then + wget https://github.com/NVIDIA/cub/archive/refs/tags/1.8.0.zip + unzip 1.8.0.zip + cp -r cub-1.8.0/cub/ colossalai/kernel/cuda_native/csrc/kernels/include/ + fi - name: Install Colossal-AI run: | pip install -r requirements/requirements.txt diff --git a/.github/workflows/compatiblity_test_on_pr.yml b/.github/workflows/compatiblity_test_on_pr.yml index 2fca67b82..94a723388 100644 --- a/.github/workflows/compatiblity_test_on_pr.yml +++ b/.github/workflows/compatiblity_test_on_pr.yml @@ -3,8 +3,8 @@ name: Compatibility Test on PR on: pull_request: paths: - - 'version.txt' - - '.compatibility' + - "version.txt" + - ".compatibility" jobs: matrix_preparation: @@ -58,6 +58,18 @@ jobs: - uses: actions/checkout@v2 with: ssh-key: ${{ secrets.SSH_KEY_FOR_CI }} + - name: Download cub for CUDA 10.2 + run: | + CUDA_VERSION=$(cat $CUDA_HOME/version.txt | grep "CUDA Version" | awk '{print $NF}' | cut -d. -f1,2) + + # check if it is CUDA 10.2 + # download cub + if [ "$CUDA_VERSION" = "10.2" ]; then + wget https://github.com/NVIDIA/cub/archive/refs/tags/1.8.0.zip + unzip 1.8.0.zip + cp -r cub-1.8.0/cub/ colossalai/kernel/cuda_native/csrc/kernels/include/ + fi + - name: Install Colossal-AI run: | pip install -v --no-cache-dir . 
diff --git a/README.md b/README.md index c33caba90..34c8a6b73 100644 --- a/README.md +++ b/README.md @@ -362,6 +362,22 @@ If you want to install and enable CUDA kernel fusion (compulsory installation wh CUDA_EXT=1 pip install . ``` +For Users with CUDA 10.2, you can still build ColossalAI from source. However, you need to manually download the cub library and copy it to the corresponding directory. + +```bash +# clone the repository +git clone https://github.com/hpcaitech/ColossalAI.git +cd ColossalAI + +# download the cub library +wget https://github.com/NVIDIA/cub/archive/refs/tags/1.8.0.zip +unzip 1.8.0.zip +cp -r cub-1.8.0/cub/ colossalai/kernel/cuda_native/csrc/kernels/include/ + +# install +CUDA_EXT=1 pip install . +``` +
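As a small companion to the build instructions above, the following Python check can help confirm which CUDA toolkit your current PyTorch build targets before deciding whether you need the CUDA 10.2 source-build path. It only reads information exposed by PyTorch and assumes PyTorch is already installed.

```python
import torch

# CUDA toolkit version the installed PyTorch wheel was built against, e.g. '10.2' or '11.3'
print(torch.version.cuda)

# whether a GPU is visible to this build at all
print(torch.cuda.is_available())
```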

(back to top)

## Use Docker diff --git a/docs/source/en/get_started/installation.md b/docs/source/en/get_started/installation.md index b626edb19..6fc4ce2c9 100644 --- a/docs/source/en/get_started/installation.md +++ b/docs/source/en/get_started/installation.md @@ -48,5 +48,20 @@ If you don't want to install and enable CUDA kernel fusion (compulsory installat pip install . ``` +For Users with CUDA 10.2, you can still build ColossalAI from source. However, you need to manually download the cub library and copy it to the corresponding directory. + +```bash +# clone the repository +git clone https://github.com/hpcaitech/ColossalAI.git +cd ColossalAI + +# download the cub library +wget https://github.com/NVIDIA/cub/archive/refs/tags/1.8.0.zip +unzip 1.8.0.zip +cp -r cub-1.8.0/cub/ colossalai/kernel/cuda_native/csrc/kernels/include/ + +# install +CUDA_EXT=1 pip install . +``` diff --git a/docs/source/zh-Hans/get_started/installation.md b/docs/source/zh-Hans/get_started/installation.md index e0d726c74..a32627db6 100755 --- a/docs/source/zh-Hans/get_started/installation.md +++ b/docs/source/zh-Hans/get_started/installation.md @@ -47,4 +47,20 @@ CUDA_EXT=1 pip install . pip install . ``` +如果您在使用CUDA 10.2,您仍然可以从源码安装ColossalA。但是您需要手动下载cub库并将其复制到相应的目录。 + +```bash +# clone the repository +git clone https://github.com/hpcaitech/ColossalAI.git +cd ColossalAI + +# download the cub library +wget https://github.com/NVIDIA/cub/archive/refs/tags/1.8.0.zip +unzip 1.8.0.zip +cp -r cub-1.8.0/cub/ colossalai/kernel/cuda_native/csrc/kernels/include/ + +# install +CUDA_EXT=1 pip install . +``` + From a64df3fa97777f3563aa39145b473c42c250f49c Mon Sep 17 00:00:00 2001 From: jiangmingyan <1829166702@qq.com> Date: Thu, 25 May 2023 14:58:01 +0800 Subject: [PATCH 06/26] [doc] update document of gemini instruction. (#3842) * [doc] update meet_gemini.md * [doc] update meet_gemini.md * [doc] fix parentheses * [doc] fix parentheses * [doc] fix doc test * [doc] fix doc test * [doc] fix doc --- .../en/advanced_tutorials/meet_gemini.md | 25 ++++++++++------ .../zh-Hans/advanced_tutorials/meet_gemini.md | 30 ++++++++++--------- 2 files changed, 32 insertions(+), 23 deletions(-) diff --git a/docs/source/en/advanced_tutorials/meet_gemini.md b/docs/source/en/advanced_tutorials/meet_gemini.md index 8afb6705b..c1c23a355 100644 --- a/docs/source/en/advanced_tutorials/meet_gemini.md +++ b/docs/source/en/advanced_tutorials/meet_gemini.md @@ -9,16 +9,21 @@ When you only have a few GPUs for large model training tasks, **heterogeneous tr ## Usage -At present, Gemini supports compatibility with ZeRO parallel mode, and it is really simple to use Gemini. Set attribute of zero model_config, i.e., tensor_placement_policy='auto'. +At present, Gemini supports compatibility with ZeRO parallel mode, and it is really simple to use Gemini: Inject the feathures of `GeminiPlugin` into training components with `booster`. More instructions of `booster` please refer to [**usage of booster**](../basics/booster_api.md). -``` -zero = dict( - model_config=dict( - tensor_placement_policy='auto', - shard_strategy=BucketTensorShardStrategy() - ), - optimizer_config=dict( - ...) 
+```python +from torchvision.models import resnet18 +from colossalai.booster import Booster +from colossalai.zero import ColoInitContext +from colossalai.booster.plugin import GeminiPlugin +plugin = GeminiPlugin(placement_policy='cuda', strict_ddp_mode=True, max_norm=1.0, initial_scale=2**5) +booster = Booster(plugin=plugin) +ctx = ColoInitContext() +with ctx: + model = resnet18() +optimizer = HybridAdam(model.parameters(), lr=1e-3) +criterion = lambda x: x.mean() +model, optimizer, criterion, _, _ = booster.boost(model, optimizer, criterion) ) ``` @@ -86,3 +91,5 @@ The important duty of MSC is to adjust the tensor layout position. For example, In the warmup stage, since we haven't finished a complete iteration yet, we don't know actual memory occupation. At this time, we limit the upper bound of memory usage of the model data. For example, only 30% of the GPU memory can be used. This ensures that we can successfully complete the warmup state. In the non-warmup stage, we need to use the memory information of non-model data collected in the warm-up stage to reserve the peak memory required by the computing device for the next Period, which requires us to move some model tensors. In order to avoid frequent replacement of the same tensor in and out of the CPU-GPU, causing a phenomenon similar to [cache thrashing](https://en.wikipedia.org/wiki/Thrashing_(computer_science)). Using the iterative characteristics of DNN training, we design the OPT cache swap out strategy. Specifically, in the warmup stage, we record the sampling time required by each tensor computing device. If we need to expel some HOLD tensors, we will choose the latest tensor needed on this device as the victim. + + diff --git a/docs/source/zh-Hans/advanced_tutorials/meet_gemini.md b/docs/source/zh-Hans/advanced_tutorials/meet_gemini.md index a52bc6ac7..594823862 100644 --- a/docs/source/zh-Hans/advanced_tutorials/meet_gemini.md +++ b/docs/source/zh-Hans/advanced_tutorials/meet_gemini.md @@ -8,21 +8,21 @@ ## 用法 -目前Gemini支持和ZeRO并行方式兼容,它的使用方法很简单,在训练策略的配置文件里设置zero的model_config属性tensor_placement_policy='auto' +目前Gemini支持和ZeRO并行方式兼容,它的使用方法很简单:使用booster将`GeminiPlugin`中的特性注入到训练组件中。更多`booster`介绍请参考[booster使用](../basics/booster_api.md)。 -``` -zero = dict( - model_config=dict( - reduce_scatter_bucket_size_mb=25, - fp32_reduce_scatter=False, - gradient_predivide_factor=1.0, - tensor_placement_policy="auto", - shard_strategy=TensorShardStrategy(), - ... - ), - optimizer_config=dict( - ... 
- ) +```python +from torchvision.models import resnet18 +from colossalai.booster import Booster +from colossalai.zero import ColoInitContext +from colossalai.booster.plugin import GeminiPlugin +plugin = GeminiPlugin(placement_policy='cuda', strict_ddp_mode=True, max_norm=1.0, initial_scale=2**5) +booster = Booster(plugin=plugin) +ctx = ColoInitContext() +with ctx: + model = resnet18() +optimizer = HybridAdam(model.parameters(), lr=1e-3) +criterion = lambda x: x.mean() +model, optimizer, criterion, _, _ = booster.boost(model, optimizer, criterion) ) ``` @@ -94,3 +94,5 @@ MSC的重要职责是在调整tensor layout位置,比如在上图S2时刻, 在non-warmup阶段,我们需要利用预热阶段采集的非模型数据内存信息,预留出下一个Period在计算设备上需要的峰值内存,这需要我们移动出一些模型张量。 为了避免频繁在CPU-GPU换入换出相同的tensor,引起类似[cache thrashing](https://en.wikipedia.org/wiki/Thrashing_(computer_science))的现象。我们利用DNN训练迭代特性,设计了OPT cache换出策略。具体来说,在warmup阶段,我们记录每个tensor被计算设备需要的采样时刻。如果我们需要驱逐一些HOLD tensor,那么我们选择在本设备上最晚被需要的tensor作为受害者。 + + From e2d81eba0d526afec3e51f2212b5d7927bf72361 Mon Sep 17 00:00:00 2001 From: digger yu Date: Thu, 25 May 2023 16:19:41 +0800 Subject: [PATCH 07/26] [nfc] fix typo colossalai/ applications/ (#3831) * fix typo colossalai/autochunk auto_parallel amp * fix typo colossalai/auto_parallel nn utils etc. * fix typo colossalai/auto_parallel autochunk fx/passes etc. * fix typo docs/ * change placememt_policy to placement_policy in docs/ and examples/ * fix typo colossalai/ applications/ --- .../Chat/coati/ray/src/detached_replay_buffer.py | 2 +- .../Chat/coati/ray/src/experience_maker_holder.py | 2 +- applications/Chat/coati/ray/src/pipeline_strategy.py | 2 +- applications/Chat/evaluate/evaluator.py | 2 +- applications/Chat/evaluate/metrics.py | 8 ++++---- .../tensor_shard/node_handler/matmul_handler.py | 2 +- .../auto_parallel/tensor_shard/utils/broadcast.py | 12 ++++++------ 7 files changed, 15 insertions(+), 15 deletions(-) diff --git a/applications/Chat/coati/ray/src/detached_replay_buffer.py b/applications/Chat/coati/ray/src/detached_replay_buffer.py index 855eee48c..18c8db388 100644 --- a/applications/Chat/coati/ray/src/detached_replay_buffer.py +++ b/applications/Chat/coati/ray/src/detached_replay_buffer.py @@ -34,7 +34,7 @@ class DetachedReplayBuffer: ''' Workers in the same tp group share this buffer and need same sample for one step. Therefore a held_sample should be returned tp_world_size times before it could be dropped. 
- worker_state records wheter a worker got the held_sample + worker_state records whether a worker got the held_sample ''' self.tp_world_size = tp_world_size self.worker_state = [False] * self.tp_world_size diff --git a/applications/Chat/coati/ray/src/experience_maker_holder.py b/applications/Chat/coati/ray/src/experience_maker_holder.py index 94e4a3d53..0ae4e3125 100644 --- a/applications/Chat/coati/ray/src/experience_maker_holder.py +++ b/applications/Chat/coati/ray/src/experience_maker_holder.py @@ -22,7 +22,7 @@ from .utils import is_rank_0, get_strategy_from_args, set_dist_env class ExperienceMakerHolder: ''' Args: - detached_trainer_name_list: str list to get ray actor handleskkk + detached_trainer_name_list: str list to get ray actor handles strategy: experience_batch_size: batch size of generated experience kl_coef: the coefficient of kl divergence loss diff --git a/applications/Chat/coati/ray/src/pipeline_strategy.py b/applications/Chat/coati/ray/src/pipeline_strategy.py index 1780839c6..7ecb5d7d8 100644 --- a/applications/Chat/coati/ray/src/pipeline_strategy.py +++ b/applications/Chat/coati/ray/src/pipeline_strategy.py @@ -26,7 +26,7 @@ rpc_is_initialized = _is_current_rpc_agent_set class PipelineModel(torch.nn.Module): ''' Actor has 2 kinds of jobs: forward and generate. - better to just pipelinize the inner model + better to just pipeline the inner model ''' def __init__(self, model: torch.nn.Module, diff --git a/applications/Chat/evaluate/evaluator.py b/applications/Chat/evaluate/evaluator.py index b99509c99..d3d1c038b 100644 --- a/applications/Chat/evaluate/evaluator.py +++ b/applications/Chat/evaluate/evaluator.py @@ -119,7 +119,7 @@ class Evaluator(object): jdump(all_evaluations, os.path.join(evaluation_results_save_path, f"{model_name_list[0]}_evaluation_results.json")) - # Start to calculate scores and save statictics. + # Start to calculate scores and save statistics. evaluation_statistics_save_path = os.path.join(base_save_path, "evaluation_statistics") gpt_evaluate.save_gpt35_evaluation_statistics(model_name_list[0], all_evaluations, evaluation_statistics_save_path) diff --git a/applications/Chat/evaluate/metrics.py b/applications/Chat/evaluate/metrics.py index 590790ae7..5e657234c 100644 --- a/applications/Chat/evaluate/metrics.py +++ b/applications/Chat/evaluate/metrics.py @@ -111,7 +111,7 @@ def calculate_precision_recall_f1(preds: list, targets: list) -> dict: The calculation of precision, recall and f1-score is realized by counting the number f overlaps between the preds and target. The comparison length limited by the shorter one of preds and targets. This design is mainly - considered for classifiction and extraction categories. + considered for classification and extraction categories. """ precision_recall_f1 = {"precision": 0, "recall": 0, "f1_score": 0} precision_scores = [] @@ -138,7 +138,7 @@ def calculate_precision_recall_f1(preds: list, targets: list) -> dict: def precision(preds: list, targets: list) -> dict: """Calculate Precision Metric - (design for classifiction and extraction categories) + (design for classification and extraction categories) Calculating precision by counting the number of overlaps between the preds and target. 
""" @@ -149,7 +149,7 @@ def precision(preds: list, targets: list) -> dict: def recall(preds: list, targets: list) -> dict: """Calculate Recall Metric - (design for classifiction and extraction categories) + (design for classification and extraction categories) Calculating recall by counting the number of overlaps between the preds and target. """ @@ -160,7 +160,7 @@ def recall(preds: list, targets: list) -> dict: def F1_score(preds: list, targets: list) -> dict: """Calculate F1-score Metric - (design for classifiction and extraction categories) + (design for classification and extraction categories) Calculating f1-score by counting the number of overlaps between the preds and target. """ diff --git a/colossalai/auto_parallel/tensor_shard/node_handler/matmul_handler.py b/colossalai/auto_parallel/tensor_shard/node_handler/matmul_handler.py index bfebc3f59..fa51114a5 100644 --- a/colossalai/auto_parallel/tensor_shard/node_handler/matmul_handler.py +++ b/colossalai/auto_parallel/tensor_shard/node_handler/matmul_handler.py @@ -206,7 +206,7 @@ class Broadcaster(BmmTransform): # e.g. [1, 2, 4] x [4, 4, 8] -> [4, 2, 8] # the dim 0 of [1, 2, 4] is multiplied to 4 tensor_shape[dim_idx] = 1 - elif broadcast_type == BroadcastType.PADDDING: + elif broadcast_type == BroadcastType.PADDING: # if the dim is padded # we remove its sharding tensor_shape[dim_idx] = None diff --git a/colossalai/auto_parallel/tensor_shard/utils/broadcast.py b/colossalai/auto_parallel/tensor_shard/utils/broadcast.py index 28aa55132..307348ea1 100644 --- a/colossalai/auto_parallel/tensor_shard/utils/broadcast.py +++ b/colossalai/auto_parallel/tensor_shard/utils/broadcast.py @@ -21,7 +21,7 @@ __all__ = [ class BroadcastType(Enum): EQUAL = auto() - PADDDING = auto() + PADDING = auto() MULTIPLE = auto() @@ -69,18 +69,18 @@ def get_broadcast_dim_info(logical_shape, physical_shape): for i in range(logical_num_dims): # get the trailing dim size logical_dim_idx = logical_num_dims - i - 1 - phyiscal_dim_idx = physical_num_dims - i - 1 + physical_dim_idx = physical_num_dims - i - 1 logical_dim_size = logical_shape[logical_dim_idx] - if phyiscal_dim_idx >= 0: - physical_dim_size = physical_shape[phyiscal_dim_idx] + if physical_dim_idx >= 0: + physical_dim_size = physical_shape[physical_dim_idx] if physical_dim_size == logical_dim_size: logical_dim_broadcast_info[logical_dim_idx] = BroadcastType.EQUAL elif physical_dim_size == 1 and physical_dim_size != logical_dim_size: logical_dim_broadcast_info[logical_dim_idx] = BroadcastType.MULTIPLE else: - logical_dim_broadcast_info[logical_dim_idx] = BroadcastType.PADDDING + logical_dim_broadcast_info[logical_dim_idx] = BroadcastType.PADDING return logical_dim_broadcast_info @@ -117,7 +117,7 @@ def recover_sharding_spec_for_broadcast_shape(logical_sharding_spec: ShardingSpe for shape_dim, mesh_dim in logical_dim_partition.items(): logical_broadcast_type = logical_dim_broadcast_info[shape_dim] - if logical_broadcast_type == BroadcastType.PADDDING or logical_broadcast_type == BroadcastType.MULTIPLE: + if logical_broadcast_type == BroadcastType.PADDING or logical_broadcast_type == BroadcastType.MULTIPLE: removed_dims.extend(mesh_dim) else: # get the corresponding physical dim From d42b1be09d95c00c806e175ec289ce9215904140 Mon Sep 17 00:00:00 2001 From: Frank Lee Date: Thu, 25 May 2023 16:20:07 +0800 Subject: [PATCH 08/26] [release] bump to v0.3.0 (#3830) --- version.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/version.txt b/version.txt index a45be4627..0d91a54c7 100644 --- 
a/version.txt +++ b/version.txt @@ -1 +1 @@ -0.2.8 +0.3.0 From ae959a72a5da236f6656be5fab01a6f8f900097c Mon Sep 17 00:00:00 2001 From: Frank Lee Date: Thu, 25 May 2023 16:42:34 +0800 Subject: [PATCH 09/26] [workflow] fixed workflow check for docker build (#3849) --- .github/workflows/release_docker_after_publish.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/release_docker_after_publish.yml b/.github/workflows/release_docker_after_publish.yml index 7d83fa371..22698ca19 100644 --- a/.github/workflows/release_docker_after_publish.yml +++ b/.github/workflows/release_docker_after_publish.yml @@ -8,7 +8,7 @@ on: jobs: release: name: Publish Docker Image to DockerHub - if: ( github.event_name == 'workflow_dispatch' || github.event.pull_request.merged == true ) && github.repository == 'hpcaitech/ColossalAI' + if: github.repository == 'hpcaitech/ColossalAI' runs-on: [self-hosted, gpu] container: image: "hpcaitech/docker-in-docker:latest" From b0474878bf043cf10ab4ce4b6cbdf54d30b24c8d Mon Sep 17 00:00:00 2001 From: jiangmingyan <1829166702@qq.com> Date: Fri, 26 May 2023 01:22:01 +0800 Subject: [PATCH 10/26] [doc] update nvme offload documents. (#3850) --- docs/source/en/features/nvme_offload.md | 19 +++++++++++------ docs/source/zh-Hans/features/nvme_offload.md | 22 +++++++++++++------- 2 files changed, 28 insertions(+), 13 deletions(-) diff --git a/docs/source/en/features/nvme_offload.md b/docs/source/en/features/nvme_offload.md index d940fd5ec..6ed6f2dee 100644 --- a/docs/source/en/features/nvme_offload.md +++ b/docs/source/en/features/nvme_offload.md @@ -78,8 +78,9 @@ from transformers.models.gpt2.modeling_gpt2 import GPT2LMHeadModel import colossalai from colossalai.nn.optimizer import HybridAdam -from colossalai.zero import zero_model_wrapper, zero_optim_wrapper from colossalai.utils.model.colo_init_context import ColoInitContext +from colossalai.booster import Booster +from colossalai.booster.plugin import GeminiPlugin ``` Then we define a loss function: @@ -192,17 +193,23 @@ def train_gemini_cpu(nvme_offload_fraction: float = 0.0): optimizer = HybridAdam(model.parameters(), nvme_offload_fraction=nvme_offload_fraction) print(f'Model numel: {get_model_numel(model) / 1024**3:.3f} B') - gemini_config = dict(strict_ddp_mode=True, device=torch.cuda.current_device(), - placement_policy='cpu', pin_memory=True, hidden_dim=config.n_embd) - model = zero_model_wrapper(model, zero_stage=3, gemini_config=gemini_config) - optimizer = zero_optim_wrapper(model, optimizer, initial_scale=2**5) + plugin = GeminiPlugin( + strict_ddp_mode=True, + device=torch.cuda.current_device(), + placement_policy='cpu', + pin_memory=True, + hidden_dim=config.n_embd, + initial_scale=2**5 + ) + booster = Booster(plugin) + model, optimizer, criterion, _* = booster.boost(model, optimizer, criterion) start = time.time() for step in range(3): data = get_data(4, 128, config.vocab_size) outputs = model(**data) loss = criterion(outputs.logits, data['input_ids']) - optimizer.backward(loss) + booster.backward(loss, optimizer) optimizer.step() optimizer.zero_grad() print(f'[{step}] loss: {loss.item():.3f}') diff --git a/docs/source/zh-Hans/features/nvme_offload.md b/docs/source/zh-Hans/features/nvme_offload.md index db5f10184..1feb9dde5 100644 --- a/docs/source/zh-Hans/features/nvme_offload.md +++ b/docs/source/zh-Hans/features/nvme_offload.md @@ -55,7 +55,6 @@ optimizer = HybridAdam(model.parameters(), lr=1e-3, nvme_offload_fraction=1.0, n ## Examples -Let's start from two simple examples -- 
training GPT with different methods. These examples relies on `transformers`. 首先让我们从两个简单的例子开始 -- 用不同的方法训练 GPT。这些例子依赖`transformers`。 我们首先应该安装依赖: @@ -77,8 +76,9 @@ from transformers.models.gpt2.configuration_gpt2 import GPT2Config from transformers.models.gpt2.modeling_gpt2 import GPT2LMHeadModel import colossalai from colossalai.nn.optimizer import HybridAdam -from colossalai.zero import zero_model_wrapper, zero_optim_wrapper from colossalai.utils.model.colo_init_context import ColoInitContext +from colossalai.booster import Booster +from colossalai.booster.plugin import GeminiPlugin ``` 然后我们定义一个损失函数: @@ -182,16 +182,24 @@ def train_gemini_cpu(nvme_offload_fraction: float = 0.0): criterion = GPTLMLoss() optimizer = HybridAdam(model.parameters(), nvme_offload_fraction=nvme_offload_fraction) print(f'Model numel: {get_model_numel(model) / 1024**3:.3f} B') - gemini_config = dict(strict_ddp_mode=True, device=torch.cuda.current_device(), - placement_policy='cpu', pin_memory=True, hidden_dim=config.n_embd) - model = zero_model_wrapper(model, zero_stage=3, gemini_config=gemini_config) - optimizer = zero_optim_wrapper(model, optimizer, initial_scale=2**5) + + plugin = GeminiPlugin( + strict_ddp_mode=True, + device=torch.cuda.current_device(), + placement_policy='cpu', + pin_memory=True, + hidden_dim=config.n_embd, + initial_scale=2**5 + ) + booster = Booster(plugin) + model, optimizer, criterion, _* = booster.boost(model, optimizer, criterion) + start = time.time() for step in range(3): data = get_data(4, 128, config.vocab_size) outputs = model(**data) loss = criterion(outputs.logits, data['input_ids']) - optimizer.backward(loss) + booster.backward(loss, optimizer) optimizer.step() optimizer.zero_grad() print(f'[{step}] loss: {loss.item():.3f}') From 2506e275b842103620439b7ddfa7ba9908d4eb7e Mon Sep 17 00:00:00 2001 From: Yuanchen <70520919+chengeharrison@users.noreply.github.com> Date: Tue, 30 May 2023 11:48:41 +0800 Subject: [PATCH 11/26] [evaluation] improvement on evaluation (#3862) * fix a bug when the config file contains one category but the answer file doesn't contains that category * fix Chinese prompt file * support gpt-3.5-turbo and gpt-4 evaluation * polish and update README * resolve pr comments --------- Co-authored-by: Yuanchen Xu --- applications/Chat/evaluate/README.md | 235 ++++++++++++------ .../Chat/evaluate/config/config_cn.json | 20 +- applications/Chat/evaluate/eval.py | 7 +- applications/Chat/evaluate/evaluator.py | 36 ++- applications/Chat/evaluate/gpt_evaluate.py | 150 +++++++++-- .../evaluation_prompt_cn.json | 26 +- applications/Chat/evaluate/utils.py | 3 +- 7 files changed, 335 insertions(+), 142 deletions(-) diff --git a/applications/Chat/evaluate/README.md b/applications/Chat/evaluate/README.md index 1e86eadf1..ae3499bf2 100644 --- a/applications/Chat/evaluate/README.md +++ b/applications/Chat/evaluate/README.md @@ -4,7 +4,9 @@ In this directory, we introduce how you can evaluate your model with our pipelin evaluation of Chinese capability and the one for English capability is under preparation. ## Installation + To start model evaluation, you need to install required packages which listed in `requirements.txt` under `evaluate` folder. + ```shell pip install -r requirements.txt ``` @@ -12,84 +14,92 @@ pip install -r requirements.txt ## Evaluation Pipeline The whole evaluation pipeline consists of two methods: -1. `GPT Evaluation`: evaluates model predictions using the GPT-3.5. + +1. `GPT Evaluation`: evaluates model predictions using GPT models. 
* Compare the performance of two different models (battle). - * Rate model according to pre-defined metrics using prompting design. + * Rate the model according to pre-defined metrics using prompting design. 2. `Automatic Evaluation`: evaluates model predictions using automatic metrics. ### Evaluation Category -The model capability is seperated into 10 evaluation categories, which refers to the user case mentioned in InstructGPT. -Following table introduces each category: -| Evaluation Category | Description | -|:-------------------:|:-----------------------------------------------------------------------------------------------------------------------------------------| -| Roleplay | Given certain characteristic, the capability of chatting as the character | -| Chat | Conduct multiple rounds of dialogue, the capability of understanding and memorization of previous rounds of dialogue | -| Open QA | Given an open question, the capability of answering questions in opened-ended way | -| Closed QA | Given a closed question, the capability of answering questions with limited scope (such as single/multiple choice question) | -| Brainstorming | Given a question requiring divergent answers, the capability of divergent answering and listing in points | -| Generation | Given generation task, the capability of generating in high quality and human-written way (such as writing an email) | -| Rewriting | Given rewriting task, the capability of rewriting sentences to meet task requirements (such as active and passive switches, translation) | -| Classification | Given classification task, the capability of accurate classification | -| Extraction | Given extraction task, the capability of extracting required information | -| Summarization | Given a paragraph or passage, the capability of summarization | +Our evaluation pipeline examines the model's capability using 10 categories of questions. The following table introduces each category: -To better understand each evaluation category, here are some prompt examples provided. +| Evaluation Category |
Description
| +| :-----------------: | :----------------------------------------------------------- | +| Brainstorming | Models are asked to generate a range of creative and diverse ideas according to the question. The capability of creativity is required. | +| Chat | Models are asked to continue a multi-round dialogue given the roles involved. The capability of understanding, memorizing previous rounds of the dialogue and answering according to the persona provided is required. | +| Classification | Models are asked to do classification tasks. The capability of accurate classification is required. | +| Closed QA | Models are asked to answer a closed QA question. The capability of answering questions with limited scope (such as single/multiple choice question) is required. | +| Extraction | Models are asked to extract information from a given material. The capability of extracting required information is required. | +| Generation | Models are asked to generate an email, letter, article, etc. The capability of generating texts in a high quality and human-written way is required. | +| Open QA | Models are asked to answer an open QA question(without context provided). The capability of answering questions with the models' own knowledge base is required. | +| Roleplay | Models are asked to play the role provided. The capability of engaging in the scenario and effectively interacting with the user is required. | +| Rewriting | Models are asked to do rewriting tasks such as translation and grammar correction. The capability of rewriting according to different instructions is required. | +| Summarization | Models are asked to summarize the given paragraph or passage. The capability of summarization is required. | + +To better understand each evaluation category, here are some example questions provided. -| Evaluation Category | Chinese Example | English Example | -|:-------------------:|:------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| Roleplay | **Example 1:**
我想让你担任Android开发工程师面试官。我将成为候选人,您将向我询问Android开发工程师职位的面试问题。我希望你只作为面试官回答。不要一次写出所有的问题。我希望你只对我进行采访。问我问题,等待我的回答。不要写解释。像面试官一样一个一个问我,等我回答。我的第一句话是“面试官你好”。

**Example 2:**
我想让你扮演讲故事的角色。你会想出引人入胜、富有想象力和吸引观众的有趣故事。它可以是童话故事、教育故事或任何其他类型的有潜力的故事以吸引人们的注意力和想象力。根据目标受众,您可以为您的讲故事环节选择特定的主题或主题,例如,如果是儿童,那么您可以谈论动物;如果是成人,那么基于历史的故事可能会更好地吸引他们等。我的第一个请求是我需要一个关于毅力的有趣故事。 | **Example 1:**
I want you to act as a journalist. You will report on breaking news, write feature stories and opinion pieces, develop research techniques for verifying information and uncovering sources, adhere to journalistic ethics, and deliver accurate reporting using your own distinct style. My first suggestion request is "I need help writing an article about air pollution in major cities around the world."

**Example 2:**
I want you to act as a debater. I will provide you with some topics related to current events and your task is to research both sides of the debates, present valid arguments for each side, refute opposing points of view, and draw persuasive conclusions based on evidence. Your goal is to help people come away from discussion with increased knowledge and insight into the topic at hand. My first request is "I want an opinion piece about Deno." | -| Chat | **Example 1:**
基于以下角色信息完成一段对话。小王,20岁,是一名大学生;小张,22岁,是一名大学生。
小王:小张,我们一起去看电影吧? 小张:好啊,你想看什么电影呢? 小王:我听说最近有一部很好看的科幻电影。你觉得怎么样?
小张:

**Example 2:**
基于以下角色信息完成一段对话。小明是一名医生,一位老年病患者想要停药,但他对病情有所忽视并有担忧;王叔叔是老年病患者的儿子,希望能够听取医生的建议。
小明:你好,王叔叔,我了解你想要让你父亲停药。
王叔叔:是的,我父亲已经吃了那么久的药,我担心药物对他的身体会有副作用。
小明: | **Example 1:**
Human: Who is the highest try scorer in the current Sydney Roosters team?
AI: As an AI language model, I don't have access to current statistics. However, you can check the current Sydney Roosters team's official website or relevant sports news websites to find the answer.
Human: Thanks for the suggestion, but can you recommend a reliable sports news website that I can check?
AI:

**Example 2:**
Complete a dialogue based on the following role information.
A: Elementary student B: Teacher
B: Good morning, Student A. Today we're going to learn about addition and subtraction.
A: Teacher, I already know this very well. Why do I need to learn it again?
B: | -| Open QA | **Example 1:**
请问万有引力定律由谁提出的?

**Example 2:**
哪些国家参与了第一次世界大战? | **Example 1:**
Who are the indigenous people of New Zealand?

**Example 2:**
How do you take the derivative of the sin function? | -| Closed QA | **Example 1:**
请从以下选项中选择正确答案。以下哪个是世界上最高山峰?
A. 长城
B. 泰山
C. 珠穆朗玛峰
D. 黄山

**Example 2:**
请从以下选项中选择一个最佳答案回答下面的问题。问题:非洲最高的山是哪座山?
选项:
A. 麦金利山
B. 喜马拉雅山
C. 乞力马扎罗山 | **Example 1:**
Answer the following question:
What shape is the Earth?
A) A circle
B) A sphere
C) An ellipse
D) A plane

**Example 2:**
Choose the correct classification for the following question:
"What type of restaurant is 'Burger King'"?
fast food
family style
formal dining
buffet
| -| Brainstorming | **Example 1:**
请介绍一下人工智能的多个领域。

**Example 2:**
请给出管理家庭财务的3个小技巧。
| **Example 1:**
What are 10 science fiction books I should read next?

**Example 2:**
List five ideas for how to regain enthusiasm for my career. | -| Generation | **Example 1:**
请撰写一篇文章,介绍如何通过改善生活习惯来预防疾病和延长寿命。

**Example 2:**
请根据以下情节撰写一篇短篇小说:一名年轻人被困在一个荒岛上,他必须想办法生存下去直到被救援。但他很快发现自己并不孤单。 | **Example 1:**
Can you help me write a formal email to a potential business partner proposing a joint venture?

**Example 2:**
Please use the appropriate format to write a formal letter of recommendation for a student applying to a prestigious computer science graduate program at a university. | -| Rewriting | **Example 1:**
将以下句子改为被动语态:
"他们正在洗车"

**Example 2:**
将以下文本翻译成英语:
“这个周末我要去海边玩” | **Example 1:**
Translate the following text into English:
"我最喜欢的季节是春天,因为我可以看到美丽的花朵。"

**Example 2:**
Please correct the following sentences and give them the correct sentence.
"Their going to the party there." | -| Classification | **Example 1:**
新闻标题:今日立夏,有一上联,立夏万物并秀,下联怎么对?
请根据以上新闻标题判断新闻所属的分类,你需要从文化,娱乐,体育,财经,房产,教育,科技,旅游,游戏,军事这十类中选择一个答案。

**Example 2:**
新闻标题:赵丽颖很久没有登上微博热搜了,但你们别急,她只是在憋大招而已。
请根据新闻标题判断新闻所属的分类,你需要从文化,娱乐,体育,财经,房产,教育,科技,旅游,游戏,军事这十类中选择一个答案。 | **Example 1:**
Classify the given email as spam or non-spam.
"Hello, this is an email reminding you to pay your property fees"

**Example 2:**
Classify the following text as news, ads or forum posts
"The latest iPhone 13 is now available, shop now!" | -| Extraction | **Example 1:**
根据以下新闻文本,提取新闻报道时间,例如回答时按照格式“新闻报道时间:2007年8月10日”
新闻文本如下:2007-4-7中新网4月7日电据中国消防在线消息,4月4日晚上7时30分左右,湖南长潭高速公路上发生一起6车连环相撞失火事故。长株潭三地消防部门共出动消防车21台,警力100余人。经过消防官兵近2个小时奋力扑救,大火被成功扑灭。据初步调查,有1人在此次事故中死亡。

**Example 2:**
根据以下新闻文本,提取新闻报道时间,例如回答时按照格式“新闻报道时间:2007年8月10日”
新闻文本如下:2014年1月15日,据外媒《俄罗斯报》报道称,位于北半球的澳大利亚现在正处于炎热的夏季,而近日也到了高温酷暑的时候,当地时间1月14日晚,澳大利亚南部一夜间发生至少250起火灾。受炎热天气及雷雨天气影响,澳大利亚南部一夜间发生至少250起火灾,灾情多集中在维多利亚州。火灾发生后,救援人员立即展开救灾行动。目前,大部分起火点火势已被控制。 | **Example 1:**
Extract all phenotypes of the following text:
"The 55-year-old patient has fever and hypertension."

**Example 2:**
Extract the location mentioned in the following text:
"The student graduated from Harvard university, which is located in Boston" | -| Summarization | **Example 1:**
请简要总结概括以下段落材料。
新华社快讯:斯里兰卡政府部门21日说,首都科伦坡包括教堂、酒店等多个地点当天发生的爆炸已导致至少70人死亡,另有260多人受伤。

**Example 2:**
请简要总结概括以下段落材料。
近期,参与京雄高铁站站房建设的中铁十二局,因在施工过程中存在环境违法行为被雄安新区公开通报。通报发出后,引起社会广泛关注。近日,人民网记者从雄安新区相关部门及中铁十二局获悉,新区有关部门已经集中约谈了中铁十二局等24个参与雄安建设的项目单位。对于约谈内容和结果,中铁十二局有关宣传负责人回应:“具体内容不清楚,最好找雄安新区相关部门了解情况。”新区有关部门负责人表示,此前涉及的环境违法行为,中铁十二局已基本整改到位,但约谈内容和结果暂不公开,接下来,将按部就班推进环境治理工作。(原题为《雄安新区:中铁十二局涉环境违法已基本整改到位》) | **Example 1:**
Please provide a summary based on the following news:
"China plans to launch its first space station core module in 2022, an important development in the country's space program. The space station, called Tianhe, will include three modules: a core module, an experiment module and an astronomy module. The first launch of the core module will be used to test and verify the basic functions of the station, as well as to conduct related scientific research and technology experiments. "

**Example 2:**
What information is provided in the table below? Summarize the core information in it?
"Ranking, Player Name, Team, Position, Salary (in millions of dollars)
1, LeBron James, Los Angeles Lakers, SF, 45.0
2, Stephen Curry, Golden State Warriors, PG, 43.5" | +| Evaluation Category |
Chinese Example
|
English Example
| +| :-----------------: | :----------------------------------------------------------- | :----------------------------------------------------------- | +| Brainstorming | **Example 1:**
请介绍一下人工智能的多个领域。

**Example 2:**
请给出管理家庭财务的3个小技巧。
| **Example 1:**
How can I improve my memory? Any useful techniques you can suggest?

**Example 2:**
What are some ways to increase productivity while working from home? | +| Chat | **Example 1:**
基于以下角色信息完成一段对话。小张是一名新手爱好者,对养鸡有浓厚的兴趣。老李是一名有丰富经验的养鸡大师。
小张:您好,老李,我最近开始对养鸡感兴趣了,想请教您一些问题。
老李:你好,小张,我很乐意帮助你。你想问些什么?
小张:我想知道如何确定鸡的品种和性别?
老李:确切的品种可以通过鸡的外貌特征来确定,而性别一般是通过鸡卵的大小和形状来判断。还有什么问题吗?
小张:
**Example 2:**
基于以下角色信息完成一段对话。小明是一名医生,一位老年病患者想要停药,但他对病情有所忽视并有担忧;王叔叔是老年病患者的儿子,希望能够听取医生的建议。
小明:你好,王叔叔,我了解你想要让你父亲停药。
王叔叔:是的,我父亲已经吃了那么久的药,我担心药物对他的身体会有副作用。
小明: | **Example 1:**
Complete a conversation based on the following character information. Amy is a 30-year-old chef who runs her own restaurant. Jack is a food blogger who specializes in reviewing local restaurants.
Amy: Hi Jack, I heard that you're a food blogger. Nice to meet you.
Jack: Hi Amy, yes I am. Your restaurant has been receiving a lot of good reviews lately.
Amy: Yes, we use only fresh and quality ingredients, and every dish is carefully crafted.
Jack:
**Example 2:**
Complete a dialogue based on the following role information. A: Elementary student B: Teacher
B: Good morning, Student A. Today we're going to learn about addition and subtraction.
A: Teacher, I already know this very well. Why do I need to learn it again?
B: | +| Classification | **Example 1:**
新闻标题:今日立夏,有一上联,立夏万物并秀,下联怎么对?
请根据以上新闻标题判断新闻所属的分类,你需要从文化,娱乐,体育,财经,房产,教育,科技,旅游,游戏,军事这十类中选择一个答案。

**Example 2:**
新闻标题:赵丽颖很久没有登上微博热搜了,但你们别急,她只是在憋大招而已。
请根据新闻标题判断新闻所属的分类,你需要从文化,娱乐,体育,财经,房产,教育,科技,旅游,游戏,军事这十类中选择一个答案。 | **Example 1:**
Title: Fighting for Love (2020)
Description: Jasmine got obsessed with a man and now he's obsessed with her. Steamy nights, kisses and rules being broken awaits them. She turned his whole world upside down and now he's doing it to hers. In this free fall, can they survive each others love?\"
Based on the above information, determine which genre the work of art belongs to. You can only choose one from \"sport\", \"horror\", \"drama\", \"history\", \"romance\", \"biography\", \"science fiction\", \"comedy\", \"animation\", \"documentary\", \"music\" and \"news\".

 **Example 2:**
Title: Summer Breeze: The Isley Brothers Greatest Hits Live (2005)
Description: Filmed in the US in 2005 and captured in excellent form led by Ron Isley's vocals and Ernie Isley's hard edged guitar. Virtually every track is a hit including Shout, Who's That Lady, Twist And Shout, Summer Breeze and Harvest For The World.
Based on the above information, determine which genre the work of art belongs to. You can only choose one from \"sport\", \"horror\", \"drama\", \"history\", \"romance\", \"biography\", \"science fiction\", \"comedy\", \"animation\", \"documentary\", \"music\" and \"news\"." | +| Closed QA | **Example 1:**
请从以下选项中选择正确答案。以下哪个是世界上最高山峰?
A. 长城
B. 泰山
C. 珠穆朗玛峰
D. 黄山

**Example 2:**
请从以下选项中选择一个最佳答案回答下面的问题。问题:非洲最高的山是哪座山?
选项:
A. 麦金利山
B. 喜马拉雅山
C. 乞力马扎罗山 | **Example 1:**
Which of the following options is NOT a primary color?
(a) yellow
(b) blue
(c) orange
(d) red
**Example 2:**
Choose the correct option to complete the following sentence: \"Harry Potter and the Chamber of Secrets\" is the ________ book in the Harry Potter series.
(A) first
(B) second
(C) third
(D) fourth | +| Extraction | **Example 1:**
根据以下新闻文本,提取新闻报道时间,例如回答时按照格式“新闻报道时间:2007年8月10日”
新闻文本如下:2007-4-7中新网4月7日电据中国消防在线消息,4月4日晚上7时30分左右,湖南长潭高速公路上发生一起6车连环相撞失火事故。长株潭三地消防部门共出动消防车21台,警力100余人。经过消防官兵近2个小时奋力扑救,大火被成功扑灭。据初步调查,有1人在此次事故中死亡。

**Example 2:**
根据以下新闻文本,提取新闻报道时间,例如回答时按照格式“新闻报道时间:2007年8月10日”
新闻文本如下:2014年1月15日,据外媒《俄罗斯报》报道称,位于北半球的澳大利亚现在正处于炎热的夏季,而近日也到了高温酷暑的时候,当地时间1月14日晚,澳大利亚南部一夜间发生至少250起火灾。受炎热天气及雷雨天气影响,澳大利亚南部一夜间发生至少250起火灾,灾情多集中在维多利亚州。火灾发生后,救援人员立即展开救灾行动。目前,大部分起火点火势已被控制。 | **Example 1:**
Ernest Hemingway, an American literary giant known for his spare and direct writing style, has penned timeless works such as 'The Old Man and the Sea', 'For Whom the Bell Tolls', and 'A Farewell to Arms', which have made a profound impact on the literary world and continue to be widely read and admired today.
Extract the name of the author mentioned above.

**Example 2:**
In the epic fantasy series 'A Song of Ice and Fire', George R.R. Martin weaves a complex web of political intrigue, war, and magic across the fictional continents of Westeros and Essos. Martin's richly developed characters and intricate plotlines have captivated readers worldwide, much like his other acclaimed works such as 'A Clash of Kings' and 'A Storm of Swords'.
Extract the name of the author in the above material. | +| Generation | **Example 1:**
请撰写一篇文章,介绍如何通过改善生活习惯来预防疾病和延长寿命。

**Example 2:**
请根据以下情节撰写一篇短篇小说:一名年轻人被困在一个荒岛上,他必须想办法生存下去直到被救援。但他很快发现自己并不孤单。 | **Example 1:**
Write a descriptive paragraph about an island to relax and unwind, including details about the location and atmosphere.

**Example 2:**
Can you help me write a persuasive email to my colleagues encouraging them to participate in a charitable fundraising event? | +| Open QA | **Example 1:**
请问万有引力定律由谁提出的?

**Example 2:**
哪些国家参与了第一次世界大战? | **Example 1:**
What are the four basic tastes of the human palate?

**Example 2:**
Who painted the The Scream? | +| Rewriting | **Example 1:**
请将以下句子改为正确的语序。
生日快乐你祝他了吗?

**Example 2:**
将以下文本翻译成英语:
“这个周末我要去海边玩” | **Example 1:**
Please translate the following sentences, which are a mixture of Chinese and English, into full English.
我需要买一些healthy snacks,比如nuts和dried fruits,作为我的office的午餐.

**Example 2:**
Please rewrite the sentence using an inverted sentence structure.
We won't begin our journey until the sun sets. | +| Roleplay | **Example 1:**
我想让你担任Android开发工程师面试官。我将成为候选人,您将向我询问Android开发工程师职位的面试问题。我希望你只作为面试官回答。不要一次写出所有的问题。我希望你只对我进行采访。问我问题,等待我的回答。不要写解释。像面试官一样一个一个问我,等我回答。我的第一句话是“面试官你好”。

**Example 2:**
我想让你扮演讲故事的角色。你会想出引人入胜、富有想象力和吸引观众的有趣故事。它可以是童话故事、教育故事或任何其他类型的有潜力的故事以吸引人们的注意力和想象力。根据目标受众,您可以为您的讲故事环节选择特定的主题或主题,例如,如果是儿童,那么您可以谈论动物;如果是成人,那么基于历史的故事可能会更好地吸引他们等。我的第一个请求是我需要一个关于毅力的有趣故事。 | **Example 1:**
Assume the role of a marriage counselor. Develop a series of communication exercises for a couple who are experiencing difficulties in their relationship. These exercises should promote active listening, empathy, and effective expression of emotions. Your first assignment is to provide a set of three exercises that focus on resolving conflicts and rebuilding trust.

 **Example 2:**
I want you to act as a travel agent. I will tell you my desired destination, travel dates, and budget, and it will be your job to suggest the best travel itinerary for me. Your recommendations should include the best transportation options, hotel accommodations, and any popular tourist attractions nearby. My first request is "I want to plan a trip to Tokyo for a week, with a budget of $2000. I want to explore the culture and food of the city." | +| Summarization | **Example 1:**
请简要总结概括以下段落材料。
当地时间29日,泰国卫生部通报,新增143名新冠肺炎确诊病例和1名死亡病例。截止到当地时间29日上午,泰国累计确诊病例1388例,其中泰国籍1172例,非泰国籍216例。死亡病例累计7例。(原题为《泰国新增143例新冠肺炎确诊病例累计确诊1388例》)

**Example 2:**
请简要总结概括以下段落材料。
近期,参与京雄高铁站站房建设的中铁十二局,因在施工过程中存在环境违法行为被雄安新区公开通报。通报发出后,引起社会广泛关注。近日,人民网记者从雄安新区相关部门及中铁十二局获悉,新区有关部门已经集中约谈了中铁十二局等24个参与雄安建设的项目单位。对于约谈内容和结果,中铁十二局有关宣传负责人回应:“具体内容不清楚,最好找雄安新区相关部门了解情况。”新区有关部门负责人表示,此前涉及的环境违法行为,中铁十二局已基本整改到位,但约谈内容和结果暂不公开,接下来,将按部就班推进环境治理工作。(原题为《雄安新区:中铁十二局涉环境违法已基本整改到位》) | **Example 1:**
The 21 year-old-woman was treated by paramedics after the kitchen fire in Botfield Road in Shifnal, Shropshire. West Mercia Police said it is treating Wednesday morning's incident as arson and are appealing for any witnesses to contact them.The 50-year-old man has been arrested on suspicion of arson with intent to endanger life. For more on this and other stories from Shropshire.
Please briefly summarize the above material within 20 words.

**Example 2:**
South Wales Police were called to a property in Heolgerrig, Merthyr Tydfil, at about 13:40 BST on Sunday. The child was airlifted to Prince Charles Hospital but died shortly afterwards. Police are investigating the circumstances surrounding the incident and have appealed for witnesses. The girl's family are being supported by specially trained officers.
Please briefly summarize the above material within 20 words. | ### Evaluation Metrics + #### GPT Evaluation -Use GPT-3.5 to evaluate the prediction of different models, and pre-define evaluation metrics for different categories. There are 10 pre-defined evaluation metrics and you can refer to the table below: -| Evaluation Metric | Prompt Words | CoT | -|:-----------------------:|:-------------------------------------------------------------|:--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| Language organization | 语言组织(1-5):答案语言是否流畅、连贯,使用正确的语法,具有一定逻辑性,使用恰当的连接词、过渡词等等。 | 1. 阅读答案,并检查是否有语法错误、用词不当或其他显著的错误。
2.检查答案是否具有逻辑性,能够按照合理的顺序传达信息并且能够自圆其说
3. 确定答案是否与问题或主题相关,并且能够传达清晰的信息。
4. 检查答案是否连贯,是否使用适当的转换和过渡来保持句子和段落之间的连贯性。
5. 检查答案是否具有明确的结构和组织方式,使得读者可以轻松理解信息的层次和结构。
6. 根据以上因素综合评估答案的语言组织,并给出一个1到5的分数,其中5表示语言组织非常好,而1表示语言组织非常差。 | -| Relevance | 切题(1-5):答案内容是否切题,不答非所问,并且严格遵照题目要求。 | 1. 阅读题目,确定题目所问的问题是什么,以及需要回答哪些方面的问题。
2. 阅读答案,确认答案是否直接回答了题目所问的问题。
3. 检查答案是否严格遵照了题目的要求,包括答题方式、答题长度、答题格式等等。
4. 根据以上因素综合评估答案的切题程度,并给出一个1到5的分数,其中5表示答案非常切题,而1表示答案完全没有切题。 | -| Creativity | 创意性(1-5):某些头脑风暴问题可能需要答案具有创意,提出新的思路。 | 1. 仔细阅读所提供的头脑风暴问题,确保你理解问题的要点和背景。
2. 根据你的知识和经验,判断所提供的答案是否可行。如果答案不可行,则创意性评分可能会受到影响。
3. 考虑答案中是否包含新颖的想法或独特的思路。答案可能与已知的解决方案有所重叠,但仍然可以被认为是有创意的,只要它提供了新的角度或方法来解决问题。
4. 根据答案的创意性,给出一个1到5的评分。如果答案缺乏创意,则应给出一个较低的评分。如果答案具有创意并提供了新的思路,应给出一个较高的评分。 | -| Practicality | 实用性(1-5):某些头脑风暴问题可能需要答案提出实用的建议或解决方法。 | 1. 仔细阅读所提供的头脑风暴问题,确保你理解问题的要点和背景。
2. 根据你的知识和经验,判断所提供的答案是否可行。如果答案不可行,则实用性评分可能会受到影响。
3. 考虑答案中提出的建议或解决方法是否实用并可行。答案可能看起来很好,但如果无法实现或应用,则实用性评分可能会受到影响。
4. 根据答案的实用性,给出一个1到5的评分。如果答案缺乏实用性,则应给出一个较低的评分。如果答案提出了实用的建议或解决方法,并且可以很好地解决问题,则应给出一个较高的评分。 | -| Correctness | 正确性(1-5):答案应该符合常识、生活实际等等 | 1. 仔细阅读所提供的头脑风暴问题,确保你理解问题的要点和背景。
2. 根据你的知识和经验,判断所提供的答案是否可行。如果答案不可行,则正确性评分可能会受到影响。
3. 考虑答案中所提供的信息是否正确、符合常识、生活实际等等。如果答案中存在明显的错误或不合理之处,则正确性评分可能会受到影响。
4. 根据答案的正确性,给出一个1到5的评分。如果答案存在明显的错误或不合理之处,则应给出一个较低的评分。如果答案正确、符合常识、生活实际等等,则应给出一个较高的评分。 | -| Naturalness | 自然(1-5):答案是否自然,并且符合问题给定的身份。 | 1. 阅读题目,确定题目提供的身份信息。
2. 检查答案内容是否符合题目给定的身份。
3. 根据以上因素,对该回答的自然性进行打分,分数从1到5,其中1表示不自然,5表示非常自然,并符合问题给定的身份。 | -| Engagingness | 参与感(1-5):答案是否对前面的对话内容做出了恰当的反应,是否理解对话的语境和背景。 | 1. 阅读题目,确定对话的语境和背景。
2. 检查答案是否充分理解对话的语境和背景,能否自然地融入到对话中而不显得突兀。
3. 根据以上因素,对该回答的参与感进行打分,分数从1到5,其中1表示没有参与感,5表示非常有参与感,并且恰当地理解了对话的语境和背景。 | -| Reasonableness | 合理性(1-5):答案是否能够与前面的对话内容形成逻辑上的衔接,是否符合常理,能否在这个上下文中合理存在。 | 1. 阅读题目,确定对话的主题以及问题期望的回答方向。
2. 判断答案是否能够与前面的对话内容形成逻辑上的衔接,是否符合常理,能否在这个上下文中合理存在。
3. 根据以上因素,对该回答的合理性进行打分,分数从1到5,其中1表示不合理,5表示非常合理,并且能够与前面的对话内容形成逻辑上的衔接,并符合常理。 | -| Diversity | 多样性(1-5):答案使用语言是否优美,具有有一定的创造性和想象力。然而,回答也应该保持合理和适度,不要过于夸张或离题。 | 1. 仔细阅读整个回答,确保完全理解回答所表达的内容和主题。
2. 在阅读回答的同时,注意语言的质量,例如措辞是否正确,语言是否生动等。
3. 检查回答的创造性和想象力,看看回答是否能够吸引人阅读下去。
4. 检查回答的合理性和适度,看看回答是否夸张或离题。5. 将多样性的评分打分在1到5之间,5分表示回答的质量很好,能够吸引人阅读,1分表示回答的内容生硬或者有离题的问题。 | -| Fidelity | 保真度(1-5):答案是否能够严格遵守角色的设定回答给定的请求。 | 1. 仔细阅读问题,了解角色在问题中的设定和表现,包括职业、背景、观点、性格等方面。
阅读题目的请求,确认回答请求时需要注意的细节。
3. 对比提供的回答与该角色的设定,评估回答是否能够严格遵守角色的设定。
4. 结合以上评估结果给出保真度的评分,范围从1到5分,其中1分表示回答与角色设定完全不符,5分表示回答完全符合角色设定且满足给定请求。 | -| Conciseness | 简明扼要(1-5):答案是否简明扼要,没有冗余内容。 | 1. 阅读题目,提取出材料的重点。
2. 阅读该总结,并注意其中的主要观点和信息。
3. 评估总结的长度。一个简明扼要的总结通常应该在几句话或几段文字内传达关键信息,而不是冗长的段落或文章。
4. 检查总结是否包含与主要观点无关的信息或冗余信息。
5. 确定总结涵盖了材料中的关键信息,并且没有忽略任何重要细节。
6. 给总结打出1-5的分数,其中5表示总结简明扼要,没有冗余内容,而1表示总结冗长或包含不必要的信息,难以理解或记忆。根据您的判断,打出适当的得分。 | +GPT evaluation uses GPT models to evaluate the prediction of different models and different pre-defined evaluation metrics are applied to different categories. The following table shows the 11 pre-defined evaluation metrics in Chinese: -GPT-3.5 evaluates the quality of model predictions based on the given prompt words and gives a score between 1-5. +| Evaluation Metric |
Prompt Words
|
CoT(Chain-of-Thought)
| +| :-------------------: | :----------------------------------------------------------- | :----------------------------------------------------------- | +| Language organization | 语言组织(1-5):答案语言是否流畅、连贯,使用正确的语法,具有一定逻辑性,使用恰当的连接词、过渡词等等。 | 1. 阅读答案,并检查是否有语法错误、用词不当或其他显著的错误。
2.检查答案是否具有逻辑性,能够按照合理的顺序传达信息并且能够自圆其说
3. 确定答案是否与问题或主题相关,并且能够传达清晰的信息。
4. 检查答案是否连贯,是否使用适当的转换和过渡来保持句子和段落之间的连贯性。
5. 检查答案是否具有明确的结构和组织方式,使得读者可以轻松理解信息的层次和结构。
6. 根据以上因素综合评估答案的语言组织,并给出一个1到5的分数,其中5表示语言组织非常好,而1表示语言组织非常差。 | +| Relevance | 切题(1-5):答案内容是否切题,不答非所问,并且严格遵照题目要求。 | 1. 阅读题目,确定题目所问的问题是什么,以及需要回答哪些方面的问题。
2. 阅读答案,确认答案是否直接回答了题目所问的问题。
3. 检查答案是否严格遵照了题目的要求,包括答题方式、答题长度、答题格式等等。
4. 根据以上因素综合评估答案的切题程度,并给出一个1到5的分数,其中5表示答案非常切题,而1表示答案完全没有切题。 | +| Creativity | 创意性(1-5):某些头脑风暴问题可能需要答案具有创意,提出新的思路。 | 1. 仔细阅读所提供的头脑风暴问题,确保你理解问题的要点和背景。
2. 根据你的知识和经验,判断所提供的答案是否可行。如果答案不可行,则创意性评分可能会受到影响。
3. 考虑答案中是否包含新颖的想法或独特的思路。答案可能与已知的解决方案有所重叠,但仍然可以被认为是有创意的,只要它提供了新的角度或方法来解决问题。
4. 根据答案的创意性,给出一个1到5的评分。如果答案缺乏创意,则应给出一个较低的评分。如果答案具有创意并提供了新的思路,应给出一个较高的评分。 | +| Practicality | 实用性(1-5):某些头脑风暴问题可能需要答案提出实用的建议或解决方法。 | 1. 仔细阅读所提供的头脑风暴问题,确保你理解问题的要点和背景。
2. 根据你的知识和经验,判断所提供的答案是否可行。如果答案不可行,则实用性评分可能会受到影响。
3. 考虑答案中提出的建议或解决方法是否实用并可行。答案可能看起来很好,但如果无法实现或应用,则实用性评分可能会受到影响。
4. 根据答案的实用性,给出一个1到5的评分。如果答案缺乏实用性,则应给出一个较低的评分。如果答案提出了实用的建议或解决方法,并且可以很好地解决问题,则应给出一个较高的评分。 | +| Correctness | 正确性(1-5):答案应该符合常识、生活实际等等 | 1. 仔细阅读所提供的头脑风暴问题,确保你理解问题的要点和背景。
2. 根据你的知识和经验,判断所提供的答案是否可行。如果答案不可行,则正确性评分可能会受到影响。
3. 考虑答案中所提供的信息是否正确、符合常识、生活实际等等。如果答案中存在明显的错误或不合理之处,则正确性评分可能会受到影响。
4. 根据答案的正确性,给出一个1到5的评分。如果答案存在明显的错误或不合理之处,则应给出一个较低的评分。如果答案正确、符合常识、生活实际等等,则应给出一个较高的评分。 | +| Naturalness | 自然(1-5):答案是否自然,并且符合问题给定的身份。 | 1. 阅读题目,确定题目提供的身份信息。
2. 检查答案内容是否符合题目给定的身份。
3. 根据以上因素,对该回答的自然性进行打分,分数从1到5,其中1表示不自然,5表示非常自然,并符合问题给定的身份。 | +| Engagingness | 参与感(1-5):答案是否对前面的对话内容做出了恰当的反应,是否理解对话的语境和背景。 | 1. 阅读题目,确定对话的语境和背景。
2. 检查答案是否充分理解对话的语境和背景,能否自然地融入到对话中而不显得突兀。
3. 根据以上因素,对该回答的参与感进行打分,分数从1到5,其中1表示没有参与感,5表示非常有参与感,并且恰当地理解了对话的语境和背景。 | +| Reasonableness | 合理性(1-5):答案是否能够与前面的对话内容形成逻辑上的衔接,是否符合常理,能否在这个上下文中合理存在。 | 1. 阅读题目,确定对话的主题以及问题期望的回答方向。
2. 判断答案是否能够与前面的对话内容形成逻辑上的衔接,是否符合常理,能否在这个上下文中合理存在。
3. 根据以上因素,对该回答的合理性进行打分,分数从1到5,其中1表示不合理,5表示非常合理,并且能够与前面的对话内容形成逻辑上的衔接,并符合常理。 | +| Diversity | 多样性(1-5):答案使用语言是否优美,具有有一定的创造性和想象力。然而,回答也应该保持合理和适度,不要过于夸张或离题。 | 1. 仔细阅读整个回答,确保完全理解回答所表达的内容和主题。
2. 在阅读回答的同时,注意语言的质量,例如措辞是否正确,语言是否生动等。
3. 检查回答的创造性和想象力,看看回答是否能够吸引人阅读下去。
4. 检查回答的合理性和适度,看看回答是否夸张或离题。5. 将多样性的评分打分在1到5之间,5分表示回答的质量很好,能够吸引人阅读,1分表示回答的内容生硬或者有离题的问题。 | +| Fidelity | 保真度(1-5):答案是否能够严格遵守角色的设定回答给定的请求。 | 1. 仔细阅读问题,了解角色在问题中的设定和表现,包括职业、背景、观点、性格等方面。
阅读题目的请求,确认回答请求时需要注意的细节。
3. 对比提供的回答与该角色的设定,评估回答是否能够严格遵守角色的设定。
4. 结合以上评估结果给出保真度的评分,范围从1到5分,其中1分表示回答与角色设定完全不符,5分表示回答完全符合角色设定且满足给定请求。 | +| Conciseness | 简明扼要(1-5):答案是否简明扼要,没有冗余内容。 | 1. 阅读题目,提取出材料的重点。
2. 阅读该总结,并注意其中的主要观点和信息。
3. 评估总结的长度。一个简明扼要的总结通常应该在几句话或几段文字内传达关键信息,而不是冗长的段落或文章。
4. 检查总结是否包含与主要观点无关的信息或冗余信息。
5. 确定总结涵盖了材料中的关键信息,并且没有忽略任何重要细节。
6. 给总结打出1-5的分数,其中5表示总结简明扼要,没有冗余内容,而1表示总结冗长或包含不必要的信息,难以理解或记忆。根据您的判断,打出适当的得分。 | + +GPT models evaluate the quality of model predictions based on the given prompt words and gives a score between 1-5. #### Automatic Evaluation + Automated metrics evaluate the capability of a model by comparing model predictions with reference answers. There are two ways to obtain reference answers: + * For instruction coming from human-designed problems, the reference answers are generated by GPT-3.5, such as roleplay, chat. * For instruction related with classic NLP problems, the reference answers are collected from open-sourced dataset with target answers, such as classification, extraction, summarization. There are 5 types of automatic evaluation metrics listed in the table below: - | Automatic Evaluation Metric | Description | -|:-----------------------------------:|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| BLEU-n | Measure the accuracy between prediction and reference.
BLEU-1 (Unigram) evaluates accuracy in word level
BLEU-n (n-gram) evaluate the fluency in sentence level. | +| Automatic Evaluation Metric |
Description
| +| :---------------------------------: | :----------------------------------------------------------- | +| BLEU-n | Measure the accuracy between prediction and reference.
 BLEU-1 (Unigram) evaluates accuracy at the word level.
 BLEU-n (n-gram) evaluates the fluency at the sentence level. |
| ROUGE | ROUGE-N measures the number of matching n-grams between prediction and reference.
ROUGE-L measures the number of matching longest common subsequence (LCS) between prediction and reference. | -| Distinct | Measure the diversity of generation text by counting the unique n-grams. | -| BERTScore | Measure the semantic similarity between tokens of predictions and references with BERT. | -| Precision
Recall
F1 Score | Measure the number of overlaps between prediction and reference (design for classification and extraction categories) | +| Distinct | Measure the diversity of generation text by counting the unique n-grams. | +| BERTScore | Measure the semantic similarity between tokens of predictions and references with BERT. | +| Precision
Recall
F1 Score | Measure the number of overlaps between prediction and reference (design for classification and extraction categories). | ## Evaluation Process + ### Data Format + #### Target Answers / Predictions + A JSON file contains one list. Each element in the list is a target answer / prediction record for one instruction / question. An element should have the following fields: @@ -103,7 +113,8 @@ An element should have the following fields: If the `input` has a target answer, the `output` can be empty. Otherwise, we generate answers from GPT-3.5 as the `output`, and the `target` field is empty. Example: -``` + +```json [ { "category": "brainstorming", @@ -138,7 +149,8 @@ An element should have the following fields: * `id` (int, compulsory): The ID of the instruction / question. Example: -``` + +```json [ { "category": "brainstorming", @@ -159,34 +171,79 @@ Example: ] ``` -### Evaluation -#### Configuration -The configuration file `config_cn.json` can control how evaluate the performance of the model. -The following is an example showing the config structure: +### Prompt + +#### Battle Prompt + +The following is the Chinese battle prompt. In the battle prompt, the question and answers from two different models are fed into the prompt template. You can find an example battle prompt file in `prompt/battle_prompt`. + +```json +{ + "id": 1, + "system_prompt": "你是一个检查回答质量的好助手。", + "prompt_template": "[问题]\n{question}\n\n[1号AI助手的答案]\n{answer_1}\n\n[1号AI助手答案终止]\n\n[2号AI助手的答 案]\n{answer_2}\n\n[2号AI助手答案终止]\n\n[要求]\n{prompt}\n\n", + "prompt": "我们需要你评价这两个AI助手回答的性能。\n请对他们的回答的有用性、相关性、准确性、详细程度进行评分。每个AI助手都会得到一个1到10分的总分,分数越高表示整体表现越好。\n请首先输出一行,该行只包含两个数值,分别表示1号和2号AI助手的分数。这两个分数之间要有一个空格。在随后的一行中,请对你的评价作出全面的解释,避免任何潜在的偏见,并确保AI助手回答的顺序不会影响您的判断。" +} ``` + +#### Evaluation Prompt + +The following is an example of a Chinese GPT evaluation prompt. In an evaluation prompt, you should define your metrics in `metrics` and provide CoT(Chain-of-Thought) in `CoT`. You can find an example evaluation prompt file in `prompt/evaluation_prompt`. + +```json +{ + "brainstorming": { + "id": 1, + "category": "brainstorming", + "metrics": { + "language organization": "语言组织(1-5):答案语言是否流畅、连贯,使用正确的语法,具有一定逻辑性,使用恰当的连接词、过渡词等等。" + }, + "CoT": { + "language organization": "1. 阅读答案,并检查是否有语法错误、用词不当或其他显著的错误。\n2. 检查答案是否具有逻辑性,能够按照合理的顺序传达信息并且能够自圆其说。\n3. 确定答案是否与问题或主题相关,并且能够传达清晰的信息。\n4. 检查答案是否连贯,是否使用适当的转换和过渡来保持句子和段落之间的连贯性。\n5. 检查答案是否具有明确的结构和组织方式,使得读者可以轻松理解信息的层次和结构。\n6. 根据以上因素综合评估答案的语言组织,并给出一个1到5的分数,其中5表示语言组织非常好,而1表示语言组织非常差。\n\n语言组织:" + }, + "prompt": "你是一个好助手。请你为下面“头脑风暴”问题的答案打分。\n\n问题如下:\n\n{question}\n\n答案如下:\n\n{answer}\n\n评分的指标如下:\n\n{metric}\n\n请你遵照以下的评分步骤:\n\n{steps}" + } +} +``` + +`"metrics"`: the metrics that can be used in GPT evaluation. This field determines which metrics can be added to your config file. + +`"CoT"`: evaluation steps you prompt to GPT models for each metric defined in `"metrics"`. + +### Evaluation + +#### Configuration + +The following is an example of a Chinese config file. The configuration file can control how the pipeline evaluates the model. You need to specify GPT evaluation metrics and automatic metrics in key `GPT` and `Metrics`. You can find an example Chinese config file in `config`. 
+ +```json { "language": "cn", "category": { "brainstorming": { - "GPT-3.5": ["relevance", "creativity", "practicality", "correctness"], + "GPT": ["relevance", "creativity", "practicality", "correctness"], "Metrics": ["Distinct"] }, "chat": { - "GPT-3.5": [ "relevance", "naturalness", "engagingness", "reasonableness"], + "GPT": [ "relevance", "naturalness", "engagingness", "reasonableness"], "Metrics": ["Distinct"] } } } ``` -`"language"`: evaluate the model capability in which language, we only support Chinese `"cn"` for now. -`"category"`: evaluate the model capability in which category/categories. -`"GPT-3.5"`: config metrics for GPT-3.5 evaluation. -`"Metrics"`: config metrics for automatic metrics evaluation. + +`"language"`: the language used to evaluate the model capability. We only support Chinese `"cn"` for now. + +`"category"`: the category/categories needed to evaluate the model capability. + +`"GPT"`: the metrics you want to use for GPT evaluation. + +`"Metrics"`: the metrics you want to use for automatic metrics evaluation. You can create your config file based on available settings listed in following table. -| "category" | "GPT-3.5" | "Metrics" | -|:----------------:|:-----------------------:|:-----------:| +| "category" | "GPT" | "Metrics" | +| :--------------: | :---------------------: | :---------: | | "brainstorming" | "language organization" | "BLEU" | | "chat" | "relevance" | "ROUGE" | | "classification" | "creativity" | "Distinct" | @@ -194,16 +251,19 @@ You can create your config file based on available settings listed in following | "extraction" | "correctness" | "Precision" | | "generation" | "naturalness" | "Recall" | | "open_qa" | "engagingness" | "F1 score" | -| "rewriting" | "reasonableness" | -| "roleplay" | "diversity" | -| "summarization" | "fidelity" | -| | "conciseness" | +| "rewriting" | "reasonableness" | | +| "roleplay" | "diversity" | | +| "summarization" | "fidelity" | | +| | "conciseness" | | + +> **NOTE:** For categories which don't have standard answers such as `brainstorming`, you should avoid using automatic metrics such as `BLEU` and `ROUGE` which are based on similarity measures and you should use `Distinct` instead in your config file. #### Evaluate -After setting the configuration file, you can evaluate the model using `eval.py`. +After setting the configuration file, you can evaluate the model using `eval.py`. If you want to make comparisons between answers of two different models, you should specify two answer files in the argument `answer_file_list` and two model names in the argument `model_name_list`. If you want to evaluate one answer file, the length of both `answer_file_list` and `model_name_list` should be 1 and the program will perform evaluation using automatic metrics and GPT models. An example script is provided as follows: + ```shell python eval.py \ --config_file "path to the config file" \ @@ -212,14 +272,40 @@ python eval.py \ --target_file "path to the target answer file" \ --answer_file_list "path to the answer files of at most 2 models" \ --model_name_list "the names of at most 2 models" \ + --gpt_model "which GPT model to use for evaluation" \ --save_path "path to save results" \ --openai_key "your openai key" \ ``` +## FAQ + +
How can I add a new GPT evaluation metric? + +For example, if you want to add a new metric `persuasiveness` into category `brainstorming`, you should add the metric definition and its corresponding CoT(Chain-of-thought) in the evaluation prompt file in `prompt/evaluation_promt`. The CoT can be generated using ChatGPT. You can prompt ChatGPT to generate evaluation steps for the new metric. + +```json +{ + "brainstorming": { + "id": 1, + "category": "brainstorming", + "metrics": { + "persuasiveness": "说服力(1-5):XXX" + }, + "CoT": { + "persuasiveness": "XXX\n\n说服力:" + }, + "prompt": "你是一个好助手。请你为下面“头脑风暴”问题的答案打分。\n\n问题如下:\n\n{question}\n\n答案如下:\n\n{answer}\n\n评分的指标如下:\n\n{metric}\n\n请你遵照以下的评分步骤:\n\n{steps}" + } +} +``` + +
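
After adding a new metric, you may also want to confirm that it is wired up consistently before launching a full evaluation. The snippet below is only a minimal sketch, not part of the evaluation pipeline: it assumes the dictionary layout shown above (one entry per category with `metrics` and `CoT` keys) and that you run it from the `evaluate` directory, and it simply reports any metric that lacks matching CoT steps.

```python
import json

def check_prompt_file(path: str = "prompt/evaluation_prompt/evaluation_prompt_cn.json") -> None:
    # Load the evaluation prompt file, assumed to be a dict keyed by category as shown above.
    with open(path, encoding="utf-8") as f:
        prompts = json.load(f)
    for category, prompt in prompts.items():
        # A metric defined in "metrics" but missing from "CoT" cannot be formatted into the prompt template.
        missing = set(prompt["metrics"]) - set(prompt["CoT"])
        if missing:
            print(f"{category}: metrics without CoT steps: {sorted(missing)}")

check_prompt_file()
```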
+ ## To Do + - [ ] Add evaluation for English capability - [ ] Support UniEval -- [ ] Support GPT-4 evaluation +- [x] Support GPT-4 evaluation ## Citations @@ -232,15 +318,6 @@ python eval.py \ year = {2023} } -@misc{ouyang2022training, - title={Training language models to follow instructions with human feedback}, - author={Long Ouyang and Jeff Wu and Xu Jiang and Diogo Almeida and Carroll L. Wainwright and Pamela Mishkin and Chong Zhang and Sandhini Agarwal and Katarina Slama and Alex Ray and John Schulman and Jacob Hilton and Fraser Kelton and Luke Miller and Maddie Simens and Amanda Askell and Peter Welinder and Paul Christiano and Jan Leike and Ryan Lowe}, - year={2022}, - eprint={2203.02155}, - archivePrefix={arXiv}, - primaryClass={cs.CL} -} - @misc{liu2023geval, title={G-Eval: NLG Evaluation using GPT-4 with Better Human Alignment}, author={Yang Liu and Dan Iter and Yichong Xu and Shuohang Wang and Ruochen Xu and Chenguang Zhu}, diff --git a/applications/Chat/evaluate/config/config_cn.json b/applications/Chat/evaluate/config/config_cn.json index a7293f111..a8c7ea8a3 100644 --- a/applications/Chat/evaluate/config/config_cn.json +++ b/applications/Chat/evaluate/config/config_cn.json @@ -2,7 +2,7 @@ "language": "cn", "category": { "brainstorming": { - "GPT-3.5": [ + "GPT": [ "language organization", "relevance", "creativity", @@ -14,7 +14,7 @@ ] }, "chat": { - "GPT-3.5": [ + "GPT": [ "language organization", "relevance", "naturalness", @@ -26,7 +26,7 @@ ] }, "classification": { - "GPT-3.5": [ + "GPT": [ "language organization", "relevance", "correctness" @@ -38,7 +38,7 @@ ] }, "closed_qa": { - "GPT-3.5": [ + "GPT": [ "language organization", "relevance", "correctness" @@ -50,7 +50,7 @@ ] }, "extraction": { - "GPT-3.5": [ + "GPT": [ "language organization", "relevance", "correctness" @@ -62,7 +62,7 @@ ] }, "generation": { - "GPT-3.5": [ + "GPT": [ "language organization", "relevance", "diversity" @@ -74,7 +74,7 @@ ] }, "open_qa": { - "GPT-3.5": [ + "GPT": [ "language organization", "relevance", "correctness" @@ -84,7 +84,7 @@ ] }, "rewriting": { - "GPT-3.5": [ + "GPT": [ "language organization", "relevance", "correctness" @@ -96,7 +96,7 @@ ] }, "roleplay": { - "GPT-3.5": [ + "GPT": [ "language organization", "relevance", "fidelity", @@ -107,7 +107,7 @@ ] }, "summarization": { - "GPT-3.5": [ + "GPT": [ "language organization", "relevance", "correctness", diff --git a/applications/Chat/evaluate/eval.py b/applications/Chat/evaluate/eval.py index 69f2c272a..4067b15db 100644 --- a/applications/Chat/evaluate/eval.py +++ b/applications/Chat/evaluate/eval.py @@ -39,7 +39,8 @@ def main(args): "No prompt file for gpt evaluation provided. 
Please specify the prompt file for gpt evaluation!") # initialize evaluator - evaluator = Evaluator(metrics_per_category, battle_prompt, gpt_evaluation_prompt) + evaluator = Evaluator(metrics_per_category, battle_prompt, gpt_evaluation_prompt, args.gpt_model, + config["language"]) if len(args.model_name_list) == 2: answers1 = jload(args.answer_file_list[0]) answers2 = jload(args.answer_file_list[1]) @@ -87,6 +88,10 @@ if __name__ == '__main__': default=[], required=True, help='the names of at most 2 models') + parser.add_argument('--gpt_model', + default="gpt-3.5-turbo", + choices=["text-davinci-003", "gpt-3.5-turbo", "gpt-4"], + help='which GPT model to use for evaluation') parser.add_argument('--save_path', type=str, default="results", help='path to save evaluation results') parser.add_argument('--openai_key', type=str, default=None, required=True, help='Your openai key') args = parser.parse_args() diff --git a/applications/Chat/evaluate/evaluator.py b/applications/Chat/evaluate/evaluator.py index d3d1c038b..433d775d2 100644 --- a/applications/Chat/evaluate/evaluator.py +++ b/applications/Chat/evaluate/evaluator.py @@ -14,13 +14,15 @@ class Evaluator(object): """ - def __init__(self, params: Dict[str, Any], battle_prompt: Dict[str, Any], gpt_evaluation_prompt: Dict[str, - Any]) -> None: + def __init__(self, params: Dict[str, Any], battle_prompt: Dict[str, Any], gpt_evaluation_prompt: Dict[str, Any], + gpt_model: str, language: str) -> None: self.params = params self.battle_prompt = battle_prompt self.gpt_evaluation_prompt = gpt_evaluation_prompt + self.gpt_model = gpt_model + self.language = language self.automatic_metric_stats = dict() - self.gpt35_evaluation_results = dict() + self.gpt_evaluation_results = dict() self.battle_results = [] def battle(self, answers1: List[Dict], answers2: List[Dict]) -> None: @@ -63,6 +65,10 @@ class Evaluator(object): # automatic evaluation for category in self.params: + if len(answers_per_category[category]) == 0: + print(f"Category {category} specified in your config doesn't have corresponding answers!") + continue + category_metrics = self.params[category]["Metrics"] self.automatic_metric_stats[category] = {} @@ -74,17 +80,21 @@ class Evaluator(object): for metric in category_metrics: self.automatic_metric_stats[category].update(switch(metric=metric)) - # gpt35 evaluation + # gpt evaluation for category in self.params: - category_metrics = self.params[category]["GPT-3.5"] + if len(answers_per_category[category]) == 0: + print(f"Category {category} specified in your config doesn't have corresponding answers!") + continue + + category_metrics = self.params[category]["GPT"] prompt = self.gpt_evaluation_prompt.get(category, None) if prompt is None: print(f"No prompt for category {category}! Use prompt for category general now.") prompt = self.gpt_evaluation_prompt["general"] - self.gpt35_evaluation_results[category] = gpt_evaluate.gpt35_evaluate(answers_per_category[category], - prompt, category_metrics, category) + self.gpt_evaluation_results[category] = gpt_evaluate.evaluate(answers_per_category[category], prompt, + category_metrics, category, self.gpt_model) def save(self, path: str, model_name_list: List[str]) -> None: """ @@ -106,10 +116,10 @@ class Evaluator(object): # Save evaluation results for GPT-3.5 evaluation metrics. 
all_evaluations = [] - base_save_path = os.path.join(path, "gpt_evaluate", "gpt35_evaluate_results") + base_save_path = os.path.join(path, "gpt_evaluate", "gpt_evaluate_results") evaluation_results_save_path = os.path.join(base_save_path, "evaluation_results") - for category, evaluations in self.gpt35_evaluation_results.items(): + for category, evaluations in self.gpt_evaluation_results.items(): jdump( evaluations, os.path.join(evaluation_results_save_path, model_name_list[0], @@ -121,10 +131,10 @@ class Evaluator(object): # Start to calculate scores and save statistics. evaluation_statistics_save_path = os.path.join(base_save_path, "evaluation_statistics") - gpt_evaluate.save_gpt35_evaluation_statistics(model_name_list[0], all_evaluations, - evaluation_statistics_save_path) + gpt_evaluate.save_gpt_evaluation_statistics(model_name_list[0], all_evaluations, + evaluation_statistics_save_path) # Save charts and csv. evaluation_analyses_save_path = os.path.join(base_save_path, "evaluation_analyses") - gpt_evaluate.analyze_gpt35_evaluation_statistics(evaluation_statistics_save_path, - evaluation_analyses_save_path) + gpt_evaluate.analyze_gpt_evaluation_statistics(evaluation_statistics_save_path, + evaluation_analyses_save_path) diff --git a/applications/Chat/evaluate/gpt_evaluate.py b/applications/Chat/evaluate/gpt_evaluate.py index c7e668df9..61ce3456c 100644 --- a/applications/Chat/evaluate/gpt_evaluate.py +++ b/applications/Chat/evaluate/gpt_evaluate.py @@ -16,7 +16,7 @@ from utils import jdump, jload def get_battle_result(sys_prompt: str, user_prompt: str, id: int, max_tokens: int = 2048) -> Dict[str, Any]: """ - Get evaluation from GPT-4. + Get battle evaluation from GPT-4. Args: sys_prompt: prompt for the system. @@ -51,7 +51,7 @@ def get_battle_result(sys_prompt: str, user_prompt: str, id: int, max_tokens: in except Exception as e: print(e) time.sleep(1) - print(f" Evaluation {id} failed after {MAX_API_RETRY} retries.") + print(f"Evaluation {id} failed after {MAX_API_RETRY} retries.") return {"evaluation": "", "id": id} @@ -233,12 +233,77 @@ def save_battle_results(evaluations: List[Dict], name1: str, name2: str, save_pa print(f"Model {name2} average score: {ans2_score/(len(evaluations)-invalid_count):.2f}") -def get_gpt35_evaluation(prompt: Dict[str, Any], - inst: Dict[str, Any], - metrics: List[str], - max_tokens: int = 2048) -> Dict[str, Any]: +def get_gpt_evaluation_without_logprobs(prompt: Dict[str, Any], + inst: Dict[str, Any], + metrics: List[str], + model: str = "gpt-3.5-turbo", + max_tokens: int = 2048) -> Dict[str, Any]: """ - Use GPT-3.5 to evaluate one model answer. + Use chat models(gpt-3.5-turbo or gpt-4) to evaluate one model answer. + + Args: + prompt: a dictionary including prompt template, CoT and metrics. + inst: the instruction that is needed to be evaluated. + metrics: the metrics for evaluation. + model: the model used to evaluate answers. + max_tokens: the maximum number of tokens to generate in the chat completion. + + Returns: + An evaluation of one answer. + """ + + MAX_API_RETRY = 3 + + question = (inst["instruction"] if inst["input"] == "" else inst["instruction"] + " " + inst["input"]) + answer = inst["output"] + inst["evaluation"] = {} + + for metric in metrics: + if prompt["metrics"].get(metric, None) is None: + raise Exception( + f"Unsupported metric {metric} for category {inst['category']}! You should add this metric in the prompt file!" 
+ ) + for i in range(MAX_API_RETRY): + try: + response = openai.ChatCompletion.create( + model=model, + messages=[ + { + "role": + "user", + "content": + prompt["prompt"].format( + question=question, + answer=answer, + metric=prompt["metrics"][metric], + steps=prompt["CoT"][metric], + ), + }, + ], + temperature=0, + max_tokens=max_tokens, + ) + inst["evaluation"][metric] = { + "response": response["choices"][0]["message"]["content"], + "logprobs": None, + } + break + except Exception as e: + print(e) + time.sleep(1) + if metric not in inst["evaluation"]: + print(f"Evaluation {inst['id']} for metric {metric} failed after {MAX_API_RETRY} retries.") + inst["evaluation"][metric] = {} + return inst + + +def get_gpt_evaluation_with_logprobs(prompt: Dict[str, Any], + inst: Dict[str, Any], + metrics: List[str], + max_tokens: int = 2048) -> Dict[str, Any]: + """ + Use completion model(text-davinci-003) to evaluate one model answer. + Only completion models can return log probabilities. Args: prompt: a dictionary including prompt template, CoT and metrics. @@ -283,23 +348,22 @@ def get_gpt35_evaluation(prompt: Dict[str, Any], except Exception as e: print(e) time.sleep(1) + if metric not in inst["evaluation"]: + print(f"Evaluation {inst['id']} for metric {metric} failed after {MAX_API_RETRY} retries.") + inst["evaluation"][metric] = {} return inst -def gpt35_evaluate( - answers: List[Dict], - prompt: Dict[str, Any], - metrics: List[str], - category: str, -) -> List[Dict]: +def evaluate(answers: List[Dict], prompt: Dict[str, Any], metrics: List[str], category: str, model: str) -> List[Dict]: """ - Use GPT-3.5 to evaluate model answers and save evaluation results. + Use GPT models to evaluate model answers and save evaluation results. Args: answers: model answers. - prompt: prompt for GPT-3.5 evaluation. - metrics: metrics for GPT-3.5 evaluation. + prompt: prompt for GPT evaluation. + metrics: metrics for GPT evaluation. category: the category of the model answers for evaluation. + model: the specific GPT model used to evaluate answers. Returns: Evaluations of the given answers. @@ -315,7 +379,12 @@ def gpt35_evaluate( with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor: futures = [] for inst in answers: - future = executor.submit(get_gpt35_evaluation, prompt, inst, metrics, 1) + # Completion models can return log probabilities. + if model == "text-davinci-003": + future = executor.submit(get_gpt_evaluation_with_logprobs, prompt, inst, metrics, 1) + else: + future = executor.submit(get_gpt_evaluation_without_logprobs, prompt, inst, metrics, model, 1) + futures.append(future) for future in tqdm.tqdm( @@ -334,20 +403,19 @@ def gpt35_evaluate( def calculate_scores_form_logprobs(logprobs: Dict[str, Any]) -> float: """ - Calculate score from log probabilities returned by text-davinci-003. - Only openai.Completion can return logprobs. + Calculate the score according to log probabilities returned by text-davinci-003. Calculation formula: score = sum(score_i * exp(value)) where score_i is the score which corresponds to the key(predicted token) and value is its log probability. Ref: https://arxiv.org/abs/2303.16634 - This paper proposes NLG evaluation methods using GPT-3.5(logprobs returned by openai api) and GPT-4(logprobs obtained by sampling). + This paper proposes NLG evaluation methods using text-davinci-003(log probabilities returned by completion models) and GPT-4(probabilities obtained by sampling). Args: logprobs: logprobs returned by openai.Completion. Returns: - Score of one answer. 
+ The score of one answer. """ # GPT-3.5 only returns score of 1 to 5. @@ -369,7 +437,31 @@ def calculate_scores_form_logprobs(logprobs: Dict[str, Any]) -> float: return score -def save_gpt35_evaluation_statistics(model_name: str, evaluations: List[Dict], save_path: str) -> None: +def calculate_scores_form_response(response: str, evaluation: Dict[str, Any]) -> int: + """ + Calculate the score from the response returned by gpt-3.5-turbo or gpt-4. + Different from text-davinci-003, this fuction directly calculates the score according to the plain response returned by gpt-3.5-turbo or gpt-4. + Although text-davinci-003 can return log probabilities, it costs ten times as much as gpt-3.5-turbo. + + Args: + response: logprobs returned by openai.Completion. + evaluation: the evaluation corresponds to the question. + + Returns: + The score of one answer. + """ + + try: + results = re.findall(r"\d", response) + if len(results) == 1: + return int(results[0]) + else: + raise Exception(f"Invalid score pair. Got {evaluation}.") + except Exception as e: + return 0 + + +def save_gpt_evaluation_statistics(model_name: str, evaluations: List[Dict], save_path: str) -> None: """ Generate statistics for one model. @@ -396,7 +488,15 @@ def save_gpt35_evaluation_statistics(model_name: str, evaluations: List[Dict], s scores = {metric: [] for metric in metrics} for evaluation in data: for metric in metrics: - scores[metric].append(calculate_scores_form_logprobs(evaluation["evaluation"][metric]["logprobs"][0])) + if evaluation["evaluation"][metric] == {}: + # This means after 3 retries, the server still returns an error and we set the score to 0. + scores[metric].append(0) + elif evaluation["evaluation"][metric]["logprobs"] is not None: + scores[metric].append( + calculate_scores_form_logprobs(evaluation["evaluation"][metric]["logprobs"][0])) + else: + scores[metric].append( + calculate_scores_form_response(evaluation["evaluation"][metric]["response"], evaluation)) statistics = {} for metric in metrics: @@ -414,7 +514,7 @@ def save_gpt35_evaluation_statistics(model_name: str, evaluations: List[Dict], s ) -def analyze_gpt35_evaluation_statistics(statistics_path: str, save_path: str) -> None: +def analyze_gpt_evaluation_statistics(statistics_path: str, save_path: str) -> None: """ Analyze and visualize all GPT-3.5 evaluation statistics in the given directory. 
@@ -474,7 +574,7 @@ def analyze_gpt35_evaluation_statistics(statistics_path: str, save_path: str) -> os.makedirs(save_path) frame_all = pd.DataFrame(frame_all) - frame_all.to_csv(os.path.join(save_path, "gpt35_evaluation_statistics.csv")) + frame_all.to_csv(os.path.join(save_path, "gpt_evaluation_statistics.csv")) for category in tqdm.tqdm( frame_per_category.keys(), diff --git a/applications/Chat/evaluate/prompt/evaluation_prompt/evaluation_prompt_cn.json b/applications/Chat/evaluate/prompt/evaluation_prompt/evaluation_prompt_cn.json index d4b8d143e..ee6caae32 100644 --- a/applications/Chat/evaluate/prompt/evaluation_prompt/evaluation_prompt_cn.json +++ b/applications/Chat/evaluate/prompt/evaluation_prompt/evaluation_prompt_cn.json @@ -1,5 +1,5 @@ -[ - { +{ + "brainstorming": { "id": 1, "category": "brainstorming", "metrics": { @@ -18,7 +18,7 @@ }, "prompt": "你是一个好助手。请你为下面“头脑风暴”问题的答案打分。\n\n问题如下:\n\n{question}\n\n答案如下:\n\n{answer}\n\n评分的指标如下:\n\n{metric}\n\n请你遵照以下的评分步骤:\n\n{steps}" }, - { + "chat": { "id": 2, "category": "chat", "metrics": { @@ -37,7 +37,7 @@ }, "prompt": "你是一个好助手。请你为下面的“补全对话”问题的答案打分。\n\n问题如下:\n\n{question}\n\n答案如下:\n\n{answer}\n\n评分的指标如下:\n\n{metric}\n\n请你遵照以下的评分步骤:\n\n{steps}" }, - { + "classification": { "id": 3, "category": "classification", "metrics": { @@ -52,7 +52,7 @@ }, "prompt": "你是一个好助手。请你为下面的“分类“问题的答案打分。\n\n问题如下:\n\n{question}\n\n答案如下:\n\n{answer}\n\n评分的指标如下:\n\n{metric}\n\n请你遵照以下的评分步骤:\n\n{steps}" }, - { + "closed_qa": { "id": 4, "category": "closed_qa", "metrics": { @@ -67,7 +67,7 @@ }, "prompt": "你是一个好助手。请你为下面问题的答案打分。\n\n问题如下:\n\n{question}\n\n需要你评分的答案如下:\n\n{answer}\n\n评分的指标如下:\n\n{metric}\n\n请你遵照以下的评分步骤:\n\n{steps}" }, - { + "extraction": { "id": 5, "category": "extraction", "metrics": { @@ -82,7 +82,7 @@ }, "prompt": "你是一个好助手。请你为下面的“提取”问题的答案打分。\n\n问题如下:\n\n{question}\n\n答案如下:\n\n{answer}\n\n评分的指标如下:\n\n{metric}\n\n请你遵照以下的评分步骤:\n\n{steps}" }, - { + "generation": { "id": 6, "category": "generation", "metrics": { @@ -97,7 +97,7 @@ }, "prompt": "你是一个好助手。请你为下面的“生成”问题的答案打分。\n\n问题如下:\n\n{question}\n\n答案如下:\n\n{answer}\n\n评分的指标如下:\n\n{metric}\n\n请你遵照以下的评分步骤:\n\n{steps}" }, - { + "open_qa": { "id": 7, "category": "open_qa", "metrics": { @@ -112,7 +112,7 @@ }, "prompt": "你是一个好助手。请你为下面的问题的答案打分。\n\n问题如下:\n\n{question}\n\n答案如下:\n\n{answer}\n\n评分的指标如下:\n\n{metric}\n\n请你遵照以下的评分步骤:\n\n{steps}" }, - { + "rewriting": { "id": 8, "category": "rewriting", "metrics": { @@ -127,7 +127,7 @@ }, "prompt": "你是一个好助手。请你为下面的问题的答案打分。\n\n问题如下:\n\n{question}\n\n答案如下:\n\n{answer}\n\n评分的指标如下:\n\n{metric}\n\n请你遵照以下的评分步骤:\n\n{steps}" }, - { + "roleplay": { "id": 9, "category": "roleplay", "metrics": { @@ -144,7 +144,7 @@ }, "prompt": "你是一个好助手。请你为下面的“角色扮演”问题的答案打分。\n\n问题如下:\n\n{question}\n\n答案如下:\n\n{answer}\n\n评分的指标如下:\n\n{metric}\n\n请你遵照以下的评分步骤:\n\n{steps}" }, - { + "summarization": { "id": 10, "category": "summarization", "metrics": { @@ -161,7 +161,7 @@ }, "prompt": "你是一个好助手。请你为下面的“总结”问题的答案打分。\n\n问题如下:\n\n{question}\n\n答案如下:\n\n{answer}\n\n评分的指标如下:\n\n{metric}\n\n请你遵照以下的评分步骤:\n\n{steps}" }, - { + "general": { "id": 11, "category": "general", "metrics": { @@ -176,4 +176,4 @@ }, "prompt": "你是一个好助手。请你为下面问题的答案打分。\n\n问题如下:\n\n{question}\n\n需要你评分的答案如下:\n\n{answer}\n\n评分的指标如下:\n\n{metric}\n\n请你遵照以下的评分步骤:\n\n{steps}" } -] +} diff --git a/applications/Chat/evaluate/utils.py b/applications/Chat/evaluate/utils.py index e855cd452..517c0a1c3 100644 --- a/applications/Chat/evaluate/utils.py +++ b/applications/Chat/evaluate/utils.py @@ -57,6 +57,7 @@ def get_data_per_category(data, categories): 
data_per_category = {category: [] for category in categories} for item in data: category = item["category"] - data_per_category[category].append(item) + if category in categories: + data_per_category[category].append(item) return data_per_category From 5f79008c4a7a0e854f53d7f1c0b29ca7f411eeab Mon Sep 17 00:00:00 2001 From: jiangmingyan <1829166702@qq.com> Date: Tue, 30 May 2023 18:41:41 +0800 Subject: [PATCH 12/26] [example] update gemini examples (#3868) * [example]update gemini examples * [example]update gemini examples --- examples/language/gpt/gemini/test_ci.sh | 2 +- .../language/gpt/gemini/train_gpt_demo.py | 57 +++++++++---------- 2 files changed, 29 insertions(+), 30 deletions(-) diff --git a/examples/language/gpt/gemini/test_ci.sh b/examples/language/gpt/gemini/test_ci.sh index 6079d5ed6..0ddfd3a62 100644 --- a/examples/language/gpt/gemini/test_ci.sh +++ b/examples/language/gpt/gemini/test_ci.sh @@ -3,7 +3,7 @@ $(cd `dirname $0`;pwd) export TRAIN_STEP=4 for MODEL_TYPE in "gpt2_medium"; do - for DISTPLAN in "colossalai"; do + for DISTPLAN in "CAI_Gemini"; do for BATCH_SIZE in 2; do for GPUNUM in 1 4; do for TPDEGREE in 1 2; do diff --git a/examples/language/gpt/gemini/train_gpt_demo.py b/examples/language/gpt/gemini/train_gpt_demo.py index b2a7fa36d..92751c7e2 100644 --- a/examples/language/gpt/gemini/train_gpt_demo.py +++ b/examples/language/gpt/gemini/train_gpt_demo.py @@ -11,11 +11,13 @@ from packaging import version from torch.nn.parallel import DistributedDataParallel as DDP import colossalai +from colossalai.booster import Booster +from colossalai.booster.plugin import GeminiPlugin, LowLevelZeroPlugin, TorchDDPPlugin from colossalai.logging import disable_existing_loggers, get_dist_logger from colossalai.nn.optimizer import HybridAdam from colossalai.tensor import ColoParameter, ComputePattern, ComputeSpec, ProcessGroup, ReplicaSpec, ShardSpec from colossalai.utils import get_current_device -from colossalai.zero import ColoInitContext, zero_model_wrapper, zero_optim_wrapper +from colossalai.zero import ColoInitContext CAI_VERSION = colossalai.__version__ @@ -236,23 +238,6 @@ def main(): tensor_parallelize(model, tp_pg) # asign running configurations - gemini_config = None - if args.distplan.startswith("CAI_ZeRO"): - optim_config = dict(reduce_bucket_size=12 * 1024 * 1024, overlap_communication=True, verbose=True) - elif args.distplan == "CAI_Gemini": - gemini_config = dict(strict_ddp_mode=args.tp_degree == 1, - device=get_current_device(), - placement_policy=args.placement, - pin_memory=True, - hidden_dim=model.config.n_embd, - search_range_mb=128) - optim_config = dict(gpu_margin_mem_ratio=0.) 
- else: - raise RuntimeError - - # build a highly optimized gpu/cpu optimizer - optimizer = HybridAdam(model.parameters(), lr=1e-3) - if args.distplan == "CAI_ZeRO1": zero_stage = 1 elif args.distplan == "CAI_ZeRO2": @@ -262,22 +247,42 @@ def main(): else: raise RuntimeError - # wrap your model and optimizer - model = zero_model_wrapper(model, zero_stage, gemini_config) - optimizer = zero_optim_wrapper(model, optimizer, optim_config=optim_config) + plugin = None + if args.distplan.startswith("CAI_ZeRO"): + plugin = LowLevelZeroPlugin(stage=zero_stage, + reduce_bucket_size_in_m=12 * 1024 * 1024, + overlap_communication=True, + verbose=True) + elif args.distplan == "CAI_Gemini": + plugin = GeminiPlugin(device=get_current_device(), + placement_policy=args.placement, + pin_memory=True, + strict_ddp_mode=args.tp_degree == 1, + search_range_mb=128, + hidden_dim=model.config.n_embd, + gpu_margin_mem_ratio=0.) + else: + raise RuntimeError + + # build a highly optimized gpu/cpu optimizer + optimizer = HybridAdam(model.parameters(), lr=1e-3) logger.info(get_mem_info(prefix='After init optim, '), ranks=[0]) elif args.distplan.startswith("Pytorch"): assert args.tp_degree == 1, "The degree of TP should be 1 for DDP examples." model = model_builder(args.model_type)(checkpoint=True).cuda() - model = DDP(model) + plugin = TorchDDPPlugin() if args.distplan.endswith("DDP"): optimizer = torch.optim.Adam(model.parameters(), lr=1e-3) elif args.distplan.endswith("ZeRO"): from torch.distributed.optim import ZeroRedundancyOptimizer optimizer = ZeroRedundancyOptimizer(model.parameters(), optimizer_class=torch.optim.Adam, lr=1e-3) + else: raise RuntimeError + # wrap your model and optimizer + booster = Booster(plugin=plugin) + model, optimizer, criterion, _, _ = booster.boost(model, optimizer, criterion) # model is shared after TP numel = get_model_size(model) @@ -305,13 +310,7 @@ def main(): fwd_end = time() fwd_time = fwd_end - start logger.info(get_mem_info(prefix=f'[{n + 1}/{NUM_STEPS}] Forward '), ranks=[0]) - - if args.distplan.startswith("CAI"): - optimizer.backward(loss) - elif args.distplan.startswith("Pytorch"): - loss.backward() - else: - raise RuntimeError + booster.backward(loss, optimizer) torch.cuda.synchronize() bwd_end = time() From 281b33f3628feb6bc7cb26faa42d260a169cb386 Mon Sep 17 00:00:00 2001 From: jiangmingyan <1829166702@qq.com> Date: Tue, 30 May 2023 18:41:56 +0800 Subject: [PATCH 13/26] [doc] update document of zero with chunk. 
(#3855) * [doc] fix title of mixed precision * [doc]update document of zero with chunk * [doc] update document of zero with chunk, fix * [doc] update document of zero with chunk, fix * [doc] update document of zero with chunk, fix * [doc] update document of zero with chunk, add doc test * [doc] update document of zero with chunk, add doc test * [doc] update document of zero with chunk, fix installation * [doc] update document of zero with chunk, fix zero with chunk doc * [doc] update document of zero with chunk, fix zero with chunk doc --- docs/source/en/features/zero_with_chunk.md | 56 ++++++------------ .../gradient_accumulation_with_booster.md | 2 +- .../mixed_precision_training_with_booster.md | 2 +- .../zh-Hans/features/zero_with_chunk.md | 58 +++++++------------ .../zh-Hans/get_started/installation.md | 2 +- 5 files changed, 43 insertions(+), 77 deletions(-) diff --git a/docs/source/en/features/zero_with_chunk.md b/docs/source/en/features/zero_with_chunk.md index d7a99f2fb..d6f6f611a 100644 --- a/docs/source/en/features/zero_with_chunk.md +++ b/docs/source/en/features/zero_with_chunk.md @@ -3,7 +3,7 @@ Author: [Hongxiu Liu](https://github.com/ver217), [Jiarui Fang](https://github.com/feifeibear), [Zijian Ye](https://github.com/ZijianYY) **Prerequisite:** -- [Define Your Configuration](../basics/define_your_config.md) +- [Train with booster](../basics/booster_api.md) **Example Code** @@ -97,6 +97,7 @@ For simplicity, we just use randomly generated data here. First we only need to import `GPT2LMHeadModel` from `Huggingface transformers` to define our model, which does not require users to define or modify the model, so that users can use it more conveniently. +Define a GPT model: ```python class GPTLMModel(nn.Module): @@ -182,34 +183,6 @@ def split_param_col_tp1d(param: ColoParameter, pg: ProcessGroup): split_param_single_dim_tp1d(-1, param, pg) ``` -Define a model which uses Gemini + ZeRO DDP: - -```python -def gemini_zero_dpp(model: torch.nn.Module, pg: ProcessGroup, placement_policy: str = "auto"): - cai_version = colossalai.__version__ - if version.parse(cai_version) > version.parse("0.1.10"): - from colossalai.nn.parallel import GeminiDDP - model = GeminiDDP(model, - device=get_current_device(), - placement_policy=placement_policy, - pin_memory=True, - search_range_mb=32) - elif version.parse(cai_version) <= version.parse("0.1.10") and version.parse(cai_version) >= version.parse("0.1.9"): - from colossalai.gemini import ChunkManager, GeminiManager - chunk_size = ChunkManager.search_chunk_size(model, 64 * 1024**2, 32) - gemini_manager = GeminiManager(placement_policy, chunk_manager) - chunk_manager = ChunkManager(chunk_size, - pg, - enable_distributed_storage=True, - init_device=GeminiManager.get_default_device(placement_policy)) - model = ZeroDDP(model, gemini_manager) - else: - raise NotImplemented(f"CAI version {cai_version} is not supported") - return model -``` - -As we pre-train GPT in this example, we just use a simple language model loss. 
- Write a function to get random inputs: ```python @@ -219,9 +192,15 @@ def get_data(batch_size, seq_len, vocab_size): return input_ids, attention_mask ``` -Finally, we can define our training loop: +Finally, we define a model which uses Gemini + ZeRO DDP and define our training loop, As we pre-train GPT in this example, we just use a simple language model loss: ```python +from torch.optim import Adam + +from colossalai.booster import Booster +from colossalai.zero import ColoInitContext +from colossalai.booster.plugin import GeminiPlugin + def main(): args = parse_args() BATCH_SIZE = 8 @@ -232,22 +211,23 @@ def main(): # build criterion criterion = GPTLMLoss() + optimizer = Adam(model.parameters(), lr=0.001) torch.manual_seed(123) default_pg = ProcessGroup(tp_degree=args.tp_degree) - default_dist_spec = ShardSpec([-1], [args.tp_degree]) if args.shardinit else None + default_dist_spec = ShardSpec([-1], [args.tp_degree]) # build GPT model with ColoInitContext(device='cpu', default_dist_spec=default_dist_spec, default_pg=default_pg): model = gpt2_medium(checkpoint=True) pg = default_pg # Tensor Parallelism (TP) tensor_parallelize(model, pg) + # Gemini + ZeRO DP, Note it must be used after TP - model = gemini_zero_dpp(model, pg, args.placement) - # build optimizer - optimizer = GeminiAdamOptimizer(model, lr=1e-3, initial_scale=2**5) - numel = sum([p.numel() for p in model.parameters()]) - get_tflops_func = partial(get_tflops, numel, BATCH_SIZE, SEQ_LEN) + plugin = GeminiPlugin(placement_policy='cuda', max_norm=1.0, initial_scale=2**5) + booster = Booster(plugin=plugin) + model, optimizer, criterion, _, _ = booster.boost(model, optimizer, criterion) + torch.cuda.synchronize() model.train() for n in range(NUM_STEPS): @@ -256,10 +236,12 @@ def main(): optimizer.zero_grad() outputs = model(input_ids, attn_mask) loss = criterion(outputs, input_ids) - optimizer.backward(loss) + booster.backward(loss, optimizer) optimizer.step() torch.cuda.synchronize() ``` > ⚠️ Note: If you want to use the Gemini module, please do not use the [Gradient Accumulation](../features/gradient_accumulation.md) we mentioned before。 The complete example can be found on [Train GPT with Colossal-AI](https://github.com/hpcaitech/ColossalAI/tree/main/examples/language/gpt). 
+ + diff --git a/docs/source/zh-Hans/features/gradient_accumulation_with_booster.md b/docs/source/zh-Hans/features/gradient_accumulation_with_booster.md index ab86f34f2..a8422060f 100644 --- a/docs/source/zh-Hans/features/gradient_accumulation_with_booster.md +++ b/docs/source/zh-Hans/features/gradient_accumulation_with_booster.md @@ -1,4 +1,4 @@ -# 梯度累积 (最新版本) +# 梯度累积 (新版本) 作者: [Mingyan Jiang](https://github.com/jiangmingyan) diff --git a/docs/source/zh-Hans/features/mixed_precision_training_with_booster.md b/docs/source/zh-Hans/features/mixed_precision_training_with_booster.md index 6954556a8..187aef1a6 100644 --- a/docs/source/zh-Hans/features/mixed_precision_training_with_booster.md +++ b/docs/source/zh-Hans/features/mixed_precision_training_with_booster.md @@ -1,4 +1,4 @@ -# 自动混合精度训练 (最新版本) +# 自动混合精度训练 (新版本) 作者: [Mingyan Jiang](https://github.com/jiangmingyan) diff --git a/docs/source/zh-Hans/features/zero_with_chunk.md b/docs/source/zh-Hans/features/zero_with_chunk.md index ba57ba4e8..9030464dd 100644 --- a/docs/source/zh-Hans/features/zero_with_chunk.md +++ b/docs/source/zh-Hans/features/zero_with_chunk.md @@ -4,7 +4,7 @@ **前置教程:** -- [定义配置文件](../basics/define_your_config.md) +- [booster使用](../basics/booster_api.md) **示例代码** @@ -97,6 +97,8 @@ optimizer.step() 首先我们只需要引入`Huggingface transformers` 的 `GPT2LMHeadModel`来定义我们的模型,不需要用户进行模型的定义与修改,方便用户使用。 +定义GPT模型: + ```python class GPTLMModel(nn.Module): @@ -182,34 +184,6 @@ def split_param_col_tp1d(param: ColoParameter, pg: ProcessGroup): split_param_single_dim_tp1d(-1, param, pg) ``` -定义一个使用 Gemini + ZeRO DDP 的模型: - -```python -def gemini_zero_dpp(model: torch.nn.Module, pg: ProcessGroup, placement_policy: str = "auto"): - cai_version = colossalai.__version__ - if version.parse(cai_version) > version.parse("0.1.10"): - from colossalai.nn.parallel import GeminiDDP - model = GeminiDDP(model, - device=get_current_device(), - placement_policy=placement_policy, - pin_memory=True, - search_range_mb=32) - elif version.parse(cai_version) <= version.parse("0.1.10") and version.parse(cai_version) >= version.parse("0.1.9"): - from colossalai.gemini import ChunkManager, GeminiManager - chunk_size = ChunkManager.search_chunk_size(model, 64 * 1024**2, 32) - gemini_manager = GeminiManager(placement_policy, chunk_manager) - chunk_manager = ChunkManager(chunk_size, - pg, - enable_distributed_storage=True, - init_device=GeminiManager.get_default_device(placement_policy)) - model = ZeroDDP(model, gemini_manager) - else: - raise NotImplemented(f"CAI version {cai_version} is not supported") - return model -``` - -由于我们在这个例子中对GPT进行预训练,因此只使用了一个简单的语言模型损失函数。 - 写一个获得随机输入的函数: ```python @@ -219,9 +193,16 @@ def get_data(batch_size, seq_len, vocab_size): return input_ids, attention_mask ``` -最后,我们可以定义我们的训练循环: + +最后,使用booster注入 Gemini + ZeRO DDP 特性, 并定义训练循环。由于我们在这个例子中对GPT进行预训练,因此只使用了一个简单的语言模型损失函数: ```python +from torch.optim import Adam + +from colossalai.booster import Booster +from colossalai.zero import ColoInitContext +from colossalai.booster.plugin import GeminiPlugin + def main(): args = parse_args() BATCH_SIZE = 8 @@ -232,22 +213,23 @@ def main(): # build criterion criterion = GPTLMLoss() + optimizer = Adam(model.parameters(), lr=0.001) torch.manual_seed(123) default_pg = ProcessGroup(tp_degree=args.tp_degree) - default_dist_spec = ShardSpec([-1], [args.tp_degree]) if args.shardinit else None + default_dist_spec = ShardSpec([-1], [args.tp_degree]) # build GPT model with ColoInitContext(device='cpu', default_dist_spec=default_dist_spec, default_pg=default_pg): 
model = gpt2_medium(checkpoint=True) pg = default_pg # Tensor Parallelism (TP) tensor_parallelize(model, pg) + # Gemini + ZeRO DP, Note it must be used after TP - model = gemini_zero_dpp(model, pg, args.placement) - # build optimizer - optimizer = GeminiAdamOptimizer(model, lr=1e-3, initial_scale=2**5) - numel = sum([p.numel() for p in model.parameters()]) - get_tflops_func = partial(get_tflops, numel, BATCH_SIZE, SEQ_LEN) + plugin = GeminiPlugin(placement_policy='cuda', max_norm=1.0, initial_scale=2**5) + booster = Booster(plugin=plugin) + model, optimizer, criterion, _, _ = booster.boost(model, optimizer, criterion) + torch.cuda.synchronize() model.train() for n in range(NUM_STEPS): @@ -256,10 +238,12 @@ def main(): optimizer.zero_grad() outputs = model(input_ids, attn_mask) loss = criterion(outputs, input_ids) - optimizer.backward(loss) + booster.backward(loss, optimizer) optimizer.step() torch.cuda.synchronize() ``` > ⚠️ 注意:如果你使用Gemini模块的话,请不要使用我们之前提到过的[梯度累加](../features/gradient_accumulation.md)。 完整的例子代码可以在 [Train GPT with Colossal-AI](https://github.com/hpcaitech/ColossalAI/tree/main/examples/language/gpt). 获得。 + + diff --git a/docs/source/zh-Hans/get_started/installation.md b/docs/source/zh-Hans/get_started/installation.md index a32627db6..a6c88672b 100755 --- a/docs/source/zh-Hans/get_started/installation.md +++ b/docs/source/zh-Hans/get_started/installation.md @@ -47,7 +47,7 @@ CUDA_EXT=1 pip install . pip install . ``` -如果您在使用CUDA 10.2,您仍然可以从源码安装ColossalA。但是您需要手动下载cub库并将其复制到相应的目录。 +如果您在使用CUDA 10.2,您仍然可以从源码安装ColossalAI。但是您需要手动下载cub库并将其复制到相应的目录。 ```bash # clone the repository From 70c8cdecf45386a115d659897ad8017c9205c6a5 Mon Sep 17 00:00:00 2001 From: digger yu Date: Fri, 2 Jun 2023 15:02:45 +0800 Subject: [PATCH 14/26] [nfc] fix typo colossalai/cli fx kernel (#3847) * fix typo colossalai/autochunk auto_parallel amp * fix typo colossalai/auto_parallel nn utils etc. * fix typo colossalai/auto_parallel autochunk fx/passes etc. * fix typo docs/ * change placememt_policy to placement_policy in docs/ and examples/ * fix typo colossalai/ applications/ * fix typo colossalai/cli fx kernel --- colossalai/cli/launcher/__init__.py | 2 +- colossalai/cli/launcher/hostinfo.py | 2 +- colossalai/cli/launcher/multinode_runner.py | 2 +- colossalai/cli/launcher/run.py | 2 +- colossalai/device/alpha_beta_profiler.py | 2 +- .../patched_bias_addition_module/bias_addition_module.py | 2 +- colossalai/fx/tracer/experimental.py | 2 +- colossalai/fx/tracer/tracer.py | 6 +++--- colossalai/kernel/cuda_native/flash_attention.py | 2 +- colossalai/kernel/cuda_native/multihead_attention.py | 2 +- colossalai/kernel/jit/option.py | 2 +- 11 files changed, 13 insertions(+), 13 deletions(-) diff --git a/colossalai/cli/launcher/__init__.py b/colossalai/cli/launcher/__init__.py index 8d9ec147d..808e4e845 100644 --- a/colossalai/cli/launcher/__init__.py +++ b/colossalai/cli/launcher/__init__.py @@ -28,7 +28,7 @@ from .run import launch_multi_processes type=str, default=None, help= - "Specify computing devices to NOT use during execution. Mutually exclusive with --include. Formatting is the same as --includ," + "Specify computing devices to NOT use during execution. Mutually exclusive with --include. 
Formatting is the same as --include," " only effective when used with --hostfile.") @click.option("--num_nodes", type=int, diff --git a/colossalai/cli/launcher/hostinfo.py b/colossalai/cli/launcher/hostinfo.py index 065cbc371..d1b88b229 100644 --- a/colossalai/cli/launcher/hostinfo.py +++ b/colossalai/cli/launcher/hostinfo.py @@ -38,7 +38,7 @@ class HostInfo: # socket.getfqdn("127.0.0.1") does not return localhost # on some users' machines - # thus, we directly return True if hostname is locahost, 127.0.0.1 or 0.0.0.0 + # thus, we directly return True if hostname is localhost, 127.0.0.1 or 0.0.0.0 if hostname in ("localhost", "127.0.0.1", "0.0.0.0"): return True diff --git a/colossalai/cli/launcher/multinode_runner.py b/colossalai/cli/launcher/multinode_runner.py index a51e1e371..85b241e96 100644 --- a/colossalai/cli/launcher/multinode_runner.py +++ b/colossalai/cli/launcher/multinode_runner.py @@ -114,7 +114,7 @@ class MultiNodeRunner: Receive messages from all hosts Returns: - msg_from_node (dict): a dictionry which contains messages from each node + msg_from_node (dict): a dictionary which contains messages from each node """ msg_from_node = dict() diff --git a/colossalai/cli/launcher/run.py b/colossalai/cli/launcher/run.py index 6411b4302..027a10aa8 100644 --- a/colossalai/cli/launcher/run.py +++ b/colossalai/cli/launcher/run.py @@ -298,7 +298,7 @@ def launch_multi_processes(args: Config) -> None: # receive the stop status msg_from_node = runner.recv_from_all() - # printe node status + # print node status click.echo("\n====== Stopping All Nodes =====") for hostname, msg in msg_from_node.items(): click.echo(f"{hostname}: {msg}") diff --git a/colossalai/device/alpha_beta_profiler.py b/colossalai/device/alpha_beta_profiler.py index f8b20de9b..f4e6cfffb 100644 --- a/colossalai/device/alpha_beta_profiler.py +++ b/colossalai/device/alpha_beta_profiler.py @@ -197,7 +197,7 @@ class AlphaBetaProfiler: dist.broadcast_object_list(broadcast_list, src=process_group[0]) alpha_beta_dict[process_group] = tuple(broadcast_list) - # add symmetry pair to the apha_beta_dict + # add symmetry pair to the alpha_beta_dict symmetry_ab_dict = {} for process_group, alpha_beta_pair in alpha_beta_dict.items(): symmetry_process_group = (process_group[1], process_group[0]) diff --git a/colossalai/fx/tracer/bias_addition_patch/patched_bias_addition_module/bias_addition_module.py b/colossalai/fx/tracer/bias_addition_patch/patched_bias_addition_module/bias_addition_module.py index 85f1553e3..591485fdb 100644 --- a/colossalai/fx/tracer/bias_addition_patch/patched_bias_addition_module/bias_addition_module.py +++ b/colossalai/fx/tracer/bias_addition_patch/patched_bias_addition_module/bias_addition_module.py @@ -51,7 +51,7 @@ class BiasAdditionModule(ABC): For example: The kwargs for conv2d module is {} because the attributes like 'padding' or 'groups' are - considered during module initilizing. However, we need to consider those attributes as kwargs + considered during module initializing. However, we need to consider those attributes as kwargs in F.conv2d. 
""" pass diff --git a/colossalai/fx/tracer/experimental.py b/colossalai/fx/tracer/experimental.py index 88b65b618..22a67d1ce 100644 --- a/colossalai/fx/tracer/experimental.py +++ b/colossalai/fx/tracer/experimental.py @@ -295,7 +295,7 @@ class ColoTracer(Tracer): @staticmethod def forward(ctx, run_function, preserve_rng_state, *args): - # signal that the current tracing occurs within activaton checkpoint part + # signal that the current tracing occurs within activation checkpoint part self.inside_torch_checkpoint_func = True out = run_function(*args) self.inside_torch_checkpoint_func = False diff --git a/colossalai/fx/tracer/tracer.py b/colossalai/fx/tracer/tracer.py index 1ae31f958..28965a1b8 100644 --- a/colossalai/fx/tracer/tracer.py +++ b/colossalai/fx/tracer/tracer.py @@ -92,7 +92,7 @@ class ColoTracer(Tracer): return proxy # if graph is traced for auto parallelism module, some extra node will be added during - # graph construction to deal with the compatability between bias addition and all reduce. + # graph construction to deal with the compatibility between bias addition and all reduce. # if no extra manipulation is applied, we just pass the origin arguments to create_proxy function # to create node on computation graph @@ -208,7 +208,7 @@ class ColoTracer(Tracer): self.proxy_cls = ColoProxy self.tracer_type = TracerType.META else: - raise ValueError(f"Unrecognised tracer type {tracer_type}") + raise ValueError(f"Unrecognized tracer type {tracer_type}") def _meta_data_computing(self, kind, target, args, kwargs): @@ -445,7 +445,7 @@ class ColoTracer(Tracer): @staticmethod def forward(ctx, run_function, preserve_rng_state, *args): - # signal that the current tracing occurs within activaton checkpoint part + # signal that the current tracing occurs within activation checkpoint part self.inside_torch_checkpoint_func = True out = run_function(*args) self.inside_torch_checkpoint_func = False diff --git a/colossalai/kernel/cuda_native/flash_attention.py b/colossalai/kernel/cuda_native/flash_attention.py index d793815ed..3db737450 100644 --- a/colossalai/kernel/cuda_native/flash_attention.py +++ b/colossalai/kernel/cuda_native/flash_attention.py @@ -138,7 +138,7 @@ if HAS_MEM_EFF_ATTN: elif attn_mask_type == AttnMaskType.causal: # gpt style attn_bias = LowerTriangularMask() - if bias is not None: # alibi / relative position emebedding + if bias is not None: # alibi / relative position embedding assert allow_alibi, "flash attention with bias is not supported in this system." assert attn_mask_type == AttnMaskType.causal, \ "attention with bias is only supported for causal attention so far." 
diff --git a/colossalai/kernel/cuda_native/multihead_attention.py b/colossalai/kernel/cuda_native/multihead_attention.py index 3b6470cdc..69246f2f3 100644 --- a/colossalai/kernel/cuda_native/multihead_attention.py +++ b/colossalai/kernel/cuda_native/multihead_attention.py @@ -43,7 +43,7 @@ class Config: attn_prob_dropout_ratio: float # attention score dropout ratio hidden_dropout_ratio: float # dropout ration before residual norm_first: bool # norm_first - fp16: bool # fp16 presion + fp16: bool # fp16 precision class MultiHeadAttention1DFunc(Function): diff --git a/colossalai/kernel/jit/option.py b/colossalai/kernel/jit/option.py index aa41f5767..e20c08b05 100644 --- a/colossalai/kernel/jit/option.py +++ b/colossalai/kernel/jit/option.py @@ -43,7 +43,7 @@ def warmup_jit_fusion(batch_size: int, seq_length: int = 512, vocab_size: int = 32768, dtype: torch.dtype = torch.float32): - """ Compilie JIT functions before the main training steps """ + """ Compile JIT functions before the main training steps """ embed = Embedding(vocab_size, hidden_size).to(get_current_device()) linear_1 = Linear(hidden_size, hidden_size * 4, skip_bias_add=True).to(get_current_device()) From dbb32692d293d865640d857e8ee16d653147f7bc Mon Sep 17 00:00:00 2001 From: Hongxin Liu Date: Mon, 5 Jun 2023 14:20:47 +0800 Subject: [PATCH 15/26] [lazy] refactor lazy init (#3891) * [lazy] remove old lazy init * [lazy] refactor lazy init folder structure * [lazy] fix lazy tensor deepcopy * [test] update lazy init test --- colossalai/lazy/__init__.py | 6 + .../experimental.py => lazy/lazy_init.py} | 9 +- colossalai/utils/model/lazy_init_context.py | 242 ------------------ colossalai/zero/gemini/gemini_ddp.py | 58 +++-- .../test_plugin/test_gemini_plugin.py | 2 +- .../lazy_init_utils.py | 10 +- .../test_distribute.py | 2 +- .../test_models.py | 0 tests/test_utils/test_lazy_init_ctx.py | 51 ---- 9 files changed, 56 insertions(+), 324 deletions(-) create mode 100644 colossalai/lazy/__init__.py rename colossalai/{utils/model/experimental.py => lazy/lazy_init.py} (98%) delete mode 100644 colossalai/utils/model/lazy_init_context.py rename tests/{test_utils/test_lazy_init => test_lazy}/lazy_init_utils.py (85%) rename tests/{test_utils/test_lazy_init => test_lazy}/test_distribute.py (97%) rename tests/{test_utils/test_lazy_init => test_lazy}/test_models.py (100%) delete mode 100644 tests/test_utils/test_lazy_init_ctx.py diff --git a/colossalai/lazy/__init__.py b/colossalai/lazy/__init__.py new file mode 100644 index 000000000..4387107bf --- /dev/null +++ b/colossalai/lazy/__init__.py @@ -0,0 +1,6 @@ +from .lazy_init import LazyInitContext, LazyTensor + +__all__ = [ + 'LazyInitContext', + 'LazyTensor', +] diff --git a/colossalai/utils/model/experimental.py b/colossalai/lazy/lazy_init.py similarity index 98% rename from colossalai/utils/model/experimental.py rename to colossalai/lazy/lazy_init.py index bf3e3d05b..c1fda3c53 100644 --- a/colossalai/utils/model/experimental.py +++ b/colossalai/lazy/lazy_init.py @@ -350,7 +350,14 @@ class LazyTensor(torch.Tensor): copied.requires_grad_() return copied - target = LazyTensor(factory_fn, meta_data=self._meta_data) + if self._materialized_data is not None: + # self is early materialized + copied = self._materialized_data.detach().clone() + if self.requires_grad: + copied.requires_grad_() + target = LazyTensor(lambda: None, concrete_data=copied) + else: + target = LazyTensor(factory_fn, meta_data=self._meta_data) memo[id(self)] = target return target diff --git 
a/colossalai/utils/model/lazy_init_context.py b/colossalai/utils/model/lazy_init_context.py deleted file mode 100644 index cf05f9660..000000000 --- a/colossalai/utils/model/lazy_init_context.py +++ /dev/null @@ -1,242 +0,0 @@ -#!/usr/bin/env python -# coding: utf-8 - -import inspect -import types -from typing import Callable, List - -import torch -import torch.nn as nn - -from colossalai.tensor import ColoParameter, ColoTensor -from colossalai.utils.model.utils import substitute_init_recursively - - -class LazyInitContext(): - """ - A context to allow for lazy weight initialization of PyTorch modules. It intercepts the tensor - initialization functions for lazy initialization - - Note: - This API is only experimental and subject to future changes. - - Usage: - with LazyInitContext() as ctx: - model = nn.Linear(10, 10) - model.weight.zero_() - - # make sure the weight is a meta tensor - assert model.weight.is_meta - - # initialize weights - ctx.lazy_init_parameters(model) - - # make sure the weight is not a meta tensor - # and initialized correctly - assert not model.weight.is_meta and torch.all(model.weight == 0) - - Args: - to_meta (bool): optional, whether to initialize the model with meta tensors, default is True. This - argument exists for now because some corner cases such as self.weight = torch.zeros(...) cannot be captured yet. - extra_torch_tensor_func (List[str]): extra torch tensor functions related - to value setting, such as `zero_` and `triu_`. `zero_` is pre-added by default. - """ - - tensor_set_value_func = ['zero_', 'fill_'] - - def __init__(self, to_meta: bool = True, extra_torch_tensor_func: List[str] = None): - # TODO: hijack the torch constructor functions as well - self._to_meta = to_meta - self._intercepted_nn_init_func_cache = {} - self._nn_init_methods = self._get_nn_init_methods() - self._torch_mod_cls = torch.nn.modules.module.Module - - if extra_torch_tensor_func: - # use tuple to remove duplicates - self._torch_tensor_funcs = tuple(self.tensor_set_value_func + extra_torch_tensor_func) - else: - self._torch_tensor_funcs = self.tensor_set_value_func - - @property - def to_meta(self): - return self._to_meta - - def _cache_init_func(self, func): - """ - This method wraps the ``torch.nn.init`` method and torch tensor value-setting functions - so that the function call is cached instead of being executed. - """ - - def wrapped_init_func(tensor, *args, **kwargs): - if tensor not in self._intercepted_nn_init_func_cache: - self._intercepted_nn_init_func_cache[tensor] = [] - self._intercepted_nn_init_func_cache[tensor].append((func, args, kwargs)) - - return wrapped_init_func - - def _get_nn_init_methods(self): - """ - This method looks for all available functions in the ``torch.nn.init`` - module. - """ - nn_init_method_names = dir(torch.nn.init) - nn_init_methods = [] - - # look for all methods in ``torch.nn.init`` module - for name in nn_init_method_names: - nn_init_methods.append((name, getattr(torch.nn.init, name))) - - def _is_init_method(item): - name, func = item - - if (not isinstance(func, types.FunctionType) or name.startswith('_') or not name.endswith('_')): - return False - else: - return True - - # remove methods which are not init functions - nn_init_methods = list(filter(_is_init_method, nn_init_methods)) - return nn_init_methods - - def _wrap_module_init(self, func): - """ - This method wraps the calls to the `__init__` of ``torch.nn.Module`` and replaces - the argument device with value 'meta' so that all modules are created as meta tensors. 
- """ - has_device = 'device' in inspect.signature(func).parameters - - def layer_lazy_init(module, *args, **kwargs): - # if this module contains device argument - # we set it to meta to initialize as meta backend - if has_device: - kwargs['device'] = 'meta' - func(module, *args, **kwargs) - - # if device is not found, we intialize it and convert to meta - if not has_device: - module.to('meta') - - return layer_lazy_init - - def _get_tmp_origin_func_ref(self, name): - """ - Generate a function name for consistency during caching and retrieving. - """ - return f'_orig_{name}' - - def _patch_nn_init_funcs(self): - # patch nn.init functions - for name, func in self._nn_init_methods: - setattr(torch.nn.init, name, self._cache_init_func(func)) - - def _unpatch_nn_init_funcs(self): - # unpatch nn.init functions - for name, func in self._nn_init_methods: - setattr(torch.nn.init, name, func) - - def _patch_submodule_init(self): - # patch classes __init__ methods - def _activate_wrap_init(cls): - cls.__orig_init__ = cls.__init__ - cls.__init__ = self._wrap_module_init(cls.__init__) - - substitute_init_recursively(self._torch_mod_cls, _activate_wrap_init, set()) - - def _unpatch_submodule_init(self): - - def _recover_orig_init(cls): - cls.__init__ = cls.__orig_init__ - - substitute_init_recursively(self._torch_mod_cls, _recover_orig_init, set()) - - def _patch_torch_tensor_funcs(self): - # patch tensor value-setting functions - for func_name in self._torch_tensor_funcs: - origin_func_name = self._get_tmp_origin_func_ref(func_name) - origin_func = getattr(torch.Tensor, func_name) - setattr(torch.Tensor, origin_func_name, origin_func) - setattr(torch.Tensor, func_name, self._cache_init_func(origin_func)) - - def _unpatch_torch_tensor_funcs(self): - for func_name in self._torch_tensor_funcs: - origin_func_name = self._get_tmp_origin_func_ref(func_name) - origin_func = getattr(torch.Tensor, origin_func_name) - setattr(torch.Tensor, func_name, origin_func) - - def __enter__(self): - self._patch_torch_tensor_funcs() - self._patch_nn_init_funcs() - - if self._to_meta: - self._patch_submodule_init() - return self - - def __exit__(self, *args, **kwargs): - if self._to_meta: - self._unpatch_submodule_init() - self._unpatch_nn_init_funcs() - self._unpatch_torch_tensor_funcs() - - def lazy_init_parameters(self, model: torch.nn.Module, device='cpu'): - """ - Initialize the weights of the meta-tensor model. - - Args: - model (`torch.nn.Module`): the model instantiated under the context. 
- device (str): the device on which weights are initialized - - """ - - def _init_recursively(module: nn.Module): - # recursively initialize the module - for mod in module.children(): - _init_recursively(mod) - - # initialize and shard tensors directly attached to the current module - for name, param in module.named_parameters(recurse=False): - _init_and_shard(module, name, param) - - for name, buf in module.named_buffers(recurse=False): - _init_and_shard(module, name, buf) - - @torch.no_grad() - def _init_and_shard(module, name, tensor): - # check whether the tensor is a buffer or parameter - is_param = isinstance(tensor, nn.parameter.Parameter) - - # get sharding spec - dist_spec = getattr(tensor, 'dist_spec', None) - pg = getattr(tensor, 'pg', None) - comp_spec = getattr(tensor, 'comp_spec', None) - - # convert the tensor from meta to materialized one - if tensor.is_meta: - materialized_tensor = torch.empty_like(tensor, device=device) - # if this tensor is a meta tensor, it must have an init function - assert tensor in self._intercepted_nn_init_func_cache - else: - materialized_tensor = tensor - - # apply init function - if tensor in self._intercepted_nn_init_func_cache: - init_func, args, kwargs = self._intercepted_nn_init_func_cache[tensor][-1] - init_func(materialized_tensor, *args, **kwargs) - - # convert it to ColoTensor or ColoParameter - if is_param: - tensor = ColoParameter.from_torch_tensor(materialized_tensor, requires_grad=tensor.requires_grad) - else: - tensor = ColoTensor.from_torch_tensor(materialized_tensor) - - # override the original tensor - with torch.no_grad(): - setattr(module, name, tensor) - - # apply sharding - if dist_spec: - tensor.process_group = pg - tensor.set_tensor_spec(dist_spec, comp_spec) - - _init_recursively(model) - - return model diff --git a/colossalai/zero/gemini/gemini_ddp.py b/colossalai/zero/gemini/gemini_ddp.py index 878c25be7..fd49362d6 100644 --- a/colossalai/zero/gemini/gemini_ddp.py +++ b/colossalai/zero/gemini/gemini_ddp.py @@ -2,13 +2,14 @@ import itertools from collections import OrderedDict from contextlib import nullcontext from functools import partial -from typing import Dict, Iterator, List, Optional, Union, Tuple, Set +from typing import Dict, Iterator, List, Optional, Set, Tuple, Union import torch import torch.distributed as dist import torch.nn as nn from colossalai.checkpoint_io.utils import calculate_tensor_size +from colossalai.lazy import LazyTensor from colossalai.logging import get_dist_logger from colossalai.nn.parallel.data_parallel import ColoDDP, _cast_float, free_storage from colossalai.tensor import ProcessGroup as ColoProcessGroup @@ -16,7 +17,6 @@ from colossalai.tensor import ReplicaSpec from colossalai.tensor.colo_parameter import ColoParameter, ColoTensor, ColoTensorSpec from colossalai.tensor.param_op_hook import ColoParamOpHookManager from colossalai.utils import get_current_device, is_ddp_ignored -from colossalai.utils.model.experimental import LazyTensor from .chunk import Chunk, ChunkManager, TensorState, init_chunk_manager from .gemini_hook import GeminiZeROHook @@ -96,34 +96,38 @@ class ZeroDDP(ColoDDP): param_name = m_name + '.' 
+ p_name if m_name else p_name self.name2param[param_name] = p_var super().__init__(module, process_group=ColoProcessGroup()) - self._non_persistent_buffers_set=self._get_non_persistent_buffers_set(module) + self._non_persistent_buffers_set = self._get_non_persistent_buffers_set(module) self._cast_buffers() - def _get_non_persistent_buffers_set(self, module, memo: Optional[Set[nn.Module]] = None, prefix: str = '', remove_duplicate: bool = True): + def _get_non_persistent_buffers_set(self, + module, + memo: Optional[Set[nn.Module]] = None, + prefix: str = '', + remove_duplicate: bool = True): + r""" + Args: + memo: a memo to store the set of modules already added to the result + prefix: a prefix that will be added to the name of the module + remove_duplicate: whether to remove the duplicated module instances in the result + or not + """ - r""" - Args: - memo: a memo to store the set of modules already added to the result - prefix: a prefix that will be added to the name of the module - remove_duplicate: whether to remove the duplicated module instances in the result - or not - """ - - if memo is None: - memo = set() - self_non_persistent_set = set() - if module not in memo: - if remove_duplicate: - memo.add(module) - self_non_persistent_set = set(map(lambda key: prefix + ('.' if prefix else '') + key, module._non_persistent_buffers_set)) - for name, sub_module in module._modules.items(): - if sub_module is None: - continue - submodule_prefix = prefix + ('.' if prefix else '') + name - child_non_persistent_set = self._get_non_persistent_buffers_set(sub_module, memo, submodule_prefix, remove_duplicate) - self_non_persistent_set = set.union(self_non_persistent_set, child_non_persistent_set) - return self_non_persistent_set - + if memo is None: + memo = set() + self_non_persistent_set = set() + if module not in memo: + if remove_duplicate: + memo.add(module) + self_non_persistent_set = set( + map(lambda key: prefix + ('.' if prefix else '') + key, module._non_persistent_buffers_set)) + for name, sub_module in module._modules.items(): + if sub_module is None: + continue + submodule_prefix = prefix + ('.' if prefix else '') + name + child_non_persistent_set = self._get_non_persistent_buffers_set(sub_module, memo, submodule_prefix, + remove_duplicate) + self_non_persistent_set = set.union(self_non_persistent_set, child_non_persistent_set) + return self_non_persistent_set def _post_forward(self): """This function is only triggered for inference. 
diff --git a/tests/test_booster/test_plugin/test_gemini_plugin.py b/tests/test_booster/test_plugin/test_gemini_plugin.py index c7b3676fb..d606d6d89 100644 --- a/tests/test_booster/test_plugin/test_gemini_plugin.py +++ b/tests/test_booster/test_plugin/test_gemini_plugin.py @@ -8,10 +8,10 @@ import colossalai from colossalai.booster import Booster from colossalai.booster.plugin import GeminiPlugin from colossalai.fx import is_compatible_with_meta +from colossalai.lazy.lazy_init import LazyInitContext from colossalai.nn.optimizer import HybridAdam from colossalai.tensor.colo_parameter import ColoParameter from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn -from colossalai.utils.model.experimental import LazyInitContext from colossalai.zero import ColoInitContext from tests.kit.model_zoo import model_zoo diff --git a/tests/test_utils/test_lazy_init/lazy_init_utils.py b/tests/test_lazy/lazy_init_utils.py similarity index 85% rename from tests/test_utils/test_lazy_init/lazy_init_utils.py rename to tests/test_lazy/lazy_init_utils.py index aa87d32a8..85bfd0e27 100644 --- a/tests/test_utils/test_lazy_init/lazy_init_utils.py +++ b/tests/test_lazy/lazy_init_utils.py @@ -1,12 +1,13 @@ import random +from copy import deepcopy from typing import Any, Callable, Optional, Tuple import numpy as np import torch from packaging import version +from colossalai.lazy.lazy_init import LazyInitContext, LazyTensor, _MyTensor from colossalai.tensor.d_tensor.layout_converter import to_global -from colossalai.utils.model.experimental import LazyInitContext, LazyTensor, _MyTensor from tests.kit.model_zoo.registry import ModelAttribute SUPPORT_LAZY = version.parse(torch.__version__) >= version.parse('1.12.0') @@ -31,6 +32,9 @@ def assert_model_equal(m1: torch.nn.Module, m2: torch.nn.Module) -> None: assert n1 == n2 assert torch.equal(t1, t2), f'{n1} {t1} vs {t2}' + for p1, p2 in zip(m1.parameters(), m2.parameters()): + assert p1.requires_grad == p2.requires_grad + def assert_forward_equal(m1: torch.nn.Module, m2: torch.nn.Module, data_gen_fn: Callable[[], dict], output_transform_fn: Callable[[Any], dict]) -> None: @@ -65,10 +69,14 @@ def check_lazy_init(entry: TestingEntry, seed: int = 42, verbose: bool = False, ctx = LazyInitContext() with ctx: deferred_model = model_fn() + copied_deferred_model = deepcopy(deferred_model) deferred_model = ctx.materialize(deferred_model, verbose=verbose) + copied_deferred_model = ctx.materialize(copied_deferred_model, verbose=verbose) assert_model_equal(model, deferred_model) + assert_model_equal(deferred_model, copied_deferred_model) if check_forward: assert_forward_equal(model, deferred_model, data_gen_fn, output_transform_fn) + assert_forward_equal(deferred_model, copied_deferred_model, data_gen_fn, output_transform_fn) if verbose: print(f'{model.__class__.__name__} pass') diff --git a/tests/test_utils/test_lazy_init/test_distribute.py b/tests/test_lazy/test_distribute.py similarity index 97% rename from tests/test_utils/test_lazy_init/test_distribute.py rename to tests/test_lazy/test_distribute.py index fd91e7e91..d515b175a 100644 --- a/tests/test_utils/test_lazy_init/test_distribute.py +++ b/tests/test_lazy/test_distribute.py @@ -12,7 +12,7 @@ from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn from colossalai.utils.common import print_rank_0 try: - from colossalai.utils.model.experimental import LazyInitContext, LazyTensor, _MyTensor + from colossalai.lazy.lazy_init import LazyInitContext, LazyTensor, _MyTensor except: pass 
from lazy_init_utils import SUPPORT_LAZY, assert_dist_model_equal, set_seed diff --git a/tests/test_utils/test_lazy_init/test_models.py b/tests/test_lazy/test_models.py similarity index 100% rename from tests/test_utils/test_lazy_init/test_models.py rename to tests/test_lazy/test_models.py diff --git a/tests/test_utils/test_lazy_init_ctx.py b/tests/test_utils/test_lazy_init_ctx.py deleted file mode 100644 index 97efb3367..000000000 --- a/tests/test_utils/test_lazy_init_ctx.py +++ /dev/null @@ -1,51 +0,0 @@ -import torch -from colossalai.utils.model.lazy_init_context import LazyInitContext -from torchvision.models import resnet34 -import random -import numpy as np - -MANUAL_SEED = 0 -random.seed(MANUAL_SEED) -np.random.seed(MANUAL_SEED) -torch.manual_seed(MANUAL_SEED) - - -def test_lazy_init_with_meta(): - ctx = LazyInitContext(to_meta=True) - with ctx: - model = resnet34(num_classes=10) - - for param in model.parameters(): - assert param.is_meta - for buffer in model.buffers(): - assert buffer.is_meta - - ctx.lazy_init_parameters(model) - - for name, param in model.named_parameters(): - assert not param.is_meta, name - - for buffer in model.buffers(): - assert not buffer.is_meta - - -def test_lazy_init_without_meta(): - ctx = LazyInitContext(to_meta=False) - with ctx: - model = resnet34(num_classes=10) - - for param in model.parameters(): - assert not param.is_meta - for buffer in model.buffers(): - assert not buffer.is_meta - - conv1_weight_before_init = model.conv1.weight.clone() - ctx.lazy_init_parameters(model) - conv1_weight_after_init = model.conv1.weight.clone() - - assert not torch.allclose(conv1_weight_after_init, conv1_weight_before_init) - - -if __name__ == '__main__': - test_lazy_init_with_meta() - test_lazy_init_without_meta() From 8065cc5fbac578c9578209cd6768c2242c9aa20a Mon Sep 17 00:00:00 2001 From: Liu Ziming <38985202+MaruyamaAya@users.noreply.github.com> Date: Mon, 5 Jun 2023 15:57:35 +0800 Subject: [PATCH 16/26] Modify torch version requirement to adapt torch 2.0 (#3896) --- colossalai/cli/launcher/run.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/colossalai/cli/launcher/run.py b/colossalai/cli/launcher/run.py index 027a10aa8..daa5107ca 100644 --- a/colossalai/cli/launcher/run.py +++ b/colossalai/cli/launcher/run.py @@ -154,7 +154,7 @@ def get_launch_command( extra_launch_args = dict() torch_version = version.parse(torch.__version__) - assert torch_version.major == 1 + assert torch_version.major >= 1 if torch_version.minor < 9: cmd = [ From 07cb21142fc1daaf1a402f827721d3fdeb56d075 Mon Sep 17 00:00:00 2001 From: jiangmingyan <1829166702@qq.com> Date: Mon, 5 Jun 2023 15:57:54 +0800 Subject: [PATCH 17/26] [doc]update moe chinese document. 
(#3890) * [doc]update-moe * [doc]update-moe * [doc]update-moe * [doc]update-moe * [doc]update-moe --- ...rate_mixture_of_experts_into_your_model.md | 1 + .../source/en/concepts/colossalai_overview.md | 4 +- ...rate_mixture_of_experts_into_your_model.md | 88 ++++++------------- .../zh-Hans/concepts/colossalai_overview.md | 4 +- 4 files changed, 33 insertions(+), 64 deletions(-) diff --git a/docs/source/en/advanced_tutorials/integrate_mixture_of_experts_into_your_model.md b/docs/source/en/advanced_tutorials/integrate_mixture_of_experts_into_your_model.md index d5edd135c..bfa5539fe 100644 --- a/docs/source/en/advanced_tutorials/integrate_mixture_of_experts_into_your_model.md +++ b/docs/source/en/advanced_tutorials/integrate_mixture_of_experts_into_your_model.md @@ -137,3 +137,4 @@ criterion = MoeLoss( Finally, just use trainer or engine in `colossalai` to do your training. Otherwise, you should take care of gradient by yourself. + diff --git a/docs/source/en/concepts/colossalai_overview.md b/docs/source/en/concepts/colossalai_overview.md index 38b682d49..7617c62a4 100644 --- a/docs/source/en/concepts/colossalai_overview.md +++ b/docs/source/en/concepts/colossalai_overview.md @@ -19,7 +19,7 @@ We aim to make Colossal-AI easy to use and non-intrusive to user code. There is 1. Prepare a configuration file where specifies the features you want to use and your parameters. 2. Initialize distributed backend with `colossalai.launch` -3. Inject the training features into your training components (e.g. model, optimizer) with `colossalai.initialize`. +3. Inject the training features into your training components (e.g. model, optimizer) with `colossalai.booster`. 4. Run training and testing We will cover the whole workflow in the `basic tutorials` section. @@ -34,3 +34,5 @@ The Colossal-AI system will be expanded to include more training skills, these n 4. expansion of existing parallelism methods We welcome ideas and contribution from the community and you can post your idea for future development in our forum. + + diff --git a/docs/source/zh-Hans/advanced_tutorials/integrate_mixture_of_experts_into_your_model.md b/docs/source/zh-Hans/advanced_tutorials/integrate_mixture_of_experts_into_your_model.md index 276fcc261..8ed9a1e43 100644 --- a/docs/source/zh-Hans/advanced_tutorials/integrate_mixture_of_experts_into_your_model.md +++ b/docs/source/zh-Hans/advanced_tutorials/integrate_mixture_of_experts_into_your_model.md @@ -9,44 +9,24 @@ - [Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity](https://arxiv.org/abs/2101.03961) - [Go Wider Instead of Deeper](https://arxiv.org/abs/2107.11817) -(中文版教程将会在近期提供) - ## Introduction -Since the advent of Switch Transformer, the AI community has found Mixture of Experts (MoE) a useful technique to enlarge the capacity of deep learning models. +自从`Switch Transformer`出现以来,人工智能社区发现专家混合 (MoE) 是一种扩大深度学习模型容量的有用技术。 +Colossal-AI 提供了专为MoE模型设计的并行性的早期访问版本。Colossal-AI中MoE最突出的优势就是方便。我们的目标是帮助我们的用户轻松地将MoE与模型并行性和数据并行性结合起来。 +但是,当前的实施现在有两个主要缺点。第一个缺点是它在大批量和长序列长度训练中效率低下。第二个缺点是与张量并行性不兼容。我们正在致力于系统优化,以克服训练效率问题。与张量并行的兼容性问题需要更多的适应,我们将在未来解决这个问题。 +在这里,我们将介绍如何使用具有模型并行性和数据并行性的 MoE。 -Colossal-AI provides an early access version of parallelism specifically designed for MoE models. -The most prominent advantage of MoE in Colossal-AI is convenience. -We aim to help our users to easily combine MoE with model parallelism and data parallelism. +## 目录 +在本教程中,我们将介绍: +1. [搭建MoE运行环境](#搭建moe运行环境) +2. [创建MoE层](#创建moe层) +3. 
[定义训练模型](#训练模型) -However, the current implementation has two main drawbacks now. -The first drawback is its poor efficiency in large batch size and long sequence length training. -The second drawback is incompatibility with tensor parallelism. -We are working on system optimization to overcome the training efficiency problem. -The compatibility problem with tensor parallelism requires more adaptation, and we will tackle this issue in the future. +我们提供[示例](https://github.com/hpcaitech/ColossalAI-Examples/tree/main/image/widenet), 详细介绍请参考 [ColossalAI-Examples](https://github.com/hpcaitech/ColossalAI-Examples). +该示例使用 [WideNet](https://arxiv.org/abs/2107.11817) 作为基于 MoE 的模型的示例. -Here, we will introduce how to use MoE with model parallelism and data parallelism. - -## Table of Content -In this tutorial we will cover: -1. Set up MoE running environment -2. Create MoE layer -3. Train your model - -We provided the [example code](https://github.com/hpcaitech/ColossalAI-Examples/tree/main/image/widenet) for this tutorial in [ColossalAI-Examples](https://github.com/hpcaitech/ColossalAI-Examples). -This example uses [WideNet](https://arxiv.org/abs/2107.11817) as an example of MoE-based model. - - -## Set up MoE running environment -In your project folder, create a `config.py`. - -This file is to specify some features you may want to use to train your model. -In order to enable MoE, you need to add a dict called parallel and specify the value of key moe. -You can assign a value for the key size of moe, which represents the model parallel size of experts (i.e. the number of experts in one group to parallelize training). - -For example, if the size is 4, 4 processes will be assigned to 4 consecutive GPUs and these 4 processes form a moe model parallel group. -Each process on the 4 GPUs will only get a portion of experts. Increasing the model parallel size will reduce communication cost, but increase computation cost in each GPU and activation cost in memory. -The total data parallel size is auto-detected and set as the number of GPUs by default. +## 搭建MoE运行环境 +在您的项目文件夹中,创建`config.py`文件。在该文件中,您可以指定希望用于训练模型的一些功能。为了启用 MoE,您需要在`config.py`中定义`parallel`字段,并指定`moe`的值。`moe`表示一组moe并行化训练组的并行大小。例如,`moe`设置为4,则4个进程将分配给4个连续的GPU,这4个进程组成一个moe模型并行组。每个进程只会得到一部分专家。增加mo e并行的大小将降低通信成本,但会增加每个GPU的计算成本和内存中activation的存储成本。总的数据并行的大小是自动检测的,默认情况下设置为GPU的数量。 ```python MOE_MODEL_PARALLEL_SIZE = ... @@ -55,37 +35,29 @@ parallel = dict( ) ``` -If `MOE_MODEL_PARALLEL_SIZE = E` and set the number of experts as `E` where `E` is a constant number, the process flow of forward pass of a transformer encoder in a model parallel group is shown below. +如果`MOE_MODEL_PARALLEL_SIZE = E`,即设置专家的总数为`E`(`E`为一个常数)。在模型并行中,transformer编码器中前向部分的处理流程如下图所示。
MoE Transformer, image source: GShard
-Since all experts are allocated to all GPUs in a model parallel group and a GPU only owns a portion of experts, -original data parallel groups are no longer correct for the parameters of experts during gradient handling in backward pass anymore. -So we create a new kind of parallel group called moe data parallel group. -The difference among different kinds of parallel group, when the configuration is set as `WORLD_SIZE=4`, -`MOE_MODEL_PARALLEL_SIZE=2`, is shown here. +所有专家都分配给模型并行组中的GPU,每一个GPU只拥有一部分专家,原始数据并行组在反向传递的梯度处理期间不再适用于专家参数。所以我们创建了一个新的并行组,叫做moe数据并行组。当配置设置为`WORLD_SIZE=4`,`MOE_MODEL_PARALLEL_SIZE=2`时,两个并行组的区别如下图所示。
-
MoE process group
+
MoE并行处理
+至于梯度处理,我们提供了`MoeGradientHandler`来all-reduce模型的每个参数。如果您使用`colossalai.initialize`函数创建您的训练引擎,MoE梯度处理程序将自动添加到您的引擎中。否则,你应该自己处理梯度。MoE运行环境的所有参数都保存在`colossalai.global_variables.moe_env`中。您可以访问您的配置参数来检查您的设置是否正确。 -As for gradient handling, we provide MoeGradientHandler to all-reduce every parameter of the model. -If you use `colossalai.initialize` function to create your training engine, the MoE gradient handler will be added to your engine automatically. -Otherwise, you should take care of gradient by yourself. -All parameters of MoE running environment are stored in colossalai.global_variables.moe_env. -You can access your configuration parameters to check whether your setup is correct. ```python from colossalai.global_variables import moe_env ``` -## Create MoE layer -You can create a MoE layer from `colossalai.nn.moe`. -But before doing that, you should set up random seeds for all processes like this. +## 创建MoE层 + +您可以从`colossalai.nn.moe`创建MoE层。但在此之前,您应该为所有进程设置随机种子。 ```python from colossalai.context.random import moe_set_seed @@ -95,10 +67,7 @@ moe_set_seed(42) model = Widenet(num_experts=4, capacity_factor=1.2) ``` -`moe_set_seed` will set different seed for different processes in a moe model parallel group. -This helps initialize parameters in experts. -Then create an instance of experts and an instance of router. -Here is the example in model zoo. +`moe_set_seed` 会为一个moe模型并行组中的不同进程设置不同的种子(这有助于在专家中初始化参数),创建一个专家实例和一个路由器实例,示例如下。 ```python from colossalai.nn.layer.moe import Experts, MoeLayer, Top2Router, NormalNoiseGenerator @@ -118,16 +87,11 @@ ffn=MoeLayer(dim_model=d_model, num_experts=num_experts, router=shared_router, experts=shared_experts) ``` -Inside the initialization of Experts, the local expert number of each GPU will be calculated automatically. You just need to specify the class of each expert and its parameters used in its initialization. As for routers, we have provided top1 router and top2 router. You can find them in colossalai.nn.layer.moe. After creating the instance of experts and router, the only thing initialized in Moelayer is gate module. More definitions of each class can be found in our API document and code. +在Experts的初始化中,会自动计算每个GPU的本地expert数量,您只需指定每个专家的类型及其在初始化时使用的参数。此外,我们提供了`Top1Router`和`Top2Router`,您可以在`colossalai.nn.layer.moe` 找到它们。在创建experts和router的实例时,`Moelayer`只初始化了`gate`模块,类型的更多详细信息您可以参考我们的API文档和代码。 +## 定义训练模型 -## Train Your Model -Do not to forget to use `colossalai.initialize` function in `colossalai` to add gradient handler for the engine. -We handle the back-propagation of MoE models for you. -In `colossalai.initialize`, we will automatically create a `MoeGradientHandler` object to process gradients. -You can find more information about the handler `MoeGradientHandler` in colossal directory. - -The loss criterion should be wrapped by `Moeloss` to add auxiliary loss of MoE. Example is like this. +使用colossalai中的`colossalai.initialize`函数为引擎添加梯度处理程序以处理 MoE模型的反向传播。在 `colossalai.initialize` 中,我们会自动创建一个`MoeGradientHandler`对象来处理梯度。您可以在colossal目录中找到有关`MoeGradientHandler`的更多信息。为了添加MoE的相关损失处理,损失函数应使用`Moeloss`封装,示例如下。 ```python criterion = MoeLoss( aux_weight=0.01, @@ -135,6 +99,6 @@ criterion = MoeLoss( label_smoothing=0.1 ) ``` +最后,您只需使用 `colossalai` 中的`trainer`或`engine`进行训练即可。 -Finally, just use trainer or engine in `colossalai` to do your training. -Otherwise, you should take care of gradient by yourself. 
+ diff --git a/docs/source/zh-Hans/concepts/colossalai_overview.md b/docs/source/zh-Hans/concepts/colossalai_overview.md index cfb35e59e..8b28baf8e 100755 --- a/docs/source/zh-Hans/concepts/colossalai_overview.md +++ b/docs/source/zh-Hans/concepts/colossalai_overview.md @@ -19,7 +19,7 @@ Colossal-AI 是一个集成的系统,为用户提供一套综合的训练方 1. 准备一个配置文件,指定您要使用的功能和参数。 2. 用 `colossalai.launch` 初始化分布式后端。 -3. 用 `colossalai.initialize` 将训练特征注入您的训练组件(如模型、优化器)中。 +3. 用 `colossalai.booster` 将训练特征注入您的训练组件(如模型、优化器)中。 4. 进行训练和测试. 我们将在`基本教程`部分介绍整个工作流程。 @@ -34,3 +34,5 @@ Colossal-AI 系统将会进一步拓展和优化,包括但不限于: 4. 拓展现有的并行方法 **我们始终欢迎社区的建议和讨论,如果您遇到任何问题,我们将非常愿意帮助您。您可以在GitHub 提 [issue](https://github.com/hpcaitech/ColossalAI/issues) ,或在[论坛](https://github.com/hpcaitech/ColossalAI/discussions)上创建一个讨论主题。** + + From ae02d4e4f70e8ba4f8ae1058ac48bd08b06b6d24 Mon Sep 17 00:00:00 2001 From: Hongxin Liu Date: Mon, 5 Jun 2023 15:58:31 +0800 Subject: [PATCH 18/26] [bf16] add bf16 support (#3882) * [bf16] add bf16 support for fused adam (#3844) * [bf16] fused adam kernel support bf16 * [test] update fused adam kernel test * [test] update fused adam test * [bf16] cpu adam and hybrid adam optimizers support bf16 (#3860) * [bf16] implement mixed precision mixin and add bf16 support for low level zero (#3869) * [bf16] add mixed precision mixin * [bf16] low level zero optim support bf16 * [text] update low level zero test * [text] fix low level zero grad acc test * [bf16] add bf16 support for gemini (#3872) * [bf16] gemini support bf16 * [test] update gemini bf16 test * [doc] update gemini docstring * [bf16] add bf16 support for plugins (#3877) * [bf16] add bf16 support for legacy zero (#3879) * [zero] init context support bf16 * [zero] legacy zero support bf16 * [test] add zero bf16 test * [doc] add bf16 related docstring for legacy zero --- .../mixed_precision_mixin/__init__.py | 9 ++ .../naive_amp/mixed_precision_mixin/base.py | 91 ++++++++++++ .../naive_amp/mixed_precision_mixin/bf16.py | 23 +++ .../naive_amp/mixed_precision_mixin/fp16.py | 84 +++++++++++ colossalai/booster/plugin/gemini_plugin.py | 9 +- .../booster/plugin/low_level_zero_plugin.py | 33 +++-- .../kernel/cuda_native/csrc/type_shim.h | 15 ++ colossalai/nn/optimizer/cpu_adam.py | 23 ++- colossalai/nn/optimizer/fused_adam.py | 4 +- colossalai/nn/optimizer/hybrid_adam.py | 37 ++--- colossalai/zero/gemini/gemini_ddp.py | 24 +++- colossalai/zero/gemini/gemini_optimizer.py | 92 ++++++------ .../zero/legacy/init_ctx/init_context.py | 11 +- .../zero/legacy/sharded_model/_utils.py | 10 +- .../legacy/sharded_model/sharded_model_v2.py | 7 +- .../legacy/sharded_optim/sharded_optim_v2.py | 39 ++++-- colossalai/zero/low_level/low_level_optim.py | 106 +++++++------- tests/test_optimizer/test_adam_kernel.py | 131 ++++++++++++++++++ tests/test_optimizer/test_adam_optim.py | 86 ++++++++++++ tests/test_optimizer/test_cpu_adam.py | 121 ---------------- tests/test_optimizer/test_fused_adam.py | 64 --------- .../test_optimizer/test_fused_adam_kernel.py | 95 ------------- tests/test_optimizer/test_hybrid_adam.py | 42 ------ tests/test_zero/test_gemini/test_optim.py | 46 ++++-- .../test_zero/test_legacy/test_zero_engine.py | 21 ++- .../test_zero/test_low_level/test_grad_acc.py | 5 +- .../test_zero/test_low_level/test_zero1_2.py | 35 +++-- 27 files changed, 738 insertions(+), 525 deletions(-) create mode 100644 colossalai/amp/naive_amp/mixed_precision_mixin/__init__.py create mode 100644 colossalai/amp/naive_amp/mixed_precision_mixin/base.py create mode 100644 
colossalai/amp/naive_amp/mixed_precision_mixin/bf16.py create mode 100644 colossalai/amp/naive_amp/mixed_precision_mixin/fp16.py create mode 100644 tests/test_optimizer/test_adam_kernel.py create mode 100644 tests/test_optimizer/test_adam_optim.py delete mode 100644 tests/test_optimizer/test_cpu_adam.py delete mode 100644 tests/test_optimizer/test_fused_adam.py delete mode 100644 tests/test_optimizer/test_fused_adam_kernel.py delete mode 100644 tests/test_optimizer/test_hybrid_adam.py diff --git a/colossalai/amp/naive_amp/mixed_precision_mixin/__init__.py b/colossalai/amp/naive_amp/mixed_precision_mixin/__init__.py new file mode 100644 index 000000000..b0348e147 --- /dev/null +++ b/colossalai/amp/naive_amp/mixed_precision_mixin/__init__.py @@ -0,0 +1,9 @@ +from .base import MixedPrecisionMixin +from .bf16 import BF16MixedPrecisionMixin +from .fp16 import FP16MixedPrecisionMixin + +__all__ = [ + 'MixedPrecisionMixin', + 'FP16MixedPrecisionMixin', + 'BF16MixedPrecisionMixin', +] diff --git a/colossalai/amp/naive_amp/mixed_precision_mixin/base.py b/colossalai/amp/naive_amp/mixed_precision_mixin/base.py new file mode 100644 index 000000000..a52a9747a --- /dev/null +++ b/colossalai/amp/naive_amp/mixed_precision_mixin/base.py @@ -0,0 +1,91 @@ +from abc import ABC, abstractmethod + +import torch +from torch import Tensor + + +class MixedPrecisionMixin(ABC): + """A helper class for mixed precision training. This mixin is used in mixed precision optimizers. + + Attributes: + dtype (torc.dtype): The expected dtype of the gradients. + + Examples: + ```python + class MyMixedPrecisionOptimizer(OptimizerWrapper): + def __init__(self, optim: Optimizer): + super().__init__(optim) + self.mixed_precision = MixedPrecisionMixin() + + def backward(self, loss): + loss = self.mixed_precision.pre_backward(loss) + loss.backward() + + def backward_by_grad(self, tensor, grad): + grad = self.mixed_precision.pre_backward_by_grad(tensor, grad) + tensor.backward(grad) + + def step(self): + if self.mixed_precision.should_skip_step(): + self.zero_grad() + return + div_scale = self.mixed_precision.get_grad_div_scale() + # maybe clip grad here + # maybe scale grad here + self.optim.step() + + def zero_grad(self): + self.mixed_precision.pre_zero_grad() + return self.optim.zero_grad() + ``` + """ + dtype: torch.dtype + + @abstractmethod + def pre_backward(self, loss: Tensor) -> Tensor: + """Called before backward. + + Args: + loss (Tensor): Loss value. + + Returns: + Tensor: Loss value (possibly scaled). + """ + pass + + @abstractmethod + def pre_backward_by_grad(self, tensor: Tensor, grad: Tensor) -> Tensor: + """Called before backward by grad. This is helpful for pipeline parallelism. + + Args: + tensor (Tensor): Tensor to backward. + grad (Tensor): Gradient of the tensor. + + Returns: + Tensor: Gradient of the tensor (possibly scaled). + """ + pass + + @abstractmethod + def should_skip_step(self) -> bool: + """Called before step. + + Returns: + bool: Whether to skip the step. + """ + pass + + @abstractmethod + def pre_zero_grad(self) -> None: + """Called before zero_grad. + """ + pass + + @abstractmethod + def get_grad_div_scale(self) -> float: + """Called before step or clip_grad. To keep computation efficiency, this method does not (maybe) unscale grads. + + Returns: + float: A divisor for gradient clipping or step. 
+ """ + pass diff --git a/colossalai/amp/naive_amp/mixed_precision_mixin/bf16.py b/colossalai/amp/naive_amp/mixed_precision_mixin/bf16.py new file mode 100644 index 000000000..9454f6eb8 --- /dev/null +++ b/colossalai/amp/naive_amp/mixed_precision_mixin/bf16.py @@ -0,0 +1,23 @@ +import torch +from torch import Tensor + +from .base import MixedPrecisionMixin + + +class BF16MixedPrecisionMixin(MixedPrecisionMixin): + dtype = torch.bfloat16 + + def pre_backward(self, loss: Tensor) -> Tensor: + return loss + + def pre_backward_by_grad(self, tensor: Tensor, grad: Tensor) -> Tensor: + return grad + + def should_skip_step(self) -> bool: + return False + + def pre_zero_grad(self) -> None: + pass + + def get_grad_div_scale(self) -> float: + return 1.0 diff --git a/colossalai/amp/naive_amp/mixed_precision_mixin/fp16.py b/colossalai/amp/naive_amp/mixed_precision_mixin/fp16.py new file mode 100644 index 000000000..1ce8e42eb --- /dev/null +++ b/colossalai/amp/naive_amp/mixed_precision_mixin/fp16.py @@ -0,0 +1,84 @@ +from abc import abstractmethod +from enum import Enum + +import torch +import torch.distributed as dist +from torch import Tensor + +from colossalai.amp.naive_amp.grad_scaler import DynamicGradScaler +from colossalai.utils import get_current_device + +from .base import MixedPrecisionMixin + + +class OptimState(Enum): + SCALED = 0 + UNSCALED = 1 + + +class FP16MixedPrecisionMixin(MixedPrecisionMixin): + dtype = torch.float16 + + def __init__(self, + initial_scale: float = 2**16, + min_scale: float = 1, + growth_factor: float = 2, + backoff_factor: float = 0.5, + growth_interval: int = 1000, + hysteresis: int = 2, + max_scale: float = 2**32) -> None: + super().__init__() + self.grad_scaler = DynamicGradScaler(initial_scale=initial_scale, + min_scale=min_scale, + growth_factor=growth_factor, + backoff_factor=backoff_factor, + growth_interval=growth_interval, + hysteresis=hysteresis, + max_scale=max_scale) + self.optim_state = OptimState.UNSCALED + self.found_overflow = torch.zeros(1, dtype=torch.float, device=get_current_device()) + + @property + def loss_scale(self) -> float: + return self.grad_scaler.scale.item() + + @abstractmethod + def check_local_overflow(self) -> bool: + """Check whether there is overflow in the local process. This method should be implemented by subclasses. + + Returns: + bool: Whether there is overflow in the local process. 
+ """ + pass + + def check_overflow(self) -> bool: + # clear previous overflow record + self.found_overflow.fill_(0.0) + if self.check_local_overflow(): + self.found_overflow.fill_(1.0) + dist.all_reduce(self.found_overflow, op=dist.ReduceOp.MAX) + return self.found_overflow.item() > 0 + + def pre_backward(self, loss: Tensor) -> Tensor: + loss = self.loss_scale * loss + self.optim_state = OptimState.SCALED + return loss + + def pre_backward_by_grad(self, tensor: Tensor, grad: Tensor) -> Tensor: + self.optim_state = OptimState.SCALED + return grad + + def should_skip_step(self) -> bool: + found_inf = self.check_overflow() + self.grad_scaler.update(found_inf) + if found_inf: + self.optim_state = OptimState.UNSCALED + return found_inf + + def pre_zero_grad(self) -> None: + pass + + def get_grad_div_scale(self) -> float: + assert self.optim_state == OptimState.SCALED, 'grads should be scaled before clipping' + self.optim_state = OptimState.UNSCALED + return self.loss_scale diff --git a/colossalai/booster/plugin/gemini_plugin.py b/colossalai/booster/plugin/gemini_plugin.py index adbf4803e..46714fe1c 100644 --- a/colossalai/booster/plugin/gemini_plugin.py +++ b/colossalai/booster/plugin/gemini_plugin.py @@ -23,6 +23,9 @@ from .dp_plugin_base import DPPluginBase __all__ = ['GeminiPlugin'] +SUPPORTED_PRECISION = ['fp16', 'bf16'] +PRECISION_STR_TO_DTYPE = {'fp16': torch.half, 'bf16': torch.bfloat16} + class GeminiCheckpointIO(GeneralCheckpointIO): @@ -171,6 +174,7 @@ class GeminiPlugin(DPPluginBase): Args: device (torch.device): device to place the model. placement_policy (str, optional): "cpu", "cuda", "auto". Defaults to "cpu". + precision (str, optional): precision. Support 'fp16' and 'bf16'. Defaults to 'fp16'. pin_memory (bool, optional): use pin memory on CPU. Defaults to False. force_outputs_fp32 (bool, optional): force outputs are fp32. Defaults to False. strict_ddp_mode (bool, optional): use strict ddp mode (only use dp without other parallelism). Defaults to False. 
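# A minimal usage sketch for the new ``precision`` option, assuming distributed
# initialization (e.g. ``colossalai.launch_from_torch``) has already been done and a
# CUDA build of Colossal-AI is installed. The linear model is a placeholder;
# ``Booster``/``HybridAdam`` are used as elsewhere in this series, but exact
# signatures should be checked against the installed version.
import torch.nn as nn
from colossalai.booster import Booster
from colossalai.booster.plugin import GeminiPlugin
from colossalai.nn.optimizer import HybridAdam

model = nn.Linear(32, 32)                                        # placeholder model
optimizer = HybridAdam(model.parameters(), lr=1e-3)
plugin = GeminiPlugin(placement_policy='auto', precision='bf16')  # bf16 instead of the fp16 default
booster = Booster(plugin=plugin)
model, optimizer, *_ = booster.boost(model, optimizer)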
@@ -203,6 +207,7 @@ class GeminiPlugin(DPPluginBase): self, device: Optional[torch.device] = None, placement_policy: str = "cpu", + precision: str = "fp16", pin_memory: bool = False, force_outputs_fp32: bool = False, strict_ddp_mode: bool = False, @@ -223,6 +228,7 @@ class GeminiPlugin(DPPluginBase): verbose: bool = False, ) -> None: super().__init__() + assert precision in SUPPORTED_PRECISION, f'precision {precision} is not supported' self.gemini_config = dict( device=(device or get_current_device()), placement_policy=placement_policy, @@ -233,6 +239,7 @@ class GeminiPlugin(DPPluginBase): hidden_dim=hidden_dim, min_chunk_size_mb=min_chunk_size_mb, memstats=memstats, + mixed_precision=PRECISION_STR_TO_DTYPE[precision], ) self.zero_optim_config = dict(gpu_margin_mem_ratio=gpu_margin_mem_ratio,) self.optim_kwargs = dict(initial_scale=initial_scale, @@ -253,7 +260,7 @@ class GeminiPlugin(DPPluginBase): return True def supported_precisions(self) -> List[str]: - return ['fp16'] + return SUPPORTED_PRECISION def control_device(self) -> bool: return True diff --git a/colossalai/booster/plugin/low_level_zero_plugin.py b/colossalai/booster/plugin/low_level_zero_plugin.py index 5d93cf0e3..2b312d0f9 100644 --- a/colossalai/booster/plugin/low_level_zero_plugin.py +++ b/colossalai/booster/plugin/low_level_zero_plugin.py @@ -1,4 +1,5 @@ import warnings +from functools import partial from typing import Callable, Iterator, List, Optional, Tuple, Union import torch @@ -20,12 +21,15 @@ from .torch_ddp_plugin import TorchDDPCheckpointIO __all__ = ['LowLevelZeroPlugin'] -def _convert_to_fp16(x): +def _convert_floating_point(x, dtype: torch.dtype = torch.float16): if isinstance(x, torch.Tensor) and torch.is_floating_point(x): - return x.half() + return x.to(dtype) return x +SUPPORTED_PRECISION = ['fp16', 'bf16', 'fp32'] + + class LowLevelZeroCheckpointIO(TorchDDPCheckpointIO): def save_unsharded_optimizer(self, optimizer: Optimizer, checkpoint: str, gather_dtensor: bool): @@ -49,17 +53,24 @@ class LowLevelZeroModel(ModelWrapper): def __init__(self, module: nn.Module, stage: int, precision: str) -> None: super().__init__(module) - self.convert_inputs = (precision == 'fp16') - module = zero_model_wrapper(module, zero_stage=stage) + self.dtype = None if precision == 'fp16': - module = module.half() + self.dtype = torch.float16 + elif precision == 'bf16': + self.dtype = torch.bfloat16 + module = zero_model_wrapper(module, zero_stage=stage) + if self.dtype is not None: + module = module.to(self.dtype) module = module.to(get_current_device()) self.module = module + self.convert_fn = None + if self.dtype is not None: + self.convert_fn = partial(_convert_floating_point, dtype=self.dtype) def forward(self, *args, **kwargs): - if self.convert_inputs: - args = tree_map(_convert_to_fp16, args) - kwargs = tree_map(_convert_to_fp16, kwargs) + if self.convert_fn is not None: + args = tree_map(self.convert_fn, args) + kwargs = tree_map(self.convert_fn, kwargs) return super().forward(*args, **kwargs) @@ -110,7 +121,7 @@ class LowLevelZeroPlugin(DPPluginBase): Args: strage (int, optional): ZeRO stage. Defaults to 1. - precision (str, optional): precision. Support 'fp16' and 'fp32'. Defaults to 'fp16'. + precision (str, optional): precision. Support 'fp16', 'bf16' and 'fp32'. Defaults to 'fp16'. initial_scale (float, optional): Initial scale used by DynamicGradScaler. Defaults to 2**32. min_scale (float, optional): Min scale used by DynamicGradScaler. Defaults to 1. 
growth_factor (float, optional): growth_factor used by DynamicGradScaler. Defaults to 2. @@ -149,7 +160,7 @@ class LowLevelZeroPlugin(DPPluginBase): ) -> None: super().__init__() assert stage in (1, 2), f'LowLevelZeroPlugin only supports stage 1/2 training' - assert precision in ('fp16', 'fp32'), f'LowLevelZeroPlugin only supports fp16/fp32 training' + assert precision in SUPPORTED_PRECISION, f'LowLevelZeroPlugin only supports {SUPPORTED_PRECISION} training' self.stage = stage self.precision = precision @@ -175,7 +186,7 @@ class LowLevelZeroPlugin(DPPluginBase): return True def supported_precisions(self) -> List[str]: - return ['fp16', 'fp32'] + return SUPPORTED_PRECISION def control_device(self) -> bool: return True diff --git a/colossalai/kernel/cuda_native/csrc/type_shim.h b/colossalai/kernel/cuda_native/csrc/type_shim.h index 2f180a778..03ccc0263 100644 --- a/colossalai/kernel/cuda_native/csrc/type_shim.h +++ b/colossalai/kernel/cuda_native/csrc/type_shim.h @@ -171,6 +171,21 @@ using g_scalar_t_##LEVEL = at::Half; \ using p_scalar_t_##LEVEL = at::Half; \ __VA_ARGS__; \ + } else if (GTYPE == at::ScalarType::Float && \ + PTYPE == at::ScalarType::BFloat16) { \ + using g_scalar_t_##LEVEL = float; \ + using p_scalar_t_##LEVEL = at::BFloat16; \ + __VA_ARGS__; \ + } else if (GTYPE == at::ScalarType::BFloat16 && \ + PTYPE == at::ScalarType::Float) { \ + using g_scalar_t_##LEVEL = at::BFloat16; \ + using p_scalar_t_##LEVEL = float; \ + __VA_ARGS__; \ + } else if (GTYPE == at::ScalarType::BFloat16 && \ + PTYPE == at::ScalarType::BFloat16) { \ + using g_scalar_t_##LEVEL = at::BFloat16; \ + using p_scalar_t_##LEVEL = at::BFloat16; \ + __VA_ARGS__; \ } else { \ AT_ERROR(#NAME, "not implemented for '", toString(GTYPE), toString(PTYPE), \ "'"); \ diff --git a/colossalai/nn/optimizer/cpu_adam.py b/colossalai/nn/optimizer/cpu_adam.py index bb561a106..7070c0a1e 100644 --- a/colossalai/nn/optimizer/cpu_adam.py +++ b/colossalai/nn/optimizer/cpu_adam.py @@ -93,8 +93,7 @@ class CPUAdam(NVMeOptimizer): bias_correction1, bias_correction2, use_adamw=False): - # FIXME(ver217): remove the below line when replace torch adam with fused adam - grad = grad.float() + grad = grad.to(data.dtype) if weight_decay != 0: if use_adamw: @@ -133,10 +132,12 @@ class CPUAdam(NVMeOptimizer): if len(state) == 0: state['step'] = 0 + # FIXME(ver217): CPU adam kernel only supports fp32 states now + assert p.dtype is torch.float, "CPUAdam only support fp32 parameters" # gradient momentums - state['exp_avg'] = torch.zeros_like(p, dtype=torch.float, device=target_device) + state['exp_avg'] = torch.zeros_like(p, device=target_device) # gradient variances - state['exp_avg_sq'] = torch.zeros_like(p, dtype=torch.float, device=target_device) + state['exp_avg_sq'] = torch.zeros_like(p, device=target_device) self._post_state_init(p) state['step'] += 1 @@ -147,9 +148,17 @@ class CPUAdam(NVMeOptimizer): assert state['exp_avg'].device.type == 'cpu', "exp_avg should stay on cpu" assert state['exp_avg_sq'].device.type == 'cpu', "exp_avg should stay on cpu" self._pre_update(p, 'exp_avg', 'exp_avg_sq') - self.cpu_adam_op.step(state['step'], group['lr'], beta1, beta2, group['eps'], group['weight_decay'], - group['bias_correction'], p.data, p.grad.data, state['exp_avg'], - state['exp_avg_sq'], div_scale) + if p.grad.dtype is torch.bfloat16: + # cpu adam kernel does not support bf16 now + bias_correction1 = 1 - beta1**state['step'] + bias_correction2 = 1 - beta2**state['step'] + self.torch_adam_update(p.data, p.grad.data, state['exp_avg'], 
state['exp_avg_sq'], group['lr'], + beta1, beta2, group['eps'], group['weight_decay'], bias_correction1, + bias_correction2, self.adamw_mode) + else: + self.cpu_adam_op.step(state['step'], group['lr'], beta1, beta2, group['eps'], + group['weight_decay'], group['bias_correction'], p.data, p.grad.data, + state['exp_avg'], state['exp_avg_sq'], div_scale) self._post_update(p, 'exp_avg', 'exp_avg_sq') elif target_device.type == 'cuda': assert div_scale == -1, "div_scale should remain default" diff --git a/colossalai/nn/optimizer/fused_adam.py b/colossalai/nn/optimizer/fused_adam.py index 987af8a96..82a6250f1 100644 --- a/colossalai/nn/optimizer/fused_adam.py +++ b/colossalai/nn/optimizer/fused_adam.py @@ -134,8 +134,8 @@ class FusedAdam(torch.optim.Optimizer): # Exponential moving average of squared gradient values state['exp_avg_sq'] = torch.zeros_like(p) - if p.dtype not in [torch.float16, torch.float32]: - raise RuntimeError('FusedAdam only support fp16 and fp32.') + if p.dtype not in [torch.float16, torch.float32, torch.bfloat16]: + raise RuntimeError('FusedAdam only support fp16, fp32 and bf16.') g_l.append(p.grad.data) p_l.append(p.data) diff --git a/colossalai/nn/optimizer/hybrid_adam.py b/colossalai/nn/optimizer/hybrid_adam.py index be6311c6c..526071b06 100644 --- a/colossalai/nn/optimizer/hybrid_adam.py +++ b/colossalai/nn/optimizer/hybrid_adam.py @@ -1,16 +1,17 @@ from typing import Any, Optional import torch +from torch.optim import Adam -from colossalai.kernel.op_builder import CPUAdamBuilder, FusedOptimBuilder +from colossalai.kernel.op_builder import FusedOptimBuilder from colossalai.registry import OPTIMIZERS from colossalai.utils import multi_tensor_applier -from .nvme_optimizer import NVMeOptimizer +from .cpu_adam import CPUAdam @OPTIMIZERS.register_module -class HybridAdam(NVMeOptimizer): +class HybridAdam(CPUAdam): """Implements Adam algorithm. Supports parameters updating on both GPU and CPU, depanding on the device of parameters. 
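# A small sketch of the behaviour this change targets, using only APIs shown in this
# series (a CUDA build of Colossal-AI is assumed): parameters stay in fp32, which the
# CPU kernel requires, while gradients may arrive in bf16, in which case the step
# falls back to the pure-PyTorch update instead of the fused CPU kernel. The
# grad-assignment trick mirrors the test added later in this patch, since PyTorch
# rejects grads whose dtype differs from the parameter's.
import torch
import torch.nn as nn
from colossalai.nn.optimizer import HybridAdam

p = nn.Parameter(torch.rand(64))        # fp32 master parameter on CPU
optim = HybridAdam([p], lr=1e-3)

bf16_grad = torch.rand(64, dtype=torch.bfloat16)
orig_data = p.data
p.data = bf16_grad                      # temporarily match dtypes so the assignment is accepted
p.grad = bf16_grad
p.data = orig_data                      # restore the fp32 payload

optim.step()                            # bf16 grad -> torch_adam_update fallback on CPU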
@@ -74,15 +75,9 @@ class HybridAdam(NVMeOptimizer): nvme_offload_dir: Optional[str] = None, **defaults: Any): - default_args = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, bias_correction=bias_correction) - super(HybridAdam, self).__init__(model_params, default_args, nvme_offload_fraction, nvme_offload_dir) - self.adamw_mode = adamw_mode - - # build during runtime if not found - cpu_optim = CPUAdamBuilder().load() + super().__init__(model_params, lr, bias_correction, betas, eps, weight_decay, adamw_mode, nvme_offload_fraction, + nvme_offload_dir) fused_optim = FusedOptimBuilder().load() - self.cpu_adam_op = cpu_optim.CPUAdamOptimizer(lr, betas[0], betas[1], eps, weight_decay, adamw_mode) - self.gpu_adam_op = fused_optim.multi_tensor_adam self._dummy_overflow_buf = torch.cuda.IntTensor([0]) @@ -108,10 +103,12 @@ class HybridAdam(NVMeOptimizer): if len(state) == 0: state['step'] = 0 + # FIXME(ver217): CPU adam kernel only supports fp32 states now + assert p.dtype is torch.float, "HybridAdam only support fp32 parameters" # gradient momentums - state['exp_avg'] = torch.zeros_like(p, dtype=torch.float, device=target_device) + state['exp_avg'] = torch.zeros_like(p, device=target_device) # gradient variances - state['exp_avg_sq'] = torch.zeros_like(p, dtype=torch.float, device=target_device) + state['exp_avg_sq'] = torch.zeros_like(p, device=target_device) self._post_state_init(p) state['step'] += 1 @@ -122,9 +119,17 @@ class HybridAdam(NVMeOptimizer): assert state['exp_avg'].device.type == 'cpu', "exp_avg should stay on cpu" assert state['exp_avg_sq'].device.type == 'cpu', "exp_avg should stay on cpu" self._pre_update(p, 'exp_avg', 'exp_avg_sq') - self.cpu_adam_op.step(state['step'], group['lr'], beta1, beta2, group['eps'], group['weight_decay'], - group['bias_correction'], p.data, p.grad.data, state['exp_avg'], - state['exp_avg_sq'], div_scale) + if p.grad.dtype is torch.bfloat16: + # cpu adam kernel does not support bf16 now + bias_correction1 = 1 - beta1**state['step'] + bias_correction2 = 1 - beta2**state['step'] + self.torch_adam_update(p.data, p.grad.data, state['exp_avg'], state['exp_avg_sq'], group['lr'], + beta1, beta2, group['eps'], group['weight_decay'], bias_correction1, + bias_correction2, self.adamw_mode) + else: + self.cpu_adam_op.step(state['step'], group['lr'], beta1, beta2, group['eps'], + group['weight_decay'], group['bias_correction'], p.data, p.grad.data, + state['exp_avg'], state['exp_avg_sq'], div_scale) self._post_update(p, 'exp_avg', 'exp_avg_sq') elif target_device.type == 'cuda': diff --git a/colossalai/zero/gemini/gemini_ddp.py b/colossalai/zero/gemini/gemini_ddp.py index fd49362d6..7e23fdb42 100644 --- a/colossalai/zero/gemini/gemini_ddp.py +++ b/colossalai/zero/gemini/gemini_ddp.py @@ -51,6 +51,7 @@ class ZeroDDP(ColoDDP): strict_ddp_mode (bool): If set to True, there is no tensor sharding, each tensor is replicated. Defaults to False. Users can set it to True, when they clearly know that they only need DDP. scatter_after_inference (bool): If set to True, the model will be scattered after inference. This will save memory but slow down the consecutive inference. + mixed_precision (torch.dtype): If set to torch.float16, the model will be trained in fp16. Otherwise, the model will be trained in bf16. Defaults to torch.float16. 
""" def __init__(self, @@ -59,7 +60,9 @@ class ZeroDDP(ColoDDP): pin_memory: bool = False, force_outputs_fp32: bool = False, strict_ddp_mode: bool = False, - scatter_after_inference: bool = True) -> None: + scatter_after_inference: bool = True, + mixed_precision: torch.dtype = torch.float16) -> None: + assert mixed_precision in (torch.float16, torch.bfloat16) self.gemini_manager = gemini_manager self.chunk_manager: ChunkManager = gemini_manager.chunk_manager self.force_outputs_fp32 = force_outputs_fp32 @@ -71,6 +74,7 @@ class ZeroDDP(ColoDDP): self.param2name: Dict[nn.Parameter, str] = dict() self.name2param: Dict[str, nn.Parameter] = dict() self.scatter_after_inference = scatter_after_inference + self.mixed_precision = mixed_precision self._logger = get_dist_logger() @@ -151,7 +155,7 @@ class ZeroDDP(ColoDDP): assert not self.gemini_manager.need_warmup or not self.gemini_manager.is_warmup( ), "You should run a completed iteration as your warmup iter" - args, kwargs = _cast_float(args, torch.half), _cast_float(kwargs, torch.half) + args, kwargs = _cast_float(args, self.mixed_precision), _cast_float(kwargs, self.mixed_precision) self.module.zero_grad(set_to_none=True) if not grad_flag: outputs = self._inference_forward(*args, **kwargs) @@ -570,14 +574,14 @@ class ZeroDDP(ColoDDP): # move ignored parameters to CUDA if is_ddp_ignored(p): - p.data = p.data.to(device=get_current_device(), dtype=torch.float16) + p.data = p.data.to(device=get_current_device(), dtype=self.mixed_precision) continue # create a fp32 parameter fp32_data = p.data.float() fp32_p = ColoTensor(fp32_data, spec=ColoTensorSpec(p.process_group)) # create a fp16 parameter - p.data = p.data.half() + p.data = p.data.to(self.mixed_precision) # register the fp16 parameter and fp32 parameter in the chunk manager dp_world_size = p.process_group.dp_world_size() @@ -613,7 +617,7 @@ class ZeroDDP(ColoDDP): buffer.materialize() buffer.data = buffer.cuda() if torch.is_floating_point(buffer): - buffer.data = buffer.half() + buffer.data = buffer.to(self.mixed_precision) def _preprocess_param(self, p: Union[nn.Parameter, ColoParameter, 'LazyTensor']) -> None: """Convert parameter to ColoParameter in-place. @@ -736,6 +740,7 @@ class GeminiDDP(ZeroDDP): hidden_dim: Optional[int] = None, min_chunk_size_mb: float = 32, memstats: Optional[MemStats] = None, + mixed_precision: torch.dtype = torch.float16, verbose: bool = False) -> None: """ A torch.Module wrapper using ZeRO-DP and Gemini. 
@@ -776,5 +781,10 @@ class GeminiDDP(ZeroDDP): strict_ddp_flag=strict_ddp_mode, verbose=verbose) gemini_manager = GeminiManager(placement_policy, chunk_manager, memstats) - super().__init__(module, gemini_manager, pin_memory, force_outputs_fp32, strict_ddp_mode, - scatter_after_inference) + super().__init__(module, + gemini_manager, + pin_memory, + force_outputs_fp32, + strict_ddp_mode, + scatter_after_inference, + mixed_precision=mixed_precision) diff --git a/colossalai/zero/gemini/gemini_optimizer.py b/colossalai/zero/gemini/gemini_optimizer.py index 71c4f65cb..267deb1e8 100644 --- a/colossalai/zero/gemini/gemini_optimizer.py +++ b/colossalai/zero/gemini/gemini_optimizer.py @@ -1,7 +1,6 @@ # this code is inspired by the DeepSpeed library and implemented with our own design from scratch import math import warnings -from enum import Enum from typing import Any, Dict, Set, Tuple import torch @@ -9,7 +8,7 @@ import torch.distributed as dist from torch.nn import Parameter from torch.optim import Optimizer -from colossalai.amp.naive_amp.grad_scaler import DynamicGradScaler +from colossalai.amp.naive_amp.mixed_precision_mixin import BF16MixedPrecisionMixin, FP16MixedPrecisionMixin from colossalai.logging import get_dist_logger from colossalai.nn.optimizer import ColossalaiOptimizer, CPUAdam, FusedAdam, HybridAdam from colossalai.utils import disposable, get_current_device, is_ddp_ignored @@ -22,9 +21,26 @@ __all__ = ['ZeroOptimizer', 'GeminiAdamOptimizer'] _AVAIL_OPTIM_LIST = {FusedAdam, CPUAdam, HybridAdam} -class OptimState(Enum): - SCALED = 0 - UNSCALED = 1 +class GeminiFP16MixedPrecisionMixin(FP16MixedPrecisionMixin): + + def __init__(self, + module: ZeroDDP, + initial_scale: float = 2**16, + min_scale: float = 1, + growth_factor: float = 2, + backoff_factor: float = 0.5, + growth_interval: int = 1000, + hysteresis: int = 2, + max_scale: float = 2**32) -> None: + super().__init__(initial_scale, min_scale, growth_factor, backoff_factor, growth_interval, hysteresis, + max_scale) + self.module = module + + def check_local_overflow(self) -> bool: + return self.module.overflow_counter > 0 + + def pre_zero_grad(self) -> None: + self.module.overflow_counter = 0 class ZeroOptimizer(ColossalaiOptimizer): @@ -79,7 +95,6 @@ class ZeroOptimizer(ColossalaiOptimizer): self.module = module self.gemini_manager = module.gemini_manager self.chunk_manager: ChunkManager = self.gemini_manager.chunk_manager - self.optim_state = OptimState.UNSCALED self.param_to_range: Dict[Parameter, Tuple[int, int]] = dict() self.param_to_chunk32: Dict[Parameter, Chunk] = dict() self.chunk16_set: Set[Chunk] = set() @@ -107,15 +122,20 @@ class ZeroOptimizer(ColossalaiOptimizer): self.__init__optimizer() - # Grad scaler - self.grad_scaler = DynamicGradScaler(initial_scale=initial_scale, - min_scale=min_scale, - growth_factor=growth_factor, - backoff_factor=backoff_factor, - growth_interval=growth_interval, - hysteresis=hysteresis, - max_scale=max_scale) - self._found_overflow: torch.Tensor = torch.zeros(1, dtype=torch.int64, device=get_current_device()) + if module.mixed_precision is torch.float16: + self.mix_precision_mixin = GeminiFP16MixedPrecisionMixin(module, + initial_scale=initial_scale, + min_scale=min_scale, + growth_factor=growth_factor, + backoff_factor=backoff_factor, + growth_interval=growth_interval, + hysteresis=hysteresis, + max_scale=max_scale) + elif module.mixed_precision is torch.bfloat16: + self.mix_precision_mixin = BF16MixedPrecisionMixin() + else: + raise RuntimeError(f"Unsupported mixed precision type: 
{module.mixed_precision}") + self._logger = get_dist_logger() self.gpu_margin_mem_ratio: float = float(gpu_margin_mem_ratio) @@ -151,15 +171,6 @@ class ZeroOptimizer(ColossalaiOptimizer): for chunk16 in self.chunk16_set: chunk16.optim_update() - def _check_overflow(self): - # clear previous overflow record - self._found_overflow.fill_(self.module.overflow_counter) - - # all-reduce across global group - dist.all_reduce(self._found_overflow) - - return self._found_overflow.item() > 0 - def _clear_global_norm(self) -> None: for c16 in self.chunk16_set: c16.l2_norm = None @@ -190,40 +201,25 @@ class ZeroOptimizer(ColossalaiOptimizer): return global_norm def _get_combined_scale(self): - loss_scale = 1 + div_scale = self.mix_precision_mixin.get_grad_div_scale() - if self.optim_state == OptimState.SCALED: - loss_scale = self.loss_scale - self.optim_state = OptimState.UNSCALED - - combined_scale = loss_scale if self.clipping_flag: total_norm = self._calc_global_norm() - clip = ((total_norm / loss_scale) + 1e-6) / self.max_norm + clip = ((total_norm / div_scale) + 1e-6) / self.max_norm if clip > 1: - combined_scale = clip * loss_scale + div_scale = clip * div_scale - if combined_scale == 1: - return -1 - else: - return combined_scale - - @property - def loss_scale(self): - return self.grad_scaler.scale.item() + return -1 if div_scale == 1.0 else div_scale def zero_grad(self, *args, **kwargs): - self.module.overflow_counter = 0 + self.mix_precision_mixin.pre_zero_grad() return self.optim.zero_grad(set_to_none=True) def step(self, *args, **kwargs): self._maybe_move_fp32_params() self._set_grad_ptr() - found_inf = self._check_overflow() - if found_inf: - self.optim_state = OptimState.UNSCALED # no need to unscale grad - self.grad_scaler.update(found_inf) # update gradient scaler + if self.mix_precision_mixin.should_skip_step(): if self.verbose: self._logger.info(f'Found overflow. Skip step') self._clear_global_norm() # clear recorded norm @@ -234,7 +230,6 @@ class ZeroOptimizer(ColossalaiOptimizer): # get combined scale. 
combined scale = loss scale * clipping norm # so that gradient = gradient / combined scale combined_scale = self._get_combined_scale() - self.grad_scaler.update(found_inf) ret = self.optim.step(div_scale=combined_scale, *args, **kwargs) self._register_states() @@ -246,8 +241,7 @@ class ZeroOptimizer(ColossalaiOptimizer): raise NotImplementedError def backward(self, loss: torch.Tensor): - loss = self.loss_scale * loss - self.optim_state = OptimState.SCALED + loss = self.mix_precision_mixin.pre_backward(loss) self.module.backward(loss) def backward_by_grad(self, tensor: torch.Tensor, grad: torch.Tensor): @@ -255,7 +249,7 @@ class ZeroOptimizer(ColossalaiOptimizer): # It receives the scaled grad from the previous rank # No need to scale the grad again # Need to unscale when optimizing - self.optim_state = OptimState.SCALED + grad = self.mix_precision_mixin.pre_backward_by_grad(grad) self.module.backward_by_grad(tensor, grad) def _maybe_move_fp32_params(self): diff --git a/colossalai/zero/legacy/init_ctx/init_context.py b/colossalai/zero/legacy/init_ctx/init_context.py index a921ca0aa..a3fa46b38 100644 --- a/colossalai/zero/legacy/init_ctx/init_context.py +++ b/colossalai/zero/legacy/init_ctx/init_context.py @@ -14,7 +14,7 @@ from colossalai.core import global_context as gpc from colossalai.logging import get_dist_logger from colossalai.utils.model.utils import InsertPostInitMethodToModuleSubClasses from colossalai.zero.legacy.shard_utils import BaseShardStrategy -from colossalai.zero.legacy.sharded_model._utils import cast_tensor_to_fp16 +from colossalai.zero.legacy.sharded_model._utils import cast_tensor_to_bf16, cast_tensor_to_fp16 from colossalai.zero.legacy.sharded_model.sharded_model_v2 import ShardedModelV2 from colossalai.zero.legacy.sharded_param import ShardedParamV2 @@ -55,6 +55,7 @@ class ZeroInitContext(InsertPostInitMethodToModuleSubClasses): seed (int, optional): Random seed for weight initialization shard_param (bool, optional): Is param sharded after exiting the context. Defaults to False. default_dtype (torch.dtype, optional): If it's not None, parameters will be initialized as ``default_dtype`` then converted to fp16. + bf16 (bool, optional): If it's True, parameters will be initialized as ``torch.bfloat16``. Otherwise, parameters will be initialized as ``torch.float16``. Defaults to False. model_numel_tensor (torch.Tensor, optional): A tensor which will store the number of elements of model. Defaults to torch.zeros(1, dtype=torch.int). """ @@ -64,6 +65,7 @@ class ZeroInitContext(InsertPostInitMethodToModuleSubClasses): seed: int = 2**10 - 1, shard_param: bool = False, default_dtype: Optional[torch.dtype] = None, + bf16: bool = False, model_numel_tensor: torch.Tensor = torch.zeros(1, dtype=torch.long)): super().__init__(default_dtype=default_dtype) @@ -71,6 +73,7 @@ class ZeroInitContext(InsertPostInitMethodToModuleSubClasses): self.param_list = [] self.model_numel_tensor = model_numel_tensor self.seed = seed + self.bf16 = bf16 self.dp_process_group = gpc.get_group(ParallelMode.DATA) self.config = ZeroContextConfig(target_device=target_device, is_replicated=True, shard_param=shard_param) @@ -183,9 +186,10 @@ class ZeroInitContext(InsertPostInitMethodToModuleSubClasses): NOTE() The module may be passed to this function multiple times. 
""" self.top_module = module + half_dtype = torch.float16 if not self.bf16 else torch.bfloat16 def half_fn(t: torch.Tensor): - return t.half() if t.is_floating_point() else t + return t.to(half_dtype) if t.is_floating_point() else t for param in module.parameters(recurse=False): # avoid adapting a param to ShardedParam twice @@ -226,9 +230,10 @@ class ZeroInitContext(InsertPostInitMethodToModuleSubClasses): # We must cast buffers # If we use BN, buffers may be on CPU and Float # We must cast them + cast_fn = cast_tensor_to_fp16 if not self.bf16 else cast_tensor_to_bf16 for buffer in module.buffers(recurse=False): buffer.data = buffer.data.to(device=torch.cuda.current_device()) - buffer.data = cast_tensor_to_fp16(buffer.data) + buffer.data = cast_fn(buffer.data) class ZeroContextMgr(metaclass=SingletonMeta): diff --git a/colossalai/zero/legacy/sharded_model/_utils.py b/colossalai/zero/legacy/sharded_model/_utils.py index 2bd01531a..f1d642cf3 100644 --- a/colossalai/zero/legacy/sharded_model/_utils.py +++ b/colossalai/zero/legacy/sharded_model/_utils.py @@ -43,11 +43,19 @@ def cast_tensor_to_fp32(tensor: Union[torch.Tensor, StatefulTensor]) -> torch.Te if isinstance(tensor, StatefulTensor): tensor = tensor.payload - if torch.is_floating_point(tensor) and tensor.dtype is torch.float16: + if torch.is_floating_point(tensor) and tensor.dtype in (torch.float16, torch.bfloat16): return tensor.float() return tensor +def cast_tensor_to_bf16(tensor: torch.Tensor) -> torch.Tensor: + if isinstance(tensor, StatefulTensor): + tensor = tensor.payload + if torch.is_floating_point(tensor) and tensor.dtype is torch.float32: + return tensor.bfloat16() + return tensor + + def apply_to_tensors(x: Any, fn: Callable): if torch.is_tensor(x): return fn(x) diff --git a/colossalai/zero/legacy/sharded_model/sharded_model_v2.py b/colossalai/zero/legacy/sharded_model/sharded_model_v2.py index b3a83b741..be3842beb 100644 --- a/colossalai/zero/legacy/sharded_model/sharded_model_v2.py +++ b/colossalai/zero/legacy/sharded_model/sharded_model_v2.py @@ -28,6 +28,7 @@ from colossalai.zero.legacy.sharded_model.reduce_scatter import ReduceScatterBuc from ._utils import ( cast_float_arguments, + cast_tensor_to_bf16, cast_tensor_to_fp16, cast_tensor_to_fp32, chunk_and_pad, @@ -74,6 +75,7 @@ class ShardedModelV2(nn.Module): In this mode, grad will be fp16. Make sure your optimizer supports mixed precision (fp32 param and fp16 grad). We find that PyTorch's optimizers don't support mixed precision, so we recommend you enable this only when using our CPUAdam with CPU offload. Defaults to False. + bf16 (bool, optional): Whether to use bfloat16 for param and grad. Defaults to False. """ def __init__(self, @@ -86,11 +88,13 @@ class ShardedModelV2(nn.Module): tensor_placement_policy: str = 'cuda', gradient_predivide_factor: Optional[float] = 1.0, reuse_fp16_shard: bool = False, + bf16: bool = False, *args, **kwargs): assert not isinstance(module, ShardedModelV2), 'Nested ShardedModelV2 is not supported.' 
super().__init__() self.logger = get_dist_logger() + self.bf16 = bf16 # We force users to use ZeroInitContext for submodule in module.modules(): @@ -232,7 +236,8 @@ class ShardedModelV2(nn.Module): def forward(self, *args: Any, **kwargs: Any) -> torch.Tensor: self._pre_forward_operations(*args) - args, kwargs = cast_float_arguments(cast_tensor_to_fp16, *args, **kwargs) + cast_fn = cast_tensor_to_bf16 if self.bf16 else cast_tensor_to_fp16 + args, kwargs = cast_float_arguments(cast_fn, *args, **kwargs) outputs = self.module(*args, **kwargs) self._post_forward_operations() return outputs diff --git a/colossalai/zero/legacy/sharded_optim/sharded_optim_v2.py b/colossalai/zero/legacy/sharded_optim/sharded_optim_v2.py index be60209af..41dd174cb 100644 --- a/colossalai/zero/legacy/sharded_optim/sharded_optim_v2.py +++ b/colossalai/zero/legacy/sharded_optim/sharded_optim_v2.py @@ -94,6 +94,7 @@ class ShardedOptimizerV2(ColossalaiOptimizer): super().__init__(optimizer) self.shard_strategy = sharded_model.shard_strategy self.model: ShardedModelV2 = sharded_model + self.bf16 = sharded_model.bf16 self.gpu_margin_mem_ratio: float = float(gpu_margin_mem_ratio) assert 0.0 <= self.gpu_margin_mem_ratio <= 1.0, f'gpu_margin_mem_ratio must >=0.0 and <=1.0' @@ -117,6 +118,7 @@ class ShardedOptimizerV2(ColossalaiOptimizer): self._found_overflow: Tensor = torch.IntTensor([0]).to(torch.cuda.current_device()) self._logger = get_dist_logger("ShardedOptimizerV2") self._verbose = verbose + self._grad_prepared: bool = False # this should be set to true when _prepare_grads() and reset to false when backward # Store fp32 param shards self._register_master_weight() @@ -166,8 +168,10 @@ class ShardedOptimizerV2(ColossalaiOptimizer): self._zero_grad() def backward(self, loss: Tensor) -> None: - loss = self.loss_scale * loss - self.optim_state = OptimState.SCALED + if not self.bf16: + loss = self.loss_scale * loss + self.optim_state = OptimState.SCALED + self._grad_prepared = False self.model.backward(loss) def backward_by_grad(self, tensor: Tensor, grad: Tensor) -> None: @@ -175,30 +179,33 @@ class ShardedOptimizerV2(ColossalaiOptimizer): # It receives the scaled grad from the previous rank # No need to scale the grad again # Need to unscale when optimizing - self.optim_state = OptimState.SCALED + if not self.bf16: + self.optim_state = OptimState.SCALED + self._grad_prepared = False self.model.backward_by_grad(tensor, grad) def clip_grad_norm(self, model: nn.Module, max_norm: float): - if self.optim_state == OptimState.SCALED: - self._prepare_grads() + self._prepare_grads() + if not self.bf16 and self.optim_state == OptimState.SCALED: self._unscale_grads() return super().clip_grad_norm(model, max_norm) def step(self, *args, **kwargs): + self._prepare_grads() # unscale grads if scaled - if self.optim_state == OptimState.SCALED: - self._prepare_grads() + if not self.bf16 and self.optim_state == OptimState.SCALED: self._unscale_grads() self._maybe_move_fp32_shards() - found_inf = self._check_overflow() - self.grad_scaler.update(found_inf) + if not self.bf16: + found_inf = self._check_overflow() + self.grad_scaler.update(found_inf) - if found_inf: - self._logger.warning('found inf during ShardedOptimV2 step') - self._zero_grad(recover_data=True) - return + if found_inf: + self._logger.warning('found inf during ShardedOptimV2 step') + self._zero_grad(recover_data=True) + return self._point_param_fp16_to_master_param() @@ -304,6 +311,8 @@ class ShardedOptimizerV2(ColossalaiOptimizer): state[k] = v.cuda() def 
_prepare_grads(self): + if self._grad_prepared: + return for group in self.optim.param_groups: for p in group['params']: if p.colo_attr.saved_grad.is_null(): @@ -320,6 +329,7 @@ class ShardedOptimizerV2(ColossalaiOptimizer): p.grad = p.colo_attr.grad_payload # Set p.data to empty tensor, in case of memory leaking p.colo_attr.set_data_none() + self._grad_prepared = True def _point_param_fp16_to_master_param(self): # assign master param pointers to p.data. @@ -357,7 +367,8 @@ class ShardedOptimizerV2(ColossalaiOptimizer): torch.empty(p.data.shape, dtype=p.colo_attr.data_payload.dtype, device=p.colo_attr.data_payload.device)) # TODO() optimize this line CPU (fp32) -> GPU (fp16) - p.colo_attr.sharded_data_tensor.payload_copy(p.half().detach()) + half_dtype = torch.bfloat16 if self.bf16 else torch.float16 + p.colo_attr.sharded_data_tensor.payload_copy(p.to(half_dtype).detach()) p.colo_attr.set_data_none() if p.colo_attr.keep_not_shard and p.colo_attr.is_replicated: diff --git a/colossalai/zero/low_level/low_level_optim.py b/colossalai/zero/low_level/low_level_optim.py index 3e7661eca..d4d03e5b5 100644 --- a/colossalai/zero/low_level/low_level_optim.py +++ b/colossalai/zero/low_level/low_level_optim.py @@ -6,7 +6,11 @@ import torch import torch.distributed as dist from torch.optim import Optimizer -from colossalai.amp.naive_amp.grad_scaler import DynamicGradScaler +from colossalai.amp.naive_amp.mixed_precision_mixin import ( + BF16MixedPrecisionMixin, + FP16MixedPrecisionMixin, + MixedPrecisionMixin, +) from colossalai.context import ParallelMode from colossalai.core import global_context as gpc from colossalai.logging import get_dist_logger @@ -27,6 +31,31 @@ from ._utils import ( from .bookkeeping import BucketStore, GradientStore, ParameterStore, TensorBucket +class LowLevelZeroFP16MixedPrecisionMixin(FP16MixedPrecisionMixin): + + def __init__(self, + num_working_param_groups: int, + grad_store: GradientStore, + initial_scale: float = 2**16, + min_scale: float = 1, + growth_factor: float = 2, + backoff_factor: float = 0.5, + growth_interval: int = 1000, + hysteresis: int = 2, + max_scale: float = 2**32) -> None: + super().__init__(initial_scale, min_scale, growth_factor, backoff_factor, growth_interval, hysteresis, + max_scale) + self.num_working_param_groups = num_working_param_groups + self.grad_store = grad_store + + def check_local_overflow(self) -> bool: + for group_id in range(self.num_working_param_groups): + for avg_grad in self.grad_store.get_averaged_gradients_by_group(group_id): + if avg_grad is not None and has_inf_or_nan(avg_grad): + return True + return False + + class LowLevelZeroOptimizer(ColossalaiOptimizer): """Optimizer used for ZeRO-1 and ZeRO-2. 
""" @@ -100,17 +129,6 @@ class LowLevelZeroOptimizer(ColossalaiOptimizer): self._reduce_bucket_size = reduce_bucket_size self._communication_dtype = communication_dtype - # gradient scaler - self.grad_scaler = DynamicGradScaler(initial_scale=initial_scale, - min_scale=min_scale, - growth_factor=growth_factor, - backoff_factor=backoff_factor, - growth_interval=growth_interval, - hysteresis=hysteresis, - max_scale=max_scale, - verbose=verbose) - self._found_overflow = torch.FloatTensor([0]).to(get_current_device()) - # gradient clipping self._clip_grad_norm = clip_grad_norm @@ -200,14 +218,25 @@ class LowLevelZeroOptimizer(ColossalaiOptimizer): if self._overlap_communication or self._partition_grads: self._attach_reduction_hook() + # initialize mixed precision mixin + self.mixed_precision_mixin: Optional[MixedPrecisionMixin] = None + if self._dtype is torch.float16: + self.mixed_precision_mixin = LowLevelZeroFP16MixedPrecisionMixin(self.num_param_groups, + self._grad_store, + initial_scale=initial_scale, + min_scale=min_scale, + growth_factor=growth_factor, + backoff_factor=backoff_factor, + growth_interval=growth_interval, + hysteresis=hysteresis, + max_scale=max_scale) + elif self._dtype is torch.bfloat16: + self.mixed_precision_mixin = BF16MixedPrecisionMixin() + @property def dtype(self): return self._dtype - @property - def loss_scale(self): - return self.grad_scaler.scale - @property def num_param_groups(self): return len(self._working_param_groups) @@ -392,7 +421,8 @@ class LowLevelZeroOptimizer(ColossalaiOptimizer): ################################ def backward(self, loss, retain_graph=False, sync_grad=True): - loss = self.loss_scale * loss + if self.mixed_precision_mixin is not None: + loss = self.mixed_precision_mixin.pre_backward(loss) loss.backward(retain_graph=retain_graph) # finish gradient reduction @@ -419,6 +449,8 @@ class LowLevelZeroOptimizer(ColossalaiOptimizer): :param set_to_none: Whether set the gradient to None. Default value is True. :type set_to_none: bool """ + if self.mixed_precision_mixin is not None: + self.mixed_precision_mixin.pre_zero_grad() for _, param_group in self._working_param_groups.items(): for param in param_group: if set_to_none: @@ -435,12 +467,7 @@ class LowLevelZeroOptimizer(ColossalaiOptimizer): def step(self, closure=None): assert closure is None, 'closure is not supported by step()' - # check for overflow - found_inf = self._check_overflow() - self.grad_scaler.update(found_inf) - - # update loss scale if overflow occurs - if found_inf: + if self.mixed_precision_mixin is not None and self.mixed_precision_mixin.should_skip_step(): self._grad_store.reset_all_average_gradients() if self._verbose: self._logger.info(f'Found overflow. 
Skip step') @@ -507,41 +534,20 @@ class LowLevelZeroOptimizer(ColossalaiOptimizer): # Mixed Precision Utilities # ############################# - def _check_overflow(self): - # clear previous overflow record - self._found_overflow.fill_(0.0) - - # check for overflow - for group_id in range(len(self._working_param_groups)): - for avg_grad in self._grad_store.get_averaged_gradients_by_group(group_id): - if avg_grad is not None and has_inf_or_nan(avg_grad): - self._found_overflow.fill_(1.0) - break - - # all-reduce across dp group - dist.all_reduce(self._found_overflow, op=dist.ReduceOp.MAX, group=self._dp_torch_group) - - # all-reduce over model parallel group - if self._mp_torch_group: - dist.all_reduce(self._found_overflow, op=dist.ReduceOp.MAX, group=self._mp_torch_group) - - if self._found_overflow.item() > 0: - return True - else: - return False - def _unscale_and_clip_grads(self, grad_groups_flat, total_norm): # compute combined scale factor for this group - combined_scale = self.loss_scale + div_scale = 1.0 + if self.mixed_precision_mixin is not None: + div_scale = self.mixed_precision_mixin.get_grad_div_scale() if self._clip_grad_norm > 0.: # norm is in fact norm*scale - clip = ((total_norm / self.loss_scale) + 1e-6) / self._clip_grad_norm + clip = ((total_norm / div_scale) + 1e-6) / self._clip_grad_norm if clip > 1: - combined_scale = clip * self.loss_scale + div_scale = clip * div_scale for grad in grad_groups_flat: - grad.data.mul_(1. / combined_scale) + grad.data.mul_(1. / div_scale) ############################ # Gradient Synchronization # diff --git a/tests/test_optimizer/test_adam_kernel.py b/tests/test_optimizer/test_adam_kernel.py new file mode 100644 index 000000000..2186a421f --- /dev/null +++ b/tests/test_optimizer/test_adam_kernel.py @@ -0,0 +1,131 @@ +# This test checks adam kernels +# Baseline is pure fp32 torch adam optimizer +import math +from abc import abstractmethod +from typing import Type + +import pytest +import torch +from torch import Tensor + +from colossalai.utils import get_current_device, multi_tensor_applier + +_FUSED_ALLOWED_P_G_TYPES = [(torch.float, torch.half), (torch.float, torch.float), (torch.half, torch.float), + (torch.half, torch.half), (torch.bfloat16, torch.float), (torch.float, torch.bfloat16), + (torch.bfloat16, torch.bfloat16)] + +_CPU_ALLOWED_P_G_TYPES = [(torch.float, torch.half), (torch.float, torch.float), (torch.half, torch.float), + (torch.half, torch.half)] + + +class AdamKernel: + + def __init__(self, lr: float, beta1: float, beta2: float, eps: float, weight_decay: float, use_adamw: bool) -> None: + self.lr = lr + self.beta1 = beta1 + self.beta2 = beta2 + self.eps = eps + self.weight_decay = weight_decay + self.use_adamw = use_adamw + + @abstractmethod + def update(self, step: int, param: Tensor, grad: Tensor, exp_avg: Tensor, exp_avg_sq: Tensor): + pass + + +class TorchAdamKernel(AdamKernel): + + def update(self, step: int, param: Tensor, grad: Tensor, exp_avg: Tensor, exp_avg_sq: Tensor): + bias_correction1 = 1 - self.beta1**step + bias_correction2 = 1 - self.beta2**step + + if self.weight_decay != 0: + if self.use_adamw: + # Perform stepweight decay + param.mul_(1 - self.lr * self.weight_decay) + else: + grad = grad.add(param, alpha=self.weight_decay) + + # Decay the first and second moment running average coefficient + exp_avg.mul_(self.beta1).add_(grad, alpha=1 - self.beta1) + exp_avg_sq.mul_(self.beta2).addcmul_(grad, grad, value=1 - self.beta2) + denom = (exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(self.eps) + + 
step_size = self.lr / bias_correction1 + + param.addcdiv_(exp_avg, denom, value=-step_size) + + +class FusedAdamKernel(AdamKernel): + + def __init__(self, lr: float, beta1: float, beta2: float, eps: float, weight_decay: float, use_adamw: bool) -> None: + super().__init__(lr, beta1, beta2, eps, weight_decay, use_adamw) + from colossalai.kernel.op_builder import FusedOptimBuilder + fused_optim = FusedOptimBuilder().load() + self.fused_adam = fused_optim.multi_tensor_adam + self.dummy_overflow_buf = torch.cuda.IntTensor([0]) + + def update(self, step: int, param: Tensor, grad: Tensor, exp_avg: Tensor, exp_avg_sq: Tensor): + multi_tensor_applier(self.fused_adam, self.dummy_overflow_buf, [[grad], [param], [exp_avg], [exp_avg_sq]], + self.lr, self.beta1, self.beta2, self.eps, step, self.use_adamw, True, self.weight_decay, + -1) + + +class CPUAdamKernel(AdamKernel): + + def __init__(self, lr: float, beta1: float, beta2: float, eps: float, weight_decay: float, use_adamw: bool) -> None: + super().__init__(lr, beta1, beta2, eps, weight_decay, use_adamw) + from colossalai.kernel.op_builder import CPUAdamBuilder + cpu_optim = CPUAdamBuilder().load() + + self.cpu_adam_op = cpu_optim.CPUAdamOptimizer(lr, beta1, beta2, eps, weight_decay, use_adamw) + + def update(self, step: int, param: Tensor, grad: Tensor, exp_avg: Tensor, exp_avg_sq: Tensor): + self.cpu_adam_op.step(step, self.lr, self.beta1, self.beta2, self.eps, self.weight_decay, True, param.view(-1), + grad.view(-1), exp_avg.view(-1), exp_avg_sq.view(-1), -1) + + +def check_adam_kernel(kernel: Type[AdamKernel], adamw: bool, weight_decay: float, p_dtype: torch.dtype, + g_dtype: torch.dtype, device: torch.device, n_steps: int, rtol: float, atol: float): + lr = 1e-3 + beta1, beta2 = 0.9, 0.999 + eps = 1e-8 + torch_adam = TorchAdamKernel(lr, beta1, beta2, eps, weight_decay, adamw) + adam_kernel = kernel(lr, beta1, beta2, eps, weight_decay, adamw) + master_p = torch.rand(64, device=device) + master_g = torch.rand_like(master_p) + master_exp_avg = torch.zeros_like(master_p) + master_exp_avg_sq = torch.zeros_like(master_p) + p = master_p.clone().to(p_dtype) + g = master_g.clone().to(g_dtype) + exp_avg = master_exp_avg.clone() + exp_avg_sq = master_exp_avg_sq.clone() + + for step in range(1, 1 + n_steps): + torch_adam.update(step, master_p, master_g, master_exp_avg, master_exp_avg_sq) + adam_kernel.update(step, p, g, exp_avg, exp_avg_sq) + # if overflow, the weight won't be updated. 
so there will be no nan in p + assert not torch.isnan(p).any() + assert torch.allclose(master_p, p.float(), rtol=rtol, atol=atol) + + +@pytest.mark.parametrize('adamw', [False, True]) +@pytest.mark.parametrize('weight_decay', [0.0, 0.1]) +@pytest.mark.parametrize('p_dtype, g_dtype', _FUSED_ALLOWED_P_G_TYPES) +def test_fused_adam_kernel(adamw, weight_decay, p_dtype, g_dtype): + rtol, atol = 1e-5, 1e-8 + if p_dtype is torch.float16 or g_dtype is torch.float16: + rtol, atol = 1e-3, 1e-3 + if p_dtype is torch.bfloat16 or g_dtype is torch.bfloat16: + rtol, atol = 4e-3, 4e-3 + check_adam_kernel(FusedAdamKernel, adamw, weight_decay, p_dtype, g_dtype, get_current_device(), 3, rtol, atol) + + +@pytest.mark.parametrize('adamw', [False, True]) +@pytest.mark.parametrize('weight_decay', [0.0, 0.1]) +@pytest.mark.parametrize('p_dtype, g_dtype', _CPU_ALLOWED_P_G_TYPES) +def test_cpu_adam_kernel(adamw, weight_decay, p_dtype, g_dtype): + rtol, atol = 1e-5, 1e-8 + if p_dtype is torch.float16 or g_dtype is torch.float16: + rtol, atol = 1e-3, 1e-3 + check_adam_kernel(CPUAdamKernel, adamw, weight_decay, p_dtype, g_dtype, torch.device('cpu'), 3, rtol, atol) diff --git a/tests/test_optimizer/test_adam_optim.py b/tests/test_optimizer/test_adam_optim.py new file mode 100644 index 000000000..0f72bc134 --- /dev/null +++ b/tests/test_optimizer/test_adam_optim.py @@ -0,0 +1,86 @@ +from copy import deepcopy +from typing import Type, Union + +import pytest +import torch +import torch.nn as nn +from torch.optim import Adam, AdamW + +from colossalai.nn.optimizer import CPUAdam, FusedAdam, HybridAdam +from tests.kit.model_zoo import model_zoo + +_ALLOWED_OPTIM_DEVICES = [ + (FusedAdam, torch.device('cuda:0')), + (CPUAdam, torch.device('cpu')), + (CPUAdam, torch.device('cuda:0')), + (HybridAdam, torch.device('cpu')), + (HybridAdam, torch.device('cuda:0')), +] + +_ALLOWED_P_G_TYPES = [ + (torch.float, torch.float), # pure fp32 + (torch.float, torch.half), # fp16 amp + (torch.float, torch.bfloat16), # bfloat16 amp + # (torch.half, torch.half), # FIXME(ver217): cpu adam kernel does not support pure fp16 + # (torch.bfloat16, torch.bfloat16), # FIXME(ver217): cpu adam kernel does not support pure bfloat16 +] + +N_STEPS = 3 + + +def setup_param_groups(bert_model: nn.Module) -> list: + no_decay = ["bias", "LayerNorm.weight"] + optimizer_grouped_parameters = [ + { + "params": [p for n, p in bert_model.named_parameters() if not any(nd in n for nd in no_decay)], + "weight_decay": 0.1, + }, + { + "params": [p for n, p in bert_model.named_parameters() if any(nd in n for nd in no_decay)], + "weight_decay": 0.0, + }, + ] + return optimizer_grouped_parameters + + +def set_grad(model: nn.Module, torch_model: nn.Module, g_dtype: torch.dtype) -> None: + for p, torch_p in zip(model.parameters(), torch_model.parameters()): + torch_p.grad = torch.rand_like(torch_p) + # avoid inconsistent grad and param dtype error + orig_p = p.data + p.data = torch_p.grad.clone().to(g_dtype) + p.grad = p.data + p.data = orig_p + + +@pytest.mark.parametrize('optim_cls, device', _ALLOWED_OPTIM_DEVICES) +@pytest.mark.parametrize('adamw', [False, True]) +@pytest.mark.parametrize('p_dtype, g_dtype', _ALLOWED_P_G_TYPES) +def test_adam_optim_on_bert(optim_cls: Union[Type[FusedAdam], Type[CPUAdam], Type[HybridAdam]], device: torch.device, + adamw: bool, p_dtype: torch.dtype, g_dtype: torch.dtype) -> None: + model_fn, *_ = next(iter(model_zoo.get_sub_registry('transformers_bert_for_sequence_classification').values())) + torch_model = model_fn().to(device) + model = 
deepcopy(torch_model).to(p_dtype) + lr = 1e-3 + beta1, beta2 = 0.9, 0.999 + eps = 1e-8 + torch_optim_cls = AdamW if adamw else Adam + torch_optim = torch_optim_cls(setup_param_groups(torch_model), lr=lr, betas=(beta1, beta2), eps=eps) + optim = optim_cls(setup_param_groups(model), lr=lr, betas=(beta1, beta2), eps=eps, adamw_mode=adamw) + + rtol, atol = 1e-5, 1e-5 + if p_dtype is torch.float16 or g_dtype is torch.float16: + rtol, atol = 2e-3, 2e-3 + if p_dtype is torch.bfloat16 or g_dtype is torch.bfloat16: + rtol, atol = 4e-3, 4e-3 + + for _ in range(N_STEPS): + set_grad(model, torch_model, g_dtype) + torch_optim.step() + optim.step() + torch_optim.zero_grad() + optim.zero_grad() + for p, torch_p in zip(model.parameters(), torch_model.parameters()): + # if overflow, the weight won't be updated. so there will be no nan in p + assert not torch.isnan(p).any() + assert torch.allclose(p.float(), torch_p, rtol=rtol, atol=atol) diff --git a/tests/test_optimizer/test_cpu_adam.py b/tests/test_optimizer/test_cpu_adam.py deleted file mode 100644 index 8b3ecf851..000000000 --- a/tests/test_optimizer/test_cpu_adam.py +++ /dev/null @@ -1,121 +0,0 @@ -import math - -import torch - -from colossalai.testing import clear_cache_before_run, parameterize - - -def torch_adam_update( - step, - lr, - beta1, - beta2, - eps, - weight_decay, - param, - grad, - exp_avg, - exp_avg_sq, - use_adamw, -): - bias_correction1 = 1 - beta1**step - bias_correction2 = 1 - beta2**step - - if weight_decay != 0: - if use_adamw: - # Perform stepweight decay - param.mul_(1 - lr * weight_decay) - else: - grad = grad.add(param, alpha=weight_decay) - - # Decay the first and second moment running average coefficient - exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1) - exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2) - denom = (exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(eps) - - step_size = lr / bias_correction1 - - param.addcdiv_(exp_avg, denom, value=-step_size) - - -def assertLess(data_diff, threshold, msg): - assert data_diff < threshold, msg - - -def assertTrue(condition, msg): - assert condition, msg - - -@clear_cache_before_run() -@parameterize('adamw', [True, False]) -@parameterize('step', [1, 2]) -@parameterize('p_dtype', [torch.float, torch.half]) -@parameterize('g_dtype', [torch.float, torch.half]) -def test_cpu_adam(adamw, step, p_dtype, g_dtype): - lr = 1e-3 - beta1, beta2 = 0.9, 0.999 - eps = 1e-8 - weight_decay = 0 - - for i in range(3): - p_data = torch.rand(64, dtype=p_dtype) - p_data_copy = p_data.clone().float() - p_grad = torch.rand(64, dtype=g_dtype) - p_grad_copy = p_grad.clone().float() - exp_avg = torch.rand(p_data.shape) - exp_avg_copy = exp_avg.clone() - exp_avg_sq = torch.rand(p_data.shape) - exp_avg_sq_copy = exp_avg_sq.clone() - - from colossalai.kernel.op_builder import CPUAdamBuilder - cpu_optim = CPUAdamBuilder().load() - - cpu_adam_op = cpu_optim.CPUAdamOptimizer(lr, beta1, beta2, eps, weight_decay, adamw) - - cpu_adam_op.step( - step, - lr, - beta1, - beta2, - eps, - weight_decay, - True, - p_data.view(-1), # fp32 data - p_grad.view(-1), # fp32 grad - exp_avg.view(-1), - exp_avg_sq.view(-1), - -1, - ) - - torch_adam_update( - step, - lr, - beta1, - beta2, - eps, - weight_decay, - p_data_copy, # fp32 data - p_grad_copy, # fp32 grad - exp_avg_copy, - exp_avg_sq_copy, - adamw, - ) - var = p_data_copy - p_data - data_diff = torch.max(torch.abs(var)) - threshold = 1e-3 - assertLess( - data_diff, - threshold, - f"p_data diff {data_diff}. 
failed check, step {step}, lr {lr}, eps " - f"{eps} beta1 {beta1} beta2 {beta2} weight_decay {weight_decay} p_dtype {p_dtype}, g_dtype {g_dtype}", - ) - max_grad_diff = torch.max(torch.abs(p_grad_copy - p_grad)) - assertTrue(max_grad_diff < threshold, f"diff {max_grad_diff}") - max_exp_avg_diff = torch.max(torch.abs(exp_avg_copy - exp_avg)) - assertTrue(max_exp_avg_diff < threshold, f"max_exp_avg_diff {max_exp_avg_diff}") - max_exp_avg_sq_diff = torch.max(torch.abs(exp_avg_sq_copy - exp_avg_sq)) - assertTrue(max_exp_avg_sq_diff < threshold, f"max_exp_avg_sq_diff {max_exp_avg_sq_diff}") - - -if __name__ == '__main__': - test_cpu_adam() diff --git a/tests/test_optimizer/test_fused_adam.py b/tests/test_optimizer/test_fused_adam.py deleted file mode 100644 index 114d5293d..000000000 --- a/tests/test_optimizer/test_fused_adam.py +++ /dev/null @@ -1,64 +0,0 @@ -import torch -import torch.nn as nn -from torch.optim import AdamW -from torch.optim.adam import Adam - -from colossalai.nn.optimizer.fused_adam import FusedAdam -from colossalai.testing import clear_cache_before_run, parameterize - - -class FC(nn.Module): - - def __init__(self) -> None: - super().__init__() - self.fc = nn.Sequential(nn.Linear(64, 64)) - - def forward(self, x): - return self.fc(x) - - -@clear_cache_before_run() -@parameterize('adamw', [False, True]) -@parameterize('p_dtype', [torch.float, torch.half]) -@parameterize('g_dtype', [torch.float, torch.half]) -def test_adam(adamw, p_dtype, g_dtype): - model = FC().cuda().to(p_dtype) - state = model.state_dict() - model_copy = FC().cuda().to(p_dtype) - model_copy.load_state_dict(state.copy()) - - if adamw: - optim = FusedAdam(model.parameters(), lr=1e-3, adamw_mode=True) - torch_optim = AdamW(model_copy.parameters(), lr=1e-3) - else: - optim = FusedAdam(model.parameters(), lr=1e-3) - torch_optim = Adam(model_copy.parameters(), lr=1e-3) - - data = torch.rand(1024, 64).cuda().to(p_dtype) - data_copy = data.clone() - label = torch.rand(1024, 64).cuda().to(p_dtype) - - for d, l in zip(data, label): - y = model(d) - loss = ((l - y)**2).sum() - optim.zero_grad() - loss.backward() - if p_dtype != g_dtype: - for i in range(len(optim.param_groups[0]['params'])): - optim.param_groups[0]['params'][i].grad.data = optim.param_groups[0]['params'][i].grad.data.to(g_dtype) - optim.step() - - for d, l in zip(data_copy, label): - y = model_copy(d) - loss = ((l - y)**2).sum() - torch_optim.zero_grad() - loss.backward() - torch_optim.step() - - assert len(optim.param_groups[0]['params']) == len(torch_optim.param_groups[0]['params']) - - for i in range(len(optim.param_groups[0]['params'])): - if torch.isnan(optim.param_groups[0]['params'][i]).any() \ - or torch.isnan(torch_optim.param_groups[0]['params'][i]).any(): - continue - assert torch.allclose(optim.param_groups[0]['params'][i], torch_optim.param_groups[0]['params'][i], 2e-3, 2e-3) diff --git a/tests/test_optimizer/test_fused_adam_kernel.py b/tests/test_optimizer/test_fused_adam_kernel.py deleted file mode 100644 index 4afa13349..000000000 --- a/tests/test_optimizer/test_fused_adam_kernel.py +++ /dev/null @@ -1,95 +0,0 @@ -import math - -import torch -import torch.nn as nn -from numpy import dtype - -from colossalai.testing import clear_cache_before_run, parameterize -from colossalai.utils import multi_tensor_applier - - -def torch_adam_update( - step, - lr, - beta1, - beta2, - eps, - weight_decay, - param, - grad, - exp_avg, - exp_avg_sq, - use_adamw, -): - bias_correction1 = 1 - beta1**step - bias_correction2 = 1 - beta2**step - - if 
weight_decay != 0: - if use_adamw: - # Perform stepweight decay - param.mul_(1 - lr * weight_decay) - else: - grad = grad.add(param, alpha=weight_decay) - - # Decay the first and second moment running average coefficient - exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1) - exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2) - denom = (exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(eps) - - step_size = lr / bias_correction1 - - param.addcdiv_(exp_avg, denom, value=-step_size) - - -@clear_cache_before_run() -@parameterize('adamw', [False, True]) -@parameterize('step', [1, 2]) -@parameterize('p_dtype', [torch.float, torch.half]) -@parameterize('g_dtype', [torch.float, torch.half]) -def test_adam(adamw, step, p_dtype, g_dtype): - from colossalai.kernel.op_builder import FusedOptimBuilder - fused_optim = FusedOptimBuilder().load() - fused_adam = fused_optim.multi_tensor_adam - - dummy_overflow_buf = torch.cuda.IntTensor([0]) - - count = 0 - - for i in range(3): - p = torch.rand(64, dtype=p_dtype).cuda() - p_copy = p.clone().float() - g = torch.rand(p.shape, dtype=g_dtype).cuda() - g_copy = g.clone().float() - m = torch.rand(p.shape).cuda() - m_copy = m.clone() - v = torch.rand(p.shape).cuda() - v_copy = v.clone() - - lr = 1e-3 - beta1, beta2 = 0.9, 0.999 - eps = 1e-8 - weight_decay = 0 - - multi_tensor_applier(fused_adam, dummy_overflow_buf, [[g], [p], [m], [v]], lr, beta1, beta2, eps, step, adamw, - True, weight_decay, -1) - - torch_adam_update( - step, - lr, - beta1, - beta2, - eps, - weight_decay, - p_copy, # fp32 data - g_copy, # fp32 grad - m_copy, - v_copy, - adamw, - ) - - if torch.isnan(p).any() or torch.isnan(p_copy).any(): - count += 1 - continue - assert count < 200, "too many nans" - assert torch.allclose(p.to(torch.float), p_copy.to(torch.float), 1e-5, - 1e-5), f"failed check, adamw {adamw}, p_dtype {p_dtype}, g_dtype {g_dtype}" diff --git a/tests/test_optimizer/test_hybrid_adam.py b/tests/test_optimizer/test_hybrid_adam.py deleted file mode 100644 index d075149df..000000000 --- a/tests/test_optimizer/test_hybrid_adam.py +++ /dev/null @@ -1,42 +0,0 @@ -import torch -import torch.nn as nn -from torch.optim import AdamW -from torch.optim.adam import Adam - -from colossalai.nn.optimizer.hybrid_adam import HybridAdam -from colossalai.testing import clear_cache_before_run, parameterize - -RE = 3 - - -@clear_cache_before_run() -@parameterize('adamw', [False, True]) -@parameterize('device', ['cpu', 'cuda:0']) -@parameterize('p_dtype', [torch.float]) -@parameterize('g_dtype', [torch.float, torch.half]) -def test_adam(adamw, device, p_dtype, g_dtype): - rng_state = torch.get_rng_state() - p = nn.Parameter(torch.rand(64).to(device, p_dtype)) - torch.set_rng_state(rng_state) - p_copy = nn.Parameter(torch.rand(64).to(device).float()) - - if adamw: - optim = HybridAdam([p], lr=1e-3, adamw_mode=True) - torch_optim = AdamW([p_copy], lr=1e-3) - else: - optim = HybridAdam([p], lr=1e-3) - torch_optim = Adam([p_copy], lr=1e-3) - - print(f"adaw mode {adamw}, device {device}, p_dtype {p_dtype}, g_dtype {g_dtype}") - for i in range(RE): - p.grad = torch.rand(64).to(device, p_dtype) - p_copy.grad = p.grad.clone().float() - p.grad.data = p.grad.data.to(g_dtype) - - optim.step() - torch_optim.step() - - if torch.isnan(p.data).any() or torch.isnan(p_copy.data).any(): - continue - assert torch.allclose(p.data, p_copy.data, 1e-4, 1e-2), \ - f"adaw mode {adamw}, device {device}, p_dtype {p_dtype}, g_dtype {g_dtype}" diff --git a/tests/test_zero/test_gemini/test_optim.py 
b/tests/test_zero/test_gemini/test_optim.py index 8ce20c16e..66611bcd2 100644 --- a/tests/test_zero/test_gemini/test_optim.py +++ b/tests/test_zero/test_gemini/test_optim.py @@ -21,23 +21,40 @@ TEST_MODELS = ['gpt2'] # these models are too small, all parameters in these models are compacted into one chunk EXAMPLE_MODELS = ['albert', 'beit', 'bert', 'hanging_param_model', 'nested_model', 'repeated_computed_layers'] +# bfloat16 cannot represent them exactly +BF16_IGNORED_KEYS = [ + 'albert.embeddings.word_embeddings.weight', + 'albert.embeddings.position_embeddings.weight', + 'masked_bias', +] -def check_param(model: ZeroDDP, torch_model: torch.nn.Module): - zero_dict = model.state_dict(only_rank_0=False) + +def check_param(model: ZeroDDP, torch_model: torch.nn.Module, dtype: torch.dtype): + zero_dict = model.state_dict(only_rank_0=False, dtype=dtype) torch_dict = torch_model.state_dict() for key, value in torch_dict.items(): # key is 'module.model.PARAMETER', so we truncate it key = key[7:] assert key in zero_dict, "{} not in ZeRO dictionary.".format(key) - temp_zero_value = zero_dict[key].to(device=value.device, dtype=value.dtype) + temp_zero_value = zero_dict[key].to(device=value.device) + if dtype is torch.bfloat16 and any(k in key for k in BF16_IGNORED_KEYS): + continue + rtol, atol = 1e-3, 4e-3 + if dtype is torch.bfloat16: + rtol, atol = 4e-3, 8e-3 # debug_print([0], "max range: ", key, torch.max(torch.abs(value - temp_zero_value))) - assert_close(value, temp_zero_value, rtol=1e-3, atol=4e-3) + assert_close(value.float(), + temp_zero_value.float(), + rtol=rtol, + atol=atol, + msg=lambda s: s + f'\n{key}\n{temp_zero_value.dtype}') @parameterize('placement_policy', ['cuda', 'cpu', 'auto', 'const']) @parameterize('model_name', TEST_MODELS) -def exam_model_step(placement_policy, model_name: str): +@parameterize('mixed_precision', [torch.half, torch.bfloat16]) +def exam_model_step(placement_policy, model_name: str, mixed_precision: torch.dtype): set_seed(42) get_components_func = non_distributed_component_funcs.get_callable(model_name) model_builder, train_dataloader, test_dataloader, optimizer_class, criterion = get_components_func() @@ -65,7 +82,7 @@ def exam_model_step(placement_policy, model_name: str): init_device = None chunk_manager = ChunkManager(config_dict, init_device=init_device) gemini_manager = GeminiManager(placement_policy, chunk_manager) - model = ZeroDDP(model, gemini_manager, pin_memory=True) + model = ZeroDDP(model, gemini_manager, pin_memory=True, mixed_precision=mixed_precision) optimizer = HybridAdam(model.parameters(), lr=1e-3) zero_optim = ZeroOptimizer(optimizer, model, initial_scale=128) @@ -74,6 +91,7 @@ def exam_model_step(placement_policy, model_name: str): torch_model.eval() set_seed(dist.get_rank() * 3 + 128) + rtol, atol = 1e-4, 1e-5 for i, (input_ids, label) in enumerate(train_dataloader): if i > 2: break @@ -83,17 +101,18 @@ def exam_model_step(placement_policy, model_name: str): torch_loss = run_fwd_bwd(torch_model, input_ids, label, criterion, torch_optim) loss = run_fwd_bwd(model, input_ids, label, criterion, zero_optim) - assert_close(torch_loss, loss) + assert_close(torch_loss, loss, rtol=rtol, atol=atol) zero_optim.step() torch_optim.step() - check_param(model, torch_model) + check_param(model, torch_model, mixed_precision) @parameterize('placement_policy', ['cuda', 'cpu', 'auto', 'const']) @parameterize('model_name', EXAMPLE_MODELS) -def exam_tiny_example(placement_policy, model_name: str): +@parameterize('mixed_precision', [torch.half, 
torch.bfloat16]) +def exam_tiny_example(placement_policy, model_name: str, mixed_precision: torch.dtype): set_seed(2008) get_components_func = non_distributed_component_funcs.get_callable(model_name) model_builder, train_dataloader, test_dataloader, optimizer_class, criterion = get_components_func() @@ -113,7 +132,7 @@ def exam_tiny_example(placement_policy, model_name: str): chunk_manager = init_chunk_manager(model=model, init_device=get_current_device(), search_range_mb=1) gemini_manager = GeminiManager(placement_policy, chunk_manager) - model = ZeroDDP(model, gemini_manager, pin_memory=True) + model = ZeroDDP(model, gemini_manager, pin_memory=True, mixed_precision=mixed_precision) optimizer = HybridAdam(model.parameters(), lr=1e-3) zero_optim = ZeroOptimizer(optimizer, model, initial_scale=2) @@ -121,6 +140,9 @@ def exam_tiny_example(placement_policy, model_name: str): torch_model.eval() set_seed(dist.get_rank() * 3 + 128) + rtol, atol = 1.5e-6, 2e-5 + if mixed_precision is torch.bfloat16: + rtol, atol = 2e-3, 2e-3 for i, (input_ids, label) in enumerate(train_dataloader): if i > 2: break @@ -133,12 +155,12 @@ def exam_tiny_example(placement_policy, model_name: str): torch_loss = run_fwd_bwd(torch_model, input_ids, label, criterion, torch_optim) loss = run_fwd_bwd(model, input_ids, label, criterion, zero_optim) - assert_close(torch_loss, loss, rtol=1.5e-6, atol=2e-5) # atol should be 2e-5 for torch lower than 1.12 + assert_close(torch_loss, loss, rtol=rtol, atol=atol) # atol should be 2e-5 for torch lower than 1.12 zero_optim.step() torch_optim.step() - check_param(model, torch_model) + check_param(model, torch_model, mixed_precision) def run_dist(rank, world_size, port): diff --git a/tests/test_zero/test_legacy/test_zero_engine.py b/tests/test_zero/test_legacy/test_zero_engine.py index dc8847ce5..826a543db 100644 --- a/tests/test_zero/test_legacy/test_zero_engine.py +++ b/tests/test_zero/test_legacy/test_zero_engine.py @@ -16,7 +16,11 @@ from colossalai.zero.low_level._utils import has_inf_or_nan from tests.components_to_test.registry import non_distributed_component_funcs -def run_dist(rank, world_size, port, parallel_config): +def run_dist(rank, world_size, port, parallel_config, bf16): + is_mp_config = parallel_config == MP_PARALLEL_CONFIG + is_zero_config = parallel_config == ZERO_PARALLEL_CONFIG + if bf16: + parallel_config['zero']['model_config']['bf16'] = True colossalai.launch(config=parallel_config, rank=rank, world_size=world_size, @@ -30,7 +34,8 @@ def run_dist(rank, world_size, port, parallel_config): model_builder, train_dataloader, _, optimizer_class, criterion = get_components_func() with ZeroInitContext(target_device=torch.cuda.current_device(), shard_strategy=gpc.config.zero.model_config.shard_strategy, - shard_param=True): + shard_param=True, + bf16=bf16): colo_model = model_builder(checkpoint=True) colo_optimizer = optimizer_class(colo_model.parameters(), lr=1e-3) @@ -38,7 +43,8 @@ def run_dist(rank, world_size, port, parallel_config): optimizer=colo_optimizer, criterion=criterion, train_dataloader=train_dataloader) - torch_model = model_builder(checkpoint=True).half() + dtype = torch.bfloat16 if bf16 else torch.float16 + torch_model = model_builder(checkpoint=True).to(dtype) col_model_deepcopy(engine.model, torch_model) torch_model = torch_model.cuda().float() @@ -80,9 +86,9 @@ def run_dist(rank, world_size, port, parallel_config): torch_optimizer.step() i += 1 - if parallel_config == MP_PARALLEL_CONFIG: + if is_mp_config: check_params(torch_model, colo_model, 
loose=True) - elif parallel_config == ZERO_PARALLEL_CONFIG: + elif is_zero_config: check_sharded_model_params(torch_model, colo_model, loose=True) @@ -97,9 +103,10 @@ def test_mp_engine(world_size): @pytest.mark.dist @pytest.mark.parametrize("world_size", [1, 2]) +@pytest.mark.parametrize("bf16", [True, False]) @rerun_if_address_is_in_use() -def test_zero_engine(world_size): - spawn(run_dist, world_size, parallel_config=ZERO_PARALLEL_CONFIG) +def test_zero_engine(world_size, bf16): + spawn(run_dist, world_size, parallel_config=ZERO_PARALLEL_CONFIG, bf16=bf16) if __name__ == '__main__': diff --git a/tests/test_zero/test_low_level/test_grad_acc.py b/tests/test_zero/test_low_level/test_grad_acc.py index 2ae1f3a99..c264a8077 100644 --- a/tests/test_zero/test_low_level/test_grad_acc.py +++ b/tests/test_zero/test_low_level/test_grad_acc.py @@ -82,7 +82,6 @@ def exam_zero_1_2_grad_acc(): def exam_zero_1_grad_acc(): local_rank = torch.distributed.get_rank() - grad_scale = 32 seed_all(2008) # create models @@ -101,7 +100,6 @@ def exam_zero_1_grad_acc(): # level 1 and 2 will produce exactly the same results zero_optimizer = LowLevelZeroOptimizer(zero_optimizer, overlap_communication=False, - initial_scale=grad_scale, reduce_bucket_size=262144, clip_grad_norm=1.0) @@ -128,9 +126,8 @@ def exam_zero_1_grad_acc(): if check_flag: # check grad for (n, p), z1p in zip(torch_model.named_parameters(), zero_model.parameters()): - unscale_grad = z1p.grad / grad_scale # print(n, p.shape, torch.max(torch.abs(p.grad - unscale_grad))) - assert torch.equal(p.grad, unscale_grad) + assert torch.equal(p.grad, z1p.grad) zero_optimizer._sync_grad() diff --git a/tests/test_zero/test_low_level/test_zero1_2.py b/tests/test_zero/test_low_level/test_zero1_2.py index 4086af9d8..8e2206fe6 100644 --- a/tests/test_zero/test_low_level/test_zero1_2.py +++ b/tests/test_zero/test_low_level/test_zero1_2.py @@ -7,7 +7,7 @@ from torch.nn.parallel import DistributedDataParallel as DDP from torch.testing import assert_close import colossalai -from colossalai.testing import rerun_if_address_is_in_use, spawn +from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn from colossalai.testing.random import seed_all from colossalai.zero import LowLevelZeroOptimizer @@ -25,15 +25,18 @@ class MlpModel(nn.Module): return x -def half_close(a, b, loose=False): +def loose_close(a, b, dtype: torch.dtype = torch.float32): rtol = None atol = None - if loose: + if dtype is torch.float16: rtol = 5e-2 atol = 5e-4 + elif dtype is torch.bfloat16: + rtol = 4e-3 + atol = 4e-3 - a = a.detach().half() - b = b.detach().half() + a = a.detach().to(dtype) + b = b.detach().to(dtype) assert_close(a, b, rtol=rtol, atol=atol) @@ -96,7 +99,8 @@ def exam_zero_1_2(): assert torch.equal(z1p.data, z2p.data) -def exam_zero_1_torch_ddp(): +@parameterize('dtype', [torch.float16, torch.bfloat16]) +def exam_zero_1_torch_ddp(dtype: torch.dtype): """ In this test, two pairs of model and optimizers are created. 1. 
zero: use sharded optimizer and fp16 parameters @@ -109,15 +113,10 @@ def exam_zero_1_torch_ddp(): seed_all(1453) # create models - zero_model = MlpModel() - torch_model = copy.deepcopy(zero_model) + torch_model = MlpModel().cuda() + zero_model = copy.deepcopy(torch_model).to(dtype) - zero_model = zero_model.cuda().half() - torch_model = DDP(torch_model.cuda(), bucket_cap_mb=0) - torch_model = torch_model.cuda() - - # for (n, p), z1p in zip(torch_model.named_parameters(), zero_model.parameters()): - # half_close(p.data, z1p.data) + torch_model = DDP(torch_model.cuda(), bucket_cap_mb=0).cuda() # create optimizer zero_optimizer = torch.optim.SGD(zero_model.parameters(), lr=1) @@ -137,11 +136,11 @@ def exam_zero_1_torch_ddp(): input_data = torch.rand(32, 128).cuda() # zero-dp forward - zero_output = zero_model(input_data.half()) + zero_output = zero_model(input_data.to(dtype)) # torch-ddp forward torch_output = torch_model(input_data) - half_close(zero_output, torch_output, loose=True) + loose_close(zero_output, torch_output, dtype=dtype) # zero-dp backward zero_optimizer.backward(zero_output.mean().float(), sync_grad=False) @@ -151,7 +150,7 @@ def exam_zero_1_torch_ddp(): # check grad for (n, p), z1p in zip(torch_model.named_parameters(), zero_model.parameters()): - half_close(p.grad, z1p.grad, loose=True) + loose_close(p.grad, z1p.grad, dtype=dtype) # zero-dp step zero_optimizer._sync_grad() @@ -163,7 +162,7 @@ def exam_zero_1_torch_ddp(): # check updated param for (n, p), z1p in zip(torch_model.named_parameters(), zero_model.parameters()): # print(n, torch.max(torch.abs(p.data - z1p.data))) - half_close(p.data, z1p.data, loose=True) + loose_close(p.data, z1p.data, dtype=dtype) def run_dist(rank, world_size, port): From 187874975325c4768b0850a818092de5bef1b071 Mon Sep 17 00:00:00 2001 From: digger yu Date: Mon, 5 Jun 2023 16:04:27 +0800 Subject: [PATCH 19/26] [nfc] fix typo colossalai/nn (#3887) * fix typo colossalai/autochunk auto_parallel amp * fix typo colossalai/auto_parallel nn utils etc. * fix typo colossalai/auto_parallel autochunk fx/passes etc. * fix typo docs/ * change placememt_policy to placement_policy in docs/ and examples/ * fix typo colossalai/ applications/ * fix typo colossalai/cli fx kernel * fix typo colossalai/nn * revert change warmuped --- colossalai/nn/layer/parallel_sequence/layers.py | 2 +- colossalai/nn/loss/loss_1d.py | 6 +++--- colossalai/nn/loss/loss_2d.py | 2 +- colossalai/nn/loss/loss_2p5d.py | 2 +- colossalai/nn/loss/loss_3d.py | 4 ++-- colossalai/nn/optimizer/cpu_adam.py | 2 +- colossalai/nn/optimizer/lamb.py | 2 +- colossalai/nn/optimizer/nvme_optimizer.py | 2 +- .../layers/cache_embedding/cached_embedding.py | 10 +++++----- .../nn/parallel/layers/cache_embedding/copyer.py | 2 +- .../parallel_cached_embedding_tablewise_split_cache.py | 2 +- 11 files changed, 18 insertions(+), 18 deletions(-) diff --git a/colossalai/nn/layer/parallel_sequence/layers.py b/colossalai/nn/layer/parallel_sequence/layers.py index d9486217b..0887f8389 100644 --- a/colossalai/nn/layer/parallel_sequence/layers.py +++ b/colossalai/nn/layer/parallel_sequence/layers.py @@ -195,7 +195,7 @@ class _Linear(nn.Module): keep_master_weight_for_test: This was added for testing and should be set to False. It returns the master weights used for initialization. - skip_bias_add: This was added to enable performance optimations where bias + skip_bias_add: This was added to enable performance optimizations where bias can be fused with other elementwise operations. 
we skip adding bias but instead return it. """ diff --git a/colossalai/nn/loss/loss_1d.py b/colossalai/nn/loss/loss_1d.py index 2fabd954f..dd548c1d3 100644 --- a/colossalai/nn/loss/loss_1d.py +++ b/colossalai/nn/loss/loss_1d.py @@ -21,7 +21,7 @@ class _VocabParallelCrossEntropy1D(torch.autograd.Function): # Subtract the maximum value. vocab_parallel_logits.sub_(logits_max.unsqueeze(dim=-1)) - # Get the partition's vocab indecies + # Get the partition's vocab indices partition_vocab_size = vocab_parallel_logits.size()[-1] rank = dist.get_rank(process_group) vocab_start_index = partition_vocab_size * rank @@ -61,10 +61,10 @@ class _VocabParallelCrossEntropy1D(torch.autograd.Function): @custom_bwd def backward(ctx, grad_output): - # Retreive tensors from the forward path. + # Retrieve tensors from the forward path. softmax, target_mask, masked_target_1d = ctx.saved_tensors - # All the inputs have softmax as thier gradient. + # All the inputs have softmax as their gradient. grad_input = softmax # For simplicity, work with the 2D gradient. partition_vocab_size = softmax.size()[-1] diff --git a/colossalai/nn/loss/loss_2d.py b/colossalai/nn/loss/loss_2d.py index cb12e723c..7da8b2d69 100644 --- a/colossalai/nn/loss/loss_2d.py +++ b/colossalai/nn/loss/loss_2d.py @@ -106,7 +106,7 @@ class _VocabParallelCrossEntropy2D(torch.autograd.Function): @staticmethod @custom_bwd def backward(ctx, output_grad): - # Retreive tensors from the forward path. + # Retrieve tensors from the forward path. softmax, target_mask, masked_target = ctx.saved_tensors # All the inputs have softmax as their gradient. diff --git a/colossalai/nn/loss/loss_2p5d.py b/colossalai/nn/loss/loss_2p5d.py index f8e3324fc..63dc4f33a 100644 --- a/colossalai/nn/loss/loss_2p5d.py +++ b/colossalai/nn/loss/loss_2p5d.py @@ -100,7 +100,7 @@ class _VocabParallelCrossEntropy2p5D(torch.autograd.Function): @staticmethod @custom_bwd def backward(ctx, output_grad): - # Retreive tensors from the forward path. + # Retrieve tensors from the forward path. softmax, target_mask, masked_target = ctx.saved_tensors # All the inputs have softmax as their gradient. diff --git a/colossalai/nn/loss/loss_3d.py b/colossalai/nn/loss/loss_3d.py index e76439191..f27d57ad6 100644 --- a/colossalai/nn/loss/loss_3d.py +++ b/colossalai/nn/loss/loss_3d.py @@ -99,10 +99,10 @@ class _VocabParallelCrossEntropy3D(torch.autograd.Function): @staticmethod @custom_bwd def backward(ctx, output_grad): - # Retreive tensors from the forward path. + # Retrieve tensors from the forward path. softmax, target_mask, masked_target = ctx.saved_tensors - # All the inputs have softmax as thier gradient. + # All the inputs have softmax as their gradient. input_grad = softmax # For simplicity, work with the 2D gradient. partition_vocab_size = softmax.size()[-1] diff --git a/colossalai/nn/optimizer/cpu_adam.py b/colossalai/nn/optimizer/cpu_adam.py index 7070c0a1e..1ec8783c5 100644 --- a/colossalai/nn/optimizer/cpu_adam.py +++ b/colossalai/nn/optimizer/cpu_adam.py @@ -21,7 +21,7 @@ class CPUAdam(NVMeOptimizer): `CPUAdam` requires CUDA extensions which can be built during installation or runtime. - This version of CPU Adam accelates parameters updating on CPU with SIMD. + This version of CPU Adam accelerates parameters updating on CPU with SIMD. Support of AVX2 or AVX512 is required. The GPU part is implemented in an naive way. 
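For reference, the update that `CPUAdam` (and the fused GPU optimizer tested earlier in this series) performs is the standard Adam/AdamW step. Below is a minimal PyTorch sketch mirroring the `torch_adam_update` helper used in the removed kernel tests; the function name and signature here are illustrative, not part of the library API:

```python
import math
import torch

def reference_adam_update(step: int, lr: float, beta1: float, beta2: float, eps: float,
                          weight_decay: float, param: torch.Tensor, grad: torch.Tensor,
                          exp_avg: torch.Tensor, exp_avg_sq: torch.Tensor, use_adamw: bool):
    # Bias corrections for the running moment estimates (step starts at 1)
    bias_correction1 = 1 - beta1 ** step
    bias_correction2 = 1 - beta2 ** step

    if weight_decay != 0:
        if use_adamw:
            # AdamW: decoupled weight decay applied directly to the parameter
            param.mul_(1 - lr * weight_decay)
        else:
            # Adam: L2 penalty folded into the gradient
            grad = grad.add(param, alpha=weight_decay)

    # Exponential moving averages of the gradient and its square
    exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
    exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)

    denom = (exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(eps)
    param.addcdiv_(exp_avg, denom, value=-lr / bias_correction1)
```

The SIMD (AVX2/AVX512) and CUDA paths accelerate this same element-wise rule; the kernel tests earlier in this series validate them against an fp32 reference of exactly this form.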
diff --git a/colossalai/nn/optimizer/lamb.py b/colossalai/nn/optimizer/lamb.py index 7ac210957..399ad39b6 100644 --- a/colossalai/nn/optimizer/lamb.py +++ b/colossalai/nn/optimizer/lamb.py @@ -59,7 +59,7 @@ class Lamb(Optimizer): continue grad = p.grad.data if grad.is_sparse: - raise RuntimeError('Lamb does not support sparse gradients, consider SparseAdam instad.') + raise RuntimeError('Lamb does not support sparse gradients, consider SparseAdam instead.') state = self.state[p] diff --git a/colossalai/nn/optimizer/nvme_optimizer.py b/colossalai/nn/optimizer/nvme_optimizer.py index 53e4a46c9..fb3a4d87b 100644 --- a/colossalai/nn/optimizer/nvme_optimizer.py +++ b/colossalai/nn/optimizer/nvme_optimizer.py @@ -43,7 +43,7 @@ class NVMeOptimizer(torch.optim.Optimizer): self.offloader = None self.is_on_nvme: Dict[Parameter, bool] = {} self.offloaded_numel: int = 0 - # As param may be not materialized here, these attributes are initalized when the first step + # As param may be not materialized here, these attributes are initialized when the first step self.total_numel: Optional[int] = None self.can_offload_numel: Optional[int] = None diff --git a/colossalai/nn/parallel/layers/cache_embedding/cached_embedding.py b/colossalai/nn/parallel/layers/cache_embedding/cached_embedding.py index a0c45d8e8..a74cb8d94 100644 --- a/colossalai/nn/parallel/layers/cache_embedding/cached_embedding.py +++ b/colossalai/nn/parallel/layers/cache_embedding/cached_embedding.py @@ -12,23 +12,23 @@ class CachedEmbeddingBag(BaseEmbeddingBag): Cached Embedding. Apply a GPU-based software cache approaches to dynamically manage the embedding table in the CPU and GPU memory space. It can leverage the id's frequency statistics of the target dataset, by passing a frequency list to param `ids_freq_mapping`. - You can also apply a navie LFU cache eviction strategy by setting `evict_strategy` as EvictionStrategy.LFU. + You can also apply a naive LFU cache eviction strategy by setting `evict_strategy` as EvictionStrategy.LFU. Args: num_embeddings (int): size of the dictionary of embeddings embedding_dim (int): the size of each embedding vector padding_idx (int, optional): If specified, the entries at padding_idx do not contribute to the gradient; therefore, the embedding vector at padding_idx is not updated during training, i.e. it remains as a fixed “pad”. For a newly constructed EmbeddingBag, the embedding vector at padding_idx will default to all zeros, but can be updated to another value to be used as the padding vector. Note that the embedding vector at padding_idx is excluded from the reduction. max_norm (float, optional): If given, each embedding vector with norm larger than max_norm is renormalized to have norm max_norm - norm_type (str, optional): The p of the p-norm to compute for the max_norm option. Defaults to 2.. + norm_type (str, optional): The p of the p-norm to compute for the max_norm option. Defaults to 2. scale_grad_by_freq (bool, optional): if given, this will scale gradients by the inverse of frequency of the words in the mini-batch. Default False. Note: this option is not supported when mode="max". Defaults to False. sparse (bool, optional): if True, gradient w.r.t. weight matrix will be a sparse tensor. See Notes for more details regarding sparse gradients. Note: this option is not supported when mode="max".. Defaults to False. - _weight (torch.Tensor, optional): an embedding weight tensor. Concate multiple tables in a embedding bag as a single one. Defaults to None. 
+ _weight (torch.Tensor, optional): an embedding weight tensor. Concatenate multiple tables in a embedding bag as a single one. Defaults to None. mode (str, optional): "sum", "mean" or "max". Specifies the way to reduce the bag. "sum" computes the weighted sum, taking per_sample_weights into consideration. "mean" computes the average of the values in the bag, "max" computes the max value over each bag. Default: "mean". Defaults to 'mean'. include_last_offset (bool, optional): if True, offsets has one additional element, where the last element is equivalent to the size of indices. This matches the CSR format.. Defaults to False. dtype (torch.dtype, optional): data type of the cpu weight initialization. Defaults to None meaning float32. device (torch.device, optional): device type to the cpu weight. Defaults to None meaning cpu. cache_ratio (float, float): cache ratio of the #cuda_weight_row / #cpu_weight_row - ids_freq_mapping (Union[List, torch.Tensor], optional): the frequency of each embedding vector occures in dataset. Defaults to None. + ids_freq_mapping (Union[List, torch.Tensor], optional): the frequency of each embedding vector occurs in dataset. Defaults to None. warmup_ratio (float, optional): the ratio of cuda cache is warmuped with. Defaults to 0.7. buffer_size (int, optional): the max number of vectors in transmitter buffer. If set to 0, the buffer is not used. Defaults to 0. pin_weight (bool, optional): pin the cpu weight. Defaults to False. @@ -145,7 +145,7 @@ class CachedEmbeddingBag(BaseEmbeddingBag): def swap_in_bandwidth(self): if self.cache_weight_mgr._cpu_to_cuda_numel > 0: return self.cache_weight_mgr._cpu_to_cuda_numel * self.cache_weight_mgr.elem_size_in_byte / 1e6 / \ - self.cache_weight_mgr._cpu_to_cuda_elpase + self.cache_weight_mgr._cpu_to_cuda_elapse else: return 0 diff --git a/colossalai/nn/parallel/layers/cache_embedding/copyer.py b/colossalai/nn/parallel/layers/cache_embedding/copyer.py index b586be1dc..aa1f79448 100644 --- a/colossalai/nn/parallel/layers/cache_embedding/copyer.py +++ b/colossalai/nn/parallel/layers/cache_embedding/copyer.py @@ -17,7 +17,7 @@ class LimitBuffIndexCopyer(object): def index_copy(self, dim: int, src_index: LongTensor, tgt_index: LongTensor, src: torch.Tensor, tgt: torch.Tensor): """copy src tensor[src_index] -(index_select)-> tmp -(index_copy_)-> tgt tensor [tgt_index] - The valid rows in the src tensor are continous, while rows in tgt tensor is scattered. + The valid rows in the src tensor are continuous, while rows in tgt tensor is scattered. Args: dim (int): dimension along which to index diff --git a/colossalai/nn/parallel/layers/cache_embedding/parallel_cached_embedding_tablewise_split_cache.py b/colossalai/nn/parallel/layers/cache_embedding/parallel_cached_embedding_tablewise_split_cache.py index cb4647028..80a54b4fa 100644 --- a/colossalai/nn/parallel/layers/cache_embedding/parallel_cached_embedding_tablewise_split_cache.py +++ b/colossalai/nn/parallel/layers/cache_embedding/parallel_cached_embedding_tablewise_split_cache.py @@ -114,7 +114,7 @@ class ParallelCachedEmbeddingBagTablewiseSpiltCache(abc.ABC, nn.Module): # get result of shape = (batch_size, (len(assigned_table_list)*embedding_dim)) local_output = torch.cat(local_output_list, 1) - # then concatenate those local_output on the second demension. + # then concatenate those local_output on the second dimension. 
# use all_to_all remains = batch_size % self.world_size scatter_strides = [batch_size // self.world_size + int(i < remains) for i in range(self.world_size)] From 57a6d7685cf05b0763eeb65eb62e7d8cce2f6955 Mon Sep 17 00:00:00 2001 From: Yuanchen <70520919+chengeharrison@users.noreply.github.com> Date: Mon, 5 Jun 2023 21:24:21 +0800 Subject: [PATCH 20/26] support evaluation for english (#3880) Co-authored-by: Yuanchen Xu --- applications/Chat/evaluate/README.md | 49 ++--- .../Chat/evaluate/config/config_en.json | 123 ++++++++++++ applications/Chat/evaluate/eval.py | 2 +- applications/Chat/evaluate/evaluator.py | 72 ++++--- applications/Chat/evaluate/gpt_evaluate.py | 27 ++- applications/Chat/evaluate/metrics.py | 124 +++++++++--- .../battle_prompt/battle_prompt_en.json | 6 + .../evaluation_prompt_en.json | 179 ++++++++++++++++++ applications/Chat/evaluate/requirements.txt | 2 + applications/Chat/evaluate/utils.py | 157 ++++++++++++++- 10 files changed, 643 insertions(+), 98 deletions(-) create mode 100644 applications/Chat/evaluate/config/config_en.json create mode 100644 applications/Chat/evaluate/prompt/battle_prompt/battle_prompt_en.json create mode 100644 applications/Chat/evaluate/prompt/evaluation_prompt/evaluation_prompt_en.json diff --git a/applications/Chat/evaluate/README.md b/applications/Chat/evaluate/README.md index ae3499bf2..e3510e352 100644 --- a/applications/Chat/evaluate/README.md +++ b/applications/Chat/evaluate/README.md @@ -1,7 +1,6 @@ # Evaluation -In this directory, we introduce how you can evaluate your model with our pipeline. This pipeline is available for model -evaluation of Chinese capability and the one for English capability is under preparation. +In this directory, we introduce how you can evaluate your model with our pipeline. This pipeline is now available for evaluation of both Chinese and English capability. ## Installation @@ -24,7 +23,7 @@ The whole evaluation pipeline consists of two methods: Our evaluation pipeline examines the model's capability using 10 categories of questions. The following table introduces each category: -| Evaluation Category |
Description
| +| Evaluation Category | Description | | :-----------------: | :----------------------------------------------------------- | | Brainstorming | Models are asked to generate a range of creative and diverse ideas according to the question. The capability of creativity is required. | | Chat | Models are asked to continue a multi-round dialogue given the roles involved. The capability of understanding, memorizing previous rounds of the dialogue and answering according to the persona provided is required. | @@ -40,17 +39,17 @@ Our evaluation pipeline examines the model's capability using 10 categories of q To better understand each evaluation category, here are some example questions provided. -| Evaluation Category |
Chinese Example
|
English Example
| +| Evaluation Category | Chinese Example | English Example | | :-----------------: | :----------------------------------------------------------- | :----------------------------------------------------------- | | Brainstorming | **Example 1:**
请介绍一下人工智能的多个领域。

**Example 2:**
请给出管理家庭财务的3个小技巧。
| **Example 1:**
How can I improve my memory? Any useful techniques you can suggest?

**Example 2:**
What are some ways to increase productivity while working from home? | -| Chat | **Example 1:**
基于以下角色信息完成一段对话。小张是一名新手爱好者,对养鸡有浓厚的兴趣。老李是一名有丰富经验的养鸡大师。
小张:您好,老李,我最近开始对养鸡感兴趣了,想请教您一些问题。
老李:你好,小张,我很乐意帮助你。你想问些什么?
小张:我想知道如何确定鸡的品种和性别?
老李:确切的品种可以通过鸡的外貌特征来确定,而性别一般是通过鸡卵的大小和形状来判断。还有什么问题吗?
小张:
**Example 2:**
基于以下角色信息完成一段对话。小明是一名医生,一位老年病患者想要停药,但他对病情有所忽视并有担忧;王叔叔是老年病患者的儿子,希望能够听取医生的建议。
小明:你好,王叔叔,我了解你想要让你父亲停药。
王叔叔:是的,我父亲已经吃了那么久的药,我担心药物对他的身体会有副作用。
小明: | **Example 1:**
Complete a conversation based on the following character information. Amy is a 30-year-old chef who runs her own restaurant. Jack is a food blogger who specializes in reviewing local restaurants.
Amy: Hi Jack, I heard that you're a food blogger. Nice to meet you.
Jack: Hi Amy, yes I am. Your restaurant has been receiving a lot of good reviews lately.
Amy: Yes, we use only fresh and quality ingredients, and every dish is carefully crafted.
Jack:
**Example 2:**
Complete a dialogue based on the following role information. A: Elementary student B: Teacher
B: Good morning, Student A. Today we're going to learn about addition and subtraction.
A: Teacher, I already know this very well. Why do I need to learn it again?
B: | +| Chat | **Example 1:**
基于以下角色信息完成一段对话。小张是一名新手爱好者,对养鸡有浓厚的兴趣。老李是一名有丰富经验的养鸡大师。
小张:您好,老李,我最近开始对养鸡感兴趣了,想请教您一些问题。
老李:你好,小张,我很乐意帮助你。你想问些什么?
小张:我想知道如何确定鸡的品种和性别?
老李:确切的品种可以通过鸡的外貌特征来确定,而性别一般是通过鸡卵的大小和形状来判断。还有什么问题吗?
小张:

**Example 2:**
基于以下角色信息完成一段对话。小明是一名医生,一位老年病患者想要停药,但他对病情有所忽视并有担忧;王叔叔是老年病患者的儿子,希望能够听取医生的建议。
小明:你好,王叔叔,我了解你想要让你父亲停药。
王叔叔:是的,我父亲已经吃了那么久的药,我担心药物对他的身体会有副作用。
小明: | **Example 1:**
Complete a conversation based on the following character information. Amy is a 30-year-old chef who runs her own restaurant. Jack is a food blogger who specializes in reviewing local restaurants.
Amy: Hi Jack, I heard that you're a food blogger. Nice to meet you.
Jack: Hi Amy, yes I am. Your restaurant has been receiving a lot of good reviews lately.
Amy: Yes, we use only fresh and quality ingredients, and every dish is carefully crafted.
Jack:

**Example 2:**
Complete a dialogue based on the following role information. A: Elementary student B: Teacher
B: Good morning, Student A. Today we're going to learn about addition and subtraction.
A: Teacher, I already know this very well. Why do I need to learn it again?
B: | | Classification | **Example 1:**
新闻标题:今日立夏,有一上联,立夏万物并秀,下联怎么对?
请根据以上新闻标题判断新闻所属的分类,你需要从文化,娱乐,体育,财经,房产,教育,科技,旅游,游戏,军事这十类中选择一个答案。

**Example 2:**
新闻标题:赵丽颖很久没有登上微博热搜了,但你们别急,她只是在憋大招而已。
请根据新闻标题判断新闻所属的分类,你需要从文化,娱乐,体育,财经,房产,教育,科技,旅游,游戏,军事这十类中选择一个答案。 | **Example 1:**
Title: Fighting for Love (2020)
Description: Jasmine got obsessed with a man and now he's obsessed with her. Steamy nights, kisses and rules being broken awaits them. She turned his whole world upside down and now he's doing it to hers. In this free fall, can they survive each others love?
Based on the above information, determine which genre the work of art belongs to. You can only choose one from "sport", "horror", "drama", "history", "romance", "biography", "science fiction", "comedy", "animation", "documentary", "music" and "news".

**Example 2:**
Title: Summer Breeze: The Isley Brothers Greatest Hits Live (2005)
Description: Filmed in the US in 2005 and captured in excellent form led by Ron Isley's vocals and Ernie Isley's hard edged guitar. Virtually every track is a hit including Shout, Who's That Lady, Twist And Shout, Summer Breeze and Harvest For The World.
Based on the above information, determine which genre the work of art belongs to. You can only choose one from "sport", "horror", "drama", "history", "romance", "biography", "science fiction", "comedy", "animation", "documentary", "music" and "news". |
请从以下选项中选择正确答案。以下哪个是世界上最高山峰?
A. 长城
B. 泰山
C. 珠穆朗玛峰
D. 黄山

**Example 2:**
请从以下选项中选择一个最佳答案回答下面的问题。问题:非洲最高的山是哪座山?
选项:
A. 麦金利山
B. 喜马拉雅山
C. 乞力马扎罗山 | **Example 1:**
Which of the following options is NOT a primary color?
(a) yellow
(b) blue
(c) orange
(d) red
**Example 2:**
Choose the correct option to complete the following sentence: \"Harry Potter and the Chamber of Secrets\" is the ________ book in the Harry Potter series.
(A) first
(B) second
(C) third
(D) fourth | +| Closed QA | **Example 1:**
请从以下选项中选择正确答案。以下哪个是世界上最高山峰?
A. 长城
B. 泰山
C. 珠穆朗玛峰
D. 黄山

**Example 2:**
请从以下选项中选择一个最佳答案回答下面的问题。问题:非洲最高的山是哪座山?
选项:
A. 麦金利山
B. 喜马拉雅山
C. 乞力马扎罗山 | **Example 1:**
Which of the following options is NOT a primary color?
(a) yellow
(b) blue
(c) orange
(d) red

**Example 2:**
Choose the correct option to complete the following sentence: \"Harry Potter and the Chamber of Secrets\" is the ________ book in the Harry Potter series.
(A) first
(B) second
(C) third
(D) fourth | | Extraction | **Example 1:**
根据以下新闻文本,提取新闻报道时间,例如回答时按照格式“新闻报道时间:2007年8月10日”
新闻文本如下:2007-4-7中新网4月7日电据中国消防在线消息,4月4日晚上7时30分左右,湖南长潭高速公路上发生一起6车连环相撞失火事故。长株潭三地消防部门共出动消防车21台,警力100余人。经过消防官兵近2个小时奋力扑救,大火被成功扑灭。据初步调查,有1人在此次事故中死亡。

**Example 2:**
根据以下新闻文本,提取新闻报道时间,例如回答时按照格式“新闻报道时间:2007年8月10日”
新闻文本如下:2014年1月15日,据外媒《俄罗斯报》报道称,位于北半球的澳大利亚现在正处于炎热的夏季,而近日也到了高温酷暑的时候,当地时间1月14日晚,澳大利亚南部一夜间发生至少250起火灾。受炎热天气及雷雨天气影响,澳大利亚南部一夜间发生至少250起火灾,灾情多集中在维多利亚州。火灾发生后,救援人员立即展开救灾行动。目前,大部分起火点火势已被控制。 | **Example 1:**
Ernest Hemingway, an American literary giant known for his spare and direct writing style, has penned timeless works such as 'The Old Man and the Sea', 'For Whom the Bell Tolls', and 'A Farewell to Arms', which have made a profound impact on the literary world and continue to be widely read and admired today.
Extract the name of the author mentioned above.

**Example 2:**
In the epic fantasy series 'A Song of Ice and Fire', George R.R. Martin weaves a complex web of political intrigue, war, and magic across the fictional continents of Westeros and Essos. Martin's richly developed characters and intricate plotlines have captivated readers worldwide, much like his other acclaimed works such as 'A Clash of Kings' and 'A Storm of Swords'.
Extract the name of the author in the above material. | | Generation | **Example 1:**
请撰写一篇文章,介绍如何通过改善生活习惯来预防疾病和延长寿命。

**Example 2:**
请根据以下情节撰写一篇短篇小说:一名年轻人被困在一个荒岛上,他必须想办法生存下去直到被救援。但他很快发现自己并不孤单。 | **Example 1:**
Write a descriptive paragraph about an island to relax and unwind, including details about the location and atmosphere.

**Example 2:**
Can you help me write a persuasive email to my colleagues encouraging them to participate in a charitable fundraising event? | | Open QA | **Example 1:**
请问万有引力定律由谁提出的?

**Example 2:**
哪些国家参与了第一次世界大战? | **Example 1:**
What are the four basic tastes of the human palate?

**Example 2:**
Who painted The Scream? |
请将以下句子改为正确的语序。
生日快乐你祝他了吗?

**Example 2:**
将以下文本翻译成英语:
“这个周末我要去海边玩” | **Example 1:**
Please translate the following sentences, which are a mixture of Chinese and English, into full English.
我需要买一些healthy snacks,比如nuts和dried fruits,作为我的office的午餐.

**Example 2:**
Please rewrite the sentence using an inverted sentence structure.
We won't begin our journey until the sun sets. | -| Roleplay | **Example 1:**
我想让你担任Android开发工程师面试官。我将成为候选人,您将向我询问Android开发工程师职位的面试问题。我希望你只作为面试官回答。不要一次写出所有的问题。我希望你只对我进行采访。问我问题,等待我的回答。不要写解释。像面试官一样一个一个问我,等我回答。我的第一句话是“面试官你好”。

**Example 2:**
我想让你扮演讲故事的角色。你会想出引人入胜、富有想象力和吸引观众的有趣故事。它可以是童话故事、教育故事或任何其他类型的有潜力的故事以吸引人们的注意力和想象力。根据目标受众,您可以为您的讲故事环节选择特定的主题或主题,例如,如果是儿童,那么您可以谈论动物;如果是成人,那么基于历史的故事可能会更好地吸引他们等。我的第一个请求是我需要一个关于毅力的有趣故事。 | **Example 1:**
Assume the role of a marriage counselor. Develop a series of communication exercises for a couple who are experiencing difficulties in their relationship. These exercises should promote active listening, empathy, and effective expression of emotions. Your first assignment is to provide a set of three exercises that focus on resolving conflicts and rebuilding trust.

**Example 2: **
I want you to act as a travel agent. I will tell you my desired destination, travel dates, and budget, and it will be your job to suggest the best travel itinerary for me. Your recommendations should include the best transportation options, hotel accommodations, and any popular tourist attractions nearby. My first request is "I want to plan a trip to Tokyo for a week, with a budget of $2000. I want to explore the culture and food of the city." | +| Roleplay | **Example 1:**
我想让你担任Android开发工程师面试官。我将成为候选人,您将向我询问Android开发工程师职位的面试问题。我希望你只作为面试官回答。不要一次写出所有的问题。我希望你只对我进行采访。问我问题,等待我的回答。不要写解释。像面试官一样一个一个问我,等我回答。我的第一句话是“面试官你好”。

**Example 2:**
我想让你扮演讲故事的角色。你会想出引人入胜、富有想象力和吸引观众的有趣故事。它可以是童话故事、教育故事或任何其他类型的有潜力的故事以吸引人们的注意力和想象力。根据目标受众,您可以为您的讲故事环节选择特定的主题或主题,例如,如果是儿童,那么您可以谈论动物;如果是成人,那么基于历史的故事可能会更好地吸引他们等。我的第一个请求是我需要一个关于毅力的有趣故事。 | **Example 1:**
Assume the role of a marriage counselor. Develop a series of communication exercises for a couple who are experiencing difficulties in their relationship. These exercises should promote active listening, empathy, and effective expression of emotions. Your first assignment is to provide a set of three exercises that focus on resolving conflicts and rebuilding trust.

**Example 2:**
I want you to act as a travel agent. I will tell you my desired destination, travel dates, and budget, and it will be your job to suggest the best travel itinerary for me. Your recommendations should include the best transportation options, hotel accommodations, and any popular tourist attractions nearby. My first request is "I want to plan a trip to Tokyo for a week, with a budget of $2000. I want to explore the culture and food of the city." | | Summarization | **Example 1:**
请简要总结概括以下段落材料。
当地时间29日,泰国卫生部通报,新增143名新冠肺炎确诊病例和1名死亡病例。截止到当地时间29日上午,泰国累计确诊病例1388例,其中泰国籍1172例,非泰国籍216例。死亡病例累计7例。(原题为《泰国新增143例新冠肺炎确诊病例累计确诊1388例》)

**Example 2:**
请简要总结概括以下段落材料。
近期,参与京雄高铁站站房建设的中铁十二局,因在施工过程中存在环境违法行为被雄安新区公开通报。通报发出后,引起社会广泛关注。近日,人民网记者从雄安新区相关部门及中铁十二局获悉,新区有关部门已经集中约谈了中铁十二局等24个参与雄安建设的项目单位。对于约谈内容和结果,中铁十二局有关宣传负责人回应:“具体内容不清楚,最好找雄安新区相关部门了解情况。”新区有关部门负责人表示,此前涉及的环境违法行为,中铁十二局已基本整改到位,但约谈内容和结果暂不公开,接下来,将按部就班推进环境治理工作。(原题为《雄安新区:中铁十二局涉环境违法已基本整改到位》) | **Example 1:**
The 21 year-old-woman was treated by paramedics after the kitchen fire in Botfield Road in Shifnal, Shropshire. West Mercia Police said it is treating Wednesday morning's incident as arson and are appealing for any witnesses to contact them.The 50-year-old man has been arrested on suspicion of arson with intent to endanger life. For more on this and other stories from Shropshire.
Please briefly summarize the above material within 20 words.

**Example 2:**
South Wales Police were called to a property in Heolgerrig, Merthyr Tydfil, at about 13:40 BST on Sunday. The child was airlifted to Prince Charles Hospital but died shortly afterwards. Police are investigating the circumstances surrounding the incident and have appealed for witnesses. The girl's family are being supported by specially trained officers.
Please briefly summarize the above material within 20 words. | @@ -58,24 +57,26 @@ To better understand each evaluation category, here are some example questions p #### GPT Evaluation -GPT evaluation uses GPT models to evaluate the prediction of different models and different pre-defined evaluation metrics are applied to different categories. The following table shows the 11 pre-defined evaluation metrics in Chinese: +GPT evaluation uses GPT models to evaluate the prediction of different models and different pre-defined evaluation metrics are applied to different categories. The following table shows the 11 pre-defined evaluation metrics both in Chinese and English: -| Evaluation Metric |
Prompt Words
|
CoT(Chain-of-Thought)
| +| Evaluation Metric | Prompt Words | CoT(Chain-of-Thought) | | :-------------------: | :----------------------------------------------------------- | :----------------------------------------------------------- | -| Language organization | 语言组织(1-5):答案语言是否流畅、连贯,使用正确的语法,具有一定逻辑性,使用恰当的连接词、过渡词等等。 | 1. 阅读答案,并检查是否有语法错误、用词不当或其他显著的错误。
2.检查答案是否具有逻辑性,能够按照合理的顺序传达信息并且能够自圆其说
3. 确定答案是否与问题或主题相关,并且能够传达清晰的信息。
4. 检查答案是否连贯,是否使用适当的转换和过渡来保持句子和段落之间的连贯性。
5. 检查答案是否具有明确的结构和组织方式,使得读者可以轻松理解信息的层次和结构。
6. 根据以上因素综合评估答案的语言组织,并给出一个1到5的分数,其中5表示语言组织非常好,而1表示语言组织非常差。 | -| Relevance | 切题(1-5):答案内容是否切题,不答非所问,并且严格遵照题目要求。 | 1. 阅读题目,确定题目所问的问题是什么,以及需要回答哪些方面的问题。
2. 阅读答案,确认答案是否直接回答了题目所问的问题。
3. 检查答案是否严格遵照了题目的要求,包括答题方式、答题长度、答题格式等等。
4. 根据以上因素综合评估答案的切题程度,并给出一个1到5的分数,其中5表示答案非常切题,而1表示答案完全没有切题。 | -| Creativity | 创意性(1-5):某些头脑风暴问题可能需要答案具有创意,提出新的思路。 | 1. 仔细阅读所提供的头脑风暴问题,确保你理解问题的要点和背景。
2. 根据你的知识和经验,判断所提供的答案是否可行。如果答案不可行,则创意性评分可能会受到影响。
3. 考虑答案中是否包含新颖的想法或独特的思路。答案可能与已知的解决方案有所重叠,但仍然可以被认为是有创意的,只要它提供了新的角度或方法来解决问题。
4. 根据答案的创意性,给出一个1到5的评分。如果答案缺乏创意,则应给出一个较低的评分。如果答案具有创意并提供了新的思路,应给出一个较高的评分。 | -| Practicality | 实用性(1-5):某些头脑风暴问题可能需要答案提出实用的建议或解决方法。 | 1. 仔细阅读所提供的头脑风暴问题,确保你理解问题的要点和背景。
2. 根据你的知识和经验,判断所提供的答案是否可行。如果答案不可行,则实用性评分可能会受到影响。
3. 考虑答案中提出的建议或解决方法是否实用并可行。答案可能看起来很好,但如果无法实现或应用,则实用性评分可能会受到影响。
4. 根据答案的实用性,给出一个1到5的评分。如果答案缺乏实用性,则应给出一个较低的评分。如果答案提出了实用的建议或解决方法,并且可以很好地解决问题,则应给出一个较高的评分。 | -| Correctness | 正确性(1-5):答案应该符合常识、生活实际等等 | 1. 仔细阅读所提供的头脑风暴问题,确保你理解问题的要点和背景。
2. 根据你的知识和经验,判断所提供的答案是否可行。如果答案不可行,则正确性评分可能会受到影响。
3. 考虑答案中所提供的信息是否正确、符合常识、生活实际等等。如果答案中存在明显的错误或不合理之处,则正确性评分可能会受到影响。
4. 根据答案的正确性,给出一个1到5的评分。如果答案存在明显的错误或不合理之处,则应给出一个较低的评分。如果答案正确、符合常识、生活实际等等,则应给出一个较高的评分。 | -| Naturalness | 自然(1-5):答案是否自然,并且符合问题给定的身份。 | 1. 阅读题目,确定题目提供的身份信息。
2. 检查答案内容是否符合题目给定的身份。
3. 根据以上因素,对该回答的自然性进行打分,分数从1到5,其中1表示不自然,5表示非常自然,并符合问题给定的身份。 | -| Engagingness | 参与感(1-5):答案是否对前面的对话内容做出了恰当的反应,是否理解对话的语境和背景。 | 1. 阅读题目,确定对话的语境和背景。
2. 检查答案是否充分理解对话的语境和背景,能否自然地融入到对话中而不显得突兀。
3. 根据以上因素,对该回答的参与感进行打分,分数从1到5,其中1表示没有参与感,5表示非常有参与感,并且恰当地理解了对话的语境和背景。 | -| Reasonableness | 合理性(1-5):答案是否能够与前面的对话内容形成逻辑上的衔接,是否符合常理,能否在这个上下文中合理存在。 | 1. 阅读题目,确定对话的主题以及问题期望的回答方向。
2. 判断答案是否能够与前面的对话内容形成逻辑上的衔接,是否符合常理,能否在这个上下文中合理存在。
3. 根据以上因素,对该回答的合理性进行打分,分数从1到5,其中1表示不合理,5表示非常合理,并且能够与前面的对话内容形成逻辑上的衔接,并符合常理。 | -| Diversity | 多样性(1-5):答案使用语言是否优美,具有有一定的创造性和想象力。然而,回答也应该保持合理和适度,不要过于夸张或离题。 | 1. 仔细阅读整个回答,确保完全理解回答所表达的内容和主题。
2. 在阅读回答的同时,注意语言的质量,例如措辞是否正确,语言是否生动等。
3. 检查回答的创造性和想象力,看看回答是否能够吸引人阅读下去。
4. 检查回答的合理性和适度,看看回答是否夸张或离题。5. 将多样性的评分打分在1到5之间,5分表示回答的质量很好,能够吸引人阅读,1分表示回答的内容生硬或者有离题的问题。 | -| Fidelity | 保真度(1-5):答案是否能够严格遵守角色的设定回答给定的请求。 | 1. 仔细阅读问题,了解角色在问题中的设定和表现,包括职业、背景、观点、性格等方面。
阅读题目的请求,确认回答请求时需要注意的细节。
3. 对比提供的回答与该角色的设定,评估回答是否能够严格遵守角色的设定。
4. 结合以上评估结果给出保真度的评分,范围从1到5分,其中1分表示回答与角色设定完全不符,5分表示回答完全符合角色设定且满足给定请求。 | -| Conciseness | 简明扼要(1-5):答案是否简明扼要,没有冗余内容。 | 1. 阅读题目,提取出材料的重点。
2. 阅读该总结,并注意其中的主要观点和信息。
3. 评估总结的长度。一个简明扼要的总结通常应该在几句话或几段文字内传达关键信息,而不是冗长的段落或文章。
4. 检查总结是否包含与主要观点无关的信息或冗余信息。
5. 确定总结涵盖了材料中的关键信息,并且没有忽略任何重要细节。
6. 给总结打出1-5的分数,其中5表示总结简明扼要,没有冗余内容,而1表示总结冗长或包含不必要的信息,难以理解或记忆。根据您的判断,打出适当的得分。 | +| 语言组织
(Language organization) | 语言组织(1-5):答案语言是否流畅、连贯,使用正确的语法,具有一定逻辑性,使用恰当的连接词、过渡词等等。

Language organization (1-5): whether the answer language is fluent and coherent, uses correct grammar, has a certain logic, uses appropriate connecting words, transition words, etc. | 1. 阅读答案,并检查是否有语法错误、用词不当或其他显著的错误。
2. 检查答案是否具有逻辑性,能够按照合理的顺序传达信息并且能够自圆其说
3. 确定答案是否与问题或主题相关,并且能够传达清晰的信息。
4. 检查答案是否连贯,是否使用适当的转换和过渡来保持句子和段落之间的连贯性。
5. 检查答案是否具有明确的结构和组织方式,使得读者可以轻松理解信息的层次和结构。
6. 根据以上因素综合评估答案的语言组织,并给出一个1到5的分数,其中5表示语言组织非常好,而1表示语言组织非常差。

1. Read the answers and check for grammatical errors, poor word choice, or other significant mistakes.
2. Check that the answer is logical, conveys the information in a logical order, and is self-explanatory.
3. Determine if the answer is relevant to the question or topic and conveys a clear message.
4. Check that the answer is coherent and that appropriate transitions and switches are used to maintain coherence between sentences and paragraphs.
5. Check that the answer is clearly structured and organized in such a way that the reader can easily understand the hierarchy and structure of the information.
6. Evaluate the linguistic organization of the answer based on a combination of the above factors and give a score of 1 to 5, where 5 indicates very good linguistic organization and 1 indicates very poor linguistic organization. | +| 切题
(Relevance) | 切题(1-5):答案内容是否切题,不答非所问,并且严格遵照题目要求。

Relevance (1-5): whether the content of the answer is relevant to the topic, does not answer the wrong question, and strictly follows the requirements of the topic. | 1. 阅读题目,确定题目所问的问题是什么,以及需要回答哪些方面的问题。
2. 阅读答案,确认答案是否直接回答了题目所问的问题。
3. 检查答案是否严格遵照了题目的要求,包括答题方式、答题长度、答题格式等等。
4. 根据以上因素综合评估答案的切题程度,并给出一个1到5的分数,其中5表示答案非常切题,而1表示答案完全没有切题。

1. Read the question to determine what the question asks and what aspects of the question need to be answered.
2. Read the answers to make sure that they directly answer the question asked.
3. Check that the answer follows the requirements of the question, including the way it is answered, the length of the answer, the format of the answer, etc.
4. Evaluate how relevant the answer is based on the above factors and give a score of 1 to 5, where 5 means the answer is very relevant and 1 means the answer is not relevant at all. | +| 创意性
(Creativity) | 创意性(1-5):某些头脑风暴问题可能需要答案具有创意,提出新的思路。

Creativity (1-5): Some brainstorming questions may require answers that are creative and suggest new ideas. | 1. 仔细阅读所提供的头脑风暴问题,确保你理解问题的要点和背景。
2. 根据你的知识和经验,判断所提供的答案是否可行。如果答案不可行,则创意性评分可能会受到影响。
3. 考虑答案中是否包含新颖的想法或独特的思路。答案可能与已知的解决方案有所重叠,但仍然可以被认为是有创意的,只要它提供了新的角度或方法来解决问题。
4. 根据答案的创意性,给出一个1到5的评分。如果答案缺乏创意,则应给出一个较低的评分。如果答案具有创意并提供了新的思路,应给出一个较高的评分。

1. Read the provided brainstorming questions carefully to make sure you understand the gist and context of the questions.
2. Based on your knowledge and experience, determine if the answers provided are feasible. If the answer is not feasible, the creativity score may be affected.
3. Consider whether the answer contains novel ideas or unique thoughts. An answer may overlap with a known solution and still be considered creative, as long as it offers a new perspective or approach to the problem.
4. Give a score of 1 to 5 depending on the creativity of the answer. If the answer lacks creativity, a lower score should be given. If the answer is creative and provides a new idea, a higher score should be given. | +| 实用性
(Practicality) | 实用性(1-5):某些头脑风暴问题可能需要答案提出实用的建议或解决方法。

Practicality (1-5): Some brainstorming questions may require answers to suggest practical suggestions or solutions. | 1. 仔细阅读所提供的头脑风暴问题,确保你理解问题的要点和背景。
2. 根据你的知识和经验,判断所提供的答案是否可行。如果答案不可行,则实用性评分可能会受到影响。
3. 考虑答案中提出的建议或解决方法是否实用并可行。答案可能看起来很好,但如果无法实现或应用,则实用性评分可能会受到影响。
4. 根据答案的实用性,给出一个1到5的评分。如果答案缺乏实用性,则应给出一个较低的评分。如果答案提出了实用的建议或解决方法,并且可以很好地解决问题,则应给出一个较高的评分。

1. Read the provided brainstorming questions carefully to make sure you understand the gist and context of the questions.
2. Based on your knowledge and experience, determine if the answers provided are feasible. If the answer is not feasible, the practicality score may be affected.
3. Consider whether the suggestions or solutions presented in the answer are practical and workable. The answer may look good, but if it cannot be implemented or applied, the practicality score may be affected.
4. Give a score of 1 to 5 depending on the practicality of the answer. If the answer lacks practicality, a lower score should be given. If the answer makes a practical suggestion or solution and solves the problem well, a higher score should be given. | +| 正确性
(Correctness) | 正确性(1-5):答案应该符合常识、生活实际等等。

Correctness (1-5): The answer should be in line with common sense, life experience, etc. | 1. 仔细阅读所提供的头脑风暴问题,确保你理解问题的要点和背景。
2. 根据你的知识和经验,判断所提供的答案是否可行。如果答案不可行,则正确性评分可能会受到影响。
3. 考虑答案中所提供的信息是否正确、符合常识、生活实际等等。如果答案中存在明显的错误或不合理之处,则正确性评分可能会受到影响。
4. 根据答案的正确性,给出一个1到5的评分。如果答案存在明显的错误或不合理之处,则应给出一个较低的评分。如果答案正确、符合常识、生活实际等等,则应给出一个较高的评分。

1. Read the provided brainstorming questions carefully to make sure you understand the gist and context of the questions.
2. Based on your knowledge and experience, determine if the answers provided are feasible. If the answer is not feasible, the correctness score may be affected.
3. Consider whether the information provided in the answer is correct, consistent with common sense, real life, etc. If there are obvious errors or implausibilities in the answer, the correctness score may be affected.
4. Give a score of 1 to 5 depending on the correctness of the answer. If the answer contains obvious errors or unreasonable points, a lower score should be given. A higher score should be given if the answer is correct, consistent with common sense, real life, etc. | +| 自然
(Naturalness) | 自然(1-5):答案是否自然,并且符合问题给定的身份。

Naturalness (1-5): whether the answer is natural and fits the identity given by the question. | 1. 阅读题目,确定题目提供的身份信息。
2. 检查答案内容是否符合题目给定的身份。
3. 根据以上因素,对该回答的自然性进行打分,分数从1到5,其中1表示不自然,5表示非常自然,并符合问题给定的身份。

1. Read the question and determine the identity information provided in the question.
2. Check whether the content of the answer matches the identity given in the question.
3. Based on the above factors, score the naturalness of the response on a scale from 1 to 5, where 1 means unnatural and 5 means very natural and in accordance with the identity given in the question. | +| 参与感
(Engagingness) | 参与感(1-5):答案是否对前面的对话内容做出了恰当的反应,是否理解对话的语境和背景。

Engagingness (1-5): whether the answer responds appropriately to the content of the preceding conversation and whether it understands the context and background of the conversation. | 1. 阅读题目,确定对话的语境和背景。
2. 检查答案是否充分理解对话的语境和背景,能否自然地融入到对话中而不显得突兀。
3. 根据以上因素,对该回答的参与感进行打分,分数从1到5,其中1表示没有参与感,5表示非常有参与感,并且恰当地理解了对话的语境和背景。

1. Read the questions to determine the context and background of the dialogue.
2. Check that the answer fully understands the context and background of the conversation and that it fits naturally into the conversation without seeming abrupt.
3. Based on the above factors, rate the response's engagement on a scale from 1 to 5, where 1 means not engaged and 5 means very engaged and appropriately understands the context and background of the conversation. | +| 合理性
(Reasonableness) | 合理性(1-5):答案是否能够与前面的对话内容形成逻辑上的衔接,是否符合常理,能否在这个上下文中合理存在。

Reasonableness (1-5): Whether the answer can form a logical connection with the content of the previous dialogue, whether it is consistent with common sense, and whether it can reasonably exist in this context. | 1. 阅读题目,确定对话的主题以及问题期望的回答方向。
2. 判断答案是否能够与前面的对话内容形成逻辑上的衔接,是否符合常理,能否在这个上下文中合理存在。
3. 根据以上因素,对该回答的合理性进行打分,分数从1到5,其中1表示不合理,5表示非常合理,并且能够与前面的对话内容形成逻辑上的衔接,并符合常理。

1. Read the question and determine the topic of the conversation and the direction the question expects the answer to go.
2. Determine whether the answer can be logically connected to the preceding conversation, whether it makes common sense, and whether it can reasonably exist in this context.
3. Based on the above factors, rate the reasonableness of the answer on a scale from 1 to 5, where 1 means unreasonable and 5 means very reasonable and able to form a logical connection with the preceding dialogue content and consistent with common sense. | +| 多样性
(Diversity) | 多样性(1-5):答案使用语言是否优美,具有有一定的创造性和想象力。然而,回答也应该保持合理和适度,不要过于夸张或离题。

Diversity (1-5): Whether the answers use beautiful language and have some creativity and imagination. However, answers should also be kept reasonable and moderate, not overly exaggerated or off-topic. | 1. 仔细阅读整个回答,确保完全理解回答所表达的内容和主题。
2. 在阅读回答的同时,注意语言的质量,例如措辞是否正确,语言是否生动等。
3. 检查回答的创造性和想象力,看看回答是否能够吸引人阅读下去。
4. 检查回答的合理性和适度,看看回答是否夸张或离题。
5. 将多样性的评分打分在1到5之间,5分表示回答的质量很好,能够吸引人阅读,1分表示回答的内容生硬或者有离题的问题。

1. Read the entire response carefully to ensure that you fully understand the content and theme expressed in the response.
2. While reading the response, pay attention to the quality of the language, such as whether the wording is correct and the language is vivid.
3. Check the creativity and imagination of the response to see if the response is engaging to read on.
4. Check the reasonableness and appropriateness of the responses to see if the responses are exaggerated or off-topic.
5. Rate the diversity on a scale of 1 to 5, with a 5 indicating a good quality response that is engaging to read and a 1 indicating a stilted response or one that goes off-topic.
(Fidelity) | 保真度(1-5):答案是否能够严格遵守角色的设定回答给定的请求。

Fidelity (1-5): whether the answer is able to answer the given request in strict compliance with the role setting. | 1. 仔细阅读问题,了解角色在问题中的设定和表现,包括职业、背景、观点、性格等方面。
2. 阅读题目的请求,确认回答请求时需要注意的细节。
3. 对比提供的回答与该角色的设定,评估回答是否能够严格遵守角色的设定。
4. 结合以上评估结果给出保真度的评分,范围从1到5分,其中1分表示回答与角色设定完全不符,5分表示回答完全符合角色设定且满足给定请求。

1. Read the question carefully to understand how the character is set up and represented in the question, including aspects such as occupation, background, point of view, and personality.
2. Read the question's request and confirm the details that need to be taken into account when answering the request.
3. Compare the provided answer with the setting of the role and assess whether the answer can strictly adhere to the setting of the role.
4. Combine the results of the above assessment to give a fidelity score ranging from 1 to 5, where a score of 1 means that the response does not match the persona at all, and a score of 5 means that the response fully complies with the persona and satisfies the given request. | +| 简明扼要
(Conciseness) | 简明扼要(1-5):答案是否简明扼要,没有冗余内容。

Conciseness (1-5): answers should be concise and without redundant content. | 1. 阅读题目,提取出材料的重点。
2. 阅读该总结,并注意其中的主要观点和信息。
3. 评估总结的长度。一个简明扼要的总结通常应该在几句话或几段文字内传达关键信息,而不是冗长的段落或文章。
4. 检查总结是否包含与主要观点无关的信息或冗余信息。
5. 确定总结涵盖了材料中的关键信息,并且没有忽略任何重要细节。
6. 给总结打出1-5的分数,其中5表示总结简明扼要,没有冗余内容,而1表示总结冗长或包含不必要的信息,难以理解或记忆。根据您的判断,打出适当的得分。

1. Read the title and extract the main points of the material.
2. Read the summary and note the main ideas and messages in it.
3. Assess the length of the summary. A concise summary should usually convey key information within a few sentences or paragraphs, rather than lengthy paragraphs or essays.
4. Check that the summary does not contain information that is not relevant to the main ideas or that is redundant.
5. Make sure that the summary covers the key information in the material and that no important details have been omitted.
6. Rate the summary on a scale of 1-5, where 5 means the summary is concise and free of redundancy, and 1 means the summary is lengthy or contains unnecessary information that is difficult to understand or remember. Based on your judgment, assign the appropriate score. | GPT models evaluate the quality of model predictions based on the given prompt words and gives a score between 1-5. +> **NOTE:** Even for the same metric, the details of its prompt words and CoT(Chain-of-Thought) can differ based on which category you want to evaluate. For example, prompt words for metric `correctness` showed here is "The answer should be in line with common sense, life experience, etc."(this is for category `brainstorming`), but for category `extraction`, prompt words can be "Answers should extract the required information accurately and should not contain any incorrect or misleading information." You can find all the prompt words and CoT(Chain-of-Thought) in `prompt/evaluation_prompt`. + #### Automatic Evaluation Automated metrics evaluate the capability of a model by comparing model predictions with reference answers. @@ -86,7 +87,7 @@ There are two ways to obtain reference answers: There are 5 types of automatic evaluation metrics listed in the table below: -| Automatic Evaluation Metric |
Description
| +| Automatic Evaluation Metric | Description | | :---------------------------------: | :----------------------------------------------------------- | | BLEU-n | Measure the accuracy between prediction and reference.
BLEU-1 (Unigram) evaluates accuracy in word level.
BLEU-n (n-gram) evaluate the fluency in sentence level. | | ROUGE | ROUGE-N measures the number of matching n-grams between prediction and reference.
ROUGE-L measures the number of matching longest common subsequence (LCS) between prediction and reference. | @@ -175,7 +176,7 @@ Example: #### Battle Prompt -The following is the Chinese battle prompt. In the battle prompt, the question and answers from two different models are fed into the prompt template. You can find an example battle prompt file in `prompt/battle_prompt`. +The following is the Chinese battle prompt. In the battle prompt, the question and answers from two different models are fed into the prompt template. You can find example battle prompt files for Chinese and English in `prompt/battle_prompt`. ```json { @@ -188,7 +189,7 @@ The following is the Chinese battle prompt. In the battle prompt, the question a #### Evaluation Prompt -The following is an example of a Chinese GPT evaluation prompt. In an evaluation prompt, you should define your metrics in `metrics` and provide CoT(Chain-of-Thought) in `CoT`. You can find an example evaluation prompt file in `prompt/evaluation_prompt`. +The following is an example of a Chinese GPT evaluation prompt. In an evaluation prompt, you should define your metrics in `metrics` and provide CoT(Chain-of-Thought) in `CoT`. You can find example evaluation prompt files for Chinese and English in `prompt/evaluation_prompt`. ```json { @@ -303,7 +304,7 @@ For example, if you want to add a new metric `persuasiveness` into category `bra ## To Do -- [ ] Add evaluation for English capability +- [x] Add evaluation for English capability - [ ] Support UniEval - [x] Support GPT-4 evaluation diff --git a/applications/Chat/evaluate/config/config_en.json b/applications/Chat/evaluate/config/config_en.json new file mode 100644 index 000000000..5b6272b97 --- /dev/null +++ b/applications/Chat/evaluate/config/config_en.json @@ -0,0 +1,123 @@ +{ + "language": "en", + "category": { + "brainstorming": { + "GPT": [ + "language organization", + "relevance", + "creativity", + "practicality", + "correctness" + ], + "Metrics": [ + "Distinct" + ] + }, + "chat": { + "GPT": [ + "language organization", + "relevance", + "naturalness", + "engagingness", + "reasonableness" + ], + "Metrics": [ + "Distinct" + ] + }, + "classification": { + "GPT": [ + "language organization", + "relevance", + "correctness" + ], + "Metrics": [ + "Precision", + "Recall", + "F1 score" + ] + }, + "closed_qa": { + "GPT": [ + "language organization", + "relevance", + "correctness" + ], + "Metrics": [ + "BLEU", + "ROUGE", + "BERTScore" + ] + }, + "extraction": { + "GPT": [ + "language organization", + "relevance", + "correctness" + ], + "Metrics": [ + "Precision", + "Recall", + "F1 score" + ] + }, + "generation": { + "GPT": [ + "language organization", + "relevance", + "diversity" + ], + "Metrics": [ + "BLEU", + "ROUGE", + "BERTScore" + ] + }, + "open_qa": { + "GPT": [ + "language organization", + "relevance", + "correctness" + ], + "Metrics": [ + "Distinct" + ] + }, + "rewriting": { + "GPT": [ + "language organization", + "relevance", + "correctness" + ], + "Metrics": [ + "BLEU", + "ROUGE", + "BERTScore" + ] + }, + "roleplay": { + "GPT": [ + "language organization", + "relevance", + "fidelity", + "creativity" + ], + "Metrics": [ + "Distinct" + ] + }, + "summarization": { + "GPT": [ + "language organization", + "relevance", + "correctness", + "conciseness" + ], + "Metrics": [ + "BLEU", + "ROUGE", + "BERTScore" + ] + } + } +} diff --git a/applications/Chat/evaluate/eval.py b/applications/Chat/evaluate/eval.py index 4067b15db..8388d95f7 100644 --- a/applications/Chat/evaluate/eval.py +++ 
b/applications/Chat/evaluate/eval.py @@ -14,7 +14,7 @@ def main(args): # load config config = jload(args.config_file) - if config["language"] == "cn": + if config["language"] in ["cn", "en"]: # get metric settings for all categories metrics_per_category = {} for category in config["category"].keys(): diff --git a/applications/Chat/evaluate/evaluator.py b/applications/Chat/evaluate/evaluator.py index 433d775d2..0bf55ca80 100644 --- a/applications/Chat/evaluate/evaluator.py +++ b/applications/Chat/evaluate/evaluator.py @@ -4,7 +4,7 @@ from typing import Any, Dict, List import gpt_evaluate import metrics import pandas as pd -from utils import get_data_per_category, jdump +from utils import analyze_automatic_results, get_data_per_category, save_automatic_results class Evaluator(object): @@ -42,21 +42,21 @@ class Evaluator(object): """ - def switch(metric): + def switch(metric, language): if metric == "BLEU": - return metrics.bleu_score(preds=predicts_list, targets=targets_list) + return metrics.bleu_score(preds=predicts_list, targets=targets_list, language=language) elif metric == "ROUGE": - return metrics.rouge_cn_score(preds=predicts_list, targets=targets_list) + return metrics.rouge_score(preds=predicts_list, targets=targets_list, language=language) elif (metric == "Distinct"): - return metrics.distinct_score(preds=predicts_list) + return metrics.distinct_score(preds=predicts_list, language=language) elif (metric == "BERTScore"): - return metrics.bert_score(preds=predicts_list, targets=targets_list) + return metrics.bert_score(preds=predicts_list, targets=targets_list, language=language) elif (metric == "Precision"): - return metrics.precision(preds=predicts_list, targets=targets_list) + return metrics.precision(preds=predicts_list, targets=targets_list, language=language) elif (metric == "Recall"): - return metrics.recall(preds=predicts_list, targets=targets_list) + return metrics.recall(preds=predicts_list, targets=targets_list, language=language) elif (metric == "F1 score"): - return metrics.F1_score(preds=predicts_list, targets=targets_list) + return metrics.F1_score(preds=predicts_list, targets=targets_list, language=language) else: raise ValueError(f"Unexpected metric") @@ -78,7 +78,7 @@ class Evaluator(object): predicts_list = [answer["output"] for answer in answers_per_category[category]] for metric in category_metrics: - self.automatic_metric_stats[category].update(switch(metric=metric)) + self.automatic_metric_stats[category].update(switch(metric=metric, language=self.language)) # gpt evaluation for category in self.params: @@ -106,35 +106,29 @@ class Evaluator(object): save_path = os.path.join(path, "gpt_evaluate", "battle_results") gpt_evaluate.save_battle_results(self.battle_results, model_name_list[0], model_name_list[1], save_path) else: - # save evaluation results for automatic metrics - automatic_df = pd.DataFrame(self.automatic_metric_stats) + # Save evaluation results for automatic metrics + automatic_base_save_path = os.path.join(path, "automatic_results") + automatic_results_save_path = os.path.join(automatic_base_save_path, "evaluation_results") - automatic_results_save_path = os.path.join(path, "automatic_results") - if not os.path.exists(automatic_results_save_path): - os.makedirs(automatic_results_save_path) - automatic_df.to_csv(os.path.join(automatic_results_save_path, f"{model_name_list[0]}.csv"), index=True) - - # Save evaluation results for GPT-3.5 evaluation metrics. 
- all_evaluations = [] - base_save_path = os.path.join(path, "gpt_evaluate", "gpt_evaluate_results") - evaluation_results_save_path = os.path.join(base_save_path, "evaluation_results") - - for category, evaluations in self.gpt_evaluation_results.items(): - jdump( - evaluations, - os.path.join(evaluation_results_save_path, model_name_list[0], - f"{category}_evaluation_results.json")) - all_evaluations.extend(evaluations) - - jdump(all_evaluations, - os.path.join(evaluation_results_save_path, f"{model_name_list[0]}_evaluation_results.json")) - - # Start to calculate scores and save statistics. - evaluation_statistics_save_path = os.path.join(base_save_path, "evaluation_statistics") - gpt_evaluate.save_gpt_evaluation_statistics(model_name_list[0], all_evaluations, - evaluation_statistics_save_path) + save_automatic_results(model_name_list[0], self.automatic_metric_stats, automatic_results_save_path) # Save charts and csv. - evaluation_analyses_save_path = os.path.join(base_save_path, "evaluation_analyses") - gpt_evaluate.analyze_gpt_evaluation_statistics(evaluation_statistics_save_path, - evaluation_analyses_save_path) + automatic_analyses_save_path = os.path.join(automatic_base_save_path, "evaluation_analyses") + analyze_automatic_results(automatic_results_save_path, automatic_analyses_save_path) + + # Save evaluation results for GPT evaluation metrics. + gpt_base_save_path = os.path.join(path, "gpt_evaluate", "gpt_evaluate_results") + gpt_evaluation_results_save_path = os.path.join(gpt_base_save_path, "evaluation_results") + + all_evaluations = gpt_evaluate.save_gpt_evaluation_results(model_name_list[0], self.gpt_evaluation_results, + gpt_evaluation_results_save_path) + + # Start to calculate scores and save statistics. + gpt_evaluation_statistics_save_path = os.path.join(gpt_base_save_path, "evaluation_statistics") + gpt_evaluate.save_gpt_evaluation_statistics(model_name_list[0], all_evaluations, + gpt_evaluation_statistics_save_path) + + # Save charts and csv. + gpt_evaluation_analyses_save_path = os.path.join(gpt_base_save_path, "evaluation_analyses") + gpt_evaluate.analyze_gpt_evaluation_statistics(gpt_evaluation_statistics_save_path, + gpt_evaluation_analyses_save_path) diff --git a/applications/Chat/evaluate/gpt_evaluate.py b/applications/Chat/evaluate/gpt_evaluate.py index 61ce3456c..b433500df 100644 --- a/applications/Chat/evaluate/gpt_evaluate.py +++ b/applications/Chat/evaluate/gpt_evaluate.py @@ -461,6 +461,27 @@ def calculate_scores_form_response(response: str, evaluation: Dict[str, Any]) -> return 0 +def save_gpt_evaluation_results(model_name: str, gpt_evaluation_results: Dict[str, Any], + save_path: str) -> Dict[str, Any]: + """ + Save evaluation results for different categories for one model. + + Args: + model_name: name of the model for saving evaluation results. + gpt_evaluation_results: evaluations results for all of the model answers. + save_path: path to save GPT evaluation statistics. + """ + + all_evaluations = [] + for category, evaluations in gpt_evaluation_results.items(): + jdump(evaluations, os.path.join(save_path, model_name, f"{category}_evaluation_results.json")) + all_evaluations.extend(evaluations) + + jdump(all_evaluations, os.path.join(save_path, f"{model_name}_evaluation_results.json")) + + return all_evaluations + + def save_gpt_evaluation_statistics(model_name: str, evaluations: List[Dict], save_path: str) -> None: """ Generate statistics for one model. 
@@ -468,7 +489,7 @@ def save_gpt_evaluation_statistics(model_name: str, evaluations: List[Dict], sav Args: model_name: name of the model for saving statistics. evaluations: evaluations for all of the model answers. - save_path: path to save GPT-3.5 evaluation statistics. + save_path: path to save GPT evaluation statistics. """ if not os.path.exists(save_path): @@ -516,7 +537,7 @@ def save_gpt_evaluation_statistics(model_name: str, evaluations: List[Dict], sav def analyze_gpt_evaluation_statistics(statistics_path: str, save_path: str) -> None: """ - Analyze and visualize all GPT-3.5 evaluation statistics in the given directory. + Analyze and visualize all GPT evaluation statistics in the given directory. Args: statistics_path: path to all the models' statistics. @@ -594,3 +615,5 @@ def analyze_gpt_evaluation_statistics(statistics_path: str, save_path: str) -> N figure = fig.get_figure() figure.savefig(os.path.join(save_path, f"{category}.png"), dpi=400) + + plt.close() diff --git a/applications/Chat/evaluate/metrics.py b/applications/Chat/evaluate/metrics.py index 5e657234c..031f6fa83 100644 --- a/applications/Chat/evaluate/metrics.py +++ b/applications/Chat/evaluate/metrics.py @@ -1,13 +1,16 @@ import statistics +from typing import Dict, List import jieba from bert_score import score from nltk.translate.bleu_score import sentence_bleu from rouge_chinese import Rouge as Rouge_cn +from rouge_score import rouge_scorer as Rouge_en from sklearn.metrics import f1_score, precision_score, recall_score +from utils import preprocessing_text, remove_redundant_space -def bleu_score(preds: list, targets: list) -> dict: +def bleu_score(preds: List[str], targets: List[str], language: str) -> Dict[str, float]: """Calculate BLEU Score Metric The calculation includes BLEU-1 for unigram, BLEU-2 for bigram, @@ -21,8 +24,12 @@ def bleu_score(preds: list, targets: list) -> dict: (1. / 4., 1. / 4., 1. / 4., 1. / 4.)] for pred, target in zip(preds, targets): - pred_list = (' '.join(jieba.cut(pred))).split() - target_list = [(' '.join(jieba.cut(target))).split()] + if language == "cn": + pred_list = ' '.join(jieba.cut(preprocessing_text(pred))).split() + target_list = [(' '.join(jieba.cut(preprocessing_text(target)))).split()] + elif language == "en": + pred_list = preprocessing_text(pred).split() + target_list = [preprocessing_text(target).split()] bleu = sentence_bleu(target_list, pred_list, weights=weights) cumulative_bleu = [a + b for a, b in zip(cumulative_bleu, bleu)] @@ -33,7 +40,7 @@ def bleu_score(preds: list, targets: list) -> dict: return bleu_scores -def rouge_cn_score(preds: list, targets: list) -> dict: +def rouge_cn_score(preds: List[str], targets: List[str]) -> Dict[str, float]: """Calculate Chinese ROUGE Score Metric The calculation includes ROUGE-1 for unigram, ROUGE-2 for bigram @@ -41,13 +48,13 @@ def rouge_cn_score(preds: list, targets: list) -> dict: the preds and targets. ROUGE-L measures the number of matching longest common subsequence (LCS) between preds and targets. 
""" - rouge_scores = {"rouge1": {}, "rouge2": {}, "rougeL": {}} + rouge_scores = {"rouge1": 0, "rouge2": 0, "rougeL": 0} all_preds = [] all_targets = [] for pred, target in zip(preds, targets): - pred_list = ' '.join(jieba.cut(pred)) - target_list = ' '.join(jieba.cut(target)) + pred_list = remove_redundant_space(' '.join(jieba.cut(preprocessing_text(pred)))) + target_list = remove_redundant_space(' '.join(jieba.cut(preprocessing_text(target)))) all_preds.append(pred_list) all_targets.append(target_list) @@ -61,7 +68,42 @@ def rouge_cn_score(preds: list, targets: list) -> dict: return rouge_scores -def distinct_score(preds: list) -> dict: +def rouge_en_score(preds: List[str], targets: List[str]) -> Dict[str, float]: + """Calculate English ROUGE Score Metric + + The calculation includes ROUGE-1 for unigram, ROUGE-2 for bigram + and ROUGE-L. ROUGE-N evaluates the number of matching n-grams between + the preds and targets. ROUGE-L measures the number of matching + longest common subsequence (LCS) between preds and targets. + """ + rouge_scores = {"rouge1": 0, "rouge2": 0, "rougeL": 0} + all_preds = [] + all_targets = [] + + rouge_en = Rouge_en.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=False) + + for pred, target in zip(preds, targets): + score = rouge_en.score(preprocessing_text(pred), preprocessing_text(target)) + rouge_scores["rouge1"] += score['rouge1'].fmeasure + rouge_scores["rouge2"] += score['rouge2'].fmeasure + rouge_scores["rougeL"] += score['rougeL'].fmeasure + + rouge_scores["rouge1"] = rouge_scores["rouge1"] / len(preds) + rouge_scores["rouge2"] = rouge_scores["rouge2"] / len(preds) + rouge_scores["rougeL"] = rouge_scores["rougeL"] / len(preds) + + return rouge_scores + + +def rouge_score(preds: List[str], targets: List[str], language: str) -> Dict[str, float]: + """Calculate ROUGE Score Metric""" + if language == "cn": + return rouge_cn_score(preds, targets) + elif language == "en": + return rouge_en_score(preds, targets) + + +def distinct_score(preds: List[str], language: str) -> Dict[str, float]: """Calculate Distinct Score Metric This metric refers to https://arxiv.org/abs/1510.03055. @@ -72,19 +114,36 @@ def distinct_score(preds: list) -> dict: cumulative_distinct = [] for pred in preds: - pred_seg_list = list(' '.join(jieba.cut(pred))) - count_segs = len(pred_seg_list) - unique_segs = set(pred_seg_list) - count_unique_chars = len(unique_segs) + if language == "cn": + pred_seg_list = ' '.join(jieba.cut(pred)).split() + count_segs = len(pred_seg_list) + unique_segs = set(pred_seg_list) + count_unique_chars = len(unique_segs) - cumulative_distinct.append(count_unique_chars / count_segs) + cumulative_distinct.append(count_unique_chars / count_segs) + elif language == "en": + # calculate distinct 1-gram, 2-gram, 3-gram + unique_ngram = [set() for _ in range(0, 3)] + all_ngram_count = [0 for _ in range(0, 3)] + + split_pred = preprocessing_text(pred).split() + for n in range(0, 3): + for i in range(0, len(split_pred) - n): + ngram = ' '.join(split_pred[i:i + n + 1]) + unique_ngram[n].add(ngram) + all_ngram_count[n] += 1 + + # Sometimes the answer may contain only one word. For 2-gram and 3-gram, the gram count(denominator) may be zero. 
+ avg_distinct = [len(a) / (b + 1e-6) for a, b in zip(unique_ngram, all_ngram_count)] + + cumulative_distinct.append(statistics.mean(avg_distinct)) distinct_score["distinct"] = statistics.mean(cumulative_distinct) return distinct_score -def bert_score(preds: list, targets: list) -> dict: +def bert_score(preds: List[str], targets: List[str], language: str) -> Dict[str, float]: """Calculate BERTScore Metric The BERTScore evaluates the semantic similarity between @@ -95,23 +154,25 @@ def bert_score(preds: list, targets: list) -> dict: target_list = [] for pred, target in zip(preds, targets): - pred_list.append(' '.join(jieba.cut(pred))) - target_list.append(' '.join(jieba.cut(target))) + pred_list.append(pred) + target_list.append(target) - _, _, F = score(pred_list, target_list, lang="zh", verbose=True) + if language == "cn": + _, _, F = score(pred_list, target_list, lang="zh", verbose=True) + elif language == "en": + _, _, F = score(pred_list, target_list, lang="en", verbose=True) bert_score["bert_score"] = F.mean().item() return bert_score -def calculate_precision_recall_f1(preds: list, targets: list) -> dict: +def calculate_precision_recall_f1(preds: List[str], targets: List[str], language: str) -> Dict[str, float]: """Precision, Recall and F1-Score Calculation The calculation of precision, recall and f1-score is realized by counting the number f overlaps between the preds and target. The comparison length - limited by the shorter one of preds and targets. This design is mainly - considered for classification and extraction categories. + limited by the shorter one of preds and targets. """ precision_recall_f1 = {"precision": 0, "recall": 0, "f1_score": 0} precision_scores = [] @@ -119,8 +180,12 @@ def calculate_precision_recall_f1(preds: list, targets: list) -> dict: f1_scores = [] for pred, target in zip(preds, targets): - pred_list = [char for char in pred] - target_list = [char for char in target] + if language == "cn": + pred_list = [char for char in ' '.join(jieba.cut(preprocessing_text(pred))).split()] + target_list = [char for char in ' '.join(jieba.cut(preprocessing_text(target))).split()] + elif language == "en": + pred_list = [char for char in preprocessing_text(pred).split()] + target_list = [char for char in preprocessing_text(target).split()] target_labels = [1] * min(len(target_list), len(pred_list)) pred_labels = [int(pred_list[i] == target_list[i]) for i in range(0, min(len(target_list), len(pred_list)))] @@ -136,34 +201,31 @@ def calculate_precision_recall_f1(preds: list, targets: list) -> dict: return precision_recall_f1 -def precision(preds: list, targets: list) -> dict: +def precision(preds: List[str], targets: List[str], language: str) -> Dict[str, float]: """Calculate Precision Metric - (design for classification and extraction categories) Calculating precision by counting the number of overlaps between the preds and target. """ precision = {"precision": 0} - precision["precision"] = calculate_precision_recall_f1(preds, targets)["precision"] + precision["precision"] = calculate_precision_recall_f1(preds, targets, language)["precision"] return precision -def recall(preds: list, targets: list) -> dict: +def recall(preds: List[str], targets: List[str], language: str) -> Dict[str, float]: """Calculate Recall Metric - (design for classification and extraction categories) Calculating recall by counting the number of overlaps between the preds and target. 
""" recall = {"recall": 0} - recall["recall"] = calculate_precision_recall_f1(preds, targets)["recall"] + recall["recall"] = calculate_precision_recall_f1(preds, targets, language)["recall"] return recall -def F1_score(preds: list, targets: list) -> dict: +def F1_score(preds: List[str], targets: List[str], language: str) -> Dict[str, float]: """Calculate F1-score Metric - (design for classification and extraction categories) Calculating f1-score by counting the number of overlaps between the preds and target. """ f1 = {"f1_score": 0} - f1["f1_score"] = calculate_precision_recall_f1(preds, targets)["f1_score"] + f1["f1_score"] = calculate_precision_recall_f1(preds, targets, language)["f1_score"] return f1 diff --git a/applications/Chat/evaluate/prompt/battle_prompt/battle_prompt_en.json b/applications/Chat/evaluate/prompt/battle_prompt/battle_prompt_en.json new file mode 100644 index 000000000..2b35d1958 --- /dev/null +++ b/applications/Chat/evaluate/prompt/battle_prompt/battle_prompt_en.json @@ -0,0 +1,6 @@ +{ + "id": 1, + "system_prompt": "You are a helpful and precise assistant for checking the quality of the answer. You will be given two different answers to the same question", + "prompt_template": "[Question]\n{question}\n\n[The Start of AI Assistant 1's Answer]\n{answer_1}\n\n[The End of AI Assistant 1's Answer]\n\n[The Start of AI Assistant 2's Answer]\n{answer_2}\n\n[The End of AI Assistant 2's Answer]\n\n[Requirements]\n{prompt}\n\n", + "prompt": "We would like to request your feedback on the performance of two AI assistants in response to the user question displayed above.\nPlease rate the helpfulness, relevance, accuracy, level of details of their responses. Each assistant receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance.\nPlease first output a single line containing only two values indicating the scores for Assistant 1 and 2, respectively. The two scores are separated by a space. In the subsequent line, please provide a comprehensive explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment." +} diff --git a/applications/Chat/evaluate/prompt/evaluation_prompt/evaluation_prompt_en.json b/applications/Chat/evaluate/prompt/evaluation_prompt/evaluation_prompt_en.json new file mode 100644 index 000000000..0b2053746 --- /dev/null +++ b/applications/Chat/evaluate/prompt/evaluation_prompt/evaluation_prompt_en.json @@ -0,0 +1,179 @@ +{ + "brainstorming": { + "id": 1, + "category": "brainstorming", + "metrics": { + "language organization": "Language organization (1-5): whether the answer language is fluent and coherent, uses correct grammar, has a certain logic, uses appropriate connecting words, transition words, etc.", + "relevance": "Relevance (1-5): whether the content of the answer is relevant to the topic, does not answer the wrong question, and strictly follows the requirements of the topic.", + "creativity": "Creativity (1-5): Some brainstorming questions may require answers that are creative and suggest new ideas.", + "practicality": "Practicality (1-5): Some brainstorming questions may require answers to suggest practical suggestions or solutions.", + "correctness": "Correctness (1-5): The answer should be in line with common sense, life experience, etc." + }, + "CoT": { + "language organization": "1. Read the answers and check for grammatical errors, poor word choice, or other significant mistakes.\n2. 
Check that the answer is logical, conveys the information in a logical order, and is self-explanatory.\n3. Determine if the answer is relevant to the question or topic and conveys a clear message.\n4. Check that the answer is coherent and that appropriate transitions and switches are used to maintain coherence between sentences and paragraphs.\n5. Check that the answer is clearly structured and organized in such a way that the reader can easily understand the hierarchy and structure of the information.\n6. Evaluate the linguistic organization of the answer based on a combination of the above factors and give a score of 1 to 5, where 5 indicates very good linguistic organization and 1 indicates very poor linguistic organization.\n\nLanguage organization:", + "relevance": "1. Read the question to determine what the question asks and what aspects of the question need to be answered.\n2. Read the answers to make sure that they directly answer the question asked.\n3. Check that the answer follows the requirements of the question, including the way it is answered, the length of the answer, the format of the answer, etc.\n4. Evaluate how relevant the answer is based on the above factors and give a score of 1 to 5, where 5 means the answer is very relevant and 1 means the answer is not relevant at all.\n\nRelevance:", + "creativity": "1. Read the provided brainstorming questions carefully to make sure you understand the gist and context of the questions.\n2. Based on your knowledge and experience, determine if the answers provided are feasible. If the answer is not feasible, the creativity score may be affected.\n3. Consider whether the answer contains novel ideas or unique thoughts. An answer may overlap with a known solution and still be considered creative, as long as it offers a new perspective or approach to the problem.\n4. Give a score of 1 to 5 depending on the creativity of the answer. If the answer lacks creativity, a lower score should be given. If the answer is creative and provides a new idea, a higher score should be given.\n\nCreativity:", + "practicality": "1. Read the provided brainstorming questions carefully to make sure you understand the gist and context of the questions.\n2. Based on your knowledge and experience, determine if the answers provided are feasible. If the answer is not feasible, the practicality score may be affected.\n3. Consider whether the suggestions or solutions presented in the answer are practical and workable. The answer may look good, but if it cannot be implemented or applied, the practicality score may be affected.\n4. Give a score of 1 to 5 depending on the practicality of the answer. If the answer lacks practicality, a lower score should be given. If the answer makes a practical suggestion or solution and solves the problem well, a higher score should be given.\n\nPracticality:", + "correctness": "1. Read the provided brainstorming questions carefully to make sure you understand the gist and context of the questions.\n2. Based on your knowledge and experience, determine if the answers provided are feasible. If the answer is not feasible, the correctness score may be affected.\n3. Consider whether the information provided in the answer is correct, consistent with common sense, real life, etc. If there are obvious errors or implausibilities in the answer, the correctness score may be affected.\n4. Give a score of 1 to 5 depending on the correctness of the answer. 
If the answer contains obvious errors or unreasonable points, a lower score should be given. A higher score should be given if the answer is correct, consistent with common sense, real life, etc.\n\nCorrectness:" + }, + "prompt": "You are a good assistant. Please rate the given answer to the \"brainstorming\" question below.\n\nThe question is as follows:\n\n{question}\n\nThe answer is as follows:\n\n{answer}\n\nThe metric for evaluation is as follows:\n\n{metric}\n\nYou should follow the following evaluation steps:\n\n{steps}" + }, + "chat": { + "id": 2, + "category": "chat", + "metrics": { + "language organization": "Language organization (1-5): whether the answer language is fluent and coherent, uses correct grammar, has a certain logic, uses appropriate connecting words, transition words, etc.", + "relevance": "Relevance (1-5): whether the content of the answer is relevant to the topic, does not answer the wrong question, and strictly follows the requirements of the topic.", + "naturalness": "Naturalness (1-5): whether the answer is natural and fits the identity given by the question.", + "engagingness": "Engagingness (1-5): whether the answer responds appropriately to the content of the preceding conversation and whether it understands the context and background of the conversation.", + "reasonableness": "Reasonableness (1-5): Whether the answer can form a logical connection with the content of the previous dialogue, whether it is consistent with common sense, and whether it can reasonably exist in this context." + }, + "CoT": { + "language organization": "1. Read the answers and check for grammatical errors, poor word choice, or other significant mistakes.\n2. Check that the answer is logical, conveys the information in a logical order, and is self-explanatory.\n3. Determine if the answer is relevant to the question or topic and conveys a clear message.\n4. Check that the answer is coherent and that appropriate transitions and switches are used to maintain coherence between sentences and paragraphs.\n5. Check that the answer is clearly structured and organized in such a way that the reader can easily understand the hierarchy and structure of the information.\n6. Evaluate the linguistic organization of the answer based on a combination of the above factors and give a score of 1 to 5, where 5 indicates very good linguistic organization and 1 indicates very poor linguistic organization.\n\nLanguage organization:", + "relevance": "1. Read the question to determine what the question asks and what aspects of the question need to be answered.\n2. Read the answers to make sure that they directly answer the question asked.\n3. Check that the answer follows the requirements of the question, including the way it is answered, the length of the answer, the format of the answer, etc.\n4. Evaluate how relevant the answer is based on the above factors and give a score of 1 to 5, where 5 means the answer is very relevant and 1 means the answer is not relevant at all.\n\nRelevance:", + "naturalness": "1. Read the question and determine the identity information provided in the question.\n2. Check whether the content of the answer matches the identity given in the question.\n3. Based on the above factors, score the naturalness of the response on a scale from 1 to 5, where 1 means unnatural and 5 means very natural and in accordance with the identity given in the question.\n\nNaturalness:", + "engagingness": "1. Read the questions to determine the context and background of the dialogue.\n2. 
Check that the answer fully understands the context and background of the conversation and that it fits naturally into the conversation without seeming abrupt.\n3. Based on the above factors, rate the response's engagement on a scale from 1 to 5, where 1 means not engaged and 5 means very engaged and appropriately understands the context and background of the conversation.\n\nEngagingness:", + "reasonableness": "1. Read the question and determine the topic of the conversation and the direction the question expects the answer to go.\n2. Determine whether the answer can be logically connected to the preceding conversation, whether it makes common sense, and whether it can reasonably exist in this context.\n3. Based on the above factors, rate the reasonableness of the answer on a scale from 1 to 5, where 1 means unreasonable and 5 means very reasonable and able to form a logical connection with the preceding dialogue content and consistent with common sense.\n\nReasonableness:" + }, + "prompt": "You are a good assistant. Please rate the given answer to the \"chat\" question below.\n\nThe question is as follows:\n\n{question}\n\nThe answer is as follows:\n\n{answer}\n\nThe metric for evaluation is as follows:\n\n{metric}\n\nYou should follow the following evaluation steps:\n\n{steps}" + }, + "classification": { + "id": 3, + "category": "classification", + "metrics": { + "language organization": "Language organization (1-5): whether the answer language is fluent and coherent, uses correct grammar, has a certain logic, uses appropriate connecting words, transition words, etc.", + "relevance": "Relevance (1-5): whether the content of the answer is relevant to the topic, does not answer the wrong question, and strictly follows the requirements of the topic.", + "correctness": "Correctness (1-5): whether the answer is correct or not." + }, + "CoT": { + "language organization": "1. Read the answers and check for grammatical errors, poor word choice, or other significant mistakes.\n2. Check that the answer is logical, conveys the information in a logical order, and is self-explanatory.\n3. Determine if the answer is relevant to the question or topic and conveys a clear message.\n4. Check that the answer is coherent and that appropriate transitions and switches are used to maintain coherence between sentences and paragraphs.\n5. Check that the answer is clearly structured and organized in such a way that the reader can easily understand the hierarchy and structure of the information.\n6. Evaluate the linguistic organization of the answer based on a combination of the above factors and give a score of 1 to 5, where 5 indicates very good linguistic organization and 1 indicates very poor linguistic organization.\n\nLanguage organization:", + "relevance": "1. Read the question to determine what the question asks and what aspects of the question need to be answered.\n2. Read the answers to make sure that they directly answer the question asked.\n3. Check that the answer follows the requirements of the question, including the way it is answered, the length of the answer, the format of the answer, etc.\n4. Evaluate how relevant the answer is based on the above factors and give a score of 1 to 5, where 5 means the answer is very relevant and 1 means the answer is not relevant at all.\n\nRelevance:", + "correctness": "1. Read the question carefully and try to answer the question yourself.\n2. Check the correctness of the answer. You can use known facts or research to verify that the answer is correct. 
If the answer is correct, you can give a score of 5 for correctness. If the answer is partially correct, an appropriate score, such as 2, 3, or 4, may be given. If the answer is completely incorrect, only 1 point is awarded.\n\nCorrectness:" + }, + "prompt": "You are a good assistant. Please rate the given answer to the \"classification\" question below.\n\nThe question is as follows:\n\n{question}\n\nThe answer is as follows:\n\n{answer}\n\nThe metric for evaluation is as follows:\n\n{metric}\n\nYou should follow the following evaluation steps:\n\n{steps}" + }, + "closed_qa": { + "id": 4, + "category": "closed_qa", + "metrics": { + "language organization": "Language organization (1-5): whether the answer language is fluent and coherent, uses correct grammar, has a certain logic, uses appropriate connecting words, transition words, etc.", + "relevance": "Relevance (1-5): whether the content of the answer is relevant to the topic, does not answer the wrong question, and strictly follows the requirements of the topic.", + "correctness": "Correctness (1-5): whether the answer is correct or not." + }, + "CoT": { + "language organization": "1. Read the answers and check for grammatical errors, poor word choice, or other significant mistakes.\n2. Check that the answer is logical, conveys the information in a logical order, and is self-explanatory.\n3. Determine if the answer is relevant to the question or topic and conveys a clear message.\n4. Check that the answer is coherent and that appropriate transitions and switches are used to maintain coherence between sentences and paragraphs.\n5. Check that the answer is clearly structured and organized in such a way that the reader can easily understand the hierarchy and structure of the information.\n6. Evaluate the linguistic organization of the answer based on a combination of the above factors and give a score of 1 to 5, where 5 indicates very good linguistic organization and 1 indicates very poor linguistic organization.\n\nLanguage organization:", + "relevance": "1. Read the question to determine what the question asks and what aspects of the question need to be answered.\n2. Read the answers to make sure that they directly answer the question asked.\n3. Check that the answer follows the requirements of the question, including the way it is answered, the length of the answer, the format of the answer, etc.\n4. Evaluate how relevant the answer is based on the above factors and give a score of 1 to 5, where 5 means the answer is very relevant and 1 means the answer is not relevant at all.\n\nRelevance:", + "correctness": "1. Read the question carefully and try to answer the question by yourself.\n2. Check the correctness of the answer. You can use known facts or research to verify that the answer is correct. If the answer is correct, you can give a score of 5 for correctness. If the answer is partially correct, an appropriate score, such as 2, 3, or 4, may be assigned. If the answer is completely incorrect, only 1 point is awarded.\n\nCorrectness:" + }, + "prompt": "You are a good assistant. 
Please rate the given answer to the \"closed qa\" question below.\n\nThe question is as follows:\n\n{question}\n\nThe answer is as follows:\n\n{answer}\n\nThe metric for evaluation is as follows:\n\n{metric}\n\nYou should follow the following evaluation steps:\n\n{steps}" + }, + "extraction": { + "id": 5, + "category": "extraction", + "metrics": { + "language organization": "Language organization (1-5): whether the answer language is fluent and coherent, uses correct grammar, has a certain logic, uses appropriate connecting words, transition words, etc.", + "relevance": "Relevance (1-5): whether the content of the answer is relevant to the topic, does not answer the wrong question, and strictly follows the requirements of the topic.", + "correctness": "correctness (1-5): Answers should extract the required information accurately and should not contain any incorrect or misleading information." + }, + "CoT": { + "language organization": "1. Read the answers and check for grammatical errors, poor word choice, or other significant mistakes.\n2. Check that the answer is logical, conveys the information in a logical order, and is self-explanatory.\n3. Determine if the answer is relevant to the question or topic and conveys a clear message.\n4. Check that the answer is coherent and that appropriate transitions and switches are used to maintain coherence between sentences and paragraphs.\n5. Check that the answer is clearly structured and organized in such a way that the reader can easily understand the hierarchy and structure of the information.\n6. Evaluate the linguistic organization of the answer based on a combination of the above factors and give a score of 1 to 5, where 5 indicates very good linguistic organization and 1 indicates very poor linguistic organization.\n\nLanguage organization:", + "relevance": "1. Read the question to determine what the question asks and what aspects of the question need to be answered.\n2. Read the answers to make sure that they directly answer the question asked.\n3. Check that the answer follows the requirements of the question, including the way it is answered, the length of the answer, the format of the answer, etc.\n4. Evaluate how relevant the answer is based on the above factors and give a score of 1 to 5, where 5 means the answer is very relevant and 1 means the answer is not relevant at all.\n\nRelevance:", + "correctness": "1. Read the questions carefully and identify the information that needs to be extracted from the material.\n2. Read the answer carefully and make sure it covers all the information that needs to be extracted.\n3. Use the material provided to verify the correctness of the response. If the response is inaccurate or contains incorrect or misleading information, a high score cannot be given.\n4. Check that the answer contains all the information required to be extracted and do not leave out any important details.\n5. Give a score between 1 and 5 based on the correctness and completeness of the response, with a score of 5 indicating a very accurate and complete response and a score of 1 indicating that the response barely extracts the required information.\n\nCorrectness:" + }, + "prompt": "You are a good assistant. 
Please rate the given answer to the \"extraction\" question below.\n\nThe question is as follows:\n\n{question}\n\nThe answer is as follows:\n\n{answer}\n\nThe metric for evaluation is as follows:\n\n{metric}\n\nYou should follow the following evaluation steps:\n\n{steps}" + }, + "generation": { + "id": 6, + "category": "generation", + "metrics": { + "language organization": "Language organization (1-5): whether the answer language is fluent and coherent, uses correct grammar, has a certain logic, uses appropriate connecting words, transition words, etc.", + "relevance": "Relevance (1-5): whether the content of the answer is relevant to the topic, does not answer the wrong question, and strictly follows the requirements of the topic.", + "diversity": "Diversity (1-5): Whether the answers use beautiful language and have some creativity and imagination. However, answers should also be kept reasonable and moderate, not overly exaggerated or off-topic." + }, + "CoT": { + "language organization": "1. Read the answers and check for grammatical errors, poor word choice, or other significant mistakes.\n2. Check that the answer is logical, conveys the information in a logical order, and is self-explanatory.\n3. Determine if the answer is relevant to the question or topic and conveys a clear message.\n4. Check that the answer is coherent and that appropriate transitions and switches are used to maintain coherence between sentences and paragraphs.\n5. Check that the answer is clearly structured and organized in such a way that the reader can easily understand the hierarchy and structure of the information.\n6. Evaluate the linguistic organization of the answer based on a combination of the above factors and give a score of 1 to 5, where 5 indicates very good linguistic organization and 1 indicates very poor linguistic organization.\n\nLanguage organization:", + "relevance": "1. Read the question to determine what the question asks and what aspects of the question need to be answered.\n2. Read the answers to make sure that they directly answer the question asked.\n3. Check that the answer follows the requirements of the question, including the way it is answered, the length of the answer, the format of the answer, etc.\n4. Evaluate how relevant the answer is based on the above factors and give a score of 1 to 5, where 5 means the answer is very relevant and 1 means the answer is not relevant at all.\n\nRelevance:", + "diversity": "1. Read the entire response carefully to ensure that you fully understand the content and theme expressed in the response.\n2. While reading the response, pay attention to the quality of the language, such as whether the wording is correct and the language is vivid.\n3. Check the creativity and imagination of the response to see if the response is engaging to read on.\n4. Check the reasonableness and appropriateness of the responses to see if the responses are exaggerated or off-topic.\n5. Rate the diversity on a scale of 1 to 5, with a 5 indicating a good quality response that is engaging to read and a 1 indicating a raw response or a question that is off-topic.\n\nDiversity:" + }, + "prompt": "You are a good assistant. 
Please rate the given answer to the \"generation\" question below.\n\nThe question is as follows:\n\n{question}\n\nThe answer is as follows:\n\n{answer}\n\nThe metric for evaluation is as follows:\n\n{metric}\n\nYou should follow the following evaluation steps:\n\n{steps}" + }, + "open_qa": { + "id": 7, + "category": "open_qa", + "metrics": { + "language organization": "Language organization (1-5): whether the answer language is fluent and coherent, uses correct grammar, has a certain logic, uses appropriate connecting words, transition words, etc.", + "relevance": "Relevance (1-5): whether the content of the answer is relevant to the topic, does not answer the wrong question, and strictly follows the requirements of the topic.", + "correctness": "Correctness (1-5): whether the answer is correct or not." + }, + "CoT": { + "language organization": "1. Read the answers and check for grammatical errors, poor word choice, or other significant mistakes.\n2. Check that the answer is logical, conveys the information in a logical order, and is self-explanatory.\n3. Determine if the answer is relevant to the question or topic and conveys a clear message.\n4. Check that the answer is coherent and that appropriate transitions and switches are used to maintain coherence between sentences and paragraphs.\n5. Check that the answer is clearly structured and organized in such a way that the reader can easily understand the hierarchy and structure of the information.\n6. Evaluate the linguistic organization of the answer based on a combination of the above factors and give a score of 1 to 5, where 5 indicates very good linguistic organization and 1 indicates very poor linguistic organization.\n\nLanguage organization:", + "relevance": "1. Read the question to determine what the question asks and what aspects of the question need to be answered.\n2. Read the answers to make sure that they directly answer the question asked.\n3. Check that the answer follows the requirements of the question, including the way it is answered, the length of the answer, the format of the answer, etc.\n4. Evaluate how relevant the answer is based on the above factors and give a score of 1 to 5, where 5 means the answer is very relevant and 1 means the answer is not relevant at all.\n\nRelevance:", + "correctness": "1. Read the question carefully and try to answer the question yourself.\n2. Check the correctness of the answer. You can use known facts or research to verify that the answer is correct. If the answer is correct, you can give a score of 5 for correctness. If the answer is partially correct, an appropriate score, such as 2, 3, or 4, may be given. If the answer is completely incorrect, only 1 point is awarded.\n\nCorrectness:" + }, + "prompt": "You are a good assistant. 
Please rate the answers to the \"open qa\" question below.\n\nThe question is as follows:\n\n{question}\n\nThe answer is as follows:\n\n{answer}\n\nThe metric for evaluation is as follows:\n\n{metric}\n\nYou should follow the following evaluation steps:\n\n{steps}" + }, + "rewriting": { + "id": 8, + "category": "rewriting", + "metrics": { + "language organization": "Language organization (1-5): whether the answer language is fluent and coherent, uses correct grammar, has a certain logic, uses appropriate connecting words, transition words, etc.", + "relevance": "Relevance (1-5): whether the content of the answer is relevant to the topic, does not answer the wrong question, and strictly follows the requirements of the topic.", + "correctness": "Correctness (1-5): whether the answer is correct or not." + }, + "CoT": { + "language organization": "1. Read the answers and check for grammatical errors, poor word choice, or other significant mistakes.\n2. Check that the answer is logical, conveys the information in a logical order, and is self-explanatory.\n3. Determine if the answer is relevant to the question or topic and conveys a clear message.\n4. Check that the answer is coherent and that appropriate transitions and switches are used to maintain coherence between sentences and paragraphs.\n5. Check that the answer is clearly structured and organized in such a way that the reader can easily understand the hierarchy and structure of the information.\n6. Evaluate the linguistic organization of the answer based on a combination of the above factors and give a score of 1 to 5, where 5 indicates very good linguistic organization and 1 indicates very poor linguistic organization.\n\nLanguage organization:", + "relevance": "1. Read the question to determine what the question asks and what aspects of the question need to be answered.\n2. Read the answers to make sure that they directly answer the question asked.\n3. Check that the answer follows the requirements of the question, including the way it is answered, the length of the answer, the format of the answer, etc.\n4. Evaluate how relevant the answer is based on the above factors and give a score of 1 to 5, where 5 means the answer is very relevant and 1 means the answer is not relevant at all.\n\nRelevance:", + "correctness": "1. Read the question carefully and try to answer the question yourself.\n2. Check the correctness of the answer. You can use known facts or research to verify that the answer is correct. If the answer is correct, you can give a score of 5 for correctness. If the answer is partially correct, an appropriate score, such as 2, 3, or 4, may be assigned. If the answer is completely incorrect, only 1 point is awarded.\n\nCorrectness:" + }, + "prompt": "You are a good assistant. 
Please rate the answers to the \"rewriting\" question below.\n\nThe question is as follows:\n\n{question}\n\nThe answer is as follows:\n\n{answer}\n\nThe metric for evaluation is as follows:\n\n{metric}\n\nYou should follow the following evaluation steps:\n\n{steps}" + }, + "roleplay": { + "id": 9, + "category": "roleplay", + "metrics": { + "language organization": "Language organization (1-5): whether the answer language is fluent and coherent, uses correct grammar, has a certain logic, uses appropriate connecting words, transition words, etc.", + "relevance": "Relevance (1-5): whether the content of the answer is relevant to the topic, does not answer the wrong question, and strictly follows the requirements of the topic.", + "fidelity": "Fidelity (1-5): whether the answer is able to answer the given request in strict compliance with the role setting.", + "creativity": "Creativity (1-5): The answers to the role-play questions need to be somewhat creative, but at the same time they need to adhere to the setting of the role." + }, + "CoT": { + "language organization": "1. Read the answers and check for grammatical errors, poor word choice, or other significant mistakes.\n2. Check that the answer is logical, conveys the information in a logical order, and is self-explanatory.\n3. Determine if the answer is relevant to the question or topic and conveys a clear message.\n4. Check that the answer is coherent and that appropriate transitions and switches are used to maintain coherence between sentences and paragraphs.\n5. Check that the answer is clearly structured and organized in such a way that the reader can easily understand the hierarchy and structure of the information.\n6. Evaluate the linguistic organization of the answer based on a combination of the above factors and give a score of 1 to 5, where 5 indicates very good linguistic organization and 1 indicates very poor linguistic organization.\n\nLanguage organization:", + "relevance": "1. Read the question to determine what the question asks and what aspects of the question need to be answered.\n2. Read the answers to make sure that they directly answer the question asked.\n3. Check that the answer follows the requirements of the question, including the way it is answered, the length of the answer, the format of the answer, etc.\n4. Evaluate how relevant the answer is based on the above factors and give a score of 1 to 5, where 5 means the answer is very relevant and 1 means the answer is not relevant at all.\n\nRelevance:", + "fidelity": "1. Read the question carefully to understand how the character is set up and represented in the question, including aspects such as occupation, background, point of view, and personality.\n2. Read the question's request and confirm the details that need to be taken into account when answering the request.\n3. Compare the provided answer with the setting of the role and assess whether the answer can strictly adhere to the setting of the role.\n4. Combine the results of the above assessment to give a fidelity score ranging from 1 to 5, where a score of 1 means that the response does not match the persona at all, and a score of 5 means that the response fully complies with the persona and satisfies the given request.\n\nFidelity:", + "creativity": "1. Read the question carefully to understand how the character is set up and represented in the question, including career, background, perspective, and personality.\n2. 
Evaluate whether the answer has unique ideas and suggestions that bring new ideas and insights to the questioner.\n3. Compare the creativity in the response to the setting of the persona and assess whether the response adheres to the setting and essential characteristics of the persona.\n4. Evaluate the quality of the responses in general and combine the results of the above assessment to give a creativity score ranging from 1 to 5, where a score of 1 indicates that the response lacks creativity and a score of 5 indicates that the response has unique ideas and suggestions and is able to adhere to the set-up of the persona.\n\nCreativity:" + }, + "prompt": "You are a good assistant. Please rate the given answer to the \"role-play\" question below.\n\nThe question is as follows:\n\n{question}\n\nThe answer is as follows:\n\n{answer}\n\nThe metric for evaluation is as follows:\n\n{metric}\n\nYou should follow the following evaluation steps:\n\n{steps}" + }, + "summarization": { + "id": 10, + "category": "summarization", + "metrics": { + "language organization": "Language organization (1-5): whether the answer language is fluent and coherent, uses correct grammar, has a certain logic, uses appropriate connecting words, transition words, etc.", + "relevance": "Relevance (1-5): whether the content of the answer is relevant to the topic, does not answer the wrong question, and strictly follows the requirements of the topic.", + "correctness": "Correctness (1-5): answers should summarize the main points of the material accurately and unambiguously.", + "conciseness": "Conciseness (1-5): answers should be concise and without redundant content." + }, + "CoT": { + "language organization": "1. Read the answers and check for grammatical errors, poor word choice, or other significant mistakes.\n2. Check that the answer is logical, conveys the information in a logical order, and is self-explanatory.\n3. Determine if the answer is relevant to the question or topic and conveys a clear message.\n4. Check that the answer is coherent and that appropriate transitions and switches are used to maintain coherence between sentences and paragraphs.\n5. Check that the answer is clearly structured and organized in such a way that the reader can easily understand the hierarchy and structure of the information.\n6. Evaluate the linguistic organization of the answer based on a combination of the above factors and give a score of 1 to 5, where 5 indicates very good linguistic organization and 1 indicates very poor linguistic organization.\n\nLanguage organization:", + "relevance": "1. Read the question to determine what the question asks and what aspects of the question need to be answered.\n2. Read the answers to make sure that they directly answer the question asked.\n3. Check that the answer follows the requirements of the question, including the way it is answered, the length of the answer, the format of the answer, etc.\n4. Evaluate how relevant the answer is based on the above factors and give a score of 1 to 5, where 5 means the answer is very relevant and 1 means the answer is not relevant at all.\n\nRelevance:", + "correctness": "1. Read the material given in the question carefully to understand its content and main points.\n2. Assess whether the answer accurately summarizes the key points of the source material.\n3. assess whether the response contains all the key information in the source material.\n4. 
Based on the above steps, give a score of 1-5, where 1 means that the response does not accurately summarize the main points of the material and 5 means that the response completely accurately summarizes the main points of the material.\n\nCorrectness:", + "conciseness": "1. Read the title and extract the main points of the material.\n2. Read the summary and note the main ideas and messages in it.\n3. Assess the length of the summary. A concise summary should usually convey key information within a few sentences or paragraphs, rather than lengthy paragraphs or essays.\n4. Check that the summary does not contain information that is not relevant to the main ideas or that is redundant.\n5. Make sure that the summary covers the key information in the material and that no important details have been omitted.\n6. Rate the summary on a scale of 1-5, where 5 means the summary is concise and free of redundancy, and 1 means the summary is lengthy or contains unnecessary information that is difficult to understand or remember. Based on your judgment, assign the appropriate score.\n\nConciseness:" + }, + "prompt": "You are a good assistant. Please rate the given answer to the \"summarization\" question below.\n\nThe question is as follows:\n\n{question}\n\nThe answer is as follows:\n\n{answer}\n\nThe metric for evaluation is as follows:\n\n{metric}\n\nYou should follow the following evaluation steps:\n\n{steps}" + }, + "general": { + "id": 11, + "category": "general", + "metrics": { + "language organization": "Language organization (1-5): whether the answer language is fluent and coherent, uses correct grammar, has a certain logic, uses appropriate connecting words, transition words, etc.", + "relevance": "Relevance (1-5): whether the content of the answer is relevant to the topic, does not answer the wrong question, and strictly follows the requirements of the topic.", + "correctness": "Correctness (1-5): whether the answer is correct or not." + }, + "CoT": { + "language organization": "1. Read the answers and check for grammatical errors, poor word choice, or other significant mistakes.\n2. Check that the answer is logical, conveys the information in a logical order, and is self-explanatory.\n3. Determine if the answer is relevant to the question or topic and conveys a clear message.\n4. Check that the answer is coherent and that appropriate transitions and switches are used to maintain coherence between sentences and paragraphs.\n5. Check that the answer is clearly structured and organized in such a way that the reader can easily understand the hierarchy and structure of the information.\n6. Evaluate the linguistic organization of the answer based on a combination of the above factors and give a score of 1 to 5, where 5 indicates very good linguistic organization and 1 indicates very poor linguistic organization.\n\nLanguage organization:", + "relevance": "1. Read the question to determine what the question asks and what aspects of the question need to be answered.\n2. Read the answers to make sure that they directly answer the question asked.\n3. Check that the answer follows the requirements of the question, including the way it is answered, the length of the answer, the format of the answer, etc.\n4. Evaluate how relevant the answer is based on the above factors and give a score of 1 to 5, where 5 means the answer is very relevant and 1 means the answer is not relevant at all.\n\nRelevance:", + "correctness": "1. Read the question carefully and try to answer the question yourself.\n2. 
Check the correctness of the answer. You can use known facts or research to verify that the answer is correct. If the answer is correct, you can give a score of 5 for correctness. If the answer is partially correct, an appropriate score, such as 2, 3, or 4, may be assigned. If the answer is completely incorrect, only 1 point is awarded.\n\nCorrectness:" + }, + "prompt": "You are a good assistant. Please rate the given answer to the question below.\n\nThe question is as follows:\n\n{question}\n\nThe answer is as follows:\n\n{answer}\n\nThe metric for evaluation is as follows:\n\n{metric}\n\nYou should follow the following evaluation steps:\n\n{steps}" + } +} diff --git a/applications/Chat/evaluate/requirements.txt b/applications/Chat/evaluate/requirements.txt index b0301c2f1..27d317ed8 100644 --- a/applications/Chat/evaluate/requirements.txt +++ b/applications/Chat/evaluate/requirements.txt @@ -8,3 +8,5 @@ seaborn pandas matplotlib numpy +zhon +rouge_score diff --git a/applications/Chat/evaluate/utils.py b/applications/Chat/evaluate/utils.py index 517c0a1c3..1f4069386 100644 --- a/applications/Chat/evaluate/utils.py +++ b/applications/Chat/evaluate/utils.py @@ -1,6 +1,15 @@ import io import json import os +import re +import string +from typing import Dict + +import matplotlib.pyplot as plt +import pandas as pd +import seaborn as sns +import tqdm +from zhon import hanzi def _make_w_io_base(f, mode: str): @@ -29,7 +38,7 @@ def jdump(obj, f, mode="w", indent=4, default=str): """ f = _make_w_io_base(f, mode) if isinstance(obj, (dict, list)): - json.dump(obj, f, indent=indent, default=default) + json.dump(obj, f, indent=indent, default=default, ensure_ascii=False) elif isinstance(obj, str): f.write(obj) else: @@ -61,3 +70,149 @@ def get_data_per_category(data, categories): data_per_category[category].append(item) return data_per_category + + +def remove_articles(text: str) -> str: + """ + Remove articles "a, an, the" in the given text. + It is used in evaluation of automatic metrics. + + """ + + pattern = re.compile(r"\b(a|an|the)\b", re.UNICODE) + return re.sub(pattern, " ", text) + + +def remove_punctuations(text: str) -> str: + """ + Remove punctuations in the given text. + It is used in evaluation of automatic metrics. + + """ + + punctuation = string.punctuation + hanzi.punctuation + punctuation = set([char for char in punctuation]) + punctuation.difference_update(set("!@#$%&()<>?|,.\"'")) + + out = [] + for char in text: + if char in punctuation: + continue + else: + out.append(char) + + return "".join(out) + + +def remove_redundant_space(text: str) -> str: + """ + Remove redundant spaces in the given text. + It is used in evaluation of automatic metrics. + + """ + + return " ".join(text.split()) + + +def preprocessing_text(text: str) -> str: + """ + Preprocess the given text. + It is used in evaluation of automatic metrics. + + """ + + return remove_redundant_space(remove_articles(remove_punctuations(text.lower()))) + + +def save_automatic_results(model_name: str, automatic_metric_stats: Dict[str, Dict], save_path: str) -> None: + """ + Save automatic evaluation results of different categories for one model. + + """ + + if not os.path.exists(save_path): + os.makedirs(save_path) + + automatic_df = pd.DataFrame(automatic_metric_stats) + automatic_df.to_csv(os.path.join(save_path, f"{model_name}_results.csv"), index=True) + + +def read_automatic_results(results_path: str, file_name: str) -> Dict[str, Dict]: + """ + Read a csv file and return a dictionary which stores scores per metric. 
+ + """ + + results = pd.read_csv(os.path.join(results_path, file_name), index_col=0) + + results_dict = {metric: {} for metric in list(results.index)} + for i, metric in enumerate(results_dict.keys()): + for j, category in enumerate(list(results.columns)): + if pd.isnull(results.iloc[i][j]): + continue + results_dict[metric][category] = results.iloc[i][j] + + return results_dict + + +def analyze_automatic_results(results_path: str, save_path: str) -> None: + """ + Analyze and visualize all csv files in the given folder. + + """ + + if not os.path.exists(results_path): + raise Exception(f'The given directory "{results_path}" doesn\'t exist! No results found!') + + all_statistics = {} + + for file_name in os.listdir(results_path): + if file_name.endswith("_results.csv"): + model_name = file_name.split("_results.csv")[0] + all_statistics[model_name] = read_automatic_results(results_path, file_name) + + if len(list(all_statistics.keys())) == 0: + raise Exception(f'There are no csv files in the given directory "{results_path}"!') + + frame_all = {"model": [], "category": [], "metric": [], "score": []} + frame_per_metric = {} + for model_name, model_statistics in all_statistics.items(): + for metric, metric_statistics in model_statistics.items(): + if frame_per_metric.get(metric) is None: + frame_per_metric[metric] = {"model": [], "category": [], "score": []} + + for category, category_score in metric_statistics.items(): + frame_all["model"].append(model_name) + frame_all["category"].append(category) + frame_all["metric"].append(metric) + frame_all["score"].append(category_score) + + frame_per_metric[metric]["model"].append(model_name) + frame_per_metric[metric]["category"].append(category) + frame_per_metric[metric]["score"].append(category_score) + + if not os.path.exists(save_path): + os.makedirs(save_path) + + frame_all = pd.DataFrame(frame_all) + frame_all.to_csv(os.path.join(save_path, "automatic_evaluation_statistics.csv")) + + for metric in tqdm.tqdm( + frame_per_metric.keys(), + desc=f"metric: ", + total=len(frame_per_metric.keys()), + ): + data = pd.DataFrame(frame_per_metric[metric]) + + sns.set() + fig = plt.figure(figsize=(16, 10)) + + fig = sns.barplot(x="category", y="score", hue="model", data=data, dodge=True) + fig.set_title(f"Comparison between Different Models for Metric {metric.title()}") + plt.xlabel("Evaluation Category") + plt.ylabel("Score") + + figure = fig.get_figure() + figure.savefig(os.path.join(save_path, f"{metric}.png"), dpi=400) + + plt.close() From ec9bbc0094f25f69c8bfa8f9653537c40bf91e36 Mon Sep 17 00:00:00 2001 From: Hongxin Liu Date: Tue, 6 Jun 2023 11:32:31 +0800 Subject: [PATCH 21/26] [devops] improving testmon cache (#3902) * [devops] improving testmon cache * [devops] fix branch name with slash * [devops] fix branch name with slash * [devops] fix edit action * [devops] fix edit action * [devops] fix edit action * [devops] fix edit action * [devops] fix edit action * [devops] fix edit action * [devops] update readme --- .github/workflows/README.md | 10 ++- .github/workflows/build_on_pr.yml | 118 ++++++++++++++++++++++++++++-- 2 files changed, 122 insertions(+), 6 deletions(-) diff --git a/.github/workflows/README.md b/.github/workflows/README.md index f40f4cc86..3fad7e36f 100644 --- a/.github/workflows/README.md +++ b/.github/workflows/README.md @@ -43,10 +43,18 @@ I will provide the details of each workflow below. 
| Workflow Name | File name | Description | | ---------------------- | -------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------- | -| `Build on PR` | `build_on_pr.yml` | This workflow is triggered when a PR changes essential files. It will run all the unit tests in the repository with 4 GPUs. | +| `Build on PR` | `build_on_pr.yml` | This workflow is triggered when a PR changes essential files and a branch is created/deleted. It will run all the unit tests in the repository with 4 GPUs. | | `Build on Schedule` | `build_on_schedule.yml` | This workflow will run the unit tests everyday with 8 GPUs. The result is sent to Lark. | | `Report test coverage` | `report_test_coverage.yml` | This PR will put up a comment to report the test coverage results when `Build` is done. | +To reduce the average time of the unit test on PR, `Build on PR` workflow manages testmon cache. + +1. When creating a new branch, it copies `cache/main/.testmondata*` to `cache//`. +2. When creating a new PR or change the base branch of a PR, it copies `cache//.testmondata*` to `cache/_pull//`. +3. When running unit tests for each PR, it restores testmon cache from `cache/_pull//`. After the test, it stores the cache back to `cache/_pull//`. +4. When a PR is closed, if it's merged, it copies `cache/_pull//.testmondata*` to `cache//`. Otherwise, it just removes `cache/_pull/`. +5. When a branch is deleted, it removes `cache/`. + ### Example Test | Workflow Name | File name | Description | diff --git a/.github/workflows/build_on_pr.yml b/.github/workflows/build_on_pr.yml index a5a17d176..b5f293107 100644 --- a/.github/workflows/build_on_pr.yml +++ b/.github/workflows/build_on_pr.yml @@ -2,7 +2,7 @@ name: Build on PR on: pull_request: - types: [synchronize, opened, reopened] + types: [synchronize, opened, reopened, ready_for_review, closed, edited] branches: - "main" - "develop" @@ -18,11 +18,63 @@ on: - "!tests/**.md" # ignore doc change - "pytest.ini" # test config change - "setup.py" # install command change + create: + delete: jobs: + prepare_cache: + name: Prepare testmon cache + if: | + github.event_name == 'create' && + github.event.ref_type == 'branch' && + github.event.repository.full_name == 'hpcaitech/ColossalAI' + runs-on: [self-hosted, gpu] + container: + image: hpcaitech/pytorch-cuda:1.12.0-11.3.0 + options: --rm + timeout-minutes: 5 + defaults: + run: + shell: bash + steps: + - name: Copy testmon cache + run: | # branch name may contain slash, we need to replace it with space + export REF_BRANCH=$(echo ${{ github.event.ref }} | sed "s/\// /") + if [ -d /github/home/testmon_cache/${MAIN_BRANCH} ]; then + [ ! 
-z "$(ls -A /github/home/testmon_cache/${MAIN_BRANCH})" ] && cp -p -r /github/home/testmon_cache/${MAIN_BRANCH} "/github/home/testmon_cache/${REF_BRANCH}" + fi + env: + MAIN_BRANCH: ${{ github.event.master_branch }} + + prepare_cache_for_pr: + name: Prepare testmon cache for PR + if: | + github.event_name == 'pull_request' && + (github.event.action == 'opened' || github.event.action == 'reopened' || (github.event.action == 'edited' && github.event.changes.base != null)) && + github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI' + runs-on: [self-hosted, gpu] + container: + image: hpcaitech/pytorch-cuda:1.12.0-11.3.0 + options: --rm + timeout-minutes: 5 + defaults: + run: + shell: bash + steps: + - name: Copy testmon cache + run: | # branch name may contain slash, we need to replace it with space + export BASE=$(echo ${{ github.event.pull_request.base.ref }} | sed "s/\// /") + if [ -d "/github/home/testmon_cache/${BASE}" ]; then + [ ! -z "$(ls -A "/github/home/testmon_cache/${BASE}")" ] && mkdir /github/home/testmon_cache/_pull && cp -p -r "/github/home/testmon_cache/${BASE}" /github/home/testmon_cache/_pull/${PR_NUMBER} + fi + env: + PR_NUMBER: ${{ github.event.pull_request.head.ref }} + detect: name: Detect file change if: | + github.event_name == 'pull_request' && + (github.event.action == 'synchronize' || github.event.action == 'opened' || github.event.action == 'reopened' || github.event.action == 'ready_for_review') && github.event.pull_request.draft == false && github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI' outputs: @@ -135,9 +187,11 @@ jobs: - name: Restore Testmon Cache run: | - if [ -d /github/home/testmon_cache ]; then - [ ! -z "$(ls -A /github/home/testmon_cache)" ] && cp -p -r /github/home/testmon_cache/.testmondata* /__w/ColossalAI/ColossalAI/ + if [ -d /github/home/testmon_cache/_pull/${PR_NUMBER} ]; then + [ ! -z "$(ls -A /github/home/testmon_cache/_pull/${PR_NUMBER})" ] && cp -p -r /github/home/testmon_cache/_pull/${PR_NUMBER}/.testmondata* /__w/ColossalAI/ColossalAI/ fi + env: + PR_NUMBER: ${{ github.event.number }} - name: Execute Unit Testing run: | @@ -149,8 +203,10 @@ jobs: - name: Store Testmon Cache run: | - [ -d /github/home/testmon_cache ] || mkdir /github/home/testmon_cache - cp -p -r /__w/ColossalAI/ColossalAI/.testmondata* /github/home/testmon_cache/ + mkdir -p /github/home/testmon_cache/_pull/${PR_NUMBER} + cp -p -r /__w/ColossalAI/ColossalAI/.testmondata* /github/home/testmon_cache/_pull/${PR_NUMBER}/ + env: + PR_NUMBER: ${{ github.event.number }} - name: Collate artifact env: @@ -188,3 +244,55 @@ jobs: with: name: report path: report/ + + store_cache: + name: Store testmon cache for PR + if: | + github.event_name == 'pull_request' && + github.event.action == 'closed' && + github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI' + runs-on: [self-hosted, gpu] + container: + image: hpcaitech/pytorch-cuda:1.12.0-11.3.0 + options: --rm + timeout-minutes: 5 + defaults: + run: + shell: bash + steps: + - name: Store testmon cache if possible + if: github.event.pull_request.merged == true + run: | # branch name may contain slash, we need to replace it with space + export BASE=$(echo ${{ github.event.pull_request.base.ref }} | sed "s/\// /") + if [ -d /github/home/testmon_cache/_pull/${PR_NUMBER} ]; then + [ ! 
-z "$(ls -A /github/home/testmon_cache/_pull/${PR_NUMBER})" ] && cp -p -r /github/home/testmon_cache/_pull/${PR_NUMBER}/.testmondata* "/github/home/testmon_cache/${BASE}/" + fi + env: + PR_NUMBER: ${{ github.event.pull_request.number }} + + - name: Remove testmon cache + if: github.event.pull_request.merged != true + run: | + rm -rf /github/home/testmon_cache/_pull/${PR_NUMBER} + env: + PR_NUMBER: ${{ github.event.pull_request.number }} + + remove_cache: + name: Remove testmon cache + if: | + github.event_name == 'delete' && + github.event.ref_type == 'branch' && + github.event.repository.full_name == 'hpcaitech/ColossalAI' + runs-on: [self-hosted, gpu] + container: + image: hpcaitech/pytorch-cuda:1.12.0-11.3.0 + options: --rm + timeout-minutes: 5 + defaults: + run: + shell: bash + steps: + - name: Remove testmon cache + run: | # branch name may contain slash, we need to replace it with space + export BASE=$(echo ${{ github.event.ref }} | sed "s/\// /") + rm -rf "/github/home/testmon_cache/${BASE}" From c1535ccbba2688682708b0203cce97b01d7750ef Mon Sep 17 00:00:00 2001 From: Baizhou Zhang <56809903+Fridge003@users.noreply.github.com> Date: Tue, 6 Jun 2023 13:36:11 +0800 Subject: [PATCH 22/26] [doc] fix docs about booster api usage (#3898) --- colossalai/booster/booster.py | 4 ++-- docs/source/en/features/zero_with_chunk.md | 4 ++-- docs/source/zh-Hans/features/zero_with_chunk.md | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/colossalai/booster/booster.py b/colossalai/booster/booster.py index 61d912157..4a42e2049 100644 --- a/colossalai/booster/booster.py +++ b/colossalai/booster/booster.py @@ -25,11 +25,11 @@ class Booster: Examples: ```python colossalai.launch(...) - plugin = GeminiPlugin(stage=3, ...) + plugin = GeminiPlugin(...) 
booster = Booster(precision='fp16', plugin=plugin) model = GPT2() - optimizer = Adam(model.parameters()) + optimizer = HybridAdam(model.parameters()) dataloader = Dataloader(Dataset) lr_scheduler = LinearWarmupScheduler() criterion = GPTLMLoss() diff --git a/docs/source/en/features/zero_with_chunk.md b/docs/source/en/features/zero_with_chunk.md index d6f6f611a..1b27d64b6 100644 --- a/docs/source/en/features/zero_with_chunk.md +++ b/docs/source/en/features/zero_with_chunk.md @@ -195,7 +195,7 @@ def get_data(batch_size, seq_len, vocab_size): Finally, we define a model which uses Gemini + ZeRO DDP and define our training loop, As we pre-train GPT in this example, we just use a simple language model loss: ```python -from torch.optim import Adam +from colossalai.nn.optimizer import HybridAdam from colossalai.booster import Booster from colossalai.zero import ColoInitContext @@ -211,7 +211,7 @@ def main(): # build criterion criterion = GPTLMLoss() - optimizer = Adam(model.parameters(), lr=0.001) + optimizer = HybridAdam(model.parameters(), lr=0.001) torch.manual_seed(123) default_pg = ProcessGroup(tp_degree=args.tp_degree) diff --git a/docs/source/zh-Hans/features/zero_with_chunk.md b/docs/source/zh-Hans/features/zero_with_chunk.md index 9030464dd..9fe5601bb 100644 --- a/docs/source/zh-Hans/features/zero_with_chunk.md +++ b/docs/source/zh-Hans/features/zero_with_chunk.md @@ -197,7 +197,7 @@ def get_data(batch_size, seq_len, vocab_size): 最后,使用booster注入 Gemini + ZeRO DDP 特性, 并定义训练循环。由于我们在这个例子中对GPT进行预训练,因此只使用了一个简单的语言模型损失函数: ```python -from torch.optim import Adam +from colossalai.nn.optimizer import HybridAdam from colossalai.booster import Booster from colossalai.zero import ColoInitContext @@ -213,7 +213,7 @@ def main(): # build criterion criterion = GPTLMLoss() - optimizer = Adam(model.parameters(), lr=0.001) + optimizer = HybridAdam(model.parameters(), lr=0.001) torch.manual_seed(123) default_pg = ProcessGroup(tp_degree=args.tp_degree) From 0e484e620134e3c284216c5c493e8813318cfbdb Mon Sep 17 00:00:00 2001 From: digger yu Date: Tue, 6 Jun 2023 14:07:36 +0800 Subject: [PATCH 23/26] [nfc]fix typo colossalai/pipeline tensor nn (#3899) * fix typo colossalai/autochunk auto_parallel amp * fix typo colossalai/auto_parallel nn utils etc. * fix typo colossalai/auto_parallel autochunk fx/passes etc. 
* fix typo docs/ * change placememt_policy to placement_policy in docs/ and examples/ * fix typo colossalai/ applications/ * fix typo colossalai/cli fx kernel * fix typo colossalai/nn * revert change warmuped * fix typo colossalai/pipeline tensor nn --- colossalai/nn/optimizer/cpu_adam.py | 2 +- colossalai/nn/optimizer/hybrid_adam.py | 2 +- colossalai/pipeline/pipelinable.py | 8 ++++---- colossalai/pipeline/rpc/_pipeline_base.py | 10 +++++----- colossalai/pipeline/rpc/_pipeline_schedule.py | 6 +++--- colossalai/pipeline/utils.py | 2 +- colossalai/tensor/d_tensor/comm_spec.py | 2 +- colossalai/tensor/d_tensor/sharding_spec.py | 4 ++-- colossalai/tensor/param_op_hook.py | 2 +- colossalai/tensor/process_group.py | 2 +- colossalai/tensor/shape_consistency.py | 6 +++--- colossalai/tensor/sharding_spec.py | 6 +++--- colossalai/tensor/utils.py | 2 +- 13 files changed, 27 insertions(+), 27 deletions(-) diff --git a/colossalai/nn/optimizer/cpu_adam.py b/colossalai/nn/optimizer/cpu_adam.py index 1ec8783c5..3a6d37103 100644 --- a/colossalai/nn/optimizer/cpu_adam.py +++ b/colossalai/nn/optimizer/cpu_adam.py @@ -13,7 +13,7 @@ from .nvme_optimizer import NVMeOptimizer class CPUAdam(NVMeOptimizer): """Implements Adam algorithm. - Supports parameters updating on both GPU and CPU, depanding on the device of parameters. + Supports parameters updating on both GPU and CPU, depending on the device of parameters. But the parameters and gradients should on the same device: * Parameters on CPU and gradients on CPU is allowed. * Parameters on GPU and gradients on GPU is allowed. diff --git a/colossalai/nn/optimizer/hybrid_adam.py b/colossalai/nn/optimizer/hybrid_adam.py index 526071b06..84903ac36 100644 --- a/colossalai/nn/optimizer/hybrid_adam.py +++ b/colossalai/nn/optimizer/hybrid_adam.py @@ -14,7 +14,7 @@ from .cpu_adam import CPUAdam class HybridAdam(CPUAdam): """Implements Adam algorithm. - Supports parameters updating on both GPU and CPU, depanding on the device of parameters. + Supports parameters updating on both GPU and CPU, depending on the device of parameters. But the parameters and gradients should on the same device: * Parameters on CPU and gradients on CPU is allowed. * Parameters on GPU and gradients on GPU is allowed. diff --git a/colossalai/pipeline/pipelinable.py b/colossalai/pipeline/pipelinable.py index 9731530a6..79913987b 100644 --- a/colossalai/pipeline/pipelinable.py +++ b/colossalai/pipeline/pipelinable.py @@ -83,7 +83,7 @@ class PipelinableContext(InsertPostInitMethodToModuleSubClasses): for k, v in kwargs.items(): if isinstance(v, torch.nn.Module): v = self._layer_spec_dict[id(v)] - # (lyl)TODO: analyse ColoTensor as well + # (lyl)TODO: analyze ColoTensor as well modified_kwargs[k] = v # keep track of the module children @@ -117,7 +117,7 @@ class PipelinableContext(InsertPostInitMethodToModuleSubClasses): def to_layer_list(self, exec_seq=None): """ Create a layer spec list and func list with execution sequence given by user. - If exec_seq is None, we will take the module initizing order as execution order. + If exec_seq is None, we will take the module initializing order as execution order. """ self._exec_seq = exec_seq @@ -177,7 +177,7 @@ class PipelinableContext(InsertPostInitMethodToModuleSubClasses): def partition(self, num_chunks, pipeline_size, rank): """ - Partitioned model will be built respect to partion policy. + Partitioned model will be built respect to partition policy. The real module instance will be built in this method. 
""" if isinstance(self._policy, str): @@ -193,7 +193,7 @@ class PipelinableContext(InsertPostInitMethodToModuleSubClasses): self.customized_parts = customized_partition(self._exec_seq) assert len(self.customized_parts) == gpc.get_world_size( ParallelMode.PIPELINE - ), f'World size is {gpc.get_world_size(ParallelMode.PIPELINE)}, but the number of partions is {len(self.customized_parts)}' + ), f'World size is {gpc.get_world_size(ParallelMode.PIPELINE)}, but the number of partitions is {len(self.customized_parts)}' parts = self.customized_parts[rank] else: raise ValueError("A string partition policy should be one of ['uniform', 'balanced', 'customized'].") diff --git a/colossalai/pipeline/rpc/_pipeline_base.py b/colossalai/pipeline/rpc/_pipeline_base.py index 2d7e25c82..9e549df58 100644 --- a/colossalai/pipeline/rpc/_pipeline_base.py +++ b/colossalai/pipeline/rpc/_pipeline_base.py @@ -123,7 +123,7 @@ class WorkerBase(ABC): self.device = device self._initialize_outstanding_range() - # variable and const for context managment + # variable and const for context management self.outstanding = 0 self.forward_times = 0 self.backward_times = 0 @@ -226,7 +226,7 @@ class WorkerBase(ABC): self.pp_rank_to_worker_rref = pp_rank_to_worker_rref # for some schedule need the other worker's info to initialise partition (like Chimera) - # construction of partition is executed after the registion of pp_rank_to_worker_rref + # construction of partition is executed after the registration of pp_rank_to_worker_rref self._initialize_partition() # res_use works for lifecycle counter, @@ -418,7 +418,7 @@ class WorkerBase(ABC): # On current PP middleware design for DAG, get_output_by_key used by _subscribe_producer # can only be executed once for every producer-consumer stage pair, which is necessary # to count the lifecycle of work_item. So, keeping the _subscribe_producer in the same - # lock of work_item queue operation gurantees the consistency of lifecycle counter. + # lock of work_item queue operation guarantees the consistency of lifecycle counter. work_item_from_producer = self._subscribe_producer(microbatch_id, forward_only) self.work_list[key] = work_item_from_producer self.work_list_condition_lock.notify_all() @@ -460,7 +460,7 @@ class WorkerBase(ABC): # On current PP middleware design for DAG, get_output_by_key used by subscribe_consumer # can only be executed once for every producer-consumer stage pair, which is necessary # to count the lifecycle of work_item. So, keeping the subscribe_consumer in the same - # lock of work_item queue operation gurantees the consistency of lifecycle counter. + # lock of work_item queue operation guarantees the consistency of lifecycle counter. 
work_item_from_consumer = self._subscribe_consumer(microbatch_id) self.work_list[key] = work_item_from_consumer self.work_list_condition_lock.notify_all() @@ -508,7 +508,7 @@ class WorkerBase(ABC): assert self.producer_stage_ids is None, f"all the producers of rank {rank} has been subscribed" assert self.consumer_stage_ids is None, f"all the consumers of rank {rank} has been subscribed" - # should be aranged in order, the order of the input of current forward + # should be arranged in order, the order of the input of current forward self.producer_stage_ids = self.get_producer_stage_ids() self.consumer_stage_ids = self.get_consumer_stage_ids() diff --git a/colossalai/pipeline/rpc/_pipeline_schedule.py b/colossalai/pipeline/rpc/_pipeline_schedule.py index 0d572231d..6eda8f3b3 100644 --- a/colossalai/pipeline/rpc/_pipeline_schedule.py +++ b/colossalai/pipeline/rpc/_pipeline_schedule.py @@ -123,7 +123,7 @@ class ChimeraWorker(WorkerBase): assert self.producer_stage_ids is None, f"all the producers of rank {rank} has been subscribed" assert self.consumer_stage_ids is None, f"all the consumers of rank {rank} has been subscribed" - # should be aranged in order, the order of the input of current forward + # should be arranged in order, the order of the input of current forward self.producer_stage_ids = [] self.consumer_stage_ids = [] @@ -174,7 +174,7 @@ class ChimeraWorker(WorkerBase): else: # if it is down pipeline, create partition by origin method co_up_pp_worker_rref = self.pp_rank_to_worker_rref[pp_rank - stage_num] - # get the coresponding model state dict and wait for its init + # get the corresponding model state dict and wait for its init state_dict = co_up_pp_worker_rref.rpc_sync().get_partition_state_dict() super()._initialize_partition() self.module_partition.load_state_dict(state_dict) @@ -228,7 +228,7 @@ class ChimeraWorker(WorkerBase): stage_num = self.actual_stage_num co_pp_rank = (pp_rank + stage_num) % (2 * stage_num) - # if currrent pp_rank is not the first to do step + # if current pp_rank is not the first to do step # wait its previous pp_rank finish step grads = self.get_parameter_gradients() diff --git a/colossalai/pipeline/utils.py b/colossalai/pipeline/utils.py index df7226644..ac8a3ad7d 100644 --- a/colossalai/pipeline/utils.py +++ b/colossalai/pipeline/utils.py @@ -113,7 +113,7 @@ def _binary_search(weights, num): def partition_uniform(num_items, pipeline_parallel_size, num_chunks): assert num_items % num_chunks == 0, \ - "Layer length should be divided by the number of chunks, otherwise parameter method is recomended" + "Layer length should be divided by the number of chunks, otherwise parameter method is recommended" logger = get_dist_logger() parts = [[] for _ in range(pipeline_parallel_size)] diff --git a/colossalai/tensor/d_tensor/comm_spec.py b/colossalai/tensor/d_tensor/comm_spec.py index 765d8ec1b..159125fa1 100644 --- a/colossalai/tensor/d_tensor/comm_spec.py +++ b/colossalai/tensor/d_tensor/comm_spec.py @@ -28,7 +28,7 @@ class CommSpec: to determine the buffer shape, and logical_process_axis Argument: - comm_pattern(CollectiveCommPattern): decribe the communication method used in this spec. + comm_pattern(CollectiveCommPattern): describe the communication method used in this spec. process_groups_dict(Dict): A dict which contains the process groups used to apply this CommSpec. gather_dim(int, Optional): The gather_dim of the tensor will be gathered. shard_dim(int, Optional): The shard_dim of the tensor will be sharded. 
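The `CommSpec` docstring above ties a collective pattern to a `gather_dim` and a `shard_dim`. As a minimal, self-contained sketch of that idea in plain PyTorch (an illustration of the concept only, not Colossal-AI's actual `CommSpec` API and with no real collective communication):

```python
import torch

# Mimic what shard/gather along chosen dimensions do to a tensor's shape,
# using local tensor ops in place of real collectives.
def shard(tensor: torch.Tensor, shard_dim: int, world_size: int) -> list:
    # split into `world_size` equal pieces along `shard_dim`
    return list(torch.chunk(tensor, world_size, dim=shard_dim))

def gather(pieces: list, gather_dim: int) -> torch.Tensor:
    # concatenate the pieces back along `gather_dim`
    return torch.cat(pieces, dim=gather_dim)

x = torch.randn(4, 8)
pieces = shard(x, shard_dim=1, world_size=4)    # 4 pieces of shape (4, 2)
restored = gather(pieces, gather_dim=1)         # back to shape (4, 8)
assert torch.equal(restored, x)
```

An all-to-all style transform would combine the two steps, gathering along one dimension while re-sharding along another, which is why the spec carries both dimensions.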
diff --git a/colossalai/tensor/d_tensor/sharding_spec.py b/colossalai/tensor/d_tensor/sharding_spec.py index 2ea0c4db8..565012b58 100644 --- a/colossalai/tensor/d_tensor/sharding_spec.py +++ b/colossalai/tensor/d_tensor/sharding_spec.py @@ -41,7 +41,7 @@ class DimSpec: def _convert_str_to_shard_list(self, str_spec): ''' - Conver str_spec into shard_list. + Convert str_spec into shard_list. Argument: str_spec(str): dim spec in str type. @@ -58,7 +58,7 @@ class DimSpec: def build_difference_2d_dict(self): ''' - Build a difference maping for 2D device mesh case. It will be used to + Build a difference mapping for 2D device mesh case. It will be used to compute the difference between DimSpec pairs. ''' diff --git a/colossalai/tensor/param_op_hook.py b/colossalai/tensor/param_op_hook.py index 9c2e0d4ad..8ed8176d9 100644 --- a/colossalai/tensor/param_op_hook.py +++ b/colossalai/tensor/param_op_hook.py @@ -164,7 +164,7 @@ def _get_grad_args(*args): for obj in args: if _is_grad_tensor(obj): return args, None - # otherwise, the first arguement should be a tuple of grad tensors + # otherwise, the first argument should be a tuple of grad tensors # if there is no grad tensor, the backward of PreFwdPostBwd can't be triggered arg_zero = args[0] if not isinstance(arg_zero, tuple): diff --git a/colossalai/tensor/process_group.py b/colossalai/tensor/process_group.py index f108bdc24..8d2e9a616 100644 --- a/colossalai/tensor/process_group.py +++ b/colossalai/tensor/process_group.py @@ -130,7 +130,7 @@ class ProcessGroup: @property def has_cpu_groups(self) -> bool: """has_cpu_groups - If cpu groups have been initailized. + If cpu groups have been initialized. Returns: bool: cpu process groups have been initialized or not. diff --git a/colossalai/tensor/shape_consistency.py b/colossalai/tensor/shape_consistency.py index 0a840006f..5bec552d6 100644 --- a/colossalai/tensor/shape_consistency.py +++ b/colossalai/tensor/shape_consistency.py @@ -252,7 +252,7 @@ class ShapeConsistencyManager(metaclass=SingletonMeta): def get_all_shard_spec(self, source_spec: ShardingSpec, orig_cost_dict): ''' Get all valid sharding specs from source_spec with single shard operation, and - accumulate commucation cost on origin cost which will finally be used in auto sharding solver. + accumulate communication cost on origin cost which will finally be used in auto sharding solver. For the sharding operation, we just care about legal sharding dimensions. Argument: @@ -386,7 +386,7 @@ class ShapeConsistencyManager(metaclass=SingletonMeta): def get_all_one_step_transform_spec(self, source_spec: ShardingSpec, orig_cost_dict) -> Dict[ShardingSpec, float]: ''' Get all valid sharding specs from source_spec with one step transform, and - accumulate commucation cost on origin cost which will finally be used in auto sharding solver. + accumulate communication cost on origin cost which will finally be used in auto sharding solver. Note: all-gather will eliminate a sharding dimension, all-to-all will keep sharding dimension same as before, and shard will add a sharding dimension. Therefore, the result of above operations are mutual exclusive, @@ -577,7 +577,7 @@ class ShapeConsistencyManager(metaclass=SingletonMeta): Step3: Repeat above steps until the source spec transform to target spec. - During finding the transform path, commucation cost will be accumulated, and it + During finding the transform path, communication cost will be accumulated, and it will be finally used in auto parallel solver. 
Additionally, to avoid repeating the path search in runtime, we cached all solved path diff --git a/colossalai/tensor/sharding_spec.py b/colossalai/tensor/sharding_spec.py index bed320130..406ad4909 100644 --- a/colossalai/tensor/sharding_spec.py +++ b/colossalai/tensor/sharding_spec.py @@ -45,7 +45,7 @@ class _DimSpec: def _convert_str_to_shard_list(self, str_spec): ''' - Conver str_spec into shard_list. + Convert str_spec into shard_list. Argument: str_spec(str): dim spec in str type. @@ -62,7 +62,7 @@ class _DimSpec: def build_difference_2d_dict(self): ''' - Build a difference maping for 2D device mesh case. It will be used to + Build a difference mapping for 2D device mesh case. It will be used to compute the difference between DimSpec pairs. ''' @@ -166,7 +166,7 @@ class ShardingSpec: device_mesh(DeviceMesh): A logical view of a physical mesh. entire_shape(torch.Size): The entire shape of tensor before sharded. dim_partition_dict(Dict[int, List[int]], optional): The key is the dimension of tensor to be sharded, - and the value of the key decribe which logical axis will be sharded in that dimension. + and the value of the key describe which logical axis will be sharded in that dimension. sharding_sequence(List[_DimSpec], optional): A straight view of ShardingSpec looks like [R, R, S0, S1]. ''' diff --git a/colossalai/tensor/utils.py b/colossalai/tensor/utils.py index 6e30f97fe..e7d51d099 100644 --- a/colossalai/tensor/utils.py +++ b/colossalai/tensor/utils.py @@ -77,7 +77,7 @@ def shard_simulator(target_pair, legal_sharding_dims): Argument: target_pair(Tuple[int, List[int]]): The first element is the dimension of tensor to be sharded, - and the second element decribes which logical axis will be sharded in that dimension. + and the second element describes which logical axis will be sharded in that dimension. ''' _, shard_list = target_pair shard_list_list = [] From 41fb7236aa32c307e83b0b9cc50ce2a6da279343 Mon Sep 17 00:00:00 2001 From: Hongxin Liu Date: Tue, 6 Jun 2023 18:58:58 +0800 Subject: [PATCH 24/26] [devops] hotfix CI about testmon cache (#3910) * [devops] hotfix CI about testmon cache * [devops] fix testmon cahe on pr --- .github/workflows/build_on_pr.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build_on_pr.yml b/.github/workflows/build_on_pr.yml index b5f293107..a2807859b 100644 --- a/.github/workflows/build_on_pr.yml +++ b/.github/workflows/build_on_pr.yml @@ -65,10 +65,10 @@ jobs: run: | # branch name may contain slash, we need to replace it with space export BASE=$(echo ${{ github.event.pull_request.base.ref }} | sed "s/\// /") if [ -d "/github/home/testmon_cache/${BASE}" ]; then - [ ! -z "$(ls -A "/github/home/testmon_cache/${BASE}")" ] && mkdir /github/home/testmon_cache/_pull && cp -p -r "/github/home/testmon_cache/${BASE}" /github/home/testmon_cache/_pull/${PR_NUMBER} + [ ! -z "$(ls -A "/github/home/testmon_cache/${BASE}")" ] && mkdir -p /github/home/testmon_cache/_pull && cp -p -r "/github/home/testmon_cache/${BASE}" /github/home/testmon_cache/_pull/${PR_NUMBER} fi env: - PR_NUMBER: ${{ github.event.pull_request.head.ref }} + PR_NUMBER: ${{ github.event.number }} detect: name: Detect file change From b5f0566363687aaa91767bb7069af874bedfb7e8 Mon Sep 17 00:00:00 2001 From: Hongxin Liu Date: Wed, 7 Jun 2023 10:41:16 +0800 Subject: [PATCH 25/26] [chat] add distributed PPO trainer (#3740) * Detached ppo (#9) * run the base * working on dist ppo * sync * detached trainer * update detached trainer. 
no maker update function * facing init problem * 1 maker 1 trainer detached run. but no model update * facing cuda problem * fix save functions * verified maker update * nothing * add ignore * analyize loss issue * remove some debug codes * facing 2m1t stuck issue * 2m1t verified * do not use torchrun * working on 2m2t * working on 2m2t * initialize strategy in ray actor env * facing actor's init order issue * facing ddp model update issue (need unwarp ddp) * unwrap ddp actor * checking 1m2t stuck problem * nothing * set timeout for trainer choosing. It solves the stuck problem! * delete some debug output * rename to sync with upstream * rename to sync with upstream * coati rename * nothing * I am going to detach the replaybuffer from trainer and make it a Ray Actor. Two benefits: 1. support TP trainer. 2. asynchronized buffer operations * experience_maker_holder performs target-revolving _send_experience() instead of length comparison. * move code to ray subfolder * working on pipeline inference * apply comments * working on pipeline strategy. in progress. * remove pipeline code. clean this branch * update remote parameters by state_dict. no test * nothing * state_dict sharding transfer * merge debug branch * gemini _unwrap_model fix * simplify code * simplify code & fix LoRALinear AttributeError * critic unwrapped state_dict --------- Co-authored-by: csric * [chat] add perfomance evaluator and fix bugs (#10) * [chat] add performance evaluator for ray * [chat] refactor debug arg * [chat] support hf config * [chat] fix generation * [chat] add 1mmt dummy example * [chat] fix gemini ckpt * split experience to send (#11) Co-authored-by: csric * [chat] refactor trainer and maker (#12) * [chat] refactor experience maker holder * [chat] refactor model init * [chat] refactor trainer args * [chat] refactor model init * [chat] refactor trainer * [chat] refactor experience sending logic and training loop args (#13) * [chat] refactor experience send logic * [chat] refactor trainer * [chat] refactor trainer * [chat] refactor experience maker * [chat] refactor pbar * [chat] refactor example folder (#14) * [chat] support quant (#15) * [chat] add quant * [chat] add quant example * prompt example (#16) * prompt example * prompt load csv data * remove legacy try --------- Co-authored-by: csric * [chat] add mmmt dummy example and refactor experience sending (#17) * [chat] add mmmt dummy example * [chat] refactor naive strategy * [chat] fix struck problem * [chat] fix naive strategy * [chat] optimize experience maker sending logic * [chat] refactor sending assignment * [chat] refactor performance evaluator (#18) * Prompt Example & requires_grad state_dict & sharding state_dict (#19) * prompt example * prompt load csv data * remove legacy try * maker models require_grad set to False * working on zero redundancy update * mmmt_prompt example; naive strategy requires_grad state_dict & sharding; maker model requires_no_grad. * remove legacy examples * remove legacy examples * remove replay buffer tp state. bad design --------- Co-authored-by: csric * state_dict sending adapts to new unwrap function (#20) * prompt example * prompt load csv data * remove legacy try * maker models require_grad set to False * working on zero redundancy update * mmmt_prompt example; naive strategy requires_grad state_dict & sharding; maker model requires_no_grad. * remove legacy examples * remove legacy examples * remove replay buffer tp state. 
bad design * opt benchmark * better script * nothing * [chat] strategy refactor unwrap model * [chat] strategy refactor save model * [chat] add docstr * [chat] refactor trainer save model * [chat] fix strategy typing * [chat] refactor trainer save model * [chat] update readme * [chat] fix unit test * working on lora reconstruction * state_dict sending adapts to new unwrap function * remove comments --------- Co-authored-by: csric Co-authored-by: ver217 * [chat-ray] add readme (#21) * add readme * transparent graph * add note background --------- Co-authored-by: csric * [chat] get images from url (#22) * Refactor/chat ray (#23) * [chat] lora add todo * [chat] remove unused pipeline strategy * [chat] refactor example structure * [chat] setup ci for ray * [chat-ray] Support LoRA trainer. LoRA weights reconstruction. (#24) * lora support prototype * lora support * 1mmt lora & remove useless code --------- Co-authored-by: csric * [chat] fix test ci for ray * [chat] fix test ci requirements for ray * [chat] fix ray runtime env * [chat] fix ray runtime env * [chat] fix example ci docker args * [chat] add debug info in trainer * [chat] add nccl debug info * [chat] skip ray test * [doc] fix typo --------- Co-authored-by: csric <59389055+CsRic@users.noreply.github.com> Co-authored-by: csric --- .github/workflows/run_chatgpt_examples.yml | 2 +- .../Chat/benchmarks/ray/1mmt_dummy.py | 178 +++++++++++ .../Chat/benchmarks/ray/mmmt_dummy.py | 189 ++++++++++++ applications/Chat/coati/models/lora.py | 8 +- applications/Chat/coati/quant/__init__.py | 7 + .../Chat/coati/quant/llama_gptq/__init__.py | 5 + .../Chat/coati/quant/llama_gptq/loader.py | 26 ++ .../coati/quant/llama_gptq/model_utils.py | 13 + .../Chat/coati/quant/llama_gptq/quant.py | 283 ++++++++++++++++++ applications/Chat/coati/quant/utils.py | 28 ++ applications/Chat/coati/ray/README.md | 160 ++++++++++ applications/Chat/coati/ray/__init__.py | 2 - .../Chat/coati/ray/callbacks/__init__.py | 9 + applications/Chat/coati/ray/callbacks/base.py | 66 ++++ .../ray/callbacks/performance_evaluator.py | 212 +++++++++++++ .../ray/{src => }/detached_replay_buffer.py | 69 ++--- .../Chat/coati/ray/detached_trainer_base.py | 179 +++++++++++ .../ray/{src => }/detached_trainer_ppo.py | 192 ++++++------ applications/Chat/coati/ray/example/1m1t.py | 153 ---------- applications/Chat/coati/ray/example/1m1t.sh | 23 -- applications/Chat/coati/ray/example/1m2t.py | 186 ------------ applications/Chat/coati/ray/example/1m2t.sh | 23 -- applications/Chat/coati/ray/example/2m1t.py | 140 --------- applications/Chat/coati/ray/example/2m1t.sh | 23 -- applications/Chat/coati/ray/example/2m2t.py | 209 ------------- applications/Chat/coati/ray/example/2m2t.sh | 23 -- .../Chat/coati/ray/experience_maker_holder.py | 271 +++++++++++++++++ .../Chat/coati/ray/lora_constructor.py | 122 ++++++++ applications/Chat/coati/ray/src/__init__.py | 0 .../coati/ray/src/detached_trainer_base.py | 121 -------- .../coati/ray/src/experience_maker_holder.py | 172 ----------- .../Chat/coati/ray/src/pipeline_strategy.py | 105 ------- applications/Chat/coati/ray/src/utils.py | 48 --- applications/Chat/coati/ray/utils.py | 152 ++++++++++ .../Chat/coati/trainer/strategies/base.py | 4 + .../coati/trainer/strategies/colossalai.py | 12 + .../Chat/coati/trainer/strategies/ddp.py | 13 +- .../Chat/coati/trainer/strategies/naive.py | 62 +++- .../Chat/coati/trainer/strategies/sampler.py | 1 + applications/Chat/examples/ray/1mmt_prompt.py | 175 +++++++++++ applications/Chat/examples/ray/mmmt_prompt.py | 189 
++++++++++++ .../Chat/examples/ray/requirements.txt | 1 + applications/Chat/examples/ray/test_ci.sh | 12 + applications/Chat/examples/test_ci.sh | 3 + 44 files changed, 2494 insertions(+), 1377 deletions(-) create mode 100644 applications/Chat/benchmarks/ray/1mmt_dummy.py create mode 100644 applications/Chat/benchmarks/ray/mmmt_dummy.py create mode 100644 applications/Chat/coati/quant/__init__.py create mode 100644 applications/Chat/coati/quant/llama_gptq/__init__.py create mode 100644 applications/Chat/coati/quant/llama_gptq/loader.py create mode 100644 applications/Chat/coati/quant/llama_gptq/model_utils.py create mode 100644 applications/Chat/coati/quant/llama_gptq/quant.py create mode 100644 applications/Chat/coati/quant/utils.py create mode 100644 applications/Chat/coati/ray/README.md create mode 100644 applications/Chat/coati/ray/callbacks/__init__.py create mode 100644 applications/Chat/coati/ray/callbacks/base.py create mode 100644 applications/Chat/coati/ray/callbacks/performance_evaluator.py rename applications/Chat/coati/ray/{src => }/detached_replay_buffer.py (62%) create mode 100644 applications/Chat/coati/ray/detached_trainer_base.py rename applications/Chat/coati/ray/{src => }/detached_trainer_ppo.py (55%) delete mode 100644 applications/Chat/coati/ray/example/1m1t.py delete mode 100644 applications/Chat/coati/ray/example/1m1t.sh delete mode 100644 applications/Chat/coati/ray/example/1m2t.py delete mode 100644 applications/Chat/coati/ray/example/1m2t.sh delete mode 100644 applications/Chat/coati/ray/example/2m1t.py delete mode 100644 applications/Chat/coati/ray/example/2m1t.sh delete mode 100644 applications/Chat/coati/ray/example/2m2t.py delete mode 100644 applications/Chat/coati/ray/example/2m2t.sh create mode 100644 applications/Chat/coati/ray/experience_maker_holder.py create mode 100644 applications/Chat/coati/ray/lora_constructor.py delete mode 100644 applications/Chat/coati/ray/src/__init__.py delete mode 100644 applications/Chat/coati/ray/src/detached_trainer_base.py delete mode 100644 applications/Chat/coati/ray/src/experience_maker_holder.py delete mode 100644 applications/Chat/coati/ray/src/pipeline_strategy.py delete mode 100644 applications/Chat/coati/ray/src/utils.py create mode 100644 applications/Chat/coati/ray/utils.py create mode 100644 applications/Chat/examples/ray/1mmt_prompt.py create mode 100644 applications/Chat/examples/ray/mmmt_prompt.py create mode 100644 applications/Chat/examples/ray/requirements.txt create mode 100755 applications/Chat/examples/ray/test_ci.sh diff --git a/.github/workflows/run_chatgpt_examples.yml b/.github/workflows/run_chatgpt_examples.yml index 9d9d3a007..129bf7ed3 100644 --- a/.github/workflows/run_chatgpt_examples.yml +++ b/.github/workflows/run_chatgpt_examples.yml @@ -20,7 +20,7 @@ jobs: runs-on: [self-hosted, gpu] container: image: hpcaitech/pytorch-cuda:1.12.0-11.3.0 - options: --gpus all --rm -v /data/scratch/github_actions/chat:/data/scratch/github_actions/chat + options: --gpus all --rm -v /data/scratch/github_actions/chat:/data/scratch/github_actions/chat --shm-size=10.24gb timeout-minutes: 30 defaults: run: diff --git a/applications/Chat/benchmarks/ray/1mmt_dummy.py b/applications/Chat/benchmarks/ray/1mmt_dummy.py new file mode 100644 index 000000000..9e8f36cef --- /dev/null +++ b/applications/Chat/benchmarks/ray/1mmt_dummy.py @@ -0,0 +1,178 @@ +import argparse +import os +import socket +from functools import partial + +import ray +import torch +from coati.quant import llama_load_quant, low_resource_init +from 
coati.ray.detached_trainer_ppo import DetachedPPOTrainer +from coati.ray.experience_maker_holder import ExperienceMakerHolder +from coati.ray.utils import ( + get_actor_from_args, + get_critic_from_args, + get_receivers_per_sender, + get_reward_model_from_args, + get_strategy_from_args, +) +from torch.utils.data import DataLoader +from transformers import AutoConfig, AutoTokenizer +from transformers.modeling_utils import no_init_weights + + +def get_free_port(): + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + s.bind(('', 0)) + return s.getsockname()[1] + + +def get_local_ip(): + with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as s: + s.connect(('8.8.8.8', 80)) + return s.getsockname()[0] + + +def main(args): + master_addr = str(get_local_ip()) + # trainer_env_info + trainer_port = str(get_free_port()) + env_info_trainers = [{ + 'local_rank': '0', + 'rank': str(rank), + 'world_size': str(args.num_trainers), + 'master_port': trainer_port, + 'master_addr': master_addr + } for rank in range(args.num_trainers)] + + # maker_env_info + maker_port = str(get_free_port()) + env_info_maker = { + 'local_rank': '0', + 'rank': '0', + 'world_size': '1', + 'master_port': maker_port, + 'master_addr': master_addr + } + + # configure tokenizer + tokenizer = AutoTokenizer.from_pretrained(args.pretrain) + tokenizer.pad_token = tokenizer.eos_token + + def model_fn(): + actor_cfg = AutoConfig.from_pretrained(args.pretrain) + critic_cfg = AutoConfig.from_pretrained(args.critic_pretrain) + actor = get_actor_from_args(args.model, config=actor_cfg).requires_grad_(False).half().cuda() + critic = get_critic_from_args(args.critic_model, config=critic_cfg).requires_grad_(False).half().cuda() + reward_model = get_reward_model_from_args(args.critic_model, + config=critic_cfg).requires_grad_(False).half().cuda() + if args.initial_model_quant_ckpt is not None and args.model == 'llama': + # quantize initial model + with low_resource_init(), no_init_weights(): + initial_model = get_actor_from_args(args.model, config=actor_cfg) + initial_model.model = llama_load_quant(initial_model.model, args.initial_model_quant_ckpt, args.quant_bits, + args.quant_group_size).cuda().requires_grad_(False) + else: + initial_model = get_actor_from_args(args.model, config=actor_cfg).requires_grad_(False).half().cuda() + return actor, critic, reward_model, initial_model + + # configure Experience Maker + experience_holder_ref = ExperienceMakerHolder.options(name="maker0", num_gpus=1, max_concurrency=2).remote( + detached_trainer_name_list=[f'trainer{i}' for i in range(args.num_trainers)], + strategy_fn=partial(get_strategy_from_args, args.maker_strategy), + model_fn=model_fn, + env_info=env_info_maker, + kl_coef=0.1, + debug=args.debug, + # sync_models_from_trainers=True, + # generation kwargs: + max_length=512, + do_sample=True, + temperature=1.0, + top_k=50, + pad_token_id=tokenizer.pad_token_id, + eos_token_id=tokenizer.eos_token_id, + eval_performance=True, + use_cache=True, + ) + + def trainer_model_fn(): + actor = get_actor_from_args(args.model, config=AutoConfig.from_pretrained(args.pretrain)).half().cuda() + critic = get_critic_from_args(args.critic_model, + config=AutoConfig.from_pretrained(args.critic_pretrain)).half().cuda() + return actor, critic + + # configure Trainer + trainer_refs = [ + DetachedPPOTrainer.options(name=f"trainer{i}", num_gpus=1, max_concurrency=2).remote( + experience_maker_holder_name_list=[ + f'maker{x}' for x in get_receivers_per_sender(i, args.num_trainers, 1, allow_idle_sender=True) + ], 
+ strategy_fn=partial(get_strategy_from_args, args.trainer_strategy), + model_fn=trainer_model_fn, + env_info=env_info_trainer, + train_batch_size=args.train_batch_size, + buffer_limit=16, + eval_performance=True, + debug=args.debug, + ) for i, env_info_trainer in enumerate(env_info_trainers) + ] + + dataset_size = args.experience_batch_size * 4 + + def data_gen_fn(): + input_ids = torch.randint(tokenizer.vocab_size, (256,), device=torch.cuda.current_device()) + attn_mask = torch.ones_like(input_ids) + return {'input_ids': input_ids, 'attention_mask': attn_mask} + + def build_dataloader(size): + dataset = [data_gen_fn() for _ in range(size)] + dataloader = DataLoader(dataset, batch_size=args.experience_batch_size) + return dataloader + + # uncomment this function if sync_models_from_trainers is True + # ray.get([ + # trainer_ref.sync_models_to_remote_makers.remote() + # for trainer_ref in trainer_refs + # ]) + + wait_tasks = [] + + wait_tasks.append( + experience_holder_ref.workingloop.remote(partial(build_dataloader, dataset_size), + num_steps=args.experience_steps)) + + total_steps = args.experience_batch_size * args.experience_steps // (args.num_trainers * args.train_batch_size) + for trainer_ref in trainer_refs: + wait_tasks.append(trainer_ref.fit.remote(total_steps, args.update_steps, args.train_epochs)) + + ray.get(wait_tasks) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--num_trainers', type=int, default=1) + parser.add_argument('--trainer_strategy', + choices=[ + 'naive', 'ddp', 'colossalai_gemini', 'colossalai_zero2', 'colossalai_gemini_cpu', + 'colossalai_zero2_cpu' + ], + default='naive') + parser.add_argument('--maker_strategy', choices=['naive'], default='naive') + parser.add_argument('--model', default='gpt2', choices=['gpt2', 'bloom', 'opt', 'llama']) + parser.add_argument('--critic_model', default='gpt2', choices=['gpt2', 'bloom', 'opt', 'llama']) + parser.add_argument('--pretrain', type=str, default=None) + parser.add_argument('--critic_pretrain', type=str, default=None) + parser.add_argument('--experience_steps', type=int, default=4) + parser.add_argument('--experience_batch_size', type=int, default=8) + parser.add_argument('--train_epochs', type=int, default=1) + parser.add_argument('--update_steps', type=int, default=2) + parser.add_argument('--train_batch_size', type=int, default=8) + parser.add_argument('--lora_rank', type=int, default=0, help="low-rank adaptation matrices rank") + + parser.add_argument('--initial_model_quant_ckpt', type=str, default=None) + parser.add_argument('--quant_bits', type=int, default=4) + parser.add_argument('--quant_group_size', type=int, default=128) + parser.add_argument('--debug', action='store_true') + args = parser.parse_args() + ray.init(namespace=os.environ["RAY_NAMESPACE"], runtime_env={"env_vars": dict(os.environ)}) + main(args) diff --git a/applications/Chat/benchmarks/ray/mmmt_dummy.py b/applications/Chat/benchmarks/ray/mmmt_dummy.py new file mode 100644 index 000000000..46a006289 --- /dev/null +++ b/applications/Chat/benchmarks/ray/mmmt_dummy.py @@ -0,0 +1,189 @@ +import argparse +import os +import socket +from functools import partial + +import ray +import torch +from coati.quant import llama_load_quant, low_resource_init +from coati.ray.detached_trainer_ppo import DetachedPPOTrainer +from coati.ray.experience_maker_holder import ExperienceMakerHolder +from coati.ray.utils import ( + get_actor_from_args, + get_critic_from_args, + get_receivers_per_sender, + 
get_reward_model_from_args, + get_strategy_from_args, +) +from torch.utils.data import DataLoader +from transformers import AutoConfig, AutoTokenizer +from transformers.modeling_utils import no_init_weights + + +def get_free_port(): + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + s.bind(('', 0)) + return s.getsockname()[1] + + +def get_local_ip(): + with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as s: + s.connect(('8.8.8.8', 80)) + return s.getsockname()[0] + + +def main(args): + master_addr = str(get_local_ip()) + # trainer_env_info + trainer_port = str(get_free_port()) + env_info_trainers = [{ + 'local_rank': '0', + 'rank': str(rank), + 'world_size': str(args.num_trainers), + 'master_port': trainer_port, + 'master_addr': master_addr + } for rank in range(args.num_trainers)] + + # maker_env_info + maker_port = str(get_free_port()) + env_info_makers = [{ + 'local_rank': '0', + 'rank': str(rank), + 'world_size': str(args.num_makers), + 'master_port': maker_port, + 'master_addr': master_addr + } for rank in range(args.num_makers)] + + # configure tokenizer + tokenizer = AutoTokenizer.from_pretrained(args.pretrain) + tokenizer.pad_token = tokenizer.eos_token + + def model_fn(): + actor_cfg = AutoConfig.from_pretrained(args.pretrain) + critic_cfg = AutoConfig.from_pretrained(args.critic_pretrain) + actor = get_actor_from_args(args.model, config=actor_cfg).requires_grad_(False).half().cuda() + critic = get_critic_from_args(args.critic_model, config=critic_cfg).requires_grad_(False).half().cuda() + reward_model = get_reward_model_from_args(args.critic_model, + config=critic_cfg).requires_grad_(False).half().cuda() + if args.initial_model_quant_ckpt is not None and args.model == 'llama': + # quantize initial model + with low_resource_init(), no_init_weights(): + initial_model = get_actor_from_args(args.model, config=actor_cfg) + initial_model.model = llama_load_quant(initial_model.model, args.initial_model_quant_ckpt, args.quant_bits, + args.quant_group_size).cuda().requires_grad_(False) + else: + initial_model = get_actor_from_args(args.model, config=actor_cfg).requires_grad_(False).half().cuda() + return actor, critic, reward_model, initial_model + + # configure Experience Maker + experience_holder_refs = [ + ExperienceMakerHolder.options(name=f"maker{i}", num_gpus=1, max_concurrency=2).remote( + detached_trainer_name_list=[ + f'trainer{x}' + for x in get_receivers_per_sender(i, args.num_makers, args.num_trainers, allow_idle_sender=False) + ], + strategy_fn=partial(get_strategy_from_args, args.maker_strategy), + model_fn=model_fn, + env_info=env_info_maker, + kl_coef=0.1, + debug=args.debug, + # sync_models_from_trainers=True, + # generation kwargs: + max_length=512, + do_sample=True, + temperature=1.0, + top_k=50, + pad_token_id=tokenizer.pad_token_id, + eos_token_id=tokenizer.eos_token_id, + eval_performance=True, + use_cache=True, + ) + for i, env_info_maker in enumerate(env_info_makers) + ] + + def trainer_model_fn(): + actor = get_actor_from_args(args.model, config=AutoConfig.from_pretrained(args.pretrain)).half().cuda() + critic = get_critic_from_args(args.critic_model, + config=AutoConfig.from_pretrained(args.critic_pretrain)).half().cuda() + return actor, critic + + # configure Trainer + trainer_refs = [ + DetachedPPOTrainer.options(name=f"trainer{i}", num_gpus=1, max_concurrency=2).remote( + experience_maker_holder_name_list=[ + f"maker{x}" + for x in get_receivers_per_sender(i, args.num_trainers, args.num_makers, allow_idle_sender=True) + ], + 
strategy_fn=partial(get_strategy_from_args, args.trainer_strategy), + model_fn=trainer_model_fn, + env_info=env_info_trainer, + train_batch_size=args.train_batch_size, + buffer_limit=16, + eval_performance=True, + debug=args.debug, + ) + for i, env_info_trainer in enumerate(env_info_trainers) + ] + + dataset_size = args.experience_batch_size * 4 + + def data_gen_fn(): + input_ids = torch.randint(tokenizer.vocab_size, (256,), device=torch.cuda.current_device()) + attn_mask = torch.ones_like(input_ids) + return {'input_ids': input_ids, 'attention_mask': attn_mask} + + def build_dataloader(size): + dataset = [data_gen_fn() for _ in range(size)] + dataloader = DataLoader(dataset, batch_size=args.experience_batch_size) + return dataloader + + # uncomment this function if sync_models_from_trainers is True + # ray.get([ + # trainer_ref.sync_models_to_remote_makers.remote() + # for trainer_ref in trainer_refs + # ]) + + wait_tasks = [] + + for experience_holder_ref in experience_holder_refs: + wait_tasks.append( + experience_holder_ref.workingloop.remote(partial(build_dataloader, dataset_size), + num_steps=args.experience_steps)) + + total_steps = args.experience_batch_size * args.experience_steps * \ + args.num_makers // (args.num_trainers * args.train_batch_size) + for trainer_ref in trainer_refs: + wait_tasks.append(trainer_ref.fit.remote(total_steps, args.update_steps, args.train_epochs)) + + ray.get(wait_tasks) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--num_makers', type=int, default=1) + parser.add_argument('--num_trainers', type=int, default=1) + parser.add_argument('--trainer_strategy', + choices=[ + 'naive', 'ddp', 'colossalai_gemini', 'colossalai_zero2', 'colossalai_gemini_cpu', + 'colossalai_zero2_cpu' + ], + default='naive') + parser.add_argument('--maker_strategy', choices=['naive'], default='naive') + parser.add_argument('--model', default='gpt2', choices=['gpt2', 'bloom', 'opt', 'llama']) + parser.add_argument('--critic_model', default='gpt2', choices=['gpt2', 'bloom', 'opt', 'llama']) + parser.add_argument('--pretrain', type=str, default=None) + parser.add_argument('--critic_pretrain', type=str, default=None) + parser.add_argument('--experience_steps', type=int, default=4) + parser.add_argument('--experience_batch_size', type=int, default=8) + parser.add_argument('--train_epochs', type=int, default=1) + parser.add_argument('--update_steps', type=int, default=2) + parser.add_argument('--train_batch_size', type=int, default=8) + parser.add_argument('--lora_rank', type=int, default=0, help="low-rank adaptation matrices rank") + + parser.add_argument('--initial_model_quant_ckpt', type=str, default=None) + parser.add_argument('--quant_bits', type=int, default=4) + parser.add_argument('--quant_group_size', type=int, default=128) + parser.add_argument('--debug', action='store_true') + args = parser.parse_args() + ray.init(namespace=os.environ["RAY_NAMESPACE"], runtime_env={"env_vars": dict(os.environ)}) + main(args) diff --git a/applications/Chat/coati/models/lora.py b/applications/Chat/coati/models/lora.py index 0533a60dc..2a9059e69 100644 --- a/applications/Chat/coati/models/lora.py +++ b/applications/Chat/coati/models/lora.py @@ -61,7 +61,13 @@ class LoraLinear(lora.LoRALayer, nn.Module): if self.merge_weights and self.merged: # Make sure that the weights are not merged if self.r > 0: - self.weight.data -= T(self.lora_B @ self.lora_A) * self.scaling + if not hasattr(self, "lora_A") or not hasattr(self, "lora_B"): + # FIXME(csric): 
temporary fix + self.lora_A = nn.Parameter(self.weight.new_empty((self.r, self.in_features))) + self.lora_B = nn.Parameter(self.weight.new_empty((self.out_features, self.r))) + self.reset_parameters() + else: + self.weight.data -= T(self.lora_B @ self.lora_A) * self.scaling self.merged = False def eval(self): diff --git a/applications/Chat/coati/quant/__init__.py b/applications/Chat/coati/quant/__init__.py new file mode 100644 index 000000000..a65a78d07 --- /dev/null +++ b/applications/Chat/coati/quant/__init__.py @@ -0,0 +1,7 @@ +from .llama_gptq import load_quant as llama_load_quant +from .utils import low_resource_init + +__all__ = [ + 'llama_load_quant', + 'low_resource_init', +] diff --git a/applications/Chat/coati/quant/llama_gptq/__init__.py b/applications/Chat/coati/quant/llama_gptq/__init__.py new file mode 100644 index 000000000..51c8d6316 --- /dev/null +++ b/applications/Chat/coati/quant/llama_gptq/__init__.py @@ -0,0 +1,5 @@ +from .loader import load_quant + +__all__ = [ + 'load_quant', +] diff --git a/applications/Chat/coati/quant/llama_gptq/loader.py b/applications/Chat/coati/quant/llama_gptq/loader.py new file mode 100644 index 000000000..5353dc8a2 --- /dev/null +++ b/applications/Chat/coati/quant/llama_gptq/loader.py @@ -0,0 +1,26 @@ +import torch +import torch.nn as nn + +from .model_utils import find_layers +from .quant import make_quant + + +def load_quant(model: nn.Module, checkpoint: str, wbits: int, groupsize: int): + model = model.eval() + layers = find_layers(model) + + # ignore lm head + layers = find_layers(model) + for name in ['lm_head']: + if name in layers: + del layers[name] + + make_quant(model, layers, wbits, groupsize) + + if checkpoint.endswith('.safetensors'): + from safetensors.torch import load_file as safe_load + model.load_state_dict(safe_load(checkpoint)) + else: + model.load_state_dict(torch.load(checkpoint)) + + return model diff --git a/applications/Chat/coati/quant/llama_gptq/model_utils.py b/applications/Chat/coati/quant/llama_gptq/model_utils.py new file mode 100644 index 000000000..62db171ab --- /dev/null +++ b/applications/Chat/coati/quant/llama_gptq/model_utils.py @@ -0,0 +1,13 @@ +# copied from https://github.com/qwopqwop200/GPTQ-for-LLaMa/blob/past/modelutils.py + +import torch +import torch.nn as nn + + +def find_layers(module, layers=[nn.Conv2d, nn.Linear], name=''): + if type(module) in layers: + return {name: module} + res = {} + for name1, child in module.named_children(): + res.update(find_layers(child, layers=layers, name=name + '.' 
+ name1 if name != '' else name1)) + return res diff --git a/applications/Chat/coati/quant/llama_gptq/quant.py b/applications/Chat/coati/quant/llama_gptq/quant.py new file mode 100644 index 000000000..f7d5b7ce4 --- /dev/null +++ b/applications/Chat/coati/quant/llama_gptq/quant.py @@ -0,0 +1,283 @@ +# copied from https://github.com/qwopqwop200/GPTQ-for-LLaMa/blob/past/quant.py + +import math + +import numpy as np +import torch +import torch.nn as nn + + +def quantize(x, scale, zero, maxq): + q = torch.clamp(torch.round(x / scale) + zero, 0, maxq) + return scale * (q - zero) + + +class Quantizer(nn.Module): + + def __init__(self, shape=1): + super(Quantizer, self).__init__() + self.register_buffer('maxq', torch.tensor(0)) + self.register_buffer('scale', torch.zeros(shape)) + self.register_buffer('zero', torch.zeros(shape)) + + def configure(self, bits, perchannel=False, sym=True, mse=False, norm=2.4, grid=100, maxshrink=.8): + self.maxq = torch.tensor(2**bits - 1) + self.perchannel = perchannel + self.sym = sym + self.mse = mse + self.norm = norm + self.grid = grid + self.maxshrink = maxshrink + + def find_params(self, x, weight=False): + dev = x.device + self.maxq = self.maxq.to(dev) + + shape = x.shape + if self.perchannel: + if weight: + x = x.flatten(1) + else: + if len(shape) == 4: + x = x.permute([1, 0, 2, 3]) + x = x.flatten(1) + if len(shape) == 3: + x = x.reshape((-1, shape[-1])).t() + if len(shape) == 2: + x = x.t() + else: + x = x.flatten().unsqueeze(0) + + tmp = torch.zeros(x.shape[0], device=dev) + xmin = torch.minimum(x.min(1)[0], tmp) + xmax = torch.maximum(x.max(1)[0], tmp) + + if self.sym: + xmax = torch.maximum(torch.abs(xmin), xmax) + tmp = xmin < 0 + if torch.any(tmp): + xmin[tmp] = -xmax[tmp] + tmp = (xmin == 0) & (xmax == 0) + xmin[tmp] = -1 + xmax[tmp] = +1 + + self.scale = (xmax - xmin) / self.maxq + if self.sym: + self.zero = torch.full_like(self.scale, (self.maxq + 1) / 2) + else: + self.zero = torch.round(-xmin / self.scale) + + if self.mse: + best = torch.full([x.shape[0]], float('inf'), device=dev) + for i in range(int(self.maxshrink * self.grid)): + p = 1 - i / self.grid + xmin1 = p * xmin + xmax1 = p * xmax + scale1 = (xmax1 - xmin1) / self.maxq + zero1 = torch.round(-xmin1 / scale1) if not self.sym else self.zero + q = quantize(x, scale1.unsqueeze(1), zero1.unsqueeze(1), self.maxq) + q -= x + q.abs_() + q.pow_(self.norm) + err = torch.sum(q, 1) + tmp = err < best + if torch.any(tmp): + best[tmp] = err[tmp] + self.scale[tmp] = scale1[tmp] + self.zero[tmp] = zero1[tmp] + if not self.perchannel: + if weight: + tmp = shape[0] + else: + tmp = shape[1] if len(shape) != 3 else shape[2] + self.scale = self.scale.repeat(tmp) + self.zero = self.zero.repeat(tmp) + + if weight: + shape = [-1] + [1] * (len(shape) - 1) + self.scale = self.scale.reshape(shape) + self.zero = self.zero.reshape(shape) + return + if len(shape) == 4: + self.scale = self.scale.reshape((1, -1, 1, 1)) + self.zero = self.zero.reshape((1, -1, 1, 1)) + if len(shape) == 3: + self.scale = self.scale.reshape((1, 1, -1)) + self.zero = self.zero.reshape((1, 1, -1)) + if len(shape) == 2: + self.scale = self.scale.unsqueeze(0) + self.zero = self.zero.unsqueeze(0) + + def quantize(self, x): + if self.ready(): + return quantize(x, self.scale, self.zero, self.maxq) + return x + + def enabled(self): + return self.maxq > 0 + + def ready(self): + return torch.all(self.scale != 0) + + +try: + import quant_cuda +except: + print('CUDA extension not installed.') + +# Assumes layer is perfectly divisible into 256 * 256 
blocks + + +class QuantLinear(nn.Module): + + def __init__(self, bits, groupsize, infeatures, outfeatures): + super().__init__() + if bits not in [2, 3, 4, 8]: + raise NotImplementedError("Only 2,3,4,8 bits are supported.") + self.infeatures = infeatures + self.outfeatures = outfeatures + self.bits = bits + if groupsize != -1 and groupsize < 32 and groupsize != int(math.pow(2, int(math.log2(groupsize)))): + raise NotImplementedError("groupsize supports powers of 2 greater than 32. (e.g. : 32,64,128,etc)") + groupsize = groupsize if groupsize != -1 else infeatures + self.groupsize = groupsize + self.register_buffer( + 'qzeros', torch.zeros((math.ceil(infeatures / groupsize), outfeatures // 256 * (bits * 8)), + dtype=torch.int)) + self.register_buffer('scales', torch.zeros((math.ceil(infeatures / groupsize), outfeatures))) + self.register_buffer('bias', torch.zeros(outfeatures)) + self.register_buffer('qweight', torch.zeros((infeatures // 256 * (bits * 8), outfeatures), dtype=torch.int)) + self._initialized_quant_state = False + + def pack(self, linear, scales, zeros): + scales = scales.t().contiguous() + zeros = zeros.t().contiguous() + scale_zeros = zeros * scales + self.scales = scales.clone() + if linear.bias is not None: + self.bias = linear.bias.clone() + + intweight = [] + for idx in range(self.infeatures): + g_idx = idx // self.groupsize + intweight.append( + torch.round((linear.weight.data[:, idx] + scale_zeros[g_idx]) / self.scales[g_idx]).to(torch.int)[:, + None]) + intweight = torch.cat(intweight, dim=1) + intweight = intweight.t().contiguous() + intweight = intweight.numpy().astype(np.uint32) + qweight = np.zeros((intweight.shape[0] // 256 * (self.bits * 8), intweight.shape[1]), dtype=np.uint32) + i = 0 + row = 0 + while row < qweight.shape[0]: + if self.bits in [2, 4, 8]: + for j in range(i, i + (32 // self.bits)): + qweight[row] |= intweight[j] << (self.bits * (j - i)) + i += 32 // self.bits + row += 1 + elif self.bits == 3: + for j in range(i, i + 10): + qweight[row] |= intweight[j] << (3 * (j - i)) + i += 10 + qweight[row] |= intweight[i] << 30 + row += 1 + qweight[row] |= (intweight[i] >> 2) & 1 + i += 1 + for j in range(i, i + 10): + qweight[row] |= intweight[j] << (3 * (j - i) + 1) + i += 10 + qweight[row] |= intweight[i] << 31 + row += 1 + qweight[row] |= (intweight[i] >> 1) & 0x3 + i += 1 + for j in range(i, i + 10): + qweight[row] |= intweight[j] << (3 * (j - i) + 2) + i += 10 + row += 1 + else: + raise NotImplementedError("Only 2,3,4,8 bits are supported.") + + qweight = qweight.astype(np.int32) + self.qweight = torch.from_numpy(qweight) + + zeros -= 1 + zeros = zeros.numpy().astype(np.uint32) + qzeros = np.zeros((zeros.shape[0], zeros.shape[1] // 256 * (self.bits * 8)), dtype=np.uint32) + i = 0 + col = 0 + while col < qzeros.shape[1]: + if self.bits in [2, 4, 8]: + for j in range(i, i + (32 // self.bits)): + qzeros[:, col] |= zeros[:, j] << (self.bits * (j - i)) + i += 32 // self.bits + col += 1 + elif self.bits == 3: + for j in range(i, i + 10): + qzeros[:, col] |= zeros[:, j] << (3 * (j - i)) + i += 10 + qzeros[:, col] |= zeros[:, i] << 30 + col += 1 + qzeros[:, col] |= (zeros[:, i] >> 2) & 1 + i += 1 + for j in range(i, i + 10): + qzeros[:, col] |= zeros[:, j] << (3 * (j - i) + 1) + i += 10 + qzeros[:, col] |= zeros[:, i] << 31 + col += 1 + qzeros[:, col] |= (zeros[:, i] >> 1) & 0x3 + i += 1 + for j in range(i, i + 10): + qzeros[:, col] |= zeros[:, j] << (3 * (j - i) + 2) + i += 10 + col += 1 + else: + raise NotImplementedError("Only 2,3,4,8 bits are supported.") 
+ + qzeros = qzeros.astype(np.int32) + self.qzeros = torch.from_numpy(qzeros) + + def forward(self, x): + intermediate_dtype = torch.float32 + + if not self._initialized_quant_state: + # Do we even have a bias? Check for at least one non-zero element. + if self.bias is not None and bool(torch.any(self.bias != 0)): + # Then make sure it's the right type. + self.bias.data = self.bias.data.to(intermediate_dtype) + else: + self.bias = None + + outshape = list(x.shape) + outshape[-1] = self.outfeatures + x = x.reshape(-1, x.shape[-1]) + if self.bias is None: + y = torch.zeros(x.shape[0], outshape[-1], dtype=intermediate_dtype, device=x.device) + else: + y = self.bias.clone().repeat(x.shape[0], 1) + + output_dtype = x.dtype + x = x.to(intermediate_dtype) + if self.bits == 2: + quant_cuda.vecquant2matmul(x, self.qweight, y, self.scales, self.qzeros, self.groupsize) + elif self.bits == 3: + quant_cuda.vecquant3matmul(x, self.qweight, y, self.scales, self.qzeros, self.groupsize) + elif self.bits == 4: + quant_cuda.vecquant4matmul(x, self.qweight, y, self.scales, self.qzeros, self.groupsize) + elif self.bits == 8: + quant_cuda.vecquant8matmul(x, self.qweight, y, self.scales, self.qzeros, self.groupsize) + else: + raise NotImplementedError("Only 2,3,4,8 bits are supported.") + y = y.to(output_dtype) + return y.reshape(outshape) + + +def make_quant(module, names, bits, groupsize, name=''): + if isinstance(module, QuantLinear): + return + for attr in dir(module): + tmp = getattr(module, attr) + name1 = name + '.' + attr if name != '' else attr + if name1 in names: + setattr(module, attr, QuantLinear(bits, groupsize, tmp.in_features, tmp.out_features)) + for name1, child in module.named_children(): + make_quant(child, names, bits, groupsize, name + '.' + name1 if name != '' else name1) diff --git a/applications/Chat/coati/quant/utils.py b/applications/Chat/coati/quant/utils.py new file mode 100644 index 000000000..01b8cff0a --- /dev/null +++ b/applications/Chat/coati/quant/utils.py @@ -0,0 +1,28 @@ +from contextlib import contextmanager + +import torch + + +def _noop(*args, **kwargs): + pass + + +@contextmanager +def low_resource_init(): + """This context manager disables weight initialization and sets the default float dtype to half. + """ + old_kaiming_uniform_ = torch.nn.init.kaiming_uniform_ + old_uniform_ = torch.nn.init.uniform_ + old_normal_ = torch.nn.init.normal_ + dtype = torch.get_default_dtype() + try: + torch.nn.init.kaiming_uniform_ = _noop + torch.nn.init.uniform_ = _noop + torch.nn.init.normal_ = _noop + torch.set_default_dtype(torch.half) + yield + finally: + torch.nn.init.kaiming_uniform_ = old_kaiming_uniform_ + torch.nn.init.uniform_ = old_uniform_ + torch.nn.init.normal_ = old_normal_ + torch.set_default_dtype(dtype) diff --git a/applications/Chat/coati/ray/README.md b/applications/Chat/coati/ray/README.md new file mode 100644 index 000000000..228155a68 --- /dev/null +++ b/applications/Chat/coati/ray/README.md @@ -0,0 +1,160 @@ +# Distributed PPO Training on Stage 3 + +## Detach Experience Makers and Trainers + +We can completely separate the trainers and makers. + +

+ +

+
+- The experience maker performs inference, produces experience, and remotely delivers it to the trainer (1).
+- The trainer consumes experience to train models, and periodically transmits new model parameters to the maker (2.1, 2.2).
+- An experience buffer is used to overlap transmission and computation.
+
+In this way, each node works continuously with no model idle time, and different optimization strategies can be applied to inference and training to trade off speed against memory. The design also helps with scalability.
+
+`DetachedPPOTrainer` and `ExperienceMakerHolder` are Ray Actors (not to be confused with the RLHF actor model), representing the Trainer and the Experience Maker in the graph above, respectively.
+
+[More about Ray Core](https://docs.ray.io/en/latest/ray-core/walkthrough.html)
+
+## Usage
+
+See examples at `ColossalAI/applications/Chat/examples/ray`
+
+### Setup Makers
+
+- define makers' environment variables :
+
+  ```python
+  env_info_makers = [{
+      'local_rank': '0',
+      'rank': str(rank),
+      'world_size': str(num_makers),
+      'master_port': maker_port,
+      'master_addr': master_addr
+  } for rank in range(num_makers)]
+  ```
+- define maker models :
+  ```python
+  def model_fn():
+      actor = get_actor_from_args(...)
+      critic = get_critic_from_args(...)
+      reward_model = get_reward_model_from_args(...)
+      initial_model = get_actor_from_args(...)
+      return actor, critic, reward_model, initial_model
+  ```
+- set experience_holder_refs :
+
+  ```python
+  experience_holder_refs = [
+      ExperienceMakerHolder.options(
+          name=f"maker{i}",
+          num_gpus=1,
+          max_concurrency=2
+      ).remote(
+          detached_trainer_name_list=[f"trainer{x}" for x in target_trainers(...)],
+          model_fn=model_fn,
+          ...)
+      for i, env_info_maker in enumerate(env_info_makers)
+  ]
+  ```
+  The names in `detached_trainer_name_list` are the target trainers that this maker sends experience to.
+  Trainer names are assigned in the same way, via `.options(name=...)`, as shown in the next section.
+
+### Setup Trainers
+
+- define trainers' environment variables :
+  ```python
+  env_info_trainers = [{
+      'local_rank': '0',
+      'rank': str(rank),
+      'world_size': str(num_trainers),
+      'master_port': trainer_port,
+      'master_addr': master_addr
+  } for rank in range(num_trainers)]
+  ```
+- define trainer models :
+
+  ```python
+  def trainer_model_fn():
+      actor = get_actor_from_args(...)
+      critic = get_critic_from_args(...)
+      return actor, critic
+  ```
+- set trainer_refs :
+  ```python
+  trainer_refs = [
+      DetachedPPOTrainer.options(
+          name=f"trainer{i}",
+          num_gpus=1,
+          max_concurrency=2
+      ).remote(
+          experience_maker_holder_name_list=[f"maker{x}" for x in target_makers(...)],
+          model_fn=trainer_model_fn,
+          ...)
+      for i, env_info_trainer in enumerate(env_info_trainers)
+  ]
+  ```
+  The names in `experience_maker_holder_name_list` are the target makers that the trainer sends updated models to.
+  By setting `detached_trainer_name_list` and `experience_maker_holder_name_list`, we can customize the transmission graph; a minimal mapping sketch follows below.
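+  As an illustration (a sketch under assumed names, not the helper shipped in this PR; the bundled example script uses a `get_receivers_per_sender` utility for the same purpose), a simple round-robin mapping from senders to receivers could look like this:
+
+  ```python
+  def target_receivers(sender_idx: int, num_senders: int, num_receivers: int) -> list:
+      """Hypothetical sketch of a sender -> receivers mapping."""
+      if num_receivers >= num_senders:
+          # each sender covers a contiguous slice of receivers
+          per_sender = num_receivers // num_senders
+          return list(range(sender_idx * per_sender, (sender_idx + 1) * per_sender))
+      # more senders than receivers: several senders share the same receiver
+      return [sender_idx % num_receivers]
+
+  # e.g. for maker i sending experience to trainers:
+  # detached_trainer_name_list=[f"trainer{x}" for x in target_receivers(i, num_makers, num_trainers)]
+  ```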
+
+### Launch Jobs
+- define data_loader :
+  ```python
+  def data_loader_fn():
+      return torch.utils.data.DataLoader(dataset=dataset)
+  ```
+- launch makers :
+  ```python
+  wait_tasks = []
+  for experience_holder_ref in experience_holder_refs:
+      wait_tasks.append(
+          experience_holder_ref.workingloop.remote(data_loader_fn,
+                                                   num_steps=experience_steps))
+  ```
+
+- launch trainers :
+  ```python
+  for trainer_ref in trainer_refs:
+      wait_tasks.append(trainer_ref.fit.remote(total_steps, update_steps, train_epochs))
+  ```
+
+- wait for completion :
+  ```python
+  ray.get(wait_tasks)
+  ```
+
+## Flexible Structure
+
+We can deploy different strategies to makers and trainers. Here are a few possible layouts.
+
+### 2 Makers 1 Trainer
+

+ +
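+With two makers and one trainer, the wiring from the Usage section reduces to the following (names follow the `maker{i}` / `trainer{i}` convention above; this is an illustrative sketch, not generated configuration):
+
+```python
+# both makers deliver experience to the single trainer
+maker_to_trainers = {"maker0": ["trainer0"], "maker1": ["trainer0"]}
+# the trainer sends updated model weights back to both makers
+trainer_to_makers = {"trainer0": ["maker0", "maker1"]}
+```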

+ +### 2 Makers 2 Trainer +

+ +

+ +### Maker Inference Quantization +

+ +
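+For reference, the example script added in this PR builds a GPTQ-quantized LLaMA initial model inside the maker's `model_fn` roughly as follows (a sketch that mirrors that script; `actor_cfg` and the `args.*` values come from its argument parser):
+
+```python
+from coati.quant import llama_load_quant, low_resource_init
+from transformers.modeling_utils import no_init_weights
+
+# build the model skeleton in half precision without running weight initialization
+with low_resource_init(), no_init_weights():
+    initial_model = get_actor_from_args(args.model, config=actor_cfg)
+# swap in the quantized transformer weights (e.g. 4-bit, group size 128)
+initial_model.model = llama_load_quant(initial_model.model, args.initial_model_quant_ckpt,
+                                       args.quant_bits, args.quant_group_size).cuda().requires_grad_(False)
+```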

+ +### Tensor Parallel + +

+ +

+ +## TODO + +- [ ] Support LoRA +- [ ] Support TP & PP diff --git a/applications/Chat/coati/ray/__init__.py b/applications/Chat/coati/ray/__init__.py index 5802c05bc..e69de29bb 100644 --- a/applications/Chat/coati/ray/__init__.py +++ b/applications/Chat/coati/ray/__init__.py @@ -1,2 +0,0 @@ -from .src.detached_replay_buffer import DetachedReplayBuffer -from .src.detached_trainer_ppo import DetachedPPOTrainer diff --git a/applications/Chat/coati/ray/callbacks/__init__.py b/applications/Chat/coati/ray/callbacks/__init__.py new file mode 100644 index 000000000..5f5e488f3 --- /dev/null +++ b/applications/Chat/coati/ray/callbacks/__init__.py @@ -0,0 +1,9 @@ +from .base import MakerCallback, TrainerCallback +from .performance_evaluator import ExperienceMakerPerformanceEvaluator, TrainerPerformanceEvaluator + +__all__ = [ + "TrainerCallback", + "MakerCallback", + "ExperienceMakerPerformanceEvaluator", + "TrainerPerformanceEvaluator", +] diff --git a/applications/Chat/coati/ray/callbacks/base.py b/applications/Chat/coati/ray/callbacks/base.py new file mode 100644 index 000000000..3306150a4 --- /dev/null +++ b/applications/Chat/coati/ray/callbacks/base.py @@ -0,0 +1,66 @@ +from abc import ABC + +from coati.experience_maker import Experience + + +class TrainerCallback(ABC): + """ + Base callback class. It defines the interface for callbacks. + """ + + def on_fit_start(self) -> None: + pass + + def on_fit_end(self) -> None: + pass + + def on_episode_start(self, episode: int) -> None: + pass + + def on_episode_end(self, episode: int) -> None: + pass + + def on_epoch_start(self, epoch: int) -> None: + pass + + def on_epoch_end(self, epoch: int) -> None: + pass + + def on_batch_start(self) -> None: + pass + + def on_batch_end(self, metrics: dict, experience: Experience) -> None: + pass + + def on_update_start(self) -> None: + pass + + def on_update_end(self) -> None: + pass + + +class MakerCallback(ABC): + + def on_loop_start(self) -> None: + pass + + def on_loop_end(self) -> None: + pass + + def on_make_experience_start(self) -> None: + pass + + def on_make_experience_end(self, experience: Experience) -> None: + pass + + def on_send_start(self) -> None: + pass + + def on_send_end(self) -> None: + pass + + def on_batch_start(self) -> None: + pass + + def on_batch_end(self) -> None: + pass diff --git a/applications/Chat/coati/ray/callbacks/performance_evaluator.py b/applications/Chat/coati/ray/callbacks/performance_evaluator.py new file mode 100644 index 000000000..cd3517609 --- /dev/null +++ b/applications/Chat/coati/ray/callbacks/performance_evaluator.py @@ -0,0 +1,212 @@ +from time import time +from typing import Optional + +import torch +import torch.distributed as dist +from coati.experience_maker import Experience + +from .base import MakerCallback, TrainerCallback + + +def get_world_size() -> int: + if dist.is_initialized(): + return dist.get_world_size() + return 1 + + +def print_rank_0(*args, **kwargs) -> None: + if not dist.is_initialized() or dist.get_rank() == 0: + print(*args, **kwargs) + + +@torch.no_grad() +def all_reduce_mean(x: float, world_size: int) -> float: + if world_size == 1: + return x + tensor = torch.tensor([x], device=torch.cuda.current_device()) + dist.all_reduce(tensor) + tensor = tensor / world_size + return tensor.item() + + +class Timer: + + def __init__(self) -> None: + self.start_time: Optional[float] = None + self.duration: float = 0. 
+ + def start(self) -> None: + self.start_time = time() + + def end(self) -> None: + self.duration += time() - self.start_time + + def reset(self) -> None: + self.duration = 0. + + +class ExperienceMakerPerformanceEvaluator(MakerCallback): + + def __init__(self, actor_num_params: int, critic_num_params: int, initial_model_num_params: int, + reward_model_num_params: int) -> None: + super().__init__() + self.world_size = get_world_size() + self.actor_num_params = actor_num_params + self.critic_num_params = critic_num_params + self.initial_model_num_params = initial_model_num_params + self.reward_model_num_params = reward_model_num_params + + self.batch_timer = Timer() + self.send_timer = Timer() + self.make_experience_timer = Timer() + self.total_samples: int = 0 + self.make_experience_flop: int = 0 + + print_rank_0( + f'ExperienceMaker actor: {actor_num_params/1024**3:.2f}B, critic: {critic_num_params/1024**3:.2f}B, initial model: {initial_model_num_params/1024**3:.2f}B, reward model: {reward_model_num_params/1024**3:.2f}B, world size: {self.world_size}' + ) + + def on_make_experience_start(self) -> None: + self.make_experience_timer.start() + + def on_make_experience_end(self, experience: Experience) -> None: + self.make_experience_timer.end() + + batch_size, seq_len = experience.sequences.shape + + self.total_samples += batch_size + + # actor generate + num_actions = experience.action_mask.size(1) + input_len = seq_len - num_actions + total_seq_len = (input_len + seq_len - 1) * num_actions / 2 + self.make_experience_flop += self.actor_num_params * batch_size * total_seq_len * 2 + # actor forward + self.make_experience_flop += self.actor_num_params * batch_size * seq_len * 2 + # critic forward + self.make_experience_flop += self.critic_num_params * batch_size * seq_len * 2 + # initial model forward + self.make_experience_flop += self.initial_model_num_params * batch_size * seq_len * 2 + # reward model forward + self.make_experience_flop += self.reward_model_num_params * batch_size * seq_len * 2 + + def on_send_start(self) -> None: + self.send_timer.start() + + def on_send_end(self) -> None: + self.send_timer.end() + + def on_batch_start(self) -> None: + self.batch_timer.start() + + def on_batch_end(self) -> None: + self.batch_timer.end() + + def on_loop_end(self) -> None: + avg_make_experience_duration = all_reduce_mean(self.make_experience_timer.duration, self.world_size) + avg_overall_duration = all_reduce_mean(self.batch_timer.duration, self.world_size) + avg_send_duration = all_reduce_mean(self.send_timer.duration, self.world_size) + + avg_throughput = self.total_samples * self.world_size / (avg_overall_duration + 1e-12) + avg_make_experience_tflops = self.make_experience_flop / 1e12 / (avg_make_experience_duration + 1e-12) + avg_time_per_sample = (avg_overall_duration + 1e-12) / (self.total_samples * self.world_size) + avg_make_experience_time_per_sample = (avg_make_experience_duration + 1e-12) / \ + (self.total_samples * self.world_size) + avg_send_time_per_sample = (avg_send_duration + 1e-12) / (self.total_samples * self.world_size) + + print_rank_0( + 'Making Experience Performance Summary:\n' + f'Throughput: {avg_throughput:.3f} samples/sec\n' + + f'TFLOPS per GPU: {avg_make_experience_tflops:.3f}\n' + + f'Sample time (overall): {avg_time_per_sample:.3f} s\n' + + f'Sample time (make experience): {avg_make_experience_time_per_sample:.3f} s, {avg_make_experience_time_per_sample/avg_time_per_sample*100:.2f}%\n' + + + f'Sample time (send): {avg_send_time_per_sample:.3f} s, 
{avg_send_time_per_sample/avg_time_per_sample*100:.2f}%\n' + ) + + +class TrainerPerformanceEvaluator(TrainerCallback): + + def __init__(self, + actor_num_params: int, + critic_num_params: int, + enable_grad_checkpoint: bool = False, + ignore_first_episodes: int = 1) -> None: + super().__init__() + self.world_size = get_world_size() + self.actor_num_params = actor_num_params + self.critic_num_params = critic_num_params + self.enable_grad_checkpoint = enable_grad_checkpoint + self.ignore_first_episodes = ignore_first_episodes + self.ignore_this_episode = False + + self.episode_timer = Timer() + self.batch_timer = Timer() + self.update_timer = Timer() + self.total_samples: int = 0 + self.learn_flop: int = 0 + + print_rank_0( + f'Trainer actor: {self.actor_num_params/1024**3:.2f}B, critic: {self.critic_num_params/1024**3:.2f}B, world size: {self.world_size}' + ) + + def on_episode_start(self, episodes: int) -> None: + self.ignore_this_episode = episodes < self.ignore_first_episodes + if self.ignore_this_episode: + return + self.episode_timer.start() + + def on_episode_end(self, episodes: int) -> None: + if self.ignore_this_episode: + return + self.episode_timer.end() + + def on_batch_start(self) -> None: + if self.ignore_this_episode: + return + self.batch_timer.start() + + def on_batch_end(self, metrics: dict, experience: Experience) -> None: + if self.ignore_this_episode: + return + self.batch_timer.end() + + batch_size, seq_len = experience.sequences.shape + + self.total_samples += batch_size + + # actor forward-backward, 3 means forward(1) + backward(2) + self.learn_flop += self.actor_num_params * batch_size * seq_len * 2 * (3 + int(self.enable_grad_checkpoint)) + # critic forward-backward + self.learn_flop += self.critic_num_params * batch_size * seq_len * 2 * (3 + int(self.enable_grad_checkpoint)) + + def on_update_start(self) -> None: + if self.ignore_this_episode: + return + self.update_timer.start() + + def on_update_end(self) -> None: + if self.ignore_this_episode: + return + self.update_timer.end() + + def on_fit_end(self) -> None: + if self.total_samples == 0: + print_rank_0('No samples are collected, skip trainer performance evaluation') + return + avg_train_duration = all_reduce_mean(self.batch_timer.duration, self.world_size) + avg_update_duration = all_reduce_mean(self.update_timer.duration, self.world_size) + avg_episode_duration = all_reduce_mean(self.episode_timer.duration, self.world_size) + + avg_throughput = self.total_samples * self.world_size / (avg_episode_duration + 1e-12) + avg_learn_tflops = self.learn_flop / 1e12 / (avg_train_duration + 1e-12) + avg_time_per_sample = (avg_episode_duration + 1e-12) / (self.total_samples * self.world_size) + avg_train_time_per_sample = (avg_train_duration + 1e-12) / (self.total_samples * self.world_size) + avg_update_time_per_sample = (avg_update_duration + 1e-12) / (self.total_samples * self.world_size) + + print_rank_0( + 'Learning Performance Summary:\n' + f'Throughput: {avg_throughput:.3f} samples/sec\n' + + f'TFLOPS per GPU: {avg_learn_tflops:.3f}\n' + f'Sample time (overall): {avg_time_per_sample:.3f} s\n' + + f'Sample time (train): {avg_train_time_per_sample:.3f} s, {avg_train_time_per_sample/avg_time_per_sample*100:.2f}%\n' + + + f'Sample time (update): {avg_update_time_per_sample:.3f} s, {avg_update_time_per_sample/avg_time_per_sample*100:.2f}%\n' + ) diff --git a/applications/Chat/coati/ray/src/detached_replay_buffer.py b/applications/Chat/coati/ray/detached_replay_buffer.py similarity index 62% rename from 
applications/Chat/coati/ray/src/detached_replay_buffer.py rename to applications/Chat/coati/ray/detached_replay_buffer.py index 18c8db388..2f7652811 100644 --- a/applications/Chat/coati/ray/src/detached_replay_buffer.py +++ b/applications/Chat/coati/ray/detached_replay_buffer.py @@ -1,22 +1,24 @@ -import torch +import asyncio +import copy import random -from typing import List, Any +from threading import Lock +from typing import Any, List + +import ray +import torch +from coati.experience_maker.base import Experience +from coati.replay_buffer import ReplayBuffer +from coati.replay_buffer.utils import BufferItem, make_experience_batch, split_experience_batch # from torch.multiprocessing import Queue from ray.util.queue import Queue -import ray -import asyncio -from coati.experience_maker.base import Experience -from coati.replay_buffer.utils import BufferItem, make_experience_batch, split_experience_batch -from coati.replay_buffer import ReplayBuffer -from threading import Lock -import copy + class DetachedReplayBuffer: ''' - Detached replay buffer. Share Experience across workers on the same node. - Therefore a trainer node is expected to have only one instance. + Detached replay buffer. Share Experience across workers on the same node. + Therefore a trainer node is expected to have only one instance. It is ExperienceMakerHolder's duty to call append(exp) method, remotely. - + Args: sample_batch_size: Batch size when sampling. Exp won't enqueue until they formed a batch. tp_world_size: Number of workers in the same tp group @@ -24,31 +26,25 @@ class DetachedReplayBuffer: cpu_offload: Whether to offload experience to cpu when sampling. Defaults to True. ''' - def __init__(self, sample_batch_size: int, tp_world_size: int = 1, limit : int = 0, cpu_offload: bool = True) -> None: - self.cpu_offload = cpu_offload + def __init__(self, sample_batch_size: int, limit: int = 0) -> None: self.sample_batch_size = sample_batch_size self.limit = limit - self.items = Queue(self.limit, actor_options={"num_cpus":1}) - self.batch_collector : List[BufferItem] = [] - - ''' - Workers in the same tp group share this buffer and need same sample for one step. - Therefore a held_sample should be returned tp_world_size times before it could be dropped. - worker_state records whether a worker got the held_sample - ''' - self.tp_world_size = tp_world_size - self.worker_state = [False] * self.tp_world_size - self.held_sample = None - self._worker_state_lock = Lock() + self.items = Queue(self.limit, actor_options={"num_cpus": 1}) + self.batch_collector: List[BufferItem] = [] @torch.no_grad() def append(self, experience: Experience) -> None: ''' Expected to be called remotely. ''' - if self.cpu_offload: - experience.to_device(torch.device('cpu')) items = split_experience_batch(experience) + self.extend(items) + + @torch.no_grad() + def extend(self, items: List[BufferItem]) -> None: + ''' + Expected to be called remotely. 
+ ''' self.batch_collector.extend(items) while len(self.batch_collector) >= self.sample_batch_size: items = self.batch_collector[:self.sample_batch_size] @@ -62,19 +58,10 @@ class DetachedReplayBuffer: self.items = Queue(self.limit) self.worker_state = [False] * self.tp_world_size self.batch_collector = [] - + @torch.no_grad() - def sample(self, worker_rank = 0, to_device = "cpu") -> Experience: - self._worker_state_lock.acquire() - if not any(self.worker_state): - self.held_sample = self._sample_and_erase() - self.worker_state[worker_rank] = True - if all(self.worker_state): - self.worker_state = [False] * self.tp_world_size - ret = self.held_sample - else: - ret = copy.deepcopy(self.held_sample) - self._worker_state_lock.release() + def sample(self, worker_rank=0, to_device="cpu") -> Experience: + ret = self._sample_and_erase() ret.to_device(to_device) return ret @@ -85,4 +72,4 @@ class DetachedReplayBuffer: def get_length(self) -> int: ret = self.items.qsize() - return ret \ No newline at end of file + return ret diff --git a/applications/Chat/coati/ray/detached_trainer_base.py b/applications/Chat/coati/ray/detached_trainer_base.py new file mode 100644 index 000000000..ac2d35e9d --- /dev/null +++ b/applications/Chat/coati/ray/detached_trainer_base.py @@ -0,0 +1,179 @@ +import os +from abc import ABC, abstractmethod +from typing import Any, Callable, Dict, Iterable, List, Optional, Union + +import ray +import torch +from coati.experience_maker import Experience +from coati.replay_buffer.utils import BufferItem +from torch.utils.data import DataLoader +from tqdm import tqdm + +from .callbacks import TrainerCallback +from .detached_replay_buffer import DetachedReplayBuffer +from .utils import is_rank_0 + + +class DetachedTrainer(ABC): + ''' + Base class for detached rlhf trainers. + 'detach' means that the experience maker is detached compared to a normal Trainer. + Please set name attribute during init: + >>> trainer = DetachedTrainer.options(..., name = "xxx", ...).remote() + So an ExperienceMakerHolder can reach the detached_replay_buffer by Actor's name. 
+ Args: + detached_strategy (DetachedStrategy): the strategy to use for training + detached_replay_buffer_ref (ObjectRef[DetachedReplayBuffer]): the replay buffer to use for training + data_loader_pin_memory (bool, defaults to True): whether to pin memory for data loader + callbacks (List[Callback], defaults to []): the callbacks to call during training process + generate_kwargs (dict, optional): the kwargs to use while model generating + + ''' + + def __init__(self, + experience_maker_holder_name_list: List[str], + train_batch_size: int = 8, + buffer_limit: int = 0, + dataloader_pin_memory: bool = True, + callbacks: List[TrainerCallback] = [], + debug: bool = False) -> None: + super().__init__() + self.detached_replay_buffer = DetachedReplayBuffer(train_batch_size, limit=buffer_limit) + self.dataloader_pin_memory = dataloader_pin_memory + self.callbacks = callbacks + self.target_holder_name_list = experience_maker_holder_name_list + self.target_holder_list = [] + self._is_target_holder_initialized = False + self._debug = debug + + def update_target_holder_list(self): + # as the length of target_holder_list may be zero, we need to check it by a bool flag + if not self._is_target_holder_initialized: + for name in self.target_holder_name_list: + self.target_holder_list.append(ray.get_actor(name, namespace=os.environ["RAY_NAMESPACE"])) + self._is_target_holder_initialized = True + + @abstractmethod + def _update_remote_makers(self, fully_update: bool = False, **kwargs): + pass + + def sync_models_to_remote_makers(self, **kwargs): + self._update_remote_makers(fully_update=True, **kwargs) + + @abstractmethod + def training_step(self, experience: Experience) -> Dict[str, Any]: + pass + + def _learn(self, update_steps: int, train_epochs: int) -> None: + data = [] + # warmup + pbar = tqdm(range(update_steps), desc=f'Train epoch [1/{train_epochs}]', disable=not is_rank_0()) + self._on_epoch_start(0) + self._learn_epoch(pbar, data) + self._on_epoch_end(0) + # item is already a batch + dataloader = DataLoader(data, + batch_size=1, + shuffle=True, + pin_memory=self.dataloader_pin_memory, + collate_fn=lambda x: x[0]) + for epoch in range(1, train_epochs): + pbar = tqdm(dataloader, desc=f'Train epoch [{epoch + 1}/{train_epochs}]', disable=not is_rank_0()) + self._on_epoch_start(epoch) + self._learn_epoch(pbar, data) + self._on_epoch_end(epoch) + + def _learn_epoch(self, pbar: tqdm, data: List[Experience]) -> None: + is_warmup = len(data) == 0 + for x in pbar: + if self._debug: + print("[trainer] training step") + # sample a batch and then train to avoid waiting + experience = x if not is_warmup else self._buffer_sample() + experience.to_device(torch.cuda.current_device()) + self._on_batch_start() + metrics = self.training_step(experience) + self._on_batch_end(metrics, experience) + + if self._debug: + print("[trainer] step over") + experience.to_device("cpu") + if is_warmup: + data.append(experience) + pbar.set_postfix(metrics) + + def fit(self, total_steps: int, update_steps: int, train_epochs: int = 1) -> None: + self._on_fit_start() + for i in tqdm(range(total_steps // update_steps), desc='Trainer', disable=not is_rank_0()): + self._on_episode_start(i) + self._learn(update_steps, train_epochs) + self._on_update_start() + self._update_remote_makers() + self._on_update_end() + self._on_episode_end(i) + self._on_fit_end() + + @ray.method(concurrency_group="buffer_length") + def buffer_get_length(self): + # called by ExperienceMakerHolder + if self._debug: + print("[trainer] telling length") + return 
self.detached_replay_buffer.get_length() + + @ray.method(concurrency_group="buffer_append") + def buffer_append(self, experience: Experience): + # called by ExperienceMakerHolder + if self._debug: + print(f"[trainer] receiving exp.") + self.detached_replay_buffer.append(experience) + + @ray.method(concurrency_group="buffer_append") + def buffer_extend(self, items: List[BufferItem]): + # called by ExperienceMakerHolder + if self._debug: + print(f"[trainer] receiving exp.") + self.detached_replay_buffer.extend(items) + + @ray.method(concurrency_group="buffer_sample") + def _buffer_sample(self): + return self.detached_replay_buffer.sample() + + def _on_fit_start(self) -> None: + for callback in self.callbacks: + callback.on_fit_start() + + def _on_fit_end(self) -> None: + for callback in self.callbacks: + callback.on_fit_end() + + def _on_episode_start(self, episode: int) -> None: + for callback in self.callbacks: + callback.on_episode_start(episode) + + def _on_episode_end(self, episode: int) -> None: + for callback in self.callbacks: + callback.on_episode_end(episode) + + def _on_epoch_start(self, epoch: int) -> None: + for callback in self.callbacks: + callback.on_epoch_start(epoch) + + def _on_epoch_end(self, epoch: int) -> None: + for callback in self.callbacks: + callback.on_epoch_end(epoch) + + def _on_batch_start(self) -> None: + for callback in self.callbacks: + callback.on_batch_start() + + def _on_batch_end(self, metrics: dict, experience: Experience) -> None: + for callback in self.callbacks: + callback.on_batch_end(metrics, experience) + + def _on_update_start(self) -> None: + for callback in self.callbacks: + callback.on_update_start() + + def _on_update_end(self) -> None: + for callback in self.callbacks: + callback.on_update_end() diff --git a/applications/Chat/coati/ray/src/detached_trainer_ppo.py b/applications/Chat/coati/ray/detached_trainer_ppo.py similarity index 55% rename from applications/Chat/coati/ray/src/detached_trainer_ppo.py rename to applications/Chat/coati/ray/detached_trainer_ppo.py index 838e82d07..5f0032716 100644 --- a/applications/Chat/coati/ray/src/detached_trainer_ppo.py +++ b/applications/Chat/coati/ray/detached_trainer_ppo.py @@ -1,24 +1,38 @@ -from typing import Any, Callable, Dict, List, Optional -import torch -from torch.optim import Adam +from typing import Any, Callable, Dict, List, Optional, Tuple +import ray +import torch from coati.experience_maker import Experience, NaiveExperienceMaker from coati.models.base import Actor, Critic -from coati.models.generation_utils import update_model_kwargs_fn from coati.models.loss import PolicyLoss, ValueLoss -from coati.trainer.strategies import ColossalAIStrategy, DDPStrategy, NaiveStrategy, Strategy from coati.trainer.callbacks import Callback +from coati.trainer.strategies import ColossalAIStrategy, DDPStrategy, NaiveStrategy, Strategy +from torch.optim import Adam from colossalai.nn.optimizer import HybridAdam -import ray - - -from .utils import is_rank_0, get_cuda_actor_critic_from_args, get_strategy_from_args, set_dist_env +from .callbacks import TrainerCallback, TrainerPerformanceEvaluator from .detached_trainer_base import DetachedTrainer +from .lora_constructor import LoRAConstructor +from .utils import ( + get_actor_from_args, + get_critic_from_args, + get_model_numel, + get_rank, + get_strategy_from_args, + is_rank_0, + set_dist_env, + state_dict_to, +) -@ray.remote(concurrency_groups={"buffer_length": 1, "buffer_append":1, "buffer_sample":1,"model_io": 1, "compute": 1}) 
+@ray.remote(concurrency_groups={ + "buffer_length": 1, + "buffer_append": 1, + "buffer_sample": 1, + "model_io": 1, + "compute": 1 +}) class DetachedPPOTrainer(DetachedTrainer): ''' Detached Trainer for PPO algorithm @@ -40,86 +54,102 @@ class DetachedPPOTrainer(DetachedTrainer): generate_kwargs (dict, optional): the kwargs to use while model generating ''' - def __init__(self, - experience_maker_holder_name_list: List[str], - strategy: str, - model: str, - env_info: Dict[str, str] = None, - pretrained: str = None, - lora_rank: int = 0, - train_batch_size: int = 8, - buffer_limit: int = 0, - buffer_cpu_offload: bool = True, - eps_clip: float = 0.2, - value_clip: float = 0.4, - experience_batch_size: int = 8, - max_epochs: int = 10, - dataloader_pin_memory: bool = True, - callbacks: List[Callback] = [], - **generate_kwargs) -> None: + def __init__( + self, + experience_maker_holder_name_list: List[str], + strategy_fn: Callable[[], Strategy], + model_fn: Callable[[], Tuple[Actor, Critic]], + env_info: Dict[str, str] = None, + train_batch_size: int = 8, + buffer_limit: int = 0, + eps_clip: float = 0.2, + value_clip: float = 0.4, + dataloader_pin_memory: bool = True, + callbacks: List[TrainerCallback] = [], + eval_performance: bool = False, + debug: bool = False, + update_lora_weights: bool = False, + ) -> None: # set environment variables if env_info: set_dist_env(env_info=env_info) # configure strategy - self.strategy = get_strategy_from_args(strategy) + self.strategy = strategy_fn() # configure models, loss and optimizers with self.strategy.model_init_context(): - self.actor, self.critic = get_cuda_actor_critic_from_args(model, pretrained, lora_rank) + self.actor, self.critic = model_fn() - if strategy != 'colossalai_gemini': - self.actor.to(torch.float16).to(torch.cuda.current_device()) - self.critic.to(torch.float16).to(torch.cuda.current_device()) + if eval_performance: + actor_numel = get_model_numel(self.actor) + critic_numel = get_model_numel(self.critic) + evaluator = TrainerPerformanceEvaluator(actor_numel, critic_numel) + callbacks = callbacks + [evaluator] - if strategy.startswith('colossalai'): - self.actor_optim = HybridAdam(self.actor.parameters(), lr=5e-6) - self.critic_optim = HybridAdam(self.critic.parameters(), lr=5e-6) + if isinstance(self.strategy, ColossalAIStrategy): + self.actor_optim = HybridAdam(self.actor.parameters(), lr=1e-7) + self.critic_optim = HybridAdam(self.critic.parameters(), lr=1e-7) else: - self.actor_optim = Adam(self.actor.parameters(), lr=5e-6) - self.critic_optim = Adam(self.critic.parameters(), lr=5e-6) + self.actor_optim = Adam(self.actor.parameters(), lr=1e-7) + self.critic_optim = Adam(self.critic.parameters(), lr=1e-7) (self.actor, self.actor_optim), (self.critic, self.critic_optim) = \ self.strategy.prepare((self.actor, self.actor_optim), (self.critic, self.critic_optim)) - generate_kwargs = _set_default_generate_kwargs(self.strategy, generate_kwargs, self.actor) + # configure trainer self.actor_loss_fn = PolicyLoss(eps_clip) self.critic_loss_fn = ValueLoss(value_clip) super().__init__(experience_maker_holder_name_list, train_batch_size=train_batch_size, buffer_limit=buffer_limit, - buffer_cpu_offload=buffer_cpu_offload, - experience_batch_size=experience_batch_size, - max_epochs=max_epochs, dataloader_pin_memory=dataloader_pin_memory, callbacks=callbacks, - **generate_kwargs) + debug=debug) + if self._debug: + print(f'[trainer{get_rank()}] will send state dict to {experience_maker_holder_name_list}') + + self._update_lora_weights = 
update_lora_weights @ray.method(concurrency_group="model_io") - def _update_remote_makers(self): + @torch.no_grad() + def _update_remote_makers(self, fully_update: bool = False, **config): # TODO: balance duties - if is_rank_0(): - self.update_target_holder_list(self.target_holder_name_list) + if not fully_update: + config['requires_grad_only'] = True + self.update_target_holder_list() + # mark start, ensure order + tasks = [] + for target_holder in self.target_holder_list: + tasks.append(target_holder.update_experience_maker.remote(chunk_start=True, fully_update=fully_update)) + ray.get(tasks) + # sending loop + tasks = [] + + for state_dict_shard in self._get_model_state_dict_shard(self.actor, fully_update=fully_update, **config): for target_holder in self.target_holder_list: - # TODO: reduce malloc - with torch.no_grad(): - ray.get(target_holder.update_experience_maker.remote(self._get_unwrapped_actor(), self._get_unwrapped_critic())) - - @ray.method(concurrency_group="model_io") - def initialize_remote_makers(self): - # TODO: balance duties - if is_rank_0(): - self.update_target_holder_list(self.target_holder_name_list) + tasks.append( + target_holder.update_experience_maker.remote( + new_actor_state_dict=state_dict_shard, + new_actor_lora_config_dict=self._get_model_lora_config_dict(self.actor), + fully_update=fully_update)) + # sending loop + for state_dict_shard in self._get_model_state_dict_shard(self.critic, fully_update=fully_update, **config): for target_holder in self.target_holder_list: - # TODO: reduce malloc - with torch.no_grad(): - ray.get(target_holder.initialize_experience_maker.remote(self._get_unwrapped_actor(), self._get_unwrapped_critic())) + tasks.append( + target_holder.update_experience_maker.remote( + new_critic_state_dict=state_dict_shard, + new_critic_lora_config_dict=self._get_model_lora_config_dict(self.critic), + fully_update=fully_update)) + ray.get(tasks) + # mark end + for target_holder in self.target_holder_list: + target_holder.update_experience_maker.remote(chunk_end=True, fully_update=fully_update) @ray.method(concurrency_group="compute") def training_step(self, experience: Experience) -> Dict[str, float]: self.actor.train() self.critic.train() - experience.to_device(torch.cuda.current_device()) num_actions = experience.action_mask.size(1) action_log_probs = self.actor(experience.sequences, num_actions, attention_mask=experience.attention_mask) actor_loss = self.actor_loss_fn(action_log_probs, @@ -155,38 +185,16 @@ class DetachedPPOTrainer(DetachedTrainer): def strategy_save_critic_optim(self, path: str, only_rank0: bool = False) -> None: self.strategy.save_optimizer(self.critic_optim, path, only_rank0) - def _get_unwrapped_actor(self): - if False: - pass - elif isinstance(self.strategy, ColossalAIStrategy): - ret = Actor(self.strategy._unwrap_model(self.actor)) - return ret - elif isinstance(self.strategy, DDPStrategy): - return Actor(self.strategy._unwrap_actor(self.actor)) - elif isinstance(self.strategy, NaiveStrategy): - return self.actor + def _get_model_state_dict_shard(self, model: torch.nn.Module, fully_update=False, **config): + for state_dict in self.strategy.get_model_state_dict_shard(model, **config): + if not self._update_lora_weights or fully_update: + yield state_dict_to(state_dict) + else: + state_dict_lora, _ = LoRAConstructor.filter_state_dict_lora(state_dict) + yield state_dict_to(state_dict_lora) - def _get_unwrapped_critic(self): - if False: - pass - elif isinstance(self.strategy, ColossalAIStrategy): - ret = 
self.strategy._unwrap_model(self.critic) - return ret - elif isinstance(self.strategy, DDPStrategy): - return self.critic.module - elif isinstance(self.strategy, NaiveStrategy): - return self.critic - - -def _set_default_generate_kwargs(strategy: Strategy, generate_kwargs: dict, actor: Actor) -> None: - origin_model = strategy._unwrap_actor(actor) - new_kwargs = {**generate_kwargs} - # use huggingface models method directly - if 'prepare_inputs_fn' not in generate_kwargs and hasattr(origin_model, 'prepare_inputs_for_generation'): - new_kwargs['prepare_inputs_fn'] = origin_model.prepare_inputs_for_generation - - if 'update_model_kwargs_fn' not in generate_kwargs: - new_kwargs['update_model_kwargs_fn'] = update_model_kwargs_fn - - return new_kwargs - \ No newline at end of file + def _get_model_lora_config_dict(self, model: torch.nn.Module): + if not self._update_lora_weights: + return None + unwrapped_model = self.strategy.unwrap_model(model) + return LoRAConstructor.extract_lora_config(unwrapped_model) diff --git a/applications/Chat/coati/ray/example/1m1t.py b/applications/Chat/coati/ray/example/1m1t.py deleted file mode 100644 index a65273705..000000000 --- a/applications/Chat/coati/ray/example/1m1t.py +++ /dev/null @@ -1,153 +0,0 @@ -import argparse -from copy import deepcopy - -import pandas as pd -import torch -from coati.trainer import PPOTrainer - - -from coati.ray.src.experience_maker_holder import ExperienceMakerHolder -from coati.ray.src.detached_trainer_ppo import DetachedPPOTrainer - -from coati.trainer.strategies import ColossalAIStrategy, DDPStrategy, NaiveStrategy -from coati.experience_maker import NaiveExperienceMaker -from torch.optim import Adam -from transformers import AutoTokenizer, BloomTokenizerFast -from transformers.models.gpt2.tokenization_gpt2 import GPT2Tokenizer - -from colossalai.nn.optimizer import HybridAdam - -import ray -import os -import socket - -def get_free_port(): - with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: - s.bind(('', 0)) - return s.getsockname()[1] - - -def get_local_ip(): - with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as s: - s.connect(('8.8.8.8', 80)) - return s.getsockname()[0] - -def main(args): - master_addr = str(get_local_ip()) - # trainer_env_info - trainer_port = str(get_free_port()) - env_info_trainer = {'local_rank' : '0', - 'rank' : '0', - 'world_size' : '1', - 'master_port' : trainer_port, - 'master_addr' : master_addr} - - # maker_env_info - maker_port = str(get_free_port()) - env_info_maker = {'local_rank' : '0', - 'rank' : '0', - 'world_size' : '1', - 'master_port' : maker_port, - 'master_addr' : master_addr} - - # configure tokenizer - if args.model == 'gpt2': - tokenizer = GPT2Tokenizer.from_pretrained('gpt2') - tokenizer.pad_token = tokenizer.eos_token - elif args.model == 'bloom': - tokenizer = BloomTokenizerFast.from_pretrained(args.pretrain) - tokenizer.pad_token = tokenizer.eos_token - elif args.model == 'opt': - tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m") - else: - raise ValueError(f'Unsupported model "{args.model}"') - - # configure Trainer - trainer_ref = DetachedPPOTrainer.options(name="trainer1", num_gpus=1, max_concurrency=2).remote( - experience_maker_holder_name_list=["maker1"], - strategy=args.trainer_strategy, - model=args.model, - env_info = env_info_trainer, - pretrained=args.pretrain, - lora_rank=args.lora_rank, - train_batch_size=args.train_batch_size, - buffer_limit=16, - experience_batch_size=args.experience_batch_size, - max_epochs=args.max_epochs, - #kwargs: - 
max_length=128, - do_sample=True, - temperature=1.0, - top_k=50, - pad_token_id=tokenizer.pad_token_id, - eos_token_id=tokenizer.eos_token_id, - debug=args.debug, - ) - - # configure Experience Maker - experience_holder_ref = ExperienceMakerHolder.options(name="maker1", num_gpus=1, max_concurrency=2).remote( - detached_trainer_name_list=["trainer1"], - strategy=args.maker_strategy, - env_info = env_info_maker, - experience_batch_size=args.experience_batch_size, - kl_coef=0.1, - #kwargs: - max_length=128, - do_sample=True, - temperature=1.0, - top_k=50, - pad_token_id=tokenizer.pad_token_id, - eos_token_id=tokenizer.eos_token_id, - debug=args.debug, - ) - - # trainer send its actor and critic to experience holders. - ray.get(trainer_ref.initialize_remote_makers.remote()) - - # configure sampler - dataset = pd.read_csv(args.prompt_path)['prompt'] - - def tokenize_fn(texts): - # MUST padding to max length to ensure inputs of all ranks have the same length - # Different length may lead to hang when using gemini, as different generation steps - batch = tokenizer(texts, return_tensors='pt', max_length=96, padding='max_length', truncation=True) - return {k: v.cuda() for k, v in batch.items()} - - trainer_done_ref = trainer_ref.fit.remote(num_episodes=args.num_episodes, max_timesteps=args.max_timesteps, update_timesteps=args.update_timesteps) - num_exp_per_maker = args.num_episodes * args.max_timesteps // args.update_timesteps * args.max_epochs + 3 # +3 for fault tolerance - maker_done_ref = experience_holder_ref.workingloop.remote(dataset, tokenize_fn, times=num_exp_per_maker) - - ray.get([trainer_done_ref, maker_done_ref]) - - # save model checkpoint after fitting - trainer_ref.strategy_save_actor.remote(args.save_path, only_rank0=True) - # save optimizer checkpoint on all ranks - if args.need_optim_ckpt: - trainer_ref.strategy_save_actor_optim.remote('actor_optim_checkpoint_prompts_%d.pt' % (torch.cuda.current_device()), - only_rank0=False) - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument('prompt_path') - parser.add_argument('--trainer_strategy', - choices=['naive', 'ddp', 'colossalai_gemini', 'colossalai_zero2'], - default='naive') - parser.add_argument('--maker_strategy', - choices=['naive', 'ddp', 'colossalai_gemini', 'colossalai_zero2'], - default='naive') - parser.add_argument('--model', default='gpt2', choices=['gpt2', 'bloom', 'opt']) - parser.add_argument('--pretrain', type=str, default=None) - parser.add_argument('--save_path', type=str, default='actor_checkpoint_prompts.pt') - parser.add_argument('--need_optim_ckpt', type=bool, default=False) - parser.add_argument('--num_episodes', type=int, default=10) - parser.add_argument('--max_timesteps', type=int, default=10) - parser.add_argument('--update_timesteps', type=int, default=10) - parser.add_argument('--max_epochs', type=int, default=5) - parser.add_argument('--train_batch_size', type=int, default=8) - parser.add_argument('--experience_batch_size', type=int, default=8) - parser.add_argument('--lora_rank', type=int, default=0, help="low-rank adaptation matrices rank") - - parser.add_argument('--debug', action='store_true') - args = parser.parse_args() - ray.init(namespace=os.environ["RAY_NAMESPACE"]) - main(args) diff --git a/applications/Chat/coati/ray/example/1m1t.sh b/applications/Chat/coati/ray/example/1m1t.sh deleted file mode 100644 index f7c5054c8..000000000 --- a/applications/Chat/coati/ray/example/1m1t.sh +++ /dev/null @@ -1,23 +0,0 @@ -set_n_least_used_CUDA_VISIBLE_DEVICES() { - local 
n=${1:-"9999"} - echo "GPU Memory Usage:" - local FIRST_N_GPU_IDS=$(nvidia-smi --query-gpu=memory.used --format=csv \ - | tail -n +2 \ - | nl -v 0 \ - | tee /dev/tty \ - | sort -g -k 2 \ - | awk '{print $1}' \ - | head -n $n) - export CUDA_VISIBLE_DEVICES=$(echo $FIRST_N_GPU_IDS | sed 's/ /,/g') - echo "Now CUDA_VISIBLE_DEVICES is set to:" - echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES" -} - -set_n_least_used_CUDA_VISIBLE_DEVICES 2 - -export RAY_NAMESPACE="admin" - -python 1m1t.py "/path/to/prompts.csv" \ - --trainer_strategy colossalai_zero2 --maker_strategy naive --lora_rank 2 --pretrain "facebook/opt-350m" --model 'opt' \ - --num_episodes 10 --max_timesteps 10 --update_timesteps 10 \ - --max_epochs 10 --debug diff --git a/applications/Chat/coati/ray/example/1m2t.py b/applications/Chat/coati/ray/example/1m2t.py deleted file mode 100644 index 3883c364a..000000000 --- a/applications/Chat/coati/ray/example/1m2t.py +++ /dev/null @@ -1,186 +0,0 @@ -import argparse -from copy import deepcopy - -import pandas as pd -import torch -from coati.trainer import PPOTrainer - - -from coati.ray.src.experience_maker_holder import ExperienceMakerHolder -from coati.ray.src.detached_trainer_ppo import DetachedPPOTrainer - -from coati.trainer.strategies import ColossalAIStrategy, DDPStrategy, NaiveStrategy -from coati.experience_maker import NaiveExperienceMaker -from torch.optim import Adam -from transformers import AutoTokenizer, BloomTokenizerFast -from transformers.models.gpt2.tokenization_gpt2 import GPT2Tokenizer - -from colossalai.nn.optimizer import HybridAdam - -import ray -import os -import socket - - -def get_free_port(): - with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: - s.bind(('', 0)) - return s.getsockname()[1] - - -def get_local_ip(): - with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as s: - s.connect(('8.8.8.8', 80)) - return s.getsockname()[0] - -def main(args): - master_addr = str(get_local_ip()) - # trainer_env_info - trainer_port = str(get_free_port()) - env_info_trainer_1 = {'local_rank' : '0', - 'rank' : '0', - 'world_size' : '2', - 'master_port' : trainer_port, - 'master_addr' : master_addr} - env_info_trainer_2 = {'local_rank' : '0', - 'rank' : '1', - 'world_size' : '2', - 'master_port' : trainer_port, - 'master_addr' : master_addr} - # maker_env_info - maker_port = str(get_free_port()) - env_info_maker_1 = {'local_rank' : '0', - 'rank' : '0', - 'world_size' : '2', - 'master_port' : maker_port, - 'master_addr' : master_addr} - print([env_info_trainer_1, - env_info_trainer_2, - env_info_maker_1]) - ray.init(dashboard_port = 1145) - # configure tokenizer - if args.model == 'gpt2': - tokenizer = GPT2Tokenizer.from_pretrained('gpt2') - tokenizer.pad_token = tokenizer.eos_token - elif args.model == 'bloom': - tokenizer = BloomTokenizerFast.from_pretrained(args.pretrain) - tokenizer.pad_token = tokenizer.eos_token - elif args.model == 'opt': - tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m") - else: - raise ValueError(f'Unsupported model "{args.model}"') - - # configure Trainer - trainer_1_ref = DetachedPPOTrainer.options(name="trainer1", namespace=os.environ["RAY_NAMESPACE"], num_gpus=1, max_concurrency=2).remote( - experience_maker_holder_name_list=["maker1"], - strategy=args.trainer_strategy, - model=args.model, - env_info=env_info_trainer_1, - pretrained=args.pretrain, - lora_rank=args.lora_rank, - train_batch_size=args.train_batch_size, - buffer_limit=16, - experience_batch_size=args.experience_batch_size, - max_epochs=args.max_epochs, - #kwargs: 
- max_length=128, - do_sample=True, - temperature=1.0, - top_k=50, - pad_token_id=tokenizer.pad_token_id, - eos_token_id=tokenizer.eos_token_id, - debug=args.debug, - ) - - trainer_2_ref = DetachedPPOTrainer.options(name="trainer2", namespace=os.environ["RAY_NAMESPACE"], num_gpus=1, max_concurrency=2).remote( - experience_maker_holder_name_list=["maker1"], - strategy=args.trainer_strategy, - model=args.model, - env_info=env_info_trainer_2, - pretrained=args.pretrain, - lora_rank=args.lora_rank, - train_batch_size=args.train_batch_size, - buffer_limit=16, - experience_batch_size=args.experience_batch_size, - max_epochs=args.max_epochs, - #kwargs: - max_length=128, - do_sample=True, - temperature=1.0, - top_k=50, - pad_token_id=tokenizer.pad_token_id, - eos_token_id=tokenizer.eos_token_id, - debug= args.debug, - ) - - # configure Experience Maker - experience_holder_1_ref = ExperienceMakerHolder.options(name="maker1", namespace=os.environ["RAY_NAMESPACE"], num_gpus=1, max_concurrency=2).remote( - detached_trainer_name_list=["trainer1", "trainer2"], - strategy=args.maker_strategy, - env_info=env_info_maker_1, - experience_batch_size=args.experience_batch_size, - kl_coef=0.1, - #kwargs: - max_length=128, - do_sample=True, - temperature=1.0, - top_k=50, - pad_token_id=tokenizer.pad_token_id, - eos_token_id=tokenizer.eos_token_id, - debug=args.debug, - ) - - # trainer send its actor and critic to experience holders. - # TODO: balance duty - ray.get(trainer_1_ref.initialize_remote_makers.remote()) - - # configure sampler - dataset = pd.read_csv(args.prompt_path)['prompt'] - - def tokenize_fn(texts): - # MUST padding to max length to ensure inputs of all ranks have the same length - # Different length may lead to hang when using gemini, as different generation steps - batch = tokenizer(texts, return_tensors='pt', max_length=96, padding='max_length', truncation=True) - return {k: v.cuda() for k, v in batch.items()} - - trainer_1_done_ref = trainer_1_ref.fit.remote(num_episodes=args.num_episodes, max_timesteps=args.max_timesteps, update_timesteps=args.update_timesteps) - trainer_2_done_ref = trainer_2_ref.fit.remote(num_episodes=args.num_episodes, max_timesteps=args.max_timesteps, update_timesteps=args.update_timesteps) - num_exp_per_maker = args.num_episodes * args.max_timesteps // args.update_timesteps * args.max_epochs * 2 + 3 # +3 for fault tolerance - maker_1_done_ref = experience_holder_1_ref.workingloop.remote(dataset, tokenize_fn, times=num_exp_per_maker) - - ray.get([trainer_1_done_ref, trainer_2_done_ref, maker_1_done_ref]) - # save model checkpoint after fitting - trainer_1_ref.strategy_save_actor.remote(args.save_path, only_rank0=True) - trainer_2_ref.strategy_save_actor.remote(args.save_path, only_rank0=True) - # save optimizer checkpoint on all ranks - if args.need_optim_ckpt: - trainer_1_ref.strategy_save_actor_optim.remote('actor_optim_checkpoint_prompts_%d.pt' % (torch.cuda.current_device()), - only_rank0=False) - trainer_2_ref.strategy_save_actor_optim.remote('actor_optim_checkpoint_prompts_%d.pt' % (torch.cuda.current_device()), - only_rank0=False) - - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument('prompt_path') - parser.add_argument('--trainer_strategy', - choices=['naive', 'ddp', 'colossalai_gemini', 'colossalai_zero2'], - default='naive') - parser.add_argument('--maker_strategy', - choices=['naive', 'ddp', 'colossalai_gemini', 'colossalai_zero2'], - default='naive') - parser.add_argument('--model', default='gpt2', choices=['gpt2', 
'bloom', 'opt']) - parser.add_argument('--pretrain', type=str, default=None) - parser.add_argument('--save_path', type=str, default='actor_checkpoint_prompts.pt') - parser.add_argument('--need_optim_ckpt', type=bool, default=False) - parser.add_argument('--num_episodes', type=int, default=10) - parser.add_argument('--max_timesteps', type=int, default=10) - parser.add_argument('--update_timesteps', type=int, default=10) - parser.add_argument('--max_epochs', type=int, default=5) - parser.add_argument('--train_batch_size', type=int, default=8) - parser.add_argument('--experience_batch_size', type=int, default=8) - parser.add_argument('--lora_rank', type=int, default=0, help="low-rank adaptation matrices rank") - - parser.add_argument('--debug', action='store_true') - args = parser.parse_args() - main(args) diff --git a/applications/Chat/coati/ray/example/1m2t.sh b/applications/Chat/coati/ray/example/1m2t.sh deleted file mode 100644 index 669f41410..000000000 --- a/applications/Chat/coati/ray/example/1m2t.sh +++ /dev/null @@ -1,23 +0,0 @@ -set_n_least_used_CUDA_VISIBLE_DEVICES() { - local n=${1:-"9999"} - echo "GPU Memory Usage:" - local FIRST_N_GPU_IDS=$(nvidia-smi --query-gpu=memory.used --format=csv \ - | tail -n +2 \ - | nl -v 0 \ - | tee /dev/tty \ - | sort -g -k 2 \ - | awk '{print $1}' \ - | head -n $n) - export CUDA_VISIBLE_DEVICES=$(echo $FIRST_N_GPU_IDS | sed 's/ /,/g') - echo "Now CUDA_VISIBLE_DEVICES is set to:" - echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES" -} - -set_n_least_used_CUDA_VISIBLE_DEVICES 2 - -export RAY_NAMESPACE="admin" - -python 1m2t.py "/path/to/prompts.csv" --model gpt2 \ - --maker_strategy naive --trainer_strategy ddp --lora_rank 2 \ - --num_episodes 10 --max_timesteps 10 --update_timesteps 10 \ - --max_epochs 10 #--debug \ No newline at end of file diff --git a/applications/Chat/coati/ray/example/2m1t.py b/applications/Chat/coati/ray/example/2m1t.py deleted file mode 100644 index b655de1ab..000000000 --- a/applications/Chat/coati/ray/example/2m1t.py +++ /dev/null @@ -1,140 +0,0 @@ -import argparse -from copy import deepcopy - -import pandas as pd -import torch -from coati.trainer import PPOTrainer - - -from coati.ray.src.experience_maker_holder import ExperienceMakerHolder -from coati.ray.src.detached_trainer_ppo import DetachedPPOTrainer - -from coati.trainer.strategies import ColossalAIStrategy, DDPStrategy, NaiveStrategy -from coati.experience_maker import NaiveExperienceMaker -from torch.optim import Adam -from transformers import AutoTokenizer, BloomTokenizerFast -from transformers.models.gpt2.tokenization_gpt2 import GPT2Tokenizer - -from colossalai.nn.optimizer import HybridAdam - -import ray -import os -import socket - - -def main(args): - # configure tokenizer - if args.model == 'gpt2': - tokenizer = GPT2Tokenizer.from_pretrained('gpt2') - tokenizer.pad_token = tokenizer.eos_token - elif args.model == 'bloom': - tokenizer = BloomTokenizerFast.from_pretrained(args.pretrain) - tokenizer.pad_token = tokenizer.eos_token - elif args.model == 'opt': - tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m") - else: - raise ValueError(f'Unsupported model "{args.model}"') - - # configure Trainer - trainer_ref = DetachedPPOTrainer.options(name="trainer1", num_gpus=1, max_concurrency=2).remote( - experience_maker_holder_name_list=["maker1", "maker2"], - strategy=args.trainer_strategy, - model=args.model, - pretrained=args.pretrain, - lora_rank=args.lora_rank, - train_batch_size=args.train_batch_size, - buffer_limit=16, - 
experience_batch_size=args.experience_batch_size, - max_epochs=args.max_epochs, - #kwargs: - max_length=128, - do_sample=True, - temperature=1.0, - top_k=50, - pad_token_id=tokenizer.pad_token_id, - eos_token_id=tokenizer.eos_token_id, - debug=args.debug, - ) - - # configure Experience Maker - experience_holder_1_ref = ExperienceMakerHolder.options(name="maker1", num_gpus=1, max_concurrency=2).remote( - detached_trainer_name_list=["trainer1"], - strategy=args.maker_strategy, - experience_batch_size=args.experience_batch_size, - kl_coef=0.1, - #kwargs: - max_length=128, - do_sample=True, - temperature=1.0, - top_k=50, - pad_token_id=tokenizer.pad_token_id, - eos_token_id=tokenizer.eos_token_id, - debug=args.debug, - ) - - experience_holder_2_ref = ExperienceMakerHolder.options(name="maker2", num_gpus=1, max_concurrency=2).remote( - detached_trainer_name_list=["trainer1"], - strategy=args.maker_strategy, - experience_batch_size=args.experience_batch_size, - kl_coef=0.1, - #kwargs: - max_length=128, - do_sample=True, - temperature=1.0, - top_k=50, - pad_token_id=tokenizer.pad_token_id, - eos_token_id=tokenizer.eos_token_id, - debug=args.debug, - ) - - # trainer send its actor and critic to experience holders. - ray.get(trainer_ref.initialize_remote_makers.remote()) - - # configure sampler - dataset = pd.read_csv(args.prompt_path)['prompt'] - - def tokenize_fn(texts): - # MUST padding to max length to ensure inputs of all ranks have the same length - # Different length may lead to hang when using gemini, as different generation steps - batch = tokenizer(texts, return_tensors='pt', max_length=96, padding='max_length', truncation=True) - return {k: v.cuda() for k, v in batch.items()} - - trainer_done_ref = trainer_ref.fit.remote(num_episodes=args.num_episodes, max_timesteps=args.max_timesteps, update_timesteps=args.update_timesteps) - num_exp_per_maker = args.num_episodes * args.max_timesteps // args.update_timesteps * args.max_epochs // 2 + 3 # +3 for fault tolerance - maker_1_done_ref = experience_holder_1_ref.workingloop.remote(dataset, tokenize_fn, times=num_exp_per_maker) - maker_2_done_ref = experience_holder_2_ref.workingloop.remote(dataset, tokenize_fn, times=num_exp_per_maker) - - ray.get([trainer_done_ref, maker_1_done_ref, maker_2_done_ref]) - - # save model checkpoint after fitting - trainer_ref.strategy_save_actor.remote(args.save_path, only_rank0=True) - # save optimizer checkpoint on all ranks - if args.need_optim_ckpt: - trainer_ref.strategy_save_actor_optim.remote('actor_optim_checkpoint_prompts_%d.pt' % (torch.cuda.current_device()), - only_rank0=False) - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument('prompt_path') - parser.add_argument('--trainer_strategy', - choices=['naive', 'ddp', 'colossalai_gemini', 'colossalai_zero2'], - default='naive') - parser.add_argument('--maker_strategy', - choices=['naive', 'ddp', 'colossalai_gemini', 'colossalai_zero2'], - default='naive') - parser.add_argument('--model', default='gpt2', choices=['gpt2', 'bloom', 'opt']) - parser.add_argument('--pretrain', type=str, default=None) - parser.add_argument('--save_path', type=str, default='actor_checkpoint_prompts.pt') - parser.add_argument('--need_optim_ckpt', type=bool, default=False) - parser.add_argument('--num_episodes', type=int, default=10) - parser.add_argument('--max_timesteps', type=int, default=10) - parser.add_argument('--update_timesteps', type=int, default=10) - parser.add_argument('--max_epochs', type=int, default=5) - 
parser.add_argument('--train_batch_size', type=int, default=8) - parser.add_argument('--experience_batch_size', type=int, default=8) - parser.add_argument('--lora_rank', type=int, default=0, help="low-rank adaptation matrices rank") - - parser.add_argument('--debug', action='store_true') - args = parser.parse_args() - ray.init(namespace=os.environ["RAY_NAMESPACE"]) - main(args) diff --git a/applications/Chat/coati/ray/example/2m1t.sh b/applications/Chat/coati/ray/example/2m1t.sh deleted file mode 100644 index a207d4118..000000000 --- a/applications/Chat/coati/ray/example/2m1t.sh +++ /dev/null @@ -1,23 +0,0 @@ -set_n_least_used_CUDA_VISIBLE_DEVICES() { - local n=${1:-"9999"} - echo "GPU Memory Usage:" - local FIRST_N_GPU_IDS=$(nvidia-smi --query-gpu=memory.used --format=csv \ - | tail -n +2 \ - | nl -v 0 \ - | tee /dev/tty \ - | sort -g -k 2 \ - | awk '{print $1}' \ - | head -n $n) - export CUDA_VISIBLE_DEVICES=$(echo $FIRST_N_GPU_IDS | sed 's/ /,/g') - echo "Now CUDA_VISIBLE_DEVICES is set to:" - echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES" -} - -set_n_least_used_CUDA_VISIBLE_DEVICES 3 - -export RAY_NAMESPACE="admin" - -python 2m1t.py "/path/to/prompts.csv" \ - --trainer_strategy naive --maker_strategy naive --lora_rank 2 --pretrain "facebook/opt-350m" --model 'opt' \ - --num_episodes 10 --max_timesteps 10 --update_timesteps 10 \ - --max_epochs 10 # --debug diff --git a/applications/Chat/coati/ray/example/2m2t.py b/applications/Chat/coati/ray/example/2m2t.py deleted file mode 100644 index 435c71915..000000000 --- a/applications/Chat/coati/ray/example/2m2t.py +++ /dev/null @@ -1,209 +0,0 @@ -import argparse -from copy import deepcopy - -import pandas as pd -import torch -from coati.trainer import PPOTrainer - - -from coati.ray.src.experience_maker_holder import ExperienceMakerHolder -from coati.ray.src.detached_trainer_ppo import DetachedPPOTrainer - -from coati.trainer.strategies import ColossalAIStrategy, DDPStrategy, NaiveStrategy -from coati.experience_maker import NaiveExperienceMaker -from torch.optim import Adam -from transformers import AutoTokenizer, BloomTokenizerFast -from transformers.models.gpt2.tokenization_gpt2 import GPT2Tokenizer - -from colossalai.nn.optimizer import HybridAdam - -import ray -import os -import socket - - -def get_free_port(): - with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: - s.bind(('', 0)) - return s.getsockname()[1] - - -def get_local_ip(): - with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as s: - s.connect(('8.8.8.8', 80)) - return s.getsockname()[0] - -def main(args): - master_addr = str(get_local_ip()) - # trainer_env_info - trainer_port = str(get_free_port()) - env_info_trainer_1 = {'local_rank' : '0', - 'rank' : '0', - 'world_size' : '2', - 'master_port' : trainer_port, - 'master_addr' : master_addr} - env_info_trainer_2 = {'local_rank' : '0', - 'rank' : '1', - 'world_size' : '2', - 'master_port' : trainer_port, - 'master_addr' : master_addr} - # maker_env_info - maker_port = str(get_free_port()) - env_info_maker_1 = {'local_rank' : '0', - 'rank' : '0', - 'world_size' : '2', - 'master_port' : maker_port, - 'master_addr' : master_addr} - env_info_maker_2 = {'local_rank' : '0', - 'rank' : '1', - 'world_size' : '2', - 'master_port': maker_port, - 'master_addr' : master_addr} - print([env_info_trainer_1, - env_info_trainer_2, - env_info_maker_1, - env_info_maker_2]) - ray.init() - # configure tokenizer - if args.model == 'gpt2': - tokenizer = GPT2Tokenizer.from_pretrained('gpt2') - tokenizer.pad_token = tokenizer.eos_token - 
elif args.model == 'bloom': - tokenizer = BloomTokenizerFast.from_pretrained(args.pretrain) - tokenizer.pad_token = tokenizer.eos_token - elif args.model == 'opt': - tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m") - else: - raise ValueError(f'Unsupported model "{args.model}"') - - # configure Trainer - trainer_1_ref = DetachedPPOTrainer.options(name="trainer1", namespace=os.environ["RAY_NAMESPACE"], num_gpus=1, max_concurrency=2).remote( - experience_maker_holder_name_list=["maker1", "maker2"], - strategy=args.trainer_strategy, - model=args.model, - env_info=env_info_trainer_1, - pretrained=args.pretrain, - lora_rank=args.lora_rank, - train_batch_size=args.train_batch_size, - buffer_limit=16, - experience_batch_size=args.experience_batch_size, - max_epochs=args.max_epochs, - #kwargs: - max_length=128, - do_sample=True, - temperature=1.0, - top_k=50, - pad_token_id=tokenizer.pad_token_id, - eos_token_id=tokenizer.eos_token_id, - debug=args.debug, - ) - - trainer_2_ref = DetachedPPOTrainer.options(name="trainer2", namespace=os.environ["RAY_NAMESPACE"], num_gpus=1, max_concurrency=2).remote( - experience_maker_holder_name_list=["maker1", "maker2"], - strategy=args.trainer_strategy, - model=args.model, - env_info=env_info_trainer_2, - pretrained=args.pretrain, - lora_rank=args.lora_rank, - train_batch_size=args.train_batch_size, - buffer_limit=16, - experience_batch_size=args.experience_batch_size, - max_epochs=args.max_epochs, - #kwargs: - max_length=128, - do_sample=True, - temperature=1.0, - top_k=50, - pad_token_id=tokenizer.pad_token_id, - eos_token_id=tokenizer.eos_token_id, - debug=args.debug, - ) - - # configure Experience Maker - experience_holder_1_ref = ExperienceMakerHolder.options(name="maker1", namespace=os.environ["RAY_NAMESPACE"], num_gpus=1, max_concurrency=2).remote( - detached_trainer_name_list=["trainer1", "trainer2"], - strategy=args.maker_strategy, - env_info=env_info_maker_1, - experience_batch_size=args.experience_batch_size, - kl_coef=0.1, - #kwargs: - max_length=128, - do_sample=True, - temperature=1.0, - top_k=50, - pad_token_id=tokenizer.pad_token_id, - eos_token_id=tokenizer.eos_token_id, - debug=args.debug, - ) - - experience_holder_2_ref = ExperienceMakerHolder.options(name="maker2", namespace=os.environ["RAY_NAMESPACE"], num_gpus=1, max_concurrency=2).remote( - detached_trainer_name_list=["trainer1", "trainer2"], - strategy=args.maker_strategy, - env_info=env_info_maker_2, - experience_batch_size=args.experience_batch_size, - kl_coef=0.1, - #kwargs: - max_length=128, - do_sample=True, - temperature=1.0, - top_k=50, - pad_token_id=tokenizer.pad_token_id, - eos_token_id=tokenizer.eos_token_id, - debug=args.debug, - ) - - # trainer send its actor and critic to experience holders. 
- # TODO: balance duty - ray.get(trainer_1_ref.initialize_remote_makers.remote()) - - # configure sampler - dataset = pd.read_csv(args.prompt_path)['prompt'] - - def tokenize_fn(texts): - # MUST padding to max length to ensure inputs of all ranks have the same length - # Different length may lead to hang when using gemini, as different generation steps - batch = tokenizer(texts, return_tensors='pt', max_length=96, padding='max_length', truncation=True) - return {k: v.cuda() for k, v in batch.items()} - - trainer_1_done_ref = trainer_1_ref.fit.remote(num_episodes=args.num_episodes, max_timesteps=args.max_timesteps, update_timesteps=args.update_timesteps) - trainer_2_done_ref = trainer_2_ref.fit.remote(num_episodes=args.num_episodes, max_timesteps=args.max_timesteps, update_timesteps=args.update_timesteps) - num_exp_per_maker = args.num_episodes * args.max_timesteps // args.update_timesteps * args.max_epochs + 3 # +3 for fault tolerance - maker_1_done_ref = experience_holder_1_ref.workingloop.remote(dataset, tokenize_fn, times=num_exp_per_maker) - maker_2_done_ref = experience_holder_2_ref.workingloop.remote(dataset, tokenize_fn, times=num_exp_per_maker) - - ray.get([trainer_1_done_ref, trainer_2_done_ref, maker_1_done_ref, maker_2_done_ref]) - # save model checkpoint after fitting - trainer_1_ref.strategy_save_actor.remote(args.save_path, only_rank0=True) - trainer_2_ref.strategy_save_actor.remote(args.save_path, only_rank0=True) - # save optimizer checkpoint on all ranks - if args.need_optim_ckpt: - trainer_1_ref.strategy_save_actor_optim.remote('actor_optim_checkpoint_prompts_%d.pt' % (torch.cuda.current_device()), - only_rank0=False) - trainer_2_ref.strategy_save_actor_optim.remote('actor_optim_checkpoint_prompts_%d.pt' % (torch.cuda.current_device()), - only_rank0=False) - - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument('prompt_path') - parser.add_argument('--trainer_strategy', - choices=['naive', 'ddp', 'colossalai_gemini', 'colossalai_zero2'], - default='naive') - parser.add_argument('--maker_strategy', - choices=['naive', 'ddp', 'colossalai_gemini', 'colossalai_zero2'], - default='naive') - parser.add_argument('--model', default='gpt2', choices=['gpt2', 'bloom', 'opt']) - parser.add_argument('--pretrain', type=str, default=None) - parser.add_argument('--save_path', type=str, default='actor_checkpoint_prompts.pt') - parser.add_argument('--need_optim_ckpt', type=bool, default=False) - parser.add_argument('--num_episodes', type=int, default=10) - parser.add_argument('--max_timesteps', type=int, default=10) - parser.add_argument('--update_timesteps', type=int, default=10) - parser.add_argument('--max_epochs', type=int, default=5) - parser.add_argument('--train_batch_size', type=int, default=8) - parser.add_argument('--experience_batch_size', type=int, default=8) - parser.add_argument('--lora_rank', type=int, default=0, help="low-rank adaptation matrices rank") - - parser.add_argument('--debug', action='store_true') - args = parser.parse_args() - main(args) diff --git a/applications/Chat/coati/ray/example/2m2t.sh b/applications/Chat/coati/ray/example/2m2t.sh deleted file mode 100644 index fb4024766..000000000 --- a/applications/Chat/coati/ray/example/2m2t.sh +++ /dev/null @@ -1,23 +0,0 @@ -set_n_least_used_CUDA_VISIBLE_DEVICES() { - local n=${1:-"9999"} - echo "GPU Memory Usage:" - local FIRST_N_GPU_IDS=$(nvidia-smi --query-gpu=memory.used --format=csv \ - | tail -n +2 \ - | nl -v 0 \ - | tee /dev/tty \ - | sort -g -k 2 \ - | awk '{print $1}' \ - | 
head -n $n)
-    export CUDA_VISIBLE_DEVICES=$(echo $FIRST_N_GPU_IDS | sed 's/ /,/g')
-    echo "Now CUDA_VISIBLE_DEVICES is set to:"
-    echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"
-}
-
-set_n_least_used_CUDA_VISIBLE_DEVICES 2
-
-export RAY_NAMESPACE="admin"
-
-python 2m2t.py "path/to/prompts.csv" \
-    --maker_strategy naive --trainer_strategy colossalai_zero2 --lora_rank 2 \
-    --num_episodes 10 --max_timesteps 10 --update_timesteps 10 \
-    --max_epochs 10 --debug
\ No newline at end of file
diff --git a/applications/Chat/coati/ray/experience_maker_holder.py b/applications/Chat/coati/ray/experience_maker_holder.py
new file mode 100644
index 000000000..8551ef1ea
--- /dev/null
+++ b/applications/Chat/coati/ray/experience_maker_holder.py
@@ -0,0 +1,271 @@
+import os
+import time
+import tracemalloc
+from copy import deepcopy
+from threading import Lock
+from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union
+
+import ray
+import torch
+import torch.nn as nn
+from coati.experience_maker import Experience, ExperienceMaker, NaiveExperienceMaker
+from coati.models.base import Actor, Critic, RewardModel
+from coati.replay_buffer.utils import BufferItem, make_experience_batch, split_experience_batch
+from coati.trainer.callbacks import Callback
+from coati.trainer.strategies import Strategy
+from coati.trainer.strategies.sampler import DistributedSampler
+from ray.exceptions import GetTimeoutError
+from torch import Tensor
+from tqdm import tqdm
+
+from .callbacks import ExperienceMakerPerformanceEvaluator, MakerCallback
+from .utils import (get_model_numel,
+                    get_rank,
+                    get_world_size,
+                    is_rank_0,
+                    set_dist_env,
+                    state_dict_to)
+from .lora_constructor import LoRAConstructor
+
+@ray.remote(concurrency_groups={"experience_io": 1, "model_io": 1, "compute": 1})
+class ExperienceMakerHolder:
+    '''
+    Args:
+        detached_trainer_name_list: names of the detached trainers, used to get their Ray actor handles
+        strategy_fn: a function that returns the Strategy used by this holder
+        model_fn: a function that returns (actor, critic, reward_model, initial_model)
+        kl_coef: the coefficient of the KL-divergence loss
+        sync_models_from_trainers: whether to sync models from trainers. If True, you must call sync_models_to_remote_makers() in trainers to sync models.
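+
+    Example (an illustrative sketch only; `make_strategy`, `build_models` and `make_dataloader`
+    are hypothetical user-provided helpers, not part of this module):
+
+        maker = ExperienceMakerHolder.options(
+            name="maker1", namespace=os.environ["RAY_NAMESPACE"], num_gpus=1, max_concurrency=2
+        ).remote(
+            detached_trainer_name_list=["trainer1"],
+            strategy_fn=make_strategy,    # () -> Strategy
+            model_fn=build_models,        # () -> (actor, critic, reward_model, initial_model)
+            kl_coef=0.1,
+        )
+        maker.workingloop.remote(make_dataloader, num_steps=1000)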
+ ''' + + def __init__( + self, + detached_trainer_name_list: List[str], + strategy_fn: Callable[[], Strategy], + # a function returns (actor, critic, reward_model, initial_model) + model_fn: Callable[[], Tuple[Actor, Critic, RewardModel, Actor]], + env_info: Dict[str, str] = None, + sync_models_from_trainers: bool = False, + buffer_cpu_offload: bool = True, + kl_coef: float = 0.1, + callbacks: List[MakerCallback] = [], + eval_performance: bool = False, + debug: bool = False, + update_lora_weights: bool = False, + **generate_kwargs): + # set environment variables + if env_info: + set_dist_env(env_info=env_info) + self.target_trainer_list = [] + assert len(detached_trainer_name_list) > 0 + self._detached_trainer_name_list = detached_trainer_name_list + self.strategy = strategy_fn() + self.buffer_cpu_offload = buffer_cpu_offload + self.kl_coef = kl_coef + # init models + with self.strategy.model_init_context(): + actor, critic, reward_model, initial_model = model_fn() + self.generate_kwargs = _set_default_generate_kwargs(generate_kwargs, actor) + if eval_performance: + actor_numel = get_model_numel(actor) + critic_numel = get_model_numel(critic) + initial_model_numel = get_model_numel(initial_model) + reward_model_numel = get_model_numel(reward_model) + evaluator = ExperienceMakerPerformanceEvaluator(actor_numel, critic_numel, initial_model_numel, + reward_model_numel) + callbacks = callbacks + [evaluator] + + actor, critic, reward_model, initial_model = self.strategy.prepare(actor, critic, reward_model, initial_model) + self.experience_maker = NaiveExperienceMaker(actor, critic, reward_model, initial_model, self.kl_coef) + self.callbacks = callbacks + + self._model_visit_lock = Lock() + + self._is_fully_initialized = not sync_models_from_trainers + + self._debug = debug + self._update_lora_weights = update_lora_weights + if self._update_lora_weights: + self.actor_lora_constructor = LoRAConstructor() + self.critic_lora_constructor = LoRAConstructor() + + self.target_auto_balance = False + + self._target_idx = 0 + + if self._debug: + print(f'[maker{get_rank()}] will send items to {self._detached_trainer_name_list}') + if not self._is_fully_initialized: + print(f'[maker{get_rank()}] Waiting for INIT') + + def _get_ready(self): + while not self._fully_initialized(): + time.sleep(1.0) + + def _fully_initialized(self): + return self._is_fully_initialized + + def _init_target_trainer_list(self): + if len(self.target_trainer_list) > 0: + return + for name in self._detached_trainer_name_list: + self.target_trainer_list.append(ray.get_actor(name, namespace=os.environ["RAY_NAMESPACE"])) + + # copy from ../trainer/base.py + @ray.method(concurrency_group="compute") + def _make_experience(self, inputs: Union[Tensor, Dict[str, Tensor]]) -> Experience: + if isinstance(inputs, Tensor): + return self.experience_maker.make_experience(inputs, **self.generate_kwargs) + elif isinstance(inputs, dict): + return self.experience_maker.make_experience(**inputs, **self.generate_kwargs) + else: + raise ValueError(f'Unsupported input type "{type(inputs)}"') + + @ray.method(concurrency_group="experience_io") + def _send_items(self, experience: Experience) -> None: + self._init_target_trainer_list() + items = split_experience_batch(experience) + items_per_trainer = [[] for _ in range(len(self.target_trainer_list))] + for item in items: + items_per_trainer[self._target_idx].append(item) + self._target_idx = (self._target_idx + 1) % len(self.target_trainer_list) + for i, target_trainer in 
enumerate(self.target_trainer_list):
+            if len(items_per_trainer[i]) > 0:
+                target_trainer.buffer_extend.remote(items_per_trainer[i])
+
+    def _inference_step(self, batch) -> None:
+        self._on_batch_start()
+        with self._model_visit_lock:
+            self._on_make_experience_start()
+            experience = self._make_experience(batch)
+            self._on_make_experience_end(experience)
+        self._on_send_start()
+        if self.buffer_cpu_offload:
+            experience.to_device('cpu')
+        self._send_items(experience)
+        self._on_send_end()
+        self._on_batch_end()
+
+    def workingloop(self, dataloader_fn: Callable[[], Iterable], num_epochs: int = 1, num_steps: int = 0):
+        """Working loop of the experience maker.
+
+        Args:
+            dataloader_fn (Callable[[], Iterable]): A function that returns a dataloader.
+            num_epochs (int, optional): Iterate the dataloader for this number of epochs. Defaults to 1.
+            num_steps (int, optional): Iterate the dataloader for this number of steps. If this value > 0, num_epochs is ignored. Defaults to 0.
+        """
+        self._get_ready()
+        self._on_loop_start()
+        dataloader = dataloader_fn()
+        if num_steps > 0:
+            # ignore num_epochs
+            it = iter(dataloader)
+            for _ in tqdm(range(num_steps), desc='ExperienceMaker', disable=not is_rank_0()):
+                try:
+                    batch = next(it)
+                except StopIteration:
+                    it = iter(dataloader)
+                    batch = next(it)
+                self._inference_step(batch)
+        else:
+            with tqdm(total=num_epochs * len(dataloader), desc='ExperienceMaker', disable=not is_rank_0()) as pbar:
+                for _ in range(num_epochs):
+                    for batch in dataloader:
+                        self._inference_step(batch)
+                        pbar.update()
+        self._on_loop_end()
+
+    @ray.method(concurrency_group="model_io")
+    def update_experience_maker(self,
+                                new_actor_state_dict: Dict[str, Any] = None,
+                                new_actor_lora_config_dict: Dict[str, Any] = None,
+                                new_critic_state_dict: Dict[str, Any] = None,
+                                new_critic_lora_config_dict: Dict[str, Any] = None,
+                                fully_update: bool = False,
+                                chunk_start: bool = None,
+                                chunk_end: bool = None):
+        '''
+        Called by the trainer.
+        chunk_start: set True on the first call, i.e. before any state_dict chunks are sent.
+        chunk_end: set True on the last call, i.e. after all state_dict chunks have been sent.
+        fully_update: set True to sync the full models when initializing.
+
+        TODO: integrate load_state_dict with the model-sharding strategy
+        '''
+        _watch_memory = self._debug
+        if chunk_start:
+            if self._debug:
+                print("[maker] UPDATE")
+            if _watch_memory:
+                tracemalloc.start()
+            self._model_visit_lock.acquire()
+
+        with torch.no_grad():
+            if new_actor_state_dict is not None:
+                if not self._update_lora_weights or fully_update:
+                    self.experience_maker.actor.model.load_state_dict(new_actor_state_dict, strict=False)
+                else:
+                    new_actor_state_dict = state_dict_to(new_actor_state_dict, device=torch.cuda.current_device())
+                    state_dict_increase = self.actor_lora_constructor.reconstruct_increase(new_actor_state_dict, new_actor_lora_config_dict)
+                    self.actor_lora_constructor.load_state_dict_increase(self.experience_maker.actor.model, state_dict_increase)
+            if new_critic_state_dict is not None:
+                if not self._update_lora_weights or fully_update:
+                    self.experience_maker.critic.load_state_dict(new_critic_state_dict, strict=False)
+                else:
+                    new_critic_state_dict = state_dict_to(new_critic_state_dict, device=torch.cuda.current_device())
+                    state_dict_increase = self.critic_lora_constructor.reconstruct_increase(new_critic_state_dict, new_critic_lora_config_dict)
+                    self.critic_lora_constructor.load_state_dict_increase(self.experience_maker.critic, state_dict_increase)
+
+        # the lock must be released only after both the actor and the critic have been updated
+        if chunk_end:
+            self._model_visit_lock.release()
+            if _watch_memory:
+                current, peak = tracemalloc.get_traced_memory()
+                print(f"Current memory usage is {current / 10**6}MB; Peak was {peak / 10**6}MB")
+                tracemalloc.stop()
+            if fully_update:
+                self._is_fully_initialized = True
+
+    def _on_make_experience_start(self) -> None:
+        for callback in self.callbacks:
+            callback.on_make_experience_start()
+
+    def _on_make_experience_end(self, experience: Experience) -> None:
+        for callback in self.callbacks:
+            callback.on_make_experience_end(experience)
+
+    def _on_loop_start(self) -> None:
+        for callback in self.callbacks:
+            callback.on_loop_start()
+
+    def _on_loop_end(self) -> None:
+        for callback in self.callbacks:
+            callback.on_loop_end()
+
+    def _on_send_start(self) -> None:
+        for callback in self.callbacks:
+            callback.on_send_start()
+
+    def _on_send_end(self) -> None:
+        for callback in self.callbacks:
+            callback.on_send_end()
+
+    def _on_batch_start(self) -> None:
+        for callback in self.callbacks:
+            callback.on_batch_start()
+
+    def _on_batch_end(self) -> None:
+        for callback in self.callbacks:
+            callback.on_batch_end()
+
+
+def _set_default_generate_kwargs(generate_kwargs: dict, actor: Actor) -> dict:
+    origin_model = actor.model
+    new_kwargs = {**generate_kwargs}
+    # use huggingface models method directly
+    if 'prepare_inputs_fn' not in generate_kwargs and hasattr(origin_model, 'prepare_inputs_for_generation'):
+        new_kwargs['prepare_inputs_fn'] = origin_model.prepare_inputs_for_generation
+
+    if 'update_model_kwargs_fn' not in generate_kwargs and hasattr(origin_model, '_update_model_kwargs_for_generation'):
+        new_kwargs['update_model_kwargs_fn'] = origin_model._update_model_kwargs_for_generation
+
+    return new_kwargs
diff --git a/applications/Chat/coati/ray/lora_constructor.py b/applications/Chat/coati/ray/lora_constructor.py
new file mode 100644
index 000000000..599a58248
--- /dev/null
+++ b/applications/Chat/coati/ray/lora_constructor.py
@@ -0,0 +1,122 @@
+from typing import Any, Callable, Dict, List, Optional
+from collections import OrderedDict
+from dataclasses import dataclass
+
+import torch
+import torch.nn as nn
+from loralib.layers import LoRALayer
+from coati.models.lora import LoraLinear
+
+
+@dataclass
+class LoRAConfig:
+    r: int = 0
+    lora_alpha: int = 1
+    lora_dropout: float = 0
+    fan_in_fan_out: bool = False
+
+
+class LoRAConstructor:
+    '''
+    Tools for reconstructing a model from a remote LoRA model.
+    (Transferring only the LoRA weights costs much less!)
+    Usage:
+        Step 1 (Sender):
+            filter_state_dict_lora()
+
+        Step 2 (Sender, Optional):
+            extract_lora_config()
+
+        Step 3 (Sender):
+            send state_dict_lora and lora_config_dict
+
+        Step 4 (Receiver):
+            reconstruct_increase()
+
+        Step 5 (Receiver):
+            load_state_dict_increase()
+
+    '''
+
+    def __init__(self):
+        self.lora_config_dict = None
+
+    def register_lora_config(self, lora_config_dict: Dict[str, Any]):
+        self.lora_config_dict = lora_config_dict
+
+    def reconstruct_increase(self, state_dict_lora: Dict[str, Any], lora_config_dict: Dict[str, Any]):
+        '''
+        xxx.lora_A, xxx.lora_B -->> xxx.weight
+        Warning: the xxx.weight produced here is only the increment, not the full weight.
+        '''
+        if lora_config_dict is not None:
+            self.register_lora_config(lora_config_dict)
+
+        state_dict_increase = OrderedDict()
+        config_iter = iter(self.lora_config_dict.items())
+        lora_A, lora_B, layer_prefix = None, None, None
+        for k, v in state_dict_lora.items():
+            if k.rpartition('.')[-1] == 'lora_A':
+                lora_A = v
+                layer_prefix = k.rpartition('.')[0]
+            elif k.rpartition('.')[-1] == 'lora_B':
+                assert layer_prefix == k.rpartition('.')[0], "unmatched (lora_A, lora_B) pair"
+                layer_prefix_2, config = next(config_iter)
+                assert layer_prefix_2 == layer_prefix, "unmatched (state_dict, config_dict) pair"
+                lora_B = v
+                weight_data_increase = self._compute(lora_A, lora_B, config)
+                state_dict_increase[layer_prefix + '.weight'] = weight_data_increase
+                lora_A, lora_B, layer_prefix = None, None, None
+            else:
+                raise ValueError('unexpected key')
+        return state_dict_increase
+
+    def _compute(self, lora_A, lora_B, config=LoRAConfig()):
+        def T(w):
+            return w.T if config.fan_in_fan_out else w
+        if config.r > 0:
+            scaling = config.lora_alpha / config.r
+            weight_data_increase = T(lora_B @ lora_A) * scaling
+            return weight_data_increase
+        return 0
+
+    def load_state_dict_increase(self, model: nn.Module, state_dict_increase: Dict[str, Any]):
+        '''
+        The final reconstruction step
+        '''
+        # naive approach
+        model.load_state_dict({k: v + model.state_dict()[k] for k, v in state_dict_increase.items()}, strict=False)
+
+    @staticmethod
+    def filter_state_dict_lora(state_dict: Dict[str, Any], keep_non_lora=False):
+        '''
+        if keep_non_lora, also return the non-LoRA state_dict
+        '''
+        state_dict_lora = OrderedDict()
+        state_dict_non_lora = OrderedDict()
+        for k, v in state_dict.items():
+            if 'lora_A' in k or 'lora_B' in k:
+                state_dict_lora[k] = v
+            elif keep_non_lora:
+                state_dict_non_lora[k] = v
+        if keep_non_lora:
+            return state_dict_lora, state_dict_non_lora
+        else:
+            return state_dict_lora, None
+
+    @staticmethod
+    def extract_lora_config(model: nn.Module) -> Dict[str, LoRAConfig]:
+        '''
+        Extract a LoRAConfig from every LoraLinear module in the model.
+ return OrderedDict(): name -> LoRAConfig + ''' + lora_config_dict = OrderedDict() + + for name, child in model.named_modules(): + if isinstance(child, LoraLinear): + lora_config_dict[name] = LoRAConfig(r=child.r, + lora_alpha=child.lora_alpha, + lora_dropout=child.lora_dropout, + fan_in_fan_out=child.fan_in_fan_out) + + return lora_config_dict diff --git a/applications/Chat/coati/ray/src/__init__.py b/applications/Chat/coati/ray/src/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/applications/Chat/coati/ray/src/detached_trainer_base.py b/applications/Chat/coati/ray/src/detached_trainer_base.py deleted file mode 100644 index f1ed1ec71..000000000 --- a/applications/Chat/coati/ray/src/detached_trainer_base.py +++ /dev/null @@ -1,121 +0,0 @@ -from abc import ABC, abstractmethod -from typing import Any, Callable, Dict, List, Optional, Union -from tqdm import tqdm -from coati.trainer.callbacks import Callback -from coati.experience_maker import Experience -import ray -import os - -from .detached_replay_buffer import DetachedReplayBuffer -from .utils import is_rank_0 - -class DetachedTrainer(ABC): - ''' - Base class for detached rlhf trainers. - 'detach' means that the experience maker is detached compared to a normal Trainer. - Please set name attribute during init: - >>> trainer = DetachedTrainer.options(..., name = "xxx", ...).remote() - So an ExperienceMakerHolder can reach the detached_replay_buffer by Actor's name. - Args: - detached_strategy (DetachedStrategy): the strategy to use for training - detached_replay_buffer_ref (ObjectRef[DetachedReplayBuffer]): the replay buffer to use for training - experience_batch_size (int, defaults to 8): the batch size to use for experience generation - max_epochs (int, defaults to 1): the number of epochs of training process - data_loader_pin_memory (bool, defaults to True): whether to pin memory for data loader - callbacks (List[Callback], defaults to []): the callbacks to call during training process - generate_kwargs (dict, optional): the kwargs to use while model generating - ''' - - def __init__(self, - experience_maker_holder_name_list: List[str], - train_batch_size: int = 8, - buffer_limit: int = 0, - buffer_cpu_offload: bool = True, - experience_batch_size: int = 8, - max_epochs: int = 1, - dataloader_pin_memory: bool = True, - callbacks: List[Callback] = [], - **generate_kwargs) -> None: - super().__init__() - self.detached_replay_buffer = DetachedReplayBuffer(train_batch_size, limit=buffer_limit, cpu_offload=buffer_cpu_offload) - self.experience_batch_size = experience_batch_size - self.max_epochs = max_epochs - self.dataloader_pin_memory = dataloader_pin_memory - self.callbacks = callbacks - self.generate_kwargs = generate_kwargs - self.target_holder_name_list = experience_maker_holder_name_list - self.target_holder_list = [] - - def update_target_holder_list(self, experience_maker_holder_name_list): - self.target_holder_name_list = experience_maker_holder_name_list - self.target_holder_list = [] - for name in self.target_holder_name_list: - self.target_holder_list.append(ray.get_actor(name, namespace=os.environ["RAY_NAMESPACE"])) - - @abstractmethod - def _update_remote_makers(self): - pass - - @abstractmethod - def training_step(self, experience: Experience) -> Dict[str, Any]: - pass - - def _learn(self): - pbar = tqdm(range(self.max_epochs), desc='Train epoch', disable=not is_rank_0()) - for _ in pbar: - if 'debug' in self.generate_kwargs and self.generate_kwargs['debug'] == True: - print("[trainer] sampling 
exp") - experience = self._buffer_sample() - if 'debug' in self.generate_kwargs and self.generate_kwargs['debug'] == True: - print("[trainer] training step") - metrics = self.training_step(experience) - if 'debug' in self.generate_kwargs and self.generate_kwargs['debug'] == True: - print("[trainer] step over") - pbar.set_postfix(metrics) - - def fit(self, num_episodes: int = 50000, max_timesteps: int = 500, update_timesteps: int = 5000) -> None: - self._on_fit_start() - for episode in range(num_episodes): - self._on_episode_start(episode) - for timestep in tqdm(range(max_timesteps // update_timesteps), - desc=f'Episode [{episode+1}/{num_episodes}]', - disable=not is_rank_0()): - self._learn() - self._update_remote_makers() - self._on_episode_end(episode) - self._on_fit_end() - - @ray.method(concurrency_group="buffer_length") - def buffer_get_length(self): - # called by ExperienceMakerHolder - if 'debug' in self.generate_kwargs and self.generate_kwargs['debug'] == True: - print("[trainer] telling length") - return self.detached_replay_buffer.get_length() - - @ray.method(concurrency_group="buffer_append") - def buffer_append(self, experience: Experience): - # called by ExperienceMakerHolder - if 'debug' in self.generate_kwargs and self.generate_kwargs['debug'] == True: - # print(f"[trainer] receiving exp. Current buffer length: {self.detached_replay_buffer.get_length()}") - print(f"[trainer] receiving exp.") - self.detached_replay_buffer.append(experience) - - @ray.method(concurrency_group="buffer_sample") - def _buffer_sample(self): - return self.detached_replay_buffer.sample() - - def _on_fit_start(self) -> None: - for callback in self.callbacks: - callback.on_fit_start() - - def _on_fit_end(self) -> None: - for callback in self.callbacks: - callback.on_fit_end() - - def _on_episode_start(self, episode: int) -> None: - for callback in self.callbacks: - callback.on_episode_start(episode) - - def _on_episode_end(self, episode: int) -> None: - for callback in self.callbacks: - callback.on_episode_end(episode) diff --git a/applications/Chat/coati/ray/src/experience_maker_holder.py b/applications/Chat/coati/ray/src/experience_maker_holder.py deleted file mode 100644 index 0ae4e3125..000000000 --- a/applications/Chat/coati/ray/src/experience_maker_holder.py +++ /dev/null @@ -1,172 +0,0 @@ -import torch -from typing import Any, Callable, Dict, List, Optional, Union -import ray -from ray.exceptions import GetTimeoutError -from torch import Tensor -import torch.nn as nn -from coati.models.base import Actor, Critic, RewardModel -from coati.trainer.strategies.sampler import DistributedSampler -from coati.trainer.strategies import Strategy -from coati.experience_maker import NaiveExperienceMaker, Experience, ExperienceMaker - -from copy import deepcopy -from threading import Lock -import time -import os - - -from .utils import is_rank_0, get_strategy_from_args, set_dist_env - - -@ray.remote(concurrency_groups={"experience_io": 1, "model_io": 1, "compute": 1}) -class ExperienceMakerHolder: - ''' - Args: - detached_trainer_name_list: str list to get ray actor handles - strategy: - experience_batch_size: batch size of generated experience - kl_coef: the coefficient of kl divergence loss - ''' - - def __init__(self, - detached_trainer_name_list: List[str], - strategy: str, - env_info: Dict[str, str] = None, - experience_batch_size: int = 8, - kl_coef: float = 0.1, - **generate_kwargs): - # set environment variables - if env_info: - set_dist_env(env_info=env_info) - self.target_trainer_list = [] - for name 
in detached_trainer_name_list: - self.target_trainer_list.append(ray.get_actor(name, namespace=os.environ["RAY_NAMESPACE"])) - self.strategy_str = strategy - self.strategy = get_strategy_from_args(strategy) - self.experience_batch_size = experience_batch_size - self.kl_coef = kl_coef - self.generate_kwargs = generate_kwargs - # Need a trainer to give an actor and a critic via initialize_experience_maker(...) - actor, critic, reward_model, initial_model = None, None, None, None - self.experience_maker = NaiveExperienceMaker(actor, critic, reward_model, initial_model, self.kl_coef) - self._model_visit_lock = Lock() - self.fully_initialized = False - if 'debug' in self.generate_kwargs and self.generate_kwargs['debug'] == True: - print('[maker] Waiting for INIT') - - def _get_ready(self): - while not self.fully_initialized: - time.sleep(1.0) - - def update_target_trainer_list(self, detached_trainer_name_list): - self.target_trainer_list = [] - for name in detached_trainer_name_list: - self.target_trainer_list.append(ray.get_actor(name)) - - # copy from ../trainer/base.py - @ray.method(concurrency_group="compute") - def _make_experience(self, inputs: Union[Tensor, Dict[str, Tensor]]) -> Experience: - self._get_ready() - if isinstance(inputs, Tensor): - return self.experience_maker.make_experience(inputs, **self.generate_kwargs) - elif isinstance(inputs, dict): - return self.experience_maker.make_experience(**inputs, **self.generate_kwargs) - else: - raise ValueError(f'Unsupported input type "{type(inputs)}"') - - @ray.method(concurrency_group="experience_io") - def _send_experience(self, experience): - ''' - ignore it - - # choose a trainer that has the least experience batch in its detached_replay_buffer - chosen_trainer = None - min_length = None - if 'debug' in self.generate_kwargs and self.generate_kwargs['debug'] == True: - print("[maker] choosing target trainer") - while chosen_trainer is None: - for target_trainer in self.target_trainer_list: - try: - temp_length = ray.get(target_trainer.buffer_get_length.remote(), timeout=0.1) - if min_length is None: - min_length = temp_length - chosen_trainer = target_trainer - else: - if temp_length < min_length: - min_length = temp_length - chosen_trainer = target_trainer - except GetTimeoutError: - pass - - if 'debug' in self.generate_kwargs and self.generate_kwargs['debug'] == True: - print(f"[maker] sending exp to {chosen_trainer}") - chosen_trainer.buffer_append.remote(experience) - ''' - # - if not hasattr(self, "_target_idx"): - self._target_idx = 0 - chosen_trainer = self.target_trainer_list[self._target_idx] - if 'debug' in self.generate_kwargs and self.generate_kwargs['debug'] == True: - print(f"[maker] sending exp to {chosen_trainer}") - chosen_trainer.buffer_append.remote(experience) - self._target_idx = (self._target_idx + 1) % len(self.target_trainer_list) - - def workingloop(self, dataset, tokenizer: Optional[Callable[[Any], dict]] = None, times=5000 * 50000): - self._get_ready() - sampler = self.strategy.setup_sampler(dataset) - for _ in range(times): - rand_prompts = sampler.sample(self.experience_batch_size) - if tokenizer is not None: - inputs = tokenizer(rand_prompts) - else: - inputs = rand_prompts - self._model_visit_lock.acquire() - experience = self._make_experience(inputs=inputs) - self._model_visit_lock.release() - self._send_experience(experience=experience) - - @ray.method(concurrency_group="model_io") - def initialize_experience_maker(self, init_actor: Actor, init_critic: Critic): - ''' - called by trainer. Only once. 
- ''' - # TODO: reduce malloc - if self.fully_initialized: - return - if 'debug' in self.generate_kwargs and self.generate_kwargs['debug'] == True: - print('[maker] INIT') - with torch.no_grad(): - with self.strategy.model_init_context(): - actor = init_actor - critic = init_critic - initial_model = deepcopy(actor) - reward_model = RewardModel(deepcopy(critic.model), - deepcopy(critic.value_head)).to(torch.cuda.current_device()) - if self.strategy_str != 'colossalai_gemini': - actor.to(torch.float16).to(torch.cuda.current_device()) - critic.to(torch.float16).to(torch.cuda.current_device()) - initial_model.to(torch.float16).to(torch.cuda.current_device()) - reward_model.to(torch.float16).to(torch.cuda.current_device()) - - self.experience_maker.actor = self.strategy.prepare(actor) - self.experience_maker.critic = self.strategy.prepare(critic) - self.experience_maker.initial_model = self.strategy.prepare(initial_model) - self.experience_maker.reward_model = self.strategy.prepare(reward_model) - self.fully_initialized = True - - @ray.method(concurrency_group="model_io") - def update_experience_maker(self, new_actor: Actor, new_critic: Critic): - ''' - called by trainer - ''' - # TODO: reduce malloc - self._model_visit_lock.acquire() - with torch.no_grad(): - if 'debug' in self.generate_kwargs and self.generate_kwargs['debug'] == True: - print("[maker] UPDATE ") - if self.strategy_str != 'colossalai_gemini': - new_actor.to(torch.float16).to(torch.cuda.current_device()) - new_critic.to(torch.float16).to(torch.cuda.current_device()) - self.experience_maker.actor = self.strategy.prepare(new_actor) - self.experience_maker.critic = self.strategy.prepare(new_critic) - self._model_visit_lock.release() diff --git a/applications/Chat/coati/ray/src/pipeline_strategy.py b/applications/Chat/coati/ray/src/pipeline_strategy.py deleted file mode 100644 index 7ecb5d7d8..000000000 --- a/applications/Chat/coati/ray/src/pipeline_strategy.py +++ /dev/null @@ -1,105 +0,0 @@ -# WIP - - -from coati.trainer.strategies import Strategy -from coati.trainer.strategies import NaiveStrategy -from coati.models.base import Actor, RewardModel, Critic - -import numpy as np -import torch -from torch._C._distributed_rpc import _is_current_rpc_agent_set - -import colossalai -from colossalai.pipeline.pipeline_process_group import ppg -from colossalai.pipeline.rpc._pipeline_schedule import OneFOneBPipelineEngine -from colossalai.fx import ColoTracer -from colossalai.fx.passes.adding_split_node_pass import balanced_split_pass, split_with_split_nodes_pass -from colossalai.pipeline.middleware.adaptor import get_fx_topology - - -import os -from functools import partial -import random - -rpc_is_initialized = _is_current_rpc_agent_set - -class PipelineModel(torch.nn.Module): - ''' - Actor has 2 kinds of jobs: forward and generate. 
- better to just pipeline the inner model - ''' - def __init__(self, - model: torch.nn.Module, - stage_num: int, - num_microbatches: int, - data_kwargs = None, - ): - super().__init__() - # create partition module - def create_partition_module(pp_rank:int, stage_num: int, model, data_kwargs): - model.eval() - tracer = ColoTracer() - meta_args = {k: v.to('meta') for k, v in data_kwargs.items()} - graph = tracer.trace(root=model, meta_args=meta_args) - gm = torch.fx.GraphModule(model, graph, model.__class__.__name__) - annotated_model = balanced_split_pass(gm, stage_num) - top_module, split_submodules = split_with_split_nodes_pass(annotated_model, merge_output=True) - topo = get_fx_topology(top_module) - for submodule in split_submodules: - if isinstance(submodule, torch.fx.GraphModule): - setattr(submodule, '_topo', topo) - return split_submodules[pp_rank + 1] - - def partition(model, data_kwargs: dict, pp_rank: int, chunk: int, stage_num: int): - partition = create_partition_module(pp_rank, stage_num, model, data_kwargs) - return partition - self.inference_engine = OneFOneBPipelineEngine( - partition_fn=partial(partition, model, data_kwargs), - stage_num=stage_num, - num_microbatches=num_microbatches, - device='cuda', - ) - - def forward(self, - **model_inputs): - return self.inference_engine.forward_backward(**model_inputs, forward_only=True) - - - -class PPStrategy(NaiveStrategy): - """ - Strategy for Pipeline inference (inference only!) - - master node only - """ - def __init__( - self, - seed: int = 42 - ): - self.seed = seed - super().__init__() - - - def setup_distributed(self) -> None: - colossalai.launch_from_torch({}, seed=self.seed) - ppg.set_global_info(rank = int(os.environ['RANK']), - world_size=int(os.environ['WORLD_SIZE']), - dp_degree=1, - tp_degree=1, - num_worker_threads=128, - device="cuda") - - def model_init_context(self): - return super().model_init_context() - - def setup_model(self, model: torch.nn.Module) -> torch.nn.Module: - if isinstance(model, Actor) or \ - isinstance(model, RewardModel) or \ - isinstance(model, Critic): - model.model = PipelineModel(model.model) - - def set_seed(self, seed: int) -> None: - random.seed(seed) - np.random.seed(seed) - torch.manual_seed(seed) - diff --git a/applications/Chat/coati/ray/src/utils.py b/applications/Chat/coati/ray/src/utils.py deleted file mode 100644 index c750879b6..000000000 --- a/applications/Chat/coati/ray/src/utils.py +++ /dev/null @@ -1,48 +0,0 @@ -import torch.distributed as dist -from typing import Any, Callable, Dict, List, Optional -from coati.models.bloom import BLOOMActor, BLOOMCritic -from coati.models.gpt import GPTActor, GPTCritic -from coati.models.opt import OPTActor, OPTCritic -from coati.trainer.strategies import ColossalAIStrategy, DDPStrategy, NaiveStrategy -import torch -import os - -def is_rank_0() -> bool: - return not dist.is_initialized() or dist.get_rank() == 0 - - -def get_cuda_actor_critic_from_args(model: str, pretrained: str = None, lora_rank=0): - if model == 'gpt2': - actor = GPTActor(pretrained=pretrained, lora_rank=lora_rank).to(torch.cuda.current_device()) - critic = GPTCritic(pretrained=pretrained, lora_rank=lora_rank).to(torch.cuda.current_device()) - elif model == 'bloom': - actor = BLOOMActor(pretrained=pretrained, lora_rank=lora_rank).to(torch.cuda.current_device()) - critic = BLOOMCritic(pretrained=pretrained, lora_rank=lora_rank).to(torch.cuda.current_device()) - elif model == 'opt': - actor = OPTActor(pretrained=pretrained, 
lora_rank=lora_rank).to(torch.cuda.current_device()) - critic = OPTCritic(pretrained=pretrained, lora_rank=lora_rank).to(torch.cuda.current_device()) - else: - raise ValueError(f'Unsupported model "{model}"') - return actor, critic - - -def get_strategy_from_args(strategy: str): - if strategy == 'naive': - strategy_ = NaiveStrategy() - elif strategy == 'ddp': - strategy_ = DDPStrategy() - elif strategy == 'colossalai_gemini': - strategy_ = ColossalAIStrategy(stage=3, placement_policy='cuda', initial_scale=2**5) - elif strategy == 'colossalai_zero2': - strategy_ = ColossalAIStrategy(stage=2, placement_policy='cuda') - else: - raise ValueError(f'Unsupported strategy "{strategy}"') - return strategy_ - - -def set_dist_env(env_info: Dict[str, str]): - os.environ["RANK"] = env_info['rank'] - os.environ["LOCAL_RANK"] = env_info['local_rank'] - os.environ["WORLD_SIZE"] = env_info['world_size'] - os.environ['MASTER_PORT'] = env_info['master_port'] - os.environ['MASTER_ADDR'] = env_info['master_addr'] diff --git a/applications/Chat/coati/ray/utils.py b/applications/Chat/coati/ray/utils.py new file mode 100644 index 000000000..4361ee236 --- /dev/null +++ b/applications/Chat/coati/ray/utils.py @@ -0,0 +1,152 @@ +import os +from typing import Any, Callable, Dict, List, Optional +from collections import OrderedDict + +import torch +import torch.distributed as dist +import torch.nn as nn +from coati.models.bloom import BLOOMRM, BLOOMActor, BLOOMCritic +from coati.models.gpt import GPTRM, GPTActor, GPTCritic +from coati.models.llama import LlamaActor, LlamaCritic, LlamaRM +from coati.models.opt import OPTRM, OPTActor, OPTCritic +from coati.models.roberta import RoBERTaActor, RoBERTaCritic, RoBERTaRM +from coati.trainer.strategies import ColossalAIStrategy, DDPStrategy, NaiveStrategy +from coati.utils import prepare_llama_tokenizer_and_embedding +from transformers import AutoTokenizer, BloomTokenizerFast, GPT2Tokenizer, LlamaTokenizer, RobertaTokenizer + + +def is_rank_0() -> bool: + return not dist.is_initialized() or dist.get_rank() == 0 + + +def get_rank() -> int: + return dist.get_rank() if dist.is_initialized() else 0 + + +def get_world_size() -> int: + return dist.get_world_size() if dist.is_initialized() else 1 + + +def get_actor_from_args(model: str, pretrained: str = None, config=None, lora_rank=0): + if model == 'gpt2': + actor = GPTActor(pretrained=pretrained, config=config, lora_rank=lora_rank) + elif model == 'bloom': + actor = BLOOMActor(pretrained=pretrained, config=config, lora_rank=lora_rank) + elif model == 'opt': + actor = OPTActor(pretrained=pretrained, config=config, lora_rank=lora_rank) + elif model == 'llama': + actor = LlamaActor(pretrained=pretrained, config=config, lora_rank=lora_rank) + elif model == 'roberta': + actor = RoBERTaActor(pretrained=pretrained, config=config, lora_rank=lora_rank) + else: + raise ValueError(f'Unsupported actor model "{model}"') + return actor + + +def get_critic_from_args(model: str, pretrained: str = None, config=None, lora_rank=0): + if model == 'gpt2': + critic = GPTCritic(pretrained=pretrained, lora_rank=lora_rank, config=config, use_action_mask=True) + elif model == 'bloom': + critic = BLOOMCritic(pretrained=pretrained, lora_rank=lora_rank, config=config, use_action_mask=True) + elif model == 'opt': + critic = OPTCritic(pretrained=pretrained, lora_rank=lora_rank, config=config, use_action_mask=True) + elif model == 'llama': + critic = LlamaCritic(pretrained=pretrained, lora_rank=lora_rank, config=config, use_action_mask=True) + elif model == 
'roberta': + critic = RoBERTaCritic(pretrained=pretrained, lora_rank=lora_rank, config=config, use_action_mask=True) + else: + raise ValueError(f'Unsupported reward model "{model}"') + return critic + + +def get_reward_model_from_args(model: str, pretrained: str = None, config=None): + if model == 'gpt2': + reward_model = GPTRM(pretrained=pretrained, config=config) + elif model == 'bloom': + reward_model = BLOOMRM(pretrained=pretrained, config=config) + elif model == 'opt': + reward_model = OPTRM(pretrained=pretrained, config=config) + elif model == 'llama': + reward_model = LlamaRM(pretrained=pretrained, config=config) + elif model == 'roberta': + reward_model = RoBERTaRM(pretrained=pretrained, config=config) + else: + raise ValueError(f'Unsupported reward model "{model}"') + return reward_model + + +def get_strategy_from_args(strategy: str): + if strategy == 'naive': + strategy_ = NaiveStrategy() + elif strategy == 'ddp': + strategy_ = DDPStrategy() + elif strategy == 'colossalai_gemini': + strategy_ = ColossalAIStrategy(stage=3, placement_policy='cuda', initial_scale=2**5) + elif strategy == 'colossalai_zero2': + strategy_ = ColossalAIStrategy(stage=2, placement_policy='cuda') + elif strategy == 'colossalai_gemini_cpu': + strategy_ = ColossalAIStrategy(stage=3, placement_policy='cpu', initial_scale=2**5) + elif strategy == 'colossalai_zero2_cpu': + strategy_ = ColossalAIStrategy(stage=2, placement_policy='cpu') + else: + raise ValueError(f'Unsupported strategy "{strategy}"') + return strategy_ + + +def get_tokenizer_from_args(model: str, **kwargs): + if model == 'gpt2': + tokenizer = GPT2Tokenizer.from_pretrained('gpt2') + elif model == 'bloom': + tokenizer = BloomTokenizerFast.from_pretrained('bigscience/bloom-560m') + elif model == 'opt': + tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m") + elif model == 'llama': + pretrain_path = kwargs["pretrain"] + tokenizer = AutoTokenizer.from_pretrained(pretrain_path) + elif model == 'roberta': + tokenizer = RobertaTokenizer.from_pretrained("roberta-base") + else: + raise ValueError(f'Unsupported model "{model}"') + + tokenizer.pad_token = tokenizer.eos_token + return tokenizer + + +def set_dist_env(env_info: Dict[str, str]): + os.environ["RANK"] = env_info['rank'] + os.environ["LOCAL_RANK"] = env_info['local_rank'] + os.environ["WORLD_SIZE"] = env_info['world_size'] + os.environ['MASTER_PORT'] = env_info['master_port'] + os.environ['MASTER_ADDR'] = env_info['master_addr'] + + +def get_model_numel(model: nn.Module) -> int: + numel = sum(p.numel() for p in model.parameters()) + return numel + + +def get_receivers_per_sender(sender_idx: int, num_senders: int, num_receivers: int, allow_idle_sender: bool) -> list: + target_receivers = [] + if num_senders <= num_receivers or allow_idle_sender: + # a sender will send data to one or more than one receivers + # a receiver only has one sender + for i in range(num_receivers): + if i % num_senders == sender_idx: + target_receivers.append(i) + else: + # a sender will send data to one receiver + # a receiver may have more than one sender + target_receivers.append(sender_idx % num_receivers) + return target_receivers + + +def state_dict_to(state_dict: Dict[str, Any], + dtype: torch.dtype = torch.float16, + device: torch.device = torch.device('cpu')): + ''' + keep state_dict intact + ''' + new_state_dict = OrderedDict() + for k, v in state_dict.items(): + new_state_dict[k] = v.to(dtype=dtype, device=device) + return new_state_dict diff --git a/applications/Chat/coati/trainer/strategies/base.py 
b/applications/Chat/coati/trainer/strategies/base.py index b14528691..bd3042202 100644 --- a/applications/Chat/coati/trainer/strategies/base.py +++ b/applications/Chat/coati/trainer/strategies/base.py @@ -130,3 +130,7 @@ class Strategy(ABC): only_rank0: bool = True, tokenizer: Optional[PreTrainedTokenizerBase] = None) -> None: pass + + @abstractmethod + def get_model_state_dict_shard(self, model: nn.Module, **config): + pass \ No newline at end of file diff --git a/applications/Chat/coati/trainer/strategies/colossalai.py b/applications/Chat/coati/trainer/strategies/colossalai.py index 8aa302c77..88268b677 100644 --- a/applications/Chat/coati/trainer/strategies/colossalai.py +++ b/applications/Chat/coati/trainer/strategies/colossalai.py @@ -186,3 +186,15 @@ class ColossalAIStrategy(DDPStrategy): if self.stage == 3: raise RuntimeError('ColossalAI strategy with stage-3 does not support save_pretrained() now') super().save_pretrained(model, path, only_rank0, tokenizer) + + def get_model_state_dict_shard(self, model: nn.Module, **config): + if self.stage != 3: + yield from super().get_model_state_dict_shard(model, **config) + else: + # unwrapped_model = self._unwrap_model(model) + # for module in unwrapped_model.modules(): + # if isinstance(module, LoraLinear): + # module.merge_weights = True + # module.eval() + base_model: ZeroDDP = get_base_model(model) + yield from base_model.state_dict_shard(max_shard_size=1024, only_rank_0=False) diff --git a/applications/Chat/coati/trainer/strategies/ddp.py b/applications/Chat/coati/trainer/strategies/ddp.py index 7910b5787..a1fecb363 100644 --- a/applications/Chat/coati/trainer/strategies/ddp.py +++ b/applications/Chat/coati/trainer/strategies/ddp.py @@ -26,19 +26,8 @@ class DDPStrategy(NaiveStrategy): super().__init__() def setup_distributed(self) -> None: - try: - rank = int(os.environ['RANK']) - local_rank = int(os.environ['LOCAL_RANK']) - world_size = int(os.environ['WORLD_SIZE']) - host = os.environ['MASTER_ADDR'] - port = int(os.environ['MASTER_PORT']) - except KeyError as e: - raise RuntimeError( - f"Could not find {e} in the torch environment, visit https://www.colossalai.org/ for more information on launching with torch" - ) - dist.init_process_group('nccl', init_method=f'tcp://[{host}]:{port}', world_size=world_size, rank=rank) + self._try_init_dist(force=True) self.set_seed(self.seed) - torch.cuda.set_device(local_rank) def set_seed(self, seed: int) -> None: random.seed(seed) diff --git a/applications/Chat/coati/trainer/strategies/naive.py b/applications/Chat/coati/trainer/strategies/naive.py index 4d94026ce..972deebea 100644 --- a/applications/Chat/coati/trainer/strategies/naive.py +++ b/applications/Chat/coati/trainer/strategies/naive.py @@ -1,10 +1,17 @@ -from typing import Any, Optional +import os +import sys +from collections import OrderedDict +from typing import Any, Dict, Optional import torch +import torch.distributed as dist import torch.nn as nn import torch.optim as optim from coati.models.base import get_base_model from coati.replay_buffer import ReplayBuffer +from coati.models.base import RewardModel +from coati.models.lora import LoraLinear +from coati.replay_buffer import ReplayBuffer from torch.optim import Optimizer from torch.utils.data import DataLoader from transformers.modeling_utils import PreTrainedModel @@ -13,6 +20,15 @@ from transformers.tokenization_utils_base import PreTrainedTokenizerBase from .base import Strategy +# TODO Move this to a util.py (Moving to ray.util introduces ringed import) +def 
get_grad_required_state_dict(model: nn.Module): + state_dict = OrderedDict() + for name, parameter in model.named_parameters(): + if parameter.requires_grad: + state_dict[name] = parameter.detach() + return state_dict + + class NaiveStrategy(Strategy): """ Strategy for single GPU. No parallelism is used. @@ -25,7 +41,7 @@ class NaiveStrategy(Strategy): optimizer.step() def setup_distributed(self) -> None: - pass + self._try_init_dist(force=False) def setup_model(self, model: nn.Module) -> nn.Module: return model @@ -68,3 +84,45 @@ class NaiveStrategy(Strategy): unwrapped_model.save_pretrained(path) if tokenizer is not None: tokenizer.save_pretrained(path) + + def get_model_state_dict_shard(self, model: nn.Module, **config): + # TODO: implement sharding on naive strategy + model = self.unwrap_model(model) + if 'requires_grad_only' in config and config['requires_grad_only'] == True: + state_dict = get_grad_required_state_dict(model) + else: + state_dict = model.state_dict() + + if 'shard_size' in config: + shard_size = config['shard_size'] + accumulate_size = 0 + state_dict_shard = OrderedDict() + for name, param in state_dict.items(): + state_dict_shard[name] = param + accumulate_size += param.numel() * param.element_size() + if accumulate_size >= shard_size: + accumulate_size = 0 + yield state_dict_shard + state_dict_shard = OrderedDict() + if accumulate_size > 0: + yield state_dict_shard + else: + yield state_dict + + def _try_init_dist(self, force: bool = False) -> None: + try: + rank = int(os.environ['RANK']) + local_rank = int(os.environ['LOCAL_RANK']) + world_size = int(os.environ['WORLD_SIZE']) + host = os.environ['MASTER_ADDR'] + port = int(os.environ['MASTER_PORT']) + dist.init_process_group('nccl', init_method=f'tcp://[{host}]:{port}', world_size=world_size, rank=rank) + torch.cuda.set_device(local_rank) + except KeyError as e: + if force: + raise RuntimeError( + f"Could not find {e} in the torch environment, visit https://www.colossalai.org/ for more information on launching with torch" + ) + except Exception as e: + if force: + raise e diff --git a/applications/Chat/coati/trainer/strategies/sampler.py b/applications/Chat/coati/trainer/strategies/sampler.py index d726fa640..65e199dbf 100644 --- a/applications/Chat/coati/trainer/strategies/sampler.py +++ b/applications/Chat/coati/trainer/strategies/sampler.py @@ -27,6 +27,7 @@ class DistributedSampler: assert len(indices) == self.num_samples self.indices = indices + def sample(self, batch_size: int) -> list: sampled_indices = np.random.choice(self.indices, batch_size, replace=False) return [self.dataset[idx] for idx in sampled_indices] diff --git a/applications/Chat/examples/ray/1mmt_prompt.py b/applications/Chat/examples/ray/1mmt_prompt.py new file mode 100644 index 000000000..afdd6a922 --- /dev/null +++ b/applications/Chat/examples/ray/1mmt_prompt.py @@ -0,0 +1,175 @@ +import argparse +import os +import socket +from functools import partial + +import pandas as pd +import ray +import torch +from coati.quant import llama_load_quant, low_resource_init +from coati.ray.detached_trainer_ppo import DetachedPPOTrainer +from coati.ray.experience_maker_holder import ExperienceMakerHolder +from coati.ray.utils import ( + get_actor_from_args, + get_critic_from_args, + get_reward_model_from_args, + get_strategy_from_args, + get_tokenizer_from_args, +) +from torch.utils.data import DataLoader +from transformers import AutoConfig +from transformers.modeling_utils import no_init_weights + + +def get_free_port(): + with 
socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + s.bind(('', 0)) + return s.getsockname()[1] + + +def get_local_ip(): + with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as s: + s.connect(('8.8.8.8', 80)) + return s.getsockname()[0] + + +def main(args): + master_addr = str(get_local_ip()) + # trainer_env_info + trainer_port = str(get_free_port()) + env_info_trainers = [{ + 'local_rank': '0', + 'rank': str(rank), + 'world_size': str(args.num_trainers), + 'master_port': trainer_port, + 'master_addr': master_addr + } for rank in range(args.num_trainers)] + + # maker_env_info + maker_port = str(get_free_port()) + env_info_maker = { + 'local_rank': '0', + 'rank': '0', + 'world_size': '1', + 'master_port': maker_port, + 'master_addr': master_addr + } + + # configure tokenizer + tokenizer = get_tokenizer_from_args(args.model) + + def trainer_model_fn(): + actor = get_actor_from_args(args.model, args.pretrain).half().cuda() + critic = get_critic_from_args(args.model, args.critic_pretrain).half().cuda() + return actor, critic + + # configure Trainer + trainer_refs = [ + DetachedPPOTrainer.options(name=f"trainer{i}", num_gpus=1, max_concurrency=2).remote( + experience_maker_holder_name_list=["maker1"], + strategy_fn=partial(get_strategy_from_args, args.trainer_strategy), + model_fn=trainer_model_fn, + env_info=env_info_trainer, + train_batch_size=args.train_batch_size, + buffer_limit=16, + eval_performance=True, + debug=args.debug, + update_lora_weights=not (args.lora_rank == 0), + ) for i, env_info_trainer in enumerate(env_info_trainers) + ] + + def model_fn(): + actor = get_actor_from_args(args.model, args.pretrain).requires_grad_(False).half().cuda() + critic = get_critic_from_args(args.model, args.critic_pretrain).requires_grad_(False).half().cuda() + reward_model = get_reward_model_from_args(args.model, args.critic_pretrain).requires_grad_(False).half().cuda() + if args.initial_model_quant_ckpt is not None and args.model == 'llama': + # quantize initial model + actor_cfg = AutoConfig.from_pretrained(args.pretrain) + with low_resource_init(), no_init_weights(): + initial_model = get_actor_from_args(args.model, config=actor_cfg) + initial_model.model = llama_load_quant(initial_model.model, args.initial_model_quant_ckpt, args.quant_bits, + args.quant_group_size).cuda().requires_grad_(False) + else: + initial_model = get_actor_from_args(args.model, args.pretrain).requires_grad_(False).half().cuda() + return actor, critic, reward_model, initial_model + + # configure Experience Maker + experience_holder_ref = ExperienceMakerHolder.options(name="maker1", num_gpus=1, max_concurrency=2).remote( + detached_trainer_name_list=[f'trainer{i}' for i in range(args.num_trainers)], + strategy_fn=partial(get_strategy_from_args, args.maker_strategy), + model_fn=model_fn, + env_info=env_info_maker, + experience_batch_size=args.experience_batch_size, + kl_coef=0.1, + debug=args.debug, + update_lora_weights=not (args.lora_rank == 0), + # sync_models_from_trainers=True, + # generation kwargs: + max_length=512, + do_sample=True, + temperature=1.0, + top_k=50, + pad_token_id=tokenizer.pad_token_id, + eos_token_id=tokenizer.eos_token_id, + eval_performance=True, + use_cache=True, + ) + + # uncomment this function if sync_models_from_trainers is True + # ray.get([ + # trainer_ref.sync_models_to_remote_makers.remote() + # for trainer_ref in trainer_refs + # ]) + + wait_tasks = [] + + total_steps = args.experience_batch_size * args.experience_steps // (args.num_trainers * args.train_batch_size) + for 
trainer_ref in trainer_refs: + wait_tasks.append(trainer_ref.fit.remote(total_steps, args.update_steps, args.train_epochs)) + + dataset_size = args.experience_batch_size * 4 + + def build_dataloader(): + + def tokenize_fn(texts): + batch = tokenizer(texts, return_tensors='pt', max_length=96, padding='max_length', truncation=True) + return {k: v.cuda() for k, v in batch.items()} + + dataset = pd.read_csv(args.prompt_path)['prompt'] + dataloader = DataLoader(dataset=dataset, batch_size=dataset_size, shuffle=True, collate_fn=tokenize_fn) + return dataloader + + wait_tasks.append(experience_holder_ref.workingloop.remote(build_dataloader, num_steps=args.experience_steps)) + + ray.get(wait_tasks) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--prompt_path', type=str, default=None) + parser.add_argument('--num_trainers', type=int, default=1) + parser.add_argument('--trainer_strategy', + choices=[ + 'naive', 'ddp', 'colossalai_gemini', 'colossalai_zero2', 'colossalai_gemini_cpu', + 'colossalai_zero2_cpu' + ], + default='naive') + parser.add_argument('--maker_strategy', choices=['naive'], default='naive') + parser.add_argument('--model', default='gpt2', choices=['gpt2', 'bloom', 'opt', 'llama']) + parser.add_argument('--critic_model', default='gpt2', choices=['gpt2', 'bloom', 'opt', 'llama']) + parser.add_argument('--pretrain', type=str, default=None) + parser.add_argument('--critic_pretrain', type=str, default=None) + parser.add_argument('--experience_steps', type=int, default=4) + parser.add_argument('--experience_batch_size', type=int, default=8) + parser.add_argument('--train_epochs', type=int, default=1) + parser.add_argument('--update_steps', type=int, default=2) + parser.add_argument('--train_batch_size', type=int, default=8) + parser.add_argument('--lora_rank', type=int, default=0, help="low-rank adaptation matrices rank") + + parser.add_argument('--initial_model_quant_ckpt', type=str, default=None) + parser.add_argument('--quant_bits', type=int, default=4) + parser.add_argument('--quant_group_size', type=int, default=128) + parser.add_argument('--debug', action='store_true') + args = parser.parse_args() + ray.init(namespace=os.environ["RAY_NAMESPACE"], runtime_env={"env_vars": dict(os.environ)}) + main(args) diff --git a/applications/Chat/examples/ray/mmmt_prompt.py b/applications/Chat/examples/ray/mmmt_prompt.py new file mode 100644 index 000000000..fa7b2bd7e --- /dev/null +++ b/applications/Chat/examples/ray/mmmt_prompt.py @@ -0,0 +1,189 @@ +import argparse +import os +import socket +from functools import partial + +import pandas as pd +import ray +import torch +from coati.quant import llama_load_quant, low_resource_init +from coati.ray.detached_trainer_ppo import DetachedPPOTrainer +from coati.ray.experience_maker_holder import ExperienceMakerHolder +from coati.ray.utils import ( + get_actor_from_args, + get_critic_from_args, + get_receivers_per_sender, + get_reward_model_from_args, + get_strategy_from_args, +) +from torch.utils.data import DataLoader +from transformers import AutoConfig, AutoTokenizer +from transformers.modeling_utils import no_init_weights + + +def get_free_port(): + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + s.bind(('', 0)) + return s.getsockname()[1] + + +def get_local_ip(): + with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as s: + s.connect(('8.8.8.8', 80)) + return s.getsockname()[0] + + +def main(args): + master_addr = str(get_local_ip()) + # trainer_env_info + trainer_port = 
str(get_free_port()) + env_info_trainers = [{ + 'local_rank': '0', + 'rank': str(rank), + 'world_size': str(args.num_trainers), + 'master_port': trainer_port, + 'master_addr': master_addr + } for rank in range(args.num_trainers)] + + # maker_env_info + maker_port = str(get_free_port()) + env_info_makers = [{ + 'local_rank': '0', + 'rank': str(rank), + 'world_size': str(args.num_makers), + 'master_port': maker_port, + 'master_addr': master_addr + } for rank in range(args.num_makers)] + + # configure tokenizer + tokenizer = AutoTokenizer.from_pretrained(args.pretrain) + tokenizer.pad_token = tokenizer.eos_token + + def model_fn(): + actor = get_actor_from_args(args.model, args.pretrain).requires_grad_(False).half().cuda() + critic = get_critic_from_args(args.model, args.critic_pretrain).requires_grad_(False).half().cuda() + reward_model = get_reward_model_from_args(args.model, args.critic_pretrain).requires_grad_(False).half().cuda() + if args.initial_model_quant_ckpt is not None and args.model == 'llama': + # quantize initial model + actor_cfg = AutoConfig.from_pretrained(args.pretrain) + with low_resource_init(), no_init_weights(): + initial_model = get_actor_from_args(args.model, config=actor_cfg) + initial_model.model = llama_load_quant(initial_model.model, args.initial_model_quant_ckpt, args.quant_bits, + args.quant_group_size).cuda().requires_grad_(False) + else: + initial_model = get_actor_from_args(args.model, args.pretrain).requires_grad_(False).half().cuda() + return actor, critic, reward_model, initial_model + + # configure Experience Maker + experience_holder_refs = [ + ExperienceMakerHolder.options(name=f"maker{i}", num_gpus=1, max_concurrency=2).remote( + detached_trainer_name_list=[ + f'trainer{x}' + for x in get_receivers_per_sender(i, args.num_makers, args.num_trainers, allow_idle_sender=False) + ], + strategy_fn=partial(get_strategy_from_args, args.maker_strategy), + model_fn=model_fn, + env_info=env_info_maker, + kl_coef=0.1, + debug=args.debug, + update_lora_weights=not (args.lora_rank == 0), + # sync_models_from_trainers=True, + # generation kwargs: + max_length=512, + do_sample=True, + temperature=1.0, + top_k=50, + pad_token_id=tokenizer.pad_token_id, + eos_token_id=tokenizer.eos_token_id, + eval_performance=True, + use_cache=True, + ) + for i, env_info_maker in enumerate(env_info_makers) + ] + + def trainer_model_fn(): + actor = get_actor_from_args(args.model, args.pretrain, lora_rank=args.lora_rank).half().cuda() + critic = get_critic_from_args(args.model, args.critic_pretrain, lora_rank=args.lora_rank).half().cuda() + return actor, critic + + # configure Trainer + trainer_refs = [ + DetachedPPOTrainer.options(name=f"trainer{i}", num_gpus=1, max_concurrency=2).remote( + experience_maker_holder_name_list=[ + f"maker{x}" + for x in get_receivers_per_sender(i, args.num_trainers, args.num_makers, allow_idle_sender=True) + ], + strategy_fn=partial(get_strategy_from_args, args.trainer_strategy), + model_fn=trainer_model_fn, + env_info=env_info_trainer, + train_batch_size=args.train_batch_size, + buffer_limit=16, + eval_performance=True, + debug=args.debug, + update_lora_weights=not (args.lora_rank == 0), + ) + for i, env_info_trainer in enumerate(env_info_trainers) + ] + + dataset_size = args.experience_batch_size * 4 + + def build_dataloader(): + + def tokenize_fn(texts): + batch = tokenizer(texts, return_tensors='pt', max_length=96, padding='max_length', truncation=True) + return {k: v.cuda() for k, v in batch.items()} + + dataset = 
pd.read_csv(args.prompt_path)['prompt'] + dataloader = DataLoader(dataset=dataset, batch_size=dataset_size, shuffle=True, collate_fn=tokenize_fn) + return dataloader + + # uncomment this function if sync_models_from_trainers is True + # ray.get([ + # trainer_ref.sync_models_to_remote_makers.remote() + # for trainer_ref in trainer_refs + # ]) + + wait_tasks = [] + + for experience_holder_ref in experience_holder_refs: + wait_tasks.append(experience_holder_ref.workingloop.remote(build_dataloader, num_steps=args.experience_steps)) + + total_steps = args.experience_batch_size * args.experience_steps * \ + args.num_makers // (args.num_trainers * args.train_batch_size) + for trainer_ref in trainer_refs: + wait_tasks.append(trainer_ref.fit.remote(total_steps, args.update_steps, args.train_epochs)) + + ray.get(wait_tasks) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--prompt_path', type=str, default=None) + parser.add_argument('--num_makers', type=int, default=1) + parser.add_argument('--num_trainers', type=int, default=1) + parser.add_argument('--trainer_strategy', + choices=[ + 'naive', 'ddp', 'colossalai_gemini', 'colossalai_zero2', 'colossalai_gemini_cpu', + 'colossalai_zero2_cpu' + ], + default='naive') + parser.add_argument('--maker_strategy', choices=['naive'], default='naive') + parser.add_argument('--model', default='gpt2', choices=['gpt2', 'bloom', 'opt', 'llama']) + parser.add_argument('--critic_model', default='gpt2', choices=['gpt2', 'bloom', 'opt', 'llama']) + parser.add_argument('--pretrain', type=str, default=None) + parser.add_argument('--critic_pretrain', type=str, default=None) + parser.add_argument('--experience_steps', type=int, default=4) + parser.add_argument('--experience_batch_size', type=int, default=8) + parser.add_argument('--train_epochs', type=int, default=1) + parser.add_argument('--update_steps', type=int, default=2) + parser.add_argument('--train_batch_size', type=int, default=8) + parser.add_argument('--lora_rank', type=int, default=0, help="low-rank adaptation matrices rank") + + parser.add_argument('--initial_model_quant_ckpt', type=str, default=None) + parser.add_argument('--quant_bits', type=int, default=4) + parser.add_argument('--quant_group_size', type=int, default=128) + parser.add_argument('--debug', action='store_true') + args = parser.parse_args() + + ray.init(namespace=os.environ["RAY_NAMESPACE"], runtime_env={"env_vars": dict(os.environ)}) + main(args) diff --git a/applications/Chat/examples/ray/requirements.txt b/applications/Chat/examples/ray/requirements.txt new file mode 100644 index 000000000..e02756318 --- /dev/null +++ b/applications/Chat/examples/ray/requirements.txt @@ -0,0 +1 @@ +ray diff --git a/applications/Chat/examples/ray/test_ci.sh b/applications/Chat/examples/ray/test_ci.sh new file mode 100755 index 000000000..895f7de0f --- /dev/null +++ b/applications/Chat/examples/ray/test_ci.sh @@ -0,0 +1,12 @@ +#!/bin/bash + +set -xe +BASE=$(realpath $(dirname $0)) + +export RAY_NAMESPACE=admin +export DATA=/data/scratch/chatgpt/prompts.csv + +# install requirements +pip install -r ${BASE}/requirements.txt + +python ${BASE}/mmmt_prompt.py --prompt_path $DATA --num_makers 2 --num_trainers 2 --trainer_strategy colossalai_gemini --model opt --critic_model opt --pretrain facebook/opt-350m --critic_pretrain facebook/opt-125m --experience_batch_size 4 --train_batch_size 2 diff --git a/applications/Chat/examples/test_ci.sh b/applications/Chat/examples/test_ci.sh index 2b049163c..2fa6c6052 100755 --- 
a/applications/Chat/examples/test_ci.sh +++ b/applications/Chat/examples/test_ci.sh @@ -124,3 +124,6 @@ torchrun --standalone --nproc_per_node=2 ${BASE}/train_prompts.py --prompt_datas rm -rf ${BASE}/rm_ckpt_gpt.pt rm -rf ${BASE}/actor_checkpoint_prompts.pt + +# 3080 doesn't support P2P, skip this test +# cd ${BASE}/ray && bash test_ci.sh && cd ${BASE} From 9c88b6cbd1597d5f429b61a04a7219b9e0d14a1b Mon Sep 17 00:00:00 2001 From: Hongxin Liu Date: Wed, 7 Jun 2023 11:10:12 +0800 Subject: [PATCH 26/26] [lazy] fix compatibility problem on torch 1.13 (#3911) --- colossalai/lazy/lazy_init.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/colossalai/lazy/lazy_init.py b/colossalai/lazy/lazy_init.py index c1fda3c53..76f550dc4 100644 --- a/colossalai/lazy/lazy_init.py +++ b/colossalai/lazy/lazy_init.py @@ -37,7 +37,7 @@ _EARLY_MATERIALIZED_OPS = ['__getitem__', 'split'] # If your intent is to change the metadata of a Tensor (such as sizes / strides / storage / storage_offset) # without autograd tracking the change, remove the .data / .detach() call and wrap the change in a `with torch.no_grad():` block. # These ops cannot be unwrapped using .data -_CHANGE_META_OPS = ['_cudnn_rnn_flatten_weight', 'requires_grad_', '__get__', '__set__'] +_CHANGE_META_OPS = ['_cudnn_rnn_flatten_weight', 'requires_grad_', '__get__', '__set__', 'numel', 'size', 'dim'] _LEGACY_TENSOR_CONSTRUCTOR = { 'FloatTensor': torch.float,