diff --git a/.compatibility b/.compatibility index 32da32be5..a918cb162 100644 --- a/.compatibility +++ b/.compatibility @@ -1,3 +1,2 @@ -1.12.0-11.3.0 -1.13.0-11.6.0 2.0.0-11.7.0 +2.1.0-11.8.0 diff --git a/.github/workflows/build_on_pr.yml b/.github/workflows/build_on_pr.yml index e2114d43b..8eb358c4f 100644 --- a/.github/workflows/build_on_pr.yml +++ b/.github/workflows/build_on_pr.yml @@ -30,7 +30,7 @@ jobs: github.event.repository.full_name == 'hpcaitech/ColossalAI' runs-on: [self-hosted, gpu] container: - image: hpcaitech/pytorch-cuda:1.12.0-11.3.0 + image: hpcaitech/pytorch-cuda:2.0.0-11.7.0 options: --rm timeout-minutes: 5 defaults: @@ -54,7 +54,7 @@ jobs: github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI' runs-on: [self-hosted, gpu] container: - image: hpcaitech/pytorch-cuda:1.12.0-11.3.0 + image: hpcaitech/pytorch-cuda:2.0.0-11.7.0 options: --rm timeout-minutes: 5 defaults: @@ -140,7 +140,7 @@ jobs: if: needs.detect.outputs.anyLibraryFileChanged == 'true' runs-on: [self-hosted, gpu] container: - image: hpcaitech/pytorch-cuda:1.12.0-11.3.0 + image: hpcaitech/pytorch-cuda:2.0.0-11.7.0 options: --gpus all --rm -v /data/scratch/cifar-10:/data/scratch/cifar-10 -v /data/scratch/llama-tiny:/data/scratch/llama-tiny timeout-minutes: 60 defaults: @@ -268,7 +268,7 @@ jobs: github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI' runs-on: [self-hosted, gpu] container: - image: hpcaitech/pytorch-cuda:1.12.0-11.3.0 + image: hpcaitech/pytorch-cuda:2.0.0-11.7.0 options: --rm timeout-minutes: 5 defaults: @@ -299,7 +299,7 @@ jobs: github.event.repository.full_name == 'hpcaitech/ColossalAI' runs-on: [self-hosted, gpu] container: - image: hpcaitech/pytorch-cuda:1.12.0-11.3.0 + image: hpcaitech/pytorch-cuda:2.0.0-11.7.0 options: --rm timeout-minutes: 5 defaults: diff --git a/.github/workflows/build_on_schedule.yml b/.github/workflows/build_on_schedule.yml index 6c77377be..e5afe9622 100644 --- a/.github/workflows/build_on_schedule.yml +++ b/.github/workflows/build_on_schedule.yml @@ -12,7 +12,7 @@ jobs: if: github.repository == 'hpcaitech/ColossalAI' runs-on: [self-hosted, 8-gpu] container: - image: hpcaitech/pytorch-cuda:1.12.0-11.3.0 + image: hpcaitech/pytorch-cuda:2.0.0-11.7.0 options: --gpus all --rm -v /data/scratch/cifar-10:/data/scratch/cifar-10 -v /data/scratch/llama-tiny:/data/scratch/llama-tiny timeout-minutes: 40 steps: diff --git a/.github/workflows/doc_test_on_pr.yml b/.github/workflows/doc_test_on_pr.yml index f1e7a2d0c..51238905e 100644 --- a/.github/workflows/doc_test_on_pr.yml +++ b/.github/workflows/doc_test_on_pr.yml @@ -56,7 +56,7 @@ jobs: needs: detect-changed-doc runs-on: [self-hosted, gpu] container: - image: hpcaitech/pytorch-cuda:1.12.0-11.3.0 + image: hpcaitech/pytorch-cuda:2.0.0-11.7.0 options: --gpus all --rm timeout-minutes: 20 defaults: diff --git a/.github/workflows/doc_test_on_schedule.yml b/.github/workflows/doc_test_on_schedule.yml index 027fbfd0a..b4c776747 100644 --- a/.github/workflows/doc_test_on_schedule.yml +++ b/.github/workflows/doc_test_on_schedule.yml @@ -12,7 +12,7 @@ jobs: name: Test the changed Doc runs-on: [self-hosted, gpu] container: - image: hpcaitech/pytorch-cuda:1.12.0-11.3.0 + image: hpcaitech/pytorch-cuda:2.0.0-11.7.0 options: --gpus all --rm timeout-minutes: 60 steps: diff --git a/.github/workflows/example_check_on_dispatch.yml b/.github/workflows/example_check_on_dispatch.yml index 9d3bd9a48..02e30f52a 100644 --- a/.github/workflows/example_check_on_dispatch.yml +++ 
b/.github/workflows/example_check_on_dispatch.yml @@ -45,7 +45,7 @@ jobs: fail-fast: false matrix: ${{fromJson(needs.manual_check_matrix_preparation.outputs.matrix)}} container: - image: hpcaitech/pytorch-cuda:1.12.0-11.3.0 + image: hpcaitech/pytorch-cuda:2.0.0-11.7.0 options: --gpus all --rm -v /data/scratch/examples-data:/data/ timeout-minutes: 10 steps: diff --git a/.github/workflows/example_check_on_pr.yml b/.github/workflows/example_check_on_pr.yml index 859b6e4fb..6d6952aa1 100644 --- a/.github/workflows/example_check_on_pr.yml +++ b/.github/workflows/example_check_on_pr.yml @@ -77,7 +77,7 @@ jobs: fail-fast: false matrix: ${{fromJson(needs.detect-changed-example.outputs.matrix)}} container: - image: hpcaitech/pytorch-cuda:1.12.0-11.3.0 + image: hpcaitech/pytorch-cuda:2.0.0-11.7.0 options: --gpus all --rm -v /data/scratch/examples-data:/data/ timeout-minutes: 20 concurrency: diff --git a/.github/workflows/example_check_on_schedule.yml b/.github/workflows/example_check_on_schedule.yml index 5ed128c3e..919fa5092 100644 --- a/.github/workflows/example_check_on_schedule.yml +++ b/.github/workflows/example_check_on_schedule.yml @@ -34,7 +34,7 @@ jobs: fail-fast: false matrix: ${{fromJson(needs.matrix_preparation.outputs.matrix)}} container: - image: hpcaitech/pytorch-cuda:1.12.0-11.3.0 + image: hpcaitech/pytorch-cuda:2.0.0-11.7.0 timeout-minutes: 10 steps: - name: 📚 Checkout diff --git a/README.md b/README.md index 04a349337..545500591 100644 --- a/README.md +++ b/README.md @@ -372,7 +372,7 @@ Please visit our [documentation](https://www.colossalai.org/) and [examples](htt ## Installation Requirements: -- PyTorch >= 1.11 (PyTorch 2.x in progress) +- PyTorch >= 1.11 and PyTorch <= 2.1 - Python >= 3.7 - CUDA >= 11.0 - [NVIDIA GPU Compute Capability](https://developer.nvidia.com/cuda-gpus) >= 7.0 (V100/RTX20 and higher) diff --git a/applications/Chat/README.md b/applications/Chat/README.md index d5be04ab9..349c26aad 100644 --- a/applications/Chat/README.md +++ b/applications/Chat/README.md @@ -461,17 +461,19 @@ Thanks so much to all of our amazing contributors! Coati is developed by ColossalAI Team: -- [Fazzie](https://fazzie-key.cool/about/index.html) -- [FrankLeeeee](https://github.com/FrankLeeeee) -- [BlueRum](https://github.com/ht-zhou) -- [ver217](https://github.com/ver217) -- [ofey404](https://github.com/ofey404) -- [Wenhao Chen](https://github.com/CWHer) +- [ver217](https://github.com/ver217) Leading the project while contributing to the main framework. +- [FrankLeeeee](https://github.com/FrankLeeeee) Providing ML infra support and also taking charge of both front-end and back-end development. +- [htzhou](https://github.com/ht-zhou) Contributing to the algorithm and development for RM and PPO training. +- [Fazzie](https://fazzie-key.cool/about/index.html) Contributing to the algorithm and development for SFT. +- [ofey404](https://github.com/ofey404) Contributing to both front-end and back-end development. +- [Wenhao Chen](https://github.com/CWHer) Contributing to subsequent code enhancements and performance improvements. The PhD student from [(HPC-AI) Lab](https://ai.comp.nus.edu.sg/) also contributed a lot to this project. - [Zangwei Zheng](https://github.com/zhengzangw) - [Xue Fuzhao](https://github.com/XueFuzhao) +We also appreciate the valuable suggestions provided by [Jian Hu](https://github.com/hijkzzz) regarding the convergence of the PPO algorithm. 
+ ## Citations ```bibtex diff --git a/applications/ColossalEval/colossal_eval/evaluate/dataset_evaluator/dataset_evaluator.py b/applications/ColossalEval/colossal_eval/evaluate/dataset_evaluator/dataset_evaluator.py index 3f1660087..37dbac3cf 100644 --- a/applications/ColossalEval/colossal_eval/evaluate/dataset_evaluator/dataset_evaluator.py +++ b/applications/ColossalEval/colossal_eval/evaluate/dataset_evaluator/dataset_evaluator.py @@ -1,5 +1,5 @@ import os -from typing import Dict, List +from typing import Dict, List, Union import colossal_eval.evaluate.dataset_evaluator.metrics as metric_helper import numpy as np @@ -58,12 +58,12 @@ class DatasetEvaluator(object): [sample["output"] for sample in self.data[category]["data"]] flag = False - softmaxs = [] + logits = [] for i, sample in enumerate(self.data[category]["data"]): - if np.any(np.isnan(np.array(list(sample["softmax_over_choices"].values())))): + if np.any(np.isnan(np.array(list(sample["logits_over_choices"].values())))): if not flag: print( - f"NaN in the softmax, switch to exact match for category {category} in dataset {self.dataset_name} in model {self.model_name}." + f"NaN in the logits, switch to exact match for category {category} in dataset {self.dataset_name} in model {self.model_name}." ) flag = True score = 0 @@ -79,13 +79,13 @@ class DatasetEvaluator(object): score, metric_helper.accuracy_by_options(sample["input"], sample["output"], ref), ) - softmaxs.append(references[i] if score == 1 else -1) + logits.append(references[i] if score == 1 else -1) else: - softmaxs.append(np.argmax(np.array(list(sample["softmax_over_choices"].values())))) + logits.append(np.argmax(np.array(list(sample["logits_over_choices"].values())))) references = np.array(references) - softmaxs = np.array(softmaxs) - scores = np.sum(references == softmaxs) / len(self.data[category]["data"]) * 100 + logits = np.array(logits) + scores = np.sum(references == logits) / len(self.data[category]["data"]) * 100 self.evaluation_results[metric][category] = (scores, len(self.data[category]["data"])) self.evaluation_results[metric]["ALL"] += scores * weight @@ -105,12 +105,12 @@ class DatasetEvaluator(object): predictions = [sample["output"] for sample in self.data[category]["data"]] flag = False - softmaxs = [] + logits = [] for i, sample in enumerate(self.data[category]["data"]): - if np.any(np.isnan(np.array(list(sample["softmax_over_choices"].values())))): + if np.any(np.isnan(np.array(list(sample["logits_over_choices"].values())))): if not flag: print( - f"NaN in the softmax, switch to exact match for category {category} in dataset {self.dataset_name} in model {self.model_name}." + f"NaN in the logits, switch to exact match for category {category} in dataset {self.dataset_name} in model {self.model_name}." ) flag = True score = 0 @@ -121,16 +121,14 @@ class DatasetEvaluator(object): sample["output"], ref, all_classes=self.data[category]["inference_kwargs"]["all_classes"] ), ) - softmaxs.append(references[i] if score == 1 else -1) + logits.append(references[i] if score == 1 else -1) else: - softmaxs.append(np.argmax(np.array(list(sample["softmax_over_choices"].values())))) + logits.append(np.argmax(np.array(list(sample["logits_over_choices"].values())))) metric_method = eval("metric_helper." 
+ metric) total_score = 0.0 - for prediction, reference, references_label, softmax in zip( - predictions, references, references_labels, softmaxs - ): + for prediction, reference, references_label, softmax in zip(predictions, references, references_labels, logits): score = 0.0 for ref in reference: @@ -281,7 +279,9 @@ class DatasetEvaluator(object): return self.evaluation_results - def get_evaluation_results(self, data: List[Dict], dataset_name: str, model_name: str, metrics: List[str]): + def get_evaluation_results( + self, data: Dict[str, Union[str, Dict]], dataset_name: str, model_name: str, metrics: List[str] + ): """ Evaluate inference data on the given metrics. @@ -292,10 +292,11 @@ class DatasetEvaluator(object): metrics: Metrics used to evaluate. """ - self.data = data + self.data = data["inference_results"] self.dataset_name = dataset_name + self.dataset_class = data["dataset_class"] self.model_name = model_name - self.categories = list(data.keys()) + self.categories = list(self.data.keys()) self.metrics = metrics self.judgements = {} @@ -315,9 +316,7 @@ class DatasetEvaluator(object): for metric in self.metrics: # Train and reference split use same metric as test split. - self.suggested_categories[metric] = metric_helper.metrics4subcategory[self.dataset_name.split("_")[0]][ - metric - ] + self.suggested_categories[metric] = metric_helper.metrics4subcategory[self.dataset_class][metric] if "ALL" in self.suggested_categories[metric]: self.suggested_categories[metric] = self.categories self.metric_total_length[metric] = self.total_length diff --git a/applications/ColossalEval/colossal_eval/evaluate/dataset_evaluator/metrics.py b/applications/ColossalEval/colossal_eval/evaluate/dataset_evaluator/metrics.py index 030059b14..d734eabdb 100644 --- a/applications/ColossalEval/colossal_eval/evaluate/dataset_evaluator/metrics.py +++ b/applications/ColossalEval/colossal_eval/evaluate/dataset_evaluator/metrics.py @@ -25,7 +25,7 @@ metrics4subcategory = { "per_byte_ppl_score": ["ALL"], }, # The commented are non 4-choice questions. 
- "agieval": { + "AGIEvalDataset": { "combined_single_choice_accuracy": [ # "lsat-ar", # "lsat-lr", @@ -103,14 +103,14 @@ metrics4subcategory = { ], "ppl_score": ["ALL"], }, - "cmmlu": { + "CMMLUDataset": { "first_token_accuracy": ["ALL"], "single_choice_accuracy": ["ALL"], "perplexity": ["ALL"], "ppl_score_over_choices": ["ALL"], "ppl_score": ["ALL"], }, - "gaokaobench": { + "GaoKaoBenchDataset": { "combined_single_choice_accuracy": [ "English MCQs", "Biology MCQs", @@ -170,7 +170,7 @@ metrics4subcategory = { "ppl_score_over_choices": ["ALL"], "ppl_score": ["ALL"], }, - "longbench": { + "LongBenchDataset": { "f1_score": ["hotpotqa", "2wikimqa", "musique", "narrativeqa", "qasper", "multifieldqa_en", "triviaqa"], "f1_zh_score": ["multifieldqa_zh"], "rouge_score": ["gov_report", "qmsum", "multi_news", "samsum"], @@ -183,7 +183,7 @@ metrics4subcategory = { "perplexity": ["ALL"], "ppl_score": ["ALL"], }, - "mmlu": { + "MMLUDataset": { "first_token_accuracy": ["ALL"], "single_choice_accuracy": ["ALL"], "accuracy": ["ALL"], @@ -191,11 +191,11 @@ metrics4subcategory = { "ppl_score_over_choices": ["ALL"], "ppl_score": ["ALL"], }, - "mtbench": {"mtbench_single_judge": ["ALL"]}, - "cvalues": {"first_token_accuracy": ["ALL"]}, - "safetybench_zh": {"first_token_accuracy": ["ALL"]}, - "safetybench_en": {"first_token_accuracy": ["ALL"]}, - "gsm": { + "MTBenchDataset": {"mtbench_single_judge": ["ALL"]}, + "CValuesDataset": {"first_token_accuracy": ["ALL"]}, + "SafetyBenchZHDataset": {"first_token_accuracy": ["ALL"]}, + "SafetyBenchENDataset": {"first_token_accuracy": ["ALL"]}, + "GSMDataset": { "loss_over_all_tokens": ["ALL"], "gsm_accuracy": ["ALL"], }, diff --git a/applications/ColossalEval/colossal_eval/models/huggingface.py b/applications/ColossalEval/colossal_eval/models/huggingface.py index 5699955fd..741c884f0 100644 --- a/applications/ColossalEval/colossal_eval/models/huggingface.py +++ b/applications/ColossalEval/colossal_eval/models/huggingface.py @@ -116,10 +116,10 @@ class HuggingFaceModel(BaseModel): shard_config: Shard config for tensor parallel. """ - model_kwargs.setdefault("torch_dtype", torch.float16) - if "torch_dtype" in model_kwargs: model_kwargs["torch_dtype"] = eval(model_kwargs["torch_dtype"]) + else: + model_kwargs.setdefault("torch_dtype", torch.float16) if "config" in model_kwargs: model_kwargs["config"] = AutoConfig.from_pretrained(model_kwargs["config"]) @@ -586,11 +586,10 @@ class HuggingFaceCausalLM(HuggingFaceModel): shard_config: Shard config for tensor parallel. 
""" - - model_kwargs.setdefault("torch_dtype", torch.float16) - if "torch_dtype" in model_kwargs: model_kwargs["torch_dtype"] = eval(model_kwargs["torch_dtype"]) + else: + model_kwargs.setdefault("torch_dtype", torch.float16) if "config" in model_kwargs: model_kwargs["config"] = AutoConfig.from_pretrained(model_kwargs["config"]) diff --git a/applications/ColossalEval/examples/dataset_evaluation/inference.py b/applications/ColossalEval/examples/dataset_evaluation/inference.py index 3f066e721..5b09f9de8 100644 --- a/applications/ColossalEval/examples/dataset_evaluation/inference.py +++ b/applications/ColossalEval/examples/dataset_evaluation/inference.py @@ -15,7 +15,13 @@ from colossalai.shardformer import ShardConfig logger = get_dist_logger() -def rm_and_merge(dp_size: int, save_path: str, model_names: List[str], dataset_names: Dict[str, List]) -> None: +def rm_and_merge( + dp_size: int, + save_path: str, + model_names: List[str], + dataset_names: Dict[str, List], + dataset_classes: Dict[str, List], +) -> None: """ Remove inference result per rank and merge them into one file. @@ -24,11 +30,15 @@ def rm_and_merge(dp_size: int, save_path: str, model_names: List[str], dataset_n save_path: The folder for storing inference results. model_names: Names of models for inference. dataset_names: Names of dataset for inference. + dataset_classes: Dataset class for different inference results. We need to save dataset class to smooth the evaluation process. """ for model_name in model_names: for dataset_name, categories in dataset_names.items(): + all_answers_with_dataset_class = {} + all_answers_with_dataset_class["dataset_class"] = dataset_classes[dataset_name] + all_answers = {} for category in categories: all_answers[category] = {"data": []} @@ -58,8 +68,13 @@ def rm_and_merge(dp_size: int, save_path: str, model_names: List[str], dataset_n all_answers[category] = answers + all_answers_with_dataset_class["inference_results"] = all_answers + logger.info(f"Save inference results of model {model_name} on dataset {dataset_name}.") - utils.jdump(all_answers, os.path.join(save_path, model_name, f"{dataset_name}_inference_results.json")) + utils.jdump( + all_answers_with_dataset_class, + os.path.join(save_path, model_name, f"{dataset_name}_inference_results.json"), + ) logger.info(f"Save inference results of model {model_name} for all dataset.") logger.info(f"Save inference results of all models for all dataset.") @@ -98,6 +113,7 @@ def main(args): ) inference_data = {} + dataset_classes = {} debug_args = {} few_shot_args = {} multiturn_args = {} @@ -128,6 +144,7 @@ def main(args): continue + dataset_classes[dataset_name] = dataset_parameter["dataset_class"] dataset_class = eval(f"dataset.{dataset_parameter['dataset_class']}") if not issubclass(dataset_class, dataset.BaseDataset): raise ValueError(f"Dataset class {dataset_parameter['dataset_class']} is not a subclass of BaseDataset.") @@ -149,12 +166,14 @@ def main(args): debug_args[new_dataset_name] = dataset_parameter["debug"] few_shot_args[new_dataset_name] = dataset_parameter["few_shot"] inference_data[new_dataset_name] = dataset_.dataset["train"] + dataset_classes[new_dataset_name] = dataset_parameter["dataset_class"] if load_reference and "reference" in dataset_.dataset: new_dataset_name = f"{dataset_name}_reference" debug_args[new_dataset_name] = dataset_parameter["debug"] few_shot_args[new_dataset_name] = dataset_parameter["few_shot"] inference_data[new_dataset_name] = dataset_.dataset["reference"] + dataset_classes[new_dataset_name] = 
dataset_parameter["dataset_class"] if rank == 0: logger.info(f"Dataset for inference are: {list(inference_data.keys())}") @@ -225,7 +244,7 @@ def main(args): if rank == 0: model_names = [model_parameter["name"] for model_parameter in model_parameters] dataset_names = {key: list(inference_data[key].keys()) for key in inference_data} - rm_and_merge(dp_size, args.inference_save_path, model_names, dataset_names) + rm_and_merge(dp_size, args.inference_save_path, model_names, dataset_names, dataset_classes) if __name__ == "__main__": diff --git a/applications/ColossalEval/examples/gpt_evaluation/inference.py b/applications/ColossalEval/examples/gpt_evaluation/inference.py index 3f066e721..5b09f9de8 100644 --- a/applications/ColossalEval/examples/gpt_evaluation/inference.py +++ b/applications/ColossalEval/examples/gpt_evaluation/inference.py @@ -15,7 +15,13 @@ from colossalai.shardformer import ShardConfig logger = get_dist_logger() -def rm_and_merge(dp_size: int, save_path: str, model_names: List[str], dataset_names: Dict[str, List]) -> None: +def rm_and_merge( + dp_size: int, + save_path: str, + model_names: List[str], + dataset_names: Dict[str, List], + dataset_classes: Dict[str, List], +) -> None: """ Remove inference result per rank and merge them into one file. @@ -24,11 +30,15 @@ def rm_and_merge(dp_size: int, save_path: str, model_names: List[str], dataset_n save_path: The folder for storing inference results. model_names: Names of models for inference. dataset_names: Names of dataset for inference. + dataset_classes: Dataset class for different inference results. We need to save dataset class to smooth the evaluation process. """ for model_name in model_names: for dataset_name, categories in dataset_names.items(): + all_answers_with_dataset_class = {} + all_answers_with_dataset_class["dataset_class"] = dataset_classes[dataset_name] + all_answers = {} for category in categories: all_answers[category] = {"data": []} @@ -58,8 +68,13 @@ def rm_and_merge(dp_size: int, save_path: str, model_names: List[str], dataset_n all_answers[category] = answers + all_answers_with_dataset_class["inference_results"] = all_answers + logger.info(f"Save inference results of model {model_name} on dataset {dataset_name}.") - utils.jdump(all_answers, os.path.join(save_path, model_name, f"{dataset_name}_inference_results.json")) + utils.jdump( + all_answers_with_dataset_class, + os.path.join(save_path, model_name, f"{dataset_name}_inference_results.json"), + ) logger.info(f"Save inference results of model {model_name} for all dataset.") logger.info(f"Save inference results of all models for all dataset.") @@ -98,6 +113,7 @@ def main(args): ) inference_data = {} + dataset_classes = {} debug_args = {} few_shot_args = {} multiturn_args = {} @@ -128,6 +144,7 @@ def main(args): continue + dataset_classes[dataset_name] = dataset_parameter["dataset_class"] dataset_class = eval(f"dataset.{dataset_parameter['dataset_class']}") if not issubclass(dataset_class, dataset.BaseDataset): raise ValueError(f"Dataset class {dataset_parameter['dataset_class']} is not a subclass of BaseDataset.") @@ -149,12 +166,14 @@ def main(args): debug_args[new_dataset_name] = dataset_parameter["debug"] few_shot_args[new_dataset_name] = dataset_parameter["few_shot"] inference_data[new_dataset_name] = dataset_.dataset["train"] + dataset_classes[new_dataset_name] = dataset_parameter["dataset_class"] if load_reference and "reference" in dataset_.dataset: new_dataset_name = f"{dataset_name}_reference" debug_args[new_dataset_name] = 
dataset_parameter["debug"] few_shot_args[new_dataset_name] = dataset_parameter["few_shot"] inference_data[new_dataset_name] = dataset_.dataset["reference"] + dataset_classes[new_dataset_name] = dataset_parameter["dataset_class"] if rank == 0: logger.info(f"Dataset for inference are: {list(inference_data.keys())}") @@ -225,7 +244,7 @@ def main(args): if rank == 0: model_names = [model_parameter["name"] for model_parameter in model_parameters] dataset_names = {key: list(inference_data[key].keys()) for key in inference_data} - rm_and_merge(dp_size, args.inference_save_path, model_names, dataset_names) + rm_and_merge(dp_size, args.inference_save_path, model_names, dataset_names, dataset_classes) if __name__ == "__main__": diff --git a/colossalai/booster/plugin/gemini_plugin.py b/colossalai/booster/plugin/gemini_plugin.py index 6622b6dc1..a891db422 100644 --- a/colossalai/booster/plugin/gemini_plugin.py +++ b/colossalai/booster/plugin/gemini_plugin.py @@ -437,6 +437,10 @@ class GeminiPlugin(DPPluginBase): enable_sequence_overlap=self.enable_sequence_overlap, ) + def __del__(self): + """Destroy the prcess groups in ProcessGroupMesh""" + self.pg_mesh.destroy_mesh_process_groups() + def support_no_sync(self) -> bool: return False diff --git a/colossalai/booster/plugin/hybrid_parallel_plugin.py b/colossalai/booster/plugin/hybrid_parallel_plugin.py index c52de0ba7..205660f94 100644 --- a/colossalai/booster/plugin/hybrid_parallel_plugin.py +++ b/colossalai/booster/plugin/hybrid_parallel_plugin.py @@ -1054,6 +1054,10 @@ class HybridParallelPlugin(PipelinePluginBase): self.max_norm = max_norm + def __del__(self): + """Destroy the prcess groups in ProcessGroupMesh""" + self.pg_mesh.destroy_mesh_process_groups() + @property def enable_pipeline_parallelism(self) -> bool: return self.pp_size > 1 diff --git a/colossalai/cluster/process_group_mesh.py b/colossalai/cluster/process_group_mesh.py index 7a3bde448..ae3956c69 100644 --- a/colossalai/cluster/process_group_mesh.py +++ b/colossalai/cluster/process_group_mesh.py @@ -45,7 +45,7 @@ class ProcessGroupMesh: self._ranks_to_group: Dict[Tuple[int, ...], ProcessGroup] = {} self._group_to_ranks: Dict[ProcessGroup, Tuple[int, ...]] = {} - def __del__(self): + def destroy_mesh_process_groups(self): r""" Destructor method for the ProcessGroupMesh class. 
diff --git a/colossalai/shardformer/layer/_operation.py b/colossalai/shardformer/layer/_operation.py index 8fd92a2ed..4bca335c8 100644 --- a/colossalai/shardformer/layer/_operation.py +++ b/colossalai/shardformer/layer/_operation.py @@ -7,6 +7,12 @@ try: except: fused_mix_prec_layer_norm_cuda = None +try: + import fused_weight_gradient_mlp_cuda + _grad_accum_fusion_available = True +except ImportError: + _grad_accum_fusion_available = False + class FusedLayerNormAffineFunction1D(torch.autograd.Function): r"""Layernorm @@ -141,7 +147,19 @@ class LinearWithAsyncCommunication(torch.autograd.Function): # all-reduce scheduled first and have GPU resources allocated _ = torch.empty(1, device=grad_output.device) + 1 - grad_weight = grad_output.t().matmul(total_input) + if _grad_accum_fusion_available and weight.grad is not None: + grad = weight.grad + if grad.dtype == torch.float32: + fused_weight_gradient_mlp_cuda.wgrad_gemm_accum_fp32(total_input, grad_output, grad) + grad_weight = None + elif grad.dtype == torch.float16: + fused_weight_gradient_mlp_cuda.wgrad_gemm_accum_fp16(total_input, grad_output, grad) + grad_weight = None + else: + grad_weight = grad_output.t().matmul(total_input) + else: + grad_weight = grad_output.t().matmul(total_input) + grad_bias = grad_output.sum(dim=0) if use_bias else None if ctx.async_grad_allreduce: @@ -214,7 +232,19 @@ class _LinearWithGatherForwardReduceScatterBackward(torch.autograd.Function): # reduce-scatter scheduled first and have GPU resources allocated _ = torch.empty(1, device=grad_output.device) + 1 - grad_weight = grad_output.t().matmul(total_input) + if _grad_accum_fusion_available and weight.grad is not None: + grad = weight.grad + if grad.dtype == torch.float32: + fused_weight_gradient_mlp_cuda.wgrad_gemm_accum_fp32(total_input, grad_output, grad) + grad_weight = None + elif grad.dtype == torch.float16: + fused_weight_gradient_mlp_cuda.wgrad_gemm_accum_fp16(total_input, grad_output, grad) + grad_weight = None + else: + grad_weight = grad_output.t().matmul(total_input) + else: + grad_weight = grad_output.t().matmul(total_input) + grad_bias = grad_output.sum(dim=0) if use_bias else None if ctx.async_grad_reduce_scatter: @@ -249,7 +279,20 @@ class _LinearWithGatherForwardReduceScatterBackward(torch.autograd.Function): # calculate gradient if len(input_parallel.shape) > 2: input_parallel = input_parallel.view(-1, input_parallel.shape[-1]) - grad_weight = grad_output.t().matmul(input_parallel) + + if _grad_accum_fusion_available and weight.grad is not None: + grad = weight.grad + if grad.dtype == torch.float32: + fused_weight_gradient_mlp_cuda.wgrad_gemm_accum_fp32(input_parallel, grad_output, grad) + grad_weight = None + elif grad.dtype == torch.float16: + fused_weight_gradient_mlp_cuda.wgrad_gemm_accum_fp16(input_parallel, grad_output, grad) + grad_weight = None + else: + grad_weight = grad_output.t().matmul(input_parallel) + else: + grad_weight = grad_output.t().matmul(input_parallel) + # grad_weight = grad_output.t().matmul(input_parallel) # wait until reduce-scatter finished reducescatter_handle.wait() @@ -388,7 +431,7 @@ class _MatmulWithGatherForwardReduceScatterBackward(torch.autograd.Function): input_parallel = torch.cat(tensor_list, dim=dim).contiguous() # calculate gradient if len(input_parallel.shape) > 2: - input_parallel = input_parallel.view(-1, input_parallel.shape[-1]) + input_parallel = input_parallel.view(-1, input_parallel.shape[-1]) grad_weight = input_parallel.t().matmul(grad_output) # wait until reduce-scatter finished 
reducescatter_handle.wait() diff --git a/colossalai/shardformer/layer/linear.py b/colossalai/shardformer/layer/linear.py index 9e6386223..eeb0ef399 100644 --- a/colossalai/shardformer/layer/linear.py +++ b/colossalai/shardformer/layer/linear.py @@ -408,7 +408,7 @@ class Linear1D_Row(ParallelModule): handle.wait() output = torch.cat(output_parallel_list, dim=-1) else: - output_parallel = F.linear(input_, self.weight) + output_parallel = linear_with_async_comm(input_, self.weight, None, None, False) if self.seq_parallel: output = linear_reducescatter_forward_gather_backward( output_parallel, self.process_group, self.seq_parallel_dim diff --git a/colossalai/shardformer/modeling/llama.py b/colossalai/shardformer/modeling/llama.py index 286852899..1b53ce4af 100644 --- a/colossalai/shardformer/modeling/llama.py +++ b/colossalai/shardformer/modeling/llama.py @@ -414,7 +414,7 @@ class LlamaPipelineForwards: return {"hidden_states": hidden_states} -def get_llama_flash_attention_forward(): +def get_llama_flash_attention_forward(shard_config: ShardConfig): from transformers.models.llama.modeling_llama import LlamaAttention, apply_rotary_pos_emb from colossalai.kernel.cuda_native import AttnMaskType, ColoAttention @@ -470,14 +470,13 @@ def get_llama_flash_attention_forward(): flash_attention_mask = None attn_mask_type = AttnMaskType.causal - if attention_mask != None: + if not getattr(shard_config, "causal_lm", False) and attention_mask != None: if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): raise ValueError( f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" ) flash_attention_mask = ~(attention_mask[:, :, -1].squeeze(1).to(torch.bool)).contiguous() - if not torch.all(flash_attention_mask): - attn_mask_type = AttnMaskType.paddedcausal + attn_mask_type = AttnMaskType.paddedcausal attention = ColoAttention(embed_dim=self.hidden_size, num_heads=self.num_heads) attn_output = attention( diff --git a/colossalai/shardformer/policies/llama.py b/colossalai/shardformer/policies/llama.py index 39a4d4023..1faa24f71 100644 --- a/colossalai/shardformer/policies/llama.py +++ b/colossalai/shardformer/policies/llama.py @@ -130,7 +130,7 @@ class LlamaPolicy(Policy): if self.shard_config.enable_flash_attention: self.append_or_create_method_replacement( description={ - "forward": get_llama_flash_attention_forward(), + "forward": get_llama_flash_attention_forward(self.shard_config), }, policy=policy, target_key=LlamaAttention, @@ -250,6 +250,8 @@ class LlamaForCausalLMPolicy(LlamaPolicy): policy = super().module_policy() + setattr(self.shard_config, "causal_lm", True) + if self.shard_config.enable_tensor_parallelism: # add a new item for casual lm new_item = { diff --git a/docs/README-zh-Hans.md b/docs/README-zh-Hans.md index c6f15b6d6..7bb4a414b 100644 --- a/docs/README-zh-Hans.md +++ b/docs/README-zh-Hans.md @@ -368,7 +368,7 @@ Colossal-AI 为您提供了一系列并行组件。我们的目标是让您的 环境要求: -- PyTorch >= 1.11 (PyTorch 2.x 正在适配中) +- PyTorch >= 1.11 并且 PyTorch <= 2.1 - Python >= 3.7 - CUDA >= 11.0 - [NVIDIA GPU Compute Capability](https://developer.nvidia.com/cuda-gpus) >= 7.0 (V100/RTX20 and higher) diff --git a/docs/source/en/get_started/installation.md b/docs/source/en/get_started/installation.md index 6fc4ce2c9..18607a34c 100644 --- a/docs/source/en/get_started/installation.md +++ b/docs/source/en/get_started/installation.md @@ -1,7 +1,7 @@ # Setup Requirements: -- PyTorch >= 1.11 (PyTorch 2.x in progress) +- PyTorch >= 1.11 and PyTorch <= 2.1 - Python >= 3.7 - CUDA >= 11.0 - [NVIDIA 
GPU Compute Capability](https://developer.nvidia.com/cuda-gpus) >= 7.0 (V100/RTX20 and higher) diff --git a/docs/source/zh-Hans/get_started/installation.md b/docs/source/zh-Hans/get_started/installation.md index a6c88672b..e75e42530 100755 --- a/docs/source/zh-Hans/get_started/installation.md +++ b/docs/source/zh-Hans/get_started/installation.md @@ -2,7 +2,7 @@ 环境要求: -- PyTorch >= 1.11 (PyTorch 2.x 正在适配中) +- PyTorch >= 1.11 并且 PyTorch <= 2.1 - Python >= 3.7 - CUDA >= 11.0 - [NVIDIA GPU Compute Capability](https://developer.nvidia.com/cuda-gpus) >= 7.0 (V100/RTX20 and higher) diff --git a/requirements/requirements-test.txt b/requirements/requirements-test.txt index 61b58055e..4136cefc3 100644 --- a/requirements/requirements-test.txt +++ b/requirements/requirements-test.txt @@ -1,5 +1,4 @@ diffusers -fbgemm-gpu==0.2.0 pytest coverage==7.2.3 git+https://github.com/hpcaitech/pytest-testmon @@ -16,7 +15,7 @@ triton==2.1.0 requests==2.27.1 # downgrade to avoid huggingface error https://github.com/huggingface/transformers/issues/17611 SentencePiece ninja -flash_attn==2.0.5 +flash_attn datasets pydantic ray
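As a companion note to the `colossalai/shardformer/layer/_operation.py` hunks above: the backward-pass change repeated across `LinearWithAsyncCommunication` and the gather/reduce-scatter functions follows one pattern. If Apex's grad-accumulation-fusion extension (`fused_weight_gradient_mlp_cuda`) is importable and the weight already has a `.grad` buffer of a supported dtype, the weight gradient is accumulated in place by the fused kernel and `None` is returned to autograd; otherwise the code falls back to the plain matmul. A minimal stand-alone sketch of that fallback chain (the helper name `weight_grad` is illustrative and not part of the codebase):

```python
import torch

try:
    # Apex extension providing fused weight-gradient GEMM + accumulation kernels
    import fused_weight_gradient_mlp_cuda
    _grad_accum_fusion_available = True
except ImportError:
    _grad_accum_fusion_available = False


def weight_grad(total_input: torch.Tensor, grad_output: torch.Tensor, weight: torch.Tensor):
    """Compute dL/dW for y = x @ W.t(), accumulating in place when the fused kernel is usable."""
    if _grad_accum_fusion_available and weight.grad is not None:
        grad = weight.grad
        if grad.dtype == torch.float32:
            fused_weight_gradient_mlp_cuda.wgrad_gemm_accum_fp32(total_input, grad_output, grad)
            return None  # already accumulated into weight.grad, nothing for autograd to add
        if grad.dtype == torch.float16:
            fused_weight_gradient_mlp_cuda.wgrad_gemm_accum_fp16(total_input, grad_output, grad)
            return None
    # Fallback when the extension is missing, .grad is unallocated, or the dtype is unsupported
    return grad_output.t().matmul(total_input)
```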