diff --git a/.compatibility b/.compatibility index 32da32be5..a918cb162 100644 --- a/.compatibility +++ b/.compatibility @@ -1,3 +1,2 @@ -1.12.0-11.3.0 -1.13.0-11.6.0 2.0.0-11.7.0 +2.1.0-11.8.0 diff --git a/.github/workflows/build_on_pr.yml b/.github/workflows/build_on_pr.yml index e2114d43b..8eb358c4f 100644 --- a/.github/workflows/build_on_pr.yml +++ b/.github/workflows/build_on_pr.yml @@ -30,7 +30,7 @@ jobs: github.event.repository.full_name == 'hpcaitech/ColossalAI' runs-on: [self-hosted, gpu] container: - image: hpcaitech/pytorch-cuda:1.12.0-11.3.0 + image: hpcaitech/pytorch-cuda:2.0.0-11.7.0 options: --rm timeout-minutes: 5 defaults: @@ -54,7 +54,7 @@ jobs: github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI' runs-on: [self-hosted, gpu] container: - image: hpcaitech/pytorch-cuda:1.12.0-11.3.0 + image: hpcaitech/pytorch-cuda:2.0.0-11.7.0 options: --rm timeout-minutes: 5 defaults: @@ -140,7 +140,7 @@ jobs: if: needs.detect.outputs.anyLibraryFileChanged == 'true' runs-on: [self-hosted, gpu] container: - image: hpcaitech/pytorch-cuda:1.12.0-11.3.0 + image: hpcaitech/pytorch-cuda:2.0.0-11.7.0 options: --gpus all --rm -v /data/scratch/cifar-10:/data/scratch/cifar-10 -v /data/scratch/llama-tiny:/data/scratch/llama-tiny timeout-minutes: 60 defaults: @@ -268,7 +268,7 @@ jobs: github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI' runs-on: [self-hosted, gpu] container: - image: hpcaitech/pytorch-cuda:1.12.0-11.3.0 + image: hpcaitech/pytorch-cuda:2.0.0-11.7.0 options: --rm timeout-minutes: 5 defaults: @@ -299,7 +299,7 @@ jobs: github.event.repository.full_name == 'hpcaitech/ColossalAI' runs-on: [self-hosted, gpu] container: - image: hpcaitech/pytorch-cuda:1.12.0-11.3.0 + image: hpcaitech/pytorch-cuda:2.0.0-11.7.0 options: --rm timeout-minutes: 5 defaults: diff --git a/.github/workflows/build_on_schedule.yml b/.github/workflows/build_on_schedule.yml index 6c77377be..e5afe9622 100644 --- a/.github/workflows/build_on_schedule.yml +++ b/.github/workflows/build_on_schedule.yml @@ -12,7 +12,7 @@ jobs: if: github.repository == 'hpcaitech/ColossalAI' runs-on: [self-hosted, 8-gpu] container: - image: hpcaitech/pytorch-cuda:1.12.0-11.3.0 + image: hpcaitech/pytorch-cuda:2.0.0-11.7.0 options: --gpus all --rm -v /data/scratch/cifar-10:/data/scratch/cifar-10 -v /data/scratch/llama-tiny:/data/scratch/llama-tiny timeout-minutes: 40 steps: diff --git a/.github/workflows/doc_test_on_pr.yml b/.github/workflows/doc_test_on_pr.yml index f1e7a2d0c..51238905e 100644 --- a/.github/workflows/doc_test_on_pr.yml +++ b/.github/workflows/doc_test_on_pr.yml @@ -56,7 +56,7 @@ jobs: needs: detect-changed-doc runs-on: [self-hosted, gpu] container: - image: hpcaitech/pytorch-cuda:1.12.0-11.3.0 + image: hpcaitech/pytorch-cuda:2.0.0-11.7.0 options: --gpus all --rm timeout-minutes: 20 defaults: diff --git a/.github/workflows/doc_test_on_schedule.yml b/.github/workflows/doc_test_on_schedule.yml index 027fbfd0a..b4c776747 100644 --- a/.github/workflows/doc_test_on_schedule.yml +++ b/.github/workflows/doc_test_on_schedule.yml @@ -12,7 +12,7 @@ jobs: name: Test the changed Doc runs-on: [self-hosted, gpu] container: - image: hpcaitech/pytorch-cuda:1.12.0-11.3.0 + image: hpcaitech/pytorch-cuda:2.0.0-11.7.0 options: --gpus all --rm timeout-minutes: 60 steps: diff --git a/.github/workflows/example_check_on_dispatch.yml b/.github/workflows/example_check_on_dispatch.yml index 9d3bd9a48..02e30f52a 100644 --- a/.github/workflows/example_check_on_dispatch.yml +++ 
b/.github/workflows/example_check_on_dispatch.yml @@ -45,7 +45,7 @@ jobs: fail-fast: false matrix: ${{fromJson(needs.manual_check_matrix_preparation.outputs.matrix)}} container: - image: hpcaitech/pytorch-cuda:1.12.0-11.3.0 + image: hpcaitech/pytorch-cuda:2.0.0-11.7.0 options: --gpus all --rm -v /data/scratch/examples-data:/data/ timeout-minutes: 10 steps: diff --git a/.github/workflows/example_check_on_pr.yml b/.github/workflows/example_check_on_pr.yml index 859b6e4fb..6d6952aa1 100644 --- a/.github/workflows/example_check_on_pr.yml +++ b/.github/workflows/example_check_on_pr.yml @@ -77,7 +77,7 @@ jobs: fail-fast: false matrix: ${{fromJson(needs.detect-changed-example.outputs.matrix)}} container: - image: hpcaitech/pytorch-cuda:1.12.0-11.3.0 + image: hpcaitech/pytorch-cuda:2.0.0-11.7.0 options: --gpus all --rm -v /data/scratch/examples-data:/data/ timeout-minutes: 20 concurrency: diff --git a/.github/workflows/example_check_on_schedule.yml b/.github/workflows/example_check_on_schedule.yml index 5ed128c3e..919fa5092 100644 --- a/.github/workflows/example_check_on_schedule.yml +++ b/.github/workflows/example_check_on_schedule.yml @@ -34,7 +34,7 @@ jobs: fail-fast: false matrix: ${{fromJson(needs.matrix_preparation.outputs.matrix)}} container: - image: hpcaitech/pytorch-cuda:1.12.0-11.3.0 + image: hpcaitech/pytorch-cuda:2.0.0-11.7.0 timeout-minutes: 10 steps: - name: 📚 Checkout diff --git a/README.md b/README.md index 04a349337..545500591 100644 --- a/README.md +++ b/README.md @@ -372,7 +372,7 @@ Please visit our [documentation](https://www.colossalai.org/) and [examples](htt ## Installation Requirements: -- PyTorch >= 1.11 (PyTorch 2.x in progress) +- PyTorch >= 1.11 and PyTorch <= 2.1 - Python >= 3.7 - CUDA >= 11.0 - [NVIDIA GPU Compute Capability](https://developer.nvidia.com/cuda-gpus) >= 7.0 (V100/RTX20 and higher) diff --git a/applications/Chat/README.md b/applications/Chat/README.md index d5be04ab9..349c26aad 100644 --- a/applications/Chat/README.md +++ b/applications/Chat/README.md @@ -461,17 +461,19 @@ Thanks so much to all of our amazing contributors! Coati is developed by ColossalAI Team: -- [Fazzie](https://fazzie-key.cool/about/index.html) -- [FrankLeeeee](https://github.com/FrankLeeeee) -- [BlueRum](https://github.com/ht-zhou) -- [ver217](https://github.com/ver217) -- [ofey404](https://github.com/ofey404) -- [Wenhao Chen](https://github.com/CWHer) +- [ver217](https://github.com/ver217) Leading the project while contributing to the main framework. +- [FrankLeeeee](https://github.com/FrankLeeeee) Providing ML infra support and also taking charge of both front-end and back-end development. +- [htzhou](https://github.com/ht-zhou) Contributing to the algorithm and development for RM and PPO training. +- [Fazzie](https://fazzie-key.cool/about/index.html) Contributing to the algorithm and development for SFT. +- [ofey404](https://github.com/ofey404) Contributing to both front-end and back-end development. +- [Wenhao Chen](https://github.com/CWHer) Contributing to subsequent code enhancements and performance improvements. The PhD student from [(HPC-AI) Lab](https://ai.comp.nus.edu.sg/) also contributed a lot to this project. - [Zangwei Zheng](https://github.com/zhengzangw) - [Xue Fuzhao](https://github.com/XueFuzhao) +We also appreciate the valuable suggestions provided by [Jian Hu](https://github.com/hijkzzz) regarding the convergence of the PPO algorithm. 
+ ## Citations ```bibtex diff --git a/applications/ColossalEval/colossal_eval/evaluate/dataset_evaluator/dataset_evaluator.py b/applications/ColossalEval/colossal_eval/evaluate/dataset_evaluator/dataset_evaluator.py index 3f1660087..37dbac3cf 100644 --- a/applications/ColossalEval/colossal_eval/evaluate/dataset_evaluator/dataset_evaluator.py +++ b/applications/ColossalEval/colossal_eval/evaluate/dataset_evaluator/dataset_evaluator.py @@ -1,5 +1,5 @@ import os -from typing import Dict, List +from typing import Dict, List, Union import colossal_eval.evaluate.dataset_evaluator.metrics as metric_helper import numpy as np @@ -58,12 +58,12 @@ class DatasetEvaluator(object): [sample["output"] for sample in self.data[category]["data"]] flag = False - softmaxs = [] + logits = [] for i, sample in enumerate(self.data[category]["data"]): - if np.any(np.isnan(np.array(list(sample["softmax_over_choices"].values())))): + if np.any(np.isnan(np.array(list(sample["logits_over_choices"].values())))): if not flag: print( - f"NaN in the softmax, switch to exact match for category {category} in dataset {self.dataset_name} in model {self.model_name}." + f"NaN in the logits, switch to exact match for category {category} in dataset {self.dataset_name} in model {self.model_name}." ) flag = True score = 0 @@ -79,13 +79,13 @@ class DatasetEvaluator(object): score, metric_helper.accuracy_by_options(sample["input"], sample["output"], ref), ) - softmaxs.append(references[i] if score == 1 else -1) + logits.append(references[i] if score == 1 else -1) else: - softmaxs.append(np.argmax(np.array(list(sample["softmax_over_choices"].values())))) + logits.append(np.argmax(np.array(list(sample["logits_over_choices"].values())))) references = np.array(references) - softmaxs = np.array(softmaxs) - scores = np.sum(references == softmaxs) / len(self.data[category]["data"]) * 100 + logits = np.array(logits) + scores = np.sum(references == logits) / len(self.data[category]["data"]) * 100 self.evaluation_results[metric][category] = (scores, len(self.data[category]["data"])) self.evaluation_results[metric]["ALL"] += scores * weight @@ -105,12 +105,12 @@ class DatasetEvaluator(object): predictions = [sample["output"] for sample in self.data[category]["data"]] flag = False - softmaxs = [] + logits = [] for i, sample in enumerate(self.data[category]["data"]): - if np.any(np.isnan(np.array(list(sample["softmax_over_choices"].values())))): + if np.any(np.isnan(np.array(list(sample["logits_over_choices"].values())))): if not flag: print( - f"NaN in the softmax, switch to exact match for category {category} in dataset {self.dataset_name} in model {self.model_name}." + f"NaN in the logits, switch to exact match for category {category} in dataset {self.dataset_name} in model {self.model_name}." ) flag = True score = 0 @@ -121,16 +121,14 @@ class DatasetEvaluator(object): sample["output"], ref, all_classes=self.data[category]["inference_kwargs"]["all_classes"] ), ) - softmaxs.append(references[i] if score == 1 else -1) + logits.append(references[i] if score == 1 else -1) else: - softmaxs.append(np.argmax(np.array(list(sample["softmax_over_choices"].values())))) + logits.append(np.argmax(np.array(list(sample["logits_over_choices"].values())))) metric_method = eval("metric_helper." 
+ metric) total_score = 0.0 - for prediction, reference, references_label, softmax in zip( - predictions, references, references_labels, softmaxs - ): + for prediction, reference, references_label, softmax in zip(predictions, references, references_labels, logits): score = 0.0 for ref in reference: @@ -281,7 +279,9 @@ class DatasetEvaluator(object): return self.evaluation_results - def get_evaluation_results(self, data: List[Dict], dataset_name: str, model_name: str, metrics: List[str]): + def get_evaluation_results( + self, data: Dict[str, Union[str, Dict]], dataset_name: str, model_name: str, metrics: List[str] + ): """ Evaluate inference data on the given metrics. @@ -292,10 +292,11 @@ class DatasetEvaluator(object): metrics: Metrics used to evaluate. """ - self.data = data + self.data = data["inference_results"] self.dataset_name = dataset_name + self.dataset_class = data["dataset_class"] self.model_name = model_name - self.categories = list(data.keys()) + self.categories = list(self.data.keys()) self.metrics = metrics self.judgements = {} @@ -315,9 +316,7 @@ class DatasetEvaluator(object): for metric in self.metrics: # Train and reference split use same metric as test split. - self.suggested_categories[metric] = metric_helper.metrics4subcategory[self.dataset_name.split("_")[0]][ - metric - ] + self.suggested_categories[metric] = metric_helper.metrics4subcategory[self.dataset_class][metric] if "ALL" in self.suggested_categories[metric]: self.suggested_categories[metric] = self.categories self.metric_total_length[metric] = self.total_length diff --git a/applications/ColossalEval/colossal_eval/evaluate/dataset_evaluator/metrics.py b/applications/ColossalEval/colossal_eval/evaluate/dataset_evaluator/metrics.py index 030059b14..d734eabdb 100644 --- a/applications/ColossalEval/colossal_eval/evaluate/dataset_evaluator/metrics.py +++ b/applications/ColossalEval/colossal_eval/evaluate/dataset_evaluator/metrics.py @@ -25,7 +25,7 @@ metrics4subcategory = { "per_byte_ppl_score": ["ALL"], }, # The commented are non 4-choice questions. 
- "agieval": { + "AGIEvalDataset": { "combined_single_choice_accuracy": [ # "lsat-ar", # "lsat-lr", @@ -103,14 +103,14 @@ metrics4subcategory = { ], "ppl_score": ["ALL"], }, - "cmmlu": { + "CMMLUDataset": { "first_token_accuracy": ["ALL"], "single_choice_accuracy": ["ALL"], "perplexity": ["ALL"], "ppl_score_over_choices": ["ALL"], "ppl_score": ["ALL"], }, - "gaokaobench": { + "GaoKaoBenchDataset": { "combined_single_choice_accuracy": [ "English MCQs", "Biology MCQs", @@ -170,7 +170,7 @@ metrics4subcategory = { "ppl_score_over_choices": ["ALL"], "ppl_score": ["ALL"], }, - "longbench": { + "LongBenchDataset": { "f1_score": ["hotpotqa", "2wikimqa", "musique", "narrativeqa", "qasper", "multifieldqa_en", "triviaqa"], "f1_zh_score": ["multifieldqa_zh"], "rouge_score": ["gov_report", "qmsum", "multi_news", "samsum"], @@ -183,7 +183,7 @@ metrics4subcategory = { "perplexity": ["ALL"], "ppl_score": ["ALL"], }, - "mmlu": { + "MMLUDataset": { "first_token_accuracy": ["ALL"], "single_choice_accuracy": ["ALL"], "accuracy": ["ALL"], @@ -191,11 +191,11 @@ metrics4subcategory = { "ppl_score_over_choices": ["ALL"], "ppl_score": ["ALL"], }, - "mtbench": {"mtbench_single_judge": ["ALL"]}, - "cvalues": {"first_token_accuracy": ["ALL"]}, - "safetybench_zh": {"first_token_accuracy": ["ALL"]}, - "safetybench_en": {"first_token_accuracy": ["ALL"]}, - "gsm": { + "MTBenchDataset": {"mtbench_single_judge": ["ALL"]}, + "CValuesDataset": {"first_token_accuracy": ["ALL"]}, + "SafetyBenchZHDataset": {"first_token_accuracy": ["ALL"]}, + "SafetyBenchENDataset": {"first_token_accuracy": ["ALL"]}, + "GSMDataset": { "loss_over_all_tokens": ["ALL"], "gsm_accuracy": ["ALL"], }, diff --git a/applications/ColossalEval/colossal_eval/models/huggingface.py b/applications/ColossalEval/colossal_eval/models/huggingface.py index 5699955fd..741c884f0 100644 --- a/applications/ColossalEval/colossal_eval/models/huggingface.py +++ b/applications/ColossalEval/colossal_eval/models/huggingface.py @@ -116,10 +116,10 @@ class HuggingFaceModel(BaseModel): shard_config: Shard config for tensor parallel. """ - model_kwargs.setdefault("torch_dtype", torch.float16) - if "torch_dtype" in model_kwargs: model_kwargs["torch_dtype"] = eval(model_kwargs["torch_dtype"]) + else: + model_kwargs.setdefault("torch_dtype", torch.float16) if "config" in model_kwargs: model_kwargs["config"] = AutoConfig.from_pretrained(model_kwargs["config"]) @@ -586,11 +586,10 @@ class HuggingFaceCausalLM(HuggingFaceModel): shard_config: Shard config for tensor parallel. 
""" - - model_kwargs.setdefault("torch_dtype", torch.float16) - if "torch_dtype" in model_kwargs: model_kwargs["torch_dtype"] = eval(model_kwargs["torch_dtype"]) + else: + model_kwargs.setdefault("torch_dtype", torch.float16) if "config" in model_kwargs: model_kwargs["config"] = AutoConfig.from_pretrained(model_kwargs["config"]) diff --git a/applications/ColossalEval/examples/dataset_evaluation/inference.py b/applications/ColossalEval/examples/dataset_evaluation/inference.py index 3f066e721..5b09f9de8 100644 --- a/applications/ColossalEval/examples/dataset_evaluation/inference.py +++ b/applications/ColossalEval/examples/dataset_evaluation/inference.py @@ -15,7 +15,13 @@ from colossalai.shardformer import ShardConfig logger = get_dist_logger() -def rm_and_merge(dp_size: int, save_path: str, model_names: List[str], dataset_names: Dict[str, List]) -> None: +def rm_and_merge( + dp_size: int, + save_path: str, + model_names: List[str], + dataset_names: Dict[str, List], + dataset_classes: Dict[str, List], +) -> None: """ Remove inference result per rank and merge them into one file. @@ -24,11 +30,15 @@ def rm_and_merge(dp_size: int, save_path: str, model_names: List[str], dataset_n save_path: The folder for storing inference results. model_names: Names of models for inference. dataset_names: Names of dataset for inference. + dataset_classes: Dataset class for different inference results. We need to save dataset class to smooth the evaluation process. """ for model_name in model_names: for dataset_name, categories in dataset_names.items(): + all_answers_with_dataset_class = {} + all_answers_with_dataset_class["dataset_class"] = dataset_classes[dataset_name] + all_answers = {} for category in categories: all_answers[category] = {"data": []} @@ -58,8 +68,13 @@ def rm_and_merge(dp_size: int, save_path: str, model_names: List[str], dataset_n all_answers[category] = answers + all_answers_with_dataset_class["inference_results"] = all_answers + logger.info(f"Save inference results of model {model_name} on dataset {dataset_name}.") - utils.jdump(all_answers, os.path.join(save_path, model_name, f"{dataset_name}_inference_results.json")) + utils.jdump( + all_answers_with_dataset_class, + os.path.join(save_path, model_name, f"{dataset_name}_inference_results.json"), + ) logger.info(f"Save inference results of model {model_name} for all dataset.") logger.info(f"Save inference results of all models for all dataset.") @@ -98,6 +113,7 @@ def main(args): ) inference_data = {} + dataset_classes = {} debug_args = {} few_shot_args = {} multiturn_args = {} @@ -128,6 +144,7 @@ def main(args): continue + dataset_classes[dataset_name] = dataset_parameter["dataset_class"] dataset_class = eval(f"dataset.{dataset_parameter['dataset_class']}") if not issubclass(dataset_class, dataset.BaseDataset): raise ValueError(f"Dataset class {dataset_parameter['dataset_class']} is not a subclass of BaseDataset.") @@ -149,12 +166,14 @@ def main(args): debug_args[new_dataset_name] = dataset_parameter["debug"] few_shot_args[new_dataset_name] = dataset_parameter["few_shot"] inference_data[new_dataset_name] = dataset_.dataset["train"] + dataset_classes[new_dataset_name] = dataset_parameter["dataset_class"] if load_reference and "reference" in dataset_.dataset: new_dataset_name = f"{dataset_name}_reference" debug_args[new_dataset_name] = dataset_parameter["debug"] few_shot_args[new_dataset_name] = dataset_parameter["few_shot"] inference_data[new_dataset_name] = dataset_.dataset["reference"] + dataset_classes[new_dataset_name] = 
dataset_parameter["dataset_class"] if rank == 0: logger.info(f"Dataset for inference are: {list(inference_data.keys())}") @@ -225,7 +244,7 @@ def main(args): if rank == 0: model_names = [model_parameter["name"] for model_parameter in model_parameters] dataset_names = {key: list(inference_data[key].keys()) for key in inference_data} - rm_and_merge(dp_size, args.inference_save_path, model_names, dataset_names) + rm_and_merge(dp_size, args.inference_save_path, model_names, dataset_names, dataset_classes) if __name__ == "__main__": diff --git a/applications/ColossalEval/examples/gpt_evaluation/inference.py b/applications/ColossalEval/examples/gpt_evaluation/inference.py index 3f066e721..5b09f9de8 100644 --- a/applications/ColossalEval/examples/gpt_evaluation/inference.py +++ b/applications/ColossalEval/examples/gpt_evaluation/inference.py @@ -15,7 +15,13 @@ from colossalai.shardformer import ShardConfig logger = get_dist_logger() -def rm_and_merge(dp_size: int, save_path: str, model_names: List[str], dataset_names: Dict[str, List]) -> None: +def rm_and_merge( + dp_size: int, + save_path: str, + model_names: List[str], + dataset_names: Dict[str, List], + dataset_classes: Dict[str, List], +) -> None: """ Remove inference result per rank and merge them into one file. @@ -24,11 +30,15 @@ def rm_and_merge(dp_size: int, save_path: str, model_names: List[str], dataset_n save_path: The folder for storing inference results. model_names: Names of models for inference. dataset_names: Names of dataset for inference. + dataset_classes: Dataset class for different inference results. We need to save dataset class to smooth the evaluation process. """ for model_name in model_names: for dataset_name, categories in dataset_names.items(): + all_answers_with_dataset_class = {} + all_answers_with_dataset_class["dataset_class"] = dataset_classes[dataset_name] + all_answers = {} for category in categories: all_answers[category] = {"data": []} @@ -58,8 +68,13 @@ def rm_and_merge(dp_size: int, save_path: str, model_names: List[str], dataset_n all_answers[category] = answers + all_answers_with_dataset_class["inference_results"] = all_answers + logger.info(f"Save inference results of model {model_name} on dataset {dataset_name}.") - utils.jdump(all_answers, os.path.join(save_path, model_name, f"{dataset_name}_inference_results.json")) + utils.jdump( + all_answers_with_dataset_class, + os.path.join(save_path, model_name, f"{dataset_name}_inference_results.json"), + ) logger.info(f"Save inference results of model {model_name} for all dataset.") logger.info(f"Save inference results of all models for all dataset.") @@ -98,6 +113,7 @@ def main(args): ) inference_data = {} + dataset_classes = {} debug_args = {} few_shot_args = {} multiturn_args = {} @@ -128,6 +144,7 @@ def main(args): continue + dataset_classes[dataset_name] = dataset_parameter["dataset_class"] dataset_class = eval(f"dataset.{dataset_parameter['dataset_class']}") if not issubclass(dataset_class, dataset.BaseDataset): raise ValueError(f"Dataset class {dataset_parameter['dataset_class']} is not a subclass of BaseDataset.") @@ -149,12 +166,14 @@ def main(args): debug_args[new_dataset_name] = dataset_parameter["debug"] few_shot_args[new_dataset_name] = dataset_parameter["few_shot"] inference_data[new_dataset_name] = dataset_.dataset["train"] + dataset_classes[new_dataset_name] = dataset_parameter["dataset_class"] if load_reference and "reference" in dataset_.dataset: new_dataset_name = f"{dataset_name}_reference" debug_args[new_dataset_name] = 
dataset_parameter["debug"] few_shot_args[new_dataset_name] = dataset_parameter["few_shot"] inference_data[new_dataset_name] = dataset_.dataset["reference"] + dataset_classes[new_dataset_name] = dataset_parameter["dataset_class"] if rank == 0: logger.info(f"Dataset for inference are: {list(inference_data.keys())}") @@ -225,7 +244,7 @@ def main(args): if rank == 0: model_names = [model_parameter["name"] for model_parameter in model_parameters] dataset_names = {key: list(inference_data[key].keys()) for key in inference_data} - rm_and_merge(dp_size, args.inference_save_path, model_names, dataset_names) + rm_and_merge(dp_size, args.inference_save_path, model_names, dataset_names, dataset_classes) if __name__ == "__main__": diff --git a/colossalai/booster/plugin/gemini_plugin.py b/colossalai/booster/plugin/gemini_plugin.py index 6622b6dc1..a891db422 100644 --- a/colossalai/booster/plugin/gemini_plugin.py +++ b/colossalai/booster/plugin/gemini_plugin.py @@ -437,6 +437,10 @@ class GeminiPlugin(DPPluginBase): enable_sequence_overlap=self.enable_sequence_overlap, ) + def __del__(self): + """Destroy the prcess groups in ProcessGroupMesh""" + self.pg_mesh.destroy_mesh_process_groups() + def support_no_sync(self) -> bool: return False diff --git a/colossalai/booster/plugin/hybrid_parallel_plugin.py b/colossalai/booster/plugin/hybrid_parallel_plugin.py index c52de0ba7..205660f94 100644 --- a/colossalai/booster/plugin/hybrid_parallel_plugin.py +++ b/colossalai/booster/plugin/hybrid_parallel_plugin.py @@ -1054,6 +1054,10 @@ class HybridParallelPlugin(PipelinePluginBase): self.max_norm = max_norm + def __del__(self): + """Destroy the prcess groups in ProcessGroupMesh""" + self.pg_mesh.destroy_mesh_process_groups() + @property def enable_pipeline_parallelism(self) -> bool: return self.pp_size > 1 diff --git a/colossalai/cluster/process_group_mesh.py b/colossalai/cluster/process_group_mesh.py index 7a3bde448..ae3956c69 100644 --- a/colossalai/cluster/process_group_mesh.py +++ b/colossalai/cluster/process_group_mesh.py @@ -45,7 +45,7 @@ class ProcessGroupMesh: self._ranks_to_group: Dict[Tuple[int, ...], ProcessGroup] = {} self._group_to_ranks: Dict[ProcessGroup, Tuple[int, ...]] = {} - def __del__(self): + def destroy_mesh_process_groups(self): r""" Destructor method for the ProcessGroupMesh class. 
diff --git a/colossalai/shardformer/layer/_operation.py b/colossalai/shardformer/layer/_operation.py index 8fd92a2ed..4bca335c8 100644 --- a/colossalai/shardformer/layer/_operation.py +++ b/colossalai/shardformer/layer/_operation.py @@ -7,6 +7,12 @@ try: except: fused_mix_prec_layer_norm_cuda = None +try: + import fused_weight_gradient_mlp_cuda + _grad_accum_fusion_available = True +except ImportError: + _grad_accum_fusion_available = False + class FusedLayerNormAffineFunction1D(torch.autograd.Function): r"""Layernorm @@ -141,7 +147,19 @@ class LinearWithAsyncCommunication(torch.autograd.Function): # all-reduce scheduled first and have GPU resources allocated _ = torch.empty(1, device=grad_output.device) + 1 - grad_weight = grad_output.t().matmul(total_input) + if _grad_accum_fusion_available and weight.grad is not None: + grad = weight.grad + if grad.dtype == torch.float32: + fused_weight_gradient_mlp_cuda.wgrad_gemm_accum_fp32(total_input, grad_output, grad) + grad_weight = None + elif grad.dtype == torch.float16: + fused_weight_gradient_mlp_cuda.wgrad_gemm_accum_fp16(total_input, grad_output, grad) + grad_weight = None + else: + grad_weight = grad_output.t().matmul(total_input) + else: + grad_weight = grad_output.t().matmul(total_input) + grad_bias = grad_output.sum(dim=0) if use_bias else None if ctx.async_grad_allreduce: @@ -214,7 +232,19 @@ class _LinearWithGatherForwardReduceScatterBackward(torch.autograd.Function): # reduce-scatter scheduled first and have GPU resources allocated _ = torch.empty(1, device=grad_output.device) + 1 - grad_weight = grad_output.t().matmul(total_input) + if _grad_accum_fusion_available and weight.grad is not None: + grad = weight.grad + if grad.dtype == torch.float32: + fused_weight_gradient_mlp_cuda.wgrad_gemm_accum_fp32(total_input, grad_output, grad) + grad_weight = None + elif grad.dtype == torch.float16: + fused_weight_gradient_mlp_cuda.wgrad_gemm_accum_fp16(total_input, grad_output, grad) + grad_weight = None + else: + grad_weight = grad_output.t().matmul(total_input) + else: + grad_weight = grad_output.t().matmul(total_input) + grad_bias = grad_output.sum(dim=0) if use_bias else None if ctx.async_grad_reduce_scatter: @@ -249,7 +279,20 @@ class _LinearWithGatherForwardReduceScatterBackward(torch.autograd.Function): # calculate gradient if len(input_parallel.shape) > 2: input_parallel = input_parallel.view(-1, input_parallel.shape[-1]) - grad_weight = grad_output.t().matmul(input_parallel) + + if _grad_accum_fusion_available and weight.grad is not None: + grad = weight.grad + if grad.dtype == torch.float32: + fused_weight_gradient_mlp_cuda.wgrad_gemm_accum_fp32(input_parallel, grad_output, grad) + grad_weight = None + elif grad.dtype == torch.float16: + fused_weight_gradient_mlp_cuda.wgrad_gemm_accum_fp16(input_parallel, grad_output, grad) + grad_weight = None + else: + grad_weight = grad_output.t().matmul(input_parallel) + else: + grad_weight = grad_output.t().matmul(input_parallel) + # grad_weight = grad_output.t().matmul(input_parallel) # wait until reduce-scatter finished reducescatter_handle.wait() @@ -388,7 +431,7 @@ class _MatmulWithGatherForwardReduceScatterBackward(torch.autograd.Function): input_parallel = torch.cat(tensor_list, dim=dim).contiguous() # calculate gradient if len(input_parallel.shape) > 2: - input_parallel = input_parallel.view(-1, input_parallel.shape[-1]) + input_parallel = input_parallel.view(-1, input_parallel.shape[-1]) grad_weight = input_parallel.t().matmul(grad_output) # wait until reduce-scatter finished 
reducescatter_handle.wait() diff --git a/colossalai/shardformer/layer/linear.py b/colossalai/shardformer/layer/linear.py index 9e6386223..eeb0ef399 100644 --- a/colossalai/shardformer/layer/linear.py +++ b/colossalai/shardformer/layer/linear.py @@ -408,7 +408,7 @@ class Linear1D_Row(ParallelModule): handle.wait() output = torch.cat(output_parallel_list, dim=-1) else: - output_parallel = F.linear(input_, self.weight) + output_parallel = linear_with_async_comm(input_, self.weight, None, None, False) if self.seq_parallel: output = linear_reducescatter_forward_gather_backward( output_parallel, self.process_group, self.seq_parallel_dim diff --git a/colossalai/shardformer/modeling/llama.py b/colossalai/shardformer/modeling/llama.py index 286852899..1b53ce4af 100644 --- a/colossalai/shardformer/modeling/llama.py +++ b/colossalai/shardformer/modeling/llama.py @@ -414,7 +414,7 @@ class LlamaPipelineForwards: return {"hidden_states": hidden_states} -def get_llama_flash_attention_forward(): +def get_llama_flash_attention_forward(shard_config: ShardConfig): from transformers.models.llama.modeling_llama import LlamaAttention, apply_rotary_pos_emb from colossalai.kernel.cuda_native import AttnMaskType, ColoAttention @@ -470,14 +470,13 @@ def get_llama_flash_attention_forward(): flash_attention_mask = None attn_mask_type = AttnMaskType.causal - if attention_mask != None: + if not getattr(shard_config, "causal_lm", False) and attention_mask != None: if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): raise ValueError( f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" ) flash_attention_mask = ~(attention_mask[:, :, -1].squeeze(1).to(torch.bool)).contiguous() - if not torch.all(flash_attention_mask): - attn_mask_type = AttnMaskType.paddedcausal + attn_mask_type = AttnMaskType.paddedcausal attention = ColoAttention(embed_dim=self.hidden_size, num_heads=self.num_heads) attn_output = attention( diff --git a/colossalai/shardformer/policies/llama.py b/colossalai/shardformer/policies/llama.py index 39a4d4023..1faa24f71 100644 --- a/colossalai/shardformer/policies/llama.py +++ b/colossalai/shardformer/policies/llama.py @@ -130,7 +130,7 @@ class LlamaPolicy(Policy): if self.shard_config.enable_flash_attention: self.append_or_create_method_replacement( description={ - "forward": get_llama_flash_attention_forward(), + "forward": get_llama_flash_attention_forward(self.shard_config), }, policy=policy, target_key=LlamaAttention, @@ -250,6 +250,8 @@ class LlamaForCausalLMPolicy(LlamaPolicy): policy = super().module_policy() + setattr(self.shard_config, "causal_lm", True) + if self.shard_config.enable_tensor_parallelism: # add a new item for casual lm new_item = { diff --git a/docs/README-zh-Hans.md b/docs/README-zh-Hans.md index c6f15b6d6..7bb4a414b 100644 --- a/docs/README-zh-Hans.md +++ b/docs/README-zh-Hans.md @@ -368,7 +368,7 @@ Colossal-AI 为您提供了一系列并行组件。我们的目标是让您的 环境要求: -- PyTorch >= 1.11 (PyTorch 2.x 正在适配中) +- PyTorch >= 1.11 并且 PyTorch <= 2.1 - Python >= 3.7 - CUDA >= 11.0 - [NVIDIA GPU Compute Capability](https://developer.nvidia.com/cuda-gpus) >= 7.0 (V100/RTX20 and higher) diff --git a/docs/source/en/get_started/installation.md b/docs/source/en/get_started/installation.md index 6fc4ce2c9..18607a34c 100644 --- a/docs/source/en/get_started/installation.md +++ b/docs/source/en/get_started/installation.md @@ -1,7 +1,7 @@ # Setup Requirements: -- PyTorch >= 1.11 (PyTorch 2.x in progress) +- PyTorch >= 1.11 and PyTorch <= 2.1 - Python >= 3.7 - CUDA >= 11.0 - [NVIDIA 
GPU Compute Capability](https://developer.nvidia.com/cuda-gpus) >= 7.0 (V100/RTX20 and higher) diff --git a/docs/source/zh-Hans/get_started/installation.md b/docs/source/zh-Hans/get_started/installation.md index a6c88672b..e75e42530 100755 --- a/docs/source/zh-Hans/get_started/installation.md +++ b/docs/source/zh-Hans/get_started/installation.md @@ -2,7 +2,7 @@ 环境要求: -- PyTorch >= 1.11 (PyTorch 2.x 正在适配中) +- PyTorch >= 1.11 并且 PyTorch <= 2.1 - Python >= 3.7 - CUDA >= 11.0 - [NVIDIA GPU Compute Capability](https://developer.nvidia.com/cuda-gpus) >= 7.0 (V100/RTX20 and higher) diff --git a/requirements/requirements-test.txt b/requirements/requirements-test.txt index 61b58055e..4136cefc3 100644 --- a/requirements/requirements-test.txt +++ b/requirements/requirements-test.txt @@ -1,5 +1,4 @@ diffusers -fbgemm-gpu==0.2.0 pytest coverage==7.2.3 git+https://github.com/hpcaitech/pytest-testmon @@ -16,7 +15,7 @@ triton==2.1.0 requests==2.27.1 # downgrade to avoid huggingface error https://github.com/huggingface/transformers/issues/17611 SentencePiece ninja -flash_attn==2.0.5 +flash_attn datasets pydantic ray
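As a companion note to the `colossalai/shardformer/layer/_operation.py` hunks above: the backward-pass change repeated across `LinearWithAsyncCommunication` and the gather/reduce-scatter functions follows one pattern. If Apex's grad-accumulation-fusion extension (`fused_weight_gradient_mlp_cuda`) is importable and the weight already has a `.grad` buffer of a supported dtype, the weight gradient is accumulated in place by the fused kernel and `None` is returned to autograd; otherwise the code falls back to the plain matmul. A minimal stand-alone sketch of that fallback chain (the helper name `weight_grad` is illustrative and not part of the codebase):

```python
import torch

try:
    # Apex extension providing fused weight-gradient GEMM + accumulation kernels
    import fused_weight_gradient_mlp_cuda
    _grad_accum_fusion_available = True
except ImportError:
    _grad_accum_fusion_available = False


def weight_grad(total_input: torch.Tensor, grad_output: torch.Tensor, weight: torch.Tensor):
    """Compute dL/dW for y = x @ W.t(), accumulating in place when the fused kernel is usable."""
    if _grad_accum_fusion_available and weight.grad is not None:
        grad = weight.grad
        if grad.dtype == torch.float32:
            fused_weight_gradient_mlp_cuda.wgrad_gemm_accum_fp32(total_input, grad_output, grad)
            return None  # already accumulated into weight.grad, nothing for autograd to add
        if grad.dtype == torch.float16:
            fused_weight_gradient_mlp_cuda.wgrad_gemm_accum_fp16(total_input, grad_output, grad)
            return None
    # Fallback when the extension is missing, .grad is unallocated, or the dtype is unsupported
    return grad_output.t().matmul(total_input)
```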