Merge branch 'main' into exp/mixtral

pull/5253/head
ver217 2024-01-08 16:42:00 +08:00
commit ce1cff26bd
26 changed files with 167 additions and 79 deletions

View File

@@ -1,3 +1,2 @@
-1.12.0-11.3.0
-1.13.0-11.6.0
 2.0.0-11.7.0
+2.1.0-11.8.0

View File

@@ -30,7 +30,7 @@ jobs:
 github.event.repository.full_name == 'hpcaitech/ColossalAI'
 runs-on: [self-hosted, gpu]
 container:
-image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
+image: hpcaitech/pytorch-cuda:2.0.0-11.7.0
 options: --rm
 timeout-minutes: 5
 defaults:
@@ -54,7 +54,7 @@ jobs:
 github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI'
 runs-on: [self-hosted, gpu]
 container:
-image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
+image: hpcaitech/pytorch-cuda:2.0.0-11.7.0
 options: --rm
 timeout-minutes: 5
 defaults:
@@ -140,7 +140,7 @@ jobs:
 if: needs.detect.outputs.anyLibraryFileChanged == 'true'
 runs-on: [self-hosted, gpu]
 container:
-image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
+image: hpcaitech/pytorch-cuda:2.0.0-11.7.0
 options: --gpus all --rm -v /data/scratch/cifar-10:/data/scratch/cifar-10 -v /data/scratch/llama-tiny:/data/scratch/llama-tiny
 timeout-minutes: 60
 defaults:
@@ -268,7 +268,7 @@ jobs:
 github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI'
 runs-on: [self-hosted, gpu]
 container:
-image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
+image: hpcaitech/pytorch-cuda:2.0.0-11.7.0
 options: --rm
 timeout-minutes: 5
 defaults:
@@ -299,7 +299,7 @@ jobs:
 github.event.repository.full_name == 'hpcaitech/ColossalAI'
 runs-on: [self-hosted, gpu]
 container:
-image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
+image: hpcaitech/pytorch-cuda:2.0.0-11.7.0
 options: --rm
 timeout-minutes: 5
 defaults:

View File

@@ -12,7 +12,7 @@ jobs:
 if: github.repository == 'hpcaitech/ColossalAI'
 runs-on: [self-hosted, 8-gpu]
 container:
-image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
+image: hpcaitech/pytorch-cuda:2.0.0-11.7.0
 options: --gpus all --rm -v /data/scratch/cifar-10:/data/scratch/cifar-10 -v /data/scratch/llama-tiny:/data/scratch/llama-tiny
 timeout-minutes: 40
 steps:

View File

@@ -56,7 +56,7 @@ jobs:
 needs: detect-changed-doc
 runs-on: [self-hosted, gpu]
 container:
-image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
+image: hpcaitech/pytorch-cuda:2.0.0-11.7.0
 options: --gpus all --rm
 timeout-minutes: 20
 defaults:

View File

@@ -12,7 +12,7 @@ jobs:
 name: Test the changed Doc
 runs-on: [self-hosted, gpu]
 container:
-image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
+image: hpcaitech/pytorch-cuda:2.0.0-11.7.0
 options: --gpus all --rm
 timeout-minutes: 60
 steps:

View File

@@ -45,7 +45,7 @@ jobs:
 fail-fast: false
 matrix: ${{fromJson(needs.manual_check_matrix_preparation.outputs.matrix)}}
 container:
-image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
+image: hpcaitech/pytorch-cuda:2.0.0-11.7.0
 options: --gpus all --rm -v /data/scratch/examples-data:/data/
 timeout-minutes: 10
 steps:

View File

@@ -77,7 +77,7 @@ jobs:
 fail-fast: false
 matrix: ${{fromJson(needs.detect-changed-example.outputs.matrix)}}
 container:
-image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
+image: hpcaitech/pytorch-cuda:2.0.0-11.7.0
 options: --gpus all --rm -v /data/scratch/examples-data:/data/
 timeout-minutes: 20
 concurrency:

View File

@@ -34,7 +34,7 @@ jobs:
 fail-fast: false
 matrix: ${{fromJson(needs.matrix_preparation.outputs.matrix)}}
 container:
-image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
+image: hpcaitech/pytorch-cuda:2.0.0-11.7.0
 timeout-minutes: 10
 steps:
 - name: 📚 Checkout

View File

@@ -372,7 +372,7 @@ Please visit our [documentation](https://www.colossalai.org/) and [examples](htt
 ## Installation
 Requirements:
-- PyTorch >= 1.11 (PyTorch 2.x in progress)
+- PyTorch >= 1.11 and PyTorch <= 2.1
 - Python >= 3.7
 - CUDA >= 11.0
 - [NVIDIA GPU Compute Capability](https://developer.nvidia.com/cuda-gpus) >= 7.0 (V100/RTX20 and higher)
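The tightened requirement above (PyTorch capped at 2.1) can be verified up front. A minimal, hypothetical sanity check for the listed requirements, not part of the repository:

```python
# Hypothetical environment check for the requirements listed above; not part of ColossalAI.
import torch

def check_environment() -> None:
    torch_version = tuple(int(x) for x in torch.__version__.split("+")[0].split(".")[:2])
    assert (1, 11) <= torch_version <= (2, 1), f"PyTorch {torch.__version__} is outside the supported 1.11-2.1 range"
    assert torch.cuda.is_available() and torch.version.cuda is not None, "a CUDA build of PyTorch and a GPU are required"
    assert tuple(int(x) for x in torch.version.cuda.split(".")[:2]) >= (11, 0), "CUDA >= 11.0 is required"
    assert torch.cuda.get_device_capability() >= (7, 0), "compute capability >= 7.0 (V100/RTX20 or newer) is required"

if __name__ == "__main__":
    check_environment()
    print("Environment satisfies the stated requirements.")
```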

View File

@@ -461,17 +461,19 @@ Thanks so much to all of our amazing contributors!
 Coati is developed by ColossalAI Team:
-- [Fazzie](https://fazzie-key.cool/about/index.html)
-- [FrankLeeeee](https://github.com/FrankLeeeee)
-- [BlueRum](https://github.com/ht-zhou)
-- [ver217](https://github.com/ver217)
-- [ofey404](https://github.com/ofey404)
-- [Wenhao Chen](https://github.com/CWHer)
+- [ver217](https://github.com/ver217) Leading the project while contributing to the main framework.
+- [FrankLeeeee](https://github.com/FrankLeeeee) Providing ML infra support and also taking charge of both front-end and back-end development.
+- [htzhou](https://github.com/ht-zhou) Contributing to the algorithm and development for RM and PPO training.
+- [Fazzie](https://fazzie-key.cool/about/index.html) Contributing to the algorithm and development for SFT.
+- [ofey404](https://github.com/ofey404) Contributing to both front-end and back-end development.
+- [Wenhao Chen](https://github.com/CWHer) Contributing to subsequent code enhancements and performance improvements.
 The PhD student from [(HPC-AI) Lab](https://ai.comp.nus.edu.sg/) also contributed a lot to this project.
 - [Zangwei Zheng](https://github.com/zhengzangw)
 - [Xue Fuzhao](https://github.com/XueFuzhao)
+We also appreciate the valuable suggestions provided by [Jian Hu](https://github.com/hijkzzz) regarding the convergence of the PPO algorithm.
 ## Citations
 ```bibtex

View File

@@ -1,5 +1,5 @@
 import os
-from typing import Dict, List
+from typing import Dict, List, Union
 import colossal_eval.evaluate.dataset_evaluator.metrics as metric_helper
 import numpy as np
@@ -58,12 +58,12 @@ class DatasetEvaluator(object):
 [sample["output"] for sample in self.data[category]["data"]]
 flag = False
-softmaxs = []
+logits = []
 for i, sample in enumerate(self.data[category]["data"]):
-if np.any(np.isnan(np.array(list(sample["softmax_over_choices"].values())))):
+if np.any(np.isnan(np.array(list(sample["logits_over_choices"].values())))):
 if not flag:
 print(
-f"NaN in the softmax, switch to exact match for category {category} in dataset {self.dataset_name} in model {self.model_name}."
+f"NaN in the logits, switch to exact match for category {category} in dataset {self.dataset_name} in model {self.model_name}."
 )
 flag = True
 score = 0
@@ -79,13 +79,13 @@ class DatasetEvaluator(object):
 score,
 metric_helper.accuracy_by_options(sample["input"], sample["output"], ref),
 )
-softmaxs.append(references[i] if score == 1 else -1)
+logits.append(references[i] if score == 1 else -1)
 else:
-softmaxs.append(np.argmax(np.array(list(sample["softmax_over_choices"].values()))))
+logits.append(np.argmax(np.array(list(sample["logits_over_choices"].values()))))
 references = np.array(references)
-softmaxs = np.array(softmaxs)
-scores = np.sum(references == softmaxs) / len(self.data[category]["data"]) * 100
+logits = np.array(logits)
+scores = np.sum(references == logits) / len(self.data[category]["data"]) * 100
 self.evaluation_results[metric][category] = (scores, len(self.data[category]["data"]))
 self.evaluation_results[metric]["ALL"] += scores * weight
@@ -105,12 +105,12 @@ class DatasetEvaluator(object):
 predictions = [sample["output"] for sample in self.data[category]["data"]]
 flag = False
-softmaxs = []
+logits = []
 for i, sample in enumerate(self.data[category]["data"]):
-if np.any(np.isnan(np.array(list(sample["softmax_over_choices"].values())))):
+if np.any(np.isnan(np.array(list(sample["logits_over_choices"].values())))):
 if not flag:
 print(
-f"NaN in the softmax, switch to exact match for category {category} in dataset {self.dataset_name} in model {self.model_name}."
+f"NaN in the logits, switch to exact match for category {category} in dataset {self.dataset_name} in model {self.model_name}."
 )
 flag = True
 score = 0
@@ -121,16 +121,14 @@ class DatasetEvaluator(object):
 sample["output"], ref, all_classes=self.data[category]["inference_kwargs"]["all_classes"]
 ),
 )
-softmaxs.append(references[i] if score == 1 else -1)
+logits.append(references[i] if score == 1 else -1)
 else:
-softmaxs.append(np.argmax(np.array(list(sample["softmax_over_choices"].values()))))
+logits.append(np.argmax(np.array(list(sample["logits_over_choices"].values()))))
 metric_method = eval("metric_helper." + metric)
 total_score = 0.0
-for prediction, reference, references_label, softmax in zip(
-predictions, references, references_labels, softmaxs
-):
+for prediction, reference, references_label, softmax in zip(predictions, references, references_labels, logits):
 score = 0.0
 for ref in reference:
@@ -281,7 +279,9 @@ class DatasetEvaluator(object):
 return self.evaluation_results
-def get_evaluation_results(self, data: List[Dict], dataset_name: str, model_name: str, metrics: List[str]):
+def get_evaluation_results(
+    self, data: Dict[str, Union[str, Dict]], dataset_name: str, model_name: str, metrics: List[str]
+):
 """
 Evaluate inference data on the given metrics.
@@ -292,10 +292,11 @@ class DatasetEvaluator(object):
 metrics: Metrics used to evaluate.
 """
-self.data = data
+self.data = data["inference_results"]
 self.dataset_name = dataset_name
+self.dataset_class = data["dataset_class"]
 self.model_name = model_name
-self.categories = list(data.keys())
+self.categories = list(self.data.keys())
 self.metrics = metrics
 self.judgements = {}
@@ -315,9 +316,7 @@ class DatasetEvaluator(object):
 for metric in self.metrics:
 # Train and reference split use same metric as test split.
-self.suggested_categories[metric] = metric_helper.metrics4subcategory[self.dataset_name.split("_")[0]][
-metric
-]
+self.suggested_categories[metric] = metric_helper.metrics4subcategory[self.dataset_class][metric]
 if "ALL" in self.suggested_categories[metric]:
 self.suggested_categories[metric] = self.categories
 self.metric_total_length[metric] = self.total_length
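The rename from softmax_over_choices to logits_over_choices keeps the same fallback logic: whenever a sample's per-choice logits contain NaN, the evaluator switches to exact matching for that category. A simplified sketch of the per-sample decision; the field names mirror the diff, while the exact-match score is passed in rather than recomputed here:

```python
# Simplified sketch of the NaN fallback used in DatasetEvaluator above.
# Field names mirror the diff; the exact-match score is supplied by the caller.
import numpy as np

def choose_prediction(sample: dict, reference_idx: int, exact_match_score: int) -> int:
    per_choice = np.array(list(sample["logits_over_choices"].values()))
    if np.any(np.isnan(per_choice)):
        # Logits are unusable: fall back to exact matching and keep the reference
        # index only when the textual answer matched, otherwise record -1.
        return reference_idx if exact_match_score == 1 else -1
    # Normal path: pick the choice with the highest logit.
    return int(np.argmax(per_choice))

sample = {"logits_over_choices": {"A": -1.2, "B": 3.4, "C": 0.5, "D": float("nan")}}
print(choose_prediction(sample, reference_idx=1, exact_match_score=1))  # -> 1 (NaN triggers the fallback)
```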

View File

@@ -25,7 +25,7 @@ metrics4subcategory = {
 "per_byte_ppl_score": ["ALL"],
 },
 # The commented are non 4-choice questions.
-"agieval": {
+"AGIEvalDataset": {
 "combined_single_choice_accuracy": [
 # "lsat-ar",
 # "lsat-lr",
@@ -103,14 +103,14 @@ metrics4subcategory = {
 ],
 "ppl_score": ["ALL"],
 },
-"cmmlu": {
+"CMMLUDataset": {
 "first_token_accuracy": ["ALL"],
 "single_choice_accuracy": ["ALL"],
 "perplexity": ["ALL"],
 "ppl_score_over_choices": ["ALL"],
 "ppl_score": ["ALL"],
 },
-"gaokaobench": {
+"GaoKaoBenchDataset": {
 "combined_single_choice_accuracy": [
 "English MCQs",
 "Biology MCQs",
@@ -170,7 +170,7 @@ metrics4subcategory = {
 "ppl_score_over_choices": ["ALL"],
 "ppl_score": ["ALL"],
 },
-"longbench": {
+"LongBenchDataset": {
 "f1_score": ["hotpotqa", "2wikimqa", "musique", "narrativeqa", "qasper", "multifieldqa_en", "triviaqa"],
 "f1_zh_score": ["multifieldqa_zh"],
 "rouge_score": ["gov_report", "qmsum", "multi_news", "samsum"],
@@ -183,7 +183,7 @@ metrics4subcategory = {
 "perplexity": ["ALL"],
 "ppl_score": ["ALL"],
 },
-"mmlu": {
+"MMLUDataset": {
 "first_token_accuracy": ["ALL"],
 "single_choice_accuracy": ["ALL"],
 "accuracy": ["ALL"],
@@ -191,11 +191,11 @@ metrics4subcategory = {
 "ppl_score_over_choices": ["ALL"],
 "ppl_score": ["ALL"],
 },
-"mtbench": {"mtbench_single_judge": ["ALL"]},
-"cvalues": {"first_token_accuracy": ["ALL"]},
-"safetybench_zh": {"first_token_accuracy": ["ALL"]},
-"safetybench_en": {"first_token_accuracy": ["ALL"]},
-"gsm": {
+"MTBenchDataset": {"mtbench_single_judge": ["ALL"]},
+"CValuesDataset": {"first_token_accuracy": ["ALL"]},
+"SafetyBenchZHDataset": {"first_token_accuracy": ["ALL"]},
+"SafetyBenchENDataset": {"first_token_accuracy": ["ALL"]},
+"GSMDataset": {
 "loss_over_all_tokens": ["ALL"],
 "gsm_accuracy": ["ALL"],
 },
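Keying metrics4subcategory by dataset class name (for example MMLUDataset instead of mmlu) lets the evaluator index the table with the dataset_class stored alongside the inference results instead of parsing the dataset name. A small sketch of that lookup against a trimmed copy of the table:

```python
# Trimmed-down copy of the table above, keyed by dataset class name as in the diff.
metrics4subcategory = {
    "MMLUDataset": {"first_token_accuracy": ["ALL"], "ppl_score": ["ALL"]},
    "GSMDataset": {"loss_over_all_tokens": ["ALL"], "gsm_accuracy": ["ALL"]},
}

def suggested_categories(dataset_class: str, metric: str, all_categories: list) -> list:
    categories = metrics4subcategory[dataset_class][metric]
    # "ALL" means the metric applies to every category present in the dataset.
    return all_categories if "ALL" in categories else categories

print(suggested_categories("MMLUDataset", "first_token_accuracy", ["college_biology", "law"]))
# -> ['college_biology', 'law']
```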

View File

@@ -116,10 +116,10 @@ class HuggingFaceModel(BaseModel):
 shard_config: Shard config for tensor parallel.
 """
-model_kwargs.setdefault("torch_dtype", torch.float16)
 if "torch_dtype" in model_kwargs:
 model_kwargs["torch_dtype"] = eval(model_kwargs["torch_dtype"])
+else:
+    model_kwargs.setdefault("torch_dtype", torch.float16)
 if "config" in model_kwargs:
 model_kwargs["config"] = AutoConfig.from_pretrained(model_kwargs["config"])
@@ -586,11 +586,10 @@ class HuggingFaceCausalLM(HuggingFaceModel):
 shard_config: Shard config for tensor parallel.
 """
-model_kwargs.setdefault("torch_dtype", torch.float16)
 if "torch_dtype" in model_kwargs:
 model_kwargs["torch_dtype"] = eval(model_kwargs["torch_dtype"])
+else:
+    model_kwargs.setdefault("torch_dtype", torch.float16)
 if "config" in model_kwargs:
 model_kwargs["config"] = AutoConfig.from_pretrained(model_kwargs["config"])
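The reordering above matters: with setdefault executed first, a config that omitted torch_dtype always entered the eval branch holding a real torch.dtype object and crashed, since eval only accepts strings. A standalone sketch of the corrected precedence, using a lookup table in place of eval purely for illustration:

```python
# Standalone sketch of the corrected torch_dtype handling shown above.
# A lookup table replaces eval() here purely for illustration.
import torch

_DTYPES = {"torch.float16": torch.float16, "torch.bfloat16": torch.bfloat16, "torch.float32": torch.float32}

def resolve_torch_dtype(model_kwargs: dict) -> dict:
    if "torch_dtype" in model_kwargs:
        # The config stores the dtype as a string such as "torch.float16".
        model_kwargs["torch_dtype"] = _DTYPES[model_kwargs["torch_dtype"]]
    else:
        # Only fall back to fp16 when the caller did not request a dtype.
        model_kwargs.setdefault("torch_dtype", torch.float16)
    return model_kwargs

print(resolve_torch_dtype({}))                                # {'torch_dtype': torch.float16}
print(resolve_torch_dtype({"torch_dtype": "torch.float32"}))  # {'torch_dtype': torch.float32}
```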

View File

@@ -15,7 +15,13 @@ from colossalai.shardformer import ShardConfig
 logger = get_dist_logger()
-def rm_and_merge(dp_size: int, save_path: str, model_names: List[str], dataset_names: Dict[str, List]) -> None:
+def rm_and_merge(
+    dp_size: int,
+    save_path: str,
+    model_names: List[str],
+    dataset_names: Dict[str, List],
+    dataset_classes: Dict[str, List],
+) -> None:
 """
 Remove inference result per rank and merge them into one file.
@@ -24,11 +30,15 @@ def rm_and_merge(dp_size: int, save_path: str, model_names: List[str], dataset_n
 save_path: The folder for storing inference results.
 model_names: Names of models for inference.
 dataset_names: Names of dataset for inference.
+dataset_classes: Dataset class for each inference result; the class is saved so the evaluation stage can pick the matching metrics.
 """
 for model_name in model_names:
 for dataset_name, categories in dataset_names.items():
+all_answers_with_dataset_class = {}
+all_answers_with_dataset_class["dataset_class"] = dataset_classes[dataset_name]
 all_answers = {}
 for category in categories:
 all_answers[category] = {"data": []}
@@ -58,8 +68,13 @@ def rm_and_merge(dp_size: int, save_path: str, model_names: List[str], dataset_n
 all_answers[category] = answers
+all_answers_with_dataset_class["inference_results"] = all_answers
 logger.info(f"Save inference results of model {model_name} on dataset {dataset_name}.")
-utils.jdump(all_answers, os.path.join(save_path, model_name, f"{dataset_name}_inference_results.json"))
+utils.jdump(
+    all_answers_with_dataset_class,
+    os.path.join(save_path, model_name, f"{dataset_name}_inference_results.json"),
+)
 logger.info(f"Save inference results of model {model_name} for all dataset.")
 logger.info(f"Save inference results of all models for all dataset.")
@@ -98,6 +113,7 @@ def main(args):
 )
 inference_data = {}
+dataset_classes = {}
 debug_args = {}
 few_shot_args = {}
 multiturn_args = {}
@@ -128,6 +144,7 @@ def main(args):
 continue
+dataset_classes[dataset_name] = dataset_parameter["dataset_class"]
 dataset_class = eval(f"dataset.{dataset_parameter['dataset_class']}")
 if not issubclass(dataset_class, dataset.BaseDataset):
 raise ValueError(f"Dataset class {dataset_parameter['dataset_class']} is not a subclass of BaseDataset.")
@@ -149,12 +166,14 @@ def main(args):
 debug_args[new_dataset_name] = dataset_parameter["debug"]
 few_shot_args[new_dataset_name] = dataset_parameter["few_shot"]
 inference_data[new_dataset_name] = dataset_.dataset["train"]
+dataset_classes[new_dataset_name] = dataset_parameter["dataset_class"]
 if load_reference and "reference" in dataset_.dataset:
 new_dataset_name = f"{dataset_name}_reference"
 debug_args[new_dataset_name] = dataset_parameter["debug"]
 few_shot_args[new_dataset_name] = dataset_parameter["few_shot"]
 inference_data[new_dataset_name] = dataset_.dataset["reference"]
+dataset_classes[new_dataset_name] = dataset_parameter["dataset_class"]
 if rank == 0:
 logger.info(f"Dataset for inference are: {list(inference_data.keys())}")
@@ -225,7 +244,7 @@ def main(args):
 if rank == 0:
 model_names = [model_parameter["name"] for model_parameter in model_parameters]
 dataset_names = {key: list(inference_data[key].keys()) for key in inference_data}
-rm_and_merge(dp_size, args.inference_save_path, model_names, dataset_names)
+rm_and_merge(dp_size, args.inference_save_path, model_names, dataset_names, dataset_classes)
 if __name__ == "__main__":
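With these changes, each `{dataset_name}_inference_results.json` written by rm_and_merge wraps the per-category answers together with the dataset class, which is exactly what DatasetEvaluator.get_evaluation_results unpacks earlier in this diff. A sketch of the merged layout; only the two top-level keys come from the diff, the answer fields are placeholders:

```python
# Sketch of the merged file layout produced by rm_and_merge after this change.
# Only "dataset_class" and "inference_results" are taken from the diff; the rest is placeholder data.
import json

all_answers = {
    "default": {"data": [{"input": "...", "output": "...", "logits_over_choices": {"A": 0.1, "B": 2.3}}]},
}
all_answers_with_dataset_class = {
    "dataset_class": "MMLUDataset",       # recorded so evaluation can pick the matching metrics
    "inference_results": all_answers,     # per-category answers merged across data-parallel ranks
}
print(json.dumps(all_answers_with_dataset_class, indent=2))
```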

View File

@@ -15,7 +15,13 @@ from colossalai.shardformer import ShardConfig
 logger = get_dist_logger()
-def rm_and_merge(dp_size: int, save_path: str, model_names: List[str], dataset_names: Dict[str, List]) -> None:
+def rm_and_merge(
+    dp_size: int,
+    save_path: str,
+    model_names: List[str],
+    dataset_names: Dict[str, List],
+    dataset_classes: Dict[str, List],
+) -> None:
 """
 Remove inference result per rank and merge them into one file.
@@ -24,11 +30,15 @@ def rm_and_merge(dp_size: int, save_path: str, model_names: List[str], dataset_n
 save_path: The folder for storing inference results.
 model_names: Names of models for inference.
 dataset_names: Names of dataset for inference.
+dataset_classes: Dataset class for each inference result; the class is saved so the evaluation stage can pick the matching metrics.
 """
 for model_name in model_names:
 for dataset_name, categories in dataset_names.items():
+all_answers_with_dataset_class = {}
+all_answers_with_dataset_class["dataset_class"] = dataset_classes[dataset_name]
 all_answers = {}
 for category in categories:
 all_answers[category] = {"data": []}
@@ -58,8 +68,13 @@ def rm_and_merge(dp_size: int, save_path: str, model_names: List[str], dataset_n
 all_answers[category] = answers
+all_answers_with_dataset_class["inference_results"] = all_answers
 logger.info(f"Save inference results of model {model_name} on dataset {dataset_name}.")
-utils.jdump(all_answers, os.path.join(save_path, model_name, f"{dataset_name}_inference_results.json"))
+utils.jdump(
+    all_answers_with_dataset_class,
+    os.path.join(save_path, model_name, f"{dataset_name}_inference_results.json"),
+)
 logger.info(f"Save inference results of model {model_name} for all dataset.")
 logger.info(f"Save inference results of all models for all dataset.")
@@ -98,6 +113,7 @@ def main(args):
 )
 inference_data = {}
+dataset_classes = {}
 debug_args = {}
 few_shot_args = {}
 multiturn_args = {}
@@ -128,6 +144,7 @@ def main(args):
 continue
+dataset_classes[dataset_name] = dataset_parameter["dataset_class"]
 dataset_class = eval(f"dataset.{dataset_parameter['dataset_class']}")
 if not issubclass(dataset_class, dataset.BaseDataset):
 raise ValueError(f"Dataset class {dataset_parameter['dataset_class']} is not a subclass of BaseDataset.")
@@ -149,12 +166,14 @@ def main(args):
 debug_args[new_dataset_name] = dataset_parameter["debug"]
 few_shot_args[new_dataset_name] = dataset_parameter["few_shot"]
 inference_data[new_dataset_name] = dataset_.dataset["train"]
+dataset_classes[new_dataset_name] = dataset_parameter["dataset_class"]
 if load_reference and "reference" in dataset_.dataset:
 new_dataset_name = f"{dataset_name}_reference"
 debug_args[new_dataset_name] = dataset_parameter["debug"]
 few_shot_args[new_dataset_name] = dataset_parameter["few_shot"]
 inference_data[new_dataset_name] = dataset_.dataset["reference"]
+dataset_classes[new_dataset_name] = dataset_parameter["dataset_class"]
 if rank == 0:
 logger.info(f"Dataset for inference are: {list(inference_data.keys())}")
@@ -225,7 +244,7 @@ def main(args):
 if rank == 0:
 model_names = [model_parameter["name"] for model_parameter in model_parameters]
 dataset_names = {key: list(inference_data[key].keys()) for key in inference_data}
-rm_and_merge(dp_size, args.inference_save_path, model_names, dataset_names)
+rm_and_merge(dp_size, args.inference_save_path, model_names, dataset_names, dataset_classes)
 if __name__ == "__main__":

View File

@@ -437,6 +437,10 @@ class GeminiPlugin(DPPluginBase):
 enable_sequence_overlap=self.enable_sequence_overlap,
 )
+def __del__(self):
+    """Destroy the process groups in ProcessGroupMesh."""
+    self.pg_mesh.destroy_mesh_process_groups()
 def support_no_sync(self) -> bool:
 return False

View File

@@ -1054,6 +1054,10 @@ class HybridParallelPlugin(PipelinePluginBase):
 self.max_norm = max_norm
+def __del__(self):
+    """Destroy the process groups in ProcessGroupMesh."""
+    self.pg_mesh.destroy_mesh_process_groups()
 @property
 def enable_pipeline_parallelism(self) -> bool:
 return self.pp_size > 1

View File

@@ -45,7 +45,7 @@ class ProcessGroupMesh:
 self._ranks_to_group: Dict[Tuple[int, ...], ProcessGroup] = {}
 self._group_to_ranks: Dict[ProcessGroup, Tuple[int, ...]] = {}
-def __del__(self):
+def destroy_mesh_process_groups(self):
 r"""
 Destructor method for the ProcessGroupMesh class.
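Renaming ProcessGroupMesh.__del__ to destroy_mesh_process_groups turns implicit garbage-collection cleanup into an explicit call, which the Gemini and hybrid-parallel plugins now trigger from their own __del__ methods shown earlier in this diff. A minimal sketch of the ownership pattern with stand-in classes, not the real implementations:

```python
# Minimal sketch of the explicit-destroy pattern; MiniMesh/MiniPlugin are stand-ins,
# not the real ProcessGroupMesh or plugin classes.
import torch.distributed as dist

class MiniMesh:
    def __init__(self):
        self._groups = []  # process groups created lazily by the mesh

    def destroy_mesh_process_groups(self):
        # Tear down every group this mesh created; called explicitly, exactly once.
        for group in self._groups:
            dist.destroy_process_group(group)
        self._groups.clear()

class MiniPlugin:
    def __init__(self):
        self.pg_mesh = MiniMesh()

    def __del__(self):
        # The plugin owns the mesh, so it decides when the groups are destroyed.
        self.pg_mesh.destroy_mesh_process_groups()
```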

View File

@@ -7,6 +7,12 @@ try:
 except:
 fused_mix_prec_layer_norm_cuda = None
+try:
+    import fused_weight_gradient_mlp_cuda
+    _grad_accum_fusion_available = True
+except ImportError:
+    _grad_accum_fusion_available = False
 class FusedLayerNormAffineFunction1D(torch.autograd.Function):
 r"""Layernorm
@@ -141,7 +147,19 @@ class LinearWithAsyncCommunication(torch.autograd.Function):
 # all-reduce scheduled first and have GPU resources allocated
 _ = torch.empty(1, device=grad_output.device) + 1
-grad_weight = grad_output.t().matmul(total_input)
+if _grad_accum_fusion_available and weight.grad is not None:
+    grad = weight.grad
+    if grad.dtype == torch.float32:
+        fused_weight_gradient_mlp_cuda.wgrad_gemm_accum_fp32(total_input, grad_output, grad)
+        grad_weight = None
+    elif grad.dtype == torch.float16:
+        fused_weight_gradient_mlp_cuda.wgrad_gemm_accum_fp16(total_input, grad_output, grad)
+        grad_weight = None
+    else:
+        grad_weight = grad_output.t().matmul(total_input)
+else:
+    grad_weight = grad_output.t().matmul(total_input)
 grad_bias = grad_output.sum(dim=0) if use_bias else None
 if ctx.async_grad_allreduce:
@@ -214,7 +232,19 @@ class _LinearWithGatherForwardReduceScatterBackward(torch.autograd.Function):
 # reduce-scatter scheduled first and have GPU resources allocated
 _ = torch.empty(1, device=grad_output.device) + 1
-grad_weight = grad_output.t().matmul(total_input)
+if _grad_accum_fusion_available and weight.grad is not None:
+    grad = weight.grad
+    if grad.dtype == torch.float32:
+        fused_weight_gradient_mlp_cuda.wgrad_gemm_accum_fp32(total_input, grad_output, grad)
+        grad_weight = None
+    elif grad.dtype == torch.float16:
+        fused_weight_gradient_mlp_cuda.wgrad_gemm_accum_fp16(total_input, grad_output, grad)
+        grad_weight = None
+    else:
+        grad_weight = grad_output.t().matmul(total_input)
+else:
+    grad_weight = grad_output.t().matmul(total_input)
 grad_bias = grad_output.sum(dim=0) if use_bias else None
 if ctx.async_grad_reduce_scatter:
@@ -249,7 +279,20 @@ class _LinearWithGatherForwardReduceScatterBackward(torch.autograd.Function):
 # calculate gradient
 if len(input_parallel.shape) > 2:
 input_parallel = input_parallel.view(-1, input_parallel.shape[-1])
-grad_weight = grad_output.t().matmul(input_parallel)
+if _grad_accum_fusion_available and weight.grad is not None:
+    grad = weight.grad
+    if grad.dtype == torch.float32:
+        fused_weight_gradient_mlp_cuda.wgrad_gemm_accum_fp32(input_parallel, grad_output, grad)
+        grad_weight = None
+    elif grad.dtype == torch.float16:
+        fused_weight_gradient_mlp_cuda.wgrad_gemm_accum_fp16(input_parallel, grad_output, grad)
+        grad_weight = None
+    else:
+        grad_weight = grad_output.t().matmul(input_parallel)
+else:
+    grad_weight = grad_output.t().matmul(input_parallel)
+# grad_weight = grad_output.t().matmul(input_parallel)
 # wait until reduce-scatter finished
 reducescatter_handle.wait()
@@ -388,7 +431,7 @@ class _MatmulWithGatherForwardReduceScatterBackward(torch.autograd.Function):
 input_parallel = torch.cat(tensor_list, dim=dim).contiguous()
 # calculate gradient
 if len(input_parallel.shape) > 2:
 input_parallel = input_parallel.view(-1, input_parallel.shape[-1])
 grad_weight = input_parallel.t().matmul(grad_output)
 # wait until reduce-scatter finished
 reducescatter_handle.wait()
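All three backward passes above gain the same dispatch: when Apex's fused_weight_gradient_mlp_cuda extension is importable and the weight already holds a gradient buffer, the weight gradient is accumulated in place by the fused fp32/fp16 kernel and None is returned; otherwise the original matmul path is kept. A condensed sketch of that dispatch, assuming the extension's entry points exactly as used in the diff:

```python
# Condensed sketch of the dispatch added above. The fused kernels come from Apex's
# fused_weight_gradient_mlp_cuda extension; the matmul branch is the original fallback.
import torch

try:
    import fused_weight_gradient_mlp_cuda
    _grad_accum_fusion_available = True
except ImportError:
    _grad_accum_fusion_available = False

def compute_weight_grad(total_input: torch.Tensor, grad_output: torch.Tensor, weight: torch.Tensor):
    if _grad_accum_fusion_available and weight.grad is not None:
        grad = weight.grad
        if grad.dtype == torch.float32:
            # Accumulate directly into weight.grad with the fused fp32 kernel.
            fused_weight_gradient_mlp_cuda.wgrad_gemm_accum_fp32(total_input, grad_output, grad)
            return None
        if grad.dtype == torch.float16:
            fused_weight_gradient_mlp_cuda.wgrad_gemm_accum_fp16(total_input, grad_output, grad)
            return None
    # No fused kernel (or an unsupported dtype): return a freshly computed gradient tensor.
    return grad_output.t().matmul(total_input)
```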

View File

@@ -408,7 +408,7 @@ class Linear1D_Row(ParallelModule):
 handle.wait()
 output = torch.cat(output_parallel_list, dim=-1)
 else:
-output_parallel = F.linear(input_, self.weight)
+output_parallel = linear_with_async_comm(input_, self.weight, None, None, False)
 if self.seq_parallel:
 output = linear_reducescatter_forward_gather_backward(
 output_parallel, self.process_group, self.seq_parallel_dim

View File

@@ -414,7 +414,7 @@ class LlamaPipelineForwards:
 return {"hidden_states": hidden_states}
-def get_llama_flash_attention_forward():
+def get_llama_flash_attention_forward(shard_config: ShardConfig):
 from transformers.models.llama.modeling_llama import LlamaAttention, apply_rotary_pos_emb
 from colossalai.kernel.cuda_native import AttnMaskType, ColoAttention
@@ -470,14 +470,13 @@ def get_llama_flash_attention_forward():
 flash_attention_mask = None
 attn_mask_type = AttnMaskType.causal
-if attention_mask != None:
+if not getattr(shard_config, "causal_lm", False) and attention_mask != None:
 if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
 raise ValueError(
 f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
 )
 flash_attention_mask = ~(attention_mask[:, :, -1].squeeze(1).to(torch.bool)).contiguous()
-if not torch.all(flash_attention_mask):
-attn_mask_type = AttnMaskType.paddedcausal
+attn_mask_type = AttnMaskType.paddedcausal
 attention = ColoAttention(embed_dim=self.hidden_size, num_heads=self.num_heads)
 attn_output = attention(

View File

@@ -130,7 +130,7 @@ class LlamaPolicy(Policy):
 if self.shard_config.enable_flash_attention:
 self.append_or_create_method_replacement(
 description={
-"forward": get_llama_flash_attention_forward(),
+"forward": get_llama_flash_attention_forward(self.shard_config),
 },
 policy=policy,
 target_key=LlamaAttention,
@@ -250,6 +250,8 @@ class LlamaForCausalLMPolicy(LlamaPolicy):
 policy = super().module_policy()
+setattr(self.shard_config, "causal_lm", True)
 if self.shard_config.enable_tensor_parallelism:
 # add a new item for causal lm
 new_item = {
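The two hunks above cooperate: LlamaForCausalLMPolicy sets a causal_lm flag on the shared ShardConfig, and the flash-attention forward reads it with getattr (defaulting to False) to skip deriving a padding mask for causal-LM models. A stripped-down sketch of that hand-off with stand-in objects, for illustration only:

```python
# Stripped-down sketch of the causal_lm hand-off; SimpleNamespace stands in for ShardConfig
# and plain strings stand in for AttnMaskType values.
from types import SimpleNamespace

def get_flash_attention_forward(shard_config):
    def forward(attention_mask=None):
        attn_mask_type = "causal"
        flash_attention_mask = None
        # Only non-causal-LM models need a padding mask derived from attention_mask.
        if not getattr(shard_config, "causal_lm", False) and attention_mask is not None:
            flash_attention_mask = ~attention_mask.bool()
            attn_mask_type = "paddedcausal"
        return attn_mask_type, flash_attention_mask
    return forward

shard_config = SimpleNamespace()
setattr(shard_config, "causal_lm", True)  # done by LlamaForCausalLMPolicy.module_policy()
print(get_flash_attention_forward(shard_config)(attention_mask=None))  # -> ('causal', None)
```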

View File

@@ -368,7 +368,7 @@ Colossal-AI provides you with a collection of parallel components. Our goal is to let your
 Requirements:
-- PyTorch >= 1.11 (PyTorch 2.x support in progress)
+- PyTorch >= 1.11 and PyTorch <= 2.1
 - Python >= 3.7
 - CUDA >= 11.0
 - [NVIDIA GPU Compute Capability](https://developer.nvidia.com/cuda-gpus) >= 7.0 (V100/RTX20 and higher)

View File

@@ -1,7 +1,7 @@
 # Setup
 Requirements:
-- PyTorch >= 1.11 (PyTorch 2.x in progress)
+- PyTorch >= 1.11 and PyTorch <= 2.1
 - Python >= 3.7
 - CUDA >= 11.0
 - [NVIDIA GPU Compute Capability](https://developer.nvidia.com/cuda-gpus) >= 7.0 (V100/RTX20 and higher)

View File

@@ -2,7 +2,7 @@
 Requirements:
-- PyTorch >= 1.11 (PyTorch 2.x support in progress)
+- PyTorch >= 1.11 and PyTorch <= 2.1
 - Python >= 3.7
 - CUDA >= 11.0
 - [NVIDIA GPU Compute Capability](https://developer.nvidia.com/cuda-gpus) >= 7.0 (V100/RTX20 and higher)

View File

@@ -1,5 +1,4 @@
 diffusers
-fbgemm-gpu==0.2.0
 pytest
 coverage==7.2.3
 git+https://github.com/hpcaitech/pytest-testmon
@@ -16,7 +15,7 @@ triton==2.1.0
 requests==2.27.1 # downgrade to avoid huggingface error https://github.com/huggingface/transformers/issues/17611
 SentencePiece
 ninja
-flash_attn==2.0.5
+flash_attn
 datasets
 pydantic
 ray