mirror of https://github.com/hpcaitech/ColossalAI
Improve logic for selecting metrics (#5196)
Co-authored-by: Xu <yuanchen.xu00@gmail.com>
branch: pull/5207/head
parent 4fa689fca1
commit eae01b6740
@@ -1,5 +1,5 @@
 import os
-from typing import Dict, List
+from typing import Dict, List, Union

 import colossal_eval.evaluate.dataset_evaluator.metrics as metric_helper
 import numpy as np
@@ -279,7 +279,9 @@ class DatasetEvaluator(object):

         return self.evaluation_results

-    def get_evaluation_results(self, data: List[Dict], dataset_name: str, model_name: str, metrics: List[str]):
+    def get_evaluation_results(
+        self, data: Dict[str, Union[str, Dict]], dataset_name: str, model_name: str, metrics: List[str]
+    ):
         """
         Evaluate inference data on the given metrics.

@@ -290,10 +292,11 @@ class DatasetEvaluator(object):
             metrics: Metrics used to evaluate.

         """
-        self.data = data
+        self.data = data["inference_results"]
         self.dataset_name = dataset_name
+        self.dataset_class = data["dataset_class"]
         self.model_name = model_name
-        self.categories = list(data.keys())
+        self.categories = list(self.data.keys())
         self.metrics = metrics
         self.judgements = {}

@@ -313,9 +316,7 @@ class DatasetEvaluator(object):

         for metric in self.metrics:
            # Train and reference split use same metric as test split.
-            self.suggested_categories[metric] = metric_helper.metrics4subcategory[self.dataset_name.split("_")[0]][
-                metric
-            ]
+            self.suggested_categories[metric] = metric_helper.metrics4subcategory[self.dataset_class][metric]
             if "ALL" in self.suggested_categories[metric]:
                 self.suggested_categories[metric] = self.categories
                 self.metric_total_length[metric] = self.total_length
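Taken together, the evaluator hunks above change both the shape of the payload passed to get_evaluation_results and the key used to select metrics. A minimal sketch of the new flow, assuming colossal_eval is installed; the category name and payload contents are illustrative, not taken from the PR:

import colossal_eval.evaluate.dataset_evaluator.metrics as metric_helper

# Payload in the new format: the dataset class is saved next to the inference results.
data = {
    "dataset_class": "CMMLUDataset",
    "inference_results": {"agronomy": {"data": []}},  # "agronomy" is a made-up category
}

# New behaviour: metrics are selected by the stored dataset class ...
suggested = metric_helper.metrics4subcategory[data["dataset_class"]]["first_token_accuracy"]

# ... instead of the old name-derived key, metrics4subcategory[dataset_name.split("_")[0]].
print(suggested)  # ["ALL"], which the evaluator then expands to every category in the results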
@@ -25,7 +25,7 @@ metrics4subcategory = {
         "per_byte_ppl_score": ["ALL"],
     },
     # The commented are non 4-choice questions.
-    "agieval": {
+    "AGIEvalDataset": {
         "combined_single_choice_accuracy": [
             # "lsat-ar",
             # "lsat-lr",
@@ -103,14 +103,14 @@ metrics4subcategory = {
         ],
         "ppl_score": ["ALL"],
     },
-    "cmmlu": {
+    "CMMLUDataset": {
         "first_token_accuracy": ["ALL"],
         "single_choice_accuracy": ["ALL"],
         "perplexity": ["ALL"],
         "ppl_score_over_choices": ["ALL"],
         "ppl_score": ["ALL"],
     },
-    "gaokaobench": {
+    "GaoKaoBenchDataset": {
         "combined_single_choice_accuracy": [
             "English MCQs",
             "Biology MCQs",
@@ -170,7 +170,7 @@ metrics4subcategory = {
         "ppl_score_over_choices": ["ALL"],
         "ppl_score": ["ALL"],
     },
-    "longbench": {
+    "LongBenchDataset": {
         "f1_score": ["hotpotqa", "2wikimqa", "musique", "narrativeqa", "qasper", "multifieldqa_en", "triviaqa"],
         "f1_zh_score": ["multifieldqa_zh"],
         "rouge_score": ["gov_report", "qmsum", "multi_news", "samsum"],
@@ -183,7 +183,7 @@ metrics4subcategory = {
         "perplexity": ["ALL"],
         "ppl_score": ["ALL"],
     },
-    "mmlu": {
+    "MMLUDataset": {
         "first_token_accuracy": ["ALL"],
         "single_choice_accuracy": ["ALL"],
         "accuracy": ["ALL"],
@@ -191,11 +191,11 @@ metrics4subcategory = {
         "ppl_score_over_choices": ["ALL"],
         "ppl_score": ["ALL"],
     },
-    "mtbench": {"mtbench_single_judge": ["ALL"]},
-    "cvalues": {"first_token_accuracy": ["ALL"]},
-    "safetybench_zh": {"first_token_accuracy": ["ALL"]},
-    "safetybench_en": {"first_token_accuracy": ["ALL"]},
-    "gsm": {
+    "MTBenchDataset": {"mtbench_single_judge": ["ALL"]},
+    "CValuesDataset": {"first_token_accuracy": ["ALL"]},
+    "SafetyBenchZHDataset": {"first_token_accuracy": ["ALL"]},
+    "SafetyBenchENDataset": {"first_token_accuracy": ["ALL"]},
+    "GSMDataset": {
         "loss_over_all_tokens": ["ALL"],
         "gsm_accuracy": ["ALL"],
     },
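The renamed keys mean metrics4subcategory is now indexed by the dataset class recorded at inference time rather than by a lowercase dataset name. Below is a rough sketch of how an "ALL" entry expands to every available category; resolve_categories is a hypothetical helper written for illustration, not a function from the repository:

from typing import Dict, List

def resolve_categories(
    table: Dict[str, Dict[str, List[str]]],
    dataset_class: str,
    metric: str,
    available_categories: List[str],
) -> List[str]:
    # Expand the "ALL" marker to every category present in the inference results.
    suggested = table[dataset_class][metric]
    return available_categories if "ALL" in suggested else suggested

# Example against the table above; the category names are illustrative.
table = {"MMLUDataset": {"first_token_accuracy": ["ALL"]}}
print(resolve_categories(table, "MMLUDataset", "first_token_accuracy", ["anatomy", "astronomy"]))
# -> ["anatomy", "astronomy"]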
@@ -15,7 +15,13 @@ from colossalai.shardformer import ShardConfig
 logger = get_dist_logger()


-def rm_and_merge(dp_size: int, save_path: str, model_names: List[str], dataset_names: Dict[str, List]) -> None:
+def rm_and_merge(
+    dp_size: int,
+    save_path: str,
+    model_names: List[str],
+    dataset_names: Dict[str, List],
+    dataset_classes: Dict[str, List],
+) -> None:
     """
     Remove inference result per rank and merge them into one file.

@@ -24,11 +30,15 @@ def rm_and_merge(dp_size: int, save_path: str, model_names: List[str], dataset_n
         save_path: The folder for storing inference results.
         model_names: Names of models for inference.
         dataset_names: Names of dataset for inference.
+        dataset_classes: Dataset class for different inference results. We need to save dataset class to smooth the evaluation process.

     """

     for model_name in model_names:
         for dataset_name, categories in dataset_names.items():
+            all_answers_with_dataset_class = {}
+            all_answers_with_dataset_class["dataset_class"] = dataset_classes[dataset_name]
+
             all_answers = {}
             for category in categories:
                 all_answers[category] = {"data": []}
@@ -58,8 +68,13 @@ def rm_and_merge(dp_size: int, save_path: str, model_names: List[str], dataset_n

                 all_answers[category] = answers

+            all_answers_with_dataset_class["inference_results"] = all_answers
+
             logger.info(f"Save inference results of model {model_name} on dataset {dataset_name}.")
-            utils.jdump(all_answers, os.path.join(save_path, model_name, f"{dataset_name}_inference_results.json"))
+            utils.jdump(
+                all_answers_with_dataset_class,
+                os.path.join(save_path, model_name, f"{dataset_name}_inference_results.json"),
+            )

         logger.info(f"Save inference results of model {model_name} for all dataset.")
     logger.info(f"Save inference results of all models for all dataset.")
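After this hunk, the merged per-dataset file written by rm_and_merge wraps the answers with their dataset class, which is exactly what get_evaluation_results unpacks on the evaluation side. Roughly, the JSON saved to {dataset_name}_inference_results.json now has this shape (the values are illustrative; per-sample records are appended to each "data" list in practice):

import json

merged = {
    "dataset_class": "GSMDataset",  # recorded from the inference config
    "inference_results": {
        "test": {"data": []},  # one entry per category, as before
    },
}
print(json.dumps(merged, indent=2))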
@@ -98,6 +113,7 @@ def main(args):
     )

     inference_data = {}
+    dataset_classes = {}
     debug_args = {}
     few_shot_args = {}
     multiturn_args = {}
@@ -128,6 +144,7 @@ def main(args):

             continue

+        dataset_classes[dataset_name] = dataset_parameter["dataset_class"]
         dataset_class = eval(f"dataset.{dataset_parameter['dataset_class']}")
         if not issubclass(dataset_class, dataset.BaseDataset):
             raise ValueError(f"Dataset class {dataset_parameter['dataset_class']} is not a subclass of BaseDataset.")
@@ -149,12 +166,14 @@ def main(args):
             debug_args[new_dataset_name] = dataset_parameter["debug"]
             few_shot_args[new_dataset_name] = dataset_parameter["few_shot"]
             inference_data[new_dataset_name] = dataset_.dataset["train"]
+            dataset_classes[new_dataset_name] = dataset_parameter["dataset_class"]

         if load_reference and "reference" in dataset_.dataset:
             new_dataset_name = f"{dataset_name}_reference"
             debug_args[new_dataset_name] = dataset_parameter["debug"]
             few_shot_args[new_dataset_name] = dataset_parameter["few_shot"]
             inference_data[new_dataset_name] = dataset_.dataset["reference"]
+            dataset_classes[new_dataset_name] = dataset_parameter["dataset_class"]

     if rank == 0:
         logger.info(f"Dataset for inference are: {list(inference_data.keys())}")
@@ -225,7 +244,7 @@ def main(args):
     if rank == 0:
         model_names = [model_parameter["name"] for model_parameter in model_parameters]
         dataset_names = {key: list(inference_data[key].keys()) for key in inference_data}
-        rm_and_merge(dp_size, args.inference_save_path, model_names, dataset_names)
+        rm_and_merge(dp_size, args.inference_save_path, model_names, dataset_names, dataset_classes)


 if __name__ == "__main__":
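Putting the main() changes together: dataset_classes is filled alongside inference_data, including the derived _train and _reference splits, and then rides along with the final rm_and_merge call on rank 0. A condensed sketch under the assumption that the config provides the fields shown in the diff; the dataset name here is made up:

from typing import Dict

dataset_classes: Dict[str, str] = {}

dataset_name = "cmmlu"  # illustrative
dataset_parameter = {"dataset_class": "CMMLUDataset", "debug": False, "few_shot": True}

# The test split and the derived train/reference splits all reuse the same class.
dataset_classes[dataset_name] = dataset_parameter["dataset_class"]
for suffix in ("train", "reference"):
    dataset_classes[f"{dataset_name}_{suffix}"] = dataset_parameter["dataset_class"]

# On rank 0 the classes are handed to the merge step, e.g.:
# rm_and_merge(dp_size, args.inference_save_path, model_names, dataset_names, dataset_classes)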