Improve logic for selecting metrics (#5196)

Co-authored-by: Xu <yuanchen.xu00@gmail.com>
Yuanchen 2023-12-22 14:52:50 +08:00 committed by GitHub
parent 4fa689fca1
commit eae01b6740
4 changed files with 62 additions and 23 deletions
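
In short: the inference scripts now record each dataset's class name alongside its results, and the evaluator selects metric tables by that class name instead of deriving a key from the dataset name. A minimal sketch of the new result-file layout (all values illustrative, not taken from a real run):

# Illustrative shape of <dataset_name>_inference_results.json after this change:
# the new "dataset_class" key sits alongside the per-category results, which
# now live under "inference_results".
example_payload = {
    "dataset_class": "MMLUDataset",
    "inference_results": {
        "abstract_algebra": {"data": []},  # per-category answers, same as before
    },
}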

applications/ColossalEval/colossal_eval/evaluate/dataset_evaluator/dataset_evaluator.py

@@ -1,5 +1,5 @@
 import os
-from typing import Dict, List
+from typing import Dict, List, Union

 import colossal_eval.evaluate.dataset_evaluator.metrics as metric_helper
 import numpy as np
@@ -279,7 +279,9 @@ class DatasetEvaluator(object):
         return self.evaluation_results

-    def get_evaluation_results(self, data: List[Dict], dataset_name: str, model_name: str, metrics: List[str]):
+    def get_evaluation_results(
+        self, data: Dict[str, Union[str, Dict]], dataset_name: str, model_name: str, metrics: List[str]
+    ):
         """
         Evaluate inference data on the given metrics.
@@ -290,10 +292,11 @@
             metrics: Metrics used to evaluate.
         """
-        self.data = data
+        self.data = data["inference_results"]
         self.dataset_name = dataset_name
+        self.dataset_class = data["dataset_class"]
         self.model_name = model_name
-        self.categories = list(data.keys())
+        self.categories = list(self.data.keys())
         self.metrics = metrics
         self.judgements = {}
@@ -313,9 +316,7 @@
         for metric in self.metrics:
             # Train and reference split use same metric as test split.
-            self.suggested_categories[metric] = metric_helper.metrics4subcategory[self.dataset_name.split("_")[0]][
-                metric
-            ]
+            self.suggested_categories[metric] = metric_helper.metrics4subcategory[self.dataset_class][metric]
             if "ALL" in self.suggested_categories[metric]:
                 self.suggested_categories[metric] = self.categories
             self.metric_total_length[metric] = self.total_length

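The motivation for the lookup change, sketched below: the old key was derived from the dataset name, which breaks for configured names that themselves contain underscores, while the class name saved with the results always matches the metric table. Names here are illustrative.

# Old behavior: derive the metrics4subcategory key from the dataset name.
old_key = "safetybench_zh_train".split("_")[0]  # -> "safetybench", not a valid key
# New behavior: use the dataset class saved with the inference results.
new_key = "SafetyBenchZHDataset"                # always a top-level key of the table
print(old_key, new_key)
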
applications/ColossalEval/colossal_eval/evaluate/dataset_evaluator/metrics.py

@@ -25,7 +25,7 @@ metrics4subcategory = {
         "per_byte_ppl_score": ["ALL"],
     },
     # The commented are non 4-choice questions.
-    "agieval": {
+    "AGIEvalDataset": {
         "combined_single_choice_accuracy": [
             # "lsat-ar",
             # "lsat-lr",
@@ -103,14 +103,14 @@
         ],
         "ppl_score": ["ALL"],
     },
-    "cmmlu": {
+    "CMMLUDataset": {
         "first_token_accuracy": ["ALL"],
         "single_choice_accuracy": ["ALL"],
         "perplexity": ["ALL"],
         "ppl_score_over_choices": ["ALL"],
         "ppl_score": ["ALL"],
     },
-    "gaokaobench": {
+    "GaoKaoBenchDataset": {
         "combined_single_choice_accuracy": [
             "English MCQs",
             "Biology MCQs",
@@ -170,7 +170,7 @@
         "ppl_score_over_choices": ["ALL"],
         "ppl_score": ["ALL"],
     },
-    "longbench": {
+    "LongBenchDataset": {
         "f1_score": ["hotpotqa", "2wikimqa", "musique", "narrativeqa", "qasper", "multifieldqa_en", "triviaqa"],
         "f1_zh_score": ["multifieldqa_zh"],
         "rouge_score": ["gov_report", "qmsum", "multi_news", "samsum"],
@@ -183,7 +183,7 @@
         "perplexity": ["ALL"],
         "ppl_score": ["ALL"],
     },
-    "mmlu": {
+    "MMLUDataset": {
         "first_token_accuracy": ["ALL"],
         "single_choice_accuracy": ["ALL"],
         "accuracy": ["ALL"],
@@ -191,11 +191,11 @@
         "ppl_score_over_choices": ["ALL"],
         "ppl_score": ["ALL"],
     },
-    "mtbench": {"mtbench_single_judge": ["ALL"]},
-    "cvalues": {"first_token_accuracy": ["ALL"]},
-    "safetybench_zh": {"first_token_accuracy": ["ALL"]},
-    "safetybench_en": {"first_token_accuracy": ["ALL"]},
-    "gsm": {
+    "MTBenchDataset": {"mtbench_single_judge": ["ALL"]},
+    "CValuesDataset": {"first_token_accuracy": ["ALL"]},
+    "SafetyBenchZHDataset": {"first_token_accuracy": ["ALL"]},
+    "SafetyBenchENDataset": {"first_token_accuracy": ["ALL"]},
+    "GSMDataset": {
         "loss_over_all_tokens": ["ALL"],
         "gsm_accuracy": ["ALL"],
     },

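With dataset class names as top-level keys, a lookup now reads as follows (a sketch, assuming colossal_eval is installed; the expected values come from the table above):

from colossal_eval.evaluate.dataset_evaluator.metrics import metrics4subcategory

# "ALL" marks a metric that applies to every category of the dataset.
print(metrics4subcategory["CMMLUDataset"]["first_token_accuracy"])  # ["ALL"]
print(metrics4subcategory["LongBenchDataset"]["f1_zh_score"])       # ["multifieldqa_zh"]
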
applications/ColossalEval/examples/dataset_evaluation/inference.py

@@ -15,7 +15,13 @@ from colossalai.shardformer import ShardConfig

 logger = get_dist_logger()

-def rm_and_merge(dp_size: int, save_path: str, model_names: List[str], dataset_names: Dict[str, List]) -> None:
+def rm_and_merge(
+    dp_size: int,
+    save_path: str,
+    model_names: List[str],
+    dataset_names: Dict[str, List],
+    dataset_classes: Dict[str, List],
+) -> None:
     """
     Remove inference result per rank and merge them into one file.
@@ -24,11 +30,15 @@ def rm_and_merge(dp_size: int, save_path: str, model_names: List[str], dataset_n
         save_path: The folder for storing inference results.
         model_names: Names of models for inference.
         dataset_names: Names of dataset for inference.
+        dataset_classes: Dataset class for different inference results. We need to save dataset class to smooth the evaluation process.
     """

     for model_name in model_names:
         for dataset_name, categories in dataset_names.items():
+            all_answers_with_dataset_class = {}
+            all_answers_with_dataset_class["dataset_class"] = dataset_classes[dataset_name]
             all_answers = {}
             for category in categories:
                 all_answers[category] = {"data": []}
@@ -58,8 +68,13 @@ def rm_and_merge(dp_size: int, save_path: str, model_names: List[str], dataset_n
                 all_answers[category] = answers

+            all_answers_with_dataset_class["inference_results"] = all_answers
+
             logger.info(f"Save inference results of model {model_name} on dataset {dataset_name}.")
-            utils.jdump(all_answers, os.path.join(save_path, model_name, f"{dataset_name}_inference_results.json"))
+            utils.jdump(
+                all_answers_with_dataset_class,
+                os.path.join(save_path, model_name, f"{dataset_name}_inference_results.json"),
+            )

         logger.info(f"Save inference results of model {model_name} for all dataset.")
     logger.info(f"Save inference results of all models for all dataset.")
@@ -98,6 +113,7 @@ def main(args):
     )

     inference_data = {}
+    dataset_classes = {}
     debug_args = {}
     few_shot_args = {}
     multiturn_args = {}
@@ -128,6 +144,7 @@
             continue

+        dataset_classes[dataset_name] = dataset_parameter["dataset_class"]
         dataset_class = eval(f"dataset.{dataset_parameter['dataset_class']}")
         if not issubclass(dataset_class, dataset.BaseDataset):
             raise ValueError(f"Dataset class {dataset_parameter['dataset_class']} is not a subclass of BaseDataset.")
@@ -149,12 +166,14 @@
             debug_args[new_dataset_name] = dataset_parameter["debug"]
             few_shot_args[new_dataset_name] = dataset_parameter["few_shot"]
             inference_data[new_dataset_name] = dataset_.dataset["train"]
+            dataset_classes[new_dataset_name] = dataset_parameter["dataset_class"]

         if load_reference and "reference" in dataset_.dataset:
             new_dataset_name = f"{dataset_name}_reference"
             debug_args[new_dataset_name] = dataset_parameter["debug"]
             few_shot_args[new_dataset_name] = dataset_parameter["few_shot"]
             inference_data[new_dataset_name] = dataset_.dataset["reference"]
+            dataset_classes[new_dataset_name] = dataset_parameter["dataset_class"]

     if rank == 0:
         logger.info(f"Dataset for inference are: {list(inference_data.keys())}")
@@ -225,7 +244,7 @@
     if rank == 0:
         model_names = [model_parameter["name"] for model_parameter in model_parameters]
         dataset_names = {key: list(inference_data[key].keys()) for key in inference_data}
-        rm_and_merge(dp_size, args.inference_save_path, model_names, dataset_names)
+        rm_and_merge(dp_size, args.inference_save_path, model_names, dataset_names, dataset_classes)

 if __name__ == "__main__":

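A sketch of the updated call, with hypothetical dataset and model names: dataset_classes mirrors dataset_names key for key, including the derived *_train and *_reference entries.

# Assumes rm_and_merge from this script is in scope; all values are illustrative.
dataset_names = {"mmlu": ["abstract_algebra"], "mmlu_train": ["abstract_algebra"]}
dataset_classes = {"mmlu": "MMLUDataset", "mmlu_train": "MMLUDataset"}
rm_and_merge(
    dp_size=2,
    save_path="outputs/",
    model_names=["my-model"],
    dataset_names=dataset_names,
    dataset_classes=dataset_classes,
)
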
applications/ColossalEval/examples/gpt_evaluation/inference.py

@@ -15,7 +15,13 @@ from colossalai.shardformer import ShardConfig

 logger = get_dist_logger()

-def rm_and_merge(dp_size: int, save_path: str, model_names: List[str], dataset_names: Dict[str, List]) -> None:
+def rm_and_merge(
+    dp_size: int,
+    save_path: str,
+    model_names: List[str],
+    dataset_names: Dict[str, List],
+    dataset_classes: Dict[str, List],
+) -> None:
     """
     Remove inference result per rank and merge them into one file.
@@ -24,11 +30,15 @@ def rm_and_merge(dp_size: int, save_path: str, model_names: List[str], dataset_n
         save_path: The folder for storing inference results.
         model_names: Names of models for inference.
         dataset_names: Names of dataset for inference.
+        dataset_classes: Dataset class for different inference results. We need to save dataset class to smooth the evaluation process.
     """

     for model_name in model_names:
         for dataset_name, categories in dataset_names.items():
+            all_answers_with_dataset_class = {}
+            all_answers_with_dataset_class["dataset_class"] = dataset_classes[dataset_name]
             all_answers = {}
             for category in categories:
                 all_answers[category] = {"data": []}
@@ -58,8 +68,13 @@ def rm_and_merge(dp_size: int, save_path: str, model_names: List[str], dataset_n
                 all_answers[category] = answers

+            all_answers_with_dataset_class["inference_results"] = all_answers
+
             logger.info(f"Save inference results of model {model_name} on dataset {dataset_name}.")
-            utils.jdump(all_answers, os.path.join(save_path, model_name, f"{dataset_name}_inference_results.json"))
+            utils.jdump(
+                all_answers_with_dataset_class,
+                os.path.join(save_path, model_name, f"{dataset_name}_inference_results.json"),
+            )

         logger.info(f"Save inference results of model {model_name} for all dataset.")
     logger.info(f"Save inference results of all models for all dataset.")
@@ -98,6 +113,7 @@ def main(args):
     )

     inference_data = {}
+    dataset_classes = {}
     debug_args = {}
     few_shot_args = {}
     multiturn_args = {}
@@ -128,6 +144,7 @@
             continue

+        dataset_classes[dataset_name] = dataset_parameter["dataset_class"]
         dataset_class = eval(f"dataset.{dataset_parameter['dataset_class']}")
         if not issubclass(dataset_class, dataset.BaseDataset):
             raise ValueError(f"Dataset class {dataset_parameter['dataset_class']} is not a subclass of BaseDataset.")
@@ -149,12 +166,14 @@
             debug_args[new_dataset_name] = dataset_parameter["debug"]
             few_shot_args[new_dataset_name] = dataset_parameter["few_shot"]
             inference_data[new_dataset_name] = dataset_.dataset["train"]
+            dataset_classes[new_dataset_name] = dataset_parameter["dataset_class"]

         if load_reference and "reference" in dataset_.dataset:
             new_dataset_name = f"{dataset_name}_reference"
             debug_args[new_dataset_name] = dataset_parameter["debug"]
             few_shot_args[new_dataset_name] = dataset_parameter["few_shot"]
             inference_data[new_dataset_name] = dataset_.dataset["reference"]
+            dataset_classes[new_dataset_name] = dataset_parameter["dataset_class"]

     if rank == 0:
         logger.info(f"Dataset for inference are: {list(inference_data.keys())}")
@@ -225,7 +244,7 @@
     if rank == 0:
         model_names = [model_parameter["name"] for model_parameter in model_parameters]
         dataset_names = {key: list(inference_data[key].keys()) for key in inference_data}
-        rm_and_merge(dp_size, args.inference_save_path, model_names, dataset_names)
+        rm_and_merge(dp_size, args.inference_save_path, model_names, dataset_names, dataset_classes)

 if __name__ == "__main__":
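
Downstream, the evaluator consumes the merged file directly. A minimal sketch of the unpacking it now performs, mirroring the dataset_evaluator.py change above (file path illustrative):

import json

with open("outputs/my-model/mmlu_inference_results.json") as f:
    payload = json.load(f)

dataset_class = payload["dataset_class"]          # e.g. "MMLUDataset"
inference_results = payload["inference_results"]  # per-category answers
print(dataset_class, list(inference_results.keys()))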