ColossalAI/applications/Chat/evaluate/eval.py

113 lines
4.9 KiB
Python
Raw Normal View History

import argparse
import json
import os
import openai
from evaluator import Evaluator
from utils import jload
def _validate_args(args):
    """Reject invalid argument combinations before any file is read.

    Only inspects values already present on ``args``; nothing is loaded.

    Raises:
        ValueError: if the numbers of answer files and model names differ,
            if the number of models is not 1 or 2, if reference-based GPT
            evaluation is requested with ``text-davinci-003``, or if a single
            model is evaluated without a target file.
    """
    num_models = len(args.model_name_list)
    if len(args.answer_file_list) != num_models:
        raise ValueError("The number of answer files and model names should be equal!")
    if num_models not in (1, 2):
        raise ValueError("Unsupported number of answer files and model names!")
    if args.gpt_model == "text-davinci-003" and args.gpt_with_reference:
        raise ValueError(
            "GPT evaluation with reference is not supported for text-davinci-003. You should specify chat models such as gpt-3.5-turbo or gpt-4."
        )
    # The single-model branch always loads args.target_file, so report a
    # missing path here instead of letting the loader fail on None.
    if num_models == 1 and args.target_file is None:
        raise ValueError("No target file provided. Please specify the target answer file!")


def main(args):
    """Run the evaluation pipeline for one model or a battle between two.

    With two model names the two answer files are compared through
    ``Evaluator.battle``; with one model name the answers are scored against
    the target file through ``Evaluator.evaluate``. In both cases the results
    are written with ``Evaluator.save``.

    Args:
        args: parsed command-line namespace. Reads ``config_file``,
            ``battle_prompt_file``, ``gpt_evaluation_prompt_file``,
            ``target_file``, ``answer_file_list``, ``model_name_list``,
            ``gpt_model``, ``gpt_with_reference`` and ``save_path``.

    Raises:
        ValueError: on an invalid argument combination, an unsupported
            ``language`` in the config, a missing prompt for the selected
            mode, or answer files of different lengths. Explicit exceptions
            are used instead of ``assert`` so the checks survive
            ``python -O``.
    """
    _validate_args(args)
    num_models = len(args.model_name_list)

    # load config
    config = jload(args.config_file)
    language = config["language"]
    if language not in ("cn", "en"):
        raise ValueError(f'Unsupported language {language}!')

    # get metric settings for all categories (copied so the loaded config
    # object is not shared with the evaluator)
    metrics_per_category = {category: dict(settings) for category, settings in config["category"].items()}

    battle_prompt = jload(args.battle_prompt_file) if args.battle_prompt_file else None
    gpt_evaluation_prompt = jload(args.gpt_evaluation_prompt_file) if args.gpt_evaluation_prompt_file else None

    # These checks run after loading so that an empty prompt file is rejected
    # in the same way as a missing one.
    if num_models == 2 and not battle_prompt:
        raise ValueError("No prompt file for battle provided. Please specify the prompt file for battle!")
    if num_models == 1 and not gpt_evaluation_prompt:
        raise ValueError(
            "No prompt file for gpt evaluation provided. Please specify the prompt file for gpt evaluation!")

    # initialize evaluator
    evaluator = Evaluator(metrics_per_category, battle_prompt, gpt_evaluation_prompt, args.gpt_model, language,
                          config.get("path_for_UniEval", None), args.gpt_with_reference)

    if num_models == 2:
        answers1 = jload(args.answer_file_list[0])
        answers2 = jload(args.answer_file_list[1])
        if len(answers1) != len(answers2):
            raise ValueError("The number of answers for two models should be equal!")
        evaluator.battle(answers1=answers1, answers2=answers2)
    else:
        targets = jload(args.target_file)
        answers = jload(args.answer_file_list[0])
        if len(targets) != len(answers):
            raise ValueError("The number of target answers and model answers should be equal!")
        evaluator.evaluate(answers=answers, targets=targets)

    evaluator.save(args.save_path, args.model_name_list)
# Script entry point: parse the command line, configure the OpenAI API key,
# then hand the parsed arguments to main().
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='ColossalAI LLM evaluation pipeline.')
    # main() loads this file and reads its "language" and "category" entries.
    parser.add_argument('--config_file', type=str, required=True, help='path to the evaluation config file')
    parser.add_argument('--battle_prompt_file', type=str, default=None, help='path to the prompt file for battle')
    parser.add_argument('--gpt_evaluation_prompt_file',
                        type=str,
                        default=None,
                        help='path to the prompt file for gpt evaluation')
    parser.add_argument('--target_file', type=str, default=None, help='path to the target answer (ground truth) file')
    parser.add_argument('--answer_file_list',
                        type=str,
                        nargs='+',
                        required=True,
                        help='path to the answer files of at most 2 models')
    parser.add_argument('--model_name_list',
                        type=str,
                        nargs='+',
                        required=True,
                        help='the names of at most 2 models')
    parser.add_argument('--gpt_model',
                        default="gpt-3.5-turbo",
                        choices=["text-davinci-003", "gpt-3.5-turbo", "gpt-4"],
                        help='which GPT model to use for evaluation')
    parser.add_argument('--gpt_with_reference',
                        default=False,
                        action="store_true",
                        help='whether to include reference answer in gpt evaluation')
    parser.add_argument('--save_path', type=str, default="results", help='path to save evaluation results')
    # The flag is optional so the key can come from the environment instead
    # of the command line; an explicit flag still takes precedence.
    parser.add_argument('--openai_key',
                        type=str,
                        default=None,
                        help='Your openai key (falls back to the OPENAI_API_KEY environment variable)')
    args = parser.parse_args()

    api_key = args.openai_key or os.getenv("OPENAI_API_KEY")
    if not api_key:
        parser.error("No OpenAI key provided. Pass --openai_key or set the OPENAI_API_KEY environment variable.")
    # Export the key as well as setting it on the client module, as before.
    os.environ["OPENAI_API_KEY"] = api_key
    openai.api_key = api_key

    main(args)