|
|
|
import argparse
|
|
|
|
import os
|
|
|
|
|
|
|
|
import openai
|
|
|
|
from evaluator import Evaluator
|
|
|
|
from utils import jload
|
|
|
|
|
|
|
|
|
|
|
|
def main(args):
    """Run the evaluation pipeline for a single model or a two-model battle.

    With one model name, the model's answers are passed together with the
    target answers to ``Evaluator.evaluate``. With two model names, both
    answer sets are passed to ``Evaluator.battle``. In either mode the
    results are then written with ``Evaluator.save`` to ``args.save_path``.

    Args:
        args: Parsed command-line namespace. Reads ``config_file``,
            ``battle_prompt_file``, ``gpt_evaluation_prompt_file``,
            ``target_file``, ``answer_file_list``, ``model_name_list``,
            ``gpt_model``, ``gpt_with_reference`` and ``save_path``.

    Raises:
        ValueError: If the arguments are inconsistent or unsupported
            (mismatched list lengths, a model count other than 1 or 2, a
            missing prompt/target file for the selected mode, reference-based
            GPT evaluation with ``text-davinci-003``, an unsupported config
            language, or answer files of different lengths).
    """
    # --- Argument-only checks: fail fast, before any file is read. ---------
    # Explicit raises are used instead of ``assert`` because assertions are
    # removed when Python runs with -O, which would silently skip validation.
    num_models = len(args.model_name_list)
    if len(args.answer_file_list) != num_models:
        raise ValueError("The number of answer files and model names should be equal!")
    if num_models not in (1, 2):
        raise ValueError("Unsupported number of answer files and model names!")
    if num_models == 1 and not args.target_file:
        # --target_file defaults to None on the command line; presumably
        # jload cannot open None (TODO confirm against utils.jload), so
        # report the missing argument with a clear message instead.
        raise ValueError("No target file provided. Please specify the target answer file for evaluation!")
    if args.gpt_model == "text-davinci-003" and args.gpt_with_reference:
        raise ValueError(
            "GPT evaluation with reference is not supported for text-davinci-003. You should specify chat models such as gpt-3.5-turbo or gpt-4."
        )

    # --- Load config --------------------------------------------------------
    config = jload(args.config_file)
    language = config["language"]
    if language not in ("cn", "en"):
        raise ValueError(f"Unsupported language {language}!")

    # Metric settings for all categories: a shallow copy of each category's
    # {metric_type: metrics} mapping, so the loaded config is not aliased.
    metrics_per_category = {category: dict(settings) for category, settings in config["category"].items()}

    # --- Load prompts -------------------------------------------------------
    battle_prompt = jload(args.battle_prompt_file) if args.battle_prompt_file else None
    gpt_evaluation_prompt = (
        jload(args.gpt_evaluation_prompt_file) if args.gpt_evaluation_prompt_file else None
    )

    # A prompt file that loads to an empty value counts as missing, so these
    # checks run on the loaded content rather than on the path.
    if num_models == 2 and not battle_prompt:
        raise ValueError("No prompt file for battle provided. Please specify the prompt file for battle!")
    if num_models == 1 and not gpt_evaluation_prompt:
        raise ValueError(
            "No prompt file for gpt evaluation provided. Please specify the prompt file for gpt evaluation!"
        )

    # --- Initialize evaluator -----------------------------------------------
    evaluator = Evaluator(
        metrics_per_category,
        battle_prompt,
        gpt_evaluation_prompt,
        args.gpt_model,
        language,
        config.get("path_for_UniEval", None),
        args.gpt_with_reference,
    )

    if num_models == 2:
        answers1 = jload(args.answer_file_list[0])
        answers2 = jload(args.answer_file_list[1])
        if len(answers1) != len(answers2):
            raise ValueError("The number of answers for two models should be equal!")

        evaluator.battle(answers1=answers1, answers2=answers2)
    else:
        targets = jload(args.target_file)
        answers = jload(args.answer_file_list[0])
        if len(targets) != len(answers):
            raise ValueError("The number of target answers and model answers should be equal!")

        evaluator.evaluate(answers=answers, targets=targets)

    evaluator.save(args.save_path, args.model_name_list)
|
|
|
|
|
|
|
|
|
|
|
|
def build_arg_parser():
    """Build the command-line parser for the evaluation pipeline.

    Returns:
        argparse.ArgumentParser: Parser defining ``--config_file``,
        ``--battle_prompt_file``, ``--gpt_evaluation_prompt_file``,
        ``--target_file``, ``--answer_file_list``, ``--model_name_list``,
        ``--gpt_model``, ``--gpt_with_reference``, ``--save_path`` and
        ``--openai_key``.
    """
    parser = argparse.ArgumentParser(description="ColossalAI LLM evaluation pipeline.")
    parser.add_argument("--config_file", type=str, required=True, help="path to the evaluation config file")
    parser.add_argument("--battle_prompt_file", type=str, default=None, help="path to the prompt file for battle")
    parser.add_argument(
        "--gpt_evaluation_prompt_file", type=str, default=None, help="path to the prompt file for gpt evaluation"
    )
    parser.add_argument("--target_file", type=str, default=None, help="path to the target answer (ground truth) file")
    parser.add_argument(
        "--answer_file_list",
        type=str,
        nargs="+",
        required=True,
        help="path to the answer files of at most 2 models",
    )
    parser.add_argument(
        "--model_name_list", type=str, nargs="+", required=True, help="the names of at most 2 models"
    )
    parser.add_argument(
        "--gpt_model",
        default="gpt-3.5-turbo",
        choices=["text-davinci-003", "gpt-3.5-turbo", "gpt-4"],
        help="which GPT model to use for evaluation",
    )
    parser.add_argument(
        "--gpt_with_reference",
        default=False,
        action="store_true",
        help="whether to include reference answer in gpt evaluation",
    )
    parser.add_argument("--save_path", type=str, default="results", help="path to save evaluation results")
    # Optional: the key may instead come from the OPENAI_API_KEY environment
    # variable, which keeps the secret out of the process command line.
    parser.add_argument(
        "--openai_key",
        type=str,
        default=None,
        help="Your openai key (defaults to the OPENAI_API_KEY environment variable)",
    )
    return parser


if __name__ == "__main__":
    parser = build_arg_parser()
    args = parser.parse_args()

    # An explicit --openai_key takes precedence and is exported so that code
    # reading OPENAI_API_KEY from the environment sees the same value.
    if args.openai_key is not None:
        os.environ["OPENAI_API_KEY"] = args.openai_key
    elif not os.getenv("OPENAI_API_KEY"):
        parser.error("no OpenAI key provided: pass --openai_key or set the OPENAI_API_KEY environment variable")
    openai.api_key = os.getenv("OPENAI_API_KEY")

    main(args)
|