# ColossalAI/applications/Chat/evaluate/eval.py

import argparse
import json
import os

import openai
from evaluator import Evaluator
from utils import jload


def main(args):
    """Run the evaluation pipeline described by the parsed command-line arguments.

    With two model names, the two answer files are compared through
    ``Evaluator.battle``. With one model name, the answers are scored against
    the target file through ``Evaluator.evaluate``. In both cases the results
    are then written with ``Evaluator.save`` to ``args.save_path``.

    Args:
        args: Namespace providing ``config_file``, ``battle_prompt_file``,
            ``gpt_evaluation_prompt_file``, ``target_file``,
            ``answer_file_list``, ``model_name_list``, ``gpt_model``,
            ``gpt_with_reference`` and ``save_path``.

    Raises:
        ValueError: If the numbers of answer files and model names differ, if
            the number of models is neither 1 nor 2, if a single model is
            evaluated without a target file, if the config language is not
            ``"cn"`` or ``"en"``, or if the loaded answer/target lists differ
            in length.
        Exception: If the prompt file required by the selected mode is missing
            or empty, or if ``text-davinci-003`` is combined with
            ``gpt_with_reference``.
    """
    num_models = len(args.model_name_list)

    # These checks validate user input, so they are explicit raises rather than
    # `assert` statements (which are removed when Python runs with -O). They are
    # done up front so a bad command line fails before any file is read.
    if len(args.answer_file_list) != num_models:
        raise ValueError("The number of answer files and model names should be equal!")

    if num_models not in (1, 2):
        raise ValueError("Unsupported number of answer files and model names!")

    if num_models == 1 and not args.target_file:
        raise ValueError("No target file provided. Please specify the target file for evaluation!")

    # load config
    config = jload(args.config_file)

    language = config["language"]
    if language not in ["cn", "en"]:
        raise ValueError(f'Unsupported language {language}!')

    # get metric settings for all categories (copied one level deep so the loaded config is not shared)
    metrics_per_category = {
        category: dict(metrics_by_type.items()) for category, metrics_by_type in config["category"].items()
    }

    battle_prompt = jload(args.battle_prompt_file) if args.battle_prompt_file else None
    gpt_evaluation_prompt = jload(args.gpt_evaluation_prompt_file) if args.gpt_evaluation_prompt_file else None

    if num_models == 2 and not battle_prompt:
        raise Exception("No prompt file for battle provided. Please specify the prompt file for battle!")

    if num_models == 1 and not gpt_evaluation_prompt:
        raise Exception(
            "No prompt file for gpt evaluation provided. Please specify the prompt file for gpt evaluation!")

    if args.gpt_model == "text-davinci-003" and args.gpt_with_reference:
        raise Exception(
            "GPT evaluation with reference is not supported for text-davinci-003. You should specify chat models such as gpt-3.5-turbo or gpt-4."
        )

    # initialize evaluator
    evaluator = Evaluator(metrics_per_category, battle_prompt, gpt_evaluation_prompt, args.gpt_model, language,
                          config.get("path_for_UniEval", None), args.gpt_with_reference)

    if num_models == 2:
        # battle mode: compare the answers of the two models
        answers1 = jload(args.answer_file_list[0])
        answers2 = jload(args.answer_file_list[1])

        if len(answers1) != len(answers2):
            raise ValueError("The number of answers for two models should be equal!")

        evaluator.battle(answers1=answers1, answers2=answers2)
    else:
        # single-model mode: score the answers against the targets
        targets = jload(args.target_file)
        answers = jload(args.answer_file_list[0])

        if len(targets) != len(answers):
            raise ValueError("The number of target answers and model answers should be equal!")

        evaluator.evaluate(answers=answers, targets=targets)

    evaluator.save(args.save_path, args.model_name_list)


if __name__ == '__main__':
    # Command-line entry point: parse the options, configure the OpenAI key, then run the pipeline.
    parser = argparse.ArgumentParser(description='ColossalAI LLM evaluation pipeline.')
    parser.add_argument('--config_file', type=str, required=True, help='path to the evaluation config file')
    parser.add_argument('--battle_prompt_file', type=str, default=None, help='path to the prompt file for battle')
    parser.add_argument('--gpt_evaluation_prompt_file',
                        type=str,
                        default=None,
                        help='path to the prompt file for gpt evaluation')
    parser.add_argument('--target_file', type=str, default=None, help='path to the target answer (ground truth) file')
    # Required options take no `default`: argparse never uses it when the option must be given.
    parser.add_argument('--answer_file_list',
                        type=str,
                        nargs='+',
                        required=True,
                        help='path to the answer files of at most 2 models')
    parser.add_argument('--model_name_list',
                        type=str,
                        nargs='+',
                        required=True,
                        help='the names of at most 2 models')
    parser.add_argument('--gpt_model',
                        default="gpt-3.5-turbo",
                        choices=["text-davinci-003", "gpt-3.5-turbo", "gpt-4"],
                        help='which GPT model to use for evaluation')
    parser.add_argument('--gpt_with_reference',
                        default=False,
                        action="store_true",
                        help='whether to include reference answer in gpt evaluation')
    parser.add_argument('--save_path', type=str, default="results", help='path to save evaluation results')
    # Optional so that the OPENAI_API_KEY environment variable can supply the key instead;
    # an explicit --openai_key still takes precedence and overwrites the variable.
    parser.add_argument('--openai_key',
                        type=str,
                        default=None,
                        help='Your openai key (defaults to the OPENAI_API_KEY environment variable)')
    args = parser.parse_args()

    if args.openai_key is not None:
        os.environ["OPENAI_API_KEY"] = args.openai_key
    elif not os.getenv("OPENAI_API_KEY"):
        # Same exit path argparse uses for a missing required option.
        parser.error("no OpenAI key provided: pass --openai_key or set the OPENAI_API_KEY environment variable")
    openai.api_key = os.getenv("OPENAI_API_KEY")

    main(args)
[evaluation] add automatic evaluation pipeline (#3821) * add functions for gpt evaluation * add automatic eval Update eval.py * using jload and modify the type of answers1 and answers2 * Update eval.py Update eval.py * Update evaluator.py * support gpt evaluation * update readme.md update README.md update READNE.md modify readme.md * add Chinese example for config, battle prompt and evaluation prompt file * remove GPT-4 config * remove sample folder --------- Co-authored-by: Yuanchen Xu <yuanchen.xu00@gmail.com> Co-authored-by: Camille Zhong <44392324+Camille7777@users.noreply.github.com> 2023-05-24 03:18:23 +00:00			`import argparse`
			`import json`
			`import os`

			`import openai`
			`from evaluator import Evaluator`
			`from utils import jload`


			`def main(args):`
			`assert len(args.answer_file_list) == len(`
			`args.model_name_list), "The number of answer files and model names should be equal!"`

			`# load config`
			`config = jload(args.config_file)`

support evaluation for english (#3880) Co-authored-by: Yuanchen Xu <yuanchen.xu00@gmail.com> 2023-06-05 13:24:21 +00:00			`if config["language"] in ["cn", "en"]:`
[evaluation] add automatic evaluation pipeline (#3821) * add functions for gpt evaluation * add automatic eval Update eval.py * using jload and modify the type of answers1 and answers2 * Update eval.py Update eval.py * Update evaluator.py * support gpt evaluation * update readme.md update README.md update READNE.md modify readme.md * add Chinese example for config, battle prompt and evaluation prompt file * remove GPT-4 config * remove sample folder --------- Co-authored-by: Yuanchen Xu <yuanchen.xu00@gmail.com> Co-authored-by: Camille Zhong <44392324+Camille7777@users.noreply.github.com> 2023-05-24 03:18:23 +00:00			`# get metric settings for all categories`
			`metrics_per_category = {}`
			`for category in config["category"].keys():`
			`metrics_all = {}`
			`for metric_type, metrics in config["category"][category].items():`
			`metrics_all[metric_type] = metrics`
			`metrics_per_category[category] = metrics_all`

			`battle_prompt = None`
			`if args.battle_prompt_file:`
			`battle_prompt = jload(args.battle_prompt_file)`

			`gpt_evaluation_prompt = None`
			`if args.gpt_evaluation_prompt_file:`
			`gpt_evaluation_prompt = jload(args.gpt_evaluation_prompt_file)`

			`if len(args.model_name_list) == 2 and not battle_prompt:`
			`raise Exception("No prompt file for battle provided. Please specify the prompt file for battle!")`

			`if len(args.model_name_list) == 1 and not gpt_evaluation_prompt:`
			`raise Exception(`
			`"No prompt file for gpt evaluation provided. Please specify the prompt file for gpt evaluation!")`

[evaluate] support gpt evaluation with reference (#3972) Co-authored-by: Yuanchen Xu <yuanchen.xu00@gmail.com> 2023-06-13 07:12:29 +00:00			`if args.gpt_model == "text-davinci-003" and args.gpt_with_reference:`
			`raise Exception(`
			`"GPT evaluation with reference is not supported for text-davinci-003. You should specify chat models such as gpt-3.5-turbo or gpt-4."`
			`)`

[evaluation] add automatic evaluation pipeline (#3821) * add functions for gpt evaluation * add automatic eval Update eval.py * using jload and modify the type of answers1 and answers2 * Update eval.py Update eval.py * Update evaluator.py * support gpt evaluation * update readme.md update README.md update READNE.md modify readme.md * add Chinese example for config, battle prompt and evaluation prompt file * remove GPT-4 config * remove sample folder --------- Co-authored-by: Yuanchen Xu <yuanchen.xu00@gmail.com> Co-authored-by: Camille Zhong <44392324+Camille7777@users.noreply.github.com> 2023-05-24 03:18:23 +00:00			`# initialize evaluator`
[evaluation] improvement on evaluation (#3862) * fix a bug when the config file contains one category but the answer file doesn't contains that category * fix Chinese prompt file * support gpt-3.5-turbo and gpt-4 evaluation * polish and update README * resolve pr comments --------- Co-authored-by: Yuanchen Xu <yuanchen.xu00@gmail.com> 2023-05-30 03:48:41 +00:00			`evaluator = Evaluator(metrics_per_category, battle_prompt, gpt_evaluation_prompt, args.gpt_model,`
[evaluate] support gpt evaluation with reference (#3972) Co-authored-by: Yuanchen Xu <yuanchen.xu00@gmail.com> 2023-06-13 07:12:29 +00:00			`config["language"], config.get("path_for_UniEval", None), args.gpt_with_reference)`
[evaluation] add automatic evaluation pipeline (#3821) * add functions for gpt evaluation * add automatic eval Update eval.py * using jload and modify the type of answers1 and answers2 * Update eval.py Update eval.py * Update evaluator.py * support gpt evaluation * update readme.md update README.md update READNE.md modify readme.md * add Chinese example for config, battle prompt and evaluation prompt file * remove GPT-4 config * remove sample folder --------- Co-authored-by: Yuanchen Xu <yuanchen.xu00@gmail.com> Co-authored-by: Camille Zhong <44392324+Camille7777@users.noreply.github.com> 2023-05-24 03:18:23 +00:00			`if len(args.model_name_list) == 2:`
			`answers1 = jload(args.answer_file_list[0])`
			`answers2 = jload(args.answer_file_list[1])`

			`assert len(answers1) == len(answers2), "The number of answers for two models should be equal!"`

			`evaluator.battle(answers1=answers1, answers2=answers2)`
			`evaluator.save(args.save_path, args.model_name_list)`
			`elif len(args.model_name_list) == 1:`
			`targets = jload(args.target_file)`
			`answers = jload(args.answer_file_list[0])`

			`assert len(targets) == len(answers), "The number of target answers and model answers should be equal!"`

			`evaluator.evaluate(answers=answers, targets=targets)`
			`evaluator.save(args.save_path, args.model_name_list)`
			`else:`
			`raise ValueError("Unsupported number of answer files and model names!")`
			`else:`
			`raise ValueError(f'Unsupported language {config["language"]}!')`


			`if __name__ == '__main__':`
			`parser = argparse.ArgumentParser(description='ColossalAI LLM evaluation pipeline.')`
			`parser.add_argument('--config_file',`
			`type=str,`
			`default=None,`
			`required=True,`
			`help='path to the file of target results')`
			`parser.add_argument('--battle_prompt_file', type=str, default=None, help='path to the prompt file for battle')`
			`parser.add_argument('--gpt_evaluation_prompt_file',`
			`type=str,`
			`default=None,`
			`help='path to the prompt file for gpt evaluation')`
			`parser.add_argument('--target_file', type=str, default=None, help='path to the target answer (ground truth) file')`
			`parser.add_argument('--answer_file_list',`
			`type=str,`
			`nargs='+',`
			`default=[],`
			`required=True,`
			`help='path to the answer files of at most 2 models')`
			`parser.add_argument('--model_name_list',`
			`type=str,`
			`nargs='+',`
			`default=[],`
			`required=True,`
			`help='the names of at most 2 models')`
[evaluation] improvement on evaluation (#3862) * fix a bug when the config file contains one category but the answer file doesn't contains that category * fix Chinese prompt file * support gpt-3.5-turbo and gpt-4 evaluation * polish and update README * resolve pr comments --------- Co-authored-by: Yuanchen Xu <yuanchen.xu00@gmail.com> 2023-05-30 03:48:41 +00:00			`parser.add_argument('--gpt_model',`
			`default="gpt-3.5-turbo",`
			`choices=["text-davinci-003", "gpt-3.5-turbo", "gpt-4"],`
			`help='which GPT model to use for evaluation')`
[evaluate] support gpt evaluation with reference (#3972) Co-authored-by: Yuanchen Xu <yuanchen.xu00@gmail.com> 2023-06-13 07:12:29 +00:00			`parser.add_argument('--gpt_with_reference',`
			`default=False,`
			`action="store_true",`
			`help='whether to include reference answer in gpt evaluation')`
[evaluation] add automatic evaluation pipeline (#3821) * add functions for gpt evaluation * add automatic eval Update eval.py * using jload and modify the type of answers1 and answers2 * Update eval.py Update eval.py * Update evaluator.py * support gpt evaluation * update readme.md update README.md update READNE.md modify readme.md * add Chinese example for config, battle prompt and evaluation prompt file * remove GPT-4 config * remove sample folder --------- Co-authored-by: Yuanchen Xu <yuanchen.xu00@gmail.com> Co-authored-by: Camille Zhong <44392324+Camille7777@users.noreply.github.com> 2023-05-24 03:18:23 +00:00			`parser.add_argument('--save_path', type=str, default="results", help='path to save evaluation results')`
			`parser.add_argument('--openai_key', type=str, default=None, required=True, help='Your openai key')`
			`args = parser.parse_args()`

			`if args.openai_key is not None:`
			`os.environ["OPENAI_API_KEY"] = args.openai_key`
			`openai.api_key = os.getenv("OPENAI_API_KEY")`

			`main(args)`