# ColossalAI/applications/ColossalEval/examples/gpt_evaluation/eval.py

import argparse
import os

import openai
from colossal_eval.evaluate.evaluator import Evaluator
from colossal_eval.utils import jload


def _flatten_samples(grouped):
    """Concatenate the ``"data"`` list of every category into one flat list.

    Args:
        grouped: Mapping from category name to a dict holding a ``"data"`` list.

    Returns:
        A new list with the samples of all categories, in the mapping's
        iteration order.
    """
    return [sample for value in grouped.values() for sample in value["data"]]


def main(args):
    """Run a two-model battle or a single-model evaluation from parsed CLI args.

    With two model names the answers of both models are flattened and passed
    to ``Evaluator.battle``; with one model name the answers are evaluated
    against the ``"test"`` split of the target file via ``Evaluator.evaluate``.
    In both modes ``Evaluator.save`` is called afterwards with
    ``args.save_path`` and ``args.model_name_list``.

    Args:
        args: Parsed arguments. Reads ``answer_file_list``, ``model_name_list``,
            ``config_file``, ``battle_prompt_file``,
            ``gpt_evaluation_prompt_file``, ``target_file``, ``gpt_model``,
            ``gpt_with_reference`` and ``save_path``.

    Raises:
        ValueError: If the answer-file and model-name counts differ, the config
            language is not ``"cn"`` or ``"en"``, the number of models is not
            1 or 2, a prompt or target file required by the selected mode is
            missing, reference-based evaluation is requested with
            ``text-davinci-003``, or the sample counts being compared differ.
    """
    # Explicit raises instead of ``assert``: this is user-input validation and
    # must not disappear when Python runs with ``-O``.
    num_models = len(args.model_name_list)
    if len(args.answer_file_list) != num_models:
        raise ValueError("The number of answer files and model names should be equal!")

    # load config
    config = jload(args.config_file)

    if config["language"] not in ["cn", "en"]:
        raise ValueError(f'Unsupported language {config["language"]}!')

    # Reject unsupported model counts before loading prompts or building the evaluator.
    if num_models not in (1, 2):
        raise ValueError("Unsupported number of answer files and model names!")

    # get metric settings for all categories (shallow copy of each category's settings)
    metrics_per_category = {
        category: dict(settings.items()) for category, settings in config["category"].items()
    }

    battle_prompt = jload(args.battle_prompt_file) if args.battle_prompt_file else None
    gpt_evaluation_prompt = (
        jload(args.gpt_evaluation_prompt_file) if args.gpt_evaluation_prompt_file else None
    )

    if num_models == 2 and not battle_prompt:
        raise ValueError("No prompt file for battle provided. Please specify the prompt file for battle!")

    if num_models == 1 and not gpt_evaluation_prompt:
        raise ValueError(
            "No prompt file for gpt evaluation provided. Please specify the prompt file for gpt evaluation!"
        )

    # Single-model evaluation always reads the target file, so fail with a
    # clear message rather than handing ``None`` to ``jload``.
    if num_models == 1 and not args.target_file:
        raise ValueError("No target file provided. Please specify the target answer file for evaluation!")

    if args.gpt_model == "text-davinci-003" and args.gpt_with_reference:
        raise ValueError(
            "GPT evaluation with reference is not supported for text-davinci-003. You should specify chat models such as gpt-3.5-turbo or gpt-4."
        )

    # initialize evaluator
    evaluator = Evaluator(
        metrics_per_category,
        battle_prompt,
        gpt_evaluation_prompt,
        args.gpt_model,
        config["language"],
        args.gpt_with_reference,
    )

    if num_models == 2:
        answers1 = _flatten_samples(jload(args.answer_file_list[0]))
        answers2 = _flatten_samples(jload(args.answer_file_list[1]))

        if len(answers1) != len(answers2):
            raise ValueError("The number of answers for two models should be equal!")

        evaluator.battle(answers1=answers1, answers2=answers2)
    else:
        targets = jload(args.target_file)
        answers = jload(args.answer_file_list[0])

        # Only the "test" split of the target file is used as reference.
        references = _flatten_samples(targets["test"])
        predictions = _flatten_samples(answers)

        if len(references) != len(predictions):
            raise ValueError("The number of target answers and model answers should be equal!")

        evaluator.evaluate(
            answers=predictions, targets=references, save_path=args.save_path, model_name=args.model_name_list[0]
        )

    evaluator.save(args.save_path, args.model_name_list)


if __name__ == "__main__":
    # Command-line entry point: parse the options, configure the OpenAI key
    # and hand the parsed arguments to main().
    parser = argparse.ArgumentParser(description="ColossalAI LLM evaluation pipeline.")
    parser.add_argument(
        "--config_file",
        type=str,
        default=None,
        required=True,
        help="path to the evaluation config file (language and per-category metrics)",
    )
    parser.add_argument("--battle_prompt_file", type=str, default=None, help="path to the prompt file for battle")
    parser.add_argument(
        "--gpt_evaluation_prompt_file", type=str, default=None, help="path to the prompt file for gpt evaluation"
    )
    parser.add_argument("--target_file", type=str, default=None, help="path to the target answer (ground truth) file")
    parser.add_argument(
        "--answer_file_list",
        type=str,
        nargs="+",
        required=True,
        help="path to the answer files of at most 2 models",
    )
    parser.add_argument(
        "--model_name_list", type=str, nargs="+", required=True, help="the names of at most 2 models"
    )
    parser.add_argument(
        "--gpt_model",
        default="gpt-3.5-turbo-16k",
        choices=["text-davinci-003", "gpt-3.5-turbo", "gpt-3.5-turbo-16k", "gpt-4"],
        help="which GPT model to use for evaluation",
    )
    parser.add_argument(
        "--gpt_with_reference",
        default=False,
        action="store_true",
        help="whether to include reference answer in gpt evaluation",
    )
    parser.add_argument("--save_path", type=str, default="results", help="path to save evaluation results")
    # Optional so the key can come from the environment instead of being
    # exposed on the command line; an explicit flag still takes precedence.
    parser.add_argument(
        "--openai_key",
        type=str,
        default=None,
        help="Your openai key (defaults to the OPENAI_API_KEY environment variable)",
    )
    args = parser.parse_args()

    if args.openai_key is not None:
        os.environ["OPENAI_API_KEY"] = args.openai_key
    openai.api_key = os.getenv("OPENAI_API_KEY")
    if not openai.api_key:
        # Same exit path argparse used when the flag was mandatory.
        parser.error("no OpenAI key: pass --openai_key or set the OPENAI_API_KEY environment variable")

    main(args)
[evaluation] add automatic evaluation pipeline (#3821) * add functions for gpt evaluation * add automatic eval Update eval.py * using jload and modify the type of answers1 and answers2 * Update eval.py Update eval.py * Update evaluator.py * support gpt evaluation * update readme.md update README.md update READNE.md modify readme.md * add Chinese example for config, battle prompt and evaluation prompt file * remove GPT-4 config * remove sample folder --------- Co-authored-by: Yuanchen Xu <yuanchen.xu00@gmail.com> Co-authored-by: Camille Zhong <44392324+Camille7777@users.noreply.github.com> 2 years ago			`import argparse`
			`import os`

			`import openai`
[feature] ColossalEval: Evaluation Pipeline for LLMs (#4786) * Add ColossalEval * Delete evaluate in Chat --------- Co-authored-by: Xu Yuanchen <yuanchen.xu00@gmail.com> Co-authored-by: Tong Li <tong.li352711588@gmail.com> 1 year ago			`from colossal_eval.evaluate.evaluator import Evaluator`
			`from colossal_eval.utils import jload`
[evaluation] add automatic evaluation pipeline (#3821) * add functions for gpt evaluation * add automatic eval Update eval.py * using jload and modify the type of answers1 and answers2 * Update eval.py Update eval.py * Update evaluator.py * support gpt evaluation * update readme.md update README.md update READNE.md modify readme.md * add Chinese example for config, battle prompt and evaluation prompt file * remove GPT-4 config * remove sample folder --------- Co-authored-by: Yuanchen Xu <yuanchen.xu00@gmail.com> Co-authored-by: Camille Zhong <44392324+Camille7777@users.noreply.github.com> 2 years ago

			`def main(args):`
			`assert len(args.answer_file_list) == len(`
[misc] update pre-commit and run all files (#4752) * [misc] update pre-commit * [misc] run pre-commit * [misc] remove useless configuration files * [misc] ignore cuda for clang-format 1 year ago			`args.model_name_list`
			`), "The number of answer files and model names should be equal!"`
[evaluation] add automatic evaluation pipeline (#3821) * add functions for gpt evaluation * add automatic eval Update eval.py * using jload and modify the type of answers1 and answers2 * Update eval.py Update eval.py * Update evaluator.py * support gpt evaluation * update readme.md update README.md update READNE.md modify readme.md * add Chinese example for config, battle prompt and evaluation prompt file * remove GPT-4 config * remove sample folder --------- Co-authored-by: Yuanchen Xu <yuanchen.xu00@gmail.com> Co-authored-by: Camille Zhong <44392324+Camille7777@users.noreply.github.com> 2 years ago
			`# load config`
			`config = jload(args.config_file)`

support evaluation for english (#3880) Co-authored-by: Yuanchen Xu <yuanchen.xu00@gmail.com> 1 year ago			`if config["language"] in ["cn", "en"]:`
[evaluation] add automatic evaluation pipeline (#3821) * add functions for gpt evaluation * add automatic eval Update eval.py * using jload and modify the type of answers1 and answers2 * Update eval.py Update eval.py * Update evaluator.py * support gpt evaluation * update readme.md update README.md update READNE.md modify readme.md * add Chinese example for config, battle prompt and evaluation prompt file * remove GPT-4 config * remove sample folder --------- Co-authored-by: Yuanchen Xu <yuanchen.xu00@gmail.com> Co-authored-by: Camille Zhong <44392324+Camille7777@users.noreply.github.com> 2 years ago			`# get metric settings for all categories`
			`metrics_per_category = {}`
			`for category in config["category"].keys():`
			`metrics_all = {}`
			`for metric_type, metrics in config["category"][category].items():`
			`metrics_all[metric_type] = metrics`
			`metrics_per_category[category] = metrics_all`

			`battle_prompt = None`
			`if args.battle_prompt_file:`
			`battle_prompt = jload(args.battle_prompt_file)`

			`gpt_evaluation_prompt = None`
			`if args.gpt_evaluation_prompt_file:`
			`gpt_evaluation_prompt = jload(args.gpt_evaluation_prompt_file)`

			`if len(args.model_name_list) == 2 and not battle_prompt:`
			`raise Exception("No prompt file for battle provided. Please specify the prompt file for battle!")`

			`if len(args.model_name_list) == 1 and not gpt_evaluation_prompt:`
			`raise Exception(`
[misc] update pre-commit and run all files (#4752) * [misc] update pre-commit * [misc] run pre-commit * [misc] remove useless configuration files * [misc] ignore cuda for clang-format 1 year ago			`"No prompt file for gpt evaluation provided. Please specify the prompt file for gpt evaluation!"`
			`)`
[evaluation] add automatic evaluation pipeline (#3821) * add functions for gpt evaluation * add automatic eval Update eval.py * using jload and modify the type of answers1 and answers2 * Update eval.py Update eval.py * Update evaluator.py * support gpt evaluation * update readme.md update README.md update READNE.md modify readme.md * add Chinese example for config, battle prompt and evaluation prompt file * remove GPT-4 config * remove sample folder --------- Co-authored-by: Yuanchen Xu <yuanchen.xu00@gmail.com> Co-authored-by: Camille Zhong <44392324+Camille7777@users.noreply.github.com> 2 years ago
[evaluate] support gpt evaluation with reference (#3972) Co-authored-by: Yuanchen Xu <yuanchen.xu00@gmail.com> 1 year ago			`if args.gpt_model == "text-davinci-003" and args.gpt_with_reference:`
			`raise Exception(`
			`"GPT evaluation with reference is not supported for text-davinci-003. You should specify chat models such as gpt-3.5-turbo or gpt-4."`
			`)`

[evaluation] add automatic evaluation pipeline (#3821) * add functions for gpt evaluation * add automatic eval Update eval.py * using jload and modify the type of answers1 and answers2 * Update eval.py Update eval.py * Update evaluator.py * support gpt evaluation * update readme.md update README.md update READNE.md modify readme.md * add Chinese example for config, battle prompt and evaluation prompt file * remove GPT-4 config * remove sample folder --------- Co-authored-by: Yuanchen Xu <yuanchen.xu00@gmail.com> Co-authored-by: Camille Zhong <44392324+Camille7777@users.noreply.github.com> 2 years ago			`# initialize evaluator`
[misc] update pre-commit and run all files (#4752) * [misc] update pre-commit * [misc] run pre-commit * [misc] remove useless configuration files * [misc] ignore cuda for clang-format 1 year ago			`evaluator = Evaluator(`
			`metrics_per_category,`
			`battle_prompt,`
			`gpt_evaluation_prompt,`
			`args.gpt_model,`
			`config["language"],`
			`args.gpt_with_reference,`
			`)`
[evaluation] add automatic evaluation pipeline (#3821) * add functions for gpt evaluation * add automatic eval Update eval.py * using jload and modify the type of answers1 and answers2 * Update eval.py Update eval.py * Update evaluator.py * support gpt evaluation * update readme.md update README.md update READNE.md modify readme.md * add Chinese example for config, battle prompt and evaluation prompt file * remove GPT-4 config * remove sample folder --------- Co-authored-by: Yuanchen Xu <yuanchen.xu00@gmail.com> Co-authored-by: Camille Zhong <44392324+Camille7777@users.noreply.github.com> 2 years ago			`if len(args.model_name_list) == 2:`
[feature] ColossalEval: Evaluation Pipeline for LLMs (#4786) * Add ColossalEval * Delete evaluate in Chat --------- Co-authored-by: Xu Yuanchen <yuanchen.xu00@gmail.com> Co-authored-by: Tong Li <tong.li352711588@gmail.com> 1 year ago			`answers_1 = jload(args.answer_file_list[0])`
			`answers_2 = jload(args.answer_file_list[1])`

			`answers1 = []`
			`for category, value in answers_1.items():`
			`answers1.extend(value["data"])`

			`answers2 = []`
			`for category, value in answers_2.items():`
			`answers2.extend(value["data"])`
[evaluation] add automatic evaluation pipeline (#3821) * add functions for gpt evaluation * add automatic eval Update eval.py * using jload and modify the type of answers1 and answers2 * Update eval.py Update eval.py * Update evaluator.py * support gpt evaluation * update readme.md update README.md update READNE.md modify readme.md * add Chinese example for config, battle prompt and evaluation prompt file * remove GPT-4 config * remove sample folder --------- Co-authored-by: Yuanchen Xu <yuanchen.xu00@gmail.com> Co-authored-by: Camille Zhong <44392324+Camille7777@users.noreply.github.com> 2 years ago
			`assert len(answers1) == len(answers2), "The number of answers for two models should be equal!"`

			`evaluator.battle(answers1=answers1, answers2=answers2)`
			`evaluator.save(args.save_path, args.model_name_list)`
			`elif len(args.model_name_list) == 1:`
			`targets = jload(args.target_file)`
			`answers = jload(args.answer_file_list[0])`

[feature] ColossalEval: Evaluation Pipeline for LLMs (#4786) * Add ColossalEval * Delete evaluate in Chat --------- Co-authored-by: Xu Yuanchen <yuanchen.xu00@gmail.com> Co-authored-by: Tong Li <tong.li352711588@gmail.com> 1 year ago			`references = []`
			`for category, value in targets["test"].items():`
			`references.extend(value["data"])`

			`predictions = []`
			`for category, value in answers.items():`
			`predictions.extend(value["data"])`
[evaluation] add automatic evaluation pipeline (#3821) * add functions for gpt evaluation * add automatic eval Update eval.py * using jload and modify the type of answers1 and answers2 * Update eval.py Update eval.py * Update evaluator.py * support gpt evaluation * update readme.md update README.md update READNE.md modify readme.md * add Chinese example for config, battle prompt and evaluation prompt file * remove GPT-4 config * remove sample folder --------- Co-authored-by: Yuanchen Xu <yuanchen.xu00@gmail.com> Co-authored-by: Camille Zhong <44392324+Camille7777@users.noreply.github.com> 2 years ago
[feature] ColossalEval: Evaluation Pipeline for LLMs (#4786) * Add ColossalEval * Delete evaluate in Chat --------- Co-authored-by: Xu Yuanchen <yuanchen.xu00@gmail.com> Co-authored-by: Tong Li <tong.li352711588@gmail.com> 1 year ago			`assert len(references) == len(`
			`predictions`
			`), "The number of target answers and model answers should be equal!"`

			`evaluator.evaluate(`
			`answers=predictions, targets=references, save_path=args.save_path, model_name=args.model_name_list[0]`
			`)`
[evaluation] add automatic evaluation pipeline (#3821) * add functions for gpt evaluation * add automatic eval Update eval.py * using jload and modify the type of answers1 and answers2 * Update eval.py Update eval.py * Update evaluator.py * support gpt evaluation * update readme.md update README.md update READNE.md modify readme.md * add Chinese example for config, battle prompt and evaluation prompt file * remove GPT-4 config * remove sample folder --------- Co-authored-by: Yuanchen Xu <yuanchen.xu00@gmail.com> Co-authored-by: Camille Zhong <44392324+Camille7777@users.noreply.github.com> 2 years ago			`evaluator.save(args.save_path, args.model_name_list)`
			`else:`
			`raise ValueError("Unsupported number of answer files and model names!")`
			`else:`
			`raise ValueError(f'Unsupported language {config["language"]}!')`


[misc] update pre-commit and run all files (#4752) * [misc] update pre-commit * [misc] run pre-commit * [misc] remove useless configuration files * [misc] ignore cuda for clang-format 1 year ago			`if __name__ == "__main__":`
			`parser = argparse.ArgumentParser(description="ColossalAI LLM evaluation pipeline.")`
			`parser.add_argument(`
			`"--config_file", type=str, default=None, required=True, help="path to the file of target results"`
			`)`
			`parser.add_argument("--battle_prompt_file", type=str, default=None, help="path to the prompt file for battle")`
			`parser.add_argument(`
			`"--gpt_evaluation_prompt_file", type=str, default=None, help="path to the prompt file for gpt evaluation"`
			`)`
			`parser.add_argument("--target_file", type=str, default=None, help="path to the target answer (ground truth) file")`
			`parser.add_argument(`
			`"--answer_file_list",`
			`type=str,`
			`nargs="+",`
			`default=[],`
			`required=True,`
			`help="path to the answer files of at most 2 models",`
			`)`
			`parser.add_argument(`
			`"--model_name_list", type=str, nargs="+", default=[], required=True, help="the names of at most 2 models"`
			`)`
			`parser.add_argument(`
			`"--gpt_model",`
[feature] ColossalEval: Evaluation Pipeline for LLMs (#4786) * Add ColossalEval * Delete evaluate in Chat --------- Co-authored-by: Xu Yuanchen <yuanchen.xu00@gmail.com> Co-authored-by: Tong Li <tong.li352711588@gmail.com> 1 year ago			`default="gpt-3.5-turbo-16k",`
			`choices=["text-davinci-003", "gpt-3.5-turbo", "gpt-3.5-turbo-16k", "gpt-4"],`
[misc] update pre-commit and run all files (#4752) * [misc] update pre-commit * [misc] run pre-commit * [misc] remove useless configuration files * [misc] ignore cuda for clang-format 1 year ago			`help="which GPT model to use for evaluation",`
			`)`
			`parser.add_argument(`
			`"--gpt_with_reference",`
			`default=False,`
			`action="store_true",`
			`help="whether to include reference answer in gpt evaluation",`
			`)`
			`parser.add_argument("--save_path", type=str, default="results", help="path to save evaluation results")`
			`parser.add_argument("--openai_key", type=str, default=None, required=True, help="Your openai key")`
[evaluation] add automatic evaluation pipeline (#3821) * add functions for gpt evaluation * add automatic eval Update eval.py * using jload and modify the type of answers1 and answers2 * Update eval.py Update eval.py * Update evaluator.py * support gpt evaluation * update readme.md update README.md update READNE.md modify readme.md * add Chinese example for config, battle prompt and evaluation prompt file * remove GPT-4 config * remove sample folder --------- Co-authored-by: Yuanchen Xu <yuanchen.xu00@gmail.com> Co-authored-by: Camille Zhong <44392324+Camille7777@users.noreply.github.com> 2 years ago			`args = parser.parse_args()`

			`if args.openai_key is not None:`
			`os.environ["OPENAI_API_KEY"] = args.openai_key`
			`openai.api_key = os.getenv("OPENAI_API_KEY")`

			`main(args)`