mirror of https://github.com/hpcaitech/ColossalAI
[evaluation] add automatic evaluation pipeline (#3821)
* add functions for gpt evaluation
* add automatic eval; update eval.py
* use jload and modify the type of answers1 and answers2
* update eval.py and evaluator.py
* support gpt evaluation
* update README.md
* add Chinese example for config, battle prompt and evaluation prompt file
* remove GPT-4 config
* remove sample folder

Co-authored-by: Yuanchen Xu <yuanchen.xu00@gmail.com>
Co-authored-by: Camille Zhong <44392324+Camille7777@users.noreply.github.com>

pull/3916/head
parent 05b8a8de58
commit 34966378e8
@ -0,0 +1,123 @@
{
    "language": "cn",
    "category": {
        "brainstorming": {
            "GPT-3.5": [
                "language organization",
                "relevance",
                "creativity",
                "practicality",
                "correctness"
            ],
            "Metrics": [
                "Distinct"
            ]
        },
        "chat": {
            "GPT-3.5": [
                "language organization",
                "relevance",
                "naturalness",
                "engagingness",
                "reasonableness"
            ],
            "Metrics": [
                "Distinct"
            ]
        },
        "classification": {
            "GPT-3.5": [
                "language organization",
                "relevance",
                "correctness"
            ],
            "Metrics": [
                "Precision",
                "Recall",
                "F1 score"
            ]
        },
        "closed_qa": {
            "GPT-3.5": [
                "language organization",
                "relevance",
                "correctness"
            ],
            "Metrics": [
                "BLEU",
                "ROUGE",
                "BERTScore"
            ]
        },
        "extraction": {
            "GPT-3.5": [
                "language organization",
                "relevance",
                "correctness"
            ],
            "Metrics": [
                "Precision",
                "Recall",
                "F1 score"
            ]
        },
        "generation": {
            "GPT-3.5": [
                "language organization",
                "relevance",
                "diversity"
            ],
            "Metrics": [
                "BLEU",
                "ROUGE",
                "BERTScore"
            ]
        },
        "open_qa": {
            "GPT-3.5": [
                "language organization",
                "relevance",
                "correctness"
            ],
            "Metrics": [
                "Distinct"
            ]
        },
        "rewriting": {
            "GPT-3.5": [
                "language organization",
                "relevance",
                "correctness"
            ],
            "Metrics": [
                "BLEU",
                "ROUGE",
                "BERTScore"
            ]
        },
        "roleplay": {
            "GPT-3.5": [
                "language organization",
                "relevance",
                "fidelity",
                "creativity"
            ],
            "Metrics": [
                "Distinct"
            ]
        },
        "summarization": {
            "GPT-3.5": [
                "language organization",
                "relevance",
                "correctness",
                "conciseness"
            ],
            "Metrics": [
                "BLEU",
                "ROUGE",
                "BERTScore"
            ]
        }
    }
}
@ -0,0 +1,98 @@
import argparse
import json
import os

import openai
from evaluator import Evaluator
from utils import jload


def main(args):
    assert len(args.answer_file_list) == len(
        args.model_name_list), "The number of answer files and model names should be equal!"

    # load config
    config = jload(args.config_file)

    if config["language"] == "cn":
        # get metric settings for all categories
        metrics_per_category = {}
        for category in config["category"].keys():
            metrics_all = {}
            for metric_type, metrics in config["category"][category].items():
                metrics_all[metric_type] = metrics
            metrics_per_category[category] = metrics_all

        battle_prompt = None
        if args.battle_prompt_file:
            battle_prompt = jload(args.battle_prompt_file)

        gpt_evaluation_prompt = None
        if args.gpt_evaluation_prompt_file:
            gpt_evaluation_prompt = jload(args.gpt_evaluation_prompt_file)

        if len(args.model_name_list) == 2 and not battle_prompt:
            raise Exception("No prompt file for battle provided. Please specify the prompt file for battle!")

        if len(args.model_name_list) == 1 and not gpt_evaluation_prompt:
            raise Exception(
                "No prompt file for gpt evaluation provided. Please specify the prompt file for gpt evaluation!")

        # initialize evaluator
        evaluator = Evaluator(metrics_per_category, battle_prompt, gpt_evaluation_prompt)
        if len(args.model_name_list) == 2:
            answers1 = jload(args.answer_file_list[0])
            answers2 = jload(args.answer_file_list[1])

            assert len(answers1) == len(answers2), "The number of answers for two models should be equal!"

            evaluator.battle(answers1=answers1, answers2=answers2)
            evaluator.save(args.save_path, args.model_name_list)
        elif len(args.model_name_list) == 1:
            targets = jload(args.target_file)
            answers = jload(args.answer_file_list[0])

            assert len(targets) == len(answers), "The number of target answers and model answers should be equal!"

            evaluator.evaluate(answers=answers, targets=targets)
            evaluator.save(args.save_path, args.model_name_list)
        else:
            raise ValueError("Unsupported number of answer files and model names!")
    else:
        raise ValueError(f'Unsupported language {config["language"]}!')


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='ColossalAI LLM evaluation pipeline.')
    parser.add_argument('--config_file',
                        type=str,
                        default=None,
                        required=True,
                        help='path to the config file that maps categories to metrics')
    parser.add_argument('--battle_prompt_file', type=str, default=None, help='path to the prompt file for battle')
    parser.add_argument('--gpt_evaluation_prompt_file',
                        type=str,
                        default=None,
                        help='path to the prompt file for gpt evaluation')
    parser.add_argument('--target_file', type=str, default=None, help='path to the target answer (ground truth) file')
    parser.add_argument('--answer_file_list',
                        type=str,
                        nargs='+',
                        default=[],
                        required=True,
                        help='path to the answer files of at most 2 models')
    parser.add_argument('--model_name_list',
                        type=str,
                        nargs='+',
                        default=[],
                        required=True,
                        help='the names of at most 2 models')
    parser.add_argument('--save_path', type=str, default="results", help='path to save evaluation results')
    parser.add_argument('--openai_key', type=str, default=None, required=True, help='your openai key')
    args = parser.parse_args()

    if args.openai_key is not None:
        os.environ["OPENAI_API_KEY"] = args.openai_key
    openai.api_key = os.getenv("OPENAI_API_KEY")

    main(args)
@ -0,0 +1,9 @@
python eval.py \
    --config_file "path to the config file" \
    --battle_prompt_file "path to the prompt file for battle" \
    --gpt_evaluation_prompt_file "path to the prompt file for gpt evaluation" \
    --target_file "path to the target answer file" \
    --answer_file_list "path to the answer files of at most 2 models" \
    --model_name_list "the names of at most 2 models" \
    --save_path "path to save results" \
    --openai_key "your openai key"
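For reference, each entry in the answer files (and in the target file) is a JSON record carrying the fields the pipeline reads downstream: id, instruction, input, category, output, plus target in the ground-truth file. The sketch below only illustrates that layout; the field values and the file name are assumptions, not part of this PR.

import json

# Illustrative record: the keys mirror what eval.py, evaluator.py and gpt_evaluate.py read.
answer_record = {
    "id": 0,                               # used to align and sort answers across models
    "instruction": "Summarize the text.",  # the question shown to the model
    "input": "",                           # optional extra context, concatenated to the instruction
    "category": "summarization",           # must match a category in the config file
    "output": "A short summary ...",       # the model's answer
}

# Ground-truth records additionally carry a "target" field; the code falls back to "output" when it is empty.
target_record = dict(answer_record, target="The reference summary ...")

with open("model_A_answers.json", "w", encoding="utf-8") as f:    # illustrative file name
    json.dump([answer_record], f, ensure_ascii=False, indent=4)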
@ -1,256 +0,0 @@
# Adapted form https://github.com/lm-sys/FastChat/blob/main/fastchat/eval/eval_gpt_review.py
# Copyright 2023 LM-SYS@FastChat

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at

#     http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import json
import os
import time
import re
import concurrent.futures

import openai
import tqdm
import shortuuid
import logging

from utils import jload, jdump, get_json_list

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

MAX_API_RETRY = 3


def get_eval(sys_prompt, user_prompt: str, answer_id: int, max_tokens: int, model: str):
    logging.basicConfig(level=logging.INFO)
    for _ in range(MAX_API_RETRY):
        try:
            response = openai.ChatCompletion.create(
                model=model,
                messages=[{
                    'role': 'system',
                    'content': sys_prompt
                }, {
                    'role': 'user',
                    'content': user_prompt,
                }],
                temperature=0.2,
                max_tokens=max_tokens,
            )
            review = response['choices'][0]['message']['content']
            return {"review": review, 'id': answer_id}
        except Exception as e:
            logger.error(e)
            time.sleep(1)
    logger.error(f' Review {answer_id} failed after {MAX_API_RETRY} retries.')
    return 'error'


def parse_score(review):
    try:
        pattern = re.compile('([0-9]|10) out of 10')
        sp = re.findall(pattern, review)
        if len(re.findall(pattern, review)) == 2:
            return [float(sp[0]), float(sp[1])]

        pattern = re.compile('a score of ([0-9]|10)')
        sp = re.findall(pattern, review)
        if len(re.findall(pattern, review)) == 2:
            return [float(sp[0]), float(sp[1])]

        pattern = re.compile('([0-9]|10)/10')
        sp = re.findall(pattern, review)
        if len(re.findall(pattern, review)) == 2:
            return [float(sp[0]), float(sp[1])]

        score_pair = review.split('\n')[0]
        score_pair = score_pair.replace(',', ' ')
        sp = score_pair.split(' ')
        if len(sp) == 2:
            return [float(sp[0]), float(sp[1])]
        else:
            raise Exception('Invalid score pair.')
    except Exception as e:
        return [-1, -1]


def gen_prompt(reviewer_jsons, prompt_jsons, cat, ques, ans1, ans2):
    reviewer_idx = 0
    for idx, reviewer in enumerate(reviewer_jsons):
        if reviewer['category'] == cat:
            reviewer_idx = idx
            break
    prompt_id = reviewer_jsons[reviewer_idx]['prompt_id']
    prompt_json = prompt_jsons[prompt_id - 1]
    assert prompt_json['prompt_id'] == prompt_id

    sys_prompt = prompt_json['system_prompt']
    prompt_template = prompt_json['prompt_template']
    defaults = prompt_json['defaults']
    prompt = prompt_template.format(
        question=ques, answer_1=ans1, answer_2=ans2, **defaults)

    return sys_prompt, prompt, reviewer_idx + 1


def evaluate(args):
    answer1_jsons = jload(args.answer_file_list[0])
    answer2_jsons = jload(args.answer_file_list[1])
    reviewer_jsons = get_json_list(args.reviewer_file)
    prompt_jsons = get_json_list(args.prompt_file)

    assert len(answer1_jsons) == len(answer2_jsons)

    handles = []
    review_jsons = []

    total_len = len(answer1_jsons)
    question_idx_list = list(range(total_len))

    logger.info(
        f' Total number of answers: {len(answer2_jsons)}.')

    reviews = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=args.num_workers) as executor:
        futures = []
        for i in question_idx_list:
            assert answer1_jsons[i]['id'] == answer2_jsons[i]['id']
            answer_id = answer1_jsons[i]['id']

            ques = answer1_jsons[i]['instruction'] if answer1_jsons[i]['input'] == "" else answer1_jsons[i]['instruction'] + \
                " " + answer1_jsons[i]['input']
            cat = answer1_jsons[i]['category']
            ans1 = answer1_jsons[i]['output']
            ans2 = answer2_jsons[i]['output']

            sys_prompt, prompt, reviewer_id = gen_prompt(
                reviewer_jsons, prompt_jsons, cat, ques, ans1, ans2)

            review_id = shortuuid.uuid()
            review_jsons.append({
                'review_id': review_id,
                'id': answer_id,
                'reviewer_id': reviewer_id,
                'metadata': {}
            })

            future = executor.submit(
                get_eval, sys_prompt, prompt, answer_id, args.max_tokens, args.model)
            futures.append(future)

        for future in tqdm.tqdm(concurrent.futures.as_completed(futures), total=len(futures)):
            reviews.append(future.result())

    reviews.sort(key=lambda x: x['id'])
    review_jsons.sort(key=lambda x: x['id'])

    ans1_score = 0
    ans2_score = 0
    better_count = 0
    worse_count = 0
    tie_count = 0
    invalid_count = 0

    better_file = []
    worse_file = []
    tie_file = []
    invalid_file = []
    output_review_file = []

    for idx, review in enumerate(reviews):
        scores = parse_score(review['review'])
        review_jsons[idx]['review'] = review['review']
        review_jsons[idx]['score'] = scores

        if scores[0] == -1 and scores[1] == -1:
            invalid_count += 1
            invalid_file.append(review_jsons[idx])
            logger.info(f' Invalid score pair: {review_jsons[idx]["id"]}.')
        else:
            if scores[0] > scores[1]:
                worse_count += 1
                worse_file.append(review_jsons[idx])
            elif scores[0] < scores[1]:
                better_count += 1
                better_file.append(review_jsons[idx])
            else:
                tie_count += 1
                tie_file.append(review_jsons[idx])
            ans1_score += scores[0]
            ans2_score += scores[1]

        output_review_file.append(review_jsons[idx])

    better_file.sort(key=lambda x: x['id'])
    worse_file.sort(key=lambda x: x['id'])
    tie_file.sort(key=lambda x: x['id'])
    invalid_file.sort(key=lambda x: x['id'])
    output_review_file.sort(key=lambda x: x['id'])

    name1 = os.path.basename(args.answer_file_list[0]).split("_answers")[0]
    name2 = os.path.basename(args.answer_file_list[1]).split("_answers")[0]
    prefix = f"{name1}_vs_{name2}"

    jdump(better_file, os.path.join(
        args.output_folder, prefix, f"{prefix}_better.json"))
    jdump(worse_file, os.path.join(
        args.output_folder, prefix, f"{prefix}_worse.json"))
    jdump(tie_file, os.path.join(
        args.output_folder, prefix, f"{prefix}_tie.json"))
    jdump(invalid_file, os.path.join(
        args.output_folder, prefix, f"{prefix}_invalid.json"))
    jdump(output_review_file, os.path.join(
        args.output_folder, prefix, f"{prefix}_review.json"))

    if os.path.exists(os.path.join(args.output_folder, "results.json")):
        results = jload(os.path.join(args.output_folder, "results.json"))
    else:
        results = {}
    results[prefix] = {'model': [name1, name2], 'better': better_count, 'worse': worse_count, 'tie': tie_count, 'win_rate': better_count /
                       (len(reviews) - invalid_count), 'score': [ans1_score / (len(reviews) - invalid_count), ans2_score / (len(reviews) - invalid_count)]}
    jdump(results, os.path.join(args.output_folder, "results.json"))

    logger.info(f' Total {invalid_count} invalid score pair(s).')
    logger.info(f' Model {name2} has {better_count} better answer(s).')
    logger.info(f' Model {name2} has {worse_count} worse answer(s).')
    logger.info(f' {tie_count} answer(s) play(s) to a tie.')
    logger.info(
        f' Win rate of model {name2}: {better_count/(len(reviews)-invalid_count):.2f}')
    logger.info(
        f' Model {name1} average score: {ans1_score/(len(reviews)-invalid_count):.2f}')
    logger.info(
        f' Model {name2} average score: {ans2_score/(len(reviews)-invalid_count):.2f}')


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Model evaluation.')
    parser.add_argument('--answer_file_list', nargs='+', default=[])
    parser.add_argument('--prompt_file')
    parser.add_argument('--reviewer_file')
    parser.add_argument('--output_folder', type=str, default="./output")
    parser.add_argument('--openai_key', type=str, default=None)
    parser.add_argument('--model', type=str, default="gpt-4")
    parser.add_argument('--num_workers', type=int, default=8)
    parser.add_argument('--max_tokens', type=int, default=512,
                        help='maximum number of tokens produced in the output')
    args = parser.parse_args()

    if args.openai_key is not None:
        os.environ["OPENAI_API_KEY"] = args.openai_key
    openai.api_key = os.getenv("OPENAI_API_KEY")

    evaluate(args)
@ -1,9 +0,0 @@
python evaluate.py \
    --answer_file_list "path to answers of model 1" "path to answers of model 2" \
    --prompt_file "path to prompt file" \
    --reviewer_file "path to reviewer file" \
    --output_folder "path to output folder" \
    --openai_key "your openai key" \
    --model "gpt-4" \
    --num_workers 8 \
    --max_tokens 512
@ -0,0 +1,130 @@
import os
from typing import Any, Dict, List

import gpt_evaluate
import metrics
import pandas as pd
from utils import get_data_per_category, jdump


class Evaluator(object):
    """
    A class named Evaluator that includes GPT-3.5/GPT-4 evaluation
    and automatic evaluation.
    """

    def __init__(self, params: Dict[str, Any], battle_prompt: Dict[str, Any],
                 gpt_evaluation_prompt: Dict[str, Any]) -> None:
        self.params = params
        self.battle_prompt = battle_prompt
        self.gpt_evaluation_prompt = gpt_evaluation_prompt
        self.automatic_metric_stats = dict()
        self.gpt35_evaluation_results = dict()
        self.battle_results = []

    def battle(self, answers1: List[Dict], answers2: List[Dict]) -> None:
        """
        Comparison between two models using GPT-4 as the reviewer.
        """

        self.battle_results = gpt_evaluate.battle(answers1, answers2, self.battle_prompt)

    def evaluate(self, answers: List[Dict], targets: List[Dict]) -> None:
        """
        A comprehensive evaluation of the answers from the model.
        The function evaluates the model's performance from different perspectives
        using GPT-3.5, GPT-4, and off-the-shelf evaluation metrics.

        The metrics are decided by the config file.
        """

        def switch(metric):
            if metric == "BLEU":
                return metrics.bleu_score(preds=predicts_list, targets=targets_list)
            elif metric == "ROUGE":
                return metrics.rouge_cn_score(preds=predicts_list, targets=targets_list)
            elif metric == "Distinct":
                return metrics.distinct_score(preds=predicts_list)
            elif metric == "BERTScore":
                return metrics.bert_score(preds=predicts_list, targets=targets_list)
            elif metric == "Precision":
                return metrics.precision(preds=predicts_list, targets=targets_list)
            elif metric == "Recall":
                return metrics.recall(preds=predicts_list, targets=targets_list)
            elif metric == "F1 score":
                return metrics.F1_score(preds=predicts_list, targets=targets_list)
            else:
                raise ValueError(f"Unexpected metric {metric}!")

        answers_per_category = get_data_per_category(answers, list(self.params.keys()))
        targets_per_category = get_data_per_category(targets, list(self.params.keys()))

        # automatic evaluation
        for category in self.params:
            category_metrics = self.params[category]["Metrics"]
            self.automatic_metric_stats[category] = {}

            targets_list = [
                target["target"] if target["target"] else target["output"] for target in targets_per_category[category]
            ]
            predicts_list = [answer["output"] for answer in answers_per_category[category]]

            for metric in category_metrics:
                self.automatic_metric_stats[category].update(switch(metric=metric))

        # gpt35 evaluation
        for category in self.params:
            category_metrics = self.params[category]["GPT-3.5"]

            prompt = self.gpt_evaluation_prompt.get(category, None)
            if prompt is None:
                print(f"No prompt for category {category}! Using the prompt for the general category instead.")
                prompt = self.gpt_evaluation_prompt["general"]

            self.gpt35_evaluation_results[category] = gpt_evaluate.gpt35_evaluate(answers_per_category[category],
                                                                                  prompt, category_metrics, category)

    def save(self, path: str, model_name_list: List[str]) -> None:
        """
        Save evaluation results of GPT-3.5, GPT-4, and off-the-shelf evaluation metrics.
        """

        if len(model_name_list) == 2:
            save_path = os.path.join(path, "gpt_evaluate", "battle_results")
            gpt_evaluate.save_battle_results(self.battle_results, model_name_list[0], model_name_list[1], save_path)
        else:
            # save evaluation results for automatic metrics
            automatic_df = pd.DataFrame(self.automatic_metric_stats)

            automatic_results_save_path = os.path.join(path, "automatic_results")
            if not os.path.exists(automatic_results_save_path):
                os.makedirs(automatic_results_save_path)
            automatic_df.to_csv(os.path.join(automatic_results_save_path, f"{model_name_list[0]}.csv"), index=True)

            # Save evaluation results for GPT-3.5 evaluation metrics.
            all_evaluations = []
            base_save_path = os.path.join(path, "gpt_evaluate", "gpt35_evaluate_results")
            evaluation_results_save_path = os.path.join(base_save_path, "evaluation_results")

            for category, evaluations in self.gpt35_evaluation_results.items():
                jdump(
                    evaluations,
                    os.path.join(evaluation_results_save_path, model_name_list[0],
                                 f"{category}_evaluation_results.json"))
                all_evaluations.extend(evaluations)

            jdump(all_evaluations,
                  os.path.join(evaluation_results_save_path, f"{model_name_list[0]}_evaluation_results.json"))

            # Calculate scores and save statistics.
            evaluation_statistics_save_path = os.path.join(base_save_path, "evaluation_statistics")
            gpt_evaluate.save_gpt35_evaluation_statistics(model_name_list[0], all_evaluations,
                                                          evaluation_statistics_save_path)

            # Save charts and csv.
            evaluation_analyses_save_path = os.path.join(base_save_path, "evaluation_analyses")
            gpt_evaluate.analyze_gpt35_evaluation_statistics(evaluation_statistics_save_path,
                                                             evaluation_analyses_save_path)
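A minimal sketch of driving Evaluator directly in single-model mode, i.e. the same path eval.py takes. All file names below are assumptions for illustration, and the GPT evaluation prompt file must supply the per-metric templates, metrics and CoT entries that gpt_evaluate.get_gpt35_evaluation expects.

from evaluator import Evaluator
from utils import jload

# Assumed local files; the config is the category -> {"GPT-3.5", "Metrics"} mapping shown above.
config = jload("config_cn.json")
metrics_per_category = {category: dict(settings) for category, settings in config["category"].items()}

gpt_evaluation_prompt = jload("gpt_evaluation_prompt_cn.json")   # illustrative path
answers = jload("model_A_answers.json")                          # illustrative path
targets = jload("targets.json")                                  # illustrative path

evaluator = Evaluator(metrics_per_category, battle_prompt=None,
                      gpt_evaluation_prompt=gpt_evaluation_prompt)
evaluator.evaluate(answers=answers, targets=targets)
# Writes automatic_results/<model>.csv and gpt_evaluate/gpt35_evaluate_results/... under "results".
evaluator.save("results", ["model_A"])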
@ -1,173 +0,0 @@
import argparse
import os
import random
import copy
import math
from tqdm import tqdm

import torch
import torch.distributed as dist
import transformers

from coati.models.bloom import BLOOMActor
from coati.models.gpt import GPTActor
from coati.models.opt import OPTActor
from coati.models.roberta import RoBERTaActor
from coati.models.llama import LlamaActor
from coati.trainer.strategies import ColossalAIStrategy, DDPStrategy, NaiveStrategy
from transformers import AutoTokenizer, RobertaTokenizer
from transformers.models.gpt2.tokenization_gpt2 import GPT2Tokenizer

from colossalai.logging import get_dist_logger

from utils import jload, jdump, is_rank_0

logger = get_dist_logger()

PROMPT_DICT = {
    "prompt_input":
        ("Below is an instruction that describes a task, paired with an input that provides further context. "
         "Write a response that appropriately completes the request.\n\n"
         "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:"),
    "prompt_no_input": ("Below is an instruction that describes a task. "
                        "Write a response that appropriately completes the request.\n\n"
                        "### Instruction:\n{instruction}\n\n### Response:"),
}


def generate(args):
    # torch.cuda.set_per_process_memory_fraction(0.4)
    if args.strategy == 'naive':
        strategy = NaiveStrategy()
    elif args.strategy == 'ddp':
        strategy = DDPStrategy()
    elif args.strategy == 'colossalai_gemini':
        strategy = ColossalAIStrategy(stage=3, placement_policy='cuda')
    elif args.strategy == 'colossalai_zero2':
        strategy = ColossalAIStrategy(stage=2, placement_policy='cuda')
    elif args.strategy == 'colossalai_zero2_cpu':
        strategy = ColossalAIStrategy(stage=2, placement_policy='cpu')
    else:
        raise ValueError(f'Unsupported strategy "{args.strategy}"')

    world_size = dist.get_world_size()
    rank = dist.get_rank()

    with strategy.model_init_context():
        if args.model == 'gpt2':
            actor = GPTActor(pretrained=args.model_path).to(
                torch.cuda.current_device())
        elif args.model == 'bloom':
            actor = BLOOMActor(pretrained=args.model_path).to(
                torch.cuda.current_device())
        elif args.model == 'opt':
            actor = OPTActor(pretrained=args.model_path).to(
                torch.cuda.current_device())
        elif args.model == 'roberta':
            actor = RoBERTaActor(pretrained=args.model_path).to(
                torch.cuda.current_device())
        elif args.model == 'llama':
            actor = LlamaActor(pretrained=args.model_path).to(
                torch.float16).to(torch.cuda.current_device())
        else:
            raise ValueError(f'Unsupported model "{args.model}"')

    if args.model == 'gpt2':
        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
        tokenizer.pad_token = tokenizer.eos_token
    elif args.model == 'bloom':
        tokenizer = AutoTokenizer.from_pretrained('bigscience/bloom-560m')
        tokenizer.pad_token = tokenizer.eos_token
    elif args.model == 'opt':
        tokenizer = AutoTokenizer.from_pretrained('facebook/opt-350m')
    elif args.model == 'roberta':
        tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
    elif args.model == 'llama':
        tokenizer = AutoTokenizer.from_pretrained(args.model_path,
                                                  padding_side="right",
                                                  use_fast=False,
                                                  )
        tokenizer.eos_token = '<\s>'
    else:
        raise ValueError(f'Unsupported model "{args.model}"')

    questions = []
    if args.max_datasets_size is not None:
        questions = random.sample(jload(args.dataset), args.max_datasets_size)
        if is_rank_0():
            logger.info(
                f"Limiting dataset to {args.max_datasets_size} examples.")
        questions = questions[rank:args.max_datasets_size:world_size]

    answers = copy.deepcopy(questions)

    prompt_input, prompt_no_input = PROMPT_DICT["prompt_input"], PROMPT_DICT["prompt_no_input"]
    sources = [
        prompt_input.format_map(example) if example.get(
            "input", "") != "" else prompt_no_input.format_map(example)
        for example in questions
    ]

    if is_rank_0():
        logger.info("Tokenizing inputs... This may take some time...")

    input_ids_list = []

    for string in sources:
        input_ids = tokenizer.encode(string, return_tensors='pt').squeeze(0)
        input_ids_list.append(input_ids)

    bar = tqdm(range(math.ceil(len(input_ids_list) / args.batch_size)),
               desc=f'steps', disable=not is_rank_0())

    actor.eval()
    with torch.no_grad():
        for i in range(0, len(input_ids_list), args.batch_size):
            batch = input_ids_list[i:i + args.batch_size]
            batch = [i.flip(dims=[0]) for i in batch]
            batch = torch.nn.utils.rnn.pad_sequence(batch,
                                                    batch_first=True,
                                                    padding_value=tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0).to(torch.cuda.current_device())
            batch = batch.flip(dims=[1])
            attention_mask = batch.ne(tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0)

            outputs = actor.model.generate(batch, attention_mask=attention_mask,
                                           max_length=args.max_length,
                                           do_sample=True,
                                           top_k=50,
                                           top_p=0.95,
                                           num_return_sequences=1)

            outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)
            for j in range(batch.size(0)):
                answers[i +
                        j]['output'] = outputs[j].split("### Response:")[1].strip()

            bar.update()

    jdump(answers, os.path.join(args.answer_path,
                                f'{args.model_name}_answers_rank{rank}.json'))

    if is_rank_0():
        logger.info(
            f'Peak CUDA mem: {torch.cuda.max_memory_allocated()/1024**3:.3f} GB')


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--strategy',
                        choices=['naive', 'ddp', 'colossalai_gemini',
                                 'colossalai_zero2', 'colossalai_zero2_cpu'],
                        default='naive')
    parser.add_argument('--model', default='gpt2',
                        choices=['gpt2', 'bloom', 'opt', 'roberta', 'llama'])
    parser.add_argument('--model_path', type=str, default=None)
    parser.add_argument('--model_name', type=str, default='model')
    parser.add_argument('--dataset', type=str, default=None)
    parser.add_argument('--batch_size', type=int, default=1)
    parser.add_argument('--max_datasets_size', type=int, default=None)
    parser.add_argument('--answer_path', type=str, default="answer")
    parser.add_argument('--max_length', type=int, default=1024)
    args = parser.parse_args()
    generate(args)
@ -1,25 +0,0 @@
device_number=number of your devices
model_name="name of your model"
model_path="path to your model"
dataset="path to the question dataset"
answer_path="path to save the model answers"

torchrun --standalone --nproc_per_node=$device_number generate_answers.py \
    --model 'llama' \
    --strategy ddp \
    --model_path $model_path \
    --model_name $model_name \
    --dataset $dataset \
    --batch_size 8 \
    --max_datasets_size 80 \
    --answer_path $answer_path \
    --max_length 512

python merge.py \
    --model_name $model_name \
    --shards $device_number \
    --answer_path $answer_path

for (( i=0; i<device_number; i++ )) do
    rm -rf "${answer_path}/${model_name}_answers_rank${i}.json"
done
@ -1,98 +0,0 @@
# Adapted form https://github.com/lm-sys/FastChat/blob/main/fastchat/eval/qa_baseline_gpt35.py
# Copyright 2023 LM-SYS@FastChat

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at

#     http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import json
import os
import time
import concurrent.futures

import openai
import tqdm
import shortuuid
import logging

from utils import jload, jdump

MODEL = 'gpt-3.5-turbo'
MAX_API_RETRY = 3

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def get_answer(question: str, max_tokens: int):
    answer = question
    prompt = question['instruction'] if question['input'] == "" else question['instruction'] + \
        " " + question['input']
    for _ in range(MAX_API_RETRY):
        try:
            response = openai.ChatCompletion.create(
                model='gpt-3.5-turbo',
                messages=[{
                    'role': 'system',
                    'content': 'You are a helpful assistant.'
                }, {
                    'role': 'user',
                    'content': prompt,
                }],
                max_tokens=max_tokens,
            )
            answer['output'] = response['choices'][0]['message']['content']
            return answer
        except Exception as e:
            logger.error(e)
            time.sleep(1)
    logger.error(f' Answer {question["id"]} failed after {MAX_API_RETRY} retries.')
    return answer


def evaluate_gpt35(args):
    questions = jload(args.dataset)

    logger.info(
        f' Total number of answers: {len(questions)}.')
    logger.info(
        f' Waiting for {args.request_time_gap} seconds before sending the next request.')

    answers = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=args.num_workers) as executor:
        futures = []
        for question in questions:
            future = executor.submit(get_answer, question, args.max_tokens)
            futures.append(future)

        for future in tqdm.tqdm(concurrent.futures.as_completed(futures), total=len(futures)):
            answers.append(future.result())

    answers.sort(key=lambda x: x['id'])

    jdump(answers, os.path.join(args.answer_path,
                                f'gpt35_answers.json'))


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Evaluate GPT 3.5.')
    parser.add_argument('--dataset', type=str, default="questions.json")
    parser.add_argument('--answer_path', type=str, default="answer")
    parser.add_argument('--num_workers', type=int, default=4)
    parser.add_argument('--openai_key', type=str, default=None)
    parser.add_argument('--max_tokens', type=int, default=1024)

    args = parser.parse_args()

    if args.openai_key is not None:
        os.environ["OPENAI_API_KEY"] = args.openai_key
    openai.api_key = os.getenv("OPENAI_API_KEY")

    evaluate_gpt35(args)
@ -1,6 +0,0 @@
python generate_gpt35_answers.py \
    --dataset "path to the question dataset" \
    --answer_path "path to answer folder" \
    --num_workers 4 \
    --openai_key "your openai key" \
    --max_tokens 512
@ -0,0 +1,496 @@
|
|||||||
|
import concurrent.futures
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import time
|
||||||
|
from copy import deepcopy
|
||||||
|
from typing import Any, Dict, List
|
||||||
|
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
import numpy as np
|
||||||
|
import openai
|
||||||
|
import pandas as pd
|
||||||
|
import seaborn as sns
|
||||||
|
import tqdm
|
||||||
|
from utils import jdump, jload
|
||||||
|
|
||||||
|
|
||||||
|
def get_battle_result(sys_prompt: str, user_prompt: str, id: int, max_tokens: int = 2048) -> Dict[str, Any]:
|
||||||
|
"""
|
||||||
|
Get evaluation from GPT-4.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
sys_prompt: prompt for the system.
|
||||||
|
user_prompt: prompt for the user.
|
||||||
|
id: id of the answers for comparison.
|
||||||
|
max_tokens: the maximum number of tokens to generate in the chat completion.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
An evaluation of one comparison.
|
||||||
|
"""
|
||||||
|
|
||||||
|
MAX_API_RETRY = 3
|
||||||
|
for _ in range(MAX_API_RETRY):
|
||||||
|
try:
|
||||||
|
response = openai.ChatCompletion.create(
|
||||||
|
model="gpt-4",
|
||||||
|
messages=[
|
||||||
|
{
|
||||||
|
"role": "system",
|
||||||
|
"content": sys_prompt
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"role": "user",
|
||||||
|
"content": user_prompt,
|
||||||
|
},
|
||||||
|
],
|
||||||
|
temperature=0.2,
|
||||||
|
max_tokens=max_tokens,
|
||||||
|
)
|
||||||
|
evaluation = response["choices"][0]["message"]["content"]
|
||||||
|
return {"evaluation": evaluation, "id": id}
|
||||||
|
except Exception as e:
|
||||||
|
print(e)
|
||||||
|
time.sleep(1)
|
||||||
|
print(f" Evaluation {id} failed after {MAX_API_RETRY} retries.")
|
||||||
|
return {"evaluation": "", "id": id}
|
||||||
|
|
||||||
|
|
||||||
|
def parse_battle_score(evaluation: str) -> List[float]:
|
||||||
|
"""
|
||||||
|
Parse evaluation from GPT-4 and get the scores of model 1 and 2.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
evaluation: evaluation from GPT-4.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
A score pair of two different model answers.
|
||||||
|
"""
|
||||||
|
|
||||||
|
try:
|
||||||
|
pattern = re.compile("([0-9]|10) out of 10")
|
||||||
|
sp = re.findall(pattern, evaluation)
|
||||||
|
if len(re.findall(pattern, evaluation)) == 2:
|
||||||
|
return [float(sp[0]), float(sp[1])]
|
||||||
|
|
||||||
|
pattern = re.compile("a score of ([0-9]|10)")
|
||||||
|
sp = re.findall(pattern, evaluation)
|
||||||
|
if len(re.findall(pattern, evaluation)) == 2:
|
||||||
|
return [float(sp[0]), float(sp[1])]
|
||||||
|
|
||||||
|
pattern = re.compile("([0-9]|10)/10")
|
||||||
|
sp = re.findall(pattern, evaluation)
|
||||||
|
if len(re.findall(pattern, evaluation)) == 2:
|
||||||
|
return [float(sp[0]), float(sp[1])]
|
||||||
|
|
||||||
|
score_pair = evaluation.split("\n")[0]
|
||||||
|
score_pair = score_pair.replace(",", " ")
|
||||||
|
sp = score_pair.split(" ")
|
||||||
|
if len(sp) == 2:
|
||||||
|
return [float(sp[0]), float(sp[1])]
|
||||||
|
else:
|
||||||
|
raise Exception(f"Invalid score pair. Got {evaluation}.")
|
||||||
|
except Exception as e:
|
||||||
|
return [-1, -1]
|
||||||
|
|
||||||
|
|
||||||
|
def battle(answer1: List[Dict], answer2: List[Dict], prompt_dict: Dict[str, Any]) -> List[Dict]:
|
||||||
|
"""
|
||||||
|
Use GPT-4 to compare answers of two different models.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
answer1: answers of model 1.
|
||||||
|
answer2: answers of model 2.
|
||||||
|
prompt_dict: prompt for battle.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Evaluations of all comparison pairs.
|
||||||
|
"""
|
||||||
|
|
||||||
|
assert len(answer1) == len(answer2)
|
||||||
|
|
||||||
|
handles = []
|
||||||
|
evaluation_file = []
|
||||||
|
|
||||||
|
total_len = len(answer1)
|
||||||
|
question_idx_list = list(range(total_len))
|
||||||
|
|
||||||
|
print(f" Total number of answers: {len(answer1)}.")
|
||||||
|
|
||||||
|
evaluations = []
|
||||||
|
with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
|
||||||
|
futures = []
|
||||||
|
for i in question_idx_list:
|
||||||
|
assert answer1[i]["id"] == answer2[i]["id"]
|
||||||
|
answer_id = answer1[i]["id"]
|
||||||
|
|
||||||
|
ques = answer1[i]["instruction"] if answer1[i][
|
||||||
|
"input"] == "" else answer1[i]["instruction"] + " " + answer1[i]["input"]
|
||||||
|
cat = answer1[i]["category"]
|
||||||
|
ans1 = answer1[i]["output"]
|
||||||
|
ans2 = answer2[i]["output"]
|
||||||
|
|
||||||
|
sys_prompt = prompt_dict["system_prompt"]
|
||||||
|
prompt_template = prompt_dict["prompt_template"]
|
||||||
|
prompt = prompt_template.format(
|
||||||
|
question=ques,
|
||||||
|
answer_1=ans1,
|
||||||
|
answer_2=ans2,
|
||||||
|
prompt=prompt_dict["prompt"],
|
||||||
|
)
|
||||||
|
|
||||||
|
future = executor.submit(get_battle_result, sys_prompt, prompt, answer_id, 2048)
|
||||||
|
futures.append(future)
|
||||||
|
|
||||||
|
for future in tqdm.tqdm(concurrent.futures.as_completed(futures), total=len(futures)):
|
||||||
|
evaluations.append(future.result())
|
||||||
|
|
||||||
|
evaluations.sort(key=lambda x: x["id"])
|
||||||
|
|
||||||
|
return evaluations
|
||||||
|
|
||||||
|
|
||||||
|
def save_battle_results(evaluations: List[Dict], name1: str, name2: str, save_path: str) -> None:
|
||||||
|
"""
|
||||||
|
Save evaluation results (model 1 vs model 2) from GPT-4.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
evaluations: evaluation results from GPT-4.
|
||||||
|
name1: model 1 's name.
|
||||||
|
name2: model 2 's name.
|
||||||
|
save_path: path to save battle results.
|
||||||
|
"""
|
||||||
|
|
||||||
|
evaluation_file = deepcopy(evaluations)
|
||||||
|
|
||||||
|
ans1_score = 0
|
||||||
|
ans2_score = 0
|
||||||
|
better_count = 0
|
||||||
|
worse_count = 0
|
||||||
|
tie_count = 0
|
||||||
|
invalid_count = 0
|
||||||
|
|
||||||
|
better_file = []
|
||||||
|
worse_file = []
|
||||||
|
tie_file = []
|
||||||
|
invalid_file = []
|
||||||
|
|
||||||
|
for idx, evaluation in enumerate(evaluations):
|
||||||
|
scores = parse_battle_score(evaluation["evaluation"])
|
||||||
|
evaluation_file[idx]["score"] = scores
|
||||||
|
|
||||||
|
if scores[0] == -1 and scores[1] == -1:
|
||||||
|
invalid_count += 1
|
||||||
|
invalid_file.append(evaluation_file[idx])
|
||||||
|
print(f'Invalid score pair: {evaluation_file[idx]["id"]}.')
|
||||||
|
else:
|
||||||
|
if scores[0] > scores[1]:
|
||||||
|
worse_count += 1
|
||||||
|
worse_file.append(evaluation_file[idx])
|
||||||
|
elif scores[0] < scores[1]:
|
||||||
|
better_count += 1
|
||||||
|
better_file.append(evaluation_file[idx])
|
||||||
|
else:
|
||||||
|
tie_count += 1
|
||||||
|
tie_file.append(evaluation_file[idx])
|
||||||
|
ans1_score += scores[0]
|
||||||
|
ans2_score += scores[1]
|
||||||
|
|
||||||
|
prefix = f"{name1}_vs_{name2}"
|
||||||
|
|
||||||
|
if not os.path.exists(save_path):
|
||||||
|
os.makedirs(save_path)
|
||||||
|
|
||||||
|
jdump(better_file, os.path.join(save_path, prefix, f"{name2}_better.json"))
|
||||||
|
jdump(worse_file, os.path.join(save_path, prefix, f"{name2}_worse.json"))
|
||||||
|
jdump(tie_file, os.path.join(save_path, prefix, f"{prefix}_tie.json"))
|
||||||
|
jdump(invalid_file, os.path.join(save_path, prefix, f"{prefix}_invalid.json"))
|
||||||
|
jdump(evaluation_file, os.path.join(save_path, prefix, f"{prefix}_evaluations.json"))
|
||||||
|
|
||||||
|
if os.path.exists(os.path.join(save_path, "battle_results.json")):
|
||||||
|
results = jload(os.path.join(save_path, "battle_results.json"))
|
||||||
|
else:
|
||||||
|
results = {}
|
||||||
|
|
||||||
|
results[prefix] = {
|
||||||
|
"model": [name1, name2],
|
||||||
|
"better": better_count,
|
||||||
|
"worse": worse_count,
|
||||||
|
"tie": tie_count,
|
||||||
|
"win_rate": better_count / (len(evaluations) - invalid_count),
|
||||||
|
"score": [
|
||||||
|
ans1_score / (len(evaluations) - invalid_count),
|
||||||
|
ans2_score / (len(evaluations) - invalid_count),
|
||||||
|
],
|
||||||
|
}
|
||||||
|
jdump(results, os.path.join(save_path, "battle_results.json"))
|
||||||
|
|
||||||
|
print(f"Total {invalid_count} invalid score pair(s).")
|
||||||
|
print(f"Model {name2} has {better_count} better answer(s).")
|
||||||
|
print(f"Model {name2} has {worse_count} worse answer(s).")
|
||||||
|
print(f"{tie_count} answer(s) play(s) to a tie.")
|
||||||
|
print(f"Win rate of model {name2}: {better_count/(len(evaluations)-invalid_count):.2f}")
|
||||||
|
print(f"Model {name1} average score: {ans1_score/(len(evaluations)-invalid_count):.2f}")
|
||||||
|
print(f"Model {name2} average score: {ans2_score/(len(evaluations)-invalid_count):.2f}")
|
||||||
|
|
||||||
|
|
||||||
|
def get_gpt35_evaluation(prompt: Dict[str, Any],
|
||||||
|
inst: Dict[str, Any],
|
||||||
|
metrics: List[str],
|
||||||
|
max_tokens: int = 2048) -> Dict[str, Any]:
|
||||||
|
"""
|
||||||
|
Use GPT-3.5 to evaluate one model answer.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
prompt: a dictionary including prompt template, CoT and metrics.
|
||||||
|
inst: the instruction that is needed to be evaluated.
|
||||||
|
metrics: the metrics for evaluation.
|
||||||
|
max_tokens: the maximum number of tokens to generate in the completion.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
An evaluation of one answer.
|
||||||
|
"""
|
||||||
|
|
||||||
|
MAX_API_RETRY = 3
|
||||||
|
|
||||||
|
question = (inst["instruction"] if inst["input"] == "" else inst["instruction"] + " " + inst["input"])
|
||||||
|
answer = inst["output"]
|
||||||
|
inst["evaluation"] = {}
|
||||||
|
|
||||||
|
for metric in metrics:
|
||||||
|
if prompt["metrics"].get(metric, None) is None:
|
||||||
|
raise Exception(
|
||||||
|
f"Unsupported metric {metric} for category {inst['category']}! You should add this metric in the prompt file!"
|
||||||
|
)
|
||||||
|
for i in range(MAX_API_RETRY):
|
||||||
|
try:
|
||||||
|
response = openai.Completion.create(
|
||||||
|
model="text-davinci-003",
|
||||||
|
prompt=prompt["prompt"].format(
|
||||||
|
question=question,
|
||||||
|
answer=answer,
|
||||||
|
metric=prompt["metrics"][metric],
|
||||||
|
steps=prompt["CoT"][metric],
|
||||||
|
),
|
||||||
|
logprobs=5,
|
||||||
|
temperature=0,
|
||||||
|
max_tokens=max_tokens,
|
||||||
|
)
|
||||||
|
inst["evaluation"][metric] = {
|
||||||
|
"response": response["choices"][0]["text"],
|
||||||
|
"logprobs": response["choices"][0]["logprobs"]["top_logprobs"],
|
||||||
|
}
|
||||||
|
break
|
||||||
|
except Exception as e:
|
||||||
|
print(e)
|
||||||
|
time.sleep(1)
|
||||||
|
return inst
|
||||||
|
|
||||||
|
|
||||||
|
def gpt35_evaluate(
|
||||||
|
answers: List[Dict],
|
||||||
|
prompt: Dict[str, Any],
|
||||||
|
metrics: List[str],
|
||||||
|
category: str,
|
||||||
|
) -> List[Dict]:
|
||||||
|
"""
|
||||||
|
Use GPT-3.5 to evaluate model answers and save evaluation results.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
answers: model answers.
|
||||||
|
prompt: prompt for GPT-3.5 evaluation.
|
||||||
|
metrics: metrics for GPT-3.5 evaluation.
|
||||||
|
category: the category of the model answers for evaluation.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Evaluations of the given answers.
|
||||||
|
"""
|
||||||
|
|
||||||
|
print(f"The number of instances of category {category}'s is {len(answers)}.")
|
||||||
|
|
||||||
|
evaluations = []
|
||||||
|
|
||||||
|
metrics_str = ", ".join(x for x in metrics)
|
||||||
|
print(f"Category {category}'s metrics are {metrics_str}.")
|
||||||
|
|
||||||
|
with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
|
||||||
|
futures = []
|
||||||
|
for inst in answers:
|
||||||
|
future = executor.submit(get_gpt35_evaluation, prompt, inst, metrics, 1)
|
||||||
|
futures.append(future)
|
||||||
|
|
||||||
|
for future in tqdm.tqdm(
|
||||||
|
concurrent.futures.as_completed(futures),
|
||||||
|
desc=f"{category}: ",
|
||||||
|
total=len(futures),
|
||||||
|
):
|
||||||
|
evaluations.append(future.result())
|
||||||
|
|
||||||
|
evaluations.sort(key=lambda x: x["id"])
|
||||||
|
|
||||||
|
print(f"{category} done.")
|
||||||
|
|
||||||
|
return evaluations
|
||||||
|
|
||||||
|
|
||||||
|
def calculate_scores_form_logprobs(logprobs: Dict[str, Any]) -> float:
|
||||||
|
"""
|
||||||
|
Calculate score from log probabilities returned by text-davinci-003.
|
||||||
|
Only openai.Completion can return logprobs.
|
||||||
|
|
||||||
|
Calculation formula:
|
||||||
|
score = sum(score_i * exp(value)) where score_i is the score which corresponds to the key(predicted token) and value is its log probability.
|
||||||
|
|
||||||
|
Ref: https://arxiv.org/abs/2303.16634
|
||||||
|
This paper proposes NLG evaluation methods using GPT-3.5(logprobs returned by openai api) and GPT-4(logprobs obtained by sampling).
|
||||||
|
|
||||||
|
Args:
|
||||||
|
logprobs: logprobs returned by openai.Completion.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Score of one answer.
|
||||||
|
"""
|
||||||
|
|
||||||
|
# GPT-3.5 only returns score of 1 to 5.
|
||||||
|
prob = np.zeros(5)
|
||||||
|
|
||||||
|
for key, value in logprobs.items():
|
||||||
|
# Sometimes the key will be one byte of a unicode character which takes the form of "bytes:\\xe7".
|
||||||
|
# It is meaningless and thus we don't calculate probability.
|
||||||
|
if "bytes" in key:
|
||||||
|
continue
|
||||||
|
# results[0] is the score which corresponds to the key(predicted token).
|
||||||
|
# For example, key "5" corresponds to score 5.
|
||||||
|
results = re.findall(r"\d", key)
|
||||||
|
if len(results) == 1:
|
||||||
|
prob[int(results[0]) - 1] = prob[int(results[0]) - 1] + np.exp(value)
|
||||||
|
|
||||||
|
score = np.dot(np.arange(1, 6), prob)
|
||||||
|
|
||||||
|
return score
|
||||||
|
|
||||||
|
|
||||||
|
def save_gpt35_evaluation_statistics(model_name: str, evaluations: List[Dict], save_path: str) -> None:
|
||||||
|
"""
|
||||||
|
Generate statistics for one model.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
model_name: name of the model for saving statistics.
|
||||||
|
evaluations: evaluations for all of the model answers.
|
||||||
|
save_path: path to save GPT-3.5 evaluation statistics.
|
||||||
|
"""
|
||||||
|
|
||||||
|
if not os.path.exists(save_path):
|
||||||
|
os.makedirs(save_path)
|
||||||
|
|
||||||
|
data_per_category = {}
|
||||||
|
for evaluation in evaluations:
|
||||||
|
category = evaluation["category"]
|
||||||
|
if evaluation["category"] in data_per_category.keys():
|
||||||
|
data_per_category[category].append(evaluation)
|
||||||
|
else:
|
||||||
|
data_per_category[category] = [evaluation]
|
||||||
|
|
||||||
|
all_statistics = {}
|
||||||
|
for category, data in data_per_category.items():
|
||||||
|
metrics = data[0]["evaluation"].keys()
|
||||||
|
scores = {metric: [] for metric in metrics}
|
||||||
|
for evaluation in data:
|
||||||
|
for metric in metrics:
|
||||||
|
scores[metric].append(calculate_scores_form_logprobs(evaluation["evaluation"][metric]["logprobs"][0]))
|
||||||
|
|
||||||
|
statistics = {}
|
||||||
|
for metric in metrics:
|
||||||
|
arg_sort = np.argsort(scores[metric])
|
||||||
|
statistics[metric] = {}
|
||||||
|
statistics[metric]["avg_score"] = sum(scores[metric]) / len(data)
|
||||||
|
statistics[metric]["best_3"] = {data[i]["id"]: scores[metric][i] for i in arg_sort[-3:][::-1]}
|
||||||
|
statistics[metric]["worst_3"] = {data[i]["id"]: scores[metric][i] for i in arg_sort[:3]}
|
||||||
|
|
||||||
|
all_statistics[category] = statistics
|
||||||
|
|
||||||
|
jdump(
|
||||||
|
all_statistics,
|
||||||
|
os.path.join(save_path, f"{model_name}_evaluation_statistics.json"),
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def analyze_gpt35_evaluation_statistics(statistics_path: str, save_path: str) -> None:
    """
    Analyze and visualize all GPT-3.5 evaluation statistics in the given directory.

    Args:
        statistics_path: path to all the models' statistics.
        save_path: path to save table and visualization results.
    """

    if not os.path.exists(statistics_path):
        raise Exception(f'The given directory "{statistics_path}" doesn\'t exist! No statistics found!')

    all_statistics = {}

    for file_name in os.listdir(statistics_path):
        if file_name.endswith("_evaluation_statistics.json"):
            model_name = file_name.split("_evaluation_statistics.json")[0]
            all_statistics[model_name] = jload(os.path.join(statistics_path, file_name))

    if len(list(all_statistics.keys())) == 0:
        raise Exception(f'There are no statistics in the given directory "{statistics_path}"!')

    frame_all = {
        "model": [],
        "category": [],
        "metric": [],
        "avg_score": [],
        "best_3": [],
        "worst_3": [],
    }
    frame_per_category = {}
    for model_name, model_statistics in all_statistics.items():
        for category, category_statistics in model_statistics.items():
            if frame_per_category.get(category) is None:
                frame_per_category[category] = {
                    "model": [],
                    "metric": [],
                    "avg_score": [],
                    "best_3": [],
                    "worst_3": [],
                }

            for metric, metric_statistics in category_statistics.items():
                frame_all["model"].append(model_name)
                frame_all["category"].append(category)
                frame_all["metric"].append(metric)
                frame_all["avg_score"].append(metric_statistics["avg_score"])
                frame_all["best_3"].append(metric_statistics["best_3"])
                frame_all["worst_3"].append(metric_statistics["worst_3"])

                frame_per_category[category]["model"].append(model_name)
                frame_per_category[category]["metric"].append(metric)
                frame_per_category[category]["avg_score"].append(metric_statistics["avg_score"])
                frame_per_category[category]["best_3"].append(metric_statistics["best_3"])
                frame_per_category[category]["worst_3"].append(metric_statistics["worst_3"])

    if not os.path.exists(save_path):
        os.makedirs(save_path)

    frame_all = pd.DataFrame(frame_all)
    frame_all.to_csv(os.path.join(save_path, "gpt35_evaluation_statistics.csv"))

    for category in tqdm.tqdm(
            frame_per_category.keys(),
            desc="category: ",
            total=len(frame_per_category.keys()),
    ):
        data = pd.DataFrame(frame_per_category[category])

        sns.set()
        fig = plt.figure(figsize=(16, 10))
        plt.ylim((0, 5))

        fig = sns.barplot(x="metric", y="avg_score", hue="model", data=data, dodge=True)
        fig.set_title(f"Comparison between Different Models for Category {category.title()}")
        plt.xlabel("Evaluation Metric")
        plt.ylabel("Average Score")

        figure = fig.get_figure()
        figure.savefig(os.path.join(save_path, f"{category}.png"), dpi=400)

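# Editorial usage sketch (not part of the original file); the directory names
# below are hypothetical examples. `statistics_path` is expected to contain one
# "<model_name>_evaluation_statistics.json" file per evaluated model, as
# written by the statistics-saving step above; the CSV table and per-category
# bar charts are written to `save_path`.
if __name__ == "__main__":
    analyze_gpt35_evaluation_statistics(
        statistics_path="evaluation_statistics",
        save_path="evaluation_analysis",
    )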
@ -1,25 +0,0 @@
import argparse
import os

from utils import jload, jdump


def generate(args):
    dataset = []
    for i in range(args.shards):
        shard = jload(os.path.join(args.answer_path,
                                   f'{args.model_name}_answers_rank{i}.json'))
        dataset.extend(shard)

    dataset.sort(key=lambda x: x['id'])
    jdump(dataset, os.path.join(args.answer_path,
                                f'{args.model_name}_answers.json'))


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_name', type=str, default='model')
    parser.add_argument('--shards', type=int, default=4)
    parser.add_argument('--answer_path', type=str, default="answer")
    args = parser.parse_args()
    generate(args)
@ -0,0 +1,169 @@
import statistics

import jieba
from bert_score import score
from nltk.translate.bleu_score import sentence_bleu
from rouge_chinese import Rouge as Rouge_cn
from sklearn.metrics import f1_score, precision_score, recall_score


def bleu_score(preds: list, targets: list) -> dict:
    """Calculate BLEU Score Metric

    The calculation includes BLEU-1 for unigrams, BLEU-2 for bigrams,
    BLEU-3 for trigrams and BLEU-4 for 4-grams. BLEU-1 evaluates
    accuracy at the word level, while the higher-order n-grams evaluate
    fluency at the sentence level.
    """
    bleu_scores = {"bleu1": 0, "bleu2": 0, "bleu3": 0, "bleu4": 0}
    cumulative_bleu = [0] * 4
    weights = [(1. / 1., 0., 0., 0.), (1. / 2., 1. / 2., 0., 0.), (1. / 3., 1. / 3., 1. / 3., 0.),
               (1. / 4., 1. / 4., 1. / 4., 1. / 4.)]

    for pred, target in zip(preds, targets):
        pred_list = (' '.join(jieba.cut(pred))).split()
        target_list = [(' '.join(jieba.cut(target))).split()]

        bleu = sentence_bleu(target_list, pred_list, weights=weights)
        cumulative_bleu = [a + b for a, b in zip(cumulative_bleu, bleu)]

    for i in range(len(cumulative_bleu)):
        bleu_scores[f"bleu{i+1}"] = cumulative_bleu[i] / len(preds)

    return bleu_scores

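# Editorial usage sketch (not part of the original file); the two sentences are
# made-up examples. Each string is segmented with jieba before the n-gram
# overlap is computed, and the cumulative BLEU-1 to BLEU-4 values are averaged
# over all pred/target pairs, returning a dict with keys "bleu1" to "bleu4".
_example_bleu = bleu_score(preds=["今天的天气非常好"], targets=["今天天气很好"])
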
def rouge_cn_score(preds: list, targets: list) -> dict:
    """Calculate Chinese ROUGE Score Metric

    The calculation includes ROUGE-1 for unigrams, ROUGE-2 for bigrams
    and ROUGE-L. ROUGE-N counts the matching n-grams between the preds
    and targets, while ROUGE-L measures the longest common subsequence
    (LCS) between preds and targets.
    """
    rouge_scores = {"rouge1": {}, "rouge2": {}, "rougeL": {}}
    all_preds = []
    all_targets = []

    for pred, target in zip(preds, targets):
        pred_list = ' '.join(jieba.cut(pred))
        target_list = ' '.join(jieba.cut(target))
        all_preds.append(pred_list)
        all_targets.append(target_list)

    rouge_cn = Rouge_cn()
    rouge_avg = rouge_cn.get_scores(all_preds, all_targets, avg=True)

    rouge_scores["rouge1"] = rouge_avg["rouge-1"]["f"]
    rouge_scores["rouge2"] = rouge_avg["rouge-2"]["f"]
    rouge_scores["rougeL"] = rouge_avg["rouge-l"]["f"]

    return rouge_scores

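# Editorial usage sketch (not part of the original file); the inputs are the
# same kind of made-up Chinese sentences. The function returns the averaged
# F-measures under the keys "rouge1", "rouge2" and "rougeL".
_example_rouge = rouge_cn_score(preds=["今天的天气非常好"], targets=["今天天气很好"])
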
def distinct_score(preds: list) -> dict:
    """Calculate Distinct Score Metric

    This metric refers to https://arxiv.org/abs/1510.03055.
    It evaluates the diversity of the generated text by counting
    unique n-grams.
    """
    distinct_score = {"distinct": 0}
    cumulative_distinct = []

    for pred in preds:
        pred_seg_list = list(' '.join(jieba.cut(pred)))
        count_segs = len(pred_seg_list)
        unique_segs = set(pred_seg_list)
        count_unique_chars = len(unique_segs)

        cumulative_distinct.append(count_unique_chars / count_segs)

    distinct_score["distinct"] = statistics.mean(cumulative_distinct)

    return distinct_score

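# Editorial usage sketch (not part of the original file); the input sentence is
# a made-up example. For each prediction the code above takes the ratio of
# unique characters to total characters in the jieba-segmented, space-joined
# string, and the returned "distinct" value is the mean of these ratios.
_example_distinct = distinct_score(preds=["今天天气很好，适合出去散步"])
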
def bert_score(preds: list, targets: list) -> dict:
    """Calculate BERTScore Metric

    BERTScore evaluates the semantic similarity between the tokens
    of preds and targets using BERT embeddings.
    """
    bert_score = {"bert_score": 0}
    pred_list = []
    target_list = []

    for pred, target in zip(preds, targets):
        pred_list.append(' '.join(jieba.cut(pred)))
        target_list.append(' '.join(jieba.cut(target)))

    _, _, F = score(pred_list, target_list, lang="zh", verbose=True)

    bert_score["bert_score"] = F.mean().item()

    return bert_score

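# Editorial usage note (not part of the original file): a call such as
# bert_score(preds=["今天的天气非常好"], targets=["今天天气很好"]) returns a dict
# whose single "bert_score" entry is the mean BERTScore F1 over all pairs. With
# lang="zh" the first call typically downloads a pretrained Chinese BERT
# checkpoint, so this is the slowest metric in this file.
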
def calculate_precision_recall_f1(preds: list, targets: list) -> dict:
    """Precision, Recall and F1-Score Calculation

    Precision, recall and F1-score are computed by counting the number of
    character-level overlaps between the preds and targets. The comparison
    length is limited to the shorter of each pred/target pair. This design
    is mainly intended for the classification and extraction categories.
    """
    precision_recall_f1 = {"precision": 0, "recall": 0, "f1_score": 0}
    precision_scores = []
    recall_scores = []
    f1_scores = []

    for pred, target in zip(preds, targets):
        pred_list = [char for char in pred]
        target_list = [char for char in target]

        target_labels = [1] * min(len(target_list), len(pred_list))
        pred_labels = [int(pred_list[i] == target_list[i]) for i in range(0, min(len(target_list), len(pred_list)))]

        precision_scores.append(precision_score(target_labels, pred_labels, zero_division=0))
        recall_scores.append(recall_score(target_labels, pred_labels, zero_division=0))
        f1_scores.append(f1_score(target_labels, pred_labels, zero_division=0))

    precision_recall_f1["precision"] = statistics.mean(precision_scores)
    precision_recall_f1["recall"] = statistics.mean(recall_scores)
    precision_recall_f1["f1_score"] = statistics.mean(f1_scores)

    return precision_recall_f1

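# Editorial worked example (not part of the original file), using the made-up
# pair pred="ABCD", target="ABXD". The comparison covers the first
# min(len(pred), len(target)) = 4 characters:
#     target_labels = [1, 1, 1, 1]
#     pred_labels   = [1, 1, 0, 1]   # position-wise character matches
# Because every target label is 1, precision_score is 1.0 whenever at least one
# position matches, recall_score is 3/4 = 0.75 and f1_score is about 0.86; the
# function returns the mean of these per-sample values.
_example_prf = calculate_precision_recall_f1(preds=["ABCD"], targets=["ABXD"])
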
def precision(preds: list, targets: list) -> dict:
    """Calculate Precision Metric
    (designed for the classification and extraction categories)

    Precision is computed by counting the character-level overlaps between the preds and targets.
    """
    precision = {"precision": 0}
    precision["precision"] = calculate_precision_recall_f1(preds, targets)["precision"]
    return precision


def recall(preds: list, targets: list) -> dict:
    """Calculate Recall Metric
    (designed for the classification and extraction categories)

    Recall is computed by counting the character-level overlaps between the preds and targets.
    """
    recall = {"recall": 0}
    recall["recall"] = calculate_precision_recall_f1(preds, targets)["recall"]
    return recall


def F1_score(preds: list, targets: list) -> dict:
    """Calculate F1-score Metric
    (designed for the classification and extraction categories)

    The F1-score is computed by counting the character-level overlaps between the preds and targets.
    """
    f1 = {"f1_score": 0}
    f1["f1_score"] = calculate_precision_recall_f1(preds, targets)["f1_score"]
    return f1

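# Editorial usage sketch (not part of the original file): a minimal end-to-end
# call combining the automatic metrics for one hypothetical pair of Chinese
# strings. The key sets of the four result dicts do not overlap, so they can be
# merged directly into a single result dict.
if __name__ == "__main__":
    _preds = ["这家银行以150亿美元收购了竞争对手"]
    _targets = ["该银行同意以150亿美元收购竞争对手"]
    _results = {
        **bleu_score(_preds, _targets),
        **rouge_cn_score(_preds, _targets),
        **distinct_score(_preds),
        **bert_score(_preds, _targets),
    }
    print(_results)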
@ -0,0 +1,10 @@
jieba
bert-score
rouge_chinese
scikit-learn
nltk
openai
seaborn
pandas
matplotlib
numpy
@ -1,9 +0,0 @@
[
    {
        "id": 0,
        "instruction": "Help me summarize the following news?",
        "input": "National Commercial Bank (NCB), Saudi Arabia's largest lender by assets, agreed to buy rival Samba Financial Group for $15 billion in the biggest banking takeover this year.NCB will pay 28.45 riyals ($7.58) for each Samba share, according to a statement on Sunday, valuing it at about 55.7 billion riyals. NCB will offer 0.739 new shares for each Samba share, at the lower end of the 0.736-0.787 ratio the banks set when they signed an initial framework agreement in June.The offer is a 3.5% premium to Samba's Oct. 8 closing price of 27.50 riyals and about 24% higher than the level the shares traded at before the talks were made public. Bloomberg News first reported the merger discussions.The new bank will have total assets of more than $220 billion, creating the Gulf region's third-largest lender. The entity's $46 billion market capitalization nearly matches that of Qatar National Bank QPSC, which is still the Middle East's biggest lender with about $268 billion of assets.",
        "output": "NCB to pay 28.45 riyals for each Samba share. Deal will create Gulf region's third-largest lender",
        "category": "closed qa"
    }
]