mirror of https://github.com/hpcaitech/ColossalAI
Yuanchen committed 1 year ago via GitHub
12 changed files with 984 additions and 67 deletions
@@ -0,0 +1,12 @@
from .evaluator import get_evaluator
from .utils import (
    analyze_unieval_results,
    calculate_average_score,
    convert_data_to_unieval_format,
    save_unieval_results,
)

__all__ = [
    'get_evaluator', 'convert_data_to_unieval_format', 'calculate_average_score', 'save_unieval_results',
    'analyze_unieval_results'
]
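For context, a minimal usage sketch of the public API re-exported above. This is illustrative only: it assumes the package is importable as "unieval", that NLTK's punkt data is available for the sentence splitting done inside the evaluators, and the texts, category label and save path are placeholders rather than part of the PR.

import nltk
nltk.download('punkt')    # sent_tokenize() inside the evaluators needs this

from unieval import (get_evaluator, convert_data_to_unieval_format, calculate_average_score,
                     save_unieval_results)

src_list = ["The quick brown fox jumped over the lazy dog near the river bank."]
out_list = ["A fox jumped over a dog."]
ref_list = ["A quick brown fox jumps over a lazy dog."]

data = convert_data_to_unieval_format(out_list, src_list, ref_list)
evaluator = get_evaluator('summarization', device='cuda:0')
scores = evaluator.evaluate(data, category='general')    # one dict of dimension scores per sample
avg = calculate_average_score(scores)                    # e.g. {'coherence': ..., 'overall': ...}
save_unieval_results('example-model', {'summarization': {'general': avg}}, './unieval_results')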
@@ -0,0 +1,330 @@
# MIT License

# Copyright (c) 2022 Ming Zhong

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import numpy as np
from nltk import sent_tokenize

from .scorer import UniEvaluator
from .utils import add_question


class SumEvaluator:

    def __init__(self, model_name_or_path, max_length=1024, device='cuda:0', cache_dir=None):
        """ Set up evaluator for text summarization """
        self.scorer = UniEvaluator(
            model_name_or_path='MingZhong/unieval-sum' if model_name_or_path == "" else model_name_or_path,
            max_length=max_length,
            device=device,
            cache_dir=cache_dir)
        self.task = 'summarization'
        self.dimensions = ['coherence', 'consistency', 'fluency', 'relevance']

    def evaluate(self, data, category, dims=None, overall=True):
        """
        Get the scores of all the given dimensions

        category: The category to be evaluated.

        dims: A list of dimensions to be evaluated. If dims is None, SumEvaluator will evaluate
              four dimensions: coherence, consistency, fluency, relevance.

        overall: indicates whether the overall score is to be calculated.
                 Overall score can be customized to a combination of scores based on different
                 dimensions. The default here is the average score of all the given dimensions.
        """
        n_data = len(data)
        eval_scores = [{} for _ in range(n_data)]

        if dims is None:
            eval_dims = self.dimensions
        else:
            assert isinstance(dims, list)
            eval_dims = dims

        for dim in eval_dims:
            # Calculate average sentence-level scores for 'consistency' and 'fluency'
            if dim == 'consistency' or dim == 'fluency':
                src_list, output_list = [], []
                n_sents = []    # the number of sentences in each generated summary
                for i in range(n_data):
                    source = data[i]['source']
                    system_outputs = sent_tokenize(data[i]['system_output'])
                    n_sents.append(len(system_outputs))
                    for j in range(len(system_outputs)):
                        src_list.append(source)
                        output_list.append(system_outputs[j])
                input_list = add_question(dimension=dim, output=output_list, src=src_list, task=self.task)
                sent_score = self.scorer.score(input_list, self.task, category, dim)

                # Get average score for each sample
                start_idx = 0
                score = []
                for cur_n_sent in n_sents:
                    score.append(sum(sent_score[start_idx:start_idx + cur_n_sent]) / cur_n_sent)
                    start_idx += cur_n_sent

            # Calculate summary-level score for 'coherence' and 'relevance'
            elif dim == 'coherence' or dim == 'relevance':
                src_list, output_list, ref_list = [], [], []
                for i in range(n_data):
                    src_list.append(data[i]['source'])
                    output_list.append(data[i]['system_output'])
                    if dim == 'relevance':
                        ref_list.append(data[i]['reference'])
                input_list = add_question(dimension=dim, output=output_list, src=src_list, ref=ref_list, task=self.task)
                score = self.scorer.score(input_list, self.task, category, dim)

            # Please customize other dimensions here for summarization
            else:
                raise NotImplementedError('The input format for this dimension is still undefined. '
                                          'Please customize it first.')

            for i in range(n_data):
                eval_scores[i][dim] = score[i]

        # Customize your overall score here.
        if overall:
            for i in range(n_data):
                eval_scores[i]['overall'] = np.mean(list(eval_scores[i].values()))

        return eval_scores


class DialogEvaluator:

    def __init__(self, model_name_or_path, max_length=1024, device='cuda:0', cache_dir=None):
        """ Set up evaluator for dialogues """
        self.scorer = UniEvaluator(
            model_name_or_path='MingZhong/unieval-dialog' if model_name_or_path == "" else model_name_or_path,
            max_length=max_length,
            device=device,
            cache_dir=cache_dir)
        self.task = 'dialogue'
        self.dimensions = ['naturalness', 'coherence', 'engagingness', 'groundedness', 'understandability']

    def evaluate(self, data, category, dims=None, overall=True):
        """
        Get the scores of all the given dimensions

        category: The category to be evaluated.

        dims: A list of dimensions to be evaluated. If dims is None, DialogEvaluator will evaluate
              five dimensions: naturalness, coherence, engagingness, groundedness and understandability.

        overall: indicates whether the overall score is to be calculated.
                 Overall score can be customized to a combination of scores based on different
                 dimensions. The default here is the average score of all the given dimensions.
        """
        n_data = len(data)
        eval_scores = [{} for _ in range(n_data)]

        if dims is None:
            eval_dims = self.dimensions
        else:
            assert isinstance(dims, list)
            eval_dims = dims

        for dim in eval_dims:
            # Calculate summation score for 'engagingness'
            if dim == 'engagingness':
                src_list, output_list, context_list = [], [], []
                n_sents = []    # the number of sentences in each generated response
                for i in range(n_data):
                    source = data[i]['source']
                    context = data[i]['context']
                    system_outputs = sent_tokenize(data[i]['system_output'])
                    n_sents.append(len(system_outputs))
                    for j in range(len(system_outputs)):
                        src_list.append(source)
                        context_list.append(context)
                        output_list.append(system_outputs[j])
                input_list = add_question(dimension=dim,
                                          output=output_list,
                                          src=src_list,
                                          context=context_list,
                                          task=self.task)
                sent_score = self.scorer.score(input_list, self.task, category, dim)

                # Get the summation score for each sample
                start_idx = 0
                score = []
                for cur_n_sent in n_sents:
                    score.append(sum(sent_score[start_idx:start_idx + cur_n_sent]))
                    start_idx += cur_n_sent

            # Calculate turn-level score for other dimensions
            elif dim in ['naturalness', 'coherence', 'groundedness', 'understandability']:
                src_list, output_list, context_list = [], [], []
                for i in range(n_data):
                    src_list.append(data[i]['source'])
                    output_list.append(data[i]['system_output'])
                    context_list.append(data[i]['context'])
                input_list = add_question(dimension=dim,
                                          output=output_list,
                                          src=src_list,
                                          context=context_list,
                                          task=self.task)
                score = self.scorer.score(input_list, self.task, category, dim)

            # Please customize other dimensions here for dialogues
            else:
                raise NotImplementedError('The input format for this dimension is still undefined. '
                                          'Please customize it first.')

            for i in range(n_data):
                eval_scores[i][dim] = score[i]

        # Customize your overall score here.
        if overall:
            for i in range(n_data):
                eval_scores[i]['overall'] = np.mean(list(eval_scores[i].values()))

        return eval_scores


class D2tEvaluator:

    def __init__(self, model_name_or_path, max_length=1024, device='cuda:0', cache_dir=None):
        """ Set up evaluator for data-to-text """
        self.scorer = UniEvaluator(
            model_name_or_path='MingZhong/unieval-sum' if model_name_or_path == "" else model_name_or_path,
            max_length=max_length,
            device=device,
            cache_dir=cache_dir)
        self.task = 'data2text'
        self.dimensions = ['naturalness', 'informativeness']

    def evaluate(self, data, category, dims=None, overall=True):
        """
        Get the scores of all the given dimensions

        category: The category to be evaluated.

        dims: A list of dimensions to be evaluated. If dims is None, D2tEvaluator will evaluate
              two dimensions: naturalness and informativeness.

        overall: indicates whether the overall score is to be calculated.
                 Overall score can be customized to a combination of scores based on different
                 dimensions. The default here is the average score of all the given dimensions.
        """
        n_data = len(data)
        eval_scores = [{} for _ in range(n_data)]

        if dims is None:
            eval_dims = self.dimensions
        else:
            assert isinstance(dims, list)
            eval_dims = dims

        for dim in eval_dims:
            output_list, ref_list = [], []
            for i in range(n_data):
                output_list.append(data[i]['system_output'])
                ref_list.append(data[i]['reference'])

            input_list = add_question(dimension=dim, output=output_list, ref=ref_list, task=self.task)
            score = self.scorer.score(input_list, self.task, category, dim)

            for i in range(n_data):
                eval_scores[i][dim] = score[i]

        # Customize your overall score here.
        if overall:
            for i in range(n_data):
                eval_scores[i]['overall'] = np.mean(list(eval_scores[i].values()))

        return eval_scores


class FactEvaluator:

    def __init__(self, model_name_or_path, max_length=1024, device='cuda:0', cache_dir=None):
        """ Set up evaluator for factual consistency detection """
        self.scorer = UniEvaluator(
            model_name_or_path='MingZhong/unieval-fact' if model_name_or_path == "" else model_name_or_path,
            max_length=max_length,
            device=device,
            cache_dir=cache_dir)
        self.task = 'fact'
        self.dim = 'consistency'

    def evaluate(self, data, category):
        """
        Get the factual consistency score (only 1 dimension for this task)

        category: The category to be evaluated.
        """
        n_data = len(data)
        eval_scores = [{} for _ in range(n_data)]

        # Calculate average sentence-level scores for factual consistency
        src_list, output_list = [], []
        n_sents = []    # the number of sentences in the claim
        for i in range(n_data):
            source = data[i]['source']
            system_outputs = sent_tokenize(data[i]['system_output'])
            n_sents.append(len(system_outputs))
            for j in range(len(system_outputs)):
                src_list.append(source)
                output_list.append(system_outputs[j])
        input_list = add_question(dimension=self.dim, output=output_list, src=src_list, task=self.task)
        sent_score = self.scorer.score(input_list, self.task, category, self.dim)

        # Get average score for each sample
        start_idx = 0
        score = []
        for cur_n_sent in n_sents:
            score.append(sum(sent_score[start_idx:start_idx + cur_n_sent]) / cur_n_sent)
            start_idx += cur_n_sent

        for i in range(n_data):
            eval_scores[i][self.dim] = score[i]

        return eval_scores


def get_evaluator(task, model_name_or_path="", max_length=1024, device='cuda:0', cache_dir=None):
    assert task in ['summarization', 'dialogue', 'data2text', 'fact']
    if task == 'summarization':
        return SumEvaluator(model_name_or_path=model_name_or_path,
                            max_length=max_length,
                            device=device,
                            cache_dir=cache_dir)
    elif task == 'dialogue':
        return DialogEvaluator(model_name_or_path=model_name_or_path,
                               max_length=max_length,
                               device=device,
                               cache_dir=cache_dir)
    elif task == 'data2text':
        return D2tEvaluator(model_name_or_path=model_name_or_path,
                            max_length=max_length,
                            device=device,
                            cache_dir=cache_dir)
    elif task == 'fact':
        return FactEvaluator(model_name_or_path=model_name_or_path,
                             max_length=max_length,
                             device=device,
                             cache_dir=cache_dir)
    else:
        raise NotImplementedError('Other tasks are not implemented, '
                                  'please customize specific tasks here.')
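As a hedged illustration of the per-sample input format these evaluators expect (the field names follow the evaluate() implementations above; the dialogue content, category label and the printed scores are invented):

data = [{
    'source': 'Do you like jazz?\n',                          # dialogue history
    'context': 'Jazz originated in New Orleans.',             # supporting fact
    'system_output': 'Yes, I love jazz. It started in New Orleans.',
}]
dialog_eval = get_evaluator('dialogue')
scores = dialog_eval.evaluate(data, category='music', dims=['naturalness', 'groundedness'])
# e.g. [{'naturalness': 0.94, 'groundedness': 0.88, 'overall': 0.91}]  (illustrative numbers)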
@@ -0,0 +1,101 @@
# MIT License

# Copyright (c) 2022 Ming Zhong

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import torch
import torch.nn as nn
from tqdm import tqdm
from transformers import AutoConfig, AutoModelForSeq2SeqLM, AutoTokenizer


class UniEvaluator:

    def __init__(self, model_name_or_path, max_length=1024, device='cuda:0', cache_dir=None):
        """ Set up model """
        self.device = device
        self.max_length = max_length

        self.config = AutoConfig.from_pretrained(model_name_or_path, cache_dir=cache_dir)
        self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, cache_dir=cache_dir)
        self.model = AutoModelForSeq2SeqLM.from_pretrained(model_name_or_path, config=self.config, cache_dir=cache_dir)

        self.model.eval()
        self.model.to(device)

        self.softmax = nn.Softmax(dim=1)

        self.pos_id = self.tokenizer("Yes")["input_ids"][0]
        self.neg_id = self.tokenizer("No")["input_ids"][0]

    def score(self, inputs, task, category, dim, batch_size=8):
        """
        Get scores for the given samples.
        final_score = positive_score / (positive_score + negative_score)
        """

        # The implementation of "forward" in T5 still requires decoder_input_ids.
        # Therefore, we construct a random one-word target sequence.
        # The content of the target has no effect on the final scores.
        tgts = ["No" for _ in range(len(inputs))]

        pos_score_list, neg_score_list = [], []
        for i in tqdm(range(0, len(inputs), batch_size), desc=f"{category}-({dim}-{task}): "):
            src_list = inputs[i:i + batch_size]
            tgt_list = tgts[i:i + batch_size]
            try:
                with torch.no_grad():
                    encoded_src = self.tokenizer(src_list,
                                                 max_length=self.max_length,
                                                 truncation=True,
                                                 padding=True,
                                                 return_tensors='pt')
                    encoded_tgt = self.tokenizer(tgt_list,
                                                 max_length=self.max_length,
                                                 truncation=True,
                                                 padding=True,
                                                 return_tensors='pt')

                    src_tokens = encoded_src['input_ids'].to(self.device)
                    src_mask = encoded_src['attention_mask'].to(self.device)

                    tgt_tokens = encoded_tgt['input_ids'].to(self.device)[:, 0].unsqueeze(-1)

                    output = self.model(input_ids=src_tokens, attention_mask=src_mask, labels=tgt_tokens)
                    logits = output.logits.view(-1, self.model.config.vocab_size)

                    pos_score = self.softmax(logits)[:, self.pos_id]    # Yes
                    neg_score = self.softmax(logits)[:, self.neg_id]    # No

                    cur_pos_score = [x.item() for x in pos_score]
                    cur_neg_score = [x.item() for x in neg_score]
                    pos_score_list += cur_pos_score
                    neg_score_list += cur_neg_score

            except RuntimeError:
                print(f'source: {src_list}')
                print(f'target: {tgt_list}')
                exit(0)

        score_list = []
        for i in range(len(pos_score_list)):
            score_list.append(pos_score_list[i] / (pos_score_list[i] + neg_score_list[i]))

        return score_list
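A small sketch of calling the scorer directly (illustrative only; the import paths, model choice and prompt are assumptions, not taken from the PR). As implemented above, score() reduces each Boolean-QA prompt to p("Yes") / (p("Yes") + p("No")) over the first decoded token:

from unieval.scorer import UniEvaluator    # import path assumed
from unieval.utils import add_question

scorer = UniEvaluator('MingZhong/unieval-sum', device='cpu')
prompts = add_question(dimension='fluency',
                       output=['This is a short and fluent sentence.'],
                       task='summarization')
print(scorer.score(prompts, task='summarization', category='demo', dim='fluency'))
# e.g. [0.97] -- a value near 1 means the model answers "Yes" to the boolean question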
@@ -0,0 +1,248 @@
# MIT License

# Copyright (c) 2022 Ming Zhong

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import os
from typing import Dict

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import tqdm


def add_question(dimension, output, src=None, ref=None, context=None, task=None):
    """
    Add questions to generate input in Bool-QA format for UniEval.

    dimension: specific dimension to be evaluated
    src: source input for different NLG tasks. For example, source document for summarization
         and dialogue history for dialogue response generation.
    output: output text generated by the models
    ref: human-annotated groundtruth
    context: the context needed to evaluate several specific dimensions. For example,
             additional factual information when evaluating engagingness and groundedness in dialogues.
    """

    input_with_question = []
    for i in range(len(output)):
        # For summarization
        if task == 'summarization':
            if dimension == 'fluency':
                cur_input = 'question: Is this a fluent paragraph? </s> paragraph: ' + output[i]
            elif dimension == 'coherence':
                cur_input = 'question: Is this a coherent summary to the document? </s> summary: ' \
                    + output[i] + ' </s> document: ' + src[i]
            elif dimension == 'consistency':
                cur_input = 'question: Is this claim consistent with the document? </s> claim: ' \
                    + output[i] + ' </s> document: ' + src[i]
            elif dimension == 'relevance':
                cur_input = 'question: Is this summary relevant to the reference? </s> summary: ' \
                    + output[i] + ' </s> reference: ' + ref[i]
            else:
                raise NotImplementedError(
                    'The input format for this dimension is still undefined. Please customize it first.')
        # For dialogues
        elif task == 'dialogue':
            if dimension == 'naturalness':
                cur_input = 'question: Is this a natural response in the dialogue? </s> response: ' + output[i]
            elif dimension == 'coherence':
                cur_input = 'question: Is this a coherent response given the dialogue history? </s> response: ' \
                    + output[i] + ' </s> dialogue history: ' + src[i]
            elif dimension == 'engagingness':
                cur_input = 'question: Is this an engaging and informative response according to the dialogue history and fact? </s> response: ' \
                    + output[i] + ' </s> dialogue history: ' + src[i] + ' </s> fact: ' + context[i]
            elif dimension == 'groundedness':
                cur_input = 'question: Is this response consistent with knowledge in the fact? </s> response: ' \
                    + output[i] + ' </s> fact: ' + context[i]
            elif dimension == 'understandability':
                cur_input = 'question: Is this an understandable response in the dialogue? </s> response: ' + output[i]
            else:
                raise NotImplementedError(
                    'The input format for this dimension is still undefined. Please customize it first.')
        # For data-to-text
        elif task == 'data2text':
            if dimension == 'naturalness':
                cur_input = 'question: Is this a fluent utterance? </s> utterance: ' + output[i]
            elif dimension == 'informativeness':
                cur_input = 'question: Is this sentence informative according to the reference? </s> sentence: ' \
                    + output[i] + ' </s> reference: ' + ref[i]
            else:
                raise NotImplementedError(
                    'The input format for this dimension is still undefined. Please customize it first.')
        # For factual consistency detection
        elif task == 'fact':
            if dimension == 'consistency':
                cur_input = 'question: Is this claim consistent with the document? </s> claim: ' \
                    + output[i] + ' </s> document: ' + src[i]
            else:
                raise NotImplementedError('No other dimensions for the factual consistency detection task.')
        # For new customized tasks
        else:
            raise NotImplementedError('Other tasks are not implemented, please customize specific tasks here.')
        input_with_question.append(cur_input)
    return input_with_question
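
# Illustrative example of the Bool-QA prompt produced by add_question() for the
# summarization 'coherence' dimension (the input texts are made up):
#   add_question(dimension='coherence', output=['A short summary.'],
#                src=['The full document ...'], task='summarization')
#   -> ['question: Is this a coherent summary to the document? </s> summary: A short summary. </s> document: The full document ...']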


def convert_data_to_unieval_format(output_list, src_list=None, ref_list=None):
    """
    Convert the data into the UniEval format.

    output_list: a list of model outputs

    src_list: source input for different NLG tasks. For example, source document for summarization
              and dialogue history for dialogue response generation
    ref_list: human-annotated groundtruth
    """
    json_data = []
    for i in range(len(output_list)):
        cur = {}
        cur['system_output'] = output_list[i]
        if src_list is not None:
            cur['source'] = src_list[i]
        if ref_list is not None:
            cur['reference'] = ref_list[i]
        cur['context'] = ""
        json_data.append(cur)
    return json_data


def calculate_average_score(scores):
    """
    Calculate average scores for different metrics

    scores: a list of scores for different metrics for each answer
    """
    metrics = {metric: 0 for metric in scores[0]}

    for score in scores:
        for metric in score:
            metrics[metric] += score[metric]

    for metric in metrics:
        metrics[metric] /= len(scores)

    return metrics


def save_unieval_results(model_name: str, unieval_metric_stats: Dict[str, Dict], save_path: str) -> None:
    """
    Save UniEval evaluation results of different categories for one model.
    """

    if not os.path.exists(save_path):
        os.makedirs(save_path)

    unieval_metric_stats_per_category = {}
    for task, category_stat in unieval_metric_stats.items():
        for category, metric_stat in category_stat.items():
            if unieval_metric_stats_per_category.get(category, None) is None:
                unieval_metric_stats_per_category[category] = {}
            for metric, score in metric_stat.items():
                unieval_metric_stats_per_category[category][f"{metric}-{task}"] = score

    automatic_df = pd.DataFrame(unieval_metric_stats_per_category)
    automatic_df.to_csv(os.path.join(save_path, f"{model_name}_results.csv"), index=True)


def read_unieval_results(results_path: str, file_name: str) -> Dict[str, Dict]:
    """
    Read a csv file and return a dictionary which stores scores per metric.
    """

    results = pd.read_csv(os.path.join(results_path, file_name), index_col=0)

    results_dict = {metric: {} for metric in list(results.index)}
    for i, metric in enumerate(results_dict.keys()):
        for j, category in enumerate(list(results.columns)):
            if pd.isnull(results.iloc[i, j]):
                continue
            results_dict[metric][category] = results.iloc[i, j]

    return results_dict


def analyze_unieval_results(results_path: str, save_path: str) -> None:
    """
    Analyze and visualize all csv files in the given folder.
    """

    if not os.path.exists(results_path):
        raise Exception(f'The given directory "{results_path}" doesn\'t exist! No results found!')

    all_statistics = {}

    for file_name in os.listdir(results_path):
        if file_name.endswith("_results.csv"):
            model_name = file_name.split("_results.csv")[0]
            all_statistics[model_name] = read_unieval_results(results_path, file_name)

    if len(list(all_statistics.keys())) == 0:
        raise Exception(f'There are no csv files in the given directory "{results_path}"!')

    frame_all = {"model": [], "category": [], "metric": [], "score": []}
    frame_per_metric = {}
    for model_name, model_statistics in all_statistics.items():
        for metric, metric_statistics in model_statistics.items():
            if frame_per_metric.get(metric) is None:
                frame_per_metric[metric] = {"model": [], "category": [], "score": []}

            for category, category_score in metric_statistics.items():
                frame_all["model"].append(model_name)
                frame_all["category"].append(category)
                frame_all["metric"].append(metric)
                frame_all["score"].append(category_score)

                frame_per_metric[metric]["model"].append(model_name)
                frame_per_metric[metric]["category"].append(category)
                frame_per_metric[metric]["score"].append(category_score)

    if not os.path.exists(save_path):
        os.makedirs(save_path)

    frame_all = pd.DataFrame(frame_all)
    frame_all.to_csv(os.path.join(save_path, "unieval_statistics.csv"))

    for metric in tqdm.tqdm(
            frame_per_metric.keys(),
            desc="UniEval metrics: ",
            total=len(frame_per_metric.keys()),
    ):
        data = pd.DataFrame(frame_per_metric[metric])

        sns.set()
        fig = plt.figure(figsize=(16, 10))

        fig = sns.barplot(x="category", y="score", hue="model", data=data, dodge=True)
        fig.set_title(
            f"Comparison between Different Models for Metric {metric.split('-')[0].title()} in Task {metric.split('-')[1].title()}"
        )
        plt.xlabel("Evaluation Category")
        plt.ylabel("Score")

        figure = fig.get_figure()
        figure.savefig(os.path.join(save_path, f"{metric}.png"), dpi=400)

        plt.close()
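A sketch of the aggregation and plotting flow defined above (paths, model names and scores are placeholders). save_unieval_results() writes one "<model>_results.csv" per model; analyze_unieval_results() then reads every such csv in the folder, writes "unieval_statistics.csv", and saves one grouped bar chart per "<metric>-<task>" row:

stats_model_a = {'summarization': {'general': {'coherence': 0.91, 'overall': 0.88}}}
stats_model_b = {'summarization': {'general': {'coherence': 0.85, 'overall': 0.83}}}

save_unieval_results('model-a', stats_model_a, './unieval_results')    # -> model-a_results.csv
save_unieval_results('model-b', stats_model_b, './unieval_results')    # -> model-b_results.csv
analyze_unieval_results('./unieval_results', './unieval_figures')      # csv summary + one bar chart per metric-task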