mirror of https://github.com/hpcaitech/ColossalAI
support UniEval and add CHRF metric (#3924)
Co-authored-by: Yuanchen Xu <yuanchen.xu00@gmail.com>
Branch: pull/3942/head
parent 33eef714db
commit 21c4c0b1a0
@@ -0,0 +1,12 @@
from .evaluator import get_evaluator
from .utils import (
    analyze_unieval_results,
    calculate_average_score,
    convert_data_to_unieval_format,
    save_unieval_results,
)

__all__ = [
    'get_evaluator', 'convert_data_to_unieval_format', 'calculate_average_score', 'save_unieval_results',
    'analyze_unieval_results'
]
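# A minimal end-to-end sketch of how the exports above are meant to be combined.
# Assumptions (not part of this diff): the package is importable as `unieval`
# (adjust to the actual module path), a CUDA device and internet access for the
# UniEval checkpoints are available, and the toy texts and category label below
# are placeholders.
from unieval import calculate_average_score, convert_data_to_unieval_format, get_evaluator, save_unieval_results

src = ["A long source document about pandas eating bamboo in the mountains."]
out = ["Pandas eat bamboo. They live in the mountains."]
ref = ["Pandas living in the mountains feed mainly on bamboo."]

# Build UniEval-style samples: dicts with 'system_output', 'source', 'reference', 'context'.
data = convert_data_to_unieval_format(out, src, ref)

# Load the UniEval summarization model and score all four default dimensions.
evaluator = get_evaluator('summarization', device='cuda:0')
scores = evaluator.evaluate(data, category='open_qa')    # category is only a label used in progress bars and saved results

# Average per-dimension scores over all samples and persist them as a CSV.
avg = calculate_average_score(scores)
save_unieval_results('my-model', {'summarization': {'open_qa': avg}}, './unieval_results')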
@@ -0,0 +1,330 @@
# MIT License

# Copyright (c) 2022 Ming Zhong

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import numpy as np
from nltk import sent_tokenize

from .scorer import UniEvaluator
from .utils import add_question


class SumEvaluator:

    def __init__(self, model_name_or_path, max_length=1024, device='cuda:0', cache_dir=None):
        """ Set up evaluator for text summarization """
        self.scorer = UniEvaluator(
            model_name_or_path='MingZhong/unieval-sum' if model_name_or_path == "" else model_name_or_path,
            max_length=max_length,
            device=device,
            cache_dir=cache_dir)
        self.task = 'summarization'
        self.dimensions = ['coherence', 'consistency', 'fluency', 'relevance']

    def evaluate(self, data, category, dims=None, overall=True):
        """
        Get the scores of all the given dimensions

        category: The category to be evaluated.

        dims: A list of dimensions to be evaluated. If dims is None, SumEvaluator will evaluate
              four dimensions: coherence, consistency, fluency, relevance.

        overall: indicates whether the overall score is to be calculated.
                 Overall score can be customized to a combination of scores based on different
                 dimensions. The default here is the average score of all the given dimensions.
        """
        n_data = len(data)
        eval_scores = [{} for _ in range(n_data)]

        if dims is None:
            eval_dims = self.dimensions
        else:
            assert isinstance(dims, list)
            eval_dims = dims

        for dim in eval_dims:
            # Calculate average sentence-level scores for 'consistency' and 'fluency'
            if dim == 'consistency' or dim == 'fluency':
                src_list, output_list = [], []
                n_sents = []    # the number of sentences in each generated summary
                for i in range(n_data):
                    source = data[i]['source']
                    system_outputs = sent_tokenize(data[i]['system_output'])
                    n_sents.append(len(system_outputs))
                    for j in range(len(system_outputs)):
                        src_list.append(source)
                        output_list.append(system_outputs[j])
                input_list = add_question(dimension=dim, output=output_list, src=src_list, task=self.task)
                sent_score = self.scorer.score(input_list, self.task, category, dim)

                # Get average score for each sample
                start_idx = 0
                score = []
                for cur_n_sent in n_sents:
                    score.append(sum(sent_score[start_idx:start_idx + cur_n_sent]) / cur_n_sent)
                    start_idx += cur_n_sent

            # Calculate summary-level score for 'coherence' and 'relevance'
            elif dim == 'coherence' or dim == 'relevance':
                src_list, output_list, ref_list = [], [], []
                for i in range(n_data):
                    src_list.append(data[i]['source'])
                    output_list.append(data[i]['system_output'])
                    if dim == 'relevance':
                        ref_list.append(data[i]['reference'])
                input_list = add_question(dimension=dim, output=output_list, src=src_list, ref=ref_list, task=self.task)
                score = self.scorer.score(input_list, self.task, category, dim)

            # Please customize other dimensions here for summarization
            else:
                raise NotImplementedError('The input format for this dimension is still undefined. \
                                           Please customize it first.')

            for i in range(n_data):
                eval_scores[i][dim] = score[i]

        # Customize your overall score here.
        if overall:
            for i in range(n_data):
                eval_scores[i]['overall'] = np.mean(list(eval_scores[i].values()))

        return eval_scores


class DialogEvaluator:

    def __init__(self, model_name_or_path, max_length=1024, device='cuda:0', cache_dir=None):
        """ Set up evaluator for dialogues """
        self.scorer = UniEvaluator(
            model_name_or_path='MingZhong/unieval-dialog' if model_name_or_path == "" else model_name_or_path,
            max_length=max_length,
            device=device,
            cache_dir=cache_dir)
        self.task = 'dialogue'
        self.dimensions = ['naturalness', 'coherence', 'engagingness', 'groundedness', 'understandability']

    def evaluate(self, data, category, dims=None, overall=True):
        """
        Get the scores of all the given dimensions

        category: The category to be evaluated.

        dims: A list of dimensions to be evaluated. If dims is None, DialogEvaluator will evaluate
              five dimensions: naturalness, coherence, engagingness, groundedness and understandability.

        overall: indicates whether the overall score is to be calculated.
                 Overall score can be customized to a combination of scores based on different
                 dimensions. The default here is the average score of all the given dimensions.
        """
        n_data = len(data)
        eval_scores = [{} for _ in range(n_data)]

        if dims is None:
            eval_dims = self.dimensions
        else:
            assert isinstance(dims, list)
            eval_dims = dims

        for dim in eval_dims:
            # Calculate summation score for 'engagingness'
            if dim == 'engagingness':
                src_list, output_list, context_list = [], [], []
                n_sents = []    # the number of sentences in each generated response
                for i in range(n_data):
                    source = data[i]['source']
                    context = data[i]['context']
                    system_outputs = sent_tokenize(data[i]['system_output'])
                    n_sents.append(len(system_outputs))
                    for j in range(len(system_outputs)):
                        src_list.append(source)
                        context_list.append(context)
                        output_list.append(system_outputs[j])
                input_list = add_question(dimension=dim,
                                          output=output_list,
                                          src=src_list,
                                          context=context_list,
                                          task=self.task)
                sent_score = self.scorer.score(input_list, self.task, category, dim)

                # Get the summation score for each sample
                start_idx = 0
                score = []
                for cur_n_sent in n_sents:
                    score.append(sum(sent_score[start_idx:start_idx + cur_n_sent]))
                    start_idx += cur_n_sent

            # Calculate turn-level score for other dimensions
            elif dim in ['naturalness', 'coherence', 'groundedness', 'understandability']:
                src_list, output_list, context_list = [], [], []
                for i in range(n_data):
                    src_list.append(data[i]['source'])
                    output_list.append(data[i]['system_output'])
                    context_list.append(data[i]['context'])
                input_list = add_question(dimension=dim,
                                          output=output_list,
                                          src=src_list,
                                          context=context_list,
                                          task=self.task)
                score = self.scorer.score(input_list, self.task, category, dim)

            # Please customize other dimensions here for dialogues
            else:
                raise NotImplementedError('The input format for this dimension is still undefined. \
                                           Please customize it first.')

            for i in range(n_data):
                eval_scores[i][dim] = score[i]

        # Customize your overall score here.
        if overall:
            for i in range(n_data):
                eval_scores[i]['overall'] = np.mean(list(eval_scores[i].values()))

        return eval_scores


class D2tEvaluator:

    def __init__(self, model_name_or_path, max_length=1024, device='cuda:0', cache_dir=None):
        """ Set up evaluator for data-to-text """
        self.scorer = UniEvaluator(
            model_name_or_path='MingZhong/unieval-sum' if model_name_or_path == "" else model_name_or_path,
            max_length=max_length,
            device=device,
            cache_dir=cache_dir)
        self.task = 'data2text'
        self.dimensions = ['naturalness', 'informativeness']

    def evaluate(self, data, category, dims=None, overall=True):
        """
        Get the scores of all the given dimensions

        category: The category to be evaluated.

        dims: A list of dimensions to be evaluated. If dims is None, D2tEvaluator will evaluate
              two dimensions: naturalness and informativeness.

        overall: indicates whether the overall score is to be calculated.
                 Overall score can be customized to a combination of scores based on different
                 dimensions. The default here is the average score of all the given dimensions.
        """
        n_data = len(data)
        eval_scores = [{} for _ in range(n_data)]

        if dims is None:
            eval_dims = self.dimensions
        else:
            assert isinstance(dims, list)
            eval_dims = dims

        for dim in eval_dims:
            output_list, ref_list = [], []
            for i in range(n_data):
                output_list.append(data[i]['system_output'])
                ref_list.append(data[i]['reference'])

            input_list = add_question(dimension=dim, output=output_list, ref=ref_list, task=self.task)
            score = self.scorer.score(input_list, self.task, category, dim)

            for i in range(n_data):
                eval_scores[i][dim] = score[i]

        # Customize your overall score here.
        if overall:
            for i in range(n_data):
                eval_scores[i]['overall'] = np.mean(list(eval_scores[i].values()))

        return eval_scores


class FactEvaluator:

    def __init__(self, model_name_or_path, max_length=1024, device='cuda:0', cache_dir=None):
        """ Set up evaluator for factual consistency detection """
        self.scorer = UniEvaluator(
            model_name_or_path='MingZhong/unieval-fact' if model_name_or_path == "" else model_name_or_path,
            max_length=max_length,
            device=device,
            cache_dir=cache_dir)
        self.task = 'fact'
        self.dim = 'consistency'

    def evaluate(self, data, category):
        """
        Get the factual consistency score (only 1 dimension for this task)

        category: The category to be evaluated.
        """
        n_data = len(data)
        eval_scores = [{} for _ in range(n_data)]

        # Calculate average sentence-level scores for factual consistency
        src_list, output_list = [], []
        n_sents = []    # the number of sentences in the claim
        for i in range(n_data):
            source = data[i]['source']
            system_outputs = sent_tokenize(data[i]['system_output'])
            n_sents.append(len(system_outputs))
            for j in range(len(system_outputs)):
                src_list.append(source)
                output_list.append(system_outputs[j])
        input_list = add_question(dimension=self.dim, output=output_list, src=src_list, task=self.task)
        sent_score = self.scorer.score(input_list, self.task, category, self.dim)

        # Get average score for each sample
        start_idx = 0
        score = []
        for cur_n_sent in n_sents:
            score.append(sum(sent_score[start_idx:start_idx + cur_n_sent]) / cur_n_sent)
            start_idx += cur_n_sent

        for i in range(n_data):
            eval_scores[i][self.dim] = score[i]

        return eval_scores


def get_evaluator(task, model_name_or_path="", max_length=1024, device='cuda:0', cache_dir=None):
    assert task in ['summarization', 'dialogue', 'data2text', 'fact']
    if task == 'summarization':
        return SumEvaluator(model_name_or_path=model_name_or_path,
                            max_length=max_length,
                            device=device,
                            cache_dir=cache_dir)
    elif task == 'dialogue':
        return DialogEvaluator(model_name_or_path=model_name_or_path,
                               max_length=max_length,
                               device=device,
                               cache_dir=cache_dir)
    elif task == 'data2text':
        return D2tEvaluator(model_name_or_path=model_name_or_path,
                            max_length=max_length,
                            device=device,
                            cache_dir=cache_dir)
    elif task == 'fact':
        return FactEvaluator(model_name_or_path=model_name_or_path,
                             max_length=max_length,
                             device=device,
                             cache_dir=cache_dir)
    else:
        raise NotImplementedError('Other tasks are not implemented, \
                                   please customize specific tasks here.')
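# A small reference sketch, derived from the evaluator classes above, of which
# sample fields each task actually reads. Note that DialogEvaluator needs a
# non-empty 'context' (convert_data_to_unieval_format initializes it to ""),
# so dialogue samples are typically assembled by hand as shown below. The
# texts and the 'chat' category label are placeholders.
TASK_FIELDS = {
    'summarization': ['source', 'system_output', 'reference'],    # 'reference' is only used for relevance
    'dialogue': ['source', 'system_output', 'context'],           # context carries supporting facts/knowledge
    'data2text': ['system_output', 'reference'],
    'fact': ['source', 'system_output'],
}

dialogue_sample = {
    'source': 'User: What do pandas eat?',                  # dialogue history
    'system_output': 'They mostly eat bamboo.',              # model response to be scored
    'context': 'Pandas feed almost entirely on bamboo.',     # supporting fact
}
# get_evaluator('dialogue').evaluate([dialogue_sample], category='chat')    # hypothetical call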
@@ -0,0 +1,101 @@
# MIT License

# Copyright (c) 2022 Ming Zhong

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import torch
import torch.nn as nn
from tqdm import tqdm
from transformers import AutoConfig, AutoModelForSeq2SeqLM, AutoTokenizer


class UniEvaluator:

    def __init__(self, model_name_or_path, max_length=1024, device='cuda:0', cache_dir=None):
        """ Set up model """
        self.device = device
        self.max_length = max_length

        self.config = AutoConfig.from_pretrained(model_name_or_path, cache_dir=cache_dir)
        self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, cache_dir=cache_dir)
        self.model = AutoModelForSeq2SeqLM.from_pretrained(model_name_or_path, config=self.config, cache_dir=cache_dir)

        self.model.eval()
        self.model.to(device)

        self.softmax = nn.Softmax(dim=1)

        self.pos_id = self.tokenizer("Yes")["input_ids"][0]
        self.neg_id = self.tokenizer("No")["input_ids"][0]

    def score(self, inputs, task, category, dim, batch_size=8):
        """
        Get scores for the given samples.
        final_score = positive_score / (positive_score + negative_score)
        """

        # The implementation of "forward" in T5 still requires decoder_input_ids.
        # Therefore, we construct a random one-word target sequence.
        # The content of the target has no effect on the final scores.
        tgts = ["No" for _ in range(len(inputs))]

        pos_score_list, neg_score_list = [], []
        for i in tqdm(range(0, len(inputs), batch_size), desc=f"{category}-({dim}-{task}): "):
            src_list = inputs[i:i + batch_size]
            tgt_list = tgts[i:i + batch_size]
            try:
                with torch.no_grad():
                    encoded_src = self.tokenizer(src_list,
                                                 max_length=self.max_length,
                                                 truncation=True,
                                                 padding=True,
                                                 return_tensors='pt')
                    encoded_tgt = self.tokenizer(tgt_list,
                                                 max_length=self.max_length,
                                                 truncation=True,
                                                 padding=True,
                                                 return_tensors='pt')

                    src_tokens = encoded_src['input_ids'].to(self.device)
                    src_mask = encoded_src['attention_mask'].to(self.device)

                    tgt_tokens = encoded_tgt['input_ids'].to(self.device)[:, 0].unsqueeze(-1)

                    output = self.model(input_ids=src_tokens, attention_mask=src_mask, labels=tgt_tokens)
                    logits = output.logits.view(-1, self.model.config.vocab_size)

                    pos_score = self.softmax(logits)[:, self.pos_id]    # Yes
                    neg_score = self.softmax(logits)[:, self.neg_id]    # No

                    cur_pos_score = [x.item() for x in pos_score]
                    cur_neg_score = [x.item() for x in neg_score]
                    pos_score_list += cur_pos_score
                    neg_score_list += cur_neg_score

            except RuntimeError:
                print(f'source: {src_list}')
                print(f'target: {tgt_list}')
                exit(0)

        score_list = []
        for i in range(len(pos_score_list)):
            score_list.append(pos_score_list[i] / (pos_score_list[i] + neg_score_list[i]))

        return score_list
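# A toy, self-contained illustration of the scoring rule used by UniEvaluator.score:
# the model answers the appended Boolean question, and the score is
# P("Yes") / (P("Yes") + P("No")) over the first decoded token. The vocabulary size,
# token ids, and logits below are fabricated solely to show the arithmetic.
import torch
import torch.nn as nn

pos_id, neg_id = 2, 5                                                 # pretend ids for "Yes" and "No"
logits = torch.tensor([[0.1, 0.0, 2.0, 0.3, 0.0, 1.0, 0.0, 0.2]])     # one sample, first target position

probs = nn.Softmax(dim=1)(logits)
pos, neg = probs[0, pos_id].item(), probs[0, neg_id].item()
final_score = pos / (pos + neg)     # in (0, 1); higher means the model leans towards "Yes"
print(round(final_score, 3))        # ~0.731 for these fabricated logits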
@@ -0,0 +1,248 @@
# MIT License

# Copyright (c) 2022 Ming Zhong

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import os
from typing import Dict

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import tqdm


def add_question(dimension, output, src=None, ref=None, context=None, task=None):
    """
    Add questions to generate input in Bool-QA format for UniEval.

    dimension: specific dimension to be evaluated
    src: source input for different NLG tasks. For example, source document for summarization
         and dialogue history for dialogue response generation.
    output: output text generated by the models
    ref: human-annotated groundtruth
    context: the context needed to evaluate several specific dimensions. For example,
             additional factual information when evaluating engagingness and groundedness in dialogues.
    """

    input_with_question = []
    for i in range(len(output)):
        # For summarization
        if task == 'summarization':
            if dimension == 'fluency':
                cur_input = 'question: Is this a fluent paragraph? </s> paragraph: ' + output[i]
            elif dimension == 'coherence':
                cur_input = 'question: Is this a coherent summary to the document? </s> summary: ' + output[
                    i] + ' </s> document: ' + src[i]
            elif dimension == 'consistency':
                cur_input = 'question: Is this claim consistent with the document? </s> claim: ' + output[
                    i] + ' </s> document: ' + src[i]
            elif dimension == 'relevance':
                cur_input = 'question: Is this summary relevant to the reference? </s> summary: ' + output[
                    i] + ' </s> reference: ' + ref[i]
            else:
                raise NotImplementedError(
                    'The input format for this dimension is still undefined. Please customize it first.')
        # For dialogues
        elif task == 'dialogue':
            if dimension == 'naturalness':
                cur_input = 'question: Is this a natural response in the dialogue? </s> response: ' + output[i]
            elif dimension == 'coherence':
                cur_input = 'question: Is this a coherent response given the dialogue history? </s> response: '\
                    + output[i] + ' </s> dialogue history: ' + src[i]
            elif dimension == 'engagingness':
                cur_input = 'question: Is this an engaging and informative response according to the dialogue history and fact? </s> response: '\
                    + output[i] + ' </s> dialogue history: ' + src[i] + ' </s> fact: ' + context[i]
            elif dimension == 'groundedness':
                cur_input = 'question: Is this response consistent with knowledge in the fact? </s> response: '\
                    + output[i] + ' </s> fact: ' + context[i]
            elif dimension == 'understandability':
                cur_input = 'question: Is this an understandable response in the dialogue? </s> response: ' + output[i]
            else:
                raise NotImplementedError(
                    'The input format for this dimension is still undefined. Please customize it first.')
        # For data-to-text
        elif task == 'data2text':
            if dimension == 'naturalness':
                cur_input = 'question: Is this a fluent utterance? </s> utterance: ' + output[i]
            elif dimension == 'informativeness':
                cur_input = 'question: Is this sentence informative according to the reference? </s> sentence: '\
                    + output[i] + ' </s> reference: ' + ref[i]
            else:
                raise NotImplementedError(
                    'The input format for this dimension is still undefined. Please customize it first.')
        # For factual consistency detection
        elif task == 'fact':
            if dimension == 'consistency':
                cur_input = 'question: Is this claim consistent with the document? </s> claim: ' + output[
                    i] + ' </s> document: ' + src[i]
            else:
                raise NotImplementedError('No other dimensions for the factual consistency detection task.')
        # For new customized tasks
        else:
            raise NotImplementedError('Other tasks are not implemented, please customize specific tasks here.')
        input_with_question.append(cur_input)
    return input_with_question


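# A quick, self-contained illustration of the Bool-QA input string that add_question
# builds, here for the 'coherence' dimension of the summarization task. The toy
# summary and document are placeholders; the string mirrors the template above.
summary = "Pandas eat bamboo."
document = "Giant pandas feed almost exclusively on bamboo in mountain forests."

expected = ('question: Is this a coherent summary to the document? </s> summary: '
            + summary + ' </s> document: ' + document)
print(expected)
# question: Is this a coherent summary to the document? </s> summary: Pandas eat bamboo. </s> document: Giant pandas ...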
def convert_data_to_unieval_format(output_list, src_list=None, ref_list=None):
    """
    Convert the data into UniEval's format.

    output_list: a list of model output

    src_list: source input for different NLG tasks. For example, source document for summarization
              and dialogue history for dialogue response generation
    ref_list: human-annotated groundtruth
    """
    json_data = []
    for i in range(len(output_list)):
        cur = {}
        cur['system_output'] = output_list[i]
        if src_list is not None:
            cur['source'] = src_list[i]
        if ref_list is not None:
            cur['reference'] = ref_list[i]
        cur['context'] = ""
        json_data.append(cur)
    return json_data


def calculate_average_score(scores):
    """
    Calculate average scores for different metrics

    scores: a list of scores for different metrics for each answer

    """
    metrics = {metric: 0 for metric in scores[0]}

    for score in scores:
        for metric in score:
            metrics[metric] += score[metric]

    for metric in metrics:
        metrics[metric] /= len(scores)

    return metrics


def save_unieval_results(model_name: str, unieval_metric_stats: Dict[str, Dict], save_path: str) -> None:
    """
    Save UniEval evaluation results of different categories for one model.

    """

    if not os.path.exists(save_path):
        os.makedirs(save_path)

    unieval_metric_stats_per_category = {}
    for task, category_stat in unieval_metric_stats.items():
        for category, metric_stat in category_stat.items():
            if unieval_metric_stats_per_category.get(category, None) is None:
                unieval_metric_stats_per_category[category] = {}
            for metric, score in metric_stat.items():
                unieval_metric_stats_per_category[category][f"{metric}-{task}"] = score

    automatic_df = pd.DataFrame(unieval_metric_stats_per_category)
    automatic_df.to_csv(os.path.join(save_path, f"{model_name}_results.csv"), index=True)


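# A sketch of the nested dict save_unieval_results expects and the CSV it writes:
# outer keys are tasks, inner keys are evaluation categories, and the resulting CSV
# has one column per category and one row per "metric-task" pair. The model name,
# categories, and numbers below are placeholders.
stats = {
    'summarization': {
        'open_qa': {'coherence': 0.91, 'overall': 0.88},
        'brainstorming': {'coherence': 0.85, 'overall': 0.83},
    },
}
save_unieval_results('my-model', stats, './unieval_results')
# ./unieval_results/my-model_results.csv then looks like:
#                          open_qa  brainstorming
# coherence-summarization     0.91           0.85
# overall-summarization       0.88           0.83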
def read_unieval_results(results_path: str, file_name: str) -> Dict[str, Dict]:
    """
    Read a csv file and return a dictionary which stores scores per metric.

    """

    results = pd.read_csv(os.path.join(results_path, file_name), index_col=0)

    results_dict = {metric: {} for metric in list(results.index)}
    for i, metric in enumerate(results_dict.keys()):
        for j, category in enumerate(list(results.columns)):
            if pd.isnull(results.iloc[i, j]):
                continue
            results_dict[metric][category] = results.iloc[i, j]

    return results_dict


def analyze_unieval_results(results_path: str, save_path: str) -> None:
    """
    Analyze and visualize all csv files in the given folder.

    """

    if not os.path.exists(results_path):
        raise Exception(f'The given directory "{results_path}" doesn\'t exist! No results found!')

    all_statistics = {}

    for file_name in os.listdir(results_path):
        if file_name.endswith("_results.csv"):
            model_name = file_name.split("_results.csv")[0]
            all_statistics[model_name] = read_unieval_results(results_path, file_name)

    if len(list(all_statistics.keys())) == 0:
        raise Exception(f'There are no csv files in the given directory "{results_path}"!')

    frame_all = {"model": [], "category": [], "metric": [], "score": []}
    frame_per_metric = {}
    for model_name, model_statistics in all_statistics.items():
        for metric, metric_statistics in model_statistics.items():
            if frame_per_metric.get(metric) is None:
                frame_per_metric[metric] = {"model": [], "category": [], "score": []}

            for category, category_score in metric_statistics.items():
                frame_all["model"].append(model_name)
                frame_all["category"].append(category)
                frame_all["metric"].append(metric)
                frame_all["score"].append(category_score)

                frame_per_metric[metric]["model"].append(model_name)
                frame_per_metric[metric]["category"].append(category)
                frame_per_metric[metric]["score"].append(category_score)

    if not os.path.exists(save_path):
        os.makedirs(save_path)

    frame_all = pd.DataFrame(frame_all)
    frame_all.to_csv(os.path.join(save_path, "unieval_statistics.csv"))

    for metric in tqdm.tqdm(
            frame_per_metric.keys(),
            desc="UniEval metrics: ",
            total=len(frame_per_metric.keys()),
    ):
        data = pd.DataFrame(frame_per_metric[metric])

        sns.set()
        fig = plt.figure(figsize=(16, 10))

        fig = sns.barplot(x="category", y="score", hue="model", data=data, dodge=True)
        fig.set_title(
            f"Comparison between Different Models for Metric {metric.split('-')[0].title()} in Task {metric.split('-')[1].title()}"
        )
        plt.xlabel("Evaluation Category")
        plt.ylabel("Score")

        figure = fig.get_figure()
        figure.savefig(os.path.join(save_path, f"{metric}.png"), dpi=400)

        plt.close()
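# A short usage sketch for the analysis helpers above. Assuming a directory of
# per-model CSVs produced by save_unieval_results (e.g. model-a_results.csv,
# model-b_results.csv), this aggregates them into unieval_statistics.csv and draws
# one grouped bar chart per "metric-task" pair. The paths are placeholders.
analyze_unieval_results(results_path='./unieval_results', save_path='./unieval_plots')
# Outputs:
#   ./unieval_plots/unieval_statistics.csv         # long-format table: model, category, metric, score
#   ./unieval_plots/coherence-summarization.png    # one PNG per metric, comparing models by category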