# MIT License

# Copyright (c) 2022 Ming Zhong

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import numpy as np
from nltk import sent_tokenize

from .scorer import UniEvaluator
from .utils import add_question


class SumEvaluator:
    def __init__(self, model_name_or_path, max_length=1024, device="cuda:0", cache_dir=None):
        """Set up evaluator for text summarization"""
        self.scorer = UniEvaluator(
            model_name_or_path="MingZhong/unieval-sum" if model_name_or_path == "" else model_name_or_path,
            max_length=max_length,
            device=device,
            cache_dir=cache_dir,
        )
        self.task = "summarization"
        self.dimensions = ["coherence", "consistency", "fluency", "relevance"]

    def evaluate(self, data, category, dims=None, overall=True):
        """
        Get the scores of all the given dimensions

        category: The category to be evaluated.

        dims: A list of dimensions to be evaluated. If dims is None, SumEvaluator will evaluate
              four dimensions: coherence, consistency, fluency, relevance.

        overall: Indicates whether the overall score is to be calculated.
                 The overall score can be customized to a combination of scores based on different
                 dimensions. The default here is the average score of all the given dimensions.
        """
        n_data = len(data)
        eval_scores = [{} for _ in range(n_data)]

        if dims is None:
            eval_dims = self.dimensions
        else:
            assert isinstance(dims, list)
            eval_dims = dims

        for dim in eval_dims:
            # Calculate average sentence-level scores for 'consistency' and 'fluency'
            if dim == "consistency" or dim == "fluency":
                src_list, output_list = [], []
                n_sents = []  # the number of sentences in each generated summary
                for i in range(n_data):
                    source = data[i]["source"]
                    system_outputs = sent_tokenize(data[i]["system_output"])
                    n_sents.append(len(system_outputs))
                    for j in range(len(system_outputs)):
                        src_list.append(source)
                        output_list.append(system_outputs[j])
                input_list = add_question(dimension=dim, output=output_list, src=src_list, task=self.task)
                sent_score = self.scorer.score(input_list, self.task, category, dim)

                # Get the average score for each sample
                start_idx = 0
                score = []
                for cur_n_sent in n_sents:
                    # prevent the denominator from being 0
                    score.append(sum(sent_score[start_idx : start_idx + cur_n_sent]) / (cur_n_sent + 1e-6))
                    start_idx += cur_n_sent

            # Calculate summary-level score for 'coherence' and 'relevance'
            elif dim == "coherence" or dim == "relevance":
                src_list, output_list, ref_list = [], [], []
                for i in range(n_data):
                    src_list.append(data[i]["source"])
                    output_list.append(data[i]["system_output"])
                    if dim == "relevance":
                        ref_list.append(data[i]["reference"])
                input_list = add_question(dimension=dim, output=output_list, src=src_list, ref=ref_list, task=self.task)
                score = self.scorer.score(input_list, self.task, category, dim)

            # Please customize other dimensions here for summarization
            else:
                raise NotImplementedError(
                    "The input format for this dimension is still undefined. Please customize it first."
                )

            for i in range(n_data):
                eval_scores[i][dim] = score[i]

        # Customize your overall score here.
        if overall:
            for i in range(n_data):
                eval_scores[i]["overall"] = np.mean(list(eval_scores[i].values()))

        return eval_scores
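
# Illustrative usage sketch for SumEvaluator, kept as a comment so that importing this module has
# no side effects. The field names match what evaluate() reads above; the example text and the
# category label are hypothetical placeholders.
#
#   sum_eval = SumEvaluator(model_name_or_path="", device="cuda:0")
#   sum_data = [
#       {
#           "source": "The original document to be summarized ...",
#           "system_output": "The model-generated summary ...",
#           "reference": "A reference summary (used by the 'relevance' dimension) ...",
#       }
#   ]
#   scores = sum_eval.evaluate(sum_data, category="unknown")
#   # scores[0] -> {"coherence": ..., "consistency": ..., "fluency": ..., "relevance": ..., "overall": ...}
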
class DialogEvaluator:
    def __init__(self, model_name_or_path, max_length=1024, device="cuda:0", cache_dir=None):
        """Set up evaluator for dialogues"""
        self.scorer = UniEvaluator(
            model_name_or_path="MingZhong/unieval-dialog" if model_name_or_path == "" else model_name_or_path,
            max_length=max_length,
            device=device,
            cache_dir=cache_dir,
        )
        self.task = "dialogue"
        self.dimensions = ["naturalness", "coherence", "engagingness", "groundedness", "understandability"]

    def evaluate(self, data, category, dims=None, overall=True):
        """
        Get the scores of all the given dimensions

        category: The category to be evaluated.

        dims: A list of dimensions to be evaluated. If dims is None, DialogEvaluator will evaluate
              five dimensions: naturalness, coherence, engagingness, groundedness and understandability.

        overall: Indicates whether the overall score is to be calculated.
                 The overall score can be customized to a combination of scores based on different
                 dimensions. The default here is the average score of all the given dimensions.
        """
        n_data = len(data)
        eval_scores = [{} for _ in range(n_data)]

        if dims is None:
            eval_dims = self.dimensions
        else:
            assert isinstance(dims, list)
            eval_dims = dims

        for dim in eval_dims:
            # Calculate summation score for 'engagingness'
            if dim == "engagingness":
                src_list, output_list, context_list = [], [], []
                n_sents = []  # the number of sentences in each generated response
                for i in range(n_data):
                    source = data[i]["source"]
                    context = data[i]["context"]
                    system_outputs = sent_tokenize(data[i]["system_output"])
                    n_sents.append(len(system_outputs))
                    for j in range(len(system_outputs)):
                        src_list.append(source)
                        context_list.append(context)
                        output_list.append(system_outputs[j])
                input_list = add_question(
                    dimension=dim, output=output_list, src=src_list, context=context_list, task=self.task
                )
                sent_score = self.scorer.score(input_list, self.task, category, dim)

                # Get the summation score for each sample
                start_idx = 0
                score = []
                for cur_n_sent in n_sents:
                    score.append(sum(sent_score[start_idx : start_idx + cur_n_sent]))
                    start_idx += cur_n_sent

            # Calculate turn-level score for other dimensions
            elif dim in ["naturalness", "coherence", "groundedness", "understandability"]:
                src_list, output_list, context_list = [], [], []
                for i in range(n_data):
                    src_list.append(data[i]["source"])
                    output_list.append(data[i]["system_output"])
                    context_list.append(data[i]["context"])
                input_list = add_question(
                    dimension=dim, output=output_list, src=src_list, context=context_list, task=self.task
                )
                score = self.scorer.score(input_list, self.task, category, dim)

            # Please customize other dimensions here for dialogues
            else:
                raise NotImplementedError(
                    "The input format for this dimension is still undefined. Please customize it first."
                )

            for i in range(n_data):
                eval_scores[i][dim] = score[i]

        # Customize your overall score here.
        if overall:
            for i in range(n_data):
                eval_scores[i]["overall"] = np.mean(list(eval_scores[i].values()))

        return eval_scores
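
# Illustrative usage sketch for DialogEvaluator (comment only; not executed). Unlike SumEvaluator,
# each sample also needs a "context" field; in the original UniEval dialogue setup "source" is
# typically the dialogue history and "context" the fact the response should be grounded in. Note
# that "engagingness" is a summed sentence-level score rather than an average. The example values
# and category label are hypothetical.
#
#   dialog_eval = DialogEvaluator(model_name_or_path="", device="cuda:0")
#   dialog_data = [
#       {
#           "source": "The dialogue history ...",
#           "context": "The additional fact or knowledge for this turn ...",
#           "system_output": "The model-generated response ...",
#       }
#   ]
#   scores = dialog_eval.evaluate(dialog_data, category="unknown")
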
class D2tEvaluator:
    def __init__(self, model_name_or_path, max_length=1024, device="cuda:0", cache_dir=None):
        """Set up evaluator for data-to-text"""
        self.scorer = UniEvaluator(
            model_name_or_path="MingZhong/unieval-sum" if model_name_or_path == "" else model_name_or_path,
            max_length=max_length,
            device=device,
            cache_dir=cache_dir,
        )
        self.task = "data2text"
        self.dimensions = ["naturalness", "informativeness"]

    def evaluate(self, data, category, dims=None, overall=True):
        """
        Get the scores of all the given dimensions

        category: The category to be evaluated.

        dims: A list of dimensions to be evaluated. If dims is None, D2tEvaluator will evaluate
              two dimensions: naturalness and informativeness.

        overall: Indicates whether the overall score is to be calculated.
                 The overall score can be customized to a combination of scores based on different
                 dimensions. The default here is the average score of all the given dimensions.
        """
        n_data = len(data)
        eval_scores = [{} for _ in range(n_data)]

        if dims is None:
            eval_dims = self.dimensions
        else:
            assert isinstance(dims, list)
            eval_dims = dims

        for dim in eval_dims:
            output_list, ref_list = [], []
            for i in range(n_data):
                output_list.append(data[i]["system_output"])
                ref_list.append(data[i]["reference"])

            input_list = add_question(dimension=dim, output=output_list, ref=ref_list, task=self.task)
            score = self.scorer.score(input_list, self.task, category, dim)

            for i in range(n_data):
                eval_scores[i][dim] = score[i]

        # Customize your overall score here.
        if overall:
            for i in range(n_data):
                eval_scores[i]["overall"] = np.mean(list(eval_scores[i].values()))

        return eval_scores
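
# Illustrative usage sketch for D2tEvaluator (comment only; not executed). Data-to-text scoring
# only compares the generated text against a reference, so no "source" field is read here.
# The example values and category label are hypothetical.
#
#   d2t_eval = D2tEvaluator(model_name_or_path="", device="cuda:0")
#   d2t_data = [
#       {
#           "system_output": "The model-generated verbalization of the structured record ...",
#           "reference": "A reference verbalization ...",
#       }
#   ]
#   scores = d2t_eval.evaluate(d2t_data, category="unknown")
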
class FactEvaluator:
    def __init__(self, model_name_or_path, max_length=1024, device="cuda:0", cache_dir=None):
        """Set up evaluator for factual consistency detection"""
        self.scorer = UniEvaluator(
            model_name_or_path="MingZhong/unieval-fact" if model_name_or_path == "" else model_name_or_path,
            max_length=max_length,
            device=device,
            cache_dir=cache_dir,
        )
        self.task = "fact"
        self.dim = "consistency"

    def evaluate(self, data, category):
        """
        Get the factual consistency score (only 1 dimension for this task)

        category: The category to be evaluated.
        """
        n_data = len(data)
        eval_scores = [{} for _ in range(n_data)]

        # Calculate average sentence-level scores for factual consistency
        src_list, output_list = [], []
        n_sents = []  # the number of sentences in the claim
        for i in range(n_data):
            source = data[i]["source"]
            system_outputs = sent_tokenize(data[i]["system_output"])
            n_sents.append(len(system_outputs))
            for j in range(len(system_outputs)):
                src_list.append(source)
                output_list.append(system_outputs[j])
        input_list = add_question(dimension=self.dim, output=output_list, src=src_list, task=self.task)
        sent_score = self.scorer.score(input_list, self.task, category, self.dim)

        # Get the average score for each sample
        start_idx = 0
        score = []
        for cur_n_sent in n_sents:
            # prevent the denominator from being 0, as in SumEvaluator above
            score.append(sum(sent_score[start_idx : start_idx + cur_n_sent]) / (cur_n_sent + 1e-6))
            start_idx += cur_n_sent

        for i in range(n_data):
            eval_scores[i][self.dim] = score[i]

        return eval_scores
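
# Illustrative usage sketch for FactEvaluator (comment only; not executed). The claim in
# "system_output" is split into sentences, each sentence is scored against "source", and the
# sentence scores are averaged per sample. The example values and category label are hypothetical.
#
#   fact_eval = FactEvaluator(model_name_or_path="", device="cuda:0")
#   fact_data = [
#       {
#           "source": "The grounding document ...",
#           "system_output": "The claim whose factual consistency is being checked ...",
#       }
#   ]
#   scores = fact_eval.evaluate(fact_data, category="unknown")
#   # scores[0] -> {"consistency": ...}
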
def get_evaluator(task, model_name_or_path="", max_length=1024, device="cuda:0", cache_dir=None):
    assert task in ["summarization", "dialogue", "data2text", "fact"]
    if task == "summarization":
        return SumEvaluator(
            model_name_or_path=model_name_or_path, max_length=max_length, device=device, cache_dir=cache_dir
        )
    elif task == "dialogue":
        return DialogEvaluator(
            model_name_or_path=model_name_or_path, max_length=max_length, device=device, cache_dir=cache_dir
        )
    elif task == "data2text":
        return D2tEvaluator(
            model_name_or_path=model_name_or_path, max_length=max_length, device=device, cache_dir=cache_dir
        )
    elif task == "fact":
        return FactEvaluator(
            model_name_or_path=model_name_or_path, max_length=max_length, device=device, cache_dir=cache_dir
        )
    else:
        raise NotImplementedError("Other tasks are not implemented, please customize specific tasks here.")
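
# Illustrative usage sketch for the factory above (comment only; not executed). Passing an empty
# model_name_or_path falls back to the matching "MingZhong/unieval-*" checkpoint; any local path
# or Hugging Face model id can be supplied instead. The data and category label are hypothetical.
#
#   evaluator = get_evaluator("summarization", model_name_or_path="", device="cuda:0")
#   results = evaluator.evaluate(
#       [{"source": "...", "system_output": "...", "reference": "..."}],
#       category="unknown",
#   )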