# MIT License
# Copyright (c) 2022 Ming Zhong
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import numpy as np
from nltk import sent_tokenize

from .scorer import UniEvaluator
from .utils import add_question


class SumEvaluator:
    def __init__(self, model_name_or_path, max_length=1024, device='cuda:0', cache_dir=None):
        """ Set up evaluator for text summarization """
        self.scorer = UniEvaluator(
            model_name_or_path='MingZhong/unieval-sum' if model_name_or_path == "" else model_name_or_path,
            max_length=max_length,
            device=device,
            cache_dir=cache_dir)
        self.task = 'summarization'
        self.dimensions = ['coherence', 'consistency', 'fluency', 'relevance']

    def evaluate(self, data, category, dims=None, overall=True):
        """
        Get the scores of all the given dimensions

        category: The category to be evaluated.

        dims: A list of dimensions to be evaluated. If dims is None, SumEvaluator will evaluate
              four dimensions: coherence, consistency, fluency, relevance.

        overall: indicates whether the overall score is to be calculated.
                 Overall score can be customized to a combination of scores based on different
                 dimensions. The default here is the average score of all the given dimensions.
""" n_data = len(data) eval_scores = [{} for _ in range(n_data)] if dims == None: eval_dims = self.dimensions else: assert isinstance(dims, list) eval_dims = dims for dim in eval_dims: # Calculate average sentence-level scores for 'consistency' and 'fluency' if dim == 'consistency' or dim == 'fluency': src_list, output_list = [], [] n_sents = [] # the number of sentences in each generated summary for i in range(n_data): source = data[i]['source'] system_outputs = sent_tokenize(data[i]['system_output']) n_sents.append(len(system_outputs)) for j in range(len(system_outputs)): src_list.append(source) output_list.append(system_outputs[j]) input_list = add_question(dimension=dim, output=output_list, src=src_list, task=self.task) sent_score = self.scorer.score(input_list, self.task, category, dim) # Get average score for each sample start_idx = 0 score = [] for cur_n_sent in n_sents: score.append(sum(sent_score[start_idx:start_idx + cur_n_sent]) / cur_n_sent) start_idx += cur_n_sent # Calculate summary-level score for 'coherence' and 'relevance' elif dim == 'coherence' or dim == 'relevance': src_list, output_list, ref_list = [], [], [] for i in range(n_data): src_list.append(data[i]['source']) output_list.append(data[i]['system_output']) if dim == 'relevance': ref_list.append(data[i]['reference']) input_list = add_question(dimension=dim, output=output_list, src=src_list, ref=ref_list, task=self.task) score = self.scorer.score(input_list, self.task, category, dim) # Please customize other dimensions here for summarization else: raise NotImplementedError('The input format for this dimension is still undefined. \ Please customize it first.') for i in range(n_data): eval_scores[i][dim] = score[i] # Customize your overall score here. if overall == True: for i in range(n_data): eval_scores[i]['overall'] = np.mean(list(eval_scores[i].values())) return eval_scores class DialogEvaluator: def __init__(self, model_name_or_path, max_length=1024, device='cuda:0', cache_dir=None): """ Set up evaluator for dialogues """ self.scorer = UniEvaluator( model_name_or_path='MingZhong/unieval-dialog' if model_name_or_path == "" else model_name_or_path, max_length=max_length, device=device, cache_dir=cache_dir) self.task = 'dialogue' self.dimensions = ['naturalness', 'coherence', 'engagingness', 'groundedness', 'understandability'] def evaluate(self, data, category, dims=None, overall=True): """ Get the scores of all the given dimensions category: The category to be evaluated. dims: A list of dimensions to be evaluated. If dims is None, DialogEvaluator will evaluate five dimensions: naturalness, coherence, engagingness, groundedness and understandability. overall: indicates whether the overall score is to be calculated. Overall score can be customized to a combination of scores based on different dimensions. The default here is the average score of all the given dimensions. 
""" n_data = len(data) eval_scores = [{} for _ in range(n_data)] if dims == None: eval_dims = self.dimensions else: assert isinstance(dims, list) eval_dims = dims for dim in eval_dims: # Calculate summation score for 'engagingness' if dim == 'engagingness': src_list, output_list, context_list = [], [], [] n_sents = [] # the number of sentences in each generated response for i in range(n_data): source = data[i]['source'] context = data[i]['context'] system_outputs = sent_tokenize(data[i]['system_output']) n_sents.append(len(system_outputs)) for j in range(len(system_outputs)): src_list.append(source) context_list.append(context) output_list.append(system_outputs[j]) input_list = add_question(dimension=dim, output=output_list, src=src_list, context=context_list, task=self.task) sent_score = self.scorer.score(input_list, self.task, category, dim) # Get the summation score for each sample start_idx = 0 score = [] for cur_n_sent in n_sents: score.append(sum(sent_score[start_idx:start_idx + cur_n_sent])) start_idx += cur_n_sent # Calculate turn-level score for other dimensions elif dim in ['naturalness', 'coherence', 'groundedness', 'understandability']: src_list, output_list, context_list = [], [], [] for i in range(n_data): src_list.append(data[i]['source']) output_list.append(data[i]['system_output']) context_list.append(data[i]['context']) input_list = add_question(dimension=dim, output=output_list, src=src_list, context=context_list, task=self.task) score = self.scorer.score(input_list, self.task, category, dim) # Please customize other dimensions here for summarization else: raise NotImplementedError('The input format for this dimension is still undefined. \ Please customize it first.') for i in range(n_data): eval_scores[i][dim] = score[i] # Customize your overall score here. if overall == True: for i in range(n_data): eval_scores[i]['overall'] = np.mean(list(eval_scores[i].values())) return eval_scores class D2tEvaluator: def __init__(self, model_name_or_path, max_length=1024, device='cuda:0', cache_dir=None): """ Set up evaluator for data-to-text """ self.scorer = UniEvaluator( model_name_or_path='MingZhong/unieval-sum' if model_name_or_path == "" else model_name_or_path, max_length=max_length, device=device, cache_dir=cache_dir) self.task = 'data2text' self.dimensions = ['naturalness', 'informativeness'] def evaluate(self, data, category, dims=None, overall=True): """ Get the scores of all the given dimensions category: The category to be evaluated. dims: A list of dimensions to be evaluated. If dims is None, D2tEvaluator will evaluate two dimensions: naturalness and informativeness. overall: indicates whether the overall score is to be calculated. Overall score can be customized to a combination of scores based on different dimensions. The default here is the average score of all the given dimensions. """ n_data = len(data) eval_scores = [{} for _ in range(n_data)] if dims == None: eval_dims = self.dimensions else: assert isinstance(dims, list) eval_dims = dims for dim in eval_dims: output_list, ref_list = [], [] for i in range(n_data): output_list.append(data[i]['system_output']) ref_list.append(data[i]['reference']) input_list = add_question(dimension=dim, output=output_list, ref=ref_list, task=self.task) score = self.scorer.score(input_list, self.task, category, dim) for i in range(n_data): eval_scores[i][dim] = score[i] # Customize your overall score here. 
        if overall:
            for i in range(n_data):
                eval_scores[i]['overall'] = np.mean(list(eval_scores[i].values()))

        return eval_scores


class FactEvaluator:
    def __init__(self, model_name_or_path, max_length=1024, device='cuda:0', cache_dir=None):
        """ Set up evaluator for factual consistency detection """
        self.scorer = UniEvaluator(
            model_name_or_path='MingZhong/unieval-fact' if model_name_or_path == "" else model_name_or_path,
            max_length=max_length,
            device=device,
            cache_dir=cache_dir)
        self.task = 'fact'
        self.dim = 'consistency'

    def evaluate(self, data, category):
        """
        Get the factual consistency score (only 1 dimension for this task)

        category: The category to be evaluated.
        """
        n_data = len(data)
        eval_scores = [{} for _ in range(n_data)]

        # Calculate average sentence-level scores for factual consistency
        src_list, output_list = [], []
        n_sents = []  # the number of sentences in the claim
        for i in range(n_data):
            source = data[i]['source']
            system_outputs = sent_tokenize(data[i]['system_output'])
            n_sents.append(len(system_outputs))
            for j in range(len(system_outputs)):
                src_list.append(source)
                output_list.append(system_outputs[j])

        input_list = add_question(dimension=self.dim, output=output_list,
                                  src=src_list, task=self.task)
        sent_score = self.scorer.score(input_list, self.task, category, self.dim)

        # Get average score for each sample
        start_idx = 0
        score = []
        for cur_n_sent in n_sents:
            score.append(sum(sent_score[start_idx:start_idx + cur_n_sent]) / cur_n_sent)
            start_idx += cur_n_sent

        for i in range(n_data):
            eval_scores[i][self.dim] = score[i]

        return eval_scores


def get_evaluator(task, model_name_or_path="", max_length=1024, device='cuda:0', cache_dir=None):
    assert task in ['summarization', 'dialogue', 'data2text', 'fact']
    if task == 'summarization':
        return SumEvaluator(model_name_or_path=model_name_or_path,
                            max_length=max_length,
                            device=device,
                            cache_dir=cache_dir)
    elif task == 'dialogue':
        return DialogEvaluator(model_name_or_path=model_name_or_path,
                               max_length=max_length,
                               device=device,
                               cache_dir=cache_dir)
    elif task == 'data2text':
        return D2tEvaluator(model_name_or_path=model_name_or_path,
                            max_length=max_length,
                            device=device,
                            cache_dir=cache_dir)
    elif task == 'fact':
        return FactEvaluator(model_name_or_path=model_name_or_path,
                             max_length=max_length,
                             device=device,
                             cache_dir=cache_dir)
    else:
        raise NotImplementedError('Other tasks are not implemented, '
                                  'please customize specific tasks here.')
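

# -----------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original module). It shows how the
# evaluators above might be driven end to end. The sample texts, the dict keys
# used ('source', 'system_output', 'reference'), and the category label passed
# to evaluate() are assumptions for demonstration only; pass whatever category
# value your UniEvaluator.score implementation expects.
# -----------------------------------------------------------------------------
if __name__ == '__main__':
    # Build a summarization evaluator; an empty model_name_or_path falls back
    # to the default 'MingZhong/unieval-sum' checkpoint (see SumEvaluator).
    evaluator = get_evaluator('summarization')

    # One hypothetical sample. Each dict must carry the keys the chosen
    # evaluator reads: 'source' and 'system_output' for all summarization
    # dimensions, plus 'reference' for 'relevance'.
    data = [{
        'source': 'Peter and Elizabeth took a taxi to attend the night party in the city. '
                  'While in the party, Elizabeth collapsed and was rushed to the hospital.',
        'system_output': 'Peter and Elizabeth attended a party. Elizabeth was hospitalized.',
        'reference': 'Elizabeth was hospitalized after attending a party with Peter.',
    }]

    # 'example_category' is a placeholder label, not a value defined by this
    # module. Scores come back as one dict per sample, keyed by dimension,
    # with an extra 'overall' entry when overall=True.
    scores = evaluator.evaluate(data, category='example_category', overall=True)
    print(scores)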