From 727c4598a97a3985770f85bd7990e885b350c744 Mon Sep 17 00:00:00 2001
From: digger yu
Date: Mon, 19 Jun 2023 11:21:55 +0800
Subject: [PATCH] [nfc] fix dim not defined and fix typo (#3991)

---
 applications/Chat/evaluate/gpt_evaluate.py      | 6 +++---
 applications/Chat/evaluate/unieval/evaluator.py | 4 ++--
 applications/Chat/evaluate/unieval/utils.py     | 2 +-
 applications/Chat/tests/test_data.py            | 4 ++--
 4 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/applications/Chat/evaluate/gpt_evaluate.py b/applications/Chat/evaluate/gpt_evaluate.py
index 012f41ab0..f8cfb8d0f 100644
--- a/applications/Chat/evaluate/gpt_evaluate.py
+++ b/applications/Chat/evaluate/gpt_evaluate.py
@@ -361,7 +361,7 @@ def get_gpt_evaluation_without_logprobs(prompt: Dict[str, Any],
     """
     Use chat models(gpt-3.5-turbo or gpt-4) to evaluate one model answer.

-    Temprature is set to 0 to make the model more deterministic.
+    Temperature is set to 0 to make the model more deterministic.

     Args:
         prompt: a dictionary including prompt template, CoT and metrics.
@@ -435,7 +435,7 @@
     Use completion model(text-davinci-003) to evaluate one model answer.
     Only completion models can return log probabilities.

-    Temprature is set to 0 to make the model more deterministic.
+    Temperature is set to 0 to make the model more deterministic.

     Args:
         prompt: a dictionary including prompt template, CoT and metrics.
@@ -593,7 +593,7 @@ def calculate_scores_form_logprobs(logprobs: Dict[str, Any]) -> float:
 def calculate_scores_form_response(response: str, evaluation: Dict[str, Any]) -> int:
     """
     Calculate the score from the response returned by gpt-3.5-turbo or gpt-4.
-    Different from text-davinci-003, this fuction directly calculates the score according to the plain response returned by gpt-3.5-turbo or gpt-4.
+    Different from text-davinci-003, this function directly calculates the score according to the plain response returned by gpt-3.5-turbo or gpt-4.
     Although text-davinci-003 can return log probabilities, it costs ten times as much as gpt-3.5-turbo.

     Args:
diff --git a/applications/Chat/evaluate/unieval/evaluator.py b/applications/Chat/evaluate/unieval/evaluator.py
index 385425e4a..d7f2f87f8 100644
--- a/applications/Chat/evaluate/unieval/evaluator.py
+++ b/applications/Chat/evaluate/unieval/evaluator.py
@@ -277,7 +277,7 @@ class FactEvaluator:
         n_data = len(data)
         eval_scores = [{} for _ in range(n_data)]

-        # Calculate average sentence-level scores for facutal consistency
+        # Calculate average sentence-level scores for factual consistency
         src_list, output_list = [], []
         n_sents = []  # the number of sentences in the claim
         for i in range(n_data):
@@ -288,7 +288,7 @@ class FactEvaluator:
                 src_list.append(source)
                 output_list.append(system_outputs[j])
         input_list = add_question(dimension=self.dim, output=output_list, src=src_list, task=self.task)
-        sent_score = self.scorer.score(input_list, self.task, category, dim)
+        sent_score = self.scorer.score(input_list, self.task, category, self.dim)

         # Get average score for each sample
         start_idx = 0
diff --git a/applications/Chat/evaluate/unieval/utils.py b/applications/Chat/evaluate/unieval/utils.py
index a77505faa..a381e9e59 100644
--- a/applications/Chat/evaluate/unieval/utils.py
+++ b/applications/Chat/evaluate/unieval/utils.py
@@ -37,7 +37,7 @@ def add_question(dimension, output, src=None, ref=None, context=None, task=None)
         src: source input for different NLG tasks.
             For example, source document for summarization and dialogue history for dialogue response generation.
         output: output text generated by the models
-        ref: human-annotataed groundtruth
+        ref: human-annotated groundtruth
         context: the context needed to evaluate several specific dimension.
             For example, additional factual information when evaluating engagingness and groundedness in dialogues.
     """
diff --git a/applications/Chat/tests/test_data.py b/applications/Chat/tests/test_data.py
index 2e4d4ceac..67016f6ed 100644
--- a/applications/Chat/tests/test_data.py
+++ b/applications/Chat/tests/test_data.py
@@ -33,7 +33,7 @@ def gather_and_equal(tensor: torch.Tensor) -> bool:


 def run_test_data(strategy):
-    EXPERINCE_BATCH_SIZE = 4
+    EXPERIENCE_BATCH_SIZE = 4
     SAMPLE_BATCH_SIZE = 2

     if strategy == 'ddp':
@@ -54,7 +54,7 @@ def run_test_data(strategy):

     # experience of all ranks should be the same
     for _ in range(2):
-        data = get_data(EXPERINCE_BATCH_SIZE)
+        data = get_data(EXPERIENCE_BATCH_SIZE)
         assert gather_and_equal(data['input_ids'])
         assert gather_and_equal(data['attention_mask'])
         experience = experience_maker.make_experience(**data,