[nfc] fix dim not defined and fix typo (#3991)

digger yu 2023-06-19 11:21:55 +08:00 committed by GitHub
parent ca768eb62d
commit 727c4598a9
4 changed files with 8 additions and 8 deletions


@@ -361,7 +361,7 @@ def get_gpt_evaluation_without_logprobs(prompt: Dict[str, Any],
     """
     Use chat models(gpt-3.5-turbo or gpt-4) to evaluate one model answer.
-    Temprature is set to 0 to make the model more deterministic.
+    Temperature is set to 0 to make the model more deterministic.
     Args:
         prompt: a dictionary including prompt template, CoT and metrics.
@@ -435,7 +435,7 @@ def get_gpt_evaluation_with_logprobs(prompt: Dict[str, Any],
     Use completion model(text-davinci-003) to evaluate one model answer.
     Only completion models can return log probabilities.
-    Temprature is set to 0 to make the model more deterministic.
+    Temperature is set to 0 to make the model more deterministic.
     Args:
         prompt: a dictionary including prompt template, CoT and metrics.
@@ -593,7 +593,7 @@ def calculate_scores_form_logprobs(logprobs: Dict[str, Any]) -> float:
 def calculate_scores_form_response(response: str, evaluation: Dict[str, Any]) -> int:
     """
     Calculate the score from the response returned by gpt-3.5-turbo or gpt-4.
-    Different from text-davinci-003, this fuction directly calculates the score according to the plain response returned by gpt-3.5-turbo or gpt-4.
+    Different from text-davinci-003, this function directly calculates the score according to the plain response returned by gpt-3.5-turbo or gpt-4.
     Although text-davinci-003 can return log probabilities, it costs ten times as much as gpt-3.5-turbo.
     Args:


@@ -277,7 +277,7 @@ class FactEvaluator:
         n_data = len(data)
         eval_scores = [{} for _ in range(n_data)]
-        # Calculate average sentence-level scores for facutal consistency
+        # Calculate average sentence-level scores for factual consistency
         src_list, output_list = [], []
         n_sents = []  # the number of sentences in the claim
         for i in range(n_data):
@@ -288,7 +288,7 @@ class FactEvaluator:
             src_list.append(source)
             output_list.append(system_outputs[j])
         input_list = add_question(dimension=self.dim, output=output_list, src=src_list, task=self.task)
-        sent_score = self.scorer.score(input_list, self.task, category, dim)
+        sent_score = self.scorer.score(input_list, self.task, category, self.dim)
         # Get average score for each sample
         start_idx = 0
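The `self.dim` change in the hunk above is the "dim not defined" fix named in the commit title: the score call referenced a bare name `dim` that exists neither locally nor globally in that scope, while the evaluation dimension is stored on the instance as `self.dim`. A minimal, hypothetical sketch of the failure mode (simplified names, not the repository's actual classes):

class Scorer:
    def score(self, inputs, task, category, dim):
        # stand-in for the real scorer call
        return [0.0 for _ in inputs]

class FactEvaluator:
    def __init__(self):
        self.dim = 'fact'    # the evaluation dimension lives on the instance
        self.task = 'summarization'
        self.scorer = Scorer()

    def evaluate_broken(self, input_list, category):
        # raises NameError: name 'dim' is not defined when called
        return self.scorer.score(input_list, self.task, category, dim)

    def evaluate_fixed(self, input_list, category):
        # referencing the attribute through self resolves correctly
        return self.scorer.score(input_list, self.task, category, self.dim)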


@@ -37,7 +37,7 @@ def add_question(dimension, output, src=None, ref=None, context=None, task=None)
         src: source input for different NLG tasks. For example, source document for summarization
             and dialogue history for dialogue response generation.
         output: output text generated by the models
-        ref: human-annotataed groundtruth
+        ref: human-annotated groundtruth
         context: the context needed to evaluate several specific dimension. For example,
             additional factual information when evaluating engagingness and groundedness in dialogues.
     """


@@ -33,7 +33,7 @@ def gather_and_equal(tensor: torch.Tensor) -> bool:
 def run_test_data(strategy):
-    EXPERINCE_BATCH_SIZE = 4
+    EXPERIENCE_BATCH_SIZE = 4
     SAMPLE_BATCH_SIZE = 2
     if strategy == 'ddp':
@@ -54,7 +54,7 @@ def run_test_data(strategy):
     # experience of all ranks should be the same
     for _ in range(2):
-        data = get_data(EXPERINCE_BATCH_SIZE)
+        data = get_data(EXPERIENCE_BATCH_SIZE)
         assert gather_and_equal(data['input_ids'])
         assert gather_and_equal(data['attention_mask'])
         experience = experience_maker.make_experience(**data,