mirror of https://github.com/hpcaitech/ColossalAI
[nfc] fix dim not defined and fix typo (#3991)
parent ca768eb62d
commit 727c4598a9
@@ -361,7 +361,7 @@ def get_gpt_evaluation_without_logprobs(prompt: Dict[str, Any],
     """
     Use chat models(gpt-3.5-turbo or gpt-4) to evaluate one model answer.
 
-    Temprature is set to 0 to make the model more deterministic.
+    Temperature is set to 0 to make the model more deterministic.
 
     Args:
         prompt: a dictionary including prompt template, CoT and metrics.
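(Not part of the diff.) The docstring above describes deterministic chat-model evaluation; a minimal sketch of such a call, assuming the legacy openai Python client and a hypothetical pre-built prompt string, might look like the following. It is not the repository's implementation.

import openai

def evaluate_answer(evaluation_prompt: str, model: str = "gpt-3.5-turbo") -> str:
    # Sketch only: the real code builds the prompt from the template, CoT and metrics
    # mentioned in the docstring above.
    response = openai.ChatCompletion.create(
        model=model,
        messages=[{"role": "user", "content": evaluation_prompt}],
        temperature=0,  # temperature 0 makes the evaluation as deterministic as possible
    )
    return response["choices"][0]["message"]["content"]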
@@ -435,7 +435,7 @@ def get_gpt_evaluation_with_logprobs(prompt: Dict[str, Any],
     Use completion model(text-davinci-003) to evaluate one model answer.
     Only completion models can return log probabilities.
 
-    Temprature is set to 0 to make the model more deterministic.
+    Temperature is set to 0 to make the model more deterministic.
 
     Args:
         prompt: a dictionary including prompt template, CoT and metrics.
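(Not part of the diff.) Since only completion models return log probabilities, the companion function `calculate_scores_form_logprobs` presumably turns the top-token log-probabilities into a score. A hedged sketch of that idea follows; the helper name, the 1-5 scale and the exact weighting are assumptions, not the repository's logic.

import math
from typing import Dict

def expected_score_from_logprobs(top_logprobs: Dict[str, float]) -> float:
    # Keep only tokens that look like scores on an assumed 1-5 scale.
    candidates = {tok.strip(): lp for tok, lp in top_logprobs.items()
                  if tok.strip() in {"1", "2", "3", "4", "5"}}
    if not candidates:
        raise ValueError("no score-like tokens in top_logprobs")
    # Convert log-probabilities to probabilities, renormalise, and take the expectation.
    probs = {tok: math.exp(lp) for tok, lp in candidates.items()}
    total = sum(probs.values())
    return sum(int(tok) * p / total for tok, p in probs.items())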
@@ -593,7 +593,7 @@ def calculate_scores_form_logprobs(logprobs: Dict[str, Any]) -> float:
 def calculate_scores_form_response(response: str, evaluation: Dict[str, Any]) -> int:
     """
     Calculate the score from the response returned by gpt-3.5-turbo or gpt-4.
-    Different from text-davinci-003, this fuction directly calculates the score according to the plain response returned by gpt-3.5-turbo or gpt-4.
+    Different from text-davinci-003, this function directly calculates the score according to the plain response returned by gpt-3.5-turbo or gpt-4.
     Although text-davinci-003 can return log probabilities, it costs ten times as much as gpt-3.5-turbo.
 
     Args:
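(Not part of the diff.) For chat models the docstring says the score is read directly from the plain text reply. One assumed way to do that is a simple regular-expression scan, as in the sketch below; the actual `calculate_scores_form_response` may parse the reply differently.

import re

def parse_score_from_response(response: str) -> int:
    # Assumed convention: the reply contains a single integer rating between 1 and 5.
    match = re.search(r"\b([1-5])\b", response)
    if match is None:
        raise ValueError(f"no score found in response: {response!r}")
    return int(match.group(1))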
@@ -277,7 +277,7 @@ class FactEvaluator:
         n_data = len(data)
         eval_scores = [{} for _ in range(n_data)]
 
-        # Calculate average sentence-level scores for facutal consistency
+        # Calculate average sentence-level scores for factual consistency
         src_list, output_list = [], []
         n_sents = []  # the number of sentences in the claim
         for i in range(n_data):
@@ -288,7 +288,7 @@ class FactEvaluator:
                 src_list.append(source)
                 output_list.append(system_outputs[j])
         input_list = add_question(dimension=self.dim, output=output_list, src=src_list, task=self.task)
-        sent_score = self.scorer.score(input_list, self.task, category, dim)
+        sent_score = self.scorer.score(input_list, self.task, category, self.dim)
 
         # Get average score for each sample
         start_idx = 0
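(Not part of the diff.) The substantive change in this hunk is `dim` -> `self.dim`: inside a method a bare name is resolved as a local or global variable, not as an instance attribute, so the old call raised NameError at runtime. A stripped-down illustration, with a made-up class rather than the repository's FactEvaluator:

class Example:
    def __init__(self, dim: str):
        self.dim = dim      # the value only exists as an instance attribute

    def broken(self):
        return dim          # NameError: name 'dim' is not defined when called

    def fixed(self):
        return self.dim     # looks the attribute up on the instance, as the patch does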
@@ -37,7 +37,7 @@ def add_question(dimension, output, src=None, ref=None, context=None, task=None)
         src: source input for different NLG tasks. For example, source document for summarization
              and dialogue history for dialogue response generation.
         output: output text generated by the models
-        ref: human-annotataed groundtruth
+        ref: human-annotated groundtruth
         context: the context needed to evaluate several specific dimension. For example,
                  additional factual information when evaluating engagingness and groundedness in dialogues.
     """
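(Not part of the diff.) Judging by the documented parameters, `add_question` builds the evaluation inputs for a given dimension from the model output and its source, reference or context. A call for summarization-style factual-consistency evaluation presumably looks like the sketch below; all argument values are invented for illustration.

# Illustrative call only; the strings are made up.
input_list = add_question(dimension="consistency",
                          output=["The model-generated summary sentence."],
                          src=["The source document being summarized."],
                          task="summarization")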
@@ -33,7 +33,7 @@ def gather_and_equal(tensor: torch.Tensor) -> bool:
 
 
 def run_test_data(strategy):
-    EXPERINCE_BATCH_SIZE = 4
+    EXPERIENCE_BATCH_SIZE = 4
     SAMPLE_BATCH_SIZE = 2
 
     if strategy == 'ddp':
@@ -54,7 +54,7 @@ def run_test_data(strategy):
 
     # experience of all ranks should be the same
     for _ in range(2):
-        data = get_data(EXPERINCE_BATCH_SIZE)
+        data = get_data(EXPERIENCE_BATCH_SIZE)
         assert gather_and_equal(data['input_ids'])
         assert gather_and_equal(data['attention_mask'])
         experience = experience_maker.make_experience(**data,
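(Not part of the diff.) The assertions above check that every DDP rank sampled an identical batch; `gather_and_equal` presumably all-gathers the tensor and compares the copies. A hedged sketch of that check using torch.distributed, not necessarily the test's exact implementation:

import torch
import torch.distributed as dist

def gather_and_equal(tensor: torch.Tensor) -> bool:
    # Collect a copy of the tensor from every rank and check they all match rank 0's copy.
    world_size = dist.get_world_size()
    gathered = [torch.empty_like(tensor) for _ in range(world_size)]
    dist.all_gather(gathered, tensor)
    return all(torch.equal(gathered[0], t) for t in gathered)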