|
|
|
@ -1,12 +1,46 @@
|
|
|
|
|
from typing import Callable |
|
|
|
|
# Copyright 2023 Rohan Taori, Ishaan Gulrajani, Tianyi Zhang, Yann Dubois, Xuechen Li |
|
|
|
|
# |
|
|
|
|
# Licensed under the Apache License, Version 2.0 (the "License"); |
|
|
|
|
# you may not use this file except in compliance with the License. |
|
|
|
|
# You may obtain a copy of the License at |
|
|
|
|
# |
|
|
|
|
# http://www.apache.org/licenses/LICENSE-2.0 |
|
|
|
|
# |
|
|
|
|
# Unless required by applicable law or agreed to in writing, software |
|
|
|
|
# distributed under the License is distributed on an "AS IS" BASIS, |
|
|
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
|
|
|
|
# See the License for the specific language governing permissions and |
|
|
|
|
# limitations under the License. |
|
|
|
|
|
|
|
|
|
import copy |
|
|
|
|
from dataclasses import dataclass, field |
|
|
|
|
from typing import Callable, Dict, Sequence |
|
|
|
|
import random |
|
|
|
|
from torch.utils.data import Dataset |
|
|
|
|
import torch.distributed as dist |
|
|
|
|
from tqdm import tqdm |
|
|
|
|
import torch |
|
|
|
|
|
|
|
|
|
from .utils import is_rank_0 |
|
|
|
|
from .utils import is_rank_0, jload |
|
|
|
|
|
|
|
|
|
import transformers |
|
|
|
|
from colossalai.logging import get_dist_logger |
|
|
|
|
|
|
|
|
|
# Module-level logger obtained from colossalai.logging.
logger = get_dist_logger()

# Label value written over the prompt positions in `preprocess` and used as the
# padding value for labels in `AlpacaDataCollator`.
# NOTE(review): presumably chosen to match the default `ignore_index` of
# PyTorch's cross-entropy loss so these positions do not contribute to the
# loss — confirm against the training loop.
IGNORE_INDEX = -100
|
|
|
|
# Prompt templates keyed by whether the example carries an "input" field.
# Each template is filled with `str.format_map`, so `{instruction}` (and
# `{input}` for the first template) must be present in the example mapping.
PROMPT_DICT = dict(
    prompt_input=(
        "Below is an instruction that describes a task, "
        "paired with an input that provides further context. "
        "Write a response that appropriately completes the request.\n\n"
        "### Instruction:\n{instruction}\n\n"
        "### Input:\n{input}\n\n"
        "### Response:"
    ),
    prompt_no_input=(
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request.\n\n"
        "### Instruction:\n{instruction}\n\n"
        "### Response:"
    ),
)
|
|
|
|
|
|
|
|
|
class SFTDataset(Dataset): |
|
|
|
|
""" |
|
|
|
@ -38,3 +72,87 @@ class SFTDataset(Dataset):
|
|
|
|
|
|
|
|
|
|
def __getitem__(self, idx):
    """Return the entry of ``self.prompts`` stored at position ``idx``."""
    selected = self.prompts[idx]
    return selected
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _tokenize_fn(strings: Sequence[str], tokenizer: transformers.PreTrainedTokenizer) -> Dict:
    """Tokenize every string on its own and collect token ids and lengths.

    Args:
        strings: Texts to tokenize; the tokenizer is called once per text.
        tokenizer: Callable tokenizer; its ``model_max_length`` is used as the
            truncation limit and its ``pad_token_id`` to count real tokens.

    Returns:
        A dict with ``input_ids`` and ``labels`` (the same list of 1-D id
        tensors) plus ``input_ids_lens`` and ``labels_lens`` (the same list of
        per-text counts of tokens that differ from ``pad_token_id``).
    """
    encodings = []
    for text in strings:
        encoded = tokenizer(
            text,
            return_tensors="pt",
            padding="longest",
            max_length=tokenizer.model_max_length,
            truncation=True,
        )
        encodings.append(encoded)

    # Each call encodes a single text, so row 0 holds that text's ids.
    token_rows = [encoded.input_ids[0] for encoded in encodings]
    # Length = number of positions that are not the pad token.
    token_counts = [encoded.input_ids.ne(tokenizer.pad_token_id).sum().item() for encoded in encodings]

    # Both "labels*" entries intentionally share the objects of their
    # "input_ids*" counterparts.
    return {
        "input_ids": token_rows,
        "labels": token_rows,
        "input_ids_lens": token_counts,
        "labels_lens": token_counts,
    }
|
|
|
|
|
|
|
|
|
def preprocess(
    sources: Sequence[str],
    targets: Sequence[str],
    tokenizer: transformers.PreTrainedTokenizer,
) -> Dict:
    """Tokenize prompt/response pairs and mask the prompt part of the labels.

    Args:
        sources: Prompt texts.
        targets: Response texts, paired with ``sources`` by position.
        tokenizer: Tokenizer forwarded to ``_tokenize_fn``.

    Returns:
        A dict with ``input_ids`` (token ids of each prompt + response) and
        ``labels`` (a deep copy whose leading prompt positions are set to
        ``IGNORE_INDEX``).
    """
    full_texts = [prompt + response for prompt, response in zip(sources, targets)]
    tokenized_full = _tokenize_fn(full_texts, tokenizer)
    tokenized_prompts = _tokenize_fn(sources, tokenizer)

    input_ids = tokenized_full["input_ids"]
    # Deep copy so masking the labels leaves the input ids untouched.
    labels = copy.deepcopy(input_ids)
    prompt_lengths = tokenized_prompts["input_ids_lens"]
    for row, prompt_len in zip(labels, prompt_lengths):
        # Overwrite the prompt tokens so only the response remains as a label.
        row[:prompt_len] = IGNORE_INDEX
    return {"input_ids": input_ids, "labels": labels}
|
|
|
|
|
|
|
|
|
class AlpacaDataset(Dataset):
    """Supervised fine-tuning dataset built from instruction records.

    Every record is rendered into a prompt with one of the ``PROMPT_DICT``
    templates, paired with its ``output`` text plus the tokenizer's EOS token,
    and tokenized once at construction time.
    """

    def __init__(self, data_path: str, tokenizer: transformers.PreTrainedTokenizer):
        """Load, format and tokenize the records found at ``data_path``.

        Args:
            data_path: Location passed to ``jload``; the result is iterated as
                a collection of mappings that provide ``output``, the fields
                used by the prompt templates, and optionally ``input``.
            tokenizer: Tokenizer supplying ``eos_token`` and used by
                ``preprocess``.
        """
        super().__init__()
        logger.info("Loading data...")
        records = jload(data_path)

        logger.info("Formatting inputs...")
        template_with_input = PROMPT_DICT["prompt_input"]
        template_without_input = PROMPT_DICT["prompt_no_input"]
        sources = []
        for record in records:
            # A missing or empty "input" selects the instruction-only template.
            if record.get("input", "") != "":
                template = template_with_input
            else:
                template = template_without_input
            sources.append(template.format_map(record))
        targets = [f"{record['output']}{tokenizer.eos_token}" for record in records]

        logger.info("Tokenizing inputs... This may take some time...")
        encoded = preprocess(sources, targets, tokenizer)

        self.input_ids = encoded["input_ids"]
        self.labels = encoded["labels"]

    def __len__(self):
        """Return the number of tokenized examples."""
        return len(self.input_ids)

    def __getitem__(self, i) -> Dict[str, torch.Tensor]:
        """Return the ``input_ids`` and ``labels`` stored for example ``i``."""
        return {"input_ids": self.input_ids[i], "labels": self.labels[i]}
|
|
|
|
|
|
|
|
|
@dataclass
class AlpacaDataCollator:
    """Pad a list of tokenized examples into one batch for fine-tuning."""

    # Tokenizer whose ``pad_token_id`` is used to pad inputs and build the mask.
    tokenizer: transformers.PreTrainedTokenizer

    def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
        """Collate ``instances`` into padded batch tensors.

        Args:
            instances: Examples that each provide ``input_ids`` and ``labels``
                sequences accepted by ``pad_sequence``.

        Returns:
            A dict with ``input_ids`` (padded with the tokenizer's pad id),
            ``labels`` (padded with ``IGNORE_INDEX``) and ``attention_mask``
            (true wherever ``input_ids`` differs from the pad id).
        """
        pad_id = self.tokenizer.pad_token_id
        id_rows = [example["input_ids"] for example in instances]
        label_rows = [example["labels"] for example in instances]

        padded_ids = torch.nn.utils.rnn.pad_sequence(id_rows, batch_first=True, padding_value=pad_id)
        padded_labels = torch.nn.utils.rnn.pad_sequence(
            label_rows, batch_first=True, padding_value=IGNORE_INDEX
        )
        return {
            "input_ids": padded_ids,
            "labels": padded_labels,
            # Derived from the padded ids, so any token equal to the pad id is masked.
            "attention_mask": padded_ids.ne(pad_id),
        }
|
|
|
|