add community example dictionary (#3465)

pull/3343/head
Fazzie-Maqianli 2023-04-06 15:04:48 +08:00 committed by GitHub
parent 80eba05b0a
commit 6afeb1202a
6 changed files with 94 additions and 92 deletions

View File

@@ -0,0 +1 @@
+# Community Examples

View File

@@ -1,14 +1,12 @@
 import copy
+import json
 from typing import Dict, Sequence

+import torch
 from datasets import load_dataset
 from torch.utils.data import Dataset
+from tqdm import tqdm
 from transformers import AutoTokenizer
-import torch
-from tqdm import tqdm
-import json
-from tqdm import tqdm
-import json

 IGNORE_INDEX = -100
@@ -36,15 +34,12 @@ def _tokenize_fn(strings: Sequence[str], tokenizer: AutoTokenizer, max_length: int
     )


-def preprocess(
-    sources: Sequence[str],
-    targets: Sequence[str],
-    tokenizer: AutoTokenizer,
-    max_length :int = 512
-) -> Dict:
+def preprocess(sources: Sequence[str], targets: Sequence[str], tokenizer: AutoTokenizer, max_length: int = 512) -> Dict:
     """Preprocess the data by tokenizing."""
     examples = [s + t for s, t in zip(sources, targets)]
-    examples_tokenized, sources_tokenized = [_tokenize_fn(strings, tokenizer,max_length) for strings in (examples, sources)]
+    examples_tokenized, sources_tokenized = [
+        _tokenize_fn(strings, tokenizer, max_length) for strings in (examples, sources)
+    ]
     input_ids = examples_tokenized["input_ids"]
     labels = copy.deepcopy(input_ids)
     for label, source_len in zip(labels, sources_tokenized["input_ids_lens"]):
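
A note for readers (not part of the commit): the loop above relies on the standard Hugging Face convention that label positions set to IGNORE_INDEX (-100) are skipped by the cross-entropy loss, so only the target tokens contribute to training. A minimal sketch of that prompt-masking idea, with made-up token ids:

import torch

IGNORE_INDEX = -100    # the value torch.nn.CrossEntropyLoss ignores by default

input_ids = torch.tensor([101, 2023, 2003, 1037, 3231, 102])    # prompt + answer tokens (made-up ids)
source_len = 3                                                  # the first 3 tokens are the prompt
labels = input_ids.clone()
labels[:source_len] = IGNORE_INDEX                              # mask the prompt portion
print(labels)    # tensor([-100, -100, -100, 1037, 3231, 102])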
@@ -53,6 +48,7 @@ def preprocess(


 class EasySupervisedDataset(Dataset):
+
     def __init__(self, data_file: str, tokenizer: AutoTokenizer, max_length: int = 512) -> None:
         super(EasySupervisedDataset, self).__init__()
         with open(data_file, "r", encoding="UTF-8") as f:
@@ -85,21 +81,21 @@ class EasySupervisedDataset(Dataset):
     def __str__(self):
         return f"LawSupervisedDataset(data_file={self.data_file}, input_ids_len={len(self.input_ids)}, labels_len={len(self.labels)})"


 class EasyPromptsDataset(Dataset):
+
     def __init__(self, data_file: str, tokenizer: AutoTokenizer, max_length: int = 96) -> None:
         super(EasyPromptsDataset, self).__init__()
         with open(data_file, "r", encoding="UTF-8") as f:
             all_lines = f.readlines()
             all_lines = [line if "回答:" not in line else line[:line.index("回答:") + 3] for line in all_lines]
         self.prompts = [
-            tokenizer(line,
-                      return_tensors='pt',
-                      max_length=max_length,
-                      padding='max_length',
-                      truncation=True)['input_ids'].to(torch.cuda.current_device()).squeeze(0)
+            tokenizer(line, return_tensors='pt', max_length=max_length, padding='max_length',
+                      truncation=True)['input_ids'].to(torch.cuda.current_device()).squeeze(0)
             for line in tqdm(all_lines)
         ]
         self.data_file = data_file

     def __len__(self):
         return len(self.prompts)
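
A side note (not part of the commit): the "回答:" ("Answer:") handling above truncates each line right after the answer marker, so the prompt asks the model to produce the answer instead of showing it. A tiny illustration with a made-up line:

line = "问题:什么是合同?回答:合同是一种协议。"    # made-up Q&A line ("Question: ... Answer: ...")
if "回答:" in line:
    line = line[:line.index("回答:") + 3]    # keep everything up to and including "回答:"
print(line)    # 问题:什么是合同?回答: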
@@ -114,6 +110,7 @@ class EasyPromptsDataset(Dataset):


 class EasyRewardDataset(Dataset):
+
     def __init__(self, train_file: str, tokenizer: AutoTokenizer, special_token=None, max_length=512) -> None:
         super(EasyRewardDataset, self).__init__()
         self.chosen = []
@@ -167,10 +164,13 @@ class EasyRewardDataset(Dataset):
     def __str__(self):
         return f"LawRewardDataset(chosen_len={len(self.chosen)}, reject_len={len(self.reject)})"

+
 '''
 Easy SFT simply accepts a text file that can be read line by line. However, the dataset will group texts together up to max_length so the LLM will learn the meaning of the texts better.
 If individual lines are not related, just set is_group_texts to False.
 '''
+
+
 class EasySFTDataset(Dataset):

     def __init__(self, data_file: str, tokenizer: AutoTokenizer, max_length=512, is_group_texts=True) -> None:
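
For readers who want to try the dataset, a hedged usage sketch (not part of the commit; the checkpoint name and file path are placeholders):

from easy_dataset import EasySFTDataset    # the module this diff edits
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom-560m")    # placeholder checkpoint
# lines in the file are unrelated, so disable grouping: each line becomes one padded sample
dataset = EasySFTDataset("sft_data.txt", tokenizer, max_length=512, is_group_texts=False)
print(dataset)    # EasySFTDataset(len=..., file_name is sft_data.txt)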
@@ -200,7 +200,8 @@ class EasySFTDataset(Dataset):
                     padded_length = max_length - len(current_input_ids)
                     current_input_ids.extend([tokenizer.pad_token_id] * padded_length)
                     grouped_inpup_ids.append(torch.tensor(current_input_ids, dtype=torch.long))
-                    attention_mask.append(torch.tensor([1] * (max_length - padded_length) + [0] * padded_length,dtype=torch.long))
+                    attention_mask.append(
+                        torch.tensor([1] * (max_length - padded_length) + [0] * padded_length, dtype=torch.long))
                     current_input_ids = []
                 else:
                     current_input_ids.extend(input_ids)
@@ -208,13 +209,15 @@ class EasySFTDataset(Dataset):
                 padded_length = max_length - len(current_input_ids)
                 current_input_ids.extend([tokenizer.pad_token_id] * padded_length)
                 grouped_inpup_ids.append(torch.tensor(current_input_ids, dtype=torch.long))
-                attention_mask.append(torch.tensor([1] * (max_length - padded_length) + [0] * padded_length,dtype=torch.long))
+                attention_mask.append(
+                    torch.tensor([1] * (max_length - padded_length) + [0] * padded_length, dtype=torch.long))
         else:
             #just append the raw_input_ids to max_length
             for input_ids in raw_input_ids:
                 padded_length = max_length - len(input_ids)
                 input_ids.extend([tokenizer.pad_token_id] * padded_length)
-                attention_mask.append(torch.tensor([1] * (max_length - padded_length) + [0] * padded_length,dtype=torch.long))
+                attention_mask.append(
+                    torch.tensor([1] * (max_length - padded_length) + [0] * padded_length, dtype=torch.long))
                 grouped_inpup_ids.append(torch.tensor(input_ids, dtype=torch.long))
         self.input_ids = grouped_inpup_ids
         self.labels = copy.deepcopy(self.input_ids)

@@ -235,8 +238,3 @@ class EasySFTDataset(Dataset):
     #generate the dataset description to be printed by print in python
     def __str__(self):
         return f"EasySFTDataset(len={len(self)},\nfile_name is {self.file_name})"

View File

@@ -3,12 +3,12 @@ from typing import Optional, Tuple, Union
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from torch.nn.modules import Module
 from coati.models.generation import generate
 from coati.models.utils import log_probs_from_logits, masked_mean
-from transformers import BloomConfig,BloomForCausalLM
 from peft import PeftModel
+from torch.nn.modules import Module
+from transformers import BloomConfig, BloomForCausalLM


 class Actor(Module):
     """
@@ -94,4 +94,3 @@ class BLOOMActor(Actor):

     def print_trainable_parameters(self):
         self.get_base_model().print_trainable_parameters()
-

View File

@@ -5,21 +5,22 @@ import torch
 import torch.distributed as dist
 from coati.dataset import DataCollatorForSupervisedDataset, PromptDataset, SupervisedDataset
 from coati.models.bloom import BLOOMRM, BLOOMCritic
-from easy_models import BLOOMActor
 from coati.models.gpt import GPTRM, GPTActor, GPTCritic
 from coati.models.llama import LlamaActor, LlamaCritic, LlamaRM
 from coati.models.opt import OPTRM, OPTActor, OPTCritic
 from coati.trainer import PPOTrainer
 from coati.trainer.strategies import ColossalAIStrategy, DDPStrategy, NaiveStrategy
 from coati.utils import prepare_llama_tokenizer_and_embedding
+from easy_dataset import EasyPromptsDataset, EasySupervisedDataset
+from easy_models import BLOOMActor
+from peft import PeftModel
 from torch.optim import Adam
 from torch.utils.data import DataLoader
 from torch.utils.data.distributed import DistributedSampler
 from transformers import AutoTokenizer, BloomTokenizerFast, GPT2Tokenizer, LlamaTokenizer

 from colossalai.nn.optimizer import HybridAdam
-from peft import PeftModel
-from easy_dataset import EasyPromptsDataset,EasySupervisedDataset


 def main(args):
     # configure strategy

View File

@@ -14,19 +14,19 @@ from coati.trainer import SFTTrainer
 from coati.trainer.strategies import ColossalAIStrategy, DDPStrategy, NaiveStrategy
 from coati.utils import prepare_llama_tokenizer_and_embedding
 from datasets import load_dataset
+from easy_dataset import EasyDataset
+from peft import LoraConfig, PeftModel, TaskType, get_peft_model
 from torch.optim import Adam
 from torch.utils.data import DataLoader
+from torch.utils.data.dataloader import default_collate
 from torch.utils.data.distributed import DistributedSampler
-from transformers import AutoTokenizer, BloomTokenizerFast,AutoModelForCausalLM
+from transformers import AutoModelForCausalLM, AutoTokenizer, BloomTokenizerFast
 from transformers.models.gpt2.tokenization_gpt2 import GPT2Tokenizer

 from colossalai.logging import get_dist_logger
 from colossalai.nn.optimizer import HybridAdam
 from colossalai.tensor import ColoParameter
-from torch.utils.data.dataloader import default_collate
-from peft import LoraConfig, TaskType,get_peft_model,PeftModel
-from easy_dataset import EasyDataset


 def train(args):
     # configure strategy
@@ -54,11 +54,14 @@ def train(args):
         #we'll use peft lora library to do the lora
         lora_rank = args.lora_rank if args.lora_rank > 0 else 32
         #config lora with rank of lora_rank
-        lora_config = LoraConfig(task_type=TaskType.CAUSAL_LM, inference_mode=False, r=lora_rank, lora_alpha=32, lora_dropout=0.1)
+        lora_config = LoraConfig(task_type=TaskType.CAUSAL_LM,
+                                 inference_mode=False,
+                                 r=lora_rank,
+                                 lora_alpha=32,
+                                 lora_dropout=0.1)
         model = get_peft_model(model, lora_config)
         model.print_trainable_parameters()
     # configure tokenizer
     if args.model == 'gpt2':
         tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
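
For context (not part of the commit), the re-wrapped LoraConfig call can be exercised on its own; a minimal sketch in which the checkpoint name is a placeholder, while the peft calls are the same ones the script uses:

from peft import LoraConfig, TaskType, get_peft_model
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("bigscience/bloom-560m")    # placeholder checkpoint
lora_config = LoraConfig(task_type=TaskType.CAUSAL_LM,
                         inference_mode=False,
                         r=32,              # lora_rank: larger rank = more trainable parameters
                         lora_alpha=32,
                         lora_dropout=0.1)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()    # prints trainable vs. total parameter counts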