mirror of https://github.com/hpcaitech/ColossalAI
add community example dictionary (#3465)
parent 80eba05b0a
commit 6afeb1202a

@@ -0,0 +1 @@
+# Community Examples
@@ -1,14 +1,12 @@
 import copy
+import json
 from typing import Dict, Sequence
 
+import torch
 from datasets import load_dataset
 from torch.utils.data import Dataset
+from tqdm import tqdm
 from transformers import AutoTokenizer
-import torch
-from tqdm import tqdm
-import json
-
-from tqdm import tqdm
-import json
 
 IGNORE_INDEX = -100
 
@@ -36,15 +34,12 @@ def _tokenize_fn(strings: Sequence[str], tokenizer: AutoTokenizer,max_length :in
     )
 
 
-def preprocess(
-    sources: Sequence[str],
-    targets: Sequence[str],
-    tokenizer: AutoTokenizer,
-    max_length :int = 512
-) -> Dict:
+def preprocess(sources: Sequence[str], targets: Sequence[str], tokenizer: AutoTokenizer, max_length: int = 512) -> Dict:
     """Preprocess the data by tokenizing."""
     examples = [s + t for s, t in zip(sources, targets)]
-    examples_tokenized, sources_tokenized = [_tokenize_fn(strings, tokenizer,max_length) for strings in (examples, sources)]
+    examples_tokenized, sources_tokenized = [
+        _tokenize_fn(strings, tokenizer, max_length) for strings in (examples, sources)
+    ]
     input_ids = examples_tokenized["input_ids"]
     labels = copy.deepcopy(input_ids)
     for label, source_len in zip(labels, sources_tokenized["input_ids_lens"]):
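Note: preprocess follows the common Alpaca-style SFT recipe: each example is the source prompt concatenated with its target, and the loop above (its body is cut off by the hunk boundary) masks the prompt tokens in labels with IGNORE_INDEX so the loss is computed only on the target. A minimal sketch of that masking, assuming the standard recipe:

    # Sketch of the masking step preprocess() performs on `labels`
    # (an assumption based on the standard recipe; the loop body is
    # not visible in the hunk above).
    import copy
    import torch

    IGNORE_INDEX = -100  # the default ignore_index of torch.nn.CrossEntropyLoss

    input_ids = [torch.tensor([5, 6, 7, 8, 9])]   # prompt + target tokens
    source_lens = [2]                             # first 2 tokens are the prompt
    labels = copy.deepcopy(input_ids)
    for label, source_len in zip(labels, source_lens):
        label[:source_len] = IGNORE_INDEX         # loss ignores the prompt tokens
    # labels[0] -> tensor([-100, -100, 7, 8, 9])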
@@ -53,6 +48,7 @@ def preprocess(
 
 
 class EasySupervisedDataset(Dataset):
+
     def __init__(self, data_file: str, tokenizer: AutoTokenizer, max_length: int = 512) -> None:
         super(EasySupervisedDataset, self).__init__()
         with open(data_file, "r", encoding="UTF-8") as f:
@@ -85,21 +81,21 @@ class EasySupervisedDataset(Dataset):
     def __str__(self):
         return f"LawSupervisedDataset(data_file={self.data_file}, input_ids_len={len(self.input_ids)}, labels_len={len(self.labels)})"
 
 
 class EasyPromptsDataset(Dataset):
 
     def __init__(self, data_file: str, tokenizer: AutoTokenizer, max_length: int = 96) -> None:
         super(EasyPromptsDataset, self).__init__()
         with open(data_file, "r", encoding="UTF-8") as f:
             all_lines = f.readlines()
             all_lines = [line if "回答:" not in line else line[:line.index("回答:") + 3] for line in all_lines]
         self.prompts = [
-            tokenizer(line,
-                      return_tensors='pt',
-                      max_length=max_length,
-                      padding='max_length',
+            tokenizer(line, return_tensors='pt', max_length=max_length, padding='max_length',
                       truncation=True)['input_ids'].to(torch.cuda.current_device()).squeeze(0)
             for line in tqdm(all_lines)
         ]
         self.data_file = data_file
 
     def __len__(self):
         return len(self.prompts)
 
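Note: the list comprehension above cuts each prompt line right after the literal marker "回答:" ("Answer:" in Chinese), so a prompt ends where the answer should begin and the model is left to generate it. The rule in isolation:

    # The prompt-truncation rule above, in isolation. "回答:" means "Answer:";
    # index(...) + 3 keeps the 3-character marker itself in the prompt.
    line = "问题:什么是LoRA?回答:LoRA是一种参数高效微调方法。"  # "Question: ... Answer: ..."
    prompt = line[:line.index("回答:") + 3]
    assert prompt.endswith("回答:")  # the answer text is stripped off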
@@ -114,6 +110,7 @@ class EasyPromptsDataset(Dataset):
 
 
 class EasyRewardDataset(Dataset):
+
     def __init__(self, train_file: str, tokenizer: AutoTokenizer, special_token=None, max_length=512) -> None:
         super(EasyRewardDataset, self).__init__()
         self.chosen = []
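Note: EasyRewardDataset loads chosen/rejected response pairs for reward-model training. Such pairs typically feed a log-sigmoid ranking loss; a sketch of that standard objective (the loss itself lives in coati's trainer, not in this file, so this is illustrative only):

    # Pairwise ranking objective that chosen/reject pairs usually feed
    # (a standard formulation, not code from this commit).
    import torch
    import torch.nn.functional as F

    def pairwise_loss(chosen_rewards: torch.Tensor, rejected_rewards: torch.Tensor) -> torch.Tensor:
        # push the reward of the chosen answer above the rejected one
        return -F.logsigmoid(chosen_rewards - rejected_rewards).mean()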
@@ -167,10 +164,13 @@ class EasyRewardDataset(Dataset):
     def __str__(self):
         return f"LawRewardDataset(chosen_len={len(self.chosen)}, reject_len={len(self.reject)})"
 
 
 '''
 Easy SFT just accepts a text file which can be read line by line. However, the dataset will group texts together up to max_length so the LLM will learn the meaning of the texts better.
 If individual lines are not related, just set is_group_texts to False.
 '''
 
 
 class EasySFTDataset(Dataset):
+
     def __init__(self, data_file: str, tokenizer: AutoTokenizer, max_length=512, is_group_texts=True) -> None:
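Note: is_group_texts implements the packing behaviour the docstring describes: consecutive tokenized lines are concatenated until they would overflow max_length, and each sealed group is padded to exactly max_length. A standalone sketch of that grouping (an assumption: it mirrors the loop in EasySFTDataset.__init__, minus labels and attention masks, and assumes each line was tokenized with truncation to max_length):

    # Simplified sketch of the is_group_texts packing logic.
    def group_texts(tokenized_lines, pad_token_id, max_length=512):
        groups, current = [], []
        for ids in tokenized_lines:
            if len(current) + len(ids) > max_length:
                current.extend([pad_token_id] * (max_length - len(current)))  # pad sealed group
                groups.append(current)
                current = list(ids)  # start a new group with the overflowing line
            else:
                current.extend(ids)
        if current:  # flush and pad the final partial group
            current.extend([pad_token_id] * (max_length - len(current)))
            groups.append(current)
        return groups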
@@ -200,7 +200,8 @@ class EasySFTDataset(Dataset):
                     padded_length = max_length - len(current_input_ids)
                     current_input_ids.extend([tokenizer.pad_token_id] * padded_length)
                     grouped_inpup_ids.append(torch.tensor(current_input_ids, dtype=torch.long))
-                    attention_mask.append(torch.tensor([1] * (max_length - padded_length) + [0] * padded_length,dtype=torch.long))
+                    attention_mask.append(
+                        torch.tensor([1] * (max_length - padded_length) + [0] * padded_length, dtype=torch.long))
                     current_input_ids = []
                 else:
                     current_input_ids.extend(input_ids)
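Note: in every branch the attention mask is built the same way: a 1 for each real token followed by a 0 for each pad position, so attention never attends to padding. Concretely:

    # The mask layout produced above, on a toy example.
    import torch

    max_length, padded_length = 8, 3
    mask = torch.tensor([1] * (max_length - padded_length) + [0] * padded_length, dtype=torch.long)
    print(mask)  # tensor([1, 1, 1, 1, 1, 0, 0, 0]) -- 5 real tokens, 3 pads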
@@ -208,13 +209,15 @@ class EasySFTDataset(Dataset):
                 padded_length = max_length - len(current_input_ids)
                 current_input_ids.extend([tokenizer.pad_token_id] * padded_length)
                 grouped_inpup_ids.append(torch.tensor(current_input_ids, dtype=torch.long))
-                attention_mask.append(torch.tensor([1] * (max_length - padded_length) + [0] * padded_length,dtype=torch.long))
+                attention_mask.append(
+                    torch.tensor([1] * (max_length - padded_length) + [0] * padded_length, dtype=torch.long))
         else:
             #just append the raw_input_ids to max_length
             for input_ids in raw_input_ids:
                 padded_length = max_length - len(input_ids)
                 input_ids.extend([tokenizer.pad_token_id] * padded_length)
-                attention_mask.append(torch.tensor([1] * (max_length - padded_length) + [0] * padded_length,dtype=torch.long))
+                attention_mask.append(
+                    torch.tensor([1] * (max_length - padded_length) + [0] * padded_length, dtype=torch.long))
                 grouped_inpup_ids.append(torch.tensor(input_ids, dtype=torch.long))
         self.input_ids = grouped_inpup_ids
         self.labels = copy.deepcopy(self.input_ids)
@@ -235,8 +238,3 @@ class EasySFTDataset(Dataset):
     #generate the dataset description to be printed by print in python
     def __str__(self):
         return f"EasySFTDataset(len={len(self)},\nfile_name is {self.file_name})"
-
-
-
-
-
@@ -3,12 +3,12 @@ from typing import Optional, Tuple, Union
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from torch.nn.modules import Module
 
 from coati.models.generation import generate
 from coati.models.utils import log_probs_from_logits, masked_mean
-from transformers import BloomConfig,BloomForCausalLM
 from peft import PeftModel
+from torch.nn.modules import Module
+from transformers import BloomConfig, BloomForCausalLM
 
 
 class Actor(Module):
     """
@@ -94,4 +94,3 @@ class BLOOMActor(Actor):
 
     def print_trainable_parameters(self):
         self.get_base_model().print_trainable_parameters()
-
@@ -5,21 +5,22 @@ import torch
 import torch.distributed as dist
 from coati.dataset import DataCollatorForSupervisedDataset, PromptDataset, SupervisedDataset
 from coati.models.bloom import BLOOMRM, BLOOMCritic
-from easy_models import BLOOMActor
 from coati.models.gpt import GPTRM, GPTActor, GPTCritic
 from coati.models.llama import LlamaActor, LlamaCritic, LlamaRM
 from coati.models.opt import OPTRM, OPTActor, OPTCritic
 from coati.trainer import PPOTrainer
 from coati.trainer.strategies import ColossalAIStrategy, DDPStrategy, NaiveStrategy
 from coati.utils import prepare_llama_tokenizer_and_embedding
+from easy_dataset import EasyPromptsDataset, EasySupervisedDataset
+from easy_models import BLOOMActor
+from peft import PeftModel
 from torch.optim import Adam
 from torch.utils.data import DataLoader
 from torch.utils.data.distributed import DistributedSampler
 from transformers import AutoTokenizer, BloomTokenizerFast, GPT2Tokenizer, LlamaTokenizer
 
 from colossalai.nn.optimizer import HybridAdam
-from peft import PeftModel
-from easy_dataset import EasyPromptsDataset,EasySupervisedDataset
 
 
 def main(args):
     # configure strategy
@@ -14,19 +14,19 @@ from coati.trainer import SFTTrainer
 from coati.trainer.strategies import ColossalAIStrategy, DDPStrategy, NaiveStrategy
 from coati.utils import prepare_llama_tokenizer_and_embedding
 from datasets import load_dataset
+from easy_dataset import EasyDataset
+from peft import LoraConfig, PeftModel, TaskType, get_peft_model
 from torch.optim import Adam
 from torch.utils.data import DataLoader
+from torch.utils.data.dataloader import default_collate
 from torch.utils.data.distributed import DistributedSampler
-from transformers import AutoTokenizer, BloomTokenizerFast,AutoModelForCausalLM
+from transformers import AutoModelForCausalLM, AutoTokenizer, BloomTokenizerFast
 from transformers.models.gpt2.tokenization_gpt2 import GPT2Tokenizer
 
 from colossalai.logging import get_dist_logger
 from colossalai.nn.optimizer import HybridAdam
 from colossalai.tensor import ColoParameter
 
-from torch.utils.data.dataloader import default_collate
-from peft import LoraConfig, TaskType,get_peft_model,PeftModel
-from easy_dataset import EasyDataset
 
 
 def train(args):
     # configure strategy
@@ -54,11 +54,14 @@ def train(args):
     #we'll use peft lora library to do the lora
     lora_rank = args.lora_rank if args.lora_rank > 0 else 32
     #config lora with rank of lora_rank
-    lora_config = LoraConfig(task_type=TaskType.CAUSAL_LM, inference_mode=False, r=lora_rank, lora_alpha=32, lora_dropout=0.1)
+    lora_config = LoraConfig(task_type=TaskType.CAUSAL_LM,
+                             inference_mode=False,
+                             r=lora_rank,
+                             lora_alpha=32,
+                             lora_dropout=0.1)
     model = get_peft_model(model, lora_config)
     model.print_trainable_parameters()
 
     # configure tokenizer
     if args.model == 'gpt2':
         tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
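Note: the reformatted LoraConfig call is the standard peft recipe: r is the rank of the low-rank update matrices and lora_alpha the scaling factor (updates are scaled by alpha/r). A minimal end-to-end sketch, with the model name as a stand-in:

    # Minimal LoRA wrap with peft, assuming any small causal LM as a placeholder.
    from peft import LoraConfig, TaskType, get_peft_model
    from transformers import AutoModelForCausalLM

    model = AutoModelForCausalLM.from_pretrained("bigscience/bloom-560m")  # placeholder model
    lora_config = LoraConfig(task_type=TaskType.CAUSAL_LM,
                             inference_mode=False,
                             r=32,             # rank of the update matrices
                             lora_alpha=32,    # scaling factor
                             lora_dropout=0.1)
    model = get_peft_model(model, lora_config)
    model.print_trainable_parameters()  # typically well under 1% of parameters are trainable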