mirror of https://github.com/hpcaitech/ColossalAI
add community example dictionary (#3465)
parent 80eba05b0a
commit 6afeb1202a

@@ -0,0 +1 @@
+# Community Examples
@@ -1,14 +1,12 @@
 import copy
+import json
 from typing import Dict, Sequence
 
+import torch
 from datasets import load_dataset
 from torch.utils.data import Dataset
+from tqdm import tqdm
 from transformers import AutoTokenizer
-import torch
-from tqdm import tqdm
-import json
-
-from tqdm import tqdm
-import json
 
 IGNORE_INDEX = -100
 
@@ -36,15 +34,12 @@ def _tokenize_fn(strings: Sequence[str], tokenizer: AutoTokenizer,max_length :in
     )
 
 
-def preprocess(
-    sources: Sequence[str],
-    targets: Sequence[str],
-    tokenizer: AutoTokenizer,
-    max_length :int = 512
-) -> Dict:
+def preprocess(sources: Sequence[str], targets: Sequence[str], tokenizer: AutoTokenizer, max_length: int = 512) -> Dict:
     """Preprocess the data by tokenizing."""
     examples = [s + t for s, t in zip(sources, targets)]
-    examples_tokenized, sources_tokenized = [_tokenize_fn(strings, tokenizer,max_length) for strings in (examples, sources)]
+    examples_tokenized, sources_tokenized = [
+        _tokenize_fn(strings, tokenizer, max_length) for strings in (examples, sources)
+    ]
     input_ids = examples_tokenized["input_ids"]
     labels = copy.deepcopy(input_ids)
     for label, source_len in zip(labels, sources_tokenized["input_ids_lens"]):
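Note: preprocess follows the common Alpaca-style SFT recipe: each example is the source prompt concatenated with its target, and the loop above (its body is cut off by the hunk boundary) masks the prompt tokens in labels with IGNORE_INDEX so the loss is computed only on the target. A minimal sketch of that masking, assuming the standard recipe:

    # Sketch of the masking step preprocess() performs on `labels`
    # (an assumption based on the standard recipe; the loop body is
    # not visible in the hunk above).
    import copy
    import torch

    IGNORE_INDEX = -100  # the default ignore_index of torch.nn.CrossEntropyLoss

    input_ids = [torch.tensor([5, 6, 7, 8, 9])]   # prompt + target tokens
    source_lens = [2]                             # first 2 tokens are the prompt
    labels = copy.deepcopy(input_ids)
    for label, source_len in zip(labels, source_lens):
        label[:source_len] = IGNORE_INDEX         # loss ignores the prompt tokens
    # labels[0] -> tensor([-100, -100, 7, 8, 9])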
@@ -53,6 +48,7 @@ def preprocess(
 
 
 class EasySupervisedDataset(Dataset):
+
     def __init__(self, data_file: str, tokenizer: AutoTokenizer, max_length: int = 512) -> None:
         super(EasySupervisedDataset, self).__init__()
         with open(data_file, "r", encoding="UTF-8") as f:
@@ -85,21 +81,21 @@ class EasySupervisedDataset(Dataset):
     def __str__(self):
         return f"LawSupervisedDataset(data_file={self.data_file}, input_ids_len={len(self.input_ids)}, labels_len={len(self.labels)})"
 
 
 class EasyPromptsDataset(Dataset):
 
     def __init__(self, data_file: str, tokenizer: AutoTokenizer, max_length: int = 96) -> None:
         super(EasyPromptsDataset, self).__init__()
         with open(data_file, "r", encoding="UTF-8") as f:
             all_lines = f.readlines()
             all_lines = [line if "回答:" not in line else line[:line.index("回答:") + 3] for line in all_lines]
         self.prompts = [
-            tokenizer(line,
-                      return_tensors='pt',
-                      max_length=max_length,
-                      padding='max_length',
+            tokenizer(line, return_tensors='pt', max_length=max_length, padding='max_length',
                       truncation=True)['input_ids'].to(torch.cuda.current_device()).squeeze(0)
             for line in tqdm(all_lines)
         ]
         self.data_file = data_file
 
     def __len__(self):
         return len(self.prompts)
 
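Note: the list comprehension above cuts each prompt line right after the literal marker "回答:" ("Answer:" in Chinese), so a prompt ends where the answer should begin and the model is left to generate it. The rule in isolation:

    # The prompt-truncation rule above, in isolation. "回答:" means "Answer:";
    # index(...) + 3 keeps the 3-character marker itself in the prompt.
    line = "问题:什么是LoRA?回答:LoRA是一种参数高效微调方法。"  # "Question: ... Answer: ..."
    prompt = line[:line.index("回答:") + 3]
    assert prompt.endswith("回答:")  # the answer text is stripped off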
@@ -114,6 +110,7 @@ class EasyPromptsDataset(Dataset):
 
 
 class EasyRewardDataset(Dataset):
+
     def __init__(self, train_file: str, tokenizer: AutoTokenizer, special_token=None, max_length=512) -> None:
         super(EasyRewardDataset, self).__init__()
         self.chosen = []
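Note: EasyRewardDataset loads chosen/rejected response pairs for reward-model training. Such pairs typically feed a log-sigmoid ranking loss; a sketch of that standard objective (the loss itself lives in coati's trainer, not in this file, so this is illustrative only):

    # Pairwise ranking objective that chosen/reject pairs usually feed
    # (a standard formulation, not code from this commit).
    import torch
    import torch.nn.functional as F

    def pairwise_loss(chosen_rewards: torch.Tensor, rejected_rewards: torch.Tensor) -> torch.Tensor:
        # push the reward of the chosen answer above the rejected one
        return -F.logsigmoid(chosen_rewards - rejected_rewards).mean()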
@@ -167,10 +164,13 @@ class EasyRewardDataset(Dataset):
     def __str__(self):
         return f"LawRewardDataset(chosen_len={len(self.chosen)}, reject_len={len(self.reject)})"
 
 
 '''
 Easy SFT just accepts a text file which can be read line by line. However, the dataset will group texts together up to max_length so the LLM will learn the meaning of the texts better.
 If individual lines are not related, just set is_group_texts to False.
 '''
 
 
 class EasySFTDataset(Dataset):
+
     def __init__(self, data_file: str, tokenizer: AutoTokenizer, max_length=512, is_group_texts=True) -> None:
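Note: is_group_texts implements the packing behaviour the docstring describes: consecutive tokenized lines are concatenated until they would overflow max_length, and each sealed group is padded to exactly max_length. A standalone sketch of that grouping (an assumption: it mirrors the loop in EasySFTDataset.__init__, minus labels and attention masks, and assumes each line was tokenized with truncation to max_length):

    # Simplified sketch of the is_group_texts packing logic.
    def group_texts(tokenized_lines, pad_token_id, max_length=512):
        groups, current = [], []
        for ids in tokenized_lines:
            if len(current) + len(ids) > max_length:
                current.extend([pad_token_id] * (max_length - len(current)))  # pad sealed group
                groups.append(current)
                current = list(ids)  # start a new group with the overflowing line
            else:
                current.extend(ids)
        if current:  # flush and pad the final partial group
            current.extend([pad_token_id] * (max_length - len(current)))
            groups.append(current)
        return groups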
@@ -200,7 +200,8 @@ class EasySFTDataset(Dataset):
                     padded_length = max_length - len(current_input_ids)
                     current_input_ids.extend([tokenizer.pad_token_id] * padded_length)
                     grouped_inpup_ids.append(torch.tensor(current_input_ids, dtype=torch.long))
-                    attention_mask.append(torch.tensor([1] * (max_length - padded_length) + [0] * padded_length,dtype=torch.long))
+                    attention_mask.append(
+                        torch.tensor([1] * (max_length - padded_length) + [0] * padded_length, dtype=torch.long))
                     current_input_ids = []
                 else:
                     current_input_ids.extend(input_ids)
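Note: in every branch the attention mask is built the same way: a 1 for each real token followed by a 0 for each pad position, so attention never attends to padding. Concretely:

    # The mask layout produced above, on a toy example.
    import torch

    max_length, padded_length = 8, 3
    mask = torch.tensor([1] * (max_length - padded_length) + [0] * padded_length, dtype=torch.long)
    print(mask)  # tensor([1, 1, 1, 1, 1, 0, 0, 0]) -- 5 real tokens, 3 pads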
@@ -208,13 +209,15 @@ class EasySFTDataset(Dataset):
                 padded_length = max_length - len(current_input_ids)
                 current_input_ids.extend([tokenizer.pad_token_id] * padded_length)
                 grouped_inpup_ids.append(torch.tensor(current_input_ids, dtype=torch.long))
-                attention_mask.append(torch.tensor([1] * (max_length - padded_length) + [0] * padded_length,dtype=torch.long))
+                attention_mask.append(
+                    torch.tensor([1] * (max_length - padded_length) + [0] * padded_length, dtype=torch.long))
         else:
             #just append the raw_input_ids to max_length
             for input_ids in raw_input_ids:
                 padded_length = max_length - len(input_ids)
                 input_ids.extend([tokenizer.pad_token_id] * padded_length)
-                attention_mask.append(torch.tensor([1] * (max_length - padded_length) + [0] * padded_length,dtype=torch.long))
+                attention_mask.append(
+                    torch.tensor([1] * (max_length - padded_length) + [0] * padded_length, dtype=torch.long))
                 grouped_inpup_ids.append(torch.tensor(input_ids, dtype=torch.long))
         self.input_ids = grouped_inpup_ids
         self.labels = copy.deepcopy(self.input_ids)
@@ -235,8 +238,3 @@ class EasySFTDataset(Dataset):
     #generate the dataset description to be printed by print in python
     def __str__(self):
         return f"EasySFTDataset(len={len(self)},\nfile_name is {self.file_name})"
-
-
-
-
-
@@ -3,12 +3,12 @@ from typing import Optional, Tuple, Union
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from torch.nn.modules import Module
 
 from coati.models.generation import generate
 from coati.models.utils import log_probs_from_logits, masked_mean
-from transformers import BloomConfig,BloomForCausalLM
 from peft import PeftModel
+from torch.nn.modules import Module
+from transformers import BloomConfig, BloomForCausalLM
 
 
 class Actor(Module):
     """
@@ -94,4 +94,3 @@ class BLOOMActor(Actor):
 
     def print_trainable_parameters(self):
         self.get_base_model().print_trainable_parameters()
-
@@ -5,21 +5,22 @@ import torch
 import torch.distributed as dist
 from coati.dataset import DataCollatorForSupervisedDataset, PromptDataset, SupervisedDataset
 from coati.models.bloom import BLOOMRM, BLOOMCritic
-from easy_models import BLOOMActor
 from coati.models.gpt import GPTRM, GPTActor, GPTCritic
 from coati.models.llama import LlamaActor, LlamaCritic, LlamaRM
 from coati.models.opt import OPTRM, OPTActor, OPTCritic
 from coati.trainer import PPOTrainer
 from coati.trainer.strategies import ColossalAIStrategy, DDPStrategy, NaiveStrategy
 from coati.utils import prepare_llama_tokenizer_and_embedding
+from easy_dataset import EasyPromptsDataset, EasySupervisedDataset
+from easy_models import BLOOMActor
+from peft import PeftModel
 from torch.optim import Adam
 from torch.utils.data import DataLoader
 from torch.utils.data.distributed import DistributedSampler
 from transformers import AutoTokenizer, BloomTokenizerFast, GPT2Tokenizer, LlamaTokenizer
 
 from colossalai.nn.optimizer import HybridAdam
-from peft import PeftModel
-from easy_dataset import EasyPromptsDataset,EasySupervisedDataset
 
 
 def main(args):
     # configure strategy
@@ -14,19 +14,19 @@ from coati.trainer import SFTTrainer
 from coati.trainer.strategies import ColossalAIStrategy, DDPStrategy, NaiveStrategy
 from coati.utils import prepare_llama_tokenizer_and_embedding
 from datasets import load_dataset
+from easy_dataset import EasyDataset
+from peft import LoraConfig, PeftModel, TaskType, get_peft_model
 from torch.optim import Adam
 from torch.utils.data import DataLoader
+from torch.utils.data.dataloader import default_collate
 from torch.utils.data.distributed import DistributedSampler
-from transformers import AutoTokenizer, BloomTokenizerFast,AutoModelForCausalLM
+from transformers import AutoModelForCausalLM, AutoTokenizer, BloomTokenizerFast
 from transformers.models.gpt2.tokenization_gpt2 import GPT2Tokenizer
 
 from colossalai.logging import get_dist_logger
 from colossalai.nn.optimizer import HybridAdam
 from colossalai.tensor import ColoParameter
 
-from torch.utils.data.dataloader import default_collate
-from peft import LoraConfig, TaskType,get_peft_model,PeftModel
-from easy_dataset import EasyDataset
 
 
 def train(args):
     # configure strategy
@@ -54,11 +54,14 @@ def train(args):
     #we'll use peft lora library to do the lora
     lora_rank = args.lora_rank if args.lora_rank > 0 else 32
     #config lora with rank of lora_rank
-    lora_config = LoraConfig(task_type=TaskType.CAUSAL_LM, inference_mode=False, r=lora_rank, lora_alpha=32, lora_dropout=0.1)
+    lora_config = LoraConfig(task_type=TaskType.CAUSAL_LM,
+                             inference_mode=False,
+                             r=lora_rank,
+                             lora_alpha=32,
+                             lora_dropout=0.1)
     model = get_peft_model(model, lora_config)
     model.print_trainable_parameters()
 
     # configure tokenizer
     if args.model == 'gpt2':
         tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
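Note: the reformatted LoraConfig call is the standard peft recipe: r is the rank of the low-rank update matrices and lora_alpha the scaling factor (updates are scaled by alpha/r). A minimal end-to-end sketch, with the model name as a stand-in:

    # Minimal LoRA wrap with peft, assuming any small causal LM as a placeholder.
    from peft import LoraConfig, TaskType, get_peft_model
    from transformers import AutoModelForCausalLM

    model = AutoModelForCausalLM.from_pretrained("bigscience/bloom-560m")  # placeholder model
    lora_config = LoraConfig(task_type=TaskType.CAUSAL_LM,
                             inference_mode=False,
                             r=32,             # rank of the update matrices
                             lora_alpha=32,    # scaling factor
                             lora_dropout=0.1)
    model = get_peft_model(model, lora_config)
    model.print_trainable_parameters()  # typically well under 1% of parameters are trainable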