mirror of https://github.com/hpcaitech/ColossalAI

[pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci

pull/5850/head
parent 33f15203d3
commit 8a9721bafe
@@ -5,10 +5,11 @@ import resource
 from contextlib import nullcontext

 import torch
-from coati.dataset import DataCollatorForPreferenceDataset, StatefulDistributedSampler, load_tokenized_dataset
+from coati.dataset import DataCollatorForPreferenceDataset, StatefulDistributedSampler
 from coati.models import convert_to_lora_module, disable_dropout
 from coati.trainer import DPOTrainer
 from coati.utils import load_checkpoint
+from dummy_dataset import DummyLLMDataset
 from transformers import AutoModelForCausalLM, AutoTokenizer

 import colossalai
@@ -18,7 +19,6 @@ from colossalai.cluster import DistCoordinator
 from colossalai.logging import get_dist_logger
 from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR
 from colossalai.nn.optimizer import HybridAdam
-from dummy_dataset import DummyLLMDataset

 logger = get_dist_logger()

@@ -136,7 +136,7 @@ def train(args):
     # Note, for some models, lora may not be compatible with gradient checkpointing
     model.gradient_checkpointing_enable()
     coordinator.print_on_master(msg="Gradient checkpointing enabled successfully")
-
+
     # configure tokenizer
     tokenizer_dir = args.tokenizer_dir if args.tokenizer_dir is not None else args.pretrain
     tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir, use_fast=False, trust_remote_code=True)
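The "lora may not be compatible with gradient checkpointing" note above refers to the fact that, when the base weights are frozen (as under LoRA), checkpointed blocks can receive inputs that do not require grad. A minimal sketch of the commonly used workaround, which is not part of this commit (the model name "gpt2" is only a placeholder):

from transformers import AutoModelForCausalLM

# placeholder model; the benchmark scripts load args.pretrain instead
model = AutoModelForCausalLM.from_pretrained("gpt2")

model.gradient_checkpointing_enable()
# With frozen base weights, force the embedding output to require grad so that
# checkpointed segments still have a path for backpropagation.
model.enable_input_require_grads()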
@@ -165,9 +165,11 @@ def train(args):

     # configure dataset
     mode_map = {"train": "train", "valid": "validation", "test": "test"}
-    train_dataset = DummyLLMDataset(["chosen_input_ids", "chosen_loss_mask", "rejected_input_ids",
-                                     "rejected_loss_mask"],
-                                    args.max_length, args.dataset_size)
+    train_dataset = DummyLLMDataset(
+        ["chosen_input_ids", "chosen_loss_mask", "rejected_input_ids", "rejected_loss_mask"],
+        args.max_length,
+        args.dataset_size,
+    )
     data_collator = DataCollatorForPreferenceDataset(tokenizer=tokenizer, max_length=args.max_length)

     train_dataloader = plugin.prepare_dataloader(
@@ -5,10 +5,11 @@ import resource
 from contextlib import nullcontext

 import torch
-from coati.dataset import DataCollatorForPreferenceDataset, StatefulDistributedSampler, load_tokenized_dataset
+from coati.dataset import DataCollatorForPreferenceDataset, StatefulDistributedSampler
 from coati.models import convert_to_lora_module, disable_dropout
 from coati.trainer import ORPOTrainer
 from coati.utils import load_checkpoint
+from dummy_dataset import DummyLLMDataset
 from transformers import AutoModelForCausalLM, AutoTokenizer

 import colossalai
@@ -18,7 +19,7 @@ from colossalai.cluster import DistCoordinator
 from colossalai.logging import get_dist_logger
 from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR
 from colossalai.nn.optimizer import HybridAdam
-from dummy_dataset import DummyLLMDataset

 logger = get_dist_logger()

+
@@ -122,7 +123,7 @@ def train(args):
     # Note, for some models, lora may not be compatible with gradient checkpointing
     model.gradient_checkpointing_enable()
     coordinator.print_on_master(msg="Gradient checkpointing enabled successfully")
-
+
     # configure tokenizer
     tokenizer_dir = args.tokenizer_dir if args.tokenizer_dir is not None else args.pretrain
     tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir, use_fast=False, trust_remote_code=True)
@@ -152,9 +153,11 @@ def train(args):
     # configure dataset
     coordinator.print_on_master(f"Load dataset: {args.dataset}")
     mode_map = {"train": "train", "valid": "validation", "test": "test"}
-    train_dataset = DummyLLMDataset(["chosen_input_ids", "chosen_loss_mask", "rejected_input_ids",
-                                     "rejected_loss_mask"],
-                                    args.max_length, args.dataset_size)
+    train_dataset = DummyLLMDataset(
+        ["chosen_input_ids", "chosen_loss_mask", "rejected_input_ids", "rejected_loss_mask"],
+        args.max_length,
+        args.dataset_size,
+    )
     data_collator = DataCollatorForPreferenceDataset(tokenizer=tokenizer, max_length=args.max_length)

     train_dataloader = plugin.prepare_dataloader(
@@ -6,10 +6,11 @@ import resource
 from contextlib import nullcontext

 import torch
-from coati.dataset import DataCollatorForSupervisedDataset, StatefulDistributedSampler, load_tokenized_dataset
+from coati.dataset import DataCollatorForSupervisedDataset, StatefulDistributedSampler
 from coati.models import convert_to_lora_module
 from coati.trainer import SFTTrainer
 from coati.utils import load_checkpoint
+from dummy_dataset import DummyLLMDataset
 from transformers import AutoModelForCausalLM, AutoTokenizer

 import colossalai
@@ -19,7 +20,6 @@ from colossalai.cluster import DistCoordinator
 from colossalai.logging import get_dist_logger
 from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR
 from colossalai.nn.optimizer import HybridAdam
-from dummy_dataset import DummyLLMDataset

 logger = get_dist_logger()

@@ -127,7 +127,7 @@ def train(args):
     # Note, for some models, lora may not be compatible with gradient checkpointing
     model.gradient_checkpointing_enable()
     coordinator.print_on_master(msg="Gradient checkpointing enabled successfully")
-
+
     # configure tokenizer
     tokenizer = AutoTokenizer.from_pretrained(
         args.tokenizer_dir or args.pretrain, use_fast=False, trust_remote_code=True
@@ -1,5 +1,6 @@
 import torch
-from torch.utils.data import Dataset, DataLoader
+from torch.utils.data import Dataset

+
 class DummyLLMDataset(Dataset):
     def __init__(self, keys, seq_len, size=500):

@@ -11,11 +12,11 @@ class DummyLLMDataset(Dataset):
     def _generate_data(self):
         data = {}
         for key in self.keys:
-            data[key] = torch.ones(self.seq_len, dtype = torch.long)
+            data[key] = torch.ones(self.seq_len, dtype=torch.long)
         return data

     def __len__(self):
         return self.size

     def __getitem__(self, idx):
-        return {key: self.data[key] for key in self.keys}
+        return {key: self.data[key] for key in self.keys}
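For context, DummyLLMDataset returns the same dict of constant token-id tensors for every index, so the benchmarks need no data on disk. A minimal usage sketch (a plain torch DataLoader is used here for illustration; the benchmark scripts go through plugin.prepare_dataloader with DataCollatorForPreferenceDataset instead, and the seq_len/batch values below are arbitrary):

from torch.utils.data import DataLoader

from dummy_dataset import DummyLLMDataset

# one dummy sample per preference-data field, each a constant LongTensor of length seq_len
dataset = DummyLLMDataset(
    ["chosen_input_ids", "chosen_loss_mask", "rejected_input_ids", "rejected_loss_mask"],
    seq_len=2048,
    size=500,
)
loader = DataLoader(dataset, batch_size=8)
batch = next(iter(loader))
print({k: v.shape for k, v in batch.items()})  # every value is a [8, 2048] LongTensor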
@@ -761,8 +761,8 @@ For SFT, we recommend using zero2 or zero2-cpu for 7B model and tp is your model
 - zero2_cpu, micro batch size=8, VRAM Usage=19412.77 MB
 - zero2, micro batch size=8, VRAM Usage=43446.31 MB
 - zero2, micro batch size=16, VRAM Usage=58082.30 MB
-- zero2, micro batch size=8, lora_rank=8, VRAM Usage=21167.73 MB
-- zero2, micro batch size=8, lora_rank=32, VRAM Usage=21344.17 MB
+- zero2, micro batch size=8, lora_rank=8, VRAM Usage=21167.73 MB
+- zero2, micro batch size=8, lora_rank=32, VRAM Usage=21344.17 MB

 For PPO, we suggest using Tensor Parallelism. The following table shows the VRAM consumption of training a 7B model (llama2-7B-hf) on a dummy dataset with a sequence length of 2048 and a layout length of 512 with different tp_size (equal to the number of GPUs).
 | PPO | tp=8 | tp=4 |
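For reference, the zero2 and zero2_cpu rows above correspond to ZeRO stage-2 plugin configurations in colossalai. A rough sketch, under the assumption that LowLevelZeroPlugin exposes stage and cpu_offload arguments in the installed version (this snippet is illustrative, not taken from the diff):

import colossalai
from colossalai.booster import Booster
from colossalai.booster.plugin import LowLevelZeroPlugin

# requires a distributed launcher, e.g. torchrun or `colossalai run --nproc_per_node 1 script.py`
colossalai.launch_from_torch()

# zero2:     ZeRO stage 2, optimizer states and gradients sharded across GPUs
# zero2_cpu: same, but with optimizer states offloaded to CPU (assumed flag name)
plugin = LowLevelZeroPlugin(stage=2, precision="bf16", cpu_offload=False)
booster = Booster(plugin=plugin)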
@@ -135,7 +135,7 @@ def train(args):
     # Note, for some models, lora may not be compatible with gradient checkpointing
     model.gradient_checkpointing_enable()
     coordinator.print_on_master(msg="Gradient checkpointing enabled successfully")
-
+
     # configure tokenizer
     tokenizer_dir = args.tokenizer_dir if args.tokenizer_dir is not None else args.pretrain
     tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir, use_fast=False, trust_remote_code=True)
@@ -122,7 +122,7 @@ def train(args):
     # Note, for some models, lora may not be compatible with gradient checkpointing
     model.gradient_checkpointing_enable()
     coordinator.print_on_master(msg="Gradient checkpointing enabled successfully")
-
+
     # configure tokenizer
     tokenizer_dir = args.tokenizer_dir if args.tokenizer_dir is not None else args.pretrain
     tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir, use_fast=False, trust_remote_code=True)
@@ -126,7 +126,7 @@ def train(args):
     # Note, for some models, lora may not be compatible with gradient checkpointing
     model.gradient_checkpointing_enable()
     coordinator.print_on_master(msg="Gradient checkpointing enabled successfully")
-
+
     # configure tokenizer
     tokenizer = AutoTokenizer.from_pretrained(
         args.tokenizer_dir or args.pretrain, use_fast=False, trust_remote_code=True