diff --git a/applications/ColossalChat/benchmarks/benchmark_dpo.py b/applications/ColossalChat/benchmarks/benchmark_dpo.py
index 5b9d76c99..f80d81566 100755
--- a/applications/ColossalChat/benchmarks/benchmark_dpo.py
+++ b/applications/ColossalChat/benchmarks/benchmark_dpo.py
@@ -5,10 +5,11 @@ import resource
 from contextlib import nullcontext
 
 import torch
-from coati.dataset import DataCollatorForPreferenceDataset, StatefulDistributedSampler, load_tokenized_dataset
+from coati.dataset import DataCollatorForPreferenceDataset, StatefulDistributedSampler
 from coati.models import convert_to_lora_module, disable_dropout
 from coati.trainer import DPOTrainer
 from coati.utils import load_checkpoint
+from dummy_dataset import DummyLLMDataset
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
 import colossalai
@@ -18,7 +19,6 @@ from colossalai.cluster import DistCoordinator
 from colossalai.logging import get_dist_logger
 from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR
 from colossalai.nn.optimizer import HybridAdam
-from dummy_dataset import DummyLLMDataset
 
 logger = get_dist_logger()
 
@@ -136,7 +136,7 @@ def train(args):
         # Note, for some models, lora may not be compatible with gradient checkpointing
         model.gradient_checkpointing_enable()
         coordinator.print_on_master(msg="Gradient checkpointing enabled successfully")
-
+
     # configure tokenizer
     tokenizer_dir = args.tokenizer_dir if args.tokenizer_dir is not None else args.pretrain
     tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir, use_fast=False, trust_remote_code=True)
@@ -165,9 +165,11 @@ def train(args):
 
     # configure dataset
     mode_map = {"train": "train", "valid": "validation", "test": "test"}
-    train_dataset = DummyLLMDataset(["chosen_input_ids", "chosen_loss_mask", "rejected_input_ids",
-                                    "rejected_loss_mask"],
-                                    args.max_length, args.dataset_size)
+    train_dataset = DummyLLMDataset(
+        ["chosen_input_ids", "chosen_loss_mask", "rejected_input_ids", "rejected_loss_mask"],
+        args.max_length,
+        args.dataset_size,
+    )
     data_collator = DataCollatorForPreferenceDataset(tokenizer=tokenizer, max_length=args.max_length)
 
     train_dataloader = plugin.prepare_dataloader(
diff --git a/applications/ColossalChat/benchmarks/benchmark_orpo.py b/applications/ColossalChat/benchmarks/benchmark_orpo.py
index f974d1169..1325bada2 100755
--- a/applications/ColossalChat/benchmarks/benchmark_orpo.py
+++ b/applications/ColossalChat/benchmarks/benchmark_orpo.py
@@ -5,10 +5,11 @@ import resource
 from contextlib import nullcontext
 
 import torch
-from coati.dataset import DataCollatorForPreferenceDataset, StatefulDistributedSampler, load_tokenized_dataset
+from coati.dataset import DataCollatorForPreferenceDataset, StatefulDistributedSampler
 from coati.models import convert_to_lora_module, disable_dropout
 from coati.trainer import ORPOTrainer
 from coati.utils import load_checkpoint
+from dummy_dataset import DummyLLMDataset
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
 import colossalai
@@ -18,7 +19,7 @@ from colossalai.cluster import DistCoordinator
 from colossalai.logging import get_dist_logger
 from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR
 from colossalai.nn.optimizer import HybridAdam
-from dummy_dataset import DummyLLMDataset
+
 
 logger = get_dist_logger()
 
@@ -122,7 +123,7 @@ def train(args):
         # Note, for some models, lora may not be compatible with gradient checkpointing
         model.gradient_checkpointing_enable()
         coordinator.print_on_master(msg="Gradient checkpointing enabled successfully")
-
+
     # configure tokenizer
     tokenizer_dir = args.tokenizer_dir if args.tokenizer_dir is not None else args.pretrain
     tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir, use_fast=False, trust_remote_code=True)
@@ -152,9 +153,11 @@ def train(args):
     # configure dataset
     coordinator.print_on_master(f"Load dataset: {args.dataset}")
     mode_map = {"train": "train", "valid": "validation", "test": "test"}
-    train_dataset = DummyLLMDataset(["chosen_input_ids", "chosen_loss_mask", "rejected_input_ids",
-                                    "rejected_loss_mask"],
-                                    args.max_length, args.dataset_size)
+    train_dataset = DummyLLMDataset(
+        ["chosen_input_ids", "chosen_loss_mask", "rejected_input_ids", "rejected_loss_mask"],
+        args.max_length,
+        args.dataset_size,
+    )
     data_collator = DataCollatorForPreferenceDataset(tokenizer=tokenizer, max_length=args.max_length)
 
     train_dataloader = plugin.prepare_dataloader(
diff --git a/applications/ColossalChat/benchmarks/benchmark_sft.py b/applications/ColossalChat/benchmarks/benchmark_sft.py
index f991dc938..b6438c503 100644
--- a/applications/ColossalChat/benchmarks/benchmark_sft.py
+++ b/applications/ColossalChat/benchmarks/benchmark_sft.py
@@ -6,10 +6,11 @@ import resource
 from contextlib import nullcontext
 
 import torch
-from coati.dataset import DataCollatorForSupervisedDataset, StatefulDistributedSampler, load_tokenized_dataset
+from coati.dataset import DataCollatorForSupervisedDataset, StatefulDistributedSampler
 from coati.models import convert_to_lora_module
 from coati.trainer import SFTTrainer
 from coati.utils import load_checkpoint
+from dummy_dataset import DummyLLMDataset
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
 import colossalai
@@ -19,7 +20,6 @@ from colossalai.cluster import DistCoordinator
 from colossalai.logging import get_dist_logger
 from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR
 from colossalai.nn.optimizer import HybridAdam
-from dummy_dataset import DummyLLMDataset
 
 logger = get_dist_logger()
 
@@ -127,7 +127,7 @@ def train(args):
         # Note, for some models, lora may not be compatible with gradient checkpointing
         model.gradient_checkpointing_enable()
         coordinator.print_on_master(msg="Gradient checkpointing enabled successfully")
-
+
     # configure tokenizer
     tokenizer = AutoTokenizer.from_pretrained(
         args.tokenizer_dir or args.pretrain, use_fast=False, trust_remote_code=True
diff --git a/applications/ColossalChat/benchmarks/dummy_dataset.py b/applications/ColossalChat/benchmarks/dummy_dataset.py
index 5f9642e2f..070531fd5 100644
--- a/applications/ColossalChat/benchmarks/dummy_dataset.py
+++ b/applications/ColossalChat/benchmarks/dummy_dataset.py
@@ -1,5 +1,6 @@
 import torch
-from torch.utils.data import Dataset, DataLoader
+from torch.utils.data import Dataset
+
 
 class DummyLLMDataset(Dataset):
     def __init__(self, keys, seq_len, size=500):
@@ -11,11 +12,11 @@ class DummyLLMDataset(Dataset):
     def _generate_data(self):
         data = {}
         for key in self.keys:
-            data[key] = torch.ones(self.seq_len, dtype = torch.long)
+            data[key] = torch.ones(self.seq_len, dtype=torch.long)
         return data
 
     def __len__(self):
         return self.size
 
     def __getitem__(self, idx):
-        return {key: self.data[key] for key in self.keys}
\ No newline at end of file
+        return {key: self.data[key] for key in self.keys}
diff --git a/applications/ColossalChat/examples/README.md b/applications/ColossalChat/examples/README.md
index 22c0c4f50..bdf4d23f1 100755
--- a/applications/ColossalChat/examples/README.md
+++ b/applications/ColossalChat/examples/README.md
@@ -761,8 +761,8 @@ For SFT, we recommend using zero2 or zero2-cpu for 7B model and tp is your model
 - zero2_cpu, micro batch size=8, VRAM Usage=19412.77 MB
 - zero2, micro batch size=8, VRAM Usage=43446.31 MB
 - zero2, micro batch size=16, VRAM Usage=58082.30 MB
-- zero2, micro batch size=8, lora_rank=8, VRAM Usage=21167.73 MB
-- zero2, micro batch size=8, lora_rank=32, VRAM Usage=21344.17 MB
+- zero2, micro batch size=8, lora_rank=8, VRAM Usage=21167.73 MB
+- zero2, micro batch size=8, lora_rank=32, VRAM Usage=21344.17 MB
 
 For PPO, we suggest using Tensor Parallelism. The following table shows the VRAM consumption of training a 7B model (llama2-7B-hf) on a dummy dataset with a sequence length of 2048 and a layout length of 512 with different tp_size (equal to the number of GPUs).
 | PPO | tp=8 | tp=4 |
diff --git a/applications/ColossalChat/examples/training_scripts/train_dpo.py b/applications/ColossalChat/examples/training_scripts/train_dpo.py
index 990c49a35..89d01d358 100755
--- a/applications/ColossalChat/examples/training_scripts/train_dpo.py
+++ b/applications/ColossalChat/examples/training_scripts/train_dpo.py
@@ -135,7 +135,7 @@ def train(args):
         # Note, for some models, lora may not be compatible with gradient checkpointing
         model.gradient_checkpointing_enable()
         coordinator.print_on_master(msg="Gradient checkpointing enabled successfully")
-
+
     # configure tokenizer
     tokenizer_dir = args.tokenizer_dir if args.tokenizer_dir is not None else args.pretrain
     tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir, use_fast=False, trust_remote_code=True)
diff --git a/applications/ColossalChat/examples/training_scripts/train_orpo.py b/applications/ColossalChat/examples/training_scripts/train_orpo.py
index 55976407a..65c280e18 100755
--- a/applications/ColossalChat/examples/training_scripts/train_orpo.py
+++ b/applications/ColossalChat/examples/training_scripts/train_orpo.py
@@ -122,7 +122,7 @@ def train(args):
         # Note, for some models, lora may not be compatible with gradient checkpointing
         model.gradient_checkpointing_enable()
         coordinator.print_on_master(msg="Gradient checkpointing enabled successfully")
-
+
     # configure tokenizer
     tokenizer_dir = args.tokenizer_dir if args.tokenizer_dir is not None else args.pretrain
     tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir, use_fast=False, trust_remote_code=True)
diff --git a/applications/ColossalChat/examples/training_scripts/train_sft.py b/applications/ColossalChat/examples/training_scripts/train_sft.py
index 892ab95f2..dc0481912 100755
--- a/applications/ColossalChat/examples/training_scripts/train_sft.py
+++ b/applications/ColossalChat/examples/training_scripts/train_sft.py
@@ -126,7 +126,7 @@ def train(args):
         # Note, for some models, lora may not be compatible with gradient checkpointing
         model.gradient_checkpointing_enable()
         coordinator.print_on_master(msg="Gradient checkpointing enabled successfully")
-
+
     # configure tokenizer
     tokenizer = AutoTokenizer.from_pretrained(
        args.tokenizer_dir or args.pretrain, use_fast=False, trust_remote_code=True
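For reference, the complete benchmarks/dummy_dataset.py after this patch reads roughly as sketched below. Only the imports, _generate_data, __len__, and __getitem__ appear in the hunks above; the body of __init__ is inferred from how the benchmarks use the class, so its attribute assignments are an assumption rather than a quote of the committed file.

import torch
from torch.utils.data import Dataset


class DummyLLMDataset(Dataset):
    def __init__(self, keys, seq_len, size=500):
        # Inferred from usage in the hunks above; not shown in this diff.
        self.keys = keys        # field names, e.g. "chosen_input_ids"
        self.seq_len = seq_len  # fixed length of every generated tensor
        self.size = size        # value reported by __len__
        self.data = self._generate_data()

    def _generate_data(self):
        # One all-ones tensor per key, built once and shared by all samples.
        data = {}
        for key in self.keys:
            data[key] = torch.ones(self.seq_len, dtype=torch.long)
        return data

    def __len__(self):
        return self.size

    def __getitem__(self, idx):
        # Every index returns the same synthetic sample.
        return {key: self.data[key] for key in self.keys}

As the DPO and ORPO hunks show, the benchmarks instantiate it as DummyLLMDataset(["chosen_input_ids", "chosen_loss_mask", "rejected_input_ids", "rejected_loss_mask"], args.max_length, args.dataset_size), which yields fixed-shape preference samples with no tokenization or disk I/O, so the benchmark measures training throughput rather than data loading.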