diff --git a/applications/ColossalChat/benchmarks/benchmark_dpo.py b/applications/ColossalChat/benchmarks/benchmark_dpo.py new file mode 100755 index 000000000..5b9d76c99 --- /dev/null +++ b/applications/ColossalChat/benchmarks/benchmark_dpo.py @@ -0,0 +1,337 @@ +import argparse +import json +import os +import resource +from contextlib import nullcontext + +import torch +from coati.dataset import DataCollatorForPreferenceDataset, StatefulDistributedSampler +from coati.models import convert_to_lora_module, disable_dropout +from coati.trainer import DPOTrainer +from coati.utils import load_checkpoint +from transformers import AutoModelForCausalLM, AutoTokenizer + +import colossalai +from colossalai.booster import Booster +from colossalai.booster.plugin import GeminiPlugin, HybridParallelPlugin, LowLevelZeroPlugin, TorchDDPPlugin +from colossalai.cluster import DistCoordinator +from colossalai.logging import get_dist_logger +from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR +from colossalai.nn.optimizer import HybridAdam +from dummy_dataset import DummyLLMDataset + +logger = get_dist_logger() + + +def train(args): + # check lora compatibility + if "gemini" in args.plugin and args.lora_rank > 0: + raise ValueError("LoRA is not supported in GeminiPlugin. Please use another plugin") + if args.plugin == "gemini_auto" and args.accumulation_steps > 1: + raise ValueError("Gradient accumulation is not supported with the gemini_auto plugin. Please use another plugin") + + # ============================== + # Initialize Distributed Training + # ============================== + colossalai.launch_from_torch() + coordinator = DistCoordinator() + + # ============================== + # Initialize Booster + # ============================== + if args.plugin == "ddp": + """ + Default torch ddp plugin without any acceleration, + for debugging purposes + """ + plugin = TorchDDPPlugin(find_unused_parameters=True) + elif args.plugin == "gemini": + plugin = GeminiPlugin( + precision=args.mixed_precision, + placement_policy="static", + initial_scale=2**16, + max_norm=args.grad_clip, + enable_gradient_accumulation=True, + enable_flash_attention=args.use_flash_attn, + ) + elif args.plugin == "gemini_auto": + plugin = GeminiPlugin( + precision=args.mixed_precision, + placement_policy="auto", + initial_scale=2**16, + max_norm=args.grad_clip, + enable_flash_attention=args.use_flash_attn, + ) + elif args.plugin == "zero2": + plugin = LowLevelZeroPlugin( + stage=2, + precision=args.mixed_precision, + initial_scale=2**16, + max_norm=args.grad_clip, + ) + elif args.plugin == "zero2_cpu": + plugin = LowLevelZeroPlugin( + stage=2, + precision=args.mixed_precision, + initial_scale=2**16, + cpu_offload=True, + max_norm=args.grad_clip, + ) + elif args.plugin == "3d": + plugin = HybridParallelPlugin( + tp_size=args.tp, + pp_size=args.pp, + sp_size=args.sp, + sequence_parallelism_mode=args.sp_mode, + zero_stage=args.zero_stage, + enable_flash_attention=args.use_flash_attn, + enable_sequence_parallelism=args.enable_sequence_parallelism, + cpu_offload=True if args.zero_stage >= 1 and args.zero_cpu_offload else False, + parallel_output=False, + max_norm=args.grad_clip, + precision=args.mixed_precision, + ) + else: + raise ValueError(f"Unknown plugin {args.plugin}") + + booster = Booster(plugin=plugin) + ref_booster = Booster(plugin=plugin) + + # ====================================================== + # Initialize Model, Objective, Optimizer and LR Scheduler + # 
====================================================== + # Temp Fix: Disable lazy init due to version conflict + # init_ctx = ( + # LazyInitContext(default_device=get_current_device()) if isinstance(plugin, (GeminiPlugin,)) else nullcontext() + # ) + + init_ctx = nullcontext() + with init_ctx: + if args.use_flash_attn: + model = AutoModelForCausalLM.from_pretrained( + args.pretrain, + torch_dtype=torch.bfloat16 if args.mixed_precision == "bf16" else torch.float16, + use_flash_attention_2=True, + ) + coordinator.print_on_master(msg="Flash-attention enabled successfully") + else: + model = AutoModelForCausalLM.from_pretrained(args.pretrain) + disable_dropout(model) + if not args.disable_reference_model: + if args.use_flash_attn: + ref_model = AutoModelForCausalLM.from_pretrained( + args.pretrain, + torch_dtype=torch.bfloat16 if args.mixed_precision == "bf16" else torch.float16, + use_flash_attention_2=True, + ) + else: + ref_model = AutoModelForCausalLM.from_pretrained(args.pretrain) + disable_dropout(ref_model) + else: + ref_model = None + if args.lora_rank > 0: + model = convert_to_lora_module(model, args.lora_rank, lora_train_bias=args.lora_train_bias) + + if args.grad_checkpoint: + # Note: for some models, LoRA may not be compatible with gradient checkpointing + model.gradient_checkpointing_enable() + coordinator.print_on_master(msg="Gradient checkpointing enabled successfully") + + # configure tokenizer + tokenizer_dir = args.tokenizer_dir if args.tokenizer_dir is not None else args.pretrain + tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir, use_fast=False, trust_remote_code=True) + if hasattr(tokenizer, "pad_token") and hasattr(tokenizer, "eos_token") and tokenizer.eos_token is not None: + try: + # Some tokenizers don't allow the pad_token to be set manually, e.g., Qwen + tokenizer.pad_token = tokenizer.eos_token + except AttributeError as e: + logger.warning(f"Unable to set pad token to eos token, {str(e)}") + if not hasattr(tokenizer, "pad_token") or tokenizer.pad_token is None: + logger.warning( + "The tokenizer does not have a pad token, which is required. This may lead to unintended behavior during training; please consider setting one manually." 
+ ) + + tokenizer.add_bos_token = False + tokenizer.add_eos_token = False + + # configure optimizer + optim = HybridAdam( + model_params=model.parameters(), + lr=args.lr, + betas=(0.9, 0.95), + weight_decay=args.weight_decay, + adamw_mode=True, + ) + + # configure dataset + train_dataset = DummyLLMDataset( + ["chosen_input_ids", "chosen_loss_mask", "rejected_input_ids", "rejected_loss_mask"], + args.max_length, args.dataset_size) + data_collator = DataCollatorForPreferenceDataset(tokenizer=tokenizer, max_length=args.max_length) + + train_dataloader = plugin.prepare_dataloader( + dataset=train_dataset, + batch_size=args.batch_size, + shuffle=True, + drop_last=True, + collate_fn=data_collator, + distributed_sampler_cls=StatefulDistributedSampler, + ) + + num_update_steps_per_epoch = len(train_dataloader) // args.accumulation_steps + if args.warmup_steps is None: + args.warmup_steps = int(args.max_epochs * 0.025 * (len(train_dataloader) // args.accumulation_steps)) + coordinator.print_on_master(f"Warmup steps set to {args.warmup_steps}") + + lr_scheduler = CosineAnnealingWarmupLR( + optimizer=optim, + total_steps=args.max_epochs * num_update_steps_per_epoch, + warmup_steps=args.warmup_steps, + eta_min=0.1 * args.lr, + ) + + default_dtype = torch.float16 if args.mixed_precision == "fp16" else torch.bfloat16 + torch.set_default_dtype(default_dtype) + model, optim, _, train_dataloader, lr_scheduler = booster.boost( + model=model, + optimizer=optim, + lr_scheduler=lr_scheduler, + dataloader=train_dataloader, + ) + if ref_model is not None: + ref_model, _, _, _, _ = ref_booster.boost(model=ref_model, dataloader=train_dataloader) + torch.set_default_dtype(torch.float) + + coordinator.print_on_master(f"Booster init max CUDA memory: {torch.cuda.max_memory_allocated() / 1024 ** 2:.2f} MB") + coordinator.print_on_master( + f"Booster init max CPU memory: {resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024:.2f} MB" + ) + + start_epoch = 0 + sampler_start_idx = 0 + start_step = 0 + if args.checkpoint_path is not None: + if "modeling" in args.checkpoint_path: + coordinator.print_on_master(f"Continued pretrain from checkpoint {args.checkpoint_path}") + booster.load_model(model, args.checkpoint_path) + else: + coordinator.print_on_master(f"Load model checkpoint from {args.checkpoint_path}") + start_epoch, start_step, sampler_start_idx = load_checkpoint( + load_dir=args.checkpoint_path, + booster=booster, + model=model, + optimizer=optim, + lr_scheduler=lr_scheduler, + ) + assert isinstance(train_dataloader.sampler, StatefulDistributedSampler) + train_dataloader.sampler.set_start_index(start_index=sampler_start_idx) + + coordinator.print_on_master( + f"Loaded checkpoint {args.checkpoint_path} at epoch {start_epoch} step {start_step}" + ) + coordinator.print_on_master(f"Loaded sample at index {sampler_start_idx}") + + coordinator.print_on_master( + f"Checkpoint loaded max CUDA memory: {torch.cuda.max_memory_allocated() / 1024 ** 2:.2f} MB" + ) + coordinator.print_on_master( + f"Checkpoint loaded CUDA memory: {torch.cuda.memory_allocated() / 1024 ** 2:.2f} MB" + ) + coordinator.print_on_master( + f"Checkpoint loaded max CPU memory: {resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024:.2f} MB" + ) + + trainer = DPOTrainer( + actor=model, + ref_model=ref_model, + booster=booster, + actor_optim=optim, + actor_lr_scheduler=lr_scheduler, + tokenizer=tokenizer, + max_epochs=args.max_epochs, + 
accumulation_steps=args.accumulation_steps, + start_epoch=start_epoch, + save_interval=None, + save_dir=None, + coordinator=coordinator, + beta=args.beta, + gamma=args.gamma, + length_normalization=args.length_normalization, + ) + + trainer.fit( + train_preference_dataloader=train_dataloader, + eval_preference_dataloader=None, + log_dir=None, + use_wandb=False, + ) + coordinator.print_on_master(f"Max CUDA memory usage: {torch.cuda.max_memory_allocated()/1024**2:.2f} MB") + + +if __name__ == "__main__": + # ============================== + # Parse Arguments + # ============================== + parser = argparse.ArgumentParser() + parser.add_argument( + "--plugin", + type=str, + default="gemini", + choices=["gemini", "gemini_auto", "zero2", "zero2_cpu", "3d", "ddp"], + help="Choose which plugin to use", + ) + parser.add_argument("--grad_clip", type=float, default=1.0, help="Gradient clipping value") + parser.add_argument("--weight_decay", type=float, default=0.1, help="Weight decay") + parser.add_argument("--warmup_steps", type=int, default=None, help="Warmup steps") + parser.add_argument("--tp", type=int, default=1) + parser.add_argument("--pp", type=int, default=1) + parser.add_argument("--sp", type=int, default=1) + parser.add_argument("--loss_type", type=str, default="dpo_loss", help="dpo_loss or simpo_loss") + parser.add_argument("--beta", type=float, default=0.1, help="beta in DPO loss") + parser.add_argument("--gamma", type=float, default=0.0, help="gamma in SimPO loss") + parser.add_argument("--length_normalization", default=False, action="store_true") + parser.add_argument("--enable_sequence_parallelism", default=False, action="store_true") + parser.add_argument("--zero_stage", type=int, default=0, help="Zero stage", choices=[0, 1, 2]) + parser.add_argument("--zero_cpu_offload", default=False, action="store_true") + parser.add_argument("--sp_mode", type=str, default="split_gather", choices=["split_gather", "ring", "all_to_all"]) + parser.add_argument("--pretrain", type=str, default=None) + parser.add_argument("--model_type", type=str, default=None) + parser.add_argument("--tokenizer_dir", type=str, default=None) + parser.add_argument( + "--checkpoint_path", type=str, default=None, help="Checkpoint path if resuming training from a checkpoint" + ) + parser.add_argument("--config_file", type=str, default="config_file", help="Config file") + parser.add_argument("--max_length", type=int, default=2048, help="Model max length") + parser.add_argument("--max_epochs", type=int, default=3) + parser.add_argument("--batch_size", type=int, default=4) + parser.add_argument("--dataset_size", type=int, default=500) + parser.add_argument( + "--disable_reference_model", + action="store_true", + default=False, + help="Disable the reference model (enabled by default)", + ) + parser.add_argument("--mixed_precision", type=str, default="fp16", choices=["fp16", "bf16"], help="Mixed precision") + parser.add_argument("--lora_rank", type=int, default=0, help="low-rank adaptation matrices rank") + parser.add_argument( + "--lora_train_bias", + type=str, + default="none", + help="'none' means it doesn't train biases. 'all' means it trains all biases. 
'lora_only' means it only trains biases of LoRA layers", + ) + parser.add_argument("--merge_lora_weights", type=bool, default=True) + parser.add_argument("--lr", type=float, default=5e-6) + parser.add_argument("--accumulation_steps", type=int, default=8) + parser.add_argument("--grad_checkpoint", default=False, action="store_true") + parser.add_argument("--use_flash_attn", default=False, action="store_true") + args = parser.parse_args() + + # foolproof hyperparameter setup + if args.loss_type == "simpo_loss": + args.length_normalization = True + args.gamma = args.gamma if args.gamma > 0 else 1.4 + + os.makedirs(os.path.dirname(args.config_file) or ".", exist_ok=True) + with open(args.config_file, "w") as f: + json.dump(args.__dict__, f, indent=4) + train(args) diff --git a/applications/ColossalChat/benchmarks/benchmark_dpo.sh b/applications/ColossalChat/benchmarks/benchmark_dpo.sh new file mode 100755 index 000000000..cc6364675 --- /dev/null +++ b/applications/ColossalChat/benchmarks/benchmark_dpo.sh @@ -0,0 +1,49 @@ +#!/bin/bash +set_n_least_used_CUDA_VISIBLE_DEVICES() { + local n=${1:-"9999"} + echo "GPU Memory Usage:" + local FIRST_N_GPU_IDS=$(nvidia-smi --query-gpu=memory.used --format=csv | + tail -n +2 | + nl -v 0 | + tee /dev/tty | + sort -g -k 2 | + awk '{print $1}' | + head -n $n) + export CUDA_VISIBLE_DEVICES=$(echo $FIRST_N_GPU_IDS | sed 's/ /,/g') + echo "Now CUDA_VISIBLE_DEVICES is set to:" + echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES" +} +set_n_least_used_CUDA_VISIBLE_DEVICES 4 + +PROJECT_NAME="dpo" +PARENT_CONFIG_FILE="./benchmark_config" # Path prefix for saving training config logs +PRETRAINED_MODEL_PATH="/root/commonData/Llama-2-7b-hf" # huggingface or local model path +PRETRAINED_TOKENIZER_PATH="/root/commonData/Llama-2-7b-hf" # huggingface or local tokenizer path + +TIMESTAMP=$(date +%Y-%m-%d-%H-%M-%S) +FULL_PROJECT_NAME="${PROJECT_NAME}-${TIMESTAMP}" +CONFIG_FILE="${PARENT_CONFIG_FILE}-${FULL_PROJECT_NAME}.json" + +echo $(which colossalai) +echo $(which python) +colossalai run --nproc_per_node 4 --master_port 31313 benchmark_dpo.py \ + --pretrain $PRETRAINED_MODEL_PATH \ + --tokenizer_dir $PRETRAINED_TOKENIZER_PATH \ + --config_file $CONFIG_FILE \ + --plugin "zero2_cpu" \ + --max_epochs 1 \ + --accumulation_steps 1 \ + --batch_size 8 \ + --lr 1e-6 \ + --beta 0.1 \ + --gamma 0.6 \ + --mixed_precision "bf16" \ + --grad_clip 1.0 \ + --max_length 2048 \ + --dataset_size 640 \ + --weight_decay 0.01 \ + --warmup_steps 60 \ + --disable_reference_model \ + --length_normalization \ + --grad_checkpoint \ + --use_flash_attn diff --git a/applications/ColossalChat/benchmarks/benchmark_orpo.py b/applications/ColossalChat/benchmarks/benchmark_orpo.py new file mode 100755 index 000000000..f974d1169 --- /dev/null +++ b/applications/ColossalChat/benchmarks/benchmark_orpo.py @@ -0,0 +1,312 @@ +import argparse +import json +import os +import resource +from contextlib import nullcontext + +import torch +from coati.dataset import DataCollatorForPreferenceDataset, StatefulDistributedSampler +from coati.models import convert_to_lora_module, disable_dropout +from coati.trainer import ORPOTrainer +from coati.utils import load_checkpoint +from transformers import AutoModelForCausalLM, AutoTokenizer + +import colossalai +from colossalai.booster import Booster +from colossalai.booster.plugin import GeminiPlugin, HybridParallelPlugin, LowLevelZeroPlugin, TorchDDPPlugin +from colossalai.cluster import DistCoordinator +from 
colossalai.logging import get_dist_logger +from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR +from colossalai.nn.optimizer import HybridAdam +from dummy_dataset import DummyLLMDataset + +logger = get_dist_logger() + + +def train(args): + # check lora compatibility + if "gemini" in args.plugin and args.lora_rank > 0: + raise ValueError("LoRA is not supported in GeminiPlugin. Please use another plugin") + if args.plugin == "gemini_auto" and args.accumulation_steps > 1: + raise ValueError("Gradient accumulation is not supported with the gemini_auto plugin. Please use another plugin") + + # ============================== + # Initialize Distributed Training + # ============================== + colossalai.launch_from_torch() + coordinator = DistCoordinator() + + # ============================== + # Initialize Booster + # ============================== + if args.plugin == "ddp": + """ + Default torch ddp plugin without any acceleration, + for debugging purposes + """ + plugin = TorchDDPPlugin(find_unused_parameters=True) + elif args.plugin == "gemini": + plugin = GeminiPlugin( + precision=args.mixed_precision, + placement_policy="static", + initial_scale=2**16, + max_norm=args.grad_clip, + enable_gradient_accumulation=True, + enable_flash_attention=args.use_flash_attn, + ) + elif args.plugin == "gemini_auto": + plugin = GeminiPlugin( + precision=args.mixed_precision, + placement_policy="auto", + initial_scale=2**16, + max_norm=args.grad_clip, + enable_flash_attention=args.use_flash_attn, + ) + elif args.plugin == "zero2": + plugin = LowLevelZeroPlugin( + stage=2, + precision=args.mixed_precision, + initial_scale=2**16, + max_norm=args.grad_clip, + ) + elif args.plugin == "zero2_cpu": + plugin = LowLevelZeroPlugin( + stage=2, + precision=args.mixed_precision, + initial_scale=2**16, + cpu_offload=True, + max_norm=args.grad_clip, + ) + elif args.plugin == "3d": + plugin = HybridParallelPlugin( + tp_size=args.tp, + pp_size=args.pp, + sp_size=args.sp, + sequence_parallelism_mode=args.sp_mode, + zero_stage=args.zero_stage, + enable_flash_attention=args.use_flash_attn, + enable_sequence_parallelism=args.enable_sequence_parallelism, + cpu_offload=True if args.zero_stage >= 1 and args.zero_cpu_offload else False, + parallel_output=False, + max_norm=args.grad_clip, + precision=args.mixed_precision, + ) + else: + raise ValueError(f"Unknown plugin {args.plugin}") + + booster = Booster(plugin=plugin) + + # ====================================================== + # Initialize Model, Objective, Optimizer and LR Scheduler + # ====================================================== + # Temp Fix: Disable lazy init due to version conflict + # init_ctx = ( + # LazyInitContext(default_device=get_current_device()) if isinstance(plugin, (GeminiPlugin,)) else nullcontext() + # ) + + init_ctx = nullcontext() + with init_ctx: + if args.use_flash_attn: + model = AutoModelForCausalLM.from_pretrained( + args.pretrain, + torch_dtype=torch.bfloat16 if args.mixed_precision == "bf16" else torch.float16, + use_flash_attention_2=True, + ) + coordinator.print_on_master(msg="Flash-attention enabled successfully") + else: + model = AutoModelForCausalLM.from_pretrained(args.pretrain) + disable_dropout(model) + if args.lora_rank > 0: + model = convert_to_lora_module(model, args.lora_rank, lora_train_bias=args.lora_train_bias) + + if args.grad_checkpoint: + # Note: for some models, LoRA may not be compatible with gradient checkpointing + model.gradient_checkpointing_enable() + 
coordinator.print_on_master(msg="Gradient checkpointing enabled successfully") + + # configure tokenizer + tokenizer_dir = args.tokenizer_dir if args.tokenizer_dir is not None else args.pretrain + tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir, use_fast=False, trust_remote_code=True) + if hasattr(tokenizer, "pad_token") and hasattr(tokenizer, "eos_token") and tokenizer.eos_token is not None: + try: + # Some tokenizers don't allow the pad_token to be set manually, e.g., Qwen + tokenizer.pad_token = tokenizer.eos_token + except AttributeError as e: + logger.warning(f"Unable to set pad token to eos token, {str(e)}") + if not hasattr(tokenizer, "pad_token") or tokenizer.pad_token is None: + logger.warning( + "The tokenizer does not have a pad token, which is required. This may lead to unintended behavior during training; please consider setting one manually." + ) + + tokenizer.add_bos_token = False + tokenizer.add_eos_token = False + + # configure optimizer + optim = HybridAdam( + model_params=model.parameters(), + lr=args.lr, + betas=(0.9, 0.95), + weight_decay=args.weight_decay, + adamw_mode=True, + ) + + # configure dataset + coordinator.print_on_master(f"Using a dummy dataset of size {args.dataset_size} for benchmarking") + train_dataset = DummyLLMDataset( + ["chosen_input_ids", "chosen_loss_mask", "rejected_input_ids", "rejected_loss_mask"], + args.max_length, args.dataset_size) + data_collator = DataCollatorForPreferenceDataset(tokenizer=tokenizer, max_length=args.max_length) + + train_dataloader = plugin.prepare_dataloader( + dataset=train_dataset, + batch_size=args.batch_size, + shuffle=True, + drop_last=True, + collate_fn=data_collator, + distributed_sampler_cls=StatefulDistributedSampler, + ) + + num_update_steps_per_epoch = len(train_dataloader) // args.accumulation_steps + if args.warmup_steps is None: + args.warmup_steps = int(args.max_epochs * 0.025 * (len(train_dataloader) // args.accumulation_steps)) + coordinator.print_on_master(f"Warmup steps set to {args.warmup_steps}") + + lr_scheduler = CosineAnnealingWarmupLR( + optimizer=optim, + total_steps=args.max_epochs * num_update_steps_per_epoch, + warmup_steps=args.warmup_steps, + eta_min=0.1 * args.lr, + ) + + default_dtype = torch.float16 if args.mixed_precision == "fp16" else torch.bfloat16 + torch.set_default_dtype(default_dtype) + model, optim, _, train_dataloader, lr_scheduler = booster.boost( + model=model, + optimizer=optim, + lr_scheduler=lr_scheduler, + dataloader=train_dataloader, + ) + torch.set_default_dtype(torch.float) + + coordinator.print_on_master(f"Booster init max CUDA memory: {torch.cuda.max_memory_allocated() / 1024 ** 2:.2f} MB") + coordinator.print_on_master( + f"Booster init max CPU memory: {resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024:.2f} MB" + ) + + start_epoch = 0 + sampler_start_idx = 0 + start_step = 0 + if args.checkpoint_path is not None: + if "modeling" in args.checkpoint_path: + coordinator.print_on_master(f"Continued pretrain from checkpoint {args.checkpoint_path}") + booster.load_model(model, args.checkpoint_path) + else: + coordinator.print_on_master(f"Load model checkpoint from {args.checkpoint_path}") + start_epoch, start_step, sampler_start_idx = load_checkpoint( + load_dir=args.checkpoint_path, + booster=booster, + model=model, + optimizer=optim, + lr_scheduler=lr_scheduler, + ) + assert isinstance(train_dataloader.sampler, StatefulDistributedSampler) + train_dataloader.sampler.set_start_index(start_index=sampler_start_idx) + + 
coordinator.print_on_master( + f"Loaded checkpoint {args.checkpoint_path} at epoch {start_epoch} step {start_step}" + ) + coordinator.print_on_master(f"Loaded sample at index {sampler_start_idx}") + + coordinator.print_on_master( + f"Checkpoint loaded max CUDA memory: {torch.cuda.max_memory_allocated() / 1024 ** 2:.2f} MB" + ) + coordinator.print_on_master( + f"Checkpoint loaded CUDA memory: {torch.cuda.memory_allocated() / 1024 ** 2:.2f} MB" + ) + coordinator.print_on_master( + f"Checkpoint loaded max CPU memory: {resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024:.2f} MB" + ) + + trainer = ORPOTrainer( + actor=model, + booster=booster, + actor_optim=optim, + actor_lr_scheduler=lr_scheduler, + tokenizer=tokenizer, + max_epochs=args.max_epochs, + accumulation_steps=args.accumulation_steps, + start_epoch=start_epoch, + save_interval=None, + save_dir=None, + coordinator=coordinator, + lam=args.lam, + ) + + trainer.fit( + train_preference_dataloader=train_dataloader, + eval_preference_dataloader=None, + log_dir=None, + use_wandb=False, + ) + coordinator.print_on_master(f"Max CUDA memory usage: {torch.cuda.max_memory_allocated()/1024**2:.2f} MB") + + +if __name__ == "__main__": + # ============================== + # Parse Arguments + # ============================== + parser = argparse.ArgumentParser() + parser.add_argument( + "--plugin", + type=str, + default="gemini", + choices=["gemini", "gemini_auto", "zero2", "zero2_cpu", "3d", "ddp"], + help="Choose which plugin to use", + ) + parser.add_argument("--grad_clip", type=float, default=1.0, help="Gradient clipping value") + parser.add_argument("--weight_decay", type=float, default=0.1, help="Weight decay") + parser.add_argument("--warmup_steps", type=int, default=None, help="Warmup steps") + parser.add_argument("--tp", type=int, default=1) + parser.add_argument("--pp", type=int, default=1) + parser.add_argument("--sp", type=int, default=1) + parser.add_argument("--lam", type=float, default=0.1, help="lambda in ORPO loss") + parser.add_argument("--enable_sequence_parallelism", default=False, action="store_true") + parser.add_argument("--zero_stage", type=int, default=0, help="Zero stage", choices=[0, 1, 2]) + parser.add_argument("--zero_cpu_offload", default=False, action="store_true") + parser.add_argument("--sp_mode", type=str, default="split_gather", choices=["split_gather", "ring", "all_to_all"]) + parser.add_argument("--pretrain", type=str, default=None) + parser.add_argument("--model_type", type=str, default=None) + parser.add_argument("--tokenizer_dir", type=str, default=None) + parser.add_argument("--dataset", nargs="+", default=[]) + parser.add_argument( + "--checkpoint_path", type=str, default=None, help="Checkpoint path if resuming training from a checkpoint" + ) + parser.add_argument("--config_file", type=str, default="config_file", help="Config file") + parser.add_argument("--max_length", type=int, default=2048, help="Model max length") + parser.add_argument("--max_epochs", type=int, default=3) + parser.add_argument("--batch_size", type=int, default=4) + parser.add_argument( + "--disable_reference_model", + action="store_true", + default=False, + help="Disable the reference model (enabled by default)", + ) + parser.add_argument("--dataset_size", type=int, default=500) + parser.add_argument("--mixed_precision", type=str, default="fp16", choices=["fp16", "bf16"], help="Mixed precision") + parser.add_argument("--lora_rank", type=int, default=0, help="low-rank adaptation matrices rank") + parser.add_argument( + 
"--lora_train_bias", + type=str, + default="none", + help="'none' means it doesn't train biases. 'all' means it trains all biases. 'lora_only' means it only trains biases of LoRA layers", + ) + parser.add_argument("--merge_lora_weights", type=bool, default=True) + parser.add_argument("--lr", type=float, default=5e-6) + parser.add_argument("--accumulation_steps", type=int, default=8) + parser.add_argument("--grad_checkpoint", default=False, action="store_true") + parser.add_argument("--use_flash_attn", default=False, action="store_true") + args = parser.parse_args() + os.makedirs(os.path.dirname(args.config_file), exist_ok=True) + with open(args.config_file, "w") as f: + json.dump(args.__dict__, f, indent=4) + train(args) diff --git a/applications/ColossalChat/benchmarks/benchmark_orpo.sh b/applications/ColossalChat/benchmarks/benchmark_orpo.sh new file mode 100755 index 000000000..2139004df --- /dev/null +++ b/applications/ColossalChat/benchmarks/benchmark_orpo.sh @@ -0,0 +1,44 @@ +#!/bin/bash +set_n_least_used_CUDA_VISIBLE_DEVICES() { + local n=${1:-"9999"} + echo "GPU Memory Usage:" + local FIRST_N_GPU_IDS=$(nvidia-smi --query-gpu=memory.used --format=csv | + tail -n +2 | + nl -v 0 | + tee /dev/tty | + sort -g -k 2 | + awk '{print $1}' | + head -n $n) + export CUDA_VISIBLE_DEVICES=$(echo $FIRST_N_GPU_IDS | sed 's/ /,/g') + echo "Now CUDA_VISIBLE_DEVICES is set to:" + echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES" +} +set_n_least_used_CUDA_VISIBLE_DEVICES 2 + +PROJECT_NAME="dpo" +PARENT_CONFIG_FILE="./benchmark_config" # Path to a folder to save training config logs +PRETRAINED_MODEL_PATH="/root/commonData/Llama-2-7b-hf" # huggingface or local model path +PRETRAINED_TOKENIZER_PATH="/root/commonData/Llama-2-7b-hf" # huggingface or local tokenizer path + +TIMESTAMP=$(date +%Y-%m-%d-%H-%M-%S) +FULL_PROJECT_NAME="${PROJECT_NAME}-${TIMESTAMP}" +CONFIG_FILE="${PARENT_CONFIG_FILE}-${FULL_PROJECT_NAME}.json" + +colossalai run --nproc_per_node 2 --master_port 31313 benchmark_orpo.py \ + --pretrain $PRETRAINED_MODEL_PATH \ + --tokenizer_dir $PRETRAINED_TOKENIZER_PATH \ + --plugin "zero2" \ + --config_file $CONFIG_FILE \ + --max_epochs 1 \ + --accumulation_steps 1 \ + --batch_size 4 \ + --lr 8e-6 \ + --lam 0.5 \ + --mixed_precision "bf16" \ + --grad_clip 1.0 \ + --max_length 2048 \ + --weight_decay 0.01 \ + --warmup_steps 60 \ + --dataset_size 160 \ + --grad_checkpoint \ + --use_flash_attn diff --git a/applications/ColossalChat/benchmarks/benchmark_sft.py b/applications/ColossalChat/benchmarks/benchmark_sft.py new file mode 100644 index 000000000..f991dc938 --- /dev/null +++ b/applications/ColossalChat/benchmarks/benchmark_sft.py @@ -0,0 +1,315 @@ +import argparse +import json +import math +import os +import resource +from contextlib import nullcontext + +import torch +from coati.dataset import DataCollatorForSupervisedDataset, StatefulDistributedSampler, load_tokenized_dataset +from coati.models import convert_to_lora_module +from coati.trainer import SFTTrainer +from coati.utils import load_checkpoint +from transformers import AutoModelForCausalLM, AutoTokenizer + +import colossalai +from colossalai.booster import Booster +from colossalai.booster.plugin import GeminiPlugin, HybridParallelPlugin, LowLevelZeroPlugin, TorchDDPPlugin +from colossalai.cluster import DistCoordinator +from colossalai.logging import get_dist_logger +from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR +from colossalai.nn.optimizer import HybridAdam +from dummy_dataset import DummyLLMDataset + +logger = 
get_dist_logger() + + +def train(args): + # check lora compatibility + if "gemini" in args.plugin and args.lora_rank > 0: + raise ValueError("LoRA is not supported in GeminiPlugin. Please use another plugin") + if args.plugin == "gemini_auto" and args.accumulation_steps > 1: + raise ValueError("Gradient accumulation is not supported with the gemini_auto plugin. Please use another plugin") + # ============================== + # Initialize Distributed Training + # ============================== + colossalai.launch_from_torch() + coordinator = DistCoordinator() + + # ============================== + # Initialize Booster + # ============================== + init_ctx = nullcontext() + with init_ctx: + if args.use_flash_attn: + model = AutoModelForCausalLM.from_pretrained( + args.pretrain, + torch_dtype=torch.bfloat16 if args.mixed_precision == "bf16" else torch.float16, + attn_implementation="flash_attention_2", + trust_remote_code=True, + ) + else: + model = AutoModelForCausalLM.from_pretrained( + args.pretrain, + torch_dtype=torch.bfloat16 if args.mixed_precision == "bf16" else torch.float16, + trust_remote_code=True, + ) + if args.lora_rank > 0: + model = convert_to_lora_module(model, args.lora_rank, lora_train_bias=args.lora_train_bias) + + if args.plugin == "ddp": + """ + Default torch ddp plugin without any acceleration, + for debugging purposes + """ + plugin = TorchDDPPlugin(find_unused_parameters=True) + elif args.plugin == "gemini": + plugin = GeminiPlugin( + precision=args.mixed_precision, + placement_policy="static", + initial_scale=2**16, + max_norm=args.grad_clip, + enable_gradient_accumulation=True if args.accumulation_steps > 1 else False, + enable_flash_attention=args.use_flash_attn, + ) + elif args.plugin == "gemini_auto": + plugin = GeminiPlugin( + precision=args.mixed_precision, + placement_policy="auto", + initial_scale=2**16, + max_norm=args.grad_clip, + enable_flash_attention=args.use_flash_attn, + ) + elif args.plugin == "zero2": + plugin = LowLevelZeroPlugin( + stage=2, + precision=args.mixed_precision, + initial_scale=2**16, + max_norm=args.grad_clip, + ) + elif args.plugin == "zero2_cpu": + plugin = LowLevelZeroPlugin( + stage=2, + precision=args.mixed_precision, + initial_scale=2**16, + cpu_offload=True, + max_norm=args.grad_clip, + ) + elif args.plugin == "3d": + plugin = HybridParallelPlugin( + tp_size=args.tp, + pp_size=args.pp, + sp_size=args.sp, + sequence_parallelism_mode=args.sp_mode, + zero_stage=args.zero_stage, + enable_flash_attention=args.use_flash_attn, + enable_sequence_parallelism=args.enable_sequence_parallelism, + cpu_offload=True if args.zero_stage >= 1 and args.zero_cpu_offload else False, + parallel_output=False, + max_norm=args.grad_clip, + precision=args.mixed_precision, + microbatch_size=args.batch_size, + ) + else: + raise ValueError(f"Unknown plugin {args.plugin}") + + booster = Booster(plugin=plugin) + + # ====================================================== + # Initialize Model, Objective, Optimizer and LR Scheduler + # ====================================================== + # Temp Fix: Disable lazy init due to version conflict + # init_ctx = ( + # LazyInitContext(default_device=get_current_device()) if isinstance(plugin, (GeminiPlugin,)) else nullcontext() + # ) + + if args.grad_checkpoint: + # Note: for some models, LoRA may not be compatible with gradient checkpointing + model.gradient_checkpointing_enable() + coordinator.print_on_master(msg="Gradient checkpointing enabled successfully") + + # configure tokenizer + 
tokenizer = AutoTokenizer.from_pretrained( + args.tokenizer_dir or args.pretrain, use_fast=False, trust_remote_code=True + ) + if hasattr(tokenizer, "pad_token") and hasattr(tokenizer, "eos_token") and tokenizer.eos_token is not None: + try: + # Some tokenizers don't allow the pad_token to be set manually, e.g., Qwen + tokenizer.pad_token = tokenizer.eos_token + except AttributeError as e: + logger.warning(f"Unable to set pad token to eos token, {str(e)}") + if not hasattr(tokenizer, "pad_token") or tokenizer.pad_token is None: + logger.warning( + "The tokenizer does not have a pad token, which is required. This may lead to unintended behavior during training; please consider setting one manually." + ) + + tokenizer.add_bos_token = False + tokenizer.add_eos_token = False + tokenizer.padding_side = "right" + + coordinator.print_on_master(f"Configuration file will be saved at: {args.config_file}") + + # configure optimizer + optim = HybridAdam( + model_params=model.parameters(), + lr=args.lr, + betas=(0.9, 0.95), + weight_decay=args.weight_decay, + adamw_mode=True, + ) + + # configure dataset + coordinator.print_on_master( + f"Max CUDA memory before data loader: {torch.cuda.max_memory_allocated() / 1024 ** 2:.2f} MB" + ) + dataset = DummyLLMDataset(["input_ids", "attention_mask", "labels"], args.max_len, args.dataset_size) + data_collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer, max_length=args.max_len) + + train_dataloader = plugin.prepare_dataloader( + dataset=dataset, + batch_size=args.batch_size, + shuffle=True, + drop_last=True, + collate_fn=data_collator, + distributed_sampler_cls=StatefulDistributedSampler, + ) + coordinator.print_on_master( + f"Max CUDA memory after data loader: {torch.cuda.max_memory_allocated() / 1024 ** 2:.2f} MB" + ) + + num_update_steps_per_epoch = len(train_dataloader) // args.accumulation_steps + total_steps = math.ceil(args.max_epochs * num_update_steps_per_epoch) + + if args.warmup_steps is None: + args.warmup_steps = int(args.max_epochs * 0.025 * (len(train_dataloader) // args.accumulation_steps)) + coordinator.print_on_master(f"Warmup steps set to {args.warmup_steps}") + + lr_scheduler = CosineAnnealingWarmupLR( + optimizer=optim, + total_steps=total_steps, + warmup_steps=args.warmup_steps, + eta_min=0.1 * args.lr, + ) + + # Flash attention will be disabled because it does NOT support fp32. 
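+    # Set the default dtype before boosting so tensors created during booster.boost follow the chosen mixed precision; it is reset to torch.float right after.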
+ default_dtype = torch.float16 if args.mixed_precision == "fp16" else torch.bfloat16 + torch.set_default_dtype(default_dtype) + model, optim, _, train_dataloader, lr_scheduler = booster.boost( + model=model, + optimizer=optim, + lr_scheduler=lr_scheduler, + dataloader=train_dataloader, + ) + torch.set_default_dtype(torch.float) + + coordinator.print_on_master(f"Booster init max CUDA memory: {torch.cuda.max_memory_allocated() / 1024 ** 2:.2f} MB") + coordinator.print_on_master( + f"Booster init max CPU memory: {resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024:.2f} MB" + ) + + start_epoch = 0 + sampler_start_idx = 0 + start_step = 0 + if args.checkpoint_path is not None: + if "modeling" in args.checkpoint_path: + coordinator.print_on_master(f"Continued pretrain from checkpoint {args.checkpoint_path}") + booster.load_model(model, args.checkpoint_path) + else: + coordinator.print_on_master(f"Load model checkpoint from {args.checkpoint_path}") + start_epoch, start_step, sampler_start_idx = load_checkpoint( + load_dir=args.checkpoint_path, + booster=booster, + model=model, + optimizer=optim, + lr_scheduler=lr_scheduler, + ) + train_dataloader.sampler.set_start_index(start_index=sampler_start_idx) + + coordinator.print_on_master( + f"Loaded checkpoint {args.checkpoint_path} at epoch {start_epoch} step {start_step}" + ) + coordinator.print_on_master(f"Loaded sample at index {sampler_start_idx}") + + coordinator.print_on_master( + f"Checkpoint loaded max CUDA memory: {torch.cuda.max_memory_allocated() / 1024 ** 2:.2f} MB" + ) + coordinator.print_on_master( + f"Checkpoint loaded CUDA memory: {torch.cuda.memory_allocated() / 1024 ** 2:.2f} MB" + ) + coordinator.print_on_master( + f"Checkpoint loaded max CPU memory: {resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024:.2f} MB" + ) + + trainer = SFTTrainer( + model=model, + booster=booster, + optim=optim, + lr_scheduler=lr_scheduler, + max_epochs=args.max_epochs, + accumulation_steps=args.accumulation_steps, + start_epoch=start_epoch, + save_interval=None, + save_dir=None, + coordinator=coordinator, + ) + + trainer.fit( + train_dataloader=train_dataloader, + eval_dataloader=None, + log_dir=None, + use_wandb=False, + ) + + coordinator.print_on_master(f"Max CUDA memory usage: {torch.cuda.max_memory_allocated()/1024**2:.2f} MB") + + +if __name__ == "__main__": + # ============================== + # Parse Arguments + # ============================== + parser = argparse.ArgumentParser() + parser.add_argument( + "--plugin", + type=str, + default="gemini", + choices=["gemini", "gemini_auto", "3d", "ddp", "zero2_cpu", "zero2"], + help="Choose which plugin to use", + ) + parser.add_argument("--grad_clip", type=float, default=1.0, help="Gradient clipping value") + parser.add_argument("--weight_decay", type=float, default=0.1, help="Weight decay") + parser.add_argument("--warmup_steps", type=int, default=None, help="Warmup steps") + parser.add_argument("--tp", type=int, default=1) + parser.add_argument("--pp", type=int, default=1) + parser.add_argument("--sp", type=int, default=1) + parser.add_argument("--enable_sequence_parallelism", default=False, action="store_true") + parser.add_argument("--zero_stage", type=int, default=0, help="Zero stage", choices=[0, 1, 2]) + parser.add_argument("--zero_cpu_offload", default=False, action="store_true") + parser.add_argument("--sp_mode", type=str, default="split_gather", choices=["split_gather", "ring", "all_to_all"]) + parser.add_argument("--pretrain", type=str, default=None) + 
parser.add_argument("--tokenizer_dir", type=str, default=None) + parser.add_argument( + "--checkpoint_path", type=str, default=None, help="Checkpoint path if need to resume training form a checkpoint" + ) + parser.add_argument("--max_epochs", type=int, default=3) + parser.add_argument("--batch_size", type=int, default=4) + parser.add_argument("--max_len", type=int, default=512) + parser.add_argument("--mixed_precision", type=str, default="bf16", choices=["fp16", "bf16"], help="Mixed precision") + parser.add_argument("--lora_rank", type=int, default=0, help="low-rank adaptation matrices rank") + parser.add_argument( + "--lora_train_bias", + type=str, + default="none", + help="'none' means it doesn't train biases. 'all' means it trains all biases. 'lora_only' means it only trains biases of LoRA layers", + ) + parser.add_argument("--merge_lora_weights", type=bool, default=True) + parser.add_argument("--lr", type=float, default=5e-6) + parser.add_argument("--config_file", type=str, default="config_file", help="Config file") + parser.add_argument("--accumulation_steps", type=int, default=8) + parser.add_argument("--grad_checkpoint", default=False, action="store_true") + parser.add_argument("--use_flash_attn", default=False, action="store_true") + parser.add_argument("--dataset_size", type=int, default=500) + args = parser.parse_args() + os.makedirs(os.path.dirname(args.config_file), exist_ok=True) + with open(args.config_file, "w") as f: + json.dump(args.__dict__, f, indent=4) + train(args) diff --git a/applications/ColossalChat/benchmarks/benchmark_sft.sh b/applications/ColossalChat/benchmarks/benchmark_sft.sh new file mode 100755 index 000000000..84ddf046a --- /dev/null +++ b/applications/ColossalChat/benchmarks/benchmark_sft.sh @@ -0,0 +1,43 @@ +set_n_least_used_CUDA_VISIBLE_DEVICES() { + local n=${1:-"9999"} + echo "GPU Memory Usage:" + local FIRST_N_GPU_IDS=$(nvidia-smi --query-gpu=memory.used --format=csv | + tail -n +2 | + nl -v 0 | + tee /dev/tty | + sort -g -k 2 | + awk '{print $1}' | + head -n $n) + export CUDA_VISIBLE_DEVICES=$(echo $FIRST_N_GPU_IDS | sed 's/ /,/g') + echo "Now CUDA_VISIBLE_DEVICES is set to:" + echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES" +} + +set_n_least_used_CUDA_VISIBLE_DEVICES 4 +# export CUDA_VISIBLE_DEVICES=3,4 +PROJECT_NAME="sft" +PARENT_CONFIG_FILE="./benchmark_config" # Path to a folder to save training config logs +PRETRAINED_MODEL_PATH="/root/commonData/Llama-2-7b-hf" # huggingface or local model path +PRETRAINED_TOKENIZER_PATH="/root/commonData/Llama-2-7b-hf" # huggingface or local tokenizer path + +TIMESTAMP=$(date +%Y-%m-%d-%H-%M-%S) +FULL_PROJECT_NAME="${PROJECT_NAME}-${TIMESTAMP}" +CONFIG_FILE="${PARENT_CONFIG_FILE}-${FULL_PROJECT_NAME}.json" + +echo $(which colossalai) +echo $(which python) +# the real batch size for gradient descent is number_of_node_in_hostfile * nproc_per_node * train_batch_size +colossalai run --nproc_per_node 4 --master_port 31312 benchmark_sft.py \ + --pretrain $PRETRAINED_MODEL_PATH \ + --tokenizer_dir $PRETRAINED_TOKENIZER_PATH \ + --config_file $CONFIG_FILE \ + --plugin zero2 \ + --batch_size 8 \ + --max_epochs 1 \ + --accumulation_steps 1 \ + --lr 5e-5 \ + --lora_rank 32 \ + --max_len 2048 \ + --dataset_size 640 \ + --grad_checkpoint \ + --use_flash_attn diff --git a/applications/ColossalChat/benchmarks/dummy_dataset.py b/applications/ColossalChat/benchmarks/dummy_dataset.py new file mode 100644 index 000000000..5f9642e2f --- /dev/null +++ b/applications/ColossalChat/benchmarks/dummy_dataset.py @@ -0,0 +1,21 @@ 
+import torch +from torch.utils.data import Dataset + +class DummyLLMDataset(Dataset): + def __init__(self, keys, seq_len, size=500): + self.keys = keys + self.seq_len = seq_len + self.data = self._generate_data() + self.size = size + + def _generate_data(self): + # Generate one dummy sample that is shared across the whole dataset. + data = {} + for key in self.keys: + data[key] = torch.ones(self.seq_len, dtype=torch.long) + return data + + def __len__(self): + return self.size + + def __getitem__(self, idx): + return {key: self.data[key] for key in self.keys} \ No newline at end of file diff --git a/applications/ColossalChat/coati/trainer/dpo.py b/applications/ColossalChat/coati/trainer/dpo.py index c095cc35c..c7bbf5ad4 100755 --- a/applications/ColossalChat/coati/trainer/dpo.py +++ b/applications/ColossalChat/coati/trainer/dpo.py @@ -139,7 +139,7 @@ class DPOTrainer(SLTrainer): actor_all_logits = self.model( input_ids=torch.cat([chosen_input_ids, reject_input_ids]), attention_mask=torch.cat([chosen_attention_mask, reject_attention_mask]), - )["logits"].to(torch.float32) + )["logits"] actor_chosen_logits = actor_all_logits[:batch_size] actor_reject_logits = actor_all_logits[batch_size:] logprob_actor_chosen = calc_masked_log_probs( @@ -156,7 +156,7 @@ class DPOTrainer(SLTrainer): ref_all_logits = self.ref_model( input_ids=torch.cat([chosen_input_ids, reject_input_ids]), attention_mask=torch.cat([chosen_attention_mask, reject_attention_mask]), - )["logits"].to(torch.float32) + )["logits"] ref_chosen_logits = ref_all_logits[:batch_size] ref_reject_logits = ref_all_logits[batch_size:] logprob_ref_chosen = calc_masked_log_probs( @@ -225,7 +225,7 @@ class DPOTrainer(SLTrainer): ) self.accumulative_meter.reset() - if (self.num_train_step + 1) % self.save_interval == 0: + if self.save_dir is not None and (self.num_train_step + 1) % self.save_interval == 0: # save checkpoint self.coordinator.print_on_master("\nStart saving model checkpoint with running states") save_checkpoint( @@ -289,7 +289,7 @@ class DPOTrainer(SLTrainer): actor_all_logits = self.model( torch.cat([chosen_input_ids, reject_input_ids]), torch.cat([chosen_attention_mask, reject_attention_mask]), - )["logits"].to(torch.float32) + )["logits"] actor_chosen_logits = actor_all_logits[:batch_size] actor_reject_logits = actor_all_logits[batch_size:] @@ -306,7 +306,7 @@ class DPOTrainer(SLTrainer): ref_all_logits = self.ref_model( torch.cat([chosen_input_ids, reject_input_ids]), torch.cat([chosen_attention_mask, reject_attention_mask]), - )["logits"].to(torch.float32) + )["logits"] ref_chosen_logits = ref_all_logits[:batch_size] ref_reject_logits = ref_all_logits[batch_size:] logprob_ref_chosen = calc_masked_log_probs( diff --git a/applications/ColossalChat/coati/trainer/orpo.py b/applications/ColossalChat/coati/trainer/orpo.py index aa94e0acb..4cdc19a82 100644 --- a/applications/ColossalChat/coati/trainer/orpo.py +++ b/applications/ColossalChat/coati/trainer/orpo.py @@ -209,7 +209,7 @@ class ORPOTrainer(SLTrainer): ) self.accumulative_meter.reset() - if (self.num_train_step + 1) % self.save_interval == 0: + if self.save_dir is not None and (self.num_train_step + 1) % self.save_interval == 0: # save checkpoint self.coordinator.print_on_master("\nStart saving model checkpoint with running states") save_checkpoint( diff --git a/applications/ColossalChat/examples/README.md b/applications/ColossalChat/examples/README.md index 8b1f0d2b0..22c0c4f50 100755 --- a/applications/ColossalChat/examples/README.md +++ b/applications/ColossalChat/examples/README.md @@ -752,7 +752,19 @@ We support the method 
introduced in the paper [ORPO: Monolithic Preference Optim

## Hardware Requirements -For PPO, we suggest using Tensor Parallelism. The following table shows the VRAM consumption of training a 7B model on a dummy dataset with 2048 sequence length and 512 layout length with different tp_size (equal to the number of GPUs). In this experiment, we use an H800 GPU with 80GB VRAM. + +For SFT, we recommend using zero2 or zero2-cpu for a 7B model, and tensor parallelism (tp) if your model is extra large. We tested the VRAM consumption on a dummy dataset with a sequence length of 2048. In all experiments, we use H800 GPUs with 80GB VRAM and enable gradient checkpointing and flash attention. +- 2 H800 GPUs + - zero2-cpu, micro batch size=4, VRAM Usage=22457.98 MB + - zero2, micro batch size=4, VRAM Usage=72390.95 MB +- 4 H800 GPUs + - zero2-cpu, micro batch size=8, VRAM Usage=19412.77 MB + - zero2, micro batch size=8, VRAM Usage=43446.31 MB + - zero2, micro batch size=16, VRAM Usage=58082.30 MB + - zero2, micro batch size=8, lora_rank=8, VRAM Usage=21167.73 MB + - zero2, micro batch size=8, lora_rank=32, VRAM Usage=21344.17 MB + +For PPO, we suggest using Tensor Parallelism. The following table shows the VRAM consumption of training a 7B model (llama2-7B-hf) on a dummy dataset with a sequence length of 2048 and a layout length of 512 with different tp_size (equal to the number of GPUs). | PPO | tp=8 | tp=4 | |-------|---------------|---------------| | bs=1 | 18485.19 MB | 42934.45 MB | @@ -763,12 +775,31 @@ For PPO, we suggest using Tensor Parallelism. The following table shows the VRAM For DPO, we recommend using zero2 or zero2-cpu. We tested the VRAM consumption on a dummy dataset with 2048 sequence length. - -- 1 H800 GPU - - zero2-cpu, batch size=2, VRAM Usage=49873.90 MB - - zero2-cpu, batch size=4, VRAM Usage=60998.22 MB +- 2 H800 GPUs + - zero2-cpu, micro batch size=2, VRAM Usage=36989.37 MB + - zero2-cpu, micro batch size=4, VRAM Usage=48081.67 MB - 4 H800 GPUs - - zero2, batch size=4, VRAM Usage=67544.47 MB + - zero2, micro batch size=4, VRAM Usage=67483.44 MB + +For SimPO, we recommend using zero2 or zero2-cpu. We tested the VRAM consumption on a dummy dataset with a sequence length of 2048. + +- 2 H800 GPUs + - zero2-cpu, micro batch size=4, VRAM Usage=25705.26 MB + - zero2, micro batch size=4, VRAM Usage=73375.04 MB +- 4 H800 GPUs + - zero2-cpu, micro batch size=8, VRAM Usage=36709.36 MB + - zero2, micro batch size=4, VRAM Usage=44330.90 MB + - zero2, micro batch size=8, VRAM Usage=56086.12 MB + +For ORPO, we recommend using zero2 or zero2-cpu. We tested the VRAM consumption on a dummy dataset with a sequence length of 2048. 
+ +- 2 H800 GPUs + - zero2-cpu, micro batch size=4, VRAM Usage=26693.38 MB + - zero2, micro batch size=4, VRAM Usage=74332.65 MB +- 4 H800 GPUs + - zero2-cpu, micro batch size=8, VRAM Usage=38709.73 MB + - zero2, micro batch size=4, VRAM Usage=45309.52 MB + - zero2, micro batch size=8, VRAM Usage=58086.37 MB ## List of Supported Models diff --git a/applications/ColossalChat/examples/training_scripts/train_dpo.py b/applications/ColossalChat/examples/training_scripts/train_dpo.py index eb3cfb63a..990c49a35 100755 --- a/applications/ColossalChat/examples/training_scripts/train_dpo.py +++ b/applications/ColossalChat/examples/training_scripts/train_dpo.py @@ -128,16 +128,14 @@ def train(args): disable_dropout(ref_model) else: ref_model = None - print("ref_model is None", args.disable_reference_model, ref_model is None) if args.lora_rank > 0: model = convert_to_lora_module(model, args.lora_rank, lora_train_bias=args.lora_train_bias) - if args.grad_checkpoint and args.lora_rank == 0: + if args.grad_checkpoint: + # Note: for some models, LoRA may not be compatible with gradient checkpointing model.gradient_checkpointing_enable() coordinator.print_on_master(msg="Gradient checkpointing enabled successfully") - elif args.lora_rank > 0: - coordinator.print_on_master(msg="Gradient checkpointing will be disabled when LoRA is enabled") - + # configure tokenizer tokenizer_dir = args.tokenizer_dir if args.tokenizer_dir is not None else args.pretrain tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir, use_fast=False, trust_remote_code=True) diff --git a/applications/ColossalChat/examples/training_scripts/train_orpo.py b/applications/ColossalChat/examples/training_scripts/train_orpo.py index 1ed5a499b..55976407a 100755 --- a/applications/ColossalChat/examples/training_scripts/train_orpo.py +++ b/applications/ColossalChat/examples/training_scripts/train_orpo.py @@ -118,12 +118,11 @@ def train(args): if args.lora_rank > 0: model = convert_to_lora_module(model, args.lora_rank, lora_train_bias=args.lora_train_bias) - if args.grad_checkpoint and args.lora_rank == 0: + if args.grad_checkpoint: + # Note: for some models, LoRA may not be compatible with gradient checkpointing model.gradient_checkpointing_enable() coordinator.print_on_master(msg="Gradient checkpointing enabled successfully") - elif args.lora_rank > 0: - coordinator.print_on_master(msg="Gradient checkpointing will be disabled when LoRA is enabled") - + # configure tokenizer tokenizer_dir = args.tokenizer_dir if args.tokenizer_dir is not None else args.pretrain tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir, use_fast=False, trust_remote_code=True) diff --git a/applications/ColossalChat/examples/training_scripts/train_sft.py b/applications/ColossalChat/examples/training_scripts/train_sft.py index 3ae0a63a1..892ab95f2 100755 --- a/applications/ColossalChat/examples/training_scripts/train_sft.py +++ b/applications/ColossalChat/examples/training_scripts/train_sft.py @@ -122,13 +122,11 @@ def train(args): # LazyInitContext(default_device=get_current_device()) if isinstance(plugin, (GeminiPlugin,)) else nullcontext() # ) - if args.grad_checkpoint and args.lora_rank == 0: - # lora layers are not supported by gradient checkpointing + if args.grad_checkpoint: + # Note: for some models, LoRA may not be compatible with gradient checkpointing model.gradient_checkpointing_enable() coordinator.print_on_master(msg="Gradient checkpointing enabled successfully") - elif args.lora_rank > 0: - coordinator.print_on_master(msg="Gradient checkpointing will be disabled 
when LoRA is enabled") - + # configure tokenizer tokenizer = AutoTokenizer.from_pretrained( args.tokenizer_dir or args.pretrain, use_fast=False, trust_remote_code=True