mirror of https://github.com/hpcaitech/ColossalAI
Commit by Baizhou Zhang, committed via GitHub (1 year ago)
12 changed files with 571 additions and 290 deletions
@@ -0,0 +1,120 @@ args.py (new file)
from colossalai import get_default_parser


def parse_demo_args():

    parser = get_default_parser()
    parser.add_argument(
        "--model_name_or_path",
        type=str,
        default="facebook/opt-350m",
        help="Path to pretrained model or model identifier from huggingface.co/models."
    )
    parser.add_argument(
        "--output_path",
        type=str,
        default="./output_model.bin",
        help="The path to save your model after finetuning."
    )
    parser.add_argument(
        "--plugin",
        type=str,
        default="gemini",
        help="Plugin to use. Valid plugins include 'torch_ddp', 'torch_ddp_fp16', 'gemini', 'low_level_zero'."
    )
    parser.add_argument(
        "--num_epoch",
        type=int,
        default=10,
        help="Number of epochs."
    )
    parser.add_argument(
        "--batch_size",
        type=int,
        default=32,
        help="Batch size (per dp group) for the training dataloader."
    )
    parser.add_argument(
        "--learning_rate",
        type=float,
        default=5e-5,
        help="Initial learning rate (after the potential warmup period) to use."
    )
    parser.add_argument(
        "--warmup_ratio",
        type=float,
        default=0.1,
        help="Ratio of warmup steps to total training steps."
    )
    parser.add_argument(
        "--weight_decay",
        type=float,
        default=0.01,
        help="Weight decay to use."
    )
    parser.add_argument(
        "--seed",
        type=int,
        default=42,
        help="A seed for reproducible training."
    )

    args = parser.parse_args()
    return args


def parse_benchmark_args():

    parser = get_default_parser()
    parser.add_argument(
        "--model_name_or_path",
        type=str,
        default="facebook/opt-125m",
        help="Path to pretrained model or model identifier from huggingface.co/models."
    )
    parser.add_argument(
        "--plugin",
        type=str,
        default="gemini",
        help="Plugin to use. Valid plugins include 'torch_ddp', 'torch_ddp_fp16', 'gemini', 'low_level_zero'."
    )
    parser.add_argument(
        "--batch_size",
        type=int,
        default=32,
        help="Batch size (per dp group) for the training dataloader."
    )
    parser.add_argument(
        "--learning_rate",
        type=float,
        default=5e-5,
        help="Initial learning rate (after the potential warmup period) to use."
    )
    parser.add_argument(
        "--weight_decay",
        type=float,
        default=0.0,
        help="Weight decay to use."
    )
    parser.add_argument(
        "--max_train_steps",
        type=int,
        default=20,
        help="Total number of training steps to perform."
    )
    parser.add_argument(
        "--seed",
        type=int,
        default=42,
        help="A seed for reproducible training."
    )
    parser.add_argument(
        "--mem_cap",
        type=int,
        default=0,
        help="Limit on each GPU's memory usage (in GB); 0 means no limit."
    )
    args = parser.parse_args()

    return args
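As a quick, purely illustrative sanity check (a sketch, not part of the diff; it assumes colossalai is installed and that the distributed flags added by get_default_parser all have defaults), the parser can be exercised without launching a distributed job:

import sys
from args import parse_demo_args

# Hypothetical smoke test: override two flags in-process and inspect the parsed values.
sys.argv = ["opt_train_demo.py", "--plugin", "low_level_zero", "--batch_size", "16"]
args = parse_demo_args()
print(args.plugin, args.batch_size, args.learning_rate, args.num_epoch)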
@@ -1,21 +0,0 @@ (deleted file: benchmark sweep over run_gemini.sh)
export BS=16
export MEMCAP=0
export MODEL="6.7b"
export GPUNUM=1

for MODEL in "6.7b" "13b" "1.3b"
do
  for GPUNUM in 8 1
  do
    for BS in 16 24 32 8
    do
      for MEMCAP in 0 40
      do
        pkill -9 torchrun
        pkill -9 python

        env BS=$BS MEM_CAP=$MEMCAP MODEL=$MODEL GPUNUM=$GPUNUM bash ./run_gemini.sh
      done
    done
  done
done
@@ -0,0 +1,37 @@ data.py (new file)
import torch
from torch.utils.data import Dataset
from datasets import load_dataset


class NetflixDataset(Dataset):

    def __init__(self, tokenizer):

        super().__init__()

        self.tokenizer = tokenizer
        self.input_ids = []
        self.attn_masks = []
        self.labels = []
        self.txt_list = netflix_descriptions = load_dataset("hugginglearners/netflix-shows", split="train")['description']
        self.max_length = max([len(self.tokenizer.encode(description)) for description in netflix_descriptions])

        for txt in self.txt_list:
            encodings_dict = self.tokenizer('</s>' + txt + '</s>',
                                            truncation=True,
                                            max_length=self.max_length,
                                            padding="max_length")
            self.input_ids.append(torch.tensor(encodings_dict['input_ids']))
            self.attn_masks.append(torch.tensor(encodings_dict['attention_mask']))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.attn_masks[idx]


def netflix_collator(data):
    return {'input_ids': torch.stack([x[0] for x in data]),
            'attention_mask': torch.stack([x[1] for x in data]),
            'labels': torch.stack([x[0] for x in data])}
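A minimal local sketch (not part of the diff) of how this dataset and collator plug into a plain PyTorch DataLoader; the choice of the facebook/opt-125m tokenizer here is an assumption for illustration:

from torch.utils.data import DataLoader
from transformers import AutoTokenizer

from data import NetflixDataset, netflix_collator

# Hypothetical smoke test: build the dataset and inspect one collated batch.
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m")
dataset = NetflixDataset(tokenizer)
loader = DataLoader(dataset, batch_size=4, shuffle=True, collate_fn=netflix_collator)
batch = next(iter(loader))
print(batch['input_ids'].shape, batch['attention_mask'].shape, batch['labels'].shape)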
@@ -0,0 +1,146 @@ opt_benchmark.py (new file)
import time

import torch
import transformers
from transformers import AutoConfig, OPTForCausalLM
from transformers.utils.versions import require_version
import tqdm

import colossalai
from colossalai.nn.optimizer import HybridAdam
from colossalai.logging import disable_existing_loggers, get_dist_logger
from colossalai.tensor import ProcessGroup, ShardSpec
from colossalai.utils import get_current_device
from colossalai.zero import ColoInitContext
from colossalai.booster import Booster
from colossalai.booster.plugin import GeminiPlugin, LowLevelZeroPlugin, TorchDDPPlugin
from colossalai.cluster import DistCoordinator

from args import parse_benchmark_args

require_version("transformers>=4.20.0", "To fix: pip install -r requirements.txt")


def format_num(num: int, bytes=False):
    """Scale a number to a human-readable format, e.g. 1253656 bytes => '1.20 MB'."""
    factor = 1024 if bytes else 1000
    suffix = "B" if bytes else ""
    for unit in ["", " K", " M", " G", " T", " P"]:
        if num < factor:
            return f"{num:.2f}{unit}{suffix}"
        num /= factor


def get_data(batch_size, seq_len, vocab_size):
    input_ids = torch.randint(0, vocab_size, (batch_size, seq_len), device=torch.cuda.current_device())
    attention_mask = torch.ones_like(input_ids)
    return input_ids, attention_mask


def colo_memory_cap(size_in_GB):
    from colossalai.utils import colo_device_memory_capacity, colo_set_process_memory_fraction, get_current_device
    cuda_capacity = colo_device_memory_capacity(get_current_device())
    if size_in_GB * (1024**3) < cuda_capacity:
        colo_set_process_memory_fraction(size_in_GB * (1024**3) / cuda_capacity)
        print(f"Limiting GPU memory usage to {size_in_GB} GB")


def main():

    args = parse_benchmark_args()

    # Launch ColossalAI
    colossalai.launch_from_torch(config={}, seed=args.seed)
    coordinator = DistCoordinator()
    world_size = coordinator.world_size

    # Manage loggers
    disable_existing_loggers()
    logger = get_dist_logger()
    if coordinator.is_master():
        transformers.utils.logging.set_verbosity_info()
    else:
        transformers.utils.logging.set_verbosity_error()

    # Optionally cap GPU memory usage
    if args.mem_cap > 0:
        colo_memory_cap(args.mem_cap)

    # Build OPT model
    # Initialize the model under ColoInitContext if using GeminiPlugin
    config = AutoConfig.from_pretrained(args.model_name_or_path)
    if args.plugin == 'gemini':
        shard_pg = ProcessGroup(tp_degree=world_size)
        default_dist_spec = ShardSpec([-1], [world_size])
        with ColoInitContext(device='cpu',
                             default_dist_spec=default_dist_spec,
                             default_pg=shard_pg):
            model = OPTForCausalLM(config)
    else:
        model = OPTForCausalLM(config)
    logger.info(f"Finished loading model from {args.model_name_or_path}", ranks=[0])

    # Enable gradient checkpointing
    model.gradient_checkpointing_enable()

    # Set plugin
    booster_kwargs = {}
    if args.plugin == 'torch_ddp_fp16':
        booster_kwargs['mixed_precision'] = 'fp16'
    if args.plugin.startswith('torch_ddp'):
        plugin = TorchDDPPlugin()
    elif args.plugin == 'gemini':
        plugin = GeminiPlugin(device=get_current_device(),
                              placement_policy='cpu',
                              pin_memory=True,
                              strict_ddp_mode=True,
                              initial_scale=2**5)
    elif args.plugin == 'low_level_zero':
        plugin = LowLevelZeroPlugin(initial_scale=2**5)
    logger.info(f"Set plugin as {args.plugin}", ranks=[0])

    # Set optimizer
    optimizer = HybridAdam(model.parameters(), lr=args.learning_rate)

    # Set booster
    booster = Booster(plugin=plugin, **booster_kwargs)
    model, optimizer, _, _, _ = booster.boost(model, optimizer)

    SEQ_LEN = 1024
    VOCAB_SIZE = 50257

    # Start training on synthetic data.
    logger.info("Start testing", ranks=[0])
    progress_bar = tqdm.tqdm(total=args.max_train_steps, desc="Training Step", disable=not coordinator.is_master())

    torch.cuda.synchronize()
    model.train()
    start_time = time.time()

    for _ in range(args.max_train_steps):

        input_ids, attn_mask = get_data(args.batch_size, SEQ_LEN, VOCAB_SIZE)
        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attn_mask, labels=input_ids, use_cache=False)
        loss = outputs['loss']
        booster.backward(loss, optimizer)
        optimizer.step()

        torch.cuda.synchronize()
        progress_bar.update(1)

    # Compute statistics
    end_time = time.time()
    throughput = "{:.4f}".format((world_size * args.max_train_steps * args.batch_size) / (end_time - start_time))
    max_mem = format_num(torch.cuda.max_memory_allocated(device=torch.cuda.current_device()), bytes=True)

    logger.info(f"Testing finished, "
                f"batch size per gpu: {args.batch_size}, "
                f"plugin: {args.plugin}, "
                f"throughput: {throughput}, "
                f"maximum memory usage per gpu: {max_mem}.",
                ranks=[0])


if __name__ == "__main__":
    main()
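The throughput reported above is samples per second summed over all ranks, i.e. world_size * max_train_steps * batch_size / elapsed_time. For example, with 4 GPUs, 20 steps, and a per-GPU batch size of 32 finishing in 10 seconds, the script would report 4 * 20 * 32 / 10 = 256 samples/s.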
@@ -0,0 +1,149 @@ opt_train_demo.py (new file)
import time

import torch
import datasets
import transformers
from transformers import AutoConfig, OPTForCausalLM, AutoTokenizer
from transformers import get_linear_schedule_with_warmup
from transformers.utils.versions import require_version
from tqdm import tqdm

import colossalai
from colossalai.nn.optimizer import HybridAdam
from colossalai.logging import disable_existing_loggers, get_dist_logger
from colossalai.tensor import ProcessGroup, ShardSpec
from colossalai.utils import get_current_device
from colossalai.zero import ColoInitContext
from colossalai.booster import Booster
from colossalai.booster.plugin import GeminiPlugin, LowLevelZeroPlugin, TorchDDPPlugin
from colossalai.cluster import DistCoordinator

from args import parse_demo_args
from data import NetflixDataset, netflix_collator

require_version("datasets>=1.8.0", "To fix: pip install -r requirements.txt")
require_version("transformers>=4.20.0", "To fix: pip install -r requirements.txt")


def move_to_cuda(batch, device):
    return {k: v.to(device) for k, v in batch.items()}


def train_epoch(epoch, model, optimizer, lr_scheduler, dataloader, booster, coordinator):

    torch.cuda.synchronize()
    model.train()

    with tqdm(dataloader, desc=f'Epoch [{epoch + 1}]', disable=not coordinator.is_master()) as pbar:

        for batch in pbar:

            # Forward
            optimizer.zero_grad()
            batch = move_to_cuda(batch, torch.cuda.current_device())

            outputs = model(use_cache=False, **batch)
            loss = outputs['loss']

            # Backward
            booster.backward(loss, optimizer)
            optimizer.step()
            lr_scheduler.step()

            # Print batch loss
            pbar.set_postfix({'loss': loss.item()})


def main():

    args = parse_demo_args()

    # Launch ColossalAI
    colossalai.launch_from_torch(config={}, seed=args.seed)
    coordinator = DistCoordinator()
    world_size = coordinator.world_size

    # Manage loggers
    disable_existing_loggers()
    logger = get_dist_logger()
    if coordinator.is_master():
        datasets.utils.logging.set_verbosity_warning()
        transformers.utils.logging.set_verbosity_info()
    else:
        datasets.utils.logging.set_verbosity_error()
        transformers.utils.logging.set_verbosity_error()

    # Build OPT model
    # Initialize the model under ColoInitContext if using GeminiPlugin
    config = AutoConfig.from_pretrained(args.model_name_or_path)
    if args.plugin == 'gemini':
        shard_pg = ProcessGroup(tp_degree=world_size)
        default_dist_spec = ShardSpec([-1], [world_size])
        with ColoInitContext(device='cpu',
                             default_dist_spec=default_dist_spec,
                             default_pg=shard_pg):
            model = OPTForCausalLM(config)
    else:
        model = OPTForCausalLM(config)
    logger.info(f"Finished loading model from {args.model_name_or_path}", ranks=[0])

    # Enable gradient checkpointing
    model.gradient_checkpointing_enable()

    # Set plugin
    booster_kwargs = {}
    if args.plugin == 'torch_ddp_fp16':
        booster_kwargs['mixed_precision'] = 'fp16'
    if args.plugin.startswith('torch_ddp'):
        plugin = TorchDDPPlugin()
    elif args.plugin == 'gemini':
        plugin = GeminiPlugin(device=get_current_device(),
                              placement_policy='cpu',
                              pin_memory=True,
                              strict_ddp_mode=True,
                              initial_scale=2**5)
    elif args.plugin == 'low_level_zero':
        plugin = LowLevelZeroPlugin(initial_scale=2**5)
    logger.info(f"Set plugin as {args.plugin}", ranks=[0])

    # Prepare tokenizer and dataloader
    tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
    dataset = NetflixDataset(tokenizer)
    dataloader = plugin.prepare_dataloader(dataset,
                                           batch_size=args.batch_size,
                                           shuffle=True,
                                           drop_last=True,
                                           collate_fn=netflix_collator)

    # Set optimizer
    optimizer = HybridAdam(model.parameters(), lr=(args.learning_rate * world_size))

    # Set lr scheduler
    total_steps = len(dataloader) * args.num_epoch
    num_warmup_steps = int(args.warmup_ratio * total_steps)
    lr_scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=total_steps
    )

    # Set booster
    booster = Booster(plugin=plugin, **booster_kwargs)
    model, optimizer, _, dataloader, lr_scheduler = booster.boost(model=model,
                                                                  optimizer=optimizer,
                                                                  dataloader=dataloader,
                                                                  lr_scheduler=lr_scheduler)

    # Start finetuning
    logger.info("Start finetuning", ranks=[0])
    for epoch in range(args.num_epoch):
        train_epoch(epoch, model, optimizer, lr_scheduler, dataloader, booster, coordinator)

    # Finish training and save the finetuned model
    logger.info("Finished finetuning", ranks=[0])
    booster.save_model(model, args.output_path)
    logger.info(f"Saved model checkpoint to {args.output_path}", ranks=[0])


if __name__ == "__main__":
    main()
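A minimal sketch of how the saved checkpoint could be reloaded for inference afterwards, assuming booster.save_model wrote a plain PyTorch state dict to output_model.bin (worth verifying for the plugin and library version actually in use):

import torch
from transformers import AutoTokenizer, OPTForCausalLM

# Assumes the finetuned weights were saved as a plain state dict by booster.save_model.
model = OPTForCausalLM.from_pretrained("facebook/opt-350m")
state_dict = torch.load("./output_model.bin", map_location="cpu")
model.load_state_dict(state_dict)
model.eval()

tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")
inputs = tokenizer("A gripping documentary about", return_tensors="pt")
print(tokenizer.decode(model.generate(**inputs, max_new_tokens=30)[0]))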
@@ -1,2 +1,4 @@ requirements.txt
colossalai >= 0.1.12
torch >= 1.8.1
datasets >= 1.8.0
transformers >= 4.20.0
@@ -0,0 +1,30 @@ (new shell script: sweeps opt_benchmark.py across plugins, GPU counts, and batch sizes)
set -xe
pip install -r requirements.txt

export BS=32
export MEMCAP=0
export GPUNUM=1

# acceptable values include `125m`, `350m`, `1.3b`, `2.7b`, `6.7b`, `13b`, `30b`, `66b`
export MODEL="125m"

for BS in 8 32 128
do
  for PLUGIN in "torch_ddp" "torch_ddp_fp16" "low_level_zero" "gemini"
  do
    for GPUNUM in 1 4
    do

      MODEL_PATH="facebook/opt-${MODEL}"
      torchrun \
        --standalone \
        --nproc_per_node ${GPUNUM} \
        opt_benchmark.py \
        --model_name_or_path ${MODEL_PATH} \
        --mem_cap ${MEMCAP} \
        --plugin ${PLUGIN} \
        --batch_size ${BS}

    done
  done
done
@@ -0,0 +1,44 @@ (new shell script: launches the opt_train_demo.py finetuning demo)
set -xe
pip install -r requirements.txt

# model name or path
MODEL="facebook/opt-350m"

# path for saving the finetuned model
OUTPUT_PATH="./output_model.bin"

# plugin (training strategy)
# must be one of "torch_ddp" / "torch_ddp_fp16" / "low_level_zero" / "gemini"
PLUGIN="gemini"

# number of gpus to use
GPUNUM=4

# batch size per gpu
BS=16

# learning rate
LR="5e-5"

# number of epochs
EPOCH=10

# weight decay
WEIGHT_DECAY=0.01

# ratio of warmup steps
WARMUP_RATIO=0.1

# run the demo script
torchrun \
  --standalone \
  --nproc_per_node ${GPUNUM} \
  opt_train_demo.py \
  --model_name_or_path ${MODEL} \
  --output_path ${OUTPUT_PATH} \
  --plugin ${PLUGIN} \
  --batch_size ${BS} \
  --num_epoch ${EPOCH} \
  --learning_rate ${LR} \
  --weight_decay ${WEIGHT_DECAY} \
  --warmup_ratio ${WARMUP_RATIO}
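With these settings the effective global batch size is BS * GPUNUM = 16 * 4 = 64 samples per step, and note that opt_train_demo.py additionally scales the base learning rate by the world size (5e-5 * 4 = 2e-4) when constructing HybridAdam.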
@ -1,28 +0,0 @@
|
set -x
export BS=${BS:-16}
export MEMCAP=${MEMCAP:-0}
# Acceptable values include `125m`, `350m`, `1.3b`, `2.7b`, `6.7b`, `13b`, `30b`, `66b`. For `175b`
export MODEL=${MODEL:-"125m"}
export GPUNUM=${GPUNUM:-1}
export USE_SHARD_INIT=${USE_SHARD_INIT:-"false"}

# make directory for logs
mkdir -p ./logs

if [ ${USE_SHARD_INIT} = "true" ]; then
  USE_SHARD_INIT="--shardinit"
else
  USE_SHARD_INIT=""
fi

export MODLE_PATH="facebook/opt-${MODEL}"

# HF_DATASETS_OFFLINE=1 TRANSFORMERS_OFFLINE=1
torchrun \
  --nproc_per_node ${GPUNUM} \
  --master_port 19198 \
  train_gemini_opt.py \
  --mem_cap ${MEMCAP} \
  --model_name_or_path ${MODLE_PATH} \
  ${USE_SHARD_INIT} \
  --batch_size ${BS} 2>&1 | tee ./logs/colo_${MODEL}_bs_${BS}_cap_${MEMCAP}_gpu_${GPUNUM}.log
@ -1,4 +1,19 @@
|
-for GPUNUM in 2 1
+set -xe
+pip install -r requirements.txt
+
+BS=4
+for PLUGIN in "torch_ddp" "torch_ddp_fp16" "low_level_zero" "gemini"
 do
-env BS=2 MODEL="125m" GPUNUM=$GPUNUM bash ./run_gemini.sh
+for GPUNUM in 1 4
+do
+
+torchrun \
+  --standalone \
+  --nproc_per_node ${GPUNUM} \
+  opt_benchmark.py \
+  --model_name_or_path "facebook/opt-125m" \
+  --plugin ${PLUGIN} \
+  --batch_size ${BS}
+
 done
+done
@ -1,233 +0,0 @@
|
#!/usr/bin/env python
# coding=utf-8
# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Fine-tuning the library models for causal language modeling (GPT, GPT-2, CTRL, ...)
on a text file or a dataset without using HuggingFace Trainer.

Here is the full list of checkpoints on the hub that can be fine-tuned by this script:
https://huggingface.co/models?filter=text-generation
"""
# You can also adapt this script on your own causal language modeling task. Pointers for this are left as comments.

import time
from functools import partial

import datasets
import torch
import torch.distributed as dist
import transformers
from transformers import CONFIG_MAPPING, MODEL_MAPPING, AutoConfig, OPTForCausalLM
from transformers.utils.versions import require_version

import colossalai
from colossalai.logging import disable_existing_loggers, get_dist_logger
from colossalai.tensor import ProcessGroup, ShardSpec
from colossalai.utils import get_current_device
from colossalai.zero import ColoInitContext, GeminiAdamOptimizer, GeminiDDP


def get_data(batch_size, seq_len, vocab_size):
    input_ids = torch.randint(0, vocab_size, (batch_size, seq_len), device=torch.cuda.current_device())
    attention_mask = torch.ones_like(input_ids)
    return input_ids, attention_mask


require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")

MODEL_CONFIG_CLASSES = list(MODEL_MAPPING.keys())
MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)


def get_time_stamp():
    torch.cuda.synchronize()
    return time.time()


def get_tflops(model_numel, batch_size, seq_len, step_time):
    return model_numel * batch_size * seq_len * 8 / 1e12 / (step_time + 1e-12)


def parse_args():
    parser = colossalai.get_default_parser()
    parser.add_argument(
        "--model_name_or_path",
        type=str,
        help="Path to pretrained model or model identifier from huggingface.co/models.",
        required=True,
    )
    parser.add_argument(
        "--config_name",
        type=str,
        default=None,
        help="Pretrained config name or path if not the same as model_name",
    )
    parser.add_argument(
        "--batch_size",
        type=int,
        default=8,
        help="Batch size (per dp group) for the training dataloader.",
    )
    parser.add_argument(
        "--learning_rate",
        type=float,
        default=5e-5,
        help="Initial learning rate (after the potential warmup period) to use.",
    )
    parser.add_argument("--weight_decay", type=float, default=0.0, help="Weight decay to use.")
    parser.add_argument(
        "--max_train_steps",
        type=int,
        default=20,
        help="Total number of training steps to perform.",
    )
    parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
    parser.add_argument(
        "--model_type",
        type=str,
        default=None,
        help="Model type to use if training from scratch.",
        choices=MODEL_TYPES,
    )
    parser.add_argument(
        "--shardinit",
        action="store_true",
        help="Initialize the model with tensor parallel",
    )
    parser.add_argument("--mem_cap", type=int, default=0, help="use mem cap")
    parser.add_argument("--init_in_cpu", action='store_true', default=False, help="init training model in cpu")
    args = parser.parse_args()

    return args


def colo_memory_cap(size_in_GB):
    from colossalai.utils import colo_device_memory_capacity, colo_set_process_memory_fraction, get_current_device
    cuda_capacity = colo_device_memory_capacity(get_current_device())
    if size_in_GB * (1024**3) < cuda_capacity:
        colo_set_process_memory_fraction(size_in_GB * (1024**3) / cuda_capacity)
        print("Using {} GB of GPU memory".format(size_in_GB))


def main():
    args = parse_args()
    disable_existing_loggers()
    colossalai.launch_from_torch({})
    logger = get_dist_logger()
    is_main_process = dist.get_rank() == 0

    if is_main_process:
        datasets.utils.logging.set_verbosity_warning()
        transformers.utils.logging.set_verbosity_info()
    else:
        datasets.utils.logging.set_verbosity_error()
        transformers.utils.logging.set_verbosity_error()

    if args.mem_cap > 0:
        colo_memory_cap(args.mem_cap)

    # If passed along, set the training seed now.
    if args.seed is not None:
        torch.mannul_seed(args.seed)
        logger.info(f"Rank {dist.get_rank()}: random seed is set to {args.seed}")

    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

    # Load pretrained model
    # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    if args.config_name:
        config = AutoConfig.from_pretrained(args.config_name)
    elif args.model_name_or_path:
        config = AutoConfig.from_pretrained(args.model_name_or_path)
    else:
        config = CONFIG_MAPPING[args.model_type]()
        logger.warning("You are instantiating a new config instance from scratch.")
    logger.info("Model config has been created", ranks=[0])

    if args.init_in_cpu:
        init_dev = torch.device('cpu')
    else:
        init_dev = get_current_device()

    # shard init parameters
    if args.shardinit:
        logger.info("Sharding initialization !", ranks=[0])
    else:
        logger.info("Skipping sharding initialization", ranks=[0])

    world_size = torch.distributed.get_world_size()
    shard_pg = ProcessGroup(tp_degree=world_size) if args.shardinit else None
    default_dist_spec = ShardSpec([-1], [world_size]) if args.shardinit else None

    # build model
    if args.model_name_or_path is None:
        logger.info("Train a new model from scratch", ranks=[0])
        with ColoInitContext(device=init_dev,
                             dtype=torch.half,
                             default_dist_spec=default_dist_spec,
                             default_pg=shard_pg):
            model = OPTForCausalLM(config)
    else:
        logger.info("Finetune a pre-trained model", ranks=[0])
        with ColoInitContext(device=init_dev,
                             dtype=torch.half,
                             default_dist_spec=default_dist_spec,
                             default_pg=shard_pg):
            model = OPTForCausalLM.from_pretrained(args.model_name_or_path,
                                                   from_tf=bool(".ckpt" in args.model_name_or_path),
                                                   config=config,
                                                   local_files_only=False)

    # enable gradient checkpointing
    model.gradient_checkpointing_enable()

    numel = sum([p.numel() for p in model.parameters()])
    PLACEMENT_POLICY = 'cpu'
    model = GeminiDDP(model,
                      device=get_current_device(),
                      placement_policy=PLACEMENT_POLICY,
                      pin_memory=True,
                      strict_ddp_mode=args.shardinit)
    optimizer = GeminiAdamOptimizer(model, lr=args.learning_rate, initial_scale=2**14, gpu_margin_mem_ratio=0.0)

    SEQ_LEN = 1024
    VOCAB_SIZE = 50257

    get_tflops_func = partial(get_tflops, numel, args.batch_size, SEQ_LEN)

    model.train()
    for step in range(args.max_train_steps):
        st_time = time.time()
        input_ids, attn_mask = get_data(args.batch_size, SEQ_LEN, VOCAB_SIZE)

        outputs = model(input_ids=input_ids, attention_mask=attn_mask, labels=input_ids, use_cache=False)
        loss = outputs['loss']
        optimizer.backward(loss)

        optimizer.step()
        optimizer.zero_grad()
        torch.cuda.synchronize()
        step_time = time.time() - st_time
        step_tflops = get_tflops_func(step_time)

        logger.info("step {} finished, Tflops {}".format(step, step_tflops), ranks=[0])

    logger.info("Training finished", ranks=[0])


if __name__ == "__main__":
    main()