set benchmarks as a git submodule (#156)

* remove benchmark folder

* add benchmark submodule

* update .gitmodules
Frank Lee 2022-01-18 15:48:07 +08:00 committed by GitHub
parent f3802d6b06
commit c7b8ece736
28 changed files with 5 additions and 1240 deletions

.gitmodules vendored Normal file

@@ -0,0 +1,4 @@
[submodule "benchmark"]
path = benchmark
url = https://github.com/hpcaitech/ColossalAI-Benchmark.git
branch = main

benchmark Submodule

@@ -0,0 +1 @@
Subproject commit c319bc2ee9db32aba4a522eccdf89e8d0fb8d9f0
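Once this commit is checked out, running `git submodule update --init` fetches the ColossalAI-Benchmark repository at the commit pinned above into the `benchmark/` directory.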


@@ -1,66 +0,0 @@
# Benchmark for Tuning Accuracy and Efficiency
## Overview
The benchmark collects our efforts in using Colossal-AI to train models on different tasks to achieve SOTA results.
We are interested in both validation accuracy and training speed, and prefer larger batch sizes to take advantage of more GPU devices.
For example, we trained the vision transformer with a batch size of 512 on CIFAR10 and 4096 on ImageNet1k, batch sizes that are rarely used in existing works.
Some of the results obtained with 8x A100 GPUs are shown below.
| Task | Model | Training Time | Top-1 Accuracy |
| ---------- | ------------ | ------------- | -------------- |
| CIFAR10 | [ViT-Lite-7/4](https://arxiv.org/pdf/2104.05704.pdf) | ~ 16 min | ~ 90.5% |
| ImageNet1k | ViT-S/16 | ~ 16.5 h | ~ 74.5% |
The `train.py` script in each task directory runs training with a specific configuration script from `configs/` for each parallelism.
Supported parallelisms include data parallelism only (config names ending with `vanilla`), as well as 1D (`1d`), 2D (`2d`), 2.5D (`2p5d`) and 3D (`3d`) tensor parallelism.
Each configuration script includes the following elements, taking the ImageNet1k task as an example:
```python
from colossalai.amp import AMP_TYPE

TOTAL_BATCH_SIZE = 4096
LEARNING_RATE = 3e-3
WEIGHT_DECAY = 0.3
NUM_EPOCHS = 300
WARMUP_EPOCHS = 32
# data parallel only
TENSOR_PARALLEL_SIZE = 1
TENSOR_PARALLEL_MODE = None
# parallelism setting
parallel = dict(
pipeline=1,
tensor=dict(mode=TENSOR_PARALLEL_MODE, size=TENSOR_PARALLEL_SIZE),
)
fp16 = dict(mode=AMP_TYPE.TORCH, ) # amp setting
gradient_accumulation = 2 # accumulate 2 steps for gradient update
BATCH_SIZE = TOTAL_BATCH_SIZE // gradient_accumulation # actual batch size for dataloader
clip_grad_norm = 1.0 # clip gradient with norm 1.0
```
Upper-case elements are what `train.py` itself reads, while lower-case elements are what Colossal-AI reads to initialize the training (parallelism, AMP, gradient accumulation and gradient clipping).
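For instance, a minimal arithmetic sketch of how these batch-size values combine, assuming 8 data-parallel GPUs (as in the 8x A100 results above) with no tensor parallelism:
```python
# Sketch of the effective batch sizes in the ImageNet1k example above.
# The per-GPU value is what the training scripts pass to their dataloaders via
# gpc.config.BATCH_SIZE // gpc.data_parallel_size.
TOTAL_BATCH_SIZE = 4096           # samples contributing to one optimizer update
gradient_accumulation = 2         # accumulate gradients over 2 steps per update
BATCH_SIZE = TOTAL_BATCH_SIZE // gradient_accumulation  # global dataloader batch per step

data_parallel_size = 8            # assumption: 8 GPUs, TENSOR_PARALLEL_SIZE = 1
per_gpu_batch_size = BATCH_SIZE // data_parallel_size   # 256 samples per GPU per step

assert per_gpu_batch_size * data_parallel_size * gradient_accumulation == TOTAL_BATCH_SIZE
```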
## Usage
To start training, use the following command to run each worker:
```bash
$ DATA=/path/to/dataset python train.py --world_size=WORLD_SIZE \
--rank=RANK \
--local_rank=LOCAL_RANK \
--host=MASTER_IP_ADDRESS \
--port=MASTER_PORT \
--config=CONFIG_FILE
```
Alternatively (and this is the recommended way), start training with `torchrun`:
```bash
$ DATA=/path/to/dataset torchrun --nproc_per_node=NUM_GPUS_PER_NODE \
--nnodes=NUM_NODES \
--node_rank=NODE_RANK \
--master_addr=MASTER_IP_ADDRESS \
--master_port=MASTER_PORT \
train.py --config=CONFIG_FILE
```
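Inside `train.py`, these two launch modes correspond to two Colossal-AI entry points; a minimal sketch mirroring the pattern used in the task scripts is:
```python
import colossalai

# the default parser provides --config plus the --world_size/--rank/--local_rank/--host/--port flags
args = colossalai.get_default_parser().parse_args()

# per-worker launch: ranks and master address are passed explicitly on the command line
# colossalai.launch(config=args.config, rank=args.rank, world_size=args.world_size,
#                   local_rank=args.local_rank, host=args.host, port=args.port)

# torchrun launch: ranks and master address are read from the environment variables set by torchrun
colossalai.launch_from_torch(config=args.config)
```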


@@ -1,18 +0,0 @@
BATCH_SIZE = 512
LEARNING_RATE = 2e-3
WEIGHT_DECAY = 3e-2
TENSOR_PARALLEL_SIZE = 2
TENSOR_PARALLEL_MODE = '1d'
NUM_EPOCHS = 200
WARMUP_EPOCHS = 40
parallel = dict(
pipeline=1,
tensor=dict(mode=TENSOR_PARALLEL_MODE, size=TENSOR_PARALLEL_SIZE),
)
seed = 42
LOG_PATH = f"./vit_{TENSOR_PARALLEL_MODE}_cifar10_tp{TENSOR_PARALLEL_SIZE}_bs{BATCH_SIZE}_lr{LEARNING_RATE}/"


@@ -1,18 +0,0 @@
BATCH_SIZE = 512
LEARNING_RATE = 2e-3
WEIGHT_DECAY = 3e-2
TENSOR_PARALLEL_SIZE = 4
TENSOR_PARALLEL_MODE = '2d'
NUM_EPOCHS = 200
WARMUP_EPOCHS = 40
parallel = dict(
pipeline=1,
tensor=dict(mode=TENSOR_PARALLEL_MODE, size=TENSOR_PARALLEL_SIZE),
)
seed = 42
LOG_PATH = f"./vit_{TENSOR_PARALLEL_MODE}_cifar10_tp{TENSOR_PARALLEL_SIZE}_bs{BATCH_SIZE}_lr{LEARNING_RATE}/"


@@ -1,19 +0,0 @@
BATCH_SIZE = 512
LEARNING_RATE = 2e-3
WEIGHT_DECAY = 3e-2
TENSOR_PARALLEL_SIZE = 4
DEPTH = 1
TENSOR_PARALLEL_MODE = '2.5d'
NUM_EPOCHS = 200
WARMUP_EPOCHS = 40
parallel = dict(
pipeline=1,
tensor=dict(mode=TENSOR_PARALLEL_MODE, size=TENSOR_PARALLEL_SIZE, depth=DEPTH),
)
seed = 42
LOG_PATH = f"./vit_{TENSOR_PARALLEL_MODE}_cifar10_tp{TENSOR_PARALLEL_SIZE}_bs{BATCH_SIZE}_lr{LEARNING_RATE}/"


@@ -1,18 +0,0 @@
BATCH_SIZE = 512
LEARNING_RATE = 2e-3
WEIGHT_DECAY = 3e-2
TENSOR_PARALLEL_SIZE = 8
TENSOR_PARALLEL_MODE = '3d'
NUM_EPOCHS = 200
WARMUP_EPOCHS = 40
parallel = dict(
pipeline=1,
tensor=dict(mode=TENSOR_PARALLEL_MODE, size=TENSOR_PARALLEL_SIZE),
)
seed = 42
LOG_PATH = f"./vit_{TENSOR_PARALLEL_MODE}_cifar10_tp{TENSOR_PARALLEL_SIZE}_bs{BATCH_SIZE}_lr{LEARNING_RATE}/"


@@ -1,18 +0,0 @@
BATCH_SIZE = 512
LEARNING_RATE = 2e-3
WEIGHT_DECAY = 3e-2
TENSOR_PARALLEL_SIZE = 1
TENSOR_PARALLEL_MODE = None
NUM_EPOCHS = 200
WARMUP_EPOCHS = 40
parallel = dict(
pipeline=1,
tensor=dict(mode=TENSOR_PARALLEL_MODE, size=TENSOR_PARALLEL_SIZE),
)
seed = 42
LOG_PATH = f"./vit_{TENSOR_PARALLEL_MODE}_cifar10_tp{TENSOR_PARALLEL_SIZE}_bs{BATCH_SIZE}_lr{LEARNING_RATE}/"


@@ -1,119 +0,0 @@
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
import os
import colossalai
import torch
import torchvision
from colossalai.builder import *
from colossalai.core import global_context as gpc
from colossalai.logging import get_dist_logger
from colossalai.nn import Accuracy, CrossEntropyLoss
from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR
from colossalai.trainer import Trainer, hooks
from colossalai.utils import MultiTimer, get_dataloader
from model_zoo.vit import vit_lite_depth7_patch4_32
from torchvision import transforms
DATASET_PATH = str(os.environ['DATA'])
def build_cifar(batch_size):
transform_train = transforms.Compose([
transforms.RandomCrop(32, padding=4),
transforms.AutoAugment(policy=transforms.AutoAugmentPolicy.CIFAR10),
transforms.ToTensor(),
transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
])
transform_test = transforms.Compose([
transforms.Resize(32),
transforms.ToTensor(),
transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
])
train_dataset = torchvision.datasets.CIFAR10(root=DATASET_PATH,
train=True,
download=True,
transform=transform_train)
test_dataset = torchvision.datasets.CIFAR10(root=DATASET_PATH, train=False, transform=transform_test)
train_dataloader = get_dataloader(dataset=train_dataset,
shuffle=True,
batch_size=batch_size,
num_workers=4,
pin_memory=True)
test_dataloader = get_dataloader(dataset=test_dataset, batch_size=batch_size, num_workers=4, pin_memory=True)
return train_dataloader, test_dataloader
def train_cifar():
args = colossalai.get_default_parser().parse_args()
# standard launch
# colossalai.launch(config=args.config,
# rank=args.rank,
# world_size=args.world_size,
# local_rank=args.local_rank,
# host=args.host,
# port=args.port)
# launch from torchrun
colossalai.launch_from_torch(config=args.config)
logger = get_dist_logger()
if hasattr(gpc.config, 'LOG_PATH'):
if gpc.get_global_rank() == 0:
log_path = gpc.config.LOG_PATH
if not os.path.exists(log_path):
os.mkdir(log_path)
logger.log_to_file(log_path)
model = vit_lite_depth7_patch4_32()
train_dataloader, test_dataloader = build_cifar(gpc.config.BATCH_SIZE // gpc.data_parallel_size)
criterion = CrossEntropyLoss(label_smoothing=0.1)
optimizer = torch.optim.AdamW(model.parameters(), lr=gpc.config.LEARNING_RATE, weight_decay=gpc.config.WEIGHT_DECAY)
steps_per_epoch = len(train_dataloader)
lr_scheduler = CosineAnnealingWarmupLR(optimizer=optimizer,
total_steps=gpc.config.NUM_EPOCHS * steps_per_epoch,
warmup_steps=gpc.config.WARMUP_EPOCHS * steps_per_epoch)
engine, train_dataloader, test_dataloader, lr_scheduler = colossalai.initialize(model=model,
optimizer=optimizer,
criterion=criterion,
train_dataloader=train_dataloader,
test_dataloader=test_dataloader,
lr_scheduler=lr_scheduler)
logger.info("Engine is built", ranks=[0])
timer = MultiTimer()
trainer = Trainer(engine=engine, logger=logger, timer=timer)
logger.info("Trainer is built", ranks=[0])
hook_list = [
hooks.LogMetricByEpochHook(logger=logger),
hooks.LogMetricByStepHook(),
# hooks.LogTimingByEpochHook(timer=timer, logger=logger),
# hooks.LogMemoryByEpochHook(logger=logger),
hooks.AccuracyHook(accuracy_func=Accuracy()),
hooks.LossHook(),
hooks.ThroughputHook(),
hooks.LRSchedulerHook(lr_scheduler=lr_scheduler, by_epoch=False)
]
logger.info("Train start", ranks=[0])
trainer.fit(train_dataloader=train_dataloader,
test_dataloader=test_dataloader,
epochs=gpc.config.NUM_EPOCHS,
hooks=hook_list,
display_progress=True,
test_interval=1)
if __name__ == '__main__':
train_cifar()


@@ -1,29 +0,0 @@
from colossalai.amp import AMP_TYPE
VOCAB_SIZE = 50304
SEQ_LENGTH = 1024
TOTAL_BATCH_SIZE = 256
LEARNING_RATE = 0.00015
WEIGHT_DECAY = 1e-2
TENSOR_PARALLEL_SIZE = 2
TENSOR_PARALLEL_MODE = '1d'
NUM_EPOCHS = 60
WARMUP_EPOCHS = int(NUM_EPOCHS * 0.36)
parallel = dict(
pipeline=1,
tensor=dict(mode=TENSOR_PARALLEL_MODE, size=TENSOR_PARALLEL_SIZE),
)
fp16 = dict(mode=AMP_TYPE.TORCH, )
gradient_accumulation = 2
BATCH_SIZE = TOTAL_BATCH_SIZE // gradient_accumulation
clip_grad_norm = 1.0
LOG_PATH = f"./gpt2_{TENSOR_PARALLEL_MODE}_tp{TENSOR_PARALLEL_SIZE}_bs{BATCH_SIZE}_lr{LEARNING_RATE}_accum{gradient_accumulation}_clip_grad{clip_grad_norm}/"


@@ -1,29 +0,0 @@
from colossalai.amp import AMP_TYPE
VOCAB_SIZE = 50304
SEQ_LENGTH = 1024
TOTAL_BATCH_SIZE = 256
LEARNING_RATE = 0.00015
WEIGHT_DECAY = 1e-2
TENSOR_PARALLEL_SIZE = 4
TENSOR_PARALLEL_MODE = '2d'
NUM_EPOCHS = 60
WARMUP_EPOCHS = int(NUM_EPOCHS * 0.36)
parallel = dict(
pipeline=1,
tensor=dict(mode=TENSOR_PARALLEL_MODE, size=TENSOR_PARALLEL_SIZE),
)
fp16 = dict(mode=AMP_TYPE.TORCH, )
gradient_accumulation = 1
BATCH_SIZE = TOTAL_BATCH_SIZE // gradient_accumulation
clip_grad_norm = 1.0
LOG_PATH = f"./gpt2_{TENSOR_PARALLEL_MODE}_tp{TENSOR_PARALLEL_SIZE}_bs{BATCH_SIZE}_lr{LEARNING_RATE}_accum{gradient_accumulation}_clip_grad{clip_grad_norm}/"


@@ -1,30 +0,0 @@
from colossalai.amp import AMP_TYPE
VOCAB_SIZE = 50304
SEQ_LENGTH = 1024
TOTAL_BATCH_SIZE = 256
LEARNING_RATE = 0.00015
WEIGHT_DECAY = 1e-2
TENSOR_PARALLEL_SIZE = 4
DEPTH = 1
TENSOR_PARALLEL_MODE = '2.5d'
NUM_EPOCHS = 60
WARMUP_EPOCHS = int(NUM_EPOCHS * 0.36)
parallel = dict(
pipeline=1,
tensor=dict(mode=TENSOR_PARALLEL_MODE, size=TENSOR_PARALLEL_SIZE, depth=DEPTH),
)
fp16 = dict(mode=AMP_TYPE.TORCH, )
gradient_accumulation = 1
BATCH_SIZE = TOTAL_BATCH_SIZE // gradient_accumulation
clip_grad_norm = 1.0
LOG_PATH = f"./gpt2_{TENSOR_PARALLEL_MODE}_tp{TENSOR_PARALLEL_SIZE}_bs{BATCH_SIZE}_lr{LEARNING_RATE}_accum{gradient_accumulation}_clip_grad{clip_grad_norm}/"


@@ -1,29 +0,0 @@
from colossalai.amp import AMP_TYPE
VOCAB_SIZE = 50304
SEQ_LENGTH = 1024
TOTAL_BATCH_SIZE = 256
LEARNING_RATE = 0.00015
WEIGHT_DECAY = 1e-2
TENSOR_PARALLEL_SIZE = 8
TENSOR_PARALLEL_MODE = '3d'
NUM_EPOCHS = 60
WARMUP_EPOCHS = int(NUM_EPOCHS * 0.36)
parallel = dict(
pipeline=1,
tensor=dict(mode=TENSOR_PARALLEL_MODE, size=TENSOR_PARALLEL_SIZE),
)
fp16 = dict(mode=AMP_TYPE.TORCH, )
gradient_accumulation = 1
BATCH_SIZE = TOTAL_BATCH_SIZE // gradient_accumulation
clip_grad_norm = 1.0
LOG_PATH = f"./gpt2_{TENSOR_PARALLEL_MODE}_tp{TENSOR_PARALLEL_SIZE}_bs{BATCH_SIZE}_lr{LEARNING_RATE}_accum{gradient_accumulation}_clip_grad{clip_grad_norm}/"


@@ -1,29 +0,0 @@
from colossalai.amp import AMP_TYPE
VOCAB_SIZE = 50304
SEQ_LENGTH = 1024
TOTAL_BATCH_SIZE = 256
LEARNING_RATE = 0.00015
WEIGHT_DECAY = 1e-2
TENSOR_PARALLEL_SIZE = 1
TENSOR_PARALLEL_MODE = None
NUM_EPOCHS = 60
WARMUP_EPOCHS = int(NUM_EPOCHS * 0.36)
parallel = dict(
pipeline=1,
tensor=dict(mode=TENSOR_PARALLEL_MODE, size=TENSOR_PARALLEL_SIZE),
)
fp16 = dict(mode=AMP_TYPE.TORCH, )
gradient_accumulation = 1
BATCH_SIZE = TOTAL_BATCH_SIZE // gradient_accumulation
clip_grad_norm = 1.0
LOG_PATH = f"./gpt2_{TENSOR_PARALLEL_MODE}_tp{TENSOR_PARALLEL_SIZE}_bs{BATCH_SIZE}_lr{LEARNING_RATE}_accum{gradient_accumulation}_clip_grad{clip_grad_norm}/"


@@ -1,37 +0,0 @@
import json
import os
import torch
from colossalai.registry import DATASETS
from torch.utils.data import Dataset
from transformers import GPT2Tokenizer
@DATASETS.register_module
class WebtextDataset(Dataset):
def __init__(self, path, seq_len=1024) -> None:
super().__init__()
root = os.path.dirname(path)
encoded_data_cache_path = os.path.join(root, f'gpt_webtext_{seq_len}.pt')
if os.path.isfile(encoded_data_cache_path):
seq_len_, data, attention_mask = torch.load(encoded_data_cache_path)
if seq_len_ == seq_len:
self.data = data
self.attention_mask = attention_mask
return
raw_data = []
with open(path) as f:
for line in f.readlines():
raw_data.append(json.loads(line)['text'])
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.unk_token
encoded_data = tokenizer(raw_data, padding=True, truncation=True, max_length=seq_len, return_tensors='pt')
self.data = encoded_data['input_ids']
self.attention_mask = encoded_data['attention_mask']
torch.save((seq_len, self.data, self.attention_mask), encoded_data_cache_path)
def __len__(self):
return len(self.data)
def __getitem__(self, index):
return (self.data[index], self.attention_mask[index]), self.data[index]


@@ -1,105 +0,0 @@
import contextlib
import os
import colossalai
import torch
from colossalai.core import global_context as gpc
from colossalai.engine.schedule import (InterleavedPipelineSchedule, PipelineSchedule)
from colossalai.logging import get_dist_logger
from colossalai.nn import CosineAnnealingWarmupLR
from colossalai.trainer import Trainer, hooks
from colossalai.utils import MultiTimer, get_dataloader
from colossalai.zero import zero3_model_context
from model_zoo.gpt import GPTLMLoss, gpt2_small, gpt2_medium, gpt2_large, gpt2_xl
from data import WebtextDataset
def train_gpt():
args = colossalai.get_default_parser().parse_args()
# standard launch
# colossalai.launch(config=args.config,
# rank=args.rank,
# world_size=args.world_size,
# local_rank=args.local_rank,
# host=args.host,
# port=args.port)
# launch from torchrun
colossalai.launch_from_torch(config=args.config)
logger = get_dist_logger()
if hasattr(gpc.config, 'LOG_PATH'):
if gpc.get_global_rank() == 0:
log_path = gpc.config.LOG_PATH
if not os.path.exists(log_path):
os.mkdir(log_path)
logger.log_to_file(log_path)
train_dataset = WebtextDataset(os.environ['DATA'], seq_len=gpc.config.SEQ_LENGTH)
train_dataloader = get_dataloader(train_dataset,
seed=42,
batch_size=gpc.config.BATCH_SIZE // gpc.data_parallel_size,
pin_memory=True,
shuffle=True,
drop_last=True)
logger.info(f'Loaded {len(train_dataset)}/{len(train_dataloader)} samples/batches', ranks=[0])
# zero3 under test
# use_zero3 = hasattr(gpc.config, 'zero') and gpc.config.zero.level == 3
# cm = zero3_model_context() if use_zero3 else contextlib.nullcontext()
# with cm:
# model = gpc.config.model.pop('type')(**gpc.config.model)
model = gpt2_medium(vocab_size=gpc.config.VOCAB_SIZE,
max_position_embeddings=gpc.config.SEQ_LENGTH,
checkpoint=True)
criterion = GPTLMLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.00015, weight_decay=1e-2)
steps_per_epoch = len(train_dataloader) // gpc.config.gradient_accumulation
lr_scheduler = CosineAnnealingWarmupLR(optimizer=optimizer,
total_steps=gpc.config.NUM_EPOCHS * steps_per_epoch,
warmup_steps=gpc.config.WARMUP_EPOCHS * steps_per_epoch,
eta_min=1e-5)
engine, train_dataloader, _, lr_scheduler = colossalai.initialize(model=model,
optimizer=optimizer,
criterion=criterion,
train_dataloader=train_dataloader,
lr_scheduler=lr_scheduler)
# pipeline under test
# num_model_chunks = getattr(gpc.config.model, 'num_chunks', 1)
# if num_model_chunks > 1:
# logger.info('Build InterleavedPipelineSchedule', ranks=[0])
# schedule = InterleavedPipelineSchedule(gpc.config.NUM_MICRO_BATCHES, num_model_chunks)
# else:
# logger.info('Build PipelineSchedule', ranks=[0])
# schedule = PipelineSchedule(gpc.config.NUM_MICRO_BATCHES)
timer = MultiTimer()
trainer = Trainer(engine=engine, logger=logger, timer=timer)
hook_list = [
hooks.LogMetricByEpochHook(logger=logger),
hooks.LogMetricByStepHook(),
hooks.LossHook(),
hooks.ThroughputHook(),
hooks.LRSchedulerHook(lr_scheduler=lr_scheduler, by_epoch=False),
# hooks.TensorboardHook(log_dir='./tb_logs', ranks=[0]),
# hooks.LogMemoryByEpochHook(logger),
# hooks.LogTimingByEpochHook(timer, logger),
# hooks.SaveCheckpointHook(checkpoint_dir='./ckpt')
]
logger.info("Training start", ranks=[0])
trainer.fit(train_dataloader=train_dataloader, epochs=gpc.config.NUM_EPOCHS, hooks=hook_list, display_progress=True)
if __name__ == '__main__':
train_gpt()


@@ -1,26 +0,0 @@
from colossalai.amp import AMP_TYPE
TOTAL_BATCH_SIZE = 4096
LEARNING_RATE = 3e-3
WEIGHT_DECAY = 0.3
TENSOR_PARALLEL_SIZE = 2
TENSOR_PARALLEL_MODE = '1d'
NUM_EPOCHS = 300
WARMUP_EPOCHS = 32
parallel = dict(
pipeline=1,
tensor=dict(mode=TENSOR_PARALLEL_MODE, size=TENSOR_PARALLEL_SIZE),
)
fp16 = dict(mode=AMP_TYPE.TORCH, )
gradient_accumulation = 2
BATCH_SIZE = TOTAL_BATCH_SIZE // gradient_accumulation
clip_grad_norm = 1.0
LOG_PATH = f"./vit_{TENSOR_PARALLEL_MODE}_imagenet100_tp{TENSOR_PARALLEL_SIZE}_bs{BATCH_SIZE}_lr{LEARNING_RATE}_{fp16['mode']}_clip_grad{clip_grad_norm}/"


@@ -1,26 +0,0 @@
from colossalai.amp import AMP_TYPE
TOTAL_BATCH_SIZE = 4096
LEARNING_RATE = 3e-3
WEIGHT_DECAY = 0.3
TENSOR_PARALLEL_SIZE = 4
TENSOR_PARALLEL_MODE = '2d'
NUM_EPOCHS = 300
WARMUP_EPOCHS = 32
parallel = dict(
pipeline=1,
tensor=dict(mode=TENSOR_PARALLEL_MODE, size=TENSOR_PARALLEL_SIZE),
)
fp16 = dict(mode=AMP_TYPE.TORCH, )
gradient_accumulation = 2
BATCH_SIZE = TOTAL_BATCH_SIZE // gradient_accumulation
clip_grad_norm = 1.0
LOG_PATH = f"./vit_{TENSOR_PARALLEL_MODE}_imagenet100_tp{TENSOR_PARALLEL_SIZE}_bs{BATCH_SIZE}_lr{LEARNING_RATE}_{fp16['mode']}_clip_grad{clip_grad_norm}/"


@@ -1,27 +0,0 @@
from colossalai.amp import AMP_TYPE
TOTAL_BATCH_SIZE = 4096
LEARNING_RATE = 3e-3
WEIGHT_DECAY = 0.3
TENSOR_PARALLEL_SIZE = 4
DEPTH = 1
TENSOR_PARALLEL_MODE = '2.5d'
NUM_EPOCHS = 300
WARMUP_EPOCHS = 32
parallel = dict(
pipeline=1,
tensor=dict(mode=TENSOR_PARALLEL_MODE, size=TENSOR_PARALLEL_SIZE, depth=DEPTH),
)
fp16 = dict(mode=AMP_TYPE.TORCH, )
gradient_accumulation = 2
BATCH_SIZE = TOTAL_BATCH_SIZE // gradient_accumulation
clip_grad_norm = 1.0
LOG_PATH = f"./vit_{TENSOR_PARALLEL_MODE}_imagenet100_tp{TENSOR_PARALLEL_SIZE}_bs{BATCH_SIZE}_lr{LEARNING_RATE}_{fp16['mode']}_clip_grad{clip_grad_norm}/"


@@ -1,26 +0,0 @@
from colossalai.amp import AMP_TYPE
TOTAL_BATCH_SIZE = 4096
LEARNING_RATE = 3e-3
WEIGHT_DECAY = 0.3
TENSOR_PARALLEL_SIZE = 8
TENSOR_PARALLEL_MODE = '3d'
NUM_EPOCHS = 300
WARMUP_EPOCHS = 32
parallel = dict(
pipeline=1,
tensor=dict(mode=TENSOR_PARALLEL_MODE, size=TENSOR_PARALLEL_SIZE),
)
fp16 = dict(mode=AMP_TYPE.TORCH, )
gradient_accumulation = 2
BATCH_SIZE = TOTAL_BATCH_SIZE // gradient_accumulation
clip_grad_norm = 1.0
LOG_PATH = f"./vit_{TENSOR_PARALLEL_MODE}_imagenet100_tp{TENSOR_PARALLEL_SIZE}_bs{BATCH_SIZE}_lr{LEARNING_RATE}_{fp16['mode']}_clip_grad{clip_grad_norm}/"


@@ -1,26 +0,0 @@
from colossalai.amp import AMP_TYPE
TOTAL_BATCH_SIZE = 4096
LEARNING_RATE = 3e-3
WEIGHT_DECAY = 0.3
TENSOR_PARALLEL_SIZE = 1
TENSOR_PARALLEL_MODE = None
NUM_EPOCHS = 300
WARMUP_EPOCHS = 32
parallel = dict(
pipeline=1,
tensor=dict(mode=TENSOR_PARALLEL_MODE, size=TENSOR_PARALLEL_SIZE),
)
fp16 = dict(mode=AMP_TYPE.TORCH, )
gradient_accumulation = 2
BATCH_SIZE = TOTAL_BATCH_SIZE // gradient_accumulation
clip_grad_norm = 1.0
LOG_PATH = f"./vit_{TENSOR_PARALLEL_MODE}_imagenet100_tp{TENSOR_PARALLEL_SIZE}_bs{BATCH_SIZE}_lr{LEARNING_RATE}_{fp16['mode']}_clip_grad{clip_grad_norm}/"


@@ -1,207 +0,0 @@
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
import glob
import os
import colossalai
import nvidia.dali.fn as fn
import nvidia.dali.tfrecord as tfrec
import torch
from colossalai.builder import *
from colossalai.context import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.logging import get_dist_logger
from colossalai.nn import Accuracy, CrossEntropyLoss
from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR
from colossalai.trainer import Trainer, hooks
from colossalai.utils import MultiTimer
from model_zoo.vit import vit_small_patch16_224
from nvidia.dali import types
from nvidia.dali.pipeline import Pipeline
from nvidia.dali.plugin.pytorch import DALIClassificationIterator
DATASET_PATH = str(os.environ['DATA'])
TRAIN_RECS = DATASET_PATH + '/train/*'
VAL_RECS = DATASET_PATH + '/validation/*'
TRAIN_IDX = DATASET_PATH + '/idx_files/train/*'
VAL_IDX = DATASET_PATH + '/idx_files/validation/*'
class DaliDataloader(DALIClassificationIterator):
def __init__(self,
tfrec_filenames,
tfrec_idx_filenames,
shard_id=0,
num_shards=1,
batch_size=128,
num_threads=4,
resize=256,
crop=224,
prefetch=2,
training=True,
gpu_aug=False,
cuda=True):
pipe = Pipeline(batch_size=batch_size,
num_threads=num_threads,
device_id=torch.cuda.current_device() if cuda else None,
seed=1024)
with pipe:
inputs = fn.readers.tfrecord(path=tfrec_filenames,
index_path=tfrec_idx_filenames,
random_shuffle=training,
shard_id=shard_id,
num_shards=num_shards,
initial_fill=10000,
read_ahead=True,
prefetch_queue_depth=prefetch,
name='Reader',
features={
'image/encoded': tfrec.FixedLenFeature((), tfrec.string, ""),
'image/class/label': tfrec.FixedLenFeature([1], tfrec.int64, -1),
})
images = inputs["image/encoded"]
if training:
images = fn.decoders.image(images, device='mixed' if gpu_aug else 'cpu', output_type=types.RGB)
images = fn.random_resized_crop(images, size=crop, device='gpu' if gpu_aug else 'cpu')
flip_lr = fn.random.coin_flip(probability=0.5)
else:
# decode jpeg and resize
images = fn.decoders.image(images, device='mixed' if gpu_aug else 'cpu', output_type=types.RGB)
images = fn.resize(images,
device='gpu' if gpu_aug else 'cpu',
resize_x=resize,
resize_y=resize,
dtype=types.FLOAT,
interp_type=types.INTERP_TRIANGULAR)
flip_lr = False
# center crop and normalise
images = fn.crop_mirror_normalize(images,
dtype=types.FLOAT,
crop=(crop, crop),
mean=[127.5],
std=[127.5],
mirror=flip_lr)
label = inputs["image/class/label"] - 1 # 0-999
# LSG: element_extract will raise exception, let's flatten outside
# label = fn.element_extract(label, element_map=0) # Flatten
if cuda: # transfer data to gpu
pipe.set_outputs(images.gpu(), label.gpu())
else:
pipe.set_outputs(images, label)
pipe.build()
last_batch_policy = 'DROP' if training else 'PARTIAL'
super().__init__(pipe, reader_name="Reader", auto_reset=True, last_batch_policy=last_batch_policy)
def __iter__(self):
# if not reset (after an epoch), reset; if just initialize, ignore
if self._counter >= self._size or self._size < 0:
self.reset()
return self
def __next__(self):
data = super().__next__()
img, label = data[0]['data'], data[0]['label']
label = label.squeeze()
return (img, ), (label, )
def build_dali_train(batch_size):
return DaliDataloader(
sorted(glob.glob(TRAIN_RECS)),
sorted(glob.glob(TRAIN_IDX)),
batch_size=batch_size,
shard_id=gpc.get_local_rank(ParallelMode.DATA),
num_shards=gpc.get_world_size(ParallelMode.DATA),
training=True,
gpu_aug=True,
cuda=True,
)
def build_dali_test(batch_size):
return DaliDataloader(
sorted(glob.glob(VAL_RECS)),
sorted(glob.glob(VAL_IDX)),
batch_size=batch_size,
shard_id=gpc.get_local_rank(ParallelMode.DATA),
num_shards=gpc.get_world_size(ParallelMode.DATA),
training=False,
gpu_aug=True,
cuda=True,
)
def train_imagenet():
args = colossalai.get_default_parser().parse_args()
# standard launch
# colossalai.launch(config=args.config,
# rank=args.rank,
# world_size=args.world_size,
# local_rank=args.local_rank,
# host=args.host,
# port=args.port)
# launch from torchrun
colossalai.launch_from_torch(config=args.config)
logger = get_dist_logger()
if hasattr(gpc.config, 'LOG_PATH'):
if gpc.get_global_rank() == 0:
log_path = gpc.config.LOG_PATH
if not os.path.exists(log_path):
os.mkdir(log_path)
logger.log_to_file(log_path)
model = vit_small_patch16_224(num_classes=100, init_method='jax')
train_dataloader = build_dali_train(gpc.config.BATCH_SIZE // gpc.data_parallel_size)
test_dataloader = build_dali_test(gpc.config.BATCH_SIZE // gpc.data_parallel_size)
criterion = CrossEntropyLoss(label_smoothing=0.1)
optimizer = torch.optim.AdamW(model.parameters(), lr=gpc.config.LEARNING_RATE, weight_decay=gpc.config.WEIGHT_DECAY)
lr_scheduler = CosineAnnealingWarmupLR(optimizer=optimizer,
total_steps=gpc.config.NUM_EPOCHS,
warmup_steps=gpc.config.WARMUP_EPOCHS)
engine, train_dataloader, test_dataloader, _ = colossalai.initialize(model=model,
optimizer=optimizer,
criterion=criterion,
train_dataloader=train_dataloader,
test_dataloader=test_dataloader)
logger.info("Engine is built", ranks=[0])
timer = MultiTimer()
trainer = Trainer(engine=engine, logger=logger, timer=timer)
logger.info("Trainer is built", ranks=[0])
hook_list = [
hooks.LogMetricByEpochHook(logger=logger),
hooks.LogMetricByStepHook(),
# hooks.LogTimingByEpochHook(timer=timer, logger=logger),
# hooks.LogMemoryByEpochHook(logger=logger),
hooks.AccuracyHook(accuracy_func=Accuracy()),
hooks.LossHook(),
hooks.ThroughputHook(),
hooks.LRSchedulerHook(lr_scheduler=lr_scheduler, by_epoch=True)
]
logger.info("Train start", ranks=[0])
trainer.fit(train_dataloader=train_dataloader,
test_dataloader=test_dataloader,
epochs=gpc.config.NUM_EPOCHS,
hooks=hook_list,
display_progress=True,
test_interval=1)
if __name__ == '__main__':
train_imagenet()


@@ -1,26 +0,0 @@
from colossalai.amp import AMP_TYPE
TOTAL_BATCH_SIZE = 4096
LEARNING_RATE = 3e-3
WEIGHT_DECAY = 0.3
TENSOR_PARALLEL_SIZE = 2
TENSOR_PARALLEL_MODE = '1d'
NUM_EPOCHS = 300
WARMUP_EPOCHS = 32
parallel = dict(
pipeline=1,
tensor=dict(mode=TENSOR_PARALLEL_MODE, size=TENSOR_PARALLEL_SIZE),
)
fp16 = dict(mode=AMP_TYPE.TORCH, )
gradient_accumulation = 2
BATCH_SIZE = TOTAL_BATCH_SIZE // gradient_accumulation
clip_grad_norm = 1.0
LOG_PATH = f"./vit_{TENSOR_PARALLEL_MODE}_imagenet1k_tp{TENSOR_PARALLEL_SIZE}_bs{BATCH_SIZE}_lr{LEARNING_RATE}_{fp16['mode']}_clip_grad{clip_grad_norm}/"


@@ -1,26 +0,0 @@
from colossalai.amp import AMP_TYPE
TOTAL_BATCH_SIZE = 4096
LEARNING_RATE = 3e-3
WEIGHT_DECAY = 0.3
TENSOR_PARALLEL_SIZE = 4
TENSOR_PARALLEL_MODE = '2d'
NUM_EPOCHS = 300
WARMUP_EPOCHS = 32
parallel = dict(
pipeline=1,
tensor=dict(mode=TENSOR_PARALLEL_MODE, size=TENSOR_PARALLEL_SIZE),
)
fp16 = dict(mode=AMP_TYPE.TORCH, )
gradient_accumulation = 2
BATCH_SIZE = TOTAL_BATCH_SIZE // gradient_accumulation
clip_grad_norm = 1.0
LOG_PATH = f"./vit_{TENSOR_PARALLEL_MODE}_imagenet1k_tp{TENSOR_PARALLEL_SIZE}_bs{BATCH_SIZE}_lr{LEARNING_RATE}_{fp16['mode']}_clip_grad{clip_grad_norm}/"


@@ -1,27 +0,0 @@
from colossalai.amp import AMP_TYPE
TOTAL_BATCH_SIZE = 4096
LEARNING_RATE = 3e-3
WEIGHT_DECAY = 0.3
TENSOR_PARALLEL_SIZE = 4
DEPTH = 1
TENSOR_PARALLEL_MODE = '2.5d'
NUM_EPOCHS = 300
WARMUP_EPOCHS = 32
parallel = dict(
pipeline=1,
tensor=dict(mode=TENSOR_PARALLEL_MODE, size=TENSOR_PARALLEL_SIZE, depth=DEPTH),
)
fp16 = dict(mode=AMP_TYPE.TORCH, )
gradient_accumulation = 2
BATCH_SIZE = TOTAL_BATCH_SIZE // gradient_accumulation
clip_grad_norm = 1.0
LOG_PATH = f"./vit_{TENSOR_PARALLEL_MODE}_imagenet1k_tp{TENSOR_PARALLEL_SIZE}_bs{BATCH_SIZE}_lr{LEARNING_RATE}_{fp16['mode']}_clip_grad{clip_grad_norm}/"


@@ -1,26 +0,0 @@
from colossalai.amp import AMP_TYPE
TOTAL_BATCH_SIZE = 4096
LEARNING_RATE = 3e-3
WEIGHT_DECAY = 0.3
TENSOR_PARALLEL_SIZE = 8
TENSOR_PARALLEL_MODE = '3d'
NUM_EPOCHS = 300
WARMUP_EPOCHS = 32
parallel = dict(
pipeline=1,
tensor=dict(mode=TENSOR_PARALLEL_MODE, size=TENSOR_PARALLEL_SIZE),
)
fp16 = dict(mode=AMP_TYPE.TORCH, )
gradient_accumulation = 2
BATCH_SIZE = TOTAL_BATCH_SIZE // gradient_accumulation
clip_grad_norm = 1.0
LOG_PATH = f"./vit_{TENSOR_PARALLEL_MODE}_imagenet1k_tp{TENSOR_PARALLEL_SIZE}_bs{BATCH_SIZE}_lr{LEARNING_RATE}_{fp16['mode']}_clip_grad{clip_grad_norm}/"


@@ -1,26 +0,0 @@
from colossalai.amp import AMP_TYPE
TOTAL_BATCH_SIZE = 4096
LEARNING_RATE = 3e-3
WEIGHT_DECAY = 0.3
TENSOR_PARALLEL_SIZE = 1
TENSOR_PARALLEL_MODE = None
NUM_EPOCHS = 300
WARMUP_EPOCHS = 32
parallel = dict(
pipeline=1,
tensor=dict(mode=TENSOR_PARALLEL_MODE, size=TENSOR_PARALLEL_SIZE),
)
fp16 = dict(mode=AMP_TYPE.TORCH, )
gradient_accumulation = 2
BATCH_SIZE = TOTAL_BATCH_SIZE // gradient_accumulation
clip_grad_norm = 1.0
LOG_PATH = f"./vit_{TENSOR_PARALLEL_MODE}_imagenet1k_tp{TENSOR_PARALLEL_SIZE}_bs{BATCH_SIZE}_lr{LEARNING_RATE}_{fp16['mode']}_clip_grad{clip_grad_norm}/"


@@ -1,207 +0,0 @@
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
import glob
import os
import colossalai
import nvidia.dali.fn as fn
import nvidia.dali.tfrecord as tfrec
import torch
from colossalai.builder import *
from colossalai.context import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.logging import get_dist_logger
from colossalai.nn import Accuracy, CrossEntropyLoss
from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR
from colossalai.trainer import Trainer, hooks
from colossalai.utils import MultiTimer
from model_zoo.vit import vit_small_patch16_224
from nvidia.dali import types
from nvidia.dali.pipeline import Pipeline
from nvidia.dali.plugin.pytorch import DALIClassificationIterator
DATASET_PATH = str(os.environ['DATA'])
TRAIN_RECS = DATASET_PATH + '/train/*'
VAL_RECS = DATASET_PATH + '/validation/*'
TRAIN_IDX = DATASET_PATH + '/idx_files/train/*'
VAL_IDX = DATASET_PATH + '/idx_files/validation/*'
class DaliDataloader(DALIClassificationIterator):
def __init__(self,
tfrec_filenames,
tfrec_idx_filenames,
shard_id=0,
num_shards=1,
batch_size=128,
num_threads=4,
resize=256,
crop=224,
prefetch=2,
training=True,
gpu_aug=False,
cuda=True):
pipe = Pipeline(batch_size=batch_size,
num_threads=num_threads,
device_id=torch.cuda.current_device() if cuda else None,
seed=1024)
with pipe:
inputs = fn.readers.tfrecord(path=tfrec_filenames,
index_path=tfrec_idx_filenames,
random_shuffle=training,
shard_id=shard_id,
num_shards=num_shards,
initial_fill=10000,
read_ahead=True,
prefetch_queue_depth=prefetch,
name='Reader',
features={
'image/encoded': tfrec.FixedLenFeature((), tfrec.string, ""),
'image/class/label': tfrec.FixedLenFeature([1], tfrec.int64, -1),
})
images = inputs["image/encoded"]
if training:
images = fn.decoders.image(images, device='mixed' if gpu_aug else 'cpu', output_type=types.RGB)
images = fn.random_resized_crop(images, size=crop, device='gpu' if gpu_aug else 'cpu')
flip_lr = fn.random.coin_flip(probability=0.5)
else:
# decode jpeg and resize
images = fn.decoders.image(images, device='mixed' if gpu_aug else 'cpu', output_type=types.RGB)
images = fn.resize(images,
device='gpu' if gpu_aug else 'cpu',
resize_x=resize,
resize_y=resize,
dtype=types.FLOAT,
interp_type=types.INTERP_TRIANGULAR)
flip_lr = False
# center crop and normalise
images = fn.crop_mirror_normalize(images,
dtype=types.FLOAT,
crop=(crop, crop),
mean=[127.5],
std=[127.5],
mirror=flip_lr)
label = inputs["image/class/label"] - 1 # 0-999
# LSG: element_extract will raise exception, let's flatten outside
# label = fn.element_extract(label, element_map=0) # Flatten
if cuda: # transfer data to gpu
pipe.set_outputs(images.gpu(), label.gpu())
else:
pipe.set_outputs(images, label)
pipe.build()
last_batch_policy = 'DROP' if training else 'PARTIAL'
super().__init__(pipe, reader_name="Reader", auto_reset=True, last_batch_policy=last_batch_policy)
def __iter__(self):
# if not reset (after an epoch), reset; if just initialize, ignore
if self._counter >= self._size or self._size < 0:
self.reset()
return self
def __next__(self):
data = super().__next__()
img, label = data[0]['data'], data[0]['label']
label = label.squeeze()
return (img, ), (label, )
def build_dali_train(batch_size):
return DaliDataloader(
sorted(glob.glob(TRAIN_RECS)),
sorted(glob.glob(TRAIN_IDX)),
batch_size=batch_size,
shard_id=gpc.get_local_rank(ParallelMode.DATA),
num_shards=gpc.get_world_size(ParallelMode.DATA),
training=True,
gpu_aug=True,
cuda=True,
)
def build_dali_test(batch_size):
return DaliDataloader(
sorted(glob.glob(VAL_RECS)),
sorted(glob.glob(VAL_IDX)),
batch_size=batch_size,
shard_id=gpc.get_local_rank(ParallelMode.DATA),
num_shards=gpc.get_world_size(ParallelMode.DATA),
training=False,
gpu_aug=True,
cuda=True,
)
def train_imagenet():
args = colossalai.get_default_parser().parse_args()
# standard launch
# colossalai.launch(config=args.config,
# rank=args.rank,
# world_size=args.world_size,
# local_rank=args.local_rank,
# host=args.host,
# port=args.port)
# launch from torchrun
colossalai.launch_from_torch(config=args.config)
logger = get_dist_logger()
if hasattr(gpc.config, 'LOG_PATH'):
if gpc.get_global_rank() == 0:
log_path = gpc.config.LOG_PATH
if not os.path.exists(log_path):
os.mkdir(log_path)
logger.log_to_file(log_path)
model = vit_small_patch16_224(num_classes=1000, init_method='jax')
train_dataloader = build_dali_train(gpc.config.BATCH_SIZE // gpc.data_parallel_size)
test_dataloader = build_dali_test(gpc.config.BATCH_SIZE // gpc.data_parallel_size)
criterion = CrossEntropyLoss(label_smoothing=0.1)
optimizer = torch.optim.AdamW(model.parameters(), lr=gpc.config.LEARNING_RATE, weight_decay=gpc.config.WEIGHT_DECAY)
lr_scheduler = CosineAnnealingWarmupLR(optimizer=optimizer,
total_steps=gpc.config.NUM_EPOCHS,
warmup_steps=gpc.config.WARMUP_EPOCHS)
engine, train_dataloader, test_dataloader, _ = colossalai.initialize(model=model,
optimizer=optimizer,
criterion=criterion,
train_dataloader=train_dataloader,
test_dataloader=test_dataloader)
logger.info("Engine is built", ranks=[0])
timer = MultiTimer()
trainer = Trainer(engine=engine, logger=logger, timer=timer)
logger.info("Trainer is built", ranks=[0])
hook_list = [
hooks.LogMetricByEpochHook(logger=logger),
hooks.LogMetricByStepHook(),
# hooks.LogTimingByEpochHook(timer=timer, logger=logger),
# hooks.LogMemoryByEpochHook(logger=logger),
hooks.AccuracyHook(accuracy_func=Accuracy()),
hooks.LossHook(),
hooks.ThroughputHook(),
hooks.LRSchedulerHook(lr_scheduler=lr_scheduler, by_epoch=True)
]
logger.info("Train start", ranks=[0])
trainer.fit(train_dataloader=train_dataloader,
test_dataloader=test_dataloader,
epochs=gpc.config.NUM_EPOCHS,
hooks=hook_list,
display_progress=True,
test_interval=1)
if __name__ == '__main__':
train_imagenet()