diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 000000000..bfcd7fc60
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,4 @@
+[submodule "benchmark"]
+	path = benchmark
+	url = https://github.com/hpcaitech/ColossalAI-Benchmark.git
+	branch = main
diff --git a/benchmark b/benchmark
new file mode 160000
index 000000000..c319bc2ee
--- /dev/null
+++ b/benchmark
@@ -0,0 +1 @@
+Subproject commit c319bc2ee9db32aba4a522eccdf89e8d0fb8d9f0
diff --git a/benchmark/README.md b/benchmark/README.md
deleted file mode 100644
index eac6474d1..000000000
--- a/benchmark/README.md
+++ /dev/null
@@ -1,66 +0,0 @@
-# Benchmark for Tuning Accuracy and Efficiency
-
-## Overview
-
-This benchmark collects our efforts in using Colossal-AI to train models on different tasks to achieve SOTA results.
-We are interested in both validation accuracy and training speed, and prefer larger batch sizes to take advantage of more GPU devices.
-For example, we trained the vision transformer with batch size 512 on CIFAR10 and 4096 on ImageNet1k, batch sizes that are rarely used in existing works.
-Some of the benchmark results obtained with 8x A100 GPUs are shown below.
-
-| Task       | Model        | Training Time | Top-1 Accuracy |
-| ---------- | ------------ | ------------- | -------------- |
-| CIFAR10    | [ViT-Lite-7/4](https://arxiv.org/pdf/2104.05704.pdf) | ~ 16 min | ~ 90.5% |
-| ImageNet1k | ViT-S/16     | ~ 16.5 h      | ~ 74.5%        |
-
-The `train.py` script in each task directory runs training with the configuration scripts in `configs/`, one for each parallelism.
-Supported parallelisms include data parallelism only (config names ending with `vanilla`), 1D (`1d`), 2D (`2d`), 2.5D (`2p5d`), and 3D (`3d`).
-
-Each configuration script includes the following elements, taking the ImageNet1k task as an example:
-```
-TOTAL_BATCH_SIZE = 4096
-LEARNING_RATE = 3e-3
-WEIGHT_DECAY = 0.3
-
-NUM_EPOCHS = 300
-WARMUP_EPOCHS = 32
-
-# data parallel only
-TENSOR_PARALLEL_SIZE = 1
-TENSOR_PARALLEL_MODE = None
-
-# parallelism setting
-parallel = dict(
-    pipeline=1,
-    tensor=dict(mode=TENSOR_PARALLEL_MODE, size=TENSOR_PARALLEL_SIZE),
-)
-
-fp16 = dict(mode=AMP_TYPE.TORCH, ) # amp setting
-
-gradient_accumulation = 2 # accumulate 2 steps for gradient update
-
-BATCH_SIZE = TOTAL_BATCH_SIZE // gradient_accumulation # actual batch size for dataloader
-
-clip_grad_norm = 1.0 # clip gradient with norm 1.0
-```
-Upper-case elements are what `train.py` needs, and lower-case elements are what Colossal-AI needs to initialize the training.
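As a concrete illustration of that split, here is a minimal sketch (not part of the original README) of how the `train.py` scripts removed in this diff consume such a config; it assumes a Colossal-AI environment launched via `torchrun`, and the config path is illustrative:
```
# Minimal sketch mirroring the train.py scripts in this diff; the config path is illustrative.
import colossalai
from colossalai.core import global_context as gpc

colossalai.launch_from_torch(config='configs/vit_vanilla.py')

# Lower-case entries (parallel, fp16, gradient_accumulation, clip_grad_norm) are consumed by
# colossalai.launch_from_torch / colossalai.initialize; upper-case entries are read explicitly:
lr = gpc.config.LEARNING_RATE
weight_decay = gpc.config.WEIGHT_DECAY
per_rank_batch_size = gpc.config.BATCH_SIZE // gpc.data_parallel_size  # dataloader batch size per rank
```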
- -## Usage - -To start training, use the following command to run each worker: -``` -$ DATA=/path/to/dataset python train.py --world_size=WORLD_SIZE \ - --rank=RANK \ - --local_rank=LOCAL_RANK \ - --host=MASTER_IP_ADDRESS \ - --port=MASTER_PORT \ - --config=CONFIG_FILE -``` -It is also recommended to start training with `torchrun` as: -``` -$ DATA=/path/to/dataset torchrun --nproc_per_node=NUM_GPUS_PER_NODE \ - --nnodes=NUM_NODES \ - --node_rank=NODE_RANK \ - --master_addr=MASTER_IP_ADDRESS \ - --master_port=MASTER_PORT \ - train.py --config=CONFIG_FILE -``` \ No newline at end of file diff --git a/benchmark/cifar/configs/vit_1d.py b/benchmark/cifar/configs/vit_1d.py deleted file mode 100644 index 1731abc1e..000000000 --- a/benchmark/cifar/configs/vit_1d.py +++ /dev/null @@ -1,18 +0,0 @@ -BATCH_SIZE = 512 -LEARNING_RATE = 2e-3 -WEIGHT_DECAY = 3e-2 - -TENSOR_PARALLEL_SIZE = 2 -TENSOR_PARALLEL_MODE = '1d' - -NUM_EPOCHS = 200 -WARMUP_EPOCHS = 40 - -parallel = dict( - pipeline=1, - tensor=dict(mode=TENSOR_PARALLEL_MODE, size=TENSOR_PARALLEL_SIZE), -) - -seed = 42 - -LOG_PATH = f"./vit_{TENSOR_PARALLEL_MODE}_cifar10_tp{TENSOR_PARALLEL_SIZE}_bs{BATCH_SIZE}_lr{LEARNING_RATE}/" diff --git a/benchmark/cifar/configs/vit_2d.py b/benchmark/cifar/configs/vit_2d.py deleted file mode 100644 index 88864cb6a..000000000 --- a/benchmark/cifar/configs/vit_2d.py +++ /dev/null @@ -1,18 +0,0 @@ -BATCH_SIZE = 512 -LEARNING_RATE = 2e-3 -WEIGHT_DECAY = 3e-2 - -TENSOR_PARALLEL_SIZE = 4 -TENSOR_PARALLEL_MODE = '2d' - -NUM_EPOCHS = 200 -WARMUP_EPOCHS = 40 - -parallel = dict( - pipeline=1, - tensor=dict(mode=TENSOR_PARALLEL_MODE, size=TENSOR_PARALLEL_SIZE), -) - -seed = 42 - -LOG_PATH = f"./vit_{TENSOR_PARALLEL_MODE}_cifar10_tp{TENSOR_PARALLEL_SIZE}_bs{BATCH_SIZE}_lr{LEARNING_RATE}/" diff --git a/benchmark/cifar/configs/vit_2p5d.py b/benchmark/cifar/configs/vit_2p5d.py deleted file mode 100644 index 4da546f14..000000000 --- a/benchmark/cifar/configs/vit_2p5d.py +++ /dev/null @@ -1,19 +0,0 @@ -BATCH_SIZE = 512 -LEARNING_RATE = 2e-3 -WEIGHT_DECAY = 3e-2 - -TENSOR_PARALLEL_SIZE = 4 -DEPTH = 1 -TENSOR_PARALLEL_MODE = '2.5d' - -NUM_EPOCHS = 200 -WARMUP_EPOCHS = 40 - -parallel = dict( - pipeline=1, - tensor=dict(mode=TENSOR_PARALLEL_MODE, size=TENSOR_PARALLEL_SIZE, depth=DEPTH), -) - -seed = 42 - -LOG_PATH = f"./vit_{TENSOR_PARALLEL_MODE}_cifar10_tp{TENSOR_PARALLEL_SIZE}_bs{BATCH_SIZE}_lr{LEARNING_RATE}/" diff --git a/benchmark/cifar/configs/vit_3d.py b/benchmark/cifar/configs/vit_3d.py deleted file mode 100644 index 9600f9b3a..000000000 --- a/benchmark/cifar/configs/vit_3d.py +++ /dev/null @@ -1,18 +0,0 @@ -BATCH_SIZE = 512 -LEARNING_RATE = 2e-3 -WEIGHT_DECAY = 3e-2 - -TENSOR_PARALLEL_SIZE = 8 -TENSOR_PARALLEL_MODE = '3d' - -NUM_EPOCHS = 200 -WARMUP_EPOCHS = 40 - -parallel = dict( - pipeline=1, - tensor=dict(mode=TENSOR_PARALLEL_MODE, size=TENSOR_PARALLEL_SIZE), -) - -seed = 42 - -LOG_PATH = f"./vit_{TENSOR_PARALLEL_MODE}_cifar10_tp{TENSOR_PARALLEL_SIZE}_bs{BATCH_SIZE}_lr{LEARNING_RATE}/" diff --git a/benchmark/cifar/configs/vit_vanilla.py b/benchmark/cifar/configs/vit_vanilla.py deleted file mode 100644 index 3d9193686..000000000 --- a/benchmark/cifar/configs/vit_vanilla.py +++ /dev/null @@ -1,18 +0,0 @@ -BATCH_SIZE = 512 -LEARNING_RATE = 2e-3 -WEIGHT_DECAY = 3e-2 - -TENSOR_PARALLEL_SIZE = 1 -TENSOR_PARALLEL_MODE = None - -NUM_EPOCHS = 200 -WARMUP_EPOCHS = 40 - -parallel = dict( - pipeline=1, - tensor=dict(mode=TENSOR_PARALLEL_MODE, size=TENSOR_PARALLEL_SIZE), -) - -seed = 42 - -LOG_PATH = 
f"./vit_{TENSOR_PARALLEL_MODE}_cifar10_tp{TENSOR_PARALLEL_SIZE}_bs{BATCH_SIZE}_lr{LEARNING_RATE}/" diff --git a/benchmark/cifar/train.py b/benchmark/cifar/train.py deleted file mode 100644 index 38cd6e29b..000000000 --- a/benchmark/cifar/train.py +++ /dev/null @@ -1,119 +0,0 @@ -#!/usr/bin/env python -# -*- encoding: utf-8 -*- - -import os - -import colossalai -import torch -import torchvision -from colossalai.builder import * -from colossalai.core import global_context as gpc -from colossalai.logging import get_dist_logger -from colossalai.nn import Accuracy, CrossEntropyLoss -from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR -from colossalai.trainer import Trainer, hooks -from colossalai.utils import MultiTimer, get_dataloader -from model_zoo.vit import vit_lite_depth7_patch4_32 -from torchvision import transforms - -DATASET_PATH = str(os.environ['DATA']) - - -def build_cifar(batch_size): - transform_train = transforms.Compose([ - transforms.RandomCrop(32, padding=4), - transforms.AutoAugment(policy=transforms.AutoAugmentPolicy.CIFAR10), - transforms.ToTensor(), - transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)), - ]) - transform_test = transforms.Compose([ - transforms.Resize(32), - transforms.ToTensor(), - transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)), - ]) - - train_dataset = torchvision.datasets.CIFAR10(root=DATASET_PATH, - train=True, - download=True, - transform=transform_train) - test_dataset = torchvision.datasets.CIFAR10(root=DATASET_PATH, train=False, transform=transform_test) - train_dataloader = get_dataloader(dataset=train_dataset, - shuffle=True, - batch_size=batch_size, - num_workers=4, - pin_memory=True) - test_dataloader = get_dataloader(dataset=test_dataset, batch_size=batch_size, num_workers=4, pin_memory=True) - return train_dataloader, test_dataloader - - -def train_cifar(): - args = colossalai.get_default_parser().parse_args() - # standard launch - # colossalai.launch(config=args.config, - # rank=args.rank, - # world_size=args.world_size, - # local_rank=args.local_rank, - # host=args.host, - # port=args.port) - - # launch from torchrun - colossalai.launch_from_torch(config=args.config) - - logger = get_dist_logger() - if hasattr(gpc.config, 'LOG_PATH'): - if gpc.get_global_rank() == 0: - log_path = gpc.config.LOG_PATH - if not os.path.exists(log_path): - os.mkdir(log_path) - logger.log_to_file(log_path) - - model = vit_lite_depth7_patch4_32() - - train_dataloader, test_dataloader = build_cifar(gpc.config.BATCH_SIZE // gpc.data_parallel_size) - - criterion = CrossEntropyLoss(label_smoothing=0.1) - - optimizer = torch.optim.AdamW(model.parameters(), lr=gpc.config.LEARNING_RATE, weight_decay=gpc.config.WEIGHT_DECAY) - - steps_per_epoch = len(train_dataloader) - - lr_scheduler = CosineAnnealingWarmupLR(optimizer=optimizer, - total_steps=gpc.config.NUM_EPOCHS * steps_per_epoch, - warmup_steps=gpc.config.WARMUP_EPOCHS * steps_per_epoch) - - engine, train_dataloader, test_dataloader, lr_scheduler = colossalai.initialize(model=model, - optimizer=optimizer, - criterion=criterion, - train_dataloader=train_dataloader, - test_dataloader=test_dataloader, - lr_scheduler=lr_scheduler) - - logger.info("Engine is built", ranks=[0]) - - timer = MultiTimer() - - trainer = Trainer(engine=engine, logger=logger, timer=timer) - logger.info("Trainer is built", ranks=[0]) - - hook_list = [ - hooks.LogMetricByEpochHook(logger=logger), - hooks.LogMetricByStepHook(), - # hooks.LogTimingByEpochHook(timer=timer, logger=logger), - # 
hooks.LogMemoryByEpochHook(logger=logger), - hooks.AccuracyHook(accuracy_func=Accuracy()), - hooks.LossHook(), - hooks.ThroughputHook(), - hooks.LRSchedulerHook(lr_scheduler=lr_scheduler, by_epoch=False) - ] - - logger.info("Train start", ranks=[0]) - trainer.fit(train_dataloader=train_dataloader, - test_dataloader=test_dataloader, - epochs=gpc.config.NUM_EPOCHS, - hooks=hook_list, - display_progress=True, - test_interval=1) - - -if __name__ == '__main__': - train_cifar() diff --git a/benchmark/gpt2/configs/gpt2_1d.py b/benchmark/gpt2/configs/gpt2_1d.py deleted file mode 100644 index f9a659b83..000000000 --- a/benchmark/gpt2/configs/gpt2_1d.py +++ /dev/null @@ -1,29 +0,0 @@ -from colossalai.amp import AMP_TYPE - -VOCAB_SIZE = 50304 -SEQ_LENGTH = 1024 - -TOTAL_BATCH_SIZE = 256 -LEARNING_RATE = 0.00015 -WEIGHT_DECAY = 1e-2 - -TENSOR_PARALLEL_SIZE = 2 -TENSOR_PARALLEL_MODE = '1d' - -NUM_EPOCHS = 60 -WARMUP_EPOCHS = int(NUM_EPOCHS * 0.36) - -parallel = dict( - pipeline=1, - tensor=dict(mode=TENSOR_PARALLEL_MODE, size=TENSOR_PARALLEL_SIZE), -) - -fp16 = dict(mode=AMP_TYPE.TORCH, ) - -gradient_accumulation = 2 - -BATCH_SIZE = TOTAL_BATCH_SIZE // gradient_accumulation - -clip_grad_norm = 1.0 - -LOG_PATH = f"./gpt2_{TENSOR_PARALLEL_MODE}_tp{TENSOR_PARALLEL_SIZE}_bs{BATCH_SIZE}_lr{LEARNING_RATE}_accum{gradient_accumulation}_clip_grad{clip_grad_norm}/" diff --git a/benchmark/gpt2/configs/gpt2_2d.py b/benchmark/gpt2/configs/gpt2_2d.py deleted file mode 100644 index 5abec73e6..000000000 --- a/benchmark/gpt2/configs/gpt2_2d.py +++ /dev/null @@ -1,29 +0,0 @@ -from colossalai.amp import AMP_TYPE - -VOCAB_SIZE = 50304 -SEQ_LENGTH = 1024 - -TOTAL_BATCH_SIZE = 256 -LEARNING_RATE = 0.00015 -WEIGHT_DECAY = 1e-2 - -TENSOR_PARALLEL_SIZE = 4 -TENSOR_PARALLEL_MODE = '2d' - -NUM_EPOCHS = 60 -WARMUP_EPOCHS = int(NUM_EPOCHS * 0.36) - -parallel = dict( - pipeline=1, - tensor=dict(mode=TENSOR_PARALLEL_MODE, size=TENSOR_PARALLEL_SIZE), -) - -fp16 = dict(mode=AMP_TYPE.TORCH, ) - -gradient_accumulation = 1 - -BATCH_SIZE = TOTAL_BATCH_SIZE // gradient_accumulation - -clip_grad_norm = 1.0 - -LOG_PATH = f"./gpt2_{TENSOR_PARALLEL_MODE}_tp{TENSOR_PARALLEL_SIZE}_bs{BATCH_SIZE}_lr{LEARNING_RATE}_accum{gradient_accumulation}_clip_grad{clip_grad_norm}/" diff --git a/benchmark/gpt2/configs/gpt2_2p5d.py b/benchmark/gpt2/configs/gpt2_2p5d.py deleted file mode 100644 index 33ea4411e..000000000 --- a/benchmark/gpt2/configs/gpt2_2p5d.py +++ /dev/null @@ -1,30 +0,0 @@ -from colossalai.amp import AMP_TYPE - -VOCAB_SIZE = 50304 -SEQ_LENGTH = 1024 - -TOTAL_BATCH_SIZE = 256 -LEARNING_RATE = 0.00015 -WEIGHT_DECAY = 1e-2 - -TENSOR_PARALLEL_SIZE = 4 -DEPTH = 1 -TENSOR_PARALLEL_MODE = '2.5d' - -NUM_EPOCHS = 60 -WARMUP_EPOCHS = int(NUM_EPOCHS * 0.36) - -parallel = dict( - pipeline=1, - tensor=dict(mode=TENSOR_PARALLEL_MODE, size=TENSOR_PARALLEL_SIZE, depth=DEPTH), -) - -fp16 = dict(mode=AMP_TYPE.TORCH, ) - -gradient_accumulation = 1 - -BATCH_SIZE = TOTAL_BATCH_SIZE // gradient_accumulation - -clip_grad_norm = 1.0 - -LOG_PATH = f"./gpt2_{TENSOR_PARALLEL_MODE}_tp{TENSOR_PARALLEL_SIZE}_bs{BATCH_SIZE}_lr{LEARNING_RATE}_accum{gradient_accumulation}_clip_grad{clip_grad_norm}/" diff --git a/benchmark/gpt2/configs/gpt2_3d.py b/benchmark/gpt2/configs/gpt2_3d.py deleted file mode 100644 index 9f8728d29..000000000 --- a/benchmark/gpt2/configs/gpt2_3d.py +++ /dev/null @@ -1,29 +0,0 @@ -from colossalai.amp import AMP_TYPE - -VOCAB_SIZE = 50304 -SEQ_LENGTH = 1024 - -TOTAL_BATCH_SIZE = 256 -LEARNING_RATE = 0.00015 -WEIGHT_DECAY = 1e-2 - -TENSOR_PARALLEL_SIZE = 
8 -TENSOR_PARALLEL_MODE = '3d' - -NUM_EPOCHS = 60 -WARMUP_EPOCHS = int(NUM_EPOCHS * 0.36) - -parallel = dict( - pipeline=1, - tensor=dict(mode=TENSOR_PARALLEL_MODE, size=TENSOR_PARALLEL_SIZE), -) - -fp16 = dict(mode=AMP_TYPE.TORCH, ) - -gradient_accumulation = 1 - -BATCH_SIZE = TOTAL_BATCH_SIZE // gradient_accumulation - -clip_grad_norm = 1.0 - -LOG_PATH = f"./gpt2_{TENSOR_PARALLEL_MODE}_tp{TENSOR_PARALLEL_SIZE}_bs{BATCH_SIZE}_lr{LEARNING_RATE}_accum{gradient_accumulation}_clip_grad{clip_grad_norm}/" diff --git a/benchmark/gpt2/configs/gpt2_vanilla.py b/benchmark/gpt2/configs/gpt2_vanilla.py deleted file mode 100644 index b450cd048..000000000 --- a/benchmark/gpt2/configs/gpt2_vanilla.py +++ /dev/null @@ -1,29 +0,0 @@ -from colossalai.amp import AMP_TYPE - -VOCAB_SIZE = 50304 -SEQ_LENGTH = 1024 - -TOTAL_BATCH_SIZE = 256 -LEARNING_RATE = 0.00015 -WEIGHT_DECAY = 1e-2 - -TENSOR_PARALLEL_SIZE = 1 -TENSOR_PARALLEL_MODE = None - -NUM_EPOCHS = 60 -WARMUP_EPOCHS = int(NUM_EPOCHS * 0.36) - -parallel = dict( - pipeline=1, - tensor=dict(mode=TENSOR_PARALLEL_MODE, size=TENSOR_PARALLEL_SIZE), -) - -fp16 = dict(mode=AMP_TYPE.TORCH, ) - -gradient_accumulation = 1 - -BATCH_SIZE = TOTAL_BATCH_SIZE // gradient_accumulation - -clip_grad_norm = 1.0 - -LOG_PATH = f"./gpt2_{TENSOR_PARALLEL_MODE}_tp{TENSOR_PARALLEL_SIZE}_bs{BATCH_SIZE}_lr{LEARNING_RATE}_accum{gradient_accumulation}_clip_grad{clip_grad_norm}/" diff --git a/benchmark/gpt2/data.py b/benchmark/gpt2/data.py deleted file mode 100644 index d6fdfba78..000000000 --- a/benchmark/gpt2/data.py +++ /dev/null @@ -1,37 +0,0 @@ -import json -import os - -import torch -from colossalai.registry import DATASETS -from torch.utils.data import Dataset -from transformers import GPT2Tokenizer - - -@DATASETS.register_module -class WebtextDataset(Dataset): - def __init__(self, path, seq_len=1024) -> None: - super().__init__() - root = os.path.dirname(path) - encoded_data_cache_path = os.path.join(root, f'gpt_webtext_{seq_len}.pt') - if os.path.isfile(encoded_data_cache_path): - seq_len_, data, attention_mask = torch.load(encoded_data_cache_path) - if seq_len_ == seq_len: - self.data = data - self.attention_mask = attention_mask - return - raw_data = [] - with open(path) as f: - for line in f.readlines(): - raw_data.append(json.loads(line)['text']) - tokenizer = GPT2Tokenizer.from_pretrained('gpt2') - tokenizer.pad_token = tokenizer.unk_token - encoded_data = tokenizer(raw_data, padding=True, truncation=True, max_length=seq_len, return_tensors='pt') - self.data = encoded_data['input_ids'] - self.attention_mask = encoded_data['attention_mask'] - torch.save((seq_len, self.data, self.attention_mask), encoded_data_cache_path) - - def __len__(self): - return len(self.data) - - def __getitem__(self, index): - return (self.data[index], self.attention_mask[index]), self.data[index] diff --git a/benchmark/gpt2/train.py b/benchmark/gpt2/train.py deleted file mode 100644 index 664a5a206..000000000 --- a/benchmark/gpt2/train.py +++ /dev/null @@ -1,105 +0,0 @@ -import contextlib -import os - -import colossalai -import torch -from colossalai.core import global_context as gpc -from colossalai.engine.schedule import (InterleavedPipelineSchedule, PipelineSchedule) -from colossalai.logging import get_dist_logger -from colossalai.nn import CosineAnnealingWarmupLR -from colossalai.trainer import Trainer, hooks -from colossalai.utils import MultiTimer, get_dataloader -from colossalai.zero import zero3_model_context -from model_zoo.gpt import GPTLMLoss, gpt2_small, gpt2_medium, gpt2_large, 
gpt2_xl - -from data import WebtextDataset - - -def train_gpt(): - args = colossalai.get_default_parser().parse_args() - # standard launch - # colossalai.launch(config=args.config, - # rank=args.rank, - # world_size=args.world_size, - # local_rank=args.local_rank, - # host=args.host, - # port=args.port) - - # launch from torchrun - colossalai.launch_from_torch(config=args.config) - - logger = get_dist_logger() - if hasattr(gpc.config, 'LOG_PATH'): - if gpc.get_global_rank() == 0: - log_path = gpc.config.LOG_PATH - if not os.path.exists(log_path): - os.mkdir(log_path) - logger.log_to_file(log_path) - - train_dataset = WebtextDataset(os.environ['DATA'], seq_len=gpc.config.SEQ_LENGTH) - train_dataloader = get_dataloader(train_dataset, - seed=42, - batch_size=gpc.config.BATCH_SIZE // gpc.data_parallel_size, - pin_memory=True, - shuffle=True, - drop_last=True) - logger.info(f'Loaded {len(train_dataset)}/{len(train_dataloader)} samples/batches', ranks=[0]) - - # zero3 under test - # use_zero3 = hasattr(gpc.config, 'zero') and gpc.config.zero.level == 3 - # cm = zero3_model_context() if use_zero3 else contextlib.nullcontext() - # with cm: - # model = gpc.config.model.pop('type')(**gpc.config.model) - - model = gpt2_medium(vocab_size=gpc.config.VOCAB_SIZE, - max_position_embeddings=gpc.config.SEQ_LENGTH, - checkpoint=True) - - criterion = GPTLMLoss() - - optimizer = torch.optim.Adam(model.parameters(), lr=0.00015, weight_decay=1e-2) - - steps_per_epoch = len(train_dataloader) // gpc.config.gradient_accumulation - - lr_scheduler = CosineAnnealingWarmupLR(optimizer=optimizer, - total_steps=gpc.config.NUM_EPOCHS * steps_per_epoch, - warmup_steps=gpc.config.WARMUP_EPOCHS * steps_per_epoch, - eta_min=1e-5) - - engine, train_dataloader, _, lr_scheduler = colossalai.initialize(model=model, - optimizer=optimizer, - criterion=criterion, - train_dataloader=train_dataloader, - lr_scheduler=lr_scheduler) - - # pipeline under test - # num_model_chunks = getattr(gpc.config.model, 'num_chunks', 1) - # if num_model_chunks > 1: - # logger.info('Build InterleavedPipelineSchedule', ranks=[0]) - # schedule = InterleavedPipelineSchedule(gpc.config.NUM_MICRO_BATCHES, num_model_chunks) - # else: - # logger.info('Build PipelineSchedule', ranks=[0]) - # schedule = PipelineSchedule(gpc.config.NUM_MICRO_BATCHES) - - timer = MultiTimer() - - trainer = Trainer(engine=engine, logger=logger, timer=timer) - - hook_list = [ - hooks.LogMetricByEpochHook(logger=logger), - hooks.LogMetricByStepHook(), - hooks.LossHook(), - hooks.ThroughputHook(), - hooks.LRSchedulerHook(lr_scheduler=lr_scheduler, by_epoch=False), - # hooks.TensorboardHook(log_dir='./tb_logs', ranks=[0]), - # hooks.LogMemoryByEpochHook(logger), - # hooks.LogTimingByEpochHook(timer, logger), - # hooks.SaveCheckpointHook(checkpoint_dir='./ckpt') - ] - - logger.info("Training start", ranks=[0]) - trainer.fit(train_dataloader=train_dataloader, epochs=gpc.config.NUM_EPOCHS, hooks=hook_list, display_progress=True) - - -if __name__ == '__main__': - train_gpt() diff --git a/benchmark/imagenet100/configs/vit_1d.py b/benchmark/imagenet100/configs/vit_1d.py deleted file mode 100644 index bd90e1e84..000000000 --- a/benchmark/imagenet100/configs/vit_1d.py +++ /dev/null @@ -1,26 +0,0 @@ -from colossalai.amp import AMP_TYPE - -TOTAL_BATCH_SIZE = 4096 -LEARNING_RATE = 3e-3 -WEIGHT_DECAY = 0.3 - -TENSOR_PARALLEL_SIZE = 2 -TENSOR_PARALLEL_MODE = '1d' - -NUM_EPOCHS = 300 -WARMUP_EPOCHS = 32 - -parallel = dict( - pipeline=1, - tensor=dict(mode=TENSOR_PARALLEL_MODE, 
size=TENSOR_PARALLEL_SIZE), -) - -fp16 = dict(mode=AMP_TYPE.TORCH, ) - -gradient_accumulation = 2 - -BATCH_SIZE = TOTAL_BATCH_SIZE // gradient_accumulation - -clip_grad_norm = 1.0 - -LOG_PATH = f"./vit_{TENSOR_PARALLEL_MODE}_imagenet100_tp{TENSOR_PARALLEL_SIZE}_bs{BATCH_SIZE}_lr{LEARNING_RATE}_{fp16['mode']}_clip_grad{clip_grad_norm}/" diff --git a/benchmark/imagenet100/configs/vit_2d.py b/benchmark/imagenet100/configs/vit_2d.py deleted file mode 100644 index e80fb15eb..000000000 --- a/benchmark/imagenet100/configs/vit_2d.py +++ /dev/null @@ -1,26 +0,0 @@ -from colossalai.amp import AMP_TYPE - -TOTAL_BATCH_SIZE = 4096 -LEARNING_RATE = 3e-3 -WEIGHT_DECAY = 0.3 - -TENSOR_PARALLEL_SIZE = 4 -TENSOR_PARALLEL_MODE = '2d' - -NUM_EPOCHS = 300 -WARMUP_EPOCHS = 32 - -parallel = dict( - pipeline=1, - tensor=dict(mode=TENSOR_PARALLEL_MODE, size=TENSOR_PARALLEL_SIZE), -) - -fp16 = dict(mode=AMP_TYPE.TORCH, ) - -gradient_accumulation = 2 - -BATCH_SIZE = TOTAL_BATCH_SIZE // gradient_accumulation - -clip_grad_norm = 1.0 - -LOG_PATH = f"./vit_{TENSOR_PARALLEL_MODE}_imagenet100_tp{TENSOR_PARALLEL_SIZE}_bs{BATCH_SIZE}_lr{LEARNING_RATE}_{fp16['mode']}_clip_grad{clip_grad_norm}/" diff --git a/benchmark/imagenet100/configs/vit_2p5d.py b/benchmark/imagenet100/configs/vit_2p5d.py deleted file mode 100644 index 5e0cf179e..000000000 --- a/benchmark/imagenet100/configs/vit_2p5d.py +++ /dev/null @@ -1,27 +0,0 @@ -from colossalai.amp import AMP_TYPE - -TOTAL_BATCH_SIZE = 4096 -LEARNING_RATE = 3e-3 -WEIGHT_DECAY = 0.3 - -TENSOR_PARALLEL_SIZE = 4 -DEPTH = 1 -TENSOR_PARALLEL_MODE = '2.5d' - -NUM_EPOCHS = 300 -WARMUP_EPOCHS = 32 - -parallel = dict( - pipeline=1, - tensor=dict(mode=TENSOR_PARALLEL_MODE, size=TENSOR_PARALLEL_SIZE, depth=DEPTH), -) - -fp16 = dict(mode=AMP_TYPE.TORCH, ) - -gradient_accumulation = 2 - -BATCH_SIZE = TOTAL_BATCH_SIZE // gradient_accumulation - -clip_grad_norm = 1.0 - -LOG_PATH = f"./vit_{TENSOR_PARALLEL_MODE}_imagenet100_tp{TENSOR_PARALLEL_SIZE}_bs{BATCH_SIZE}_lr{LEARNING_RATE}_{fp16['mode']}_clip_grad{clip_grad_norm}/" diff --git a/benchmark/imagenet100/configs/vit_3d.py b/benchmark/imagenet100/configs/vit_3d.py deleted file mode 100644 index ae2145ce6..000000000 --- a/benchmark/imagenet100/configs/vit_3d.py +++ /dev/null @@ -1,26 +0,0 @@ -from colossalai.amp import AMP_TYPE - -TOTAL_BATCH_SIZE = 4096 -LEARNING_RATE = 3e-3 -WEIGHT_DECAY = 0.3 - -TENSOR_PARALLEL_SIZE = 8 -TENSOR_PARALLEL_MODE = '3d' - -NUM_EPOCHS = 300 -WARMUP_EPOCHS = 32 - -parallel = dict( - pipeline=1, - tensor=dict(mode=TENSOR_PARALLEL_MODE, size=TENSOR_PARALLEL_SIZE), -) - -fp16 = dict(mode=AMP_TYPE.TORCH, ) - -gradient_accumulation = 2 - -BATCH_SIZE = TOTAL_BATCH_SIZE // gradient_accumulation - -clip_grad_norm = 1.0 - -LOG_PATH = f"./vit_{TENSOR_PARALLEL_MODE}_imagenet100_tp{TENSOR_PARALLEL_SIZE}_bs{BATCH_SIZE}_lr{LEARNING_RATE}_{fp16['mode']}_clip_grad{clip_grad_norm}/" diff --git a/benchmark/imagenet100/configs/vit_vanilla.py b/benchmark/imagenet100/configs/vit_vanilla.py deleted file mode 100644 index 130f3689c..000000000 --- a/benchmark/imagenet100/configs/vit_vanilla.py +++ /dev/null @@ -1,26 +0,0 @@ -from colossalai.amp import AMP_TYPE - -TOTAL_BATCH_SIZE = 4096 -LEARNING_RATE = 3e-3 -WEIGHT_DECAY = 0.3 - -TENSOR_PARALLEL_SIZE = 1 -TENSOR_PARALLEL_MODE = None - -NUM_EPOCHS = 300 -WARMUP_EPOCHS = 32 - -parallel = dict( - pipeline=1, - tensor=dict(mode=TENSOR_PARALLEL_MODE, size=TENSOR_PARALLEL_SIZE), -) - -fp16 = dict(mode=AMP_TYPE.TORCH, ) - -gradient_accumulation = 2 - -BATCH_SIZE = TOTAL_BATCH_SIZE // 
gradient_accumulation - -clip_grad_norm = 1.0 - -LOG_PATH = f"./vit_{TENSOR_PARALLEL_MODE}_imagenet100_tp{TENSOR_PARALLEL_SIZE}_bs{BATCH_SIZE}_lr{LEARNING_RATE}_{fp16['mode']}_clip_grad{clip_grad_norm}/" diff --git a/benchmark/imagenet100/train.py b/benchmark/imagenet100/train.py deleted file mode 100644 index af06ec452..000000000 --- a/benchmark/imagenet100/train.py +++ /dev/null @@ -1,207 +0,0 @@ -#!/usr/bin/env python -# -*- encoding: utf-8 -*- - -import glob -import os - -import colossalai -import nvidia.dali.fn as fn -import nvidia.dali.tfrecord as tfrec -import torch -from colossalai.builder import * -from colossalai.context import ParallelMode -from colossalai.core import global_context as gpc -from colossalai.logging import get_dist_logger -from colossalai.nn import Accuracy, CrossEntropyLoss -from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR -from colossalai.trainer import Trainer, hooks -from colossalai.utils import MultiTimer -from model_zoo.vit import vit_small_patch16_224 -from nvidia.dali import types -from nvidia.dali.pipeline import Pipeline -from nvidia.dali.plugin.pytorch import DALIClassificationIterator - -DATASET_PATH = str(os.environ['DATA']) - -TRAIN_RECS = DATASET_PATH + '/train/*' -VAL_RECS = DATASET_PATH + '/validation/*' -TRAIN_IDX = DATASET_PATH + '/idx_files/train/*' -VAL_IDX = DATASET_PATH + '/idx_files/validation/*' - - -class DaliDataloader(DALIClassificationIterator): - def __init__(self, - tfrec_filenames, - tfrec_idx_filenames, - shard_id=0, - num_shards=1, - batch_size=128, - num_threads=4, - resize=256, - crop=224, - prefetch=2, - training=True, - gpu_aug=False, - cuda=True): - pipe = Pipeline(batch_size=batch_size, - num_threads=num_threads, - device_id=torch.cuda.current_device() if cuda else None, - seed=1024) - with pipe: - inputs = fn.readers.tfrecord(path=tfrec_filenames, - index_path=tfrec_idx_filenames, - random_shuffle=training, - shard_id=shard_id, - num_shards=num_shards, - initial_fill=10000, - read_ahead=True, - prefetch_queue_depth=prefetch, - name='Reader', - features={ - 'image/encoded': tfrec.FixedLenFeature((), tfrec.string, ""), - 'image/class/label': tfrec.FixedLenFeature([1], tfrec.int64, -1), - }) - images = inputs["image/encoded"] - - if training: - images = fn.decoders.image(images, device='mixed' if gpu_aug else 'cpu', output_type=types.RGB) - images = fn.random_resized_crop(images, size=crop, device='gpu' if gpu_aug else 'cpu') - flip_lr = fn.random.coin_flip(probability=0.5) - else: - # decode jpeg and resize - images = fn.decoders.image(images, device='mixed' if gpu_aug else 'cpu', output_type=types.RGB) - images = fn.resize(images, - device='gpu' if gpu_aug else 'cpu', - resize_x=resize, - resize_y=resize, - dtype=types.FLOAT, - interp_type=types.INTERP_TRIANGULAR) - flip_lr = False - - # center crop and normalise - images = fn.crop_mirror_normalize(images, - dtype=types.FLOAT, - crop=(crop, crop), - mean=[127.5], - std=[127.5], - mirror=flip_lr) - label = inputs["image/class/label"] - 1 # 0-999 - # LSG: element_extract will raise exception, let's flatten outside - # label = fn.element_extract(label, element_map=0) # Flatten - if cuda: # transfer data to gpu - pipe.set_outputs(images.gpu(), label.gpu()) - else: - pipe.set_outputs(images, label) - - pipe.build() - last_batch_policy = 'DROP' if training else 'PARTIAL' - super().__init__(pipe, reader_name="Reader", auto_reset=True, last_batch_policy=last_batch_policy) - - def __iter__(self): - # if not reset (after an epoch), reset; if just initialize, ignore - if 
self._counter >= self._size or self._size < 0: - self.reset() - return self - - def __next__(self): - data = super().__next__() - img, label = data[0]['data'], data[0]['label'] - label = label.squeeze() - return (img, ), (label, ) - - -def build_dali_train(batch_size): - return DaliDataloader( - sorted(glob.glob(TRAIN_RECS)), - sorted(glob.glob(TRAIN_IDX)), - batch_size=batch_size, - shard_id=gpc.get_local_rank(ParallelMode.DATA), - num_shards=gpc.get_world_size(ParallelMode.DATA), - training=True, - gpu_aug=True, - cuda=True, - ) - - -def build_dali_test(batch_size): - return DaliDataloader( - sorted(glob.glob(VAL_RECS)), - sorted(glob.glob(VAL_IDX)), - batch_size=batch_size, - shard_id=gpc.get_local_rank(ParallelMode.DATA), - num_shards=gpc.get_world_size(ParallelMode.DATA), - training=False, - gpu_aug=True, - cuda=True, - ) - - -def train_imagenet(): - args = colossalai.get_default_parser().parse_args() - # standard launch - # colossalai.launch(config=args.config, - # rank=args.rank, - # world_size=args.world_size, - # local_rank=args.local_rank, - # host=args.host, - # port=args.port) - - # launch from torchrun - colossalai.launch_from_torch(config=args.config) - - logger = get_dist_logger() - if hasattr(gpc.config, 'LOG_PATH'): - if gpc.get_global_rank() == 0: - log_path = gpc.config.LOG_PATH - if not os.path.exists(log_path): - os.mkdir(log_path) - logger.log_to_file(log_path) - - model = vit_small_patch16_224(num_classes=100, init_method='jax') - - train_dataloader = build_dali_train(gpc.config.BATCH_SIZE // gpc.data_parallel_size) - test_dataloader = build_dali_test(gpc.config.BATCH_SIZE // gpc.data_parallel_size) - - criterion = CrossEntropyLoss(label_smoothing=0.1) - - optimizer = torch.optim.AdamW(model.parameters(), lr=gpc.config.LEARNING_RATE, weight_decay=gpc.config.WEIGHT_DECAY) - - lr_scheduler = CosineAnnealingWarmupLR(optimizer=optimizer, - total_steps=gpc.config.NUM_EPOCHS, - warmup_steps=gpc.config.WARMUP_EPOCHS) - - engine, train_dataloader, test_dataloader, _ = colossalai.initialize(model=model, - optimizer=optimizer, - criterion=criterion, - train_dataloader=train_dataloader, - test_dataloader=test_dataloader) - - logger.info("Engine is built", ranks=[0]) - - timer = MultiTimer() - - trainer = Trainer(engine=engine, logger=logger, timer=timer) - logger.info("Trainer is built", ranks=[0]) - - hook_list = [ - hooks.LogMetricByEpochHook(logger=logger), - hooks.LogMetricByStepHook(), - # hooks.LogTimingByEpochHook(timer=timer, logger=logger), - # hooks.LogMemoryByEpochHook(logger=logger), - hooks.AccuracyHook(accuracy_func=Accuracy()), - hooks.LossHook(), - hooks.ThroughputHook(), - hooks.LRSchedulerHook(lr_scheduler=lr_scheduler, by_epoch=True) - ] - - logger.info("Train start", ranks=[0]) - trainer.fit(train_dataloader=train_dataloader, - test_dataloader=test_dataloader, - epochs=gpc.config.NUM_EPOCHS, - hooks=hook_list, - display_progress=True, - test_interval=1) - - -if __name__ == '__main__': - train_imagenet() diff --git a/benchmark/imagenet1k/configs/vit_1d.py b/benchmark/imagenet1k/configs/vit_1d.py deleted file mode 100644 index d447d10b1..000000000 --- a/benchmark/imagenet1k/configs/vit_1d.py +++ /dev/null @@ -1,26 +0,0 @@ -from colossalai.amp import AMP_TYPE - -TOTAL_BATCH_SIZE = 4096 -LEARNING_RATE = 3e-3 -WEIGHT_DECAY = 0.3 - -TENSOR_PARALLEL_SIZE = 2 -TENSOR_PARALLEL_MODE = '1d' - -NUM_EPOCHS = 300 -WARMUP_EPOCHS = 32 - -parallel = dict( - pipeline=1, - tensor=dict(mode=TENSOR_PARALLEL_MODE, size=TENSOR_PARALLEL_SIZE), -) - -fp16 = dict(mode=AMP_TYPE.TORCH, 
) - -gradient_accumulation = 2 - -BATCH_SIZE = TOTAL_BATCH_SIZE // gradient_accumulation - -clip_grad_norm = 1.0 - -LOG_PATH = f"./vit_{TENSOR_PARALLEL_MODE}_imagenet1k_tp{TENSOR_PARALLEL_SIZE}_bs{BATCH_SIZE}_lr{LEARNING_RATE}_{fp16['mode']}_clip_grad{clip_grad_norm}/" diff --git a/benchmark/imagenet1k/configs/vit_2d.py b/benchmark/imagenet1k/configs/vit_2d.py deleted file mode 100644 index 19144973b..000000000 --- a/benchmark/imagenet1k/configs/vit_2d.py +++ /dev/null @@ -1,26 +0,0 @@ -from colossalai.amp import AMP_TYPE - -TOTAL_BATCH_SIZE = 4096 -LEARNING_RATE = 3e-3 -WEIGHT_DECAY = 0.3 - -TENSOR_PARALLEL_SIZE = 4 -TENSOR_PARALLEL_MODE = '2d' - -NUM_EPOCHS = 300 -WARMUP_EPOCHS = 32 - -parallel = dict( - pipeline=1, - tensor=dict(mode=TENSOR_PARALLEL_MODE, size=TENSOR_PARALLEL_SIZE), -) - -fp16 = dict(mode=AMP_TYPE.TORCH, ) - -gradient_accumulation = 2 - -BATCH_SIZE = TOTAL_BATCH_SIZE // gradient_accumulation - -clip_grad_norm = 1.0 - -LOG_PATH = f"./vit_{TENSOR_PARALLEL_MODE}_imagenet1k_tp{TENSOR_PARALLEL_SIZE}_bs{BATCH_SIZE}_lr{LEARNING_RATE}_{fp16['mode']}_clip_grad{clip_grad_norm}/" diff --git a/benchmark/imagenet1k/configs/vit_2p5d.py b/benchmark/imagenet1k/configs/vit_2p5d.py deleted file mode 100644 index fc06ce9b6..000000000 --- a/benchmark/imagenet1k/configs/vit_2p5d.py +++ /dev/null @@ -1,27 +0,0 @@ -from colossalai.amp import AMP_TYPE - -TOTAL_BATCH_SIZE = 4096 -LEARNING_RATE = 3e-3 -WEIGHT_DECAY = 0.3 - -TENSOR_PARALLEL_SIZE = 4 -DEPTH = 1 -TENSOR_PARALLEL_MODE = '2.5d' - -NUM_EPOCHS = 300 -WARMUP_EPOCHS = 32 - -parallel = dict( - pipeline=1, - tensor=dict(mode=TENSOR_PARALLEL_MODE, size=TENSOR_PARALLEL_SIZE, depth=DEPTH), -) - -fp16 = dict(mode=AMP_TYPE.TORCH, ) - -gradient_accumulation = 2 - -BATCH_SIZE = TOTAL_BATCH_SIZE // gradient_accumulation - -clip_grad_norm = 1.0 - -LOG_PATH = f"./vit_{TENSOR_PARALLEL_MODE}_imagenet1k_tp{TENSOR_PARALLEL_SIZE}_bs{BATCH_SIZE}_lr{LEARNING_RATE}_{fp16['mode']}_clip_grad{clip_grad_norm}/" diff --git a/benchmark/imagenet1k/configs/vit_3d.py b/benchmark/imagenet1k/configs/vit_3d.py deleted file mode 100644 index b2fcb86a6..000000000 --- a/benchmark/imagenet1k/configs/vit_3d.py +++ /dev/null @@ -1,26 +0,0 @@ -from colossalai.amp import AMP_TYPE - -TOTAL_BATCH_SIZE = 4096 -LEARNING_RATE = 3e-3 -WEIGHT_DECAY = 0.3 - -TENSOR_PARALLEL_SIZE = 8 -TENSOR_PARALLEL_MODE = '3d' - -NUM_EPOCHS = 300 -WARMUP_EPOCHS = 32 - -parallel = dict( - pipeline=1, - tensor=dict(mode=TENSOR_PARALLEL_MODE, size=TENSOR_PARALLEL_SIZE), -) - -fp16 = dict(mode=AMP_TYPE.TORCH, ) - -gradient_accumulation = 2 - -BATCH_SIZE = TOTAL_BATCH_SIZE // gradient_accumulation - -clip_grad_norm = 1.0 - -LOG_PATH = f"./vit_{TENSOR_PARALLEL_MODE}_imagenet1k_tp{TENSOR_PARALLEL_SIZE}_bs{BATCH_SIZE}_lr{LEARNING_RATE}_{fp16['mode']}_clip_grad{clip_grad_norm}/" diff --git a/benchmark/imagenet1k/configs/vit_vanilla.py b/benchmark/imagenet1k/configs/vit_vanilla.py deleted file mode 100644 index 888b8d568..000000000 --- a/benchmark/imagenet1k/configs/vit_vanilla.py +++ /dev/null @@ -1,26 +0,0 @@ -from colossalai.amp import AMP_TYPE - -TOTAL_BATCH_SIZE = 4096 -LEARNING_RATE = 3e-3 -WEIGHT_DECAY = 0.3 - -TENSOR_PARALLEL_SIZE = 1 -TENSOR_PARALLEL_MODE = None - -NUM_EPOCHS = 300 -WARMUP_EPOCHS = 32 - -parallel = dict( - pipeline=1, - tensor=dict(mode=TENSOR_PARALLEL_MODE, size=TENSOR_PARALLEL_SIZE), -) - -fp16 = dict(mode=AMP_TYPE.TORCH, ) - -gradient_accumulation = 2 - -BATCH_SIZE = TOTAL_BATCH_SIZE // gradient_accumulation - -clip_grad_norm = 1.0 - -LOG_PATH = 
f"./vit_{TENSOR_PARALLEL_MODE}_imagenet1k_tp{TENSOR_PARALLEL_SIZE}_bs{BATCH_SIZE}_lr{LEARNING_RATE}_{fp16['mode']}_clip_grad{clip_grad_norm}/" diff --git a/benchmark/imagenet1k/train.py b/benchmark/imagenet1k/train.py deleted file mode 100644 index 4a77280df..000000000 --- a/benchmark/imagenet1k/train.py +++ /dev/null @@ -1,207 +0,0 @@ -#!/usr/bin/env python -# -*- encoding: utf-8 -*- - -import glob -import os - -import colossalai -import nvidia.dali.fn as fn -import nvidia.dali.tfrecord as tfrec -import torch -from colossalai.builder import * -from colossalai.context import ParallelMode -from colossalai.core import global_context as gpc -from colossalai.logging import get_dist_logger -from colossalai.nn import Accuracy, CrossEntropyLoss -from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR -from colossalai.trainer import Trainer, hooks -from colossalai.utils import MultiTimer -from model_zoo.vit import vit_small_patch16_224 -from nvidia.dali import types -from nvidia.dali.pipeline import Pipeline -from nvidia.dali.plugin.pytorch import DALIClassificationIterator - -DATASET_PATH = str(os.environ['DATA']) - -TRAIN_RECS = DATASET_PATH + '/train/*' -VAL_RECS = DATASET_PATH + '/validation/*' -TRAIN_IDX = DATASET_PATH + '/idx_files/train/*' -VAL_IDX = DATASET_PATH + '/idx_files/validation/*' - - -class DaliDataloader(DALIClassificationIterator): - def __init__(self, - tfrec_filenames, - tfrec_idx_filenames, - shard_id=0, - num_shards=1, - batch_size=128, - num_threads=4, - resize=256, - crop=224, - prefetch=2, - training=True, - gpu_aug=False, - cuda=True): - pipe = Pipeline(batch_size=batch_size, - num_threads=num_threads, - device_id=torch.cuda.current_device() if cuda else None, - seed=1024) - with pipe: - inputs = fn.readers.tfrecord(path=tfrec_filenames, - index_path=tfrec_idx_filenames, - random_shuffle=training, - shard_id=shard_id, - num_shards=num_shards, - initial_fill=10000, - read_ahead=True, - prefetch_queue_depth=prefetch, - name='Reader', - features={ - 'image/encoded': tfrec.FixedLenFeature((), tfrec.string, ""), - 'image/class/label': tfrec.FixedLenFeature([1], tfrec.int64, -1), - }) - images = inputs["image/encoded"] - - if training: - images = fn.decoders.image(images, device='mixed' if gpu_aug else 'cpu', output_type=types.RGB) - images = fn.random_resized_crop(images, size=crop, device='gpu' if gpu_aug else 'cpu') - flip_lr = fn.random.coin_flip(probability=0.5) - else: - # decode jpeg and resize - images = fn.decoders.image(images, device='mixed' if gpu_aug else 'cpu', output_type=types.RGB) - images = fn.resize(images, - device='gpu' if gpu_aug else 'cpu', - resize_x=resize, - resize_y=resize, - dtype=types.FLOAT, - interp_type=types.INTERP_TRIANGULAR) - flip_lr = False - - # center crop and normalise - images = fn.crop_mirror_normalize(images, - dtype=types.FLOAT, - crop=(crop, crop), - mean=[127.5], - std=[127.5], - mirror=flip_lr) - label = inputs["image/class/label"] - 1 # 0-999 - # LSG: element_extract will raise exception, let's flatten outside - # label = fn.element_extract(label, element_map=0) # Flatten - if cuda: # transfer data to gpu - pipe.set_outputs(images.gpu(), label.gpu()) - else: - pipe.set_outputs(images, label) - - pipe.build() - last_batch_policy = 'DROP' if training else 'PARTIAL' - super().__init__(pipe, reader_name="Reader", auto_reset=True, last_batch_policy=last_batch_policy) - - def __iter__(self): - # if not reset (after an epoch), reset; if just initialize, ignore - if self._counter >= self._size or self._size < 0: - self.reset() - 
return self - - def __next__(self): - data = super().__next__() - img, label = data[0]['data'], data[0]['label'] - label = label.squeeze() - return (img, ), (label, ) - - -def build_dali_train(batch_size): - return DaliDataloader( - sorted(glob.glob(TRAIN_RECS)), - sorted(glob.glob(TRAIN_IDX)), - batch_size=batch_size, - shard_id=gpc.get_local_rank(ParallelMode.DATA), - num_shards=gpc.get_world_size(ParallelMode.DATA), - training=True, - gpu_aug=True, - cuda=True, - ) - - -def build_dali_test(batch_size): - return DaliDataloader( - sorted(glob.glob(VAL_RECS)), - sorted(glob.glob(VAL_IDX)), - batch_size=batch_size, - shard_id=gpc.get_local_rank(ParallelMode.DATA), - num_shards=gpc.get_world_size(ParallelMode.DATA), - training=False, - gpu_aug=True, - cuda=True, - ) - - -def train_imagenet(): - args = colossalai.get_default_parser().parse_args() - # standard launch - # colossalai.launch(config=args.config, - # rank=args.rank, - # world_size=args.world_size, - # local_rank=args.local_rank, - # host=args.host, - # port=args.port) - - # launch from torchrun - colossalai.launch_from_torch(config=args.config) - - logger = get_dist_logger() - if hasattr(gpc.config, 'LOG_PATH'): - if gpc.get_global_rank() == 0: - log_path = gpc.config.LOG_PATH - if not os.path.exists(log_path): - os.mkdir(log_path) - logger.log_to_file(log_path) - - model = vit_small_patch16_224(num_classes=1000, init_method='jax') - - train_dataloader = build_dali_train(gpc.config.BATCH_SIZE // gpc.data_parallel_size) - test_dataloader = build_dali_test(gpc.config.BATCH_SIZE // gpc.data_parallel_size) - - criterion = CrossEntropyLoss(label_smoothing=0.1) - - optimizer = torch.optim.AdamW(model.parameters(), lr=gpc.config.LEARNING_RATE, weight_decay=gpc.config.WEIGHT_DECAY) - - lr_scheduler = CosineAnnealingWarmupLR(optimizer=optimizer, - total_steps=gpc.config.NUM_EPOCHS, - warmup_steps=gpc.config.WARMUP_EPOCHS) - - engine, train_dataloader, test_dataloader, _ = colossalai.initialize(model=model, - optimizer=optimizer, - criterion=criterion, - train_dataloader=train_dataloader, - test_dataloader=test_dataloader) - - logger.info("Engine is built", ranks=[0]) - - timer = MultiTimer() - - trainer = Trainer(engine=engine, logger=logger, timer=timer) - logger.info("Trainer is built", ranks=[0]) - - hook_list = [ - hooks.LogMetricByEpochHook(logger=logger), - hooks.LogMetricByStepHook(), - # hooks.LogTimingByEpochHook(timer=timer, logger=logger), - # hooks.LogMemoryByEpochHook(logger=logger), - hooks.AccuracyHook(accuracy_func=Accuracy()), - hooks.LossHook(), - hooks.ThroughputHook(), - hooks.LRSchedulerHook(lr_scheduler=lr_scheduler, by_epoch=True) - ] - - logger.info("Train start", ranks=[0]) - trainer.fit(train_dataloader=train_dataloader, - test_dataloader=test_dataloader, - epochs=gpc.config.NUM_EPOCHS, - hooks=hook_list, - display_progress=True, - test_interval=1) - - -if __name__ == '__main__': - train_imagenet()
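With this change, the benchmark sources live in the ColossalAI-Benchmark repository and are only referenced here by the submodule commit, so a checkout will have an empty `benchmark/` directory until the submodule is fetched. A typical workflow, using standard git submodule commands (the parent repository URL is left as a placeholder), might look like:
```
$ git submodule update --init benchmark                 # populate benchmark/ in an existing clone
$ git submodule update --remote benchmark               # optionally advance to the tip of the 'main' branch set in .gitmodules
$ git clone --recurse-submodules <colossalai-repo-url>  # or clone the parent repository with submodules from the start
```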