set benchmarks as a git submodule (#156)

* remove benchmark folder

* add benchmark submodule

* update .gitmodules
Frank Lee 2022-01-18 15:48:07 +08:00 committed by GitHub
parent f3802d6b06
commit c7b8ece736
28 changed files with 5 additions and 1240 deletions

.gitmodules vendored Normal file

@@ -0,0 +1,4 @@
[submodule "benchmark"]
path = benchmark
url = https://github.com/hpcaitech/ColossalAI-Benchmark.git
branch = main

benchmark Submodule

@@ -0,0 +1 @@
Subproject commit c319bc2ee9db32aba4a522eccdf89e8d0fb8d9f0
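Once this commit is checked out, running `git submodule update --init` fetches the ColossalAI-Benchmark repository at the commit pinned above into the `benchmark/` directory.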


@@ -1,66 +0,0 @@
# Benchmark for Tuning Accuracy and Efficiency
## Overview
The benchmark collects our efforts in using Colossal-AI to train models on different tasks to achieve SOTA results.
We are interested in both validation accuracy and training speed, and prefer larger batch sizes to take advantage of more GPU devices.
For example, we trained the vision transformer with a batch size of 512 on CIFAR10 and 4096 on ImageNet1k, batch sizes that are rarely used in existing works.
Some of the results obtained with 8x A100 GPUs are shown below.
| Task | Model | Training Time | Top-1 Accuracy |
| ---------- | ------------ | ------------- | -------------- |
| CIFAR10 | [ViT-Lite-7/4](https://arxiv.org/pdf/2104.05704.pdf) | ~ 16 min | ~ 90.5% |
| ImageNet1k | ViT-S/16 | ~ 16.5 h | ~ 74.5% |
The `train.py` script in each task directory runs training with a specific configuration script from `configs/` for each parallelism.
Supported parallelisms include data parallelism only (config names ending with `vanilla`), as well as 1D (`1d`), 2D (`2d`), 2.5D (`2p5d`) and 3D (`3d`) tensor parallelism.
Each configuration script includes the following elements, taking the ImageNet1k task as an example:
```python
from colossalai.amp import AMP_TYPE

TOTAL_BATCH_SIZE = 4096
LEARNING_RATE = 3e-3
WEIGHT_DECAY = 0.3
NUM_EPOCHS = 300
WARMUP_EPOCHS = 32
# data parallel only
TENSOR_PARALLEL_SIZE = 1
TENSOR_PARALLEL_MODE = None
# parallelism setting
parallel = dict(
pipeline=1,
tensor=dict(mode=TENSOR_PARALLEL_MODE, size=TENSOR_PARALLEL_SIZE),
)
fp16 = dict(mode=AMP_TYPE.TORCH, ) # amp setting
gradient_accumulation = 2 # accumulate 2 steps for gradient update
BATCH_SIZE = TOTAL_BATCH_SIZE // gradient_accumulation # actual batch size for dataloader
clip_grad_norm = 1.0 # clip gradient with norm 1.0
```
Upper-case elements are what `train.py` itself reads, while lower-case elements are what Colossal-AI reads to initialize the training (parallelism, AMP, gradient accumulation and gradient clipping).
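For instance, a minimal arithmetic sketch of how these batch-size values combine, assuming 8 data-parallel GPUs (as in the 8x A100 results above) with no tensor parallelism:
```python
# Sketch of the effective batch sizes in the ImageNet1k example above.
# The per-GPU value is what the training scripts pass to their dataloaders via
# gpc.config.BATCH_SIZE // gpc.data_parallel_size.
TOTAL_BATCH_SIZE = 4096           # samples contributing to one optimizer update
gradient_accumulation = 2         # accumulate gradients over 2 steps per update
BATCH_SIZE = TOTAL_BATCH_SIZE // gradient_accumulation  # global dataloader batch per step

data_parallel_size = 8            # assumption: 8 GPUs, TENSOR_PARALLEL_SIZE = 1
per_gpu_batch_size = BATCH_SIZE // data_parallel_size   # 256 samples per GPU per step

assert per_gpu_batch_size * data_parallel_size * gradient_accumulation == TOTAL_BATCH_SIZE
```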
## Usage
To start training, use the following command to run each worker:
```bash
$ DATA=/path/to/dataset python train.py --world_size=WORLD_SIZE \
--rank=RANK \
--local_rank=LOCAL_RANK \
--host=MASTER_IP_ADDRESS \
--port=MASTER_PORT \
--config=CONFIG_FILE
```
Alternatively (and this is the recommended way), start training with `torchrun`:
```bash
$ DATA=/path/to/dataset torchrun --nproc_per_node=NUM_GPUS_PER_NODE \
--nnodes=NUM_NODES \
--node_rank=NODE_RANK \
--master_addr=MASTER_IP_ADDRESS \
--master_port=MASTER_PORT \
train.py --config=CONFIG_FILE
```
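Inside `train.py`, these two launch modes correspond to two Colossal-AI entry points; a minimal sketch mirroring the pattern used in the task scripts is:
```python
import colossalai

# the default parser provides --config plus the --world_size/--rank/--local_rank/--host/--port flags
args = colossalai.get_default_parser().parse_args()

# per-worker launch: ranks and master address are passed explicitly on the command line
# colossalai.launch(config=args.config, rank=args.rank, world_size=args.world_size,
#                   local_rank=args.local_rank, host=args.host, port=args.port)

# torchrun launch: ranks and master address are read from the environment variables set by torchrun
colossalai.launch_from_torch(config=args.config)
```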


@@ -1,18 +0,0 @@
BATCH_SIZE = 512
LEARNING_RATE = 2e-3
WEIGHT_DECAY = 3e-2
TENSOR_PARALLEL_SIZE = 2
TENSOR_PARALLEL_MODE = '1d'
NUM_EPOCHS = 200
WARMUP_EPOCHS = 40
parallel = dict(
pipeline=1,
tensor=dict(mode=TENSOR_PARALLEL_MODE, size=TENSOR_PARALLEL_SIZE),
)
seed = 42
LOG_PATH = f"./vit_{TENSOR_PARALLEL_MODE}_cifar10_tp{TENSOR_PARALLEL_SIZE}_bs{BATCH_SIZE}_lr{LEARNING_RATE}/"


@@ -1,18 +0,0 @@
BATCH_SIZE = 512
LEARNING_RATE = 2e-3
WEIGHT_DECAY = 3e-2
TENSOR_PARALLEL_SIZE = 4
TENSOR_PARALLEL_MODE = '2d'
NUM_EPOCHS = 200
WARMUP_EPOCHS = 40
parallel = dict(
pipeline=1,
tensor=dict(mode=TENSOR_PARALLEL_MODE, size=TENSOR_PARALLEL_SIZE),
)
seed = 42
LOG_PATH = f"./vit_{TENSOR_PARALLEL_MODE}_cifar10_tp{TENSOR_PARALLEL_SIZE}_bs{BATCH_SIZE}_lr{LEARNING_RATE}/"


@@ -1,19 +0,0 @@
BATCH_SIZE = 512
LEARNING_RATE = 2e-3
WEIGHT_DECAY = 3e-2
TENSOR_PARALLEL_SIZE = 4
DEPTH = 1
TENSOR_PARALLEL_MODE = '2.5d'
NUM_EPOCHS = 200
WARMUP_EPOCHS = 40
parallel = dict(
pipeline=1,
tensor=dict(mode=TENSOR_PARALLEL_MODE, size=TENSOR_PARALLEL_SIZE, depth=DEPTH),
)
seed = 42
LOG_PATH = f"./vit_{TENSOR_PARALLEL_MODE}_cifar10_tp{TENSOR_PARALLEL_SIZE}_bs{BATCH_SIZE}_lr{LEARNING_RATE}/"


@@ -1,18 +0,0 @@
BATCH_SIZE = 512
LEARNING_RATE = 2e-3
WEIGHT_DECAY = 3e-2
TENSOR_PARALLEL_SIZE = 8
TENSOR_PARALLEL_MODE = '3d'
NUM_EPOCHS = 200
WARMUP_EPOCHS = 40
parallel = dict(
pipeline=1,
tensor=dict(mode=TENSOR_PARALLEL_MODE, size=TENSOR_PARALLEL_SIZE),
)
seed = 42
LOG_PATH = f"./vit_{TENSOR_PARALLEL_MODE}_cifar10_tp{TENSOR_PARALLEL_SIZE}_bs{BATCH_SIZE}_lr{LEARNING_RATE}/"


@@ -1,18 +0,0 @@
BATCH_SIZE = 512
LEARNING_RATE = 2e-3
WEIGHT_DECAY = 3e-2
TENSOR_PARALLEL_SIZE = 1
TENSOR_PARALLEL_MODE = None
NUM_EPOCHS = 200
WARMUP_EPOCHS = 40
parallel = dict(
pipeline=1,
tensor=dict(mode=TENSOR_PARALLEL_MODE, size=TENSOR_PARALLEL_SIZE),
)
seed = 42
LOG_PATH = f"./vit_{TENSOR_PARALLEL_MODE}_cifar10_tp{TENSOR_PARALLEL_SIZE}_bs{BATCH_SIZE}_lr{LEARNING_RATE}/"


@@ -1,119 +0,0 @@
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
import os
import colossalai
import torch
import torchvision
from colossalai.builder import *
from colossalai.core import global_context as gpc
from colossalai.logging import get_dist_logger
from colossalai.nn import Accuracy, CrossEntropyLoss
from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR
from colossalai.trainer import Trainer, hooks
from colossalai.utils import MultiTimer, get_dataloader
from model_zoo.vit import vit_lite_depth7_patch4_32
from torchvision import transforms
DATASET_PATH = str(os.environ['DATA'])
def build_cifar(batch_size):
transform_train = transforms.Compose([
transforms.RandomCrop(32, padding=4),
transforms.AutoAugment(policy=transforms.AutoAugmentPolicy.CIFAR10),
transforms.ToTensor(),
transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
])
transform_test = transforms.Compose([
transforms.Resize(32),
transforms.ToTensor(),
transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
])
train_dataset = torchvision.datasets.CIFAR10(root=DATASET_PATH,
train=True,
download=True,
transform=transform_train)
test_dataset = torchvision.datasets.CIFAR10(root=DATASET_PATH, train=False, transform=transform_test)
train_dataloader = get_dataloader(dataset=train_dataset,
shuffle=True,
batch_size=batch_size,
num_workers=4,
pin_memory=True)
test_dataloader = get_dataloader(dataset=test_dataset, batch_size=batch_size, num_workers=4, pin_memory=True)
return train_dataloader, test_dataloader
def train_cifar():
args = colossalai.get_default_parser().parse_args()
# standard launch
# colossalai.launch(config=args.config,
# rank=args.rank,
# world_size=args.world_size,
# local_rank=args.local_rank,
# host=args.host,
# port=args.port)
# launch from torchrun
colossalai.launch_from_torch(config=args.config)
logger = get_dist_logger()
if hasattr(gpc.config, 'LOG_PATH'):
if gpc.get_global_rank() == 0:
log_path = gpc.config.LOG_PATH
if not os.path.exists(log_path):
os.mkdir(log_path)
logger.log_to_file(log_path)
model = vit_lite_depth7_patch4_32()
train_dataloader, test_dataloader = build_cifar(gpc.config.BATCH_SIZE // gpc.data_parallel_size)
criterion = CrossEntropyLoss(label_smoothing=0.1)
optimizer = torch.optim.AdamW(model.parameters(), lr=gpc.config.LEARNING_RATE, weight_decay=gpc.config.WEIGHT_DECAY)
steps_per_epoch = len(train_dataloader)
lr_scheduler = CosineAnnealingWarmupLR(optimizer=optimizer,
total_steps=gpc.config.NUM_EPOCHS * steps_per_epoch,
warmup_steps=gpc.config.WARMUP_EPOCHS * steps_per_epoch)
engine, train_dataloader, test_dataloader, lr_scheduler = colossalai.initialize(model=model,
optimizer=optimizer,
criterion=criterion,
train_dataloader=train_dataloader,
test_dataloader=test_dataloader,
lr_scheduler=lr_scheduler)
logger.info("Engine is built", ranks=[0])
timer = MultiTimer()
trainer = Trainer(engine=engine, logger=logger, timer=timer)
logger.info("Trainer is built", ranks=[0])
hook_list = [
hooks.LogMetricByEpochHook(logger=logger),
hooks.LogMetricByStepHook(),
# hooks.LogTimingByEpochHook(timer=timer, logger=logger),
# hooks.LogMemoryByEpochHook(logger=logger),
hooks.AccuracyHook(accuracy_func=Accuracy()),
hooks.LossHook(),
hooks.ThroughputHook(),
hooks.LRSchedulerHook(lr_scheduler=lr_scheduler, by_epoch=False)
]
logger.info("Train start", ranks=[0])
trainer.fit(train_dataloader=train_dataloader,
test_dataloader=test_dataloader,
epochs=gpc.config.NUM_EPOCHS,
hooks=hook_list,
display_progress=True,
test_interval=1)
if __name__ == '__main__':
train_cifar()


@@ -1,29 +0,0 @@
from colossalai.amp import AMP_TYPE
VOCAB_SIZE = 50304
SEQ_LENGTH = 1024
TOTAL_BATCH_SIZE = 256
LEARNING_RATE = 0.00015
WEIGHT_DECAY = 1e-2
TENSOR_PARALLEL_SIZE = 2
TENSOR_PARALLEL_MODE = '1d'
NUM_EPOCHS = 60
WARMUP_EPOCHS = int(NUM_EPOCHS * 0.36)
parallel = dict(
pipeline=1,
tensor=dict(mode=TENSOR_PARALLEL_MODE, size=TENSOR_PARALLEL_SIZE),
)
fp16 = dict(mode=AMP_TYPE.TORCH, )
gradient_accumulation = 2
BATCH_SIZE = TOTAL_BATCH_SIZE // gradient_accumulation
clip_grad_norm = 1.0
LOG_PATH = f"./gpt2_{TENSOR_PARALLEL_MODE}_tp{TENSOR_PARALLEL_SIZE}_bs{BATCH_SIZE}_lr{LEARNING_RATE}_accum{gradient_accumulation}_clip_grad{clip_grad_norm}/"


@@ -1,29 +0,0 @@
from colossalai.amp import AMP_TYPE
VOCAB_SIZE = 50304
SEQ_LENGTH = 1024
TOTAL_BATCH_SIZE = 256
LEARNING_RATE = 0.00015
WEIGHT_DECAY = 1e-2
TENSOR_PARALLEL_SIZE = 4
TENSOR_PARALLEL_MODE = '2d'
NUM_EPOCHS = 60
WARMUP_EPOCHS = int(NUM_EPOCHS * 0.36)
parallel = dict(
pipeline=1,
tensor=dict(mode=TENSOR_PARALLEL_MODE, size=TENSOR_PARALLEL_SIZE),
)
fp16 = dict(mode=AMP_TYPE.TORCH, )
gradient_accumulation = 1
BATCH_SIZE = TOTAL_BATCH_SIZE // gradient_accumulation
clip_grad_norm = 1.0
LOG_PATH = f"./gpt2_{TENSOR_PARALLEL_MODE}_tp{TENSOR_PARALLEL_SIZE}_bs{BATCH_SIZE}_lr{LEARNING_RATE}_accum{gradient_accumulation}_clip_grad{clip_grad_norm}/"


@@ -1,30 +0,0 @@
from colossalai.amp import AMP_TYPE
VOCAB_SIZE = 50304
SEQ_LENGTH = 1024
TOTAL_BATCH_SIZE = 256
LEARNING_RATE = 0.00015
WEIGHT_DECAY = 1e-2
TENSOR_PARALLEL_SIZE = 4
DEPTH = 1
TENSOR_PARALLEL_MODE = '2.5d'
NUM_EPOCHS = 60
WARMUP_EPOCHS = int(NUM_EPOCHS * 0.36)
parallel = dict(
pipeline=1,
tensor=dict(mode=TENSOR_PARALLEL_MODE, size=TENSOR_PARALLEL_SIZE, depth=DEPTH),
)
fp16 = dict(mode=AMP_TYPE.TORCH, )
gradient_accumulation = 1
BATCH_SIZE = TOTAL_BATCH_SIZE // gradient_accumulation
clip_grad_norm = 1.0
LOG_PATH = f"./gpt2_{TENSOR_PARALLEL_MODE}_tp{TENSOR_PARALLEL_SIZE}_bs{BATCH_SIZE}_lr{LEARNING_RATE}_accum{gradient_accumulation}_clip_grad{clip_grad_norm}/"


@@ -1,29 +0,0 @@
from colossalai.amp import AMP_TYPE
VOCAB_SIZE = 50304
SEQ_LENGTH = 1024
TOTAL_BATCH_SIZE = 256
LEARNING_RATE = 0.00015
WEIGHT_DECAY = 1e-2
TENSOR_PARALLEL_SIZE = 8
TENSOR_PARALLEL_MODE = '3d'
NUM_EPOCHS = 60
WARMUP_EPOCHS = int(NUM_EPOCHS * 0.36)
parallel = dict(
pipeline=1,
tensor=dict(mode=TENSOR_PARALLEL_MODE, size=TENSOR_PARALLEL_SIZE),
)
fp16 = dict(mode=AMP_TYPE.TORCH, )
gradient_accumulation = 1
BATCH_SIZE = TOTAL_BATCH_SIZE // gradient_accumulation
clip_grad_norm = 1.0
LOG_PATH = f"./gpt2_{TENSOR_PARALLEL_MODE}_tp{TENSOR_PARALLEL_SIZE}_bs{BATCH_SIZE}_lr{LEARNING_RATE}_accum{gradient_accumulation}_clip_grad{clip_grad_norm}/"


@@ -1,29 +0,0 @@
from colossalai.amp import AMP_TYPE
VOCAB_SIZE = 50304
SEQ_LENGTH = 1024
TOTAL_BATCH_SIZE = 256
LEARNING_RATE = 0.00015
WEIGHT_DECAY = 1e-2
TENSOR_PARALLEL_SIZE = 1
TENSOR_PARALLEL_MODE = None
NUM_EPOCHS = 60
WARMUP_EPOCHS = int(NUM_EPOCHS * 0.36)
parallel = dict(
pipeline=1,
tensor=dict(mode=TENSOR_PARALLEL_MODE, size=TENSOR_PARALLEL_SIZE),
)
fp16 = dict(mode=AMP_TYPE.TORCH, )
gradient_accumulation = 1
BATCH_SIZE = TOTAL_BATCH_SIZE // gradient_accumulation
clip_grad_norm = 1.0
LOG_PATH = f"./gpt2_{TENSOR_PARALLEL_MODE}_tp{TENSOR_PARALLEL_SIZE}_bs{BATCH_SIZE}_lr{LEARNING_RATE}_accum{gradient_accumulation}_clip_grad{clip_grad_norm}/"


@@ -1,37 +0,0 @@
import json
import os
import torch
from colossalai.registry import DATASETS
from torch.utils.data import Dataset
from transformers import GPT2Tokenizer
@DATASETS.register_module
class WebtextDataset(Dataset):
def __init__(self, path, seq_len=1024) -> None:
super().__init__()
root = os.path.dirname(path)
encoded_data_cache_path = os.path.join(root, f'gpt_webtext_{seq_len}.pt')
if os.path.isfile(encoded_data_cache_path):
seq_len_, data, attention_mask = torch.load(encoded_data_cache_path)
if seq_len_ == seq_len:
self.data = data
self.attention_mask = attention_mask
return
raw_data = []
with open(path) as f:
for line in f.readlines():
raw_data.append(json.loads(line)['text'])
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.unk_token
encoded_data = tokenizer(raw_data, padding=True, truncation=True, max_length=seq_len, return_tensors='pt')
self.data = encoded_data['input_ids']
self.attention_mask = encoded_data['attention_mask']
torch.save((seq_len, self.data, self.attention_mask), encoded_data_cache_path)
def __len__(self):
return len(self.data)
def __getitem__(self, index):
return (self.data[index], self.attention_mask[index]), self.data[index]


@@ -1,105 +0,0 @@
import contextlib
import os
import colossalai
import torch
from colossalai.core import global_context as gpc
from colossalai.engine.schedule import (InterleavedPipelineSchedule, PipelineSchedule)
from colossalai.logging import get_dist_logger
from colossalai.nn import CosineAnnealingWarmupLR
from colossalai.trainer import Trainer, hooks
from colossalai.utils import MultiTimer, get_dataloader
from colossalai.zero import zero3_model_context
from model_zoo.gpt import GPTLMLoss, gpt2_small, gpt2_medium, gpt2_large, gpt2_xl
from data import WebtextDataset
def train_gpt():
args = colossalai.get_default_parser().parse_args()
# standard launch
# colossalai.launch(config=args.config,
# rank=args.rank,
# world_size=args.world_size,
# local_rank=args.local_rank,
# host=args.host,
# port=args.port)
# launch from torchrun
colossalai.launch_from_torch(config=args.config)
logger = get_dist_logger()
if hasattr(gpc.config, 'LOG_PATH'):
if gpc.get_global_rank() == 0:
log_path = gpc.config.LOG_PATH
if not os.path.exists(log_path):
os.mkdir(log_path)
logger.log_to_file(log_path)
train_dataset = WebtextDataset(os.environ['DATA'], seq_len=gpc.config.SEQ_LENGTH)
train_dataloader = get_dataloader(train_dataset,
seed=42,
batch_size=gpc.config.BATCH_SIZE // gpc.data_parallel_size,
pin_memory=True,
shuffle=True,
drop_last=True)
logger.info(f'Loaded {len(train_dataset)}/{len(train_dataloader)} samples/batches', ranks=[0])
# zero3 under test
# use_zero3 = hasattr(gpc.config, 'zero') and gpc.config.zero.level == 3
# cm = zero3_model_context() if use_zero3 else contextlib.nullcontext()
# with cm:
# model = gpc.config.model.pop('type')(**gpc.config.model)
model = gpt2_medium(vocab_size=gpc.config.VOCAB_SIZE,
max_position_embeddings=gpc.config.SEQ_LENGTH,
checkpoint=True)
criterion = GPTLMLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.00015, weight_decay=1e-2)
steps_per_epoch = len(train_dataloader) // gpc.config.gradient_accumulation
lr_scheduler = CosineAnnealingWarmupLR(optimizer=optimizer,
total_steps=gpc.config.NUM_EPOCHS * steps_per_epoch,
warmup_steps=gpc.config.WARMUP_EPOCHS * steps_per_epoch,
eta_min=1e-5)
engine, train_dataloader, _, lr_scheduler = colossalai.initialize(model=model,
optimizer=optimizer,
criterion=criterion,
train_dataloader=train_dataloader,
lr_scheduler=lr_scheduler)
# pipeline under test
# num_model_chunks = getattr(gpc.config.model, 'num_chunks', 1)
# if num_model_chunks > 1:
# logger.info('Build InterleavedPipelineSchedule', ranks=[0])
# schedule = InterleavedPipelineSchedule(gpc.config.NUM_MICRO_BATCHES, num_model_chunks)
# else:
# logger.info('Build PipelineSchedule', ranks=[0])
# schedule = PipelineSchedule(gpc.config.NUM_MICRO_BATCHES)
timer = MultiTimer()
trainer = Trainer(engine=engine, logger=logger, timer=timer)
hook_list = [
hooks.LogMetricByEpochHook(logger=logger),
hooks.LogMetricByStepHook(),
hooks.LossHook(),
hooks.ThroughputHook(),
hooks.LRSchedulerHook(lr_scheduler=lr_scheduler, by_epoch=False),
# hooks.TensorboardHook(log_dir='./tb_logs', ranks=[0]),
# hooks.LogMemoryByEpochHook(logger),
# hooks.LogTimingByEpochHook(timer, logger),
# hooks.SaveCheckpointHook(checkpoint_dir='./ckpt')
]
logger.info("Training start", ranks=[0])
trainer.fit(train_dataloader=train_dataloader, epochs=gpc.config.NUM_EPOCHS, hooks=hook_list, display_progress=True)
if __name__ == '__main__':
train_gpt()


@@ -1,26 +0,0 @@
from colossalai.amp import AMP_TYPE
TOTAL_BATCH_SIZE = 4096
LEARNING_RATE = 3e-3
WEIGHT_DECAY = 0.3
TENSOR_PARALLEL_SIZE = 2
TENSOR_PARALLEL_MODE = '1d'
NUM_EPOCHS = 300
WARMUP_EPOCHS = 32
parallel = dict(
pipeline=1,
tensor=dict(mode=TENSOR_PARALLEL_MODE, size=TENSOR_PARALLEL_SIZE),
)
fp16 = dict(mode=AMP_TYPE.TORCH, )
gradient_accumulation = 2
BATCH_SIZE = TOTAL_BATCH_SIZE // gradient_accumulation
clip_grad_norm = 1.0
LOG_PATH = f"./vit_{TENSOR_PARALLEL_MODE}_imagenet100_tp{TENSOR_PARALLEL_SIZE}_bs{BATCH_SIZE}_lr{LEARNING_RATE}_{fp16['mode']}_clip_grad{clip_grad_norm}/"


@@ -1,26 +0,0 @@
from colossalai.amp import AMP_TYPE
TOTAL_BATCH_SIZE = 4096
LEARNING_RATE = 3e-3
WEIGHT_DECAY = 0.3
TENSOR_PARALLEL_SIZE = 4
TENSOR_PARALLEL_MODE = '2d'
NUM_EPOCHS = 300
WARMUP_EPOCHS = 32
parallel = dict(
pipeline=1,
tensor=dict(mode=TENSOR_PARALLEL_MODE, size=TENSOR_PARALLEL_SIZE),
)
fp16 = dict(mode=AMP_TYPE.TORCH, )
gradient_accumulation = 2
BATCH_SIZE = TOTAL_BATCH_SIZE // gradient_accumulation
clip_grad_norm = 1.0
LOG_PATH = f"./vit_{TENSOR_PARALLEL_MODE}_imagenet100_tp{TENSOR_PARALLEL_SIZE}_bs{BATCH_SIZE}_lr{LEARNING_RATE}_{fp16['mode']}_clip_grad{clip_grad_norm}/"


@@ -1,27 +0,0 @@
from colossalai.amp import AMP_TYPE
TOTAL_BATCH_SIZE = 4096
LEARNING_RATE = 3e-3
WEIGHT_DECAY = 0.3
TENSOR_PARALLEL_SIZE = 4
DEPTH = 1
TENSOR_PARALLEL_MODE = '2.5d'
NUM_EPOCHS = 300
WARMUP_EPOCHS = 32
parallel = dict(
pipeline=1,
tensor=dict(mode=TENSOR_PARALLEL_MODE, size=TENSOR_PARALLEL_SIZE, depth=DEPTH),
)
fp16 = dict(mode=AMP_TYPE.TORCH, )
gradient_accumulation = 2
BATCH_SIZE = TOTAL_BATCH_SIZE // gradient_accumulation
clip_grad_norm = 1.0
LOG_PATH = f"./vit_{TENSOR_PARALLEL_MODE}_imagenet100_tp{TENSOR_PARALLEL_SIZE}_bs{BATCH_SIZE}_lr{LEARNING_RATE}_{fp16['mode']}_clip_grad{clip_grad_norm}/"


@@ -1,26 +0,0 @@
from colossalai.amp import AMP_TYPE
TOTAL_BATCH_SIZE = 4096
LEARNING_RATE = 3e-3
WEIGHT_DECAY = 0.3
TENSOR_PARALLEL_SIZE = 8
TENSOR_PARALLEL_MODE = '3d'
NUM_EPOCHS = 300
WARMUP_EPOCHS = 32
parallel = dict(
pipeline=1,
tensor=dict(mode=TENSOR_PARALLEL_MODE, size=TENSOR_PARALLEL_SIZE),
)
fp16 = dict(mode=AMP_TYPE.TORCH, )
gradient_accumulation = 2
BATCH_SIZE = TOTAL_BATCH_SIZE // gradient_accumulation
clip_grad_norm = 1.0
LOG_PATH = f"./vit_{TENSOR_PARALLEL_MODE}_imagenet100_tp{TENSOR_PARALLEL_SIZE}_bs{BATCH_SIZE}_lr{LEARNING_RATE}_{fp16['mode']}_clip_grad{clip_grad_norm}/"


@@ -1,26 +0,0 @@
from colossalai.amp import AMP_TYPE
TOTAL_BATCH_SIZE = 4096
LEARNING_RATE = 3e-3
WEIGHT_DECAY = 0.3
TENSOR_PARALLEL_SIZE = 1
TENSOR_PARALLEL_MODE = None
NUM_EPOCHS = 300
WARMUP_EPOCHS = 32
parallel = dict(
pipeline=1,
tensor=dict(mode=TENSOR_PARALLEL_MODE, size=TENSOR_PARALLEL_SIZE),
)
fp16 = dict(mode=AMP_TYPE.TORCH, )
gradient_accumulation = 2
BATCH_SIZE = TOTAL_BATCH_SIZE // gradient_accumulation
clip_grad_norm = 1.0
LOG_PATH = f"./vit_{TENSOR_PARALLEL_MODE}_imagenet100_tp{TENSOR_PARALLEL_SIZE}_bs{BATCH_SIZE}_lr{LEARNING_RATE}_{fp16['mode']}_clip_grad{clip_grad_norm}/"


@@ -1,207 +0,0 @@
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
import glob
import os
import colossalai
import nvidia.dali.fn as fn
import nvidia.dali.tfrecord as tfrec
import torch
from colossalai.builder import *
from colossalai.context import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.logging import get_dist_logger
from colossalai.nn import Accuracy, CrossEntropyLoss
from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR
from colossalai.trainer import Trainer, hooks
from colossalai.utils import MultiTimer
from model_zoo.vit import vit_small_patch16_224
from nvidia.dali import types
from nvidia.dali.pipeline import Pipeline
from nvidia.dali.plugin.pytorch import DALIClassificationIterator
DATASET_PATH = str(os.environ['DATA'])
TRAIN_RECS = DATASET_PATH + '/train/*'
VAL_RECS = DATASET_PATH + '/validation/*'
TRAIN_IDX = DATASET_PATH + '/idx_files/train/*'
VAL_IDX = DATASET_PATH + '/idx_files/validation/*'
class DaliDataloader(DALIClassificationIterator):
def __init__(self,
tfrec_filenames,
tfrec_idx_filenames,
shard_id=0,
num_shards=1,
batch_size=128,
num_threads=4,
resize=256,
crop=224,
prefetch=2,
training=True,
gpu_aug=False,
cuda=True):
pipe = Pipeline(batch_size=batch_size,
num_threads=num_threads,
device_id=torch.cuda.current_device() if cuda else None,
seed=1024)
with pipe:
inputs = fn.readers.tfrecord(path=tfrec_filenames,
index_path=tfrec_idx_filenames,
random_shuffle=training,
shard_id=shard_id,
num_shards=num_shards,
initial_fill=10000,
read_ahead=True,
prefetch_queue_depth=prefetch,
name='Reader',
features={
'image/encoded': tfrec.FixedLenFeature((), tfrec.string, ""),
'image/class/label': tfrec.FixedLenFeature([1], tfrec.int64, -1),
})
images = inputs["image/encoded"]
if training:
images = fn.decoders.image(images, device='mixed' if gpu_aug else 'cpu', output_type=types.RGB)
images = fn.random_resized_crop(images, size=crop, device='gpu' if gpu_aug else 'cpu')
flip_lr = fn.random.coin_flip(probability=0.5)
else:
# decode jpeg and resize
images = fn.decoders.image(images, device='mixed' if gpu_aug else 'cpu', output_type=types.RGB)
images = fn.resize(images,
device='gpu' if gpu_aug else 'cpu',
resize_x=resize,
resize_y=resize,
dtype=types.FLOAT,
interp_type=types.INTERP_TRIANGULAR)
flip_lr = False
# center crop and normalise
images = fn.crop_mirror_normalize(images,
dtype=types.FLOAT,
crop=(crop, crop),
mean=[127.5],
std=[127.5],
mirror=flip_lr)
label = inputs["image/class/label"] - 1 # 0-999
# LSG: element_extract will raise exception, let's flatten outside
# label = fn.element_extract(label, element_map=0) # Flatten
if cuda: # transfer data to gpu
pipe.set_outputs(images.gpu(), label.gpu())
else:
pipe.set_outputs(images, label)
pipe.build()
last_batch_policy = 'DROP' if training else 'PARTIAL'
super().__init__(pipe, reader_name="Reader", auto_reset=True, last_batch_policy=last_batch_policy)
def __iter__(self):
# if not reset (after an epoch), reset; if just initialize, ignore
if self._counter >= self._size or self._size < 0:
self.reset()
return self
def __next__(self):
data = super().__next__()
img, label = data[0]['data'], data[0]['label']
label = label.squeeze()
return (img, ), (label, )
def build_dali_train(batch_size):
return DaliDataloader(
sorted(glob.glob(TRAIN_RECS)),
sorted(glob.glob(TRAIN_IDX)),
batch_size=batch_size,
shard_id=gpc.get_local_rank(ParallelMode.DATA),
num_shards=gpc.get_world_size(ParallelMode.DATA),
training=True,
gpu_aug=True,
cuda=True,
)
def build_dali_test(batch_size):
return DaliDataloader(
sorted(glob.glob(VAL_RECS)),
sorted(glob.glob(VAL_IDX)),
batch_size=batch_size,
shard_id=gpc.get_local_rank(ParallelMode.DATA),
num_shards=gpc.get_world_size(ParallelMode.DATA),
training=False,
gpu_aug=True,
cuda=True,
)
def train_imagenet():
args = colossalai.get_default_parser().parse_args()
# standard launch
# colossalai.launch(config=args.config,
# rank=args.rank,
# world_size=args.world_size,
# local_rank=args.local_rank,
# host=args.host,
# port=args.port)
# launch from torchrun
colossalai.launch_from_torch(config=args.config)
logger = get_dist_logger()
if hasattr(gpc.config, 'LOG_PATH'):
if gpc.get_global_rank() == 0:
log_path = gpc.config.LOG_PATH
if not os.path.exists(log_path):
os.mkdir(log_path)
logger.log_to_file(log_path)
model = vit_small_patch16_224(num_classes=100, init_method='jax')
train_dataloader = build_dali_train(gpc.config.BATCH_SIZE // gpc.data_parallel_size)
test_dataloader = build_dali_test(gpc.config.BATCH_SIZE // gpc.data_parallel_size)
criterion = CrossEntropyLoss(label_smoothing=0.1)
optimizer = torch.optim.AdamW(model.parameters(), lr=gpc.config.LEARNING_RATE, weight_decay=gpc.config.WEIGHT_DECAY)
lr_scheduler = CosineAnnealingWarmupLR(optimizer=optimizer,
total_steps=gpc.config.NUM_EPOCHS,
warmup_steps=gpc.config.WARMUP_EPOCHS)
engine, train_dataloader, test_dataloader, _ = colossalai.initialize(model=model,
optimizer=optimizer,
criterion=criterion,
train_dataloader=train_dataloader,
test_dataloader=test_dataloader)
logger.info("Engine is built", ranks=[0])
timer = MultiTimer()
trainer = Trainer(engine=engine, logger=logger, timer=timer)
logger.info("Trainer is built", ranks=[0])
hook_list = [
hooks.LogMetricByEpochHook(logger=logger),
hooks.LogMetricByStepHook(),
# hooks.LogTimingByEpochHook(timer=timer, logger=logger),
# hooks.LogMemoryByEpochHook(logger=logger),
hooks.AccuracyHook(accuracy_func=Accuracy()),
hooks.LossHook(),
hooks.ThroughputHook(),
hooks.LRSchedulerHook(lr_scheduler=lr_scheduler, by_epoch=True)
]
logger.info("Train start", ranks=[0])
trainer.fit(train_dataloader=train_dataloader,
test_dataloader=test_dataloader,
epochs=gpc.config.NUM_EPOCHS,
hooks=hook_list,
display_progress=True,
test_interval=1)
if __name__ == '__main__':
train_imagenet()


@@ -1,26 +0,0 @@
from colossalai.amp import AMP_TYPE
TOTAL_BATCH_SIZE = 4096
LEARNING_RATE = 3e-3
WEIGHT_DECAY = 0.3
TENSOR_PARALLEL_SIZE = 2
TENSOR_PARALLEL_MODE = '1d'
NUM_EPOCHS = 300
WARMUP_EPOCHS = 32
parallel = dict(
pipeline=1,
tensor=dict(mode=TENSOR_PARALLEL_MODE, size=TENSOR_PARALLEL_SIZE),
)
fp16 = dict(mode=AMP_TYPE.TORCH, )
gradient_accumulation = 2
BATCH_SIZE = TOTAL_BATCH_SIZE // gradient_accumulation
clip_grad_norm = 1.0
LOG_PATH = f"./vit_{TENSOR_PARALLEL_MODE}_imagenet1k_tp{TENSOR_PARALLEL_SIZE}_bs{BATCH_SIZE}_lr{LEARNING_RATE}_{fp16['mode']}_clip_grad{clip_grad_norm}/"


@@ -1,26 +0,0 @@
from colossalai.amp import AMP_TYPE
TOTAL_BATCH_SIZE = 4096
LEARNING_RATE = 3e-3
WEIGHT_DECAY = 0.3
TENSOR_PARALLEL_SIZE = 4
TENSOR_PARALLEL_MODE = '2d'
NUM_EPOCHS = 300
WARMUP_EPOCHS = 32
parallel = dict(
pipeline=1,
tensor=dict(mode=TENSOR_PARALLEL_MODE, size=TENSOR_PARALLEL_SIZE),
)
fp16 = dict(mode=AMP_TYPE.TORCH, )
gradient_accumulation = 2
BATCH_SIZE = TOTAL_BATCH_SIZE // gradient_accumulation
clip_grad_norm = 1.0
LOG_PATH = f"./vit_{TENSOR_PARALLEL_MODE}_imagenet1k_tp{TENSOR_PARALLEL_SIZE}_bs{BATCH_SIZE}_lr{LEARNING_RATE}_{fp16['mode']}_clip_grad{clip_grad_norm}/"


@@ -1,27 +0,0 @@
from colossalai.amp import AMP_TYPE
TOTAL_BATCH_SIZE = 4096
LEARNING_RATE = 3e-3
WEIGHT_DECAY = 0.3
TENSOR_PARALLEL_SIZE = 4
DEPTH = 1
TENSOR_PARALLEL_MODE = '2.5d'
NUM_EPOCHS = 300
WARMUP_EPOCHS = 32
parallel = dict(
pipeline=1,
tensor=dict(mode=TENSOR_PARALLEL_MODE, size=TENSOR_PARALLEL_SIZE, depth=DEPTH),
)
fp16 = dict(mode=AMP_TYPE.TORCH, )
gradient_accumulation = 2
BATCH_SIZE = TOTAL_BATCH_SIZE // gradient_accumulation
clip_grad_norm = 1.0
LOG_PATH = f"./vit_{TENSOR_PARALLEL_MODE}_imagenet1k_tp{TENSOR_PARALLEL_SIZE}_bs{BATCH_SIZE}_lr{LEARNING_RATE}_{fp16['mode']}_clip_grad{clip_grad_norm}/"


@@ -1,26 +0,0 @@
from colossalai.amp import AMP_TYPE
TOTAL_BATCH_SIZE = 4096
LEARNING_RATE = 3e-3
WEIGHT_DECAY = 0.3
TENSOR_PARALLEL_SIZE = 8
TENSOR_PARALLEL_MODE = '3d'
NUM_EPOCHS = 300
WARMUP_EPOCHS = 32
parallel = dict(
pipeline=1,
tensor=dict(mode=TENSOR_PARALLEL_MODE, size=TENSOR_PARALLEL_SIZE),
)
fp16 = dict(mode=AMP_TYPE.TORCH, )
gradient_accumulation = 2
BATCH_SIZE = TOTAL_BATCH_SIZE // gradient_accumulation
clip_grad_norm = 1.0
LOG_PATH = f"./vit_{TENSOR_PARALLEL_MODE}_imagenet1k_tp{TENSOR_PARALLEL_SIZE}_bs{BATCH_SIZE}_lr{LEARNING_RATE}_{fp16['mode']}_clip_grad{clip_grad_norm}/"


@@ -1,26 +0,0 @@
from colossalai.amp import AMP_TYPE
TOTAL_BATCH_SIZE = 4096
LEARNING_RATE = 3e-3
WEIGHT_DECAY = 0.3
TENSOR_PARALLEL_SIZE = 1
TENSOR_PARALLEL_MODE = None
NUM_EPOCHS = 300
WARMUP_EPOCHS = 32
parallel = dict(
pipeline=1,
tensor=dict(mode=TENSOR_PARALLEL_MODE, size=TENSOR_PARALLEL_SIZE),
)
fp16 = dict(mode=AMP_TYPE.TORCH, )
gradient_accumulation = 2
BATCH_SIZE = TOTAL_BATCH_SIZE // gradient_accumulation
clip_grad_norm = 1.0
LOG_PATH = f"./vit_{TENSOR_PARALLEL_MODE}_imagenet1k_tp{TENSOR_PARALLEL_SIZE}_bs{BATCH_SIZE}_lr{LEARNING_RATE}_{fp16['mode']}_clip_grad{clip_grad_norm}/"


@@ -1,207 +0,0 @@
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
import glob
import os
import colossalai
import nvidia.dali.fn as fn
import nvidia.dali.tfrecord as tfrec
import torch
from colossalai.builder import *
from colossalai.context import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.logging import get_dist_logger
from colossalai.nn import Accuracy, CrossEntropyLoss
from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR
from colossalai.trainer import Trainer, hooks
from colossalai.utils import MultiTimer
from model_zoo.vit import vit_small_patch16_224
from nvidia.dali import types
from nvidia.dali.pipeline import Pipeline
from nvidia.dali.plugin.pytorch import DALIClassificationIterator
DATASET_PATH = str(os.environ['DATA'])
TRAIN_RECS = DATASET_PATH + '/train/*'
VAL_RECS = DATASET_PATH + '/validation/*'
TRAIN_IDX = DATASET_PATH + '/idx_files/train/*'
VAL_IDX = DATASET_PATH + '/idx_files/validation/*'
class DaliDataloader(DALIClassificationIterator):
def __init__(self,
tfrec_filenames,
tfrec_idx_filenames,
shard_id=0,
num_shards=1,
batch_size=128,
num_threads=4,
resize=256,
crop=224,
prefetch=2,
training=True,
gpu_aug=False,
cuda=True):
pipe = Pipeline(batch_size=batch_size,
num_threads=num_threads,
device_id=torch.cuda.current_device() if cuda else None,
seed=1024)
with pipe:
inputs = fn.readers.tfrecord(path=tfrec_filenames,
index_path=tfrec_idx_filenames,
random_shuffle=training,
shard_id=shard_id,
num_shards=num_shards,
initial_fill=10000,
read_ahead=True,
prefetch_queue_depth=prefetch,
name='Reader',
features={
'image/encoded': tfrec.FixedLenFeature((), tfrec.string, ""),
'image/class/label': tfrec.FixedLenFeature([1], tfrec.int64, -1),
})
images = inputs["image/encoded"]
if training:
images = fn.decoders.image(images, device='mixed' if gpu_aug else 'cpu', output_type=types.RGB)
images = fn.random_resized_crop(images, size=crop, device='gpu' if gpu_aug else 'cpu')
flip_lr = fn.random.coin_flip(probability=0.5)
else:
# decode jpeg and resize
images = fn.decoders.image(images, device='mixed' if gpu_aug else 'cpu', output_type=types.RGB)
images = fn.resize(images,
device='gpu' if gpu_aug else 'cpu',
resize_x=resize,
resize_y=resize,
dtype=types.FLOAT,
interp_type=types.INTERP_TRIANGULAR)
flip_lr = False
# center crop and normalise
images = fn.crop_mirror_normalize(images,
dtype=types.FLOAT,
crop=(crop, crop),
mean=[127.5],
std=[127.5],
mirror=flip_lr)
label = inputs["image/class/label"] - 1 # 0-999
# LSG: element_extract will raise exception, let's flatten outside
# label = fn.element_extract(label, element_map=0) # Flatten
if cuda: # transfer data to gpu
pipe.set_outputs(images.gpu(), label.gpu())
else:
pipe.set_outputs(images, label)
pipe.build()
last_batch_policy = 'DROP' if training else 'PARTIAL'
super().__init__(pipe, reader_name="Reader", auto_reset=True, last_batch_policy=last_batch_policy)
def __iter__(self):
# if not reset (after an epoch), reset; if just initialize, ignore
if self._counter >= self._size or self._size < 0:
self.reset()
return self
def __next__(self):
data = super().__next__()
img, label = data[0]['data'], data[0]['label']
label = label.squeeze()
return (img, ), (label, )
def build_dali_train(batch_size):
return DaliDataloader(
sorted(glob.glob(TRAIN_RECS)),
sorted(glob.glob(TRAIN_IDX)),
batch_size=batch_size,
shard_id=gpc.get_local_rank(ParallelMode.DATA),
num_shards=gpc.get_world_size(ParallelMode.DATA),
training=True,
gpu_aug=True,
cuda=True,
)
def build_dali_test(batch_size):
return DaliDataloader(
sorted(glob.glob(VAL_RECS)),
sorted(glob.glob(VAL_IDX)),
batch_size=batch_size,
shard_id=gpc.get_local_rank(ParallelMode.DATA),
num_shards=gpc.get_world_size(ParallelMode.DATA),
training=False,
gpu_aug=True,
cuda=True,
)
def train_imagenet():
args = colossalai.get_default_parser().parse_args()
# standard launch
# colossalai.launch(config=args.config,
# rank=args.rank,
# world_size=args.world_size,
# local_rank=args.local_rank,
# host=args.host,
# port=args.port)
# launch from torchrun
colossalai.launch_from_torch(config=args.config)
logger = get_dist_logger()
if hasattr(gpc.config, 'LOG_PATH'):
if gpc.get_global_rank() == 0:
log_path = gpc.config.LOG_PATH
if not os.path.exists(log_path):
os.mkdir(log_path)
logger.log_to_file(log_path)
model = vit_small_patch16_224(num_classes=1000, init_method='jax')
train_dataloader = build_dali_train(gpc.config.BATCH_SIZE // gpc.data_parallel_size)
test_dataloader = build_dali_test(gpc.config.BATCH_SIZE // gpc.data_parallel_size)
criterion = CrossEntropyLoss(label_smoothing=0.1)
optimizer = torch.optim.AdamW(model.parameters(), lr=gpc.config.LEARNING_RATE, weight_decay=gpc.config.WEIGHT_DECAY)
lr_scheduler = CosineAnnealingWarmupLR(optimizer=optimizer,
total_steps=gpc.config.NUM_EPOCHS,
warmup_steps=gpc.config.WARMUP_EPOCHS)
engine, train_dataloader, test_dataloader, _ = colossalai.initialize(model=model,
optimizer=optimizer,
criterion=criterion,
train_dataloader=train_dataloader,
test_dataloader=test_dataloader)
logger.info("Engine is built", ranks=[0])
timer = MultiTimer()
trainer = Trainer(engine=engine, logger=logger, timer=timer)
logger.info("Trainer is built", ranks=[0])
hook_list = [
hooks.LogMetricByEpochHook(logger=logger),
hooks.LogMetricByStepHook(),
# hooks.LogTimingByEpochHook(timer=timer, logger=logger),
# hooks.LogMemoryByEpochHook(logger=logger),
hooks.AccuracyHook(accuracy_func=Accuracy()),
hooks.LossHook(),
hooks.ThroughputHook(),
hooks.LRSchedulerHook(lr_scheduler=lr_scheduler, by_epoch=True)
]
logger.info("Train start", ranks=[0])
trainer.fit(train_dataloader=train_dataloader,
test_dataloader=test_dataloader,
epochs=gpc.config.NUM_EPOCHS,
hooks=hook_list,
display_progress=True,
test_interval=1)
if __name__ == '__main__':
train_imagenet()