set benchmarks as a git submodule (#156)

* remove benchmark folder * added benchmark submodule * update .gitmodules
2022-01-18 15:48:07 +08:00 · 2022-01-18 15:48:07 +08:00 · c7b8ece736
parent f3802d6b06
commit c7b8ece736
28 changed files with 5 additions and 1240 deletions
--- a/.gitmodules
+++ b/.gitmodules
@ -0,0 +1,4 @@
 [submodule "benchmark"]
 	path = benchmark
 	url = https://github.com/hpcaitech/ColossalAI-Benchmark.git
 	branch = main
--- a/1
+++ b/1
@ -0,0 +1 @@
 Subproject commit c319bc2ee9db32aba4a522eccdf89e8d0fb8d9f0
--- a/benchmark/README.md
+++ b/benchmark/README.md
@ -1,66 +0,0 @@
 # Benchmark for Tuning Accuracy and Efficiency
 ## Overview
 The benchmark includes our efforts in using Colossal-AI to train different tasks to achieve SOTA results.
 We are interested in both validation accuracy and training speed, and prefer larger batch size to take advantage of more GPU devices.
 For example, we trained vision transformer with batch size 512 on CIFAR10 and 4096 on ImageNet1k, which are basically not used in existing works.
 Some of the results in the benchmark trained with 8x A100 are shown below.
 | Task       | Model        | Training Time | Top-1 Accuracy |
 | ---------- | ------------ | ------------- | -------------- |
 | CIFAR10    | [ViT-Lite-7/4](https://arxiv.org/pdf/2104.05704.pdf) | ~ 16 min      | ~ 90.5%        |
 | ImageNet1k | ViT-S/16     | ~ 16.5 h      | ~ 74.5%        |
 The `train.py` script in each task runs training with the specific configuration script in `configs/` for different parallelisms.
 Supported parallelisms include data parallel only (ends with `vanilla`), 1D (ends with `1d`), 2D (ends with `2d`), 2.5D (ends with `2p5d`), 3D (ends with `3d`).
 Each configuration script basically includes the following elements, taking the ImageNet1k task as an example:
 ```
 TOTAL_BATCH_SIZE = 4096
 LEARNING_RATE = 3e-3
 WEIGHT_DECAY = 0.3
 NUM_EPOCHS = 300
 WARMUP_EPOCHS = 32
 # data parallel only
 TENSOR_PARALLEL_SIZE = 1    
 TENSOR_PARALLEL_MODE = None
 # parallelism setting
 parallel = dict(
    pipeline=1,
    tensor=dict(mode=TENSOR_PARALLEL_MODE, size=TENSOR_PARALLEL_SIZE),
 )
 fp16 = dict(mode=AMP_TYPE.TORCH, ) # amp setting
 gradient_accumulation = 2 # accumulate 2 steps for gradient update
 BATCH_SIZE = TOTAL_BATCH_SIZE // gradient_accumulation # actual batch size for dataloader
 clip_grad_norm = 1.0 # clip gradient with norm 1.0
 ```
 Upper case elements are basically what `train.py` needs, and lower case elements are what Colossal-AI needs to initialize the training.
 ## Usage
 To start training, use the following command to run each worker:
 ```
 $ DATA=/path/to/dataset python train.py --world_size=WORLD_SIZE \
                                        --rank=RANK \
                                        --local_rank=LOCAL_RANK \
                                        --host=MASTER_IP_ADDRESS \
                                        --port=MASTER_PORT \
                                        --config=CONFIG_FILE
 ```
 It is also recommended to start training with `torchrun` as:
 ```
 $ DATA=/path/to/dataset torchrun --nproc_per_node=NUM_GPUS_PER_NODE \
                                 --nnodes=NUM_NODES \
                                 --node_rank=NODE_RANK \
                                 --master_addr=MASTER_IP_ADDRESS \
                                 --master_port=MASTER_PORT \
                                 train.py --config=CONFIG_FILE
 ```
--- a/benchmark/cifar/configs/vit_1d.py
+++ b/benchmark/cifar/configs/vit_1d.py
@ -1,18 +0,0 @@
 # CIFAR10 ViT benchmark: 1D tensor parallelism with tensor-parallel size 2.
 # Upper-case names are read by train.py, lower-case names by Colossal-AI.
 # --- optimisation (BATCH_SIZE is the global batch; train.py splits it over data-parallel ranks) ---
 BATCH_SIZE = 512
 LEARNING_RATE = 0.002
 WEIGHT_DECAY = 0.03
 # --- schedule, in epochs ---
 NUM_EPOCHS = 200
 WARMUP_EPOCHS = 40
 # --- parallel layout ---
 TENSOR_PARALLEL_MODE = '1d'
 TENSOR_PARALLEL_SIZE = 2
 parallel = {
    'pipeline': 1,
    'tensor': {'mode': TENSOR_PARALLEL_MODE, 'size': TENSOR_PARALLEL_SIZE},
 }
 seed = 42
 # directory that global rank 0 logs into
 LOG_PATH = f"./vit_{TENSOR_PARALLEL_MODE}_cifar10_tp{TENSOR_PARALLEL_SIZE}_bs{BATCH_SIZE}_lr{LEARNING_RATE}/"
--- a/benchmark/cifar/configs/vit_2d.py
+++ b/benchmark/cifar/configs/vit_2d.py
@ -1,18 +0,0 @@
 # CIFAR10 ViT benchmark: 2D tensor parallelism with tensor-parallel size 4.
 # Upper-case names are read by train.py, lower-case names by Colossal-AI.
 # --- optimisation (BATCH_SIZE is the global batch; train.py splits it over data-parallel ranks) ---
 BATCH_SIZE = 512
 LEARNING_RATE = 0.002
 WEIGHT_DECAY = 0.03
 # --- schedule, in epochs ---
 NUM_EPOCHS = 200
 WARMUP_EPOCHS = 40
 # --- parallel layout ---
 TENSOR_PARALLEL_MODE = '2d'
 TENSOR_PARALLEL_SIZE = 4
 parallel = {
    'pipeline': 1,
    'tensor': {'mode': TENSOR_PARALLEL_MODE, 'size': TENSOR_PARALLEL_SIZE},
 }
 seed = 42
 # directory that global rank 0 logs into
 LOG_PATH = f"./vit_{TENSOR_PARALLEL_MODE}_cifar10_tp{TENSOR_PARALLEL_SIZE}_bs{BATCH_SIZE}_lr{LEARNING_RATE}/"
--- a/benchmark/cifar/configs/vit_2p5d.py
+++ b/benchmark/cifar/configs/vit_2p5d.py
@ -1,19 +0,0 @@
 # CIFAR10 ViT benchmark: 2.5D tensor parallelism with tensor-parallel size 4 and depth 1.
 # Upper-case names are read by train.py, lower-case names by Colossal-AI.
 # --- optimisation (BATCH_SIZE is the global batch; train.py splits it over data-parallel ranks) ---
 BATCH_SIZE = 512
 LEARNING_RATE = 0.002
 WEIGHT_DECAY = 0.03
 # --- schedule, in epochs ---
 NUM_EPOCHS = 200
 WARMUP_EPOCHS = 40
 # --- parallel layout ---
 TENSOR_PARALLEL_MODE = '2.5d'
 TENSOR_PARALLEL_SIZE = 4
 DEPTH = 1
 parallel = {
    'pipeline': 1,
    'tensor': {'mode': TENSOR_PARALLEL_MODE, 'size': TENSOR_PARALLEL_SIZE, 'depth': DEPTH},
 }
 seed = 42
 # directory that global rank 0 logs into
 LOG_PATH = f"./vit_{TENSOR_PARALLEL_MODE}_cifar10_tp{TENSOR_PARALLEL_SIZE}_bs{BATCH_SIZE}_lr{LEARNING_RATE}/"
--- a/benchmark/cifar/configs/vit_3d.py
+++ b/benchmark/cifar/configs/vit_3d.py
@ -1,18 +0,0 @@
 # CIFAR10 ViT benchmark: 3D tensor parallelism with tensor-parallel size 8.
 # Upper-case names are read by train.py, lower-case names by Colossal-AI.
 # --- optimisation (BATCH_SIZE is the global batch; train.py splits it over data-parallel ranks) ---
 BATCH_SIZE = 512
 LEARNING_RATE = 0.002
 WEIGHT_DECAY = 0.03
 # --- schedule, in epochs ---
 NUM_EPOCHS = 200
 WARMUP_EPOCHS = 40
 # --- parallel layout ---
 TENSOR_PARALLEL_MODE = '3d'
 TENSOR_PARALLEL_SIZE = 8
 parallel = {
    'pipeline': 1,
    'tensor': {'mode': TENSOR_PARALLEL_MODE, 'size': TENSOR_PARALLEL_SIZE},
 }
 seed = 42
 # directory that global rank 0 logs into
 LOG_PATH = f"./vit_{TENSOR_PARALLEL_MODE}_cifar10_tp{TENSOR_PARALLEL_SIZE}_bs{BATCH_SIZE}_lr{LEARNING_RATE}/"
--- a/benchmark/cifar/configs/vit_vanilla.py
+++ b/benchmark/cifar/configs/vit_vanilla.py
@ -1,18 +0,0 @@
 # CIFAR10 ViT benchmark: data parallel only (no tensor parallelism).
 # Upper-case names are read by train.py, lower-case names by Colossal-AI.
 # --- optimisation (BATCH_SIZE is the global batch; train.py splits it over data-parallel ranks) ---
 BATCH_SIZE = 512
 LEARNING_RATE = 0.002
 WEIGHT_DECAY = 0.03
 # --- schedule, in epochs ---
 NUM_EPOCHS = 200
 WARMUP_EPOCHS = 40
 # --- parallel layout ---
 TENSOR_PARALLEL_MODE = None
 TENSOR_PARALLEL_SIZE = 1
 parallel = {
    'pipeline': 1,
    'tensor': {'mode': TENSOR_PARALLEL_MODE, 'size': TENSOR_PARALLEL_SIZE},
 }
 seed = 42
 # directory that global rank 0 logs into
 LOG_PATH = f"./vit_{TENSOR_PARALLEL_MODE}_cifar10_tp{TENSOR_PARALLEL_SIZE}_bs{BATCH_SIZE}_lr{LEARNING_RATE}/"
--- a/benchmark/cifar/train.py
+++ b/benchmark/cifar/train.py
@ -1,119 +0,0 @@
 #!/usr/bin/env python
 # -*- encoding: utf-8 -*-
 import os
 import colossalai
 import torch
 import torchvision
 from colossalai.builder import *
 from colossalai.core import global_context as gpc
 from colossalai.logging import get_dist_logger
 from colossalai.nn import Accuracy, CrossEntropyLoss
 from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR
 from colossalai.trainer import Trainer, hooks
 from colossalai.utils import MultiTimer, get_dataloader
 from model_zoo.vit import vit_lite_depth7_patch4_32
 from torchvision import transforms
 # Dataset root directory, taken from the DATA environment variable (KeyError at import time when it is unset).
 DATASET_PATH = str(os.environ['DATA'])
 def build_cifar(batch_size):
    """Create the CIFAR10 training and test dataloaders.

    The datasets are stored under ``DATASET_PATH``; the training split is downloaded when missing.

    Args:
        batch_size: batch size handed to both dataloaders.

    Returns:
        A ``(train_dataloader, test_dataloader)`` pair.
    """
    # per-channel statistics shared by the training and the evaluation pipeline
    mean, std = (0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)
    train_tf = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.AutoAugment(policy=transforms.AutoAugmentPolicy.CIFAR10),
        transforms.ToTensor(),
        transforms.Normalize(mean, std),
    ])
    test_tf = transforms.Compose([
        transforms.Resize(32),
        transforms.ToTensor(),
        transforms.Normalize(mean, std),
    ])
    train_set = torchvision.datasets.CIFAR10(root=DATASET_PATH, train=True, download=True, transform=train_tf)
    test_set = torchvision.datasets.CIFAR10(root=DATASET_PATH, train=False, transform=test_tf)
    # loader options common to both splits; only the training loader shuffles
    loader_kwargs = dict(batch_size=batch_size, num_workers=4, pin_memory=True)
    train_loader = get_dataloader(dataset=train_set, shuffle=True, **loader_kwargs)
    test_loader = get_dataloader(dataset=test_set, **loader_kwargs)
    return train_loader, test_loader
 def train_cifar():
    """Train ``vit_lite_depth7_patch4_32`` on CIFAR10 with the Colossal-AI ``Trainer``.

    The script reads ``--config`` from the default Colossal-AI argument parser and initialises the
    distributed environment through ``colossalai.launch_from_torch``. The config has to define
    ``BATCH_SIZE``, ``LEARNING_RATE``, ``WEIGHT_DECAY``, ``NUM_EPOCHS`` and ``WARMUP_EPOCHS``;
    ``LOG_PATH`` is optional and turns on file logging for global rank 0.
    """
    args = colossalai.get_default_parser().parse_args()
    # Launch from torchrun. For a manual per-worker launch use instead:
    # colossalai.launch(config=args.config,
    #                   rank=args.rank,
    #                   world_size=args.world_size,
    #                   local_rank=args.local_rank,
    #                   host=args.host,
    #                   port=args.port)
    colossalai.launch_from_torch(config=args.config)
    logger = get_dist_logger()
    # only global rank 0 writes a log file
    if hasattr(gpc.config, 'LOG_PATH') and gpc.get_global_rank() == 0:
        log_path = gpc.config.LOG_PATH
        # makedirs(exist_ok=True) replaces the exists()/mkdir() pair: no check-then-create race,
        # and missing parent directories are created as well
        os.makedirs(log_path, exist_ok=True)
        logger.log_to_file(log_path)
    model = vit_lite_depth7_patch4_32()
    # BATCH_SIZE is the global batch; every data-parallel rank loads its share of it
    train_dataloader, test_dataloader = build_cifar(gpc.config.BATCH_SIZE // gpc.data_parallel_size)
    criterion = CrossEntropyLoss(label_smoothing=0.1)
    optimizer = torch.optim.AdamW(model.parameters(), lr=gpc.config.LEARNING_RATE, weight_decay=gpc.config.WEIGHT_DECAY)
    # the scheduler is stepped once per iteration (by_epoch=False below), so its horizon is counted in steps
    steps_per_epoch = len(train_dataloader)
    lr_scheduler = CosineAnnealingWarmupLR(optimizer=optimizer,
                                           total_steps=gpc.config.NUM_EPOCHS * steps_per_epoch,
                                           warmup_steps=gpc.config.WARMUP_EPOCHS * steps_per_epoch)
    engine, train_dataloader, test_dataloader, lr_scheduler = colossalai.initialize(model=model,
                                                                                    optimizer=optimizer,
                                                                                    criterion=criterion,
                                                                                    train_dataloader=train_dataloader,
                                                                                    test_dataloader=test_dataloader,
                                                                                    lr_scheduler=lr_scheduler)
    logger.info("Engine is built", ranks=[0])
    timer = MultiTimer()
    trainer = Trainer(engine=engine, logger=logger, timer=timer)
    logger.info("Trainer is built", ranks=[0])
    hook_list = [
        hooks.LogMetricByEpochHook(logger=logger),
        hooks.LogMetricByStepHook(),
        # optional diagnostics, disabled by default:
        # hooks.LogTimingByEpochHook(timer=timer, logger=logger),
        # hooks.LogMemoryByEpochHook(logger=logger),
        hooks.AccuracyHook(accuracy_func=Accuracy()),
        hooks.LossHook(),
        hooks.ThroughputHook(),
        hooks.LRSchedulerHook(lr_scheduler=lr_scheduler, by_epoch=False)
    ]
    logger.info("Train start", ranks=[0])
    # evaluate on the test split after every epoch
    trainer.fit(train_dataloader=train_dataloader,
                test_dataloader=test_dataloader,
                epochs=gpc.config.NUM_EPOCHS,
                hooks=hook_list,
                display_progress=True,
                test_interval=1)
 # Script entry point: start CIFAR10 training only when the file is executed directly.
 if __name__ == '__main__':
    train_cifar()
--- a/benchmark/gpt2/configs/gpt2_1d.py
+++ b/benchmark/gpt2/configs/gpt2_1d.py
@ -1,29 +0,0 @@
 from colossalai.amp import AMP_TYPE
 # GPT-2 benchmark: 1D tensor parallelism with tensor-parallel size 2.
 # Upper-case names are read by train.py, lower-case names by Colossal-AI.
 # --- model / data shape ---
 VOCAB_SIZE = 50304
 SEQ_LENGTH = 1024
 # --- optimisation ---
 TOTAL_BATCH_SIZE = 256
 LEARNING_RATE = 1.5e-4
 WEIGHT_DECAY = 0.01
 gradient_accumulation = 2
 # batch consumed per accumulation step
 BATCH_SIZE = TOTAL_BATCH_SIZE // gradient_accumulation
 clip_grad_norm = 1.0
 fp16 = {'mode': AMP_TYPE.TORCH}
 # --- schedule, in epochs ---
 NUM_EPOCHS = 60
 WARMUP_EPOCHS = int(0.36 * NUM_EPOCHS)
 # --- parallel layout ---
 TENSOR_PARALLEL_MODE = '1d'
 TENSOR_PARALLEL_SIZE = 2
 parallel = {
    'pipeline': 1,
    'tensor': {'mode': TENSOR_PARALLEL_MODE, 'size': TENSOR_PARALLEL_SIZE},
 }
 # directory that global rank 0 logs into
 LOG_PATH = f"./gpt2_{TENSOR_PARALLEL_MODE}_tp{TENSOR_PARALLEL_SIZE}_bs{BATCH_SIZE}_lr{LEARNING_RATE}_accum{gradient_accumulation}_clip_grad{clip_grad_norm}/"
--- a/benchmark/gpt2/configs/gpt2_2d.py
+++ b/benchmark/gpt2/configs/gpt2_2d.py
@ -1,29 +0,0 @@
 from colossalai.amp import AMP_TYPE
 # GPT-2 benchmark: 2D tensor parallelism with tensor-parallel size 4.
 # Upper-case names are read by train.py, lower-case names by Colossal-AI.
 # --- model / data shape ---
 VOCAB_SIZE = 50304
 SEQ_LENGTH = 1024
 # --- optimisation ---
 TOTAL_BATCH_SIZE = 256
 LEARNING_RATE = 1.5e-4
 WEIGHT_DECAY = 0.01
 gradient_accumulation = 1
 # batch consumed per accumulation step
 BATCH_SIZE = TOTAL_BATCH_SIZE // gradient_accumulation
 clip_grad_norm = 1.0
 fp16 = {'mode': AMP_TYPE.TORCH}
 # --- schedule, in epochs ---
 NUM_EPOCHS = 60
 WARMUP_EPOCHS = int(0.36 * NUM_EPOCHS)
 # --- parallel layout ---
 TENSOR_PARALLEL_MODE = '2d'
 TENSOR_PARALLEL_SIZE = 4
 parallel = {
    'pipeline': 1,
    'tensor': {'mode': TENSOR_PARALLEL_MODE, 'size': TENSOR_PARALLEL_SIZE},
 }
 # directory that global rank 0 logs into
 LOG_PATH = f"./gpt2_{TENSOR_PARALLEL_MODE}_tp{TENSOR_PARALLEL_SIZE}_bs{BATCH_SIZE}_lr{LEARNING_RATE}_accum{gradient_accumulation}_clip_grad{clip_grad_norm}/"
--- a/benchmark/gpt2/configs/gpt2_2p5d.py
+++ b/benchmark/gpt2/configs/gpt2_2p5d.py
@ -1,30 +0,0 @@
 from colossalai.amp import AMP_TYPE
 # GPT-2 benchmark: 2.5D tensor parallelism with tensor-parallel size 4 and depth 1.
 # Upper-case names are read by train.py, lower-case names by Colossal-AI.
 # --- model / data shape ---
 VOCAB_SIZE = 50304
 SEQ_LENGTH = 1024
 # --- optimisation ---
 TOTAL_BATCH_SIZE = 256
 LEARNING_RATE = 1.5e-4
 WEIGHT_DECAY = 0.01
 gradient_accumulation = 1
 # batch consumed per accumulation step
 BATCH_SIZE = TOTAL_BATCH_SIZE // gradient_accumulation
 clip_grad_norm = 1.0
 fp16 = {'mode': AMP_TYPE.TORCH}
 # --- schedule, in epochs ---
 NUM_EPOCHS = 60
 WARMUP_EPOCHS = int(0.36 * NUM_EPOCHS)
 # --- parallel layout ---
 TENSOR_PARALLEL_MODE = '2.5d'
 TENSOR_PARALLEL_SIZE = 4
 DEPTH = 1
 parallel = {
    'pipeline': 1,
    'tensor': {'mode': TENSOR_PARALLEL_MODE, 'size': TENSOR_PARALLEL_SIZE, 'depth': DEPTH},
 }
 # directory that global rank 0 logs into
 LOG_PATH = f"./gpt2_{TENSOR_PARALLEL_MODE}_tp{TENSOR_PARALLEL_SIZE}_bs{BATCH_SIZE}_lr{LEARNING_RATE}_accum{gradient_accumulation}_clip_grad{clip_grad_norm}/"
--- a/benchmark/gpt2/configs/gpt2_3d.py
+++ b/benchmark/gpt2/configs/gpt2_3d.py
@ -1,29 +0,0 @@
 from colossalai.amp import AMP_TYPE
 # GPT-2 benchmark: 3D tensor parallelism with tensor-parallel size 8.
 # Upper-case names are read by train.py, lower-case names by Colossal-AI.
 # --- model / data shape ---
 VOCAB_SIZE = 50304
 SEQ_LENGTH = 1024
 # --- optimisation ---
 TOTAL_BATCH_SIZE = 256
 LEARNING_RATE = 1.5e-4
 WEIGHT_DECAY = 0.01
 gradient_accumulation = 1
 # batch consumed per accumulation step
 BATCH_SIZE = TOTAL_BATCH_SIZE // gradient_accumulation
 clip_grad_norm = 1.0
 fp16 = {'mode': AMP_TYPE.TORCH}
 # --- schedule, in epochs ---
 NUM_EPOCHS = 60
 WARMUP_EPOCHS = int(0.36 * NUM_EPOCHS)
 # --- parallel layout ---
 TENSOR_PARALLEL_MODE = '3d'
 TENSOR_PARALLEL_SIZE = 8
 parallel = {
    'pipeline': 1,
    'tensor': {'mode': TENSOR_PARALLEL_MODE, 'size': TENSOR_PARALLEL_SIZE},
 }
 # directory that global rank 0 logs into
 LOG_PATH = f"./gpt2_{TENSOR_PARALLEL_MODE}_tp{TENSOR_PARALLEL_SIZE}_bs{BATCH_SIZE}_lr{LEARNING_RATE}_accum{gradient_accumulation}_clip_grad{clip_grad_norm}/"
--- a/benchmark/gpt2/configs/gpt2_vanilla.py
+++ b/benchmark/gpt2/configs/gpt2_vanilla.py
@ -1,29 +0,0 @@
 from colossalai.amp import AMP_TYPE
 # GPT-2 benchmark: data parallel only (no tensor parallelism).
 # Upper-case names are read by train.py, lower-case names by Colossal-AI.
 # --- model / data shape ---
 VOCAB_SIZE = 50304
 SEQ_LENGTH = 1024
 # --- optimisation ---
 TOTAL_BATCH_SIZE = 256
 LEARNING_RATE = 1.5e-4
 WEIGHT_DECAY = 0.01
 gradient_accumulation = 1
 # batch consumed per accumulation step
 BATCH_SIZE = TOTAL_BATCH_SIZE // gradient_accumulation
 clip_grad_norm = 1.0
 fp16 = {'mode': AMP_TYPE.TORCH}
 # --- schedule, in epochs ---
 NUM_EPOCHS = 60
 WARMUP_EPOCHS = int(0.36 * NUM_EPOCHS)
 # --- parallel layout ---
 TENSOR_PARALLEL_MODE = None
 TENSOR_PARALLEL_SIZE = 1
 parallel = {
    'pipeline': 1,
    'tensor': {'mode': TENSOR_PARALLEL_MODE, 'size': TENSOR_PARALLEL_SIZE},
 }
 # directory that global rank 0 logs into
 LOG_PATH = f"./gpt2_{TENSOR_PARALLEL_MODE}_tp{TENSOR_PARALLEL_SIZE}_bs{BATCH_SIZE}_lr{LEARNING_RATE}_accum{gradient_accumulation}_clip_grad{clip_grad_norm}/"
--- a/benchmark/gpt2/data.py
+++ b/benchmark/gpt2/data.py
@ -1,37 +0,0 @@
 import json
 import os
 import torch
 from colossalai.registry import DATASETS
 from torch.utils.data import Dataset
 from transformers import GPT2Tokenizer
@DATASETS.register_module
 class WebtextDataset(Dataset):
    """Tokenised Webtext corpus for GPT-2 language-model training.

    ``path`` points to a JSON-lines file whose records carry a ``text`` field. The texts are encoded with
    the ``gpt2`` tokenizer (``padding=True, truncation=True, max_length=seq_len``) and the result is cached
    next to ``path`` as ``gpt_webtext_{seq_len}.pt`` so that later runs can skip tokenisation.

    Args:
        path: location of the JSON-lines corpus; its directory also holds the cache file.
        seq_len: maximum sequence length passed to the tokenizer.
    """
    def __init__(self, path, seq_len=1024) -> None:
        super().__init__()
        root = os.path.dirname(path)
        encoded_data_cache_path = os.path.join(root, f'gpt_webtext_{seq_len}.pt')
        if os.path.isfile(encoded_data_cache_path):
            # NOTE(security): torch.load unpickles the cache file, so the dataset directory must be trusted.
            seq_len_, data, attention_mask = torch.load(encoded_data_cache_path)
            if seq_len_ == seq_len:
                self.data = data
                self.attention_mask = attention_mask
                return
        # Stream the corpus line by line instead of materialising readlines(); read it as UTF-8
        # regardless of the platform default and skip blank lines, which json.loads would reject.
        with open(path, encoding='utf-8') as f:
            raw_data = [json.loads(line)['text'] for line in f if line.strip()]
        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
        # the tokenizer needs a pad token before it can pad; the unknown token is reused for that
        tokenizer.pad_token = tokenizer.unk_token
        encoded_data = tokenizer(raw_data, padding=True, truncation=True, max_length=seq_len, return_tensors='pt')
        self.data = encoded_data['input_ids']
        self.attention_mask = encoded_data['attention_mask']
        torch.save((seq_len, self.data, self.attention_mask), encoded_data_cache_path)
    def __len__(self):
        """Return the number of encoded samples."""
        return len(self.data)
    def __getitem__(self, index):
        """Return ``((input_ids, attention_mask), label)`` where the label is the input ids themselves."""
        return (self.data[index], self.attention_mask[index]), self.data[index]
--- a/benchmark/gpt2/train.py
+++ b/benchmark/gpt2/train.py
@ -1,105 +0,0 @@
 import contextlib
 import os
 import colossalai
 import torch
 from colossalai.core import global_context as gpc
 from colossalai.engine.schedule import (InterleavedPipelineSchedule, PipelineSchedule)
 from colossalai.logging import get_dist_logger
 from colossalai.nn import CosineAnnealingWarmupLR
 from colossalai.trainer import Trainer, hooks
 from colossalai.utils import MultiTimer, get_dataloader
 from colossalai.zero import zero3_model_context
 from model_zoo.gpt import GPTLMLoss, gpt2_small, gpt2_medium, gpt2_large, gpt2_xl
 from data import WebtextDataset
 def train_gpt():
    """Train ``gpt2_medium`` on the Webtext dataset with the Colossal-AI ``Trainer``.

    The script reads ``--config`` from the default Colossal-AI argument parser and initialises the
    distributed environment through ``colossalai.launch_from_torch``. The dataset file is taken from the
    ``DATA`` environment variable. The config has to define ``VOCAB_SIZE``, ``SEQ_LENGTH``, ``BATCH_SIZE``,
    ``LEARNING_RATE``, ``WEIGHT_DECAY``, ``NUM_EPOCHS``, ``WARMUP_EPOCHS`` and ``gradient_accumulation``;
    ``LOG_PATH`` is optional and turns on file logging for global rank 0.
    """
    args = colossalai.get_default_parser().parse_args()
    # Launch from torchrun. For a manual per-worker launch use instead:
    # colossalai.launch(config=args.config,
    #                   rank=args.rank,
    #                   world_size=args.world_size,
    #                   local_rank=args.local_rank,
    #                   host=args.host,
    #                   port=args.port)
    colossalai.launch_from_torch(config=args.config)
    logger = get_dist_logger()
    # only global rank 0 writes a log file
    if hasattr(gpc.config, 'LOG_PATH') and gpc.get_global_rank() == 0:
        log_path = gpc.config.LOG_PATH
        # makedirs(exist_ok=True) replaces the exists()/mkdir() pair: no check-then-create race,
        # and missing parent directories are created as well
        os.makedirs(log_path, exist_ok=True)
        logger.log_to_file(log_path)
    train_dataset = WebtextDataset(os.environ['DATA'], seq_len=gpc.config.SEQ_LENGTH)
    # BATCH_SIZE is split evenly over the data-parallel ranks
    train_dataloader = get_dataloader(train_dataset,
                                      seed=42,
                                      batch_size=gpc.config.BATCH_SIZE // gpc.data_parallel_size,
                                      pin_memory=True,
                                      shuffle=True,
                                      drop_last=True)
    logger.info(f'Loaded {len(train_dataset)}/{len(train_dataloader)} samples/batches', ranks=[0])
    # zero3 under test
    # use_zero3 = hasattr(gpc.config, 'zero') and gpc.config.zero.level == 3
    # cm = zero3_model_context() if use_zero3 else contextlib.nullcontext()
    # with cm:
    #     model = gpc.config.model.pop('type')(**gpc.config.model)
    model = gpt2_medium(vocab_size=gpc.config.VOCAB_SIZE,
                        max_position_embeddings=gpc.config.SEQ_LENGTH,
                        checkpoint=True)
    criterion = GPTLMLoss()
    # Take the optimiser hyper-parameters from the config (the shipped configs use 0.00015 / 1e-2,
    # the values that used to be hard-coded here) so that editing the config actually takes effect.
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=gpc.config.LEARNING_RATE,
                                 weight_decay=gpc.config.WEIGHT_DECAY)
    # the scheduler horizon is counted in steps, with len(dataloader) divided by the accumulation factor
    steps_per_epoch = len(train_dataloader) // gpc.config.gradient_accumulation
    lr_scheduler = CosineAnnealingWarmupLR(optimizer=optimizer,
                                           total_steps=gpc.config.NUM_EPOCHS * steps_per_epoch,
                                           warmup_steps=gpc.config.WARMUP_EPOCHS * steps_per_epoch,
                                           eta_min=1e-5)
    engine, train_dataloader, _, lr_scheduler = colossalai.initialize(model=model,
                                                                      optimizer=optimizer,
                                                                      criterion=criterion,
                                                                      train_dataloader=train_dataloader,
                                                                      lr_scheduler=lr_scheduler)
    # pipeline under test
    # num_model_chunks = getattr(gpc.config.model, 'num_chunks', 1)
    # if num_model_chunks > 1:
    #     logger.info('Build InterleavedPipelineSchedule', ranks=[0])
    #     schedule = InterleavedPipelineSchedule(gpc.config.NUM_MICRO_BATCHES, num_model_chunks)
    # else:
    #     logger.info('Build PipelineSchedule', ranks=[0])
    #     schedule = PipelineSchedule(gpc.config.NUM_MICRO_BATCHES)
    timer = MultiTimer()
    trainer = Trainer(engine=engine, logger=logger, timer=timer)
    hook_list = [
        hooks.LogMetricByEpochHook(logger=logger),
        hooks.LogMetricByStepHook(),
        hooks.LossHook(),
        hooks.ThroughputHook(),
        hooks.LRSchedulerHook(lr_scheduler=lr_scheduler, by_epoch=False),
        # optional hooks, disabled by default:
        # hooks.TensorboardHook(log_dir='./tb_logs', ranks=[0]),
        # hooks.LogMemoryByEpochHook(logger),
        # hooks.LogTimingByEpochHook(timer, logger),
        # hooks.SaveCheckpointHook(checkpoint_dir='./ckpt')
    ]
    logger.info("Training start", ranks=[0])
    trainer.fit(train_dataloader=train_dataloader, epochs=gpc.config.NUM_EPOCHS, hooks=hook_list, display_progress=True)
 # Script entry point: start GPT-2 training only when the file is executed directly.
 if __name__ == '__main__':
    train_gpt()
--- a/benchmark/imagenet100/configs/vit_1d.py
+++ b/benchmark/imagenet100/configs/vit_1d.py
@ -1,26 +0,0 @@
 from colossalai.amp import AMP_TYPE
 # ImageNet100 ViT benchmark: 1D tensor parallelism with tensor-parallel size 2.
 # Upper-case names are read by train.py, lower-case names by Colossal-AI.
 # --- optimisation ---
 TOTAL_BATCH_SIZE = 4096
 LEARNING_RATE = 0.003
 WEIGHT_DECAY = 0.3
 gradient_accumulation = 2
 # batch consumed per accumulation step
 BATCH_SIZE = TOTAL_BATCH_SIZE // gradient_accumulation
 clip_grad_norm = 1.0
 fp16 = {'mode': AMP_TYPE.TORCH}
 # --- schedule, in epochs ---
 NUM_EPOCHS = 300
 WARMUP_EPOCHS = 32
 # --- parallel layout ---
 TENSOR_PARALLEL_MODE = '1d'
 TENSOR_PARALLEL_SIZE = 2
 parallel = {
    'pipeline': 1,
    'tensor': {'mode': TENSOR_PARALLEL_MODE, 'size': TENSOR_PARALLEL_SIZE},
 }
 # directory that global rank 0 logs into
 LOG_PATH = f"./vit_{TENSOR_PARALLEL_MODE}_imagenet100_tp{TENSOR_PARALLEL_SIZE}_bs{BATCH_SIZE}_lr{LEARNING_RATE}_{fp16['mode']}_clip_grad{clip_grad_norm}/"
--- a/benchmark/imagenet100/configs/vit_2d.py
+++ b/benchmark/imagenet100/configs/vit_2d.py
@ -1,26 +0,0 @@
 from colossalai.amp import AMP_TYPE
 # ImageNet100 ViT benchmark: 2D tensor parallelism with tensor-parallel size 4.
 # Upper-case names are read by train.py, lower-case names by Colossal-AI.
 # --- optimisation ---
 TOTAL_BATCH_SIZE = 4096
 LEARNING_RATE = 0.003
 WEIGHT_DECAY = 0.3
 gradient_accumulation = 2
 # batch consumed per accumulation step
 BATCH_SIZE = TOTAL_BATCH_SIZE // gradient_accumulation
 clip_grad_norm = 1.0
 fp16 = {'mode': AMP_TYPE.TORCH}
 # --- schedule, in epochs ---
 NUM_EPOCHS = 300
 WARMUP_EPOCHS = 32
 # --- parallel layout ---
 TENSOR_PARALLEL_MODE = '2d'
 TENSOR_PARALLEL_SIZE = 4
 parallel = {
    'pipeline': 1,
    'tensor': {'mode': TENSOR_PARALLEL_MODE, 'size': TENSOR_PARALLEL_SIZE},
 }
 # directory that global rank 0 logs into
 LOG_PATH = f"./vit_{TENSOR_PARALLEL_MODE}_imagenet100_tp{TENSOR_PARALLEL_SIZE}_bs{BATCH_SIZE}_lr{LEARNING_RATE}_{fp16['mode']}_clip_grad{clip_grad_norm}/"
--- a/benchmark/imagenet100/configs/vit_2p5d.py
+++ b/benchmark/imagenet100/configs/vit_2p5d.py
@ -1,27 +0,0 @@
 from colossalai.amp import AMP_TYPE
 # ImageNet100 ViT benchmark: 2.5D tensor parallelism with tensor-parallel size 4 and depth 1.
 # Upper-case names are read by train.py, lower-case names by Colossal-AI.
 # --- optimisation ---
 TOTAL_BATCH_SIZE = 4096
 LEARNING_RATE = 0.003
 WEIGHT_DECAY = 0.3
 gradient_accumulation = 2
 # batch consumed per accumulation step
 BATCH_SIZE = TOTAL_BATCH_SIZE // gradient_accumulation
 clip_grad_norm = 1.0
 fp16 = {'mode': AMP_TYPE.TORCH}
 # --- schedule, in epochs ---
 NUM_EPOCHS = 300
 WARMUP_EPOCHS = 32
 # --- parallel layout ---
 TENSOR_PARALLEL_MODE = '2.5d'
 TENSOR_PARALLEL_SIZE = 4
 DEPTH = 1
 parallel = {
    'pipeline': 1,
    'tensor': {'mode': TENSOR_PARALLEL_MODE, 'size': TENSOR_PARALLEL_SIZE, 'depth': DEPTH},
 }
 # directory that global rank 0 logs into
 LOG_PATH = f"./vit_{TENSOR_PARALLEL_MODE}_imagenet100_tp{TENSOR_PARALLEL_SIZE}_bs{BATCH_SIZE}_lr{LEARNING_RATE}_{fp16['mode']}_clip_grad{clip_grad_norm}/"
--- a/benchmark/imagenet100/configs/vit_3d.py
+++ b/benchmark/imagenet100/configs/vit_3d.py
@ -1,26 +0,0 @@
 from colossalai.amp import AMP_TYPE
 # ImageNet100 ViT benchmark: 3D tensor parallelism with tensor-parallel size 8.
 # Upper-case names are read by train.py, lower-case names by Colossal-AI.
 # --- optimisation ---
 TOTAL_BATCH_SIZE = 4096
 LEARNING_RATE = 0.003
 WEIGHT_DECAY = 0.3
 gradient_accumulation = 2
 # batch consumed per accumulation step
 BATCH_SIZE = TOTAL_BATCH_SIZE // gradient_accumulation
 clip_grad_norm = 1.0
 fp16 = {'mode': AMP_TYPE.TORCH}
 # --- schedule, in epochs ---
 NUM_EPOCHS = 300
 WARMUP_EPOCHS = 32
 # --- parallel layout ---
 TENSOR_PARALLEL_MODE = '3d'
 TENSOR_PARALLEL_SIZE = 8
 parallel = {
    'pipeline': 1,
    'tensor': {'mode': TENSOR_PARALLEL_MODE, 'size': TENSOR_PARALLEL_SIZE},
 }
 # directory that global rank 0 logs into
 LOG_PATH = f"./vit_{TENSOR_PARALLEL_MODE}_imagenet100_tp{TENSOR_PARALLEL_SIZE}_bs{BATCH_SIZE}_lr{LEARNING_RATE}_{fp16['mode']}_clip_grad{clip_grad_norm}/"
--- a/benchmark/imagenet100/configs/vit_vanilla.py
+++ b/benchmark/imagenet100/configs/vit_vanilla.py
@ -1,26 +0,0 @@
 from colossalai.amp import AMP_TYPE
 # ImageNet100 ViT benchmark: data parallel only (no tensor parallelism).
 # Upper-case names are read by train.py, lower-case names by Colossal-AI.
 # --- optimisation ---
 TOTAL_BATCH_SIZE = 4096
 LEARNING_RATE = 0.003
 WEIGHT_DECAY = 0.3
 gradient_accumulation = 2
 # batch consumed per accumulation step
 BATCH_SIZE = TOTAL_BATCH_SIZE // gradient_accumulation
 clip_grad_norm = 1.0
 fp16 = {'mode': AMP_TYPE.TORCH}
 # --- schedule, in epochs ---
 NUM_EPOCHS = 300
 WARMUP_EPOCHS = 32
 # --- parallel layout ---
 TENSOR_PARALLEL_MODE = None
 TENSOR_PARALLEL_SIZE = 1
 parallel = {
    'pipeline': 1,
    'tensor': {'mode': TENSOR_PARALLEL_MODE, 'size': TENSOR_PARALLEL_SIZE},
 }
 # directory that global rank 0 logs into
 LOG_PATH = f"./vit_{TENSOR_PARALLEL_MODE}_imagenet100_tp{TENSOR_PARALLEL_SIZE}_bs{BATCH_SIZE}_lr{LEARNING_RATE}_{fp16['mode']}_clip_grad{clip_grad_norm}/"
--- a/benchmark/imagenet100/train.py
+++ b/benchmark/imagenet100/train.py
@ -1,207 +0,0 @@
 #!/usr/bin/env python
 # -*- encoding: utf-8 -*-
 import glob
 import os
 import colossalai
 import nvidia.dali.fn as fn
 import nvidia.dali.tfrecord as tfrec
 import torch
 from colossalai.builder import *
 from colossalai.context import ParallelMode
 from colossalai.core import global_context as gpc
 from colossalai.logging import get_dist_logger
 from colossalai.nn import Accuracy, CrossEntropyLoss
 from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR
 from colossalai.trainer import Trainer, hooks
 from colossalai.utils import MultiTimer
 from model_zoo.vit import vit_small_patch16_224
 from nvidia.dali import types
 from nvidia.dali.pipeline import Pipeline
 from nvidia.dali.plugin.pytorch import DALIClassificationIterator
 # Dataset root directory, taken from the DATA environment variable (KeyError at import time when it is unset).
 DATASET_PATH = str(os.environ['DATA'])
 # glob patterns for the TFRecord shards ...
 TRAIN_RECS = f'{DATASET_PATH}/train/*'
 VAL_RECS = f'{DATASET_PATH}/validation/*'
 # ... and for their index files
 TRAIN_IDX = f'{DATASET_PATH}/idx_files/train/*'
 VAL_IDX = f'{DATASET_PATH}/idx_files/validation/*'
 class DaliDataloader(DALIClassificationIterator):
    """DALI-backed iterator over TFRecord image-classification shards.

    Builds a DALI pipeline that reads the ``image/encoded`` and ``image/class/label`` features, decodes and
    augments the images, and yields batches shaped ``((img, ), (label, ))``.

    Args:
        tfrec_filenames: TFRecord files to read.
        tfrec_idx_filenames: index files matching ``tfrec_filenames``.
        shard_id: index of the shard this reader consumes.
        num_shards: total number of shards the data is split into.
        batch_size: batch size of the pipeline.
        num_threads: number of pipeline threads.
        resize: side length the image is resized to in evaluation mode.
        crop: side length of the (random-resized or centre) crop.
        prefetch: prefetch queue depth of the reader.
        training: random shuffling and augmentation when True, deterministic resize otherwise; also
            selects the last-batch policy (``'DROP'`` for training, ``'PARTIAL'`` for evaluation).
        gpu_aug: run decoding in ``'mixed'`` mode and resize/crop on ``'gpu'`` instead of ``'cpu'``.
        cuda: bind the pipeline to the current CUDA device and return GPU outputs.
    """
    def __init__(self,
                 tfrec_filenames,
                 tfrec_idx_filenames,
                 shard_id=0,
                 num_shards=1,
                 batch_size=128,
                 num_threads=4,
                 resize=256,
                 crop=224,
                 prefetch=2,
                 training=True,
                 gpu_aug=False,
                 cuda=True):
        pipe = Pipeline(batch_size=batch_size,
                        num_threads=num_threads,
                        device_id=torch.cuda.current_device() if cuda else None,
                        seed=1024)
        # device choice shared by both branches: decoder device and device of the resize/crop ops
        decode_device = 'mixed' if gpu_aug else 'cpu'
        aug_device = 'gpu' if gpu_aug else 'cpu'
        with pipe:
            inputs = fn.readers.tfrecord(path=tfrec_filenames,
                                         index_path=tfrec_idx_filenames,
                                         random_shuffle=training,
                                         shard_id=shard_id,
                                         num_shards=num_shards,
                                         initial_fill=10000,
                                         read_ahead=True,
                                         prefetch_queue_depth=prefetch,
                                         name='Reader',
                                         features={
                                             'image/encoded': tfrec.FixedLenFeature((), tfrec.string, ""),
                                             'image/class/label': tfrec.FixedLenFeature([1], tfrec.int64, -1),
                                         })
            # decoding is identical for training and evaluation, so it is done once up front
            images = fn.decoders.image(inputs["image/encoded"], device=decode_device, output_type=types.RGB)
            if training:
                images = fn.random_resized_crop(images, size=crop, device=aug_device)
                flip_lr = fn.random.coin_flip(probability=0.5)
            else:
                # resize to a fixed square; the crop below then takes its crop x crop window
                images = fn.resize(images,
                                   device=aug_device,
                                   resize_x=resize,
                                   resize_y=resize,
                                   dtype=types.FLOAT,
                                   interp_type=types.INTERP_TRIANGULAR)
                flip_lr = False
            # crop, optionally mirror, and normalise with mean/std 127.5
            images = fn.crop_mirror_normalize(images,
                                              dtype=types.FLOAT,
                                              crop=(crop, crop),
                                              mean=[127.5],
                                              std=[127.5],
                                              mirror=flip_lr)
            # shift the stored labels down by one (presumably 1-based in the records -- confirm)
            label = inputs["image/class/label"] - 1
            # LSG: element_extract will raise exception, let's flatten outside
            # label = fn.element_extract(label, element_map=0)  # Flatten
            if cuda:  # transfer data to gpu
                pipe.set_outputs(images.gpu(), label.gpu())
            else:
                pipe.set_outputs(images, label)
        pipe.build()
        last_batch_policy = 'DROP' if training else 'PARTIAL'
        super().__init__(pipe, reader_name="Reader", auto_reset=True, last_batch_policy=last_batch_policy)
    def __iter__(self):
        """Return the iterator itself, resetting it first when the previous pass is exhausted."""
        # if not reset (after an epoch), reset; if just initialize, ignore
        if self._counter >= self._size or self._size < 0:
            self.reset()
        return self
    def __next__(self):
        """Return the next batch as ``((img, ), (label, ))``."""
        data = super().__next__()
        img, label = data[0]['data'], data[0]['label']
        # Every sample carries a single label (FixedLenFeature([1])). Drop only that trailing axis:
        # a bare squeeze() would also remove the batch axis of a one-sample batch, which the
        # 'PARTIAL' last-batch policy can produce during evaluation.
        label = label.squeeze(-1)
        return (img, ), (label, )
 def build_dali_train(batch_size):
    """Build the training ``DaliDataloader``, sharded over the data-parallel group.

    Args:
        batch_size: batch size of the loader.
    """
    # sort both listings so that record files and index files line up
    record_files = sorted(glob.glob(TRAIN_RECS))
    index_files = sorted(glob.glob(TRAIN_IDX))
    return DaliDataloader(tfrec_filenames=record_files,
                          tfrec_idx_filenames=index_files,
                          batch_size=batch_size,
                          shard_id=gpc.get_local_rank(ParallelMode.DATA),
                          num_shards=gpc.get_world_size(ParallelMode.DATA),
                          training=True,
                          gpu_aug=True,
                          cuda=True)
 def build_dali_test(batch_size):
    """Build the validation ``DaliDataloader``, sharded over the data-parallel group.

    Args:
        batch_size: batch size of the loader.
    """
    # sort both listings so that record files and index files line up
    record_files = sorted(glob.glob(VAL_RECS))
    index_files = sorted(glob.glob(VAL_IDX))
    return DaliDataloader(tfrec_filenames=record_files,
                          tfrec_idx_filenames=index_files,
                          batch_size=batch_size,
                          shard_id=gpc.get_local_rank(ParallelMode.DATA),
                          num_shards=gpc.get_world_size(ParallelMode.DATA),
                          training=False,
                          gpu_aug=True,
                          cuda=True)
 def train_imagenet():
    """Train ``vit_small_patch16_224`` (100 classes) with DALI dataloaders and the Colossal-AI ``Trainer``.

    The script reads ``--config`` from the default Colossal-AI argument parser and initialises the
    distributed environment through ``colossalai.launch_from_torch``. The config has to define
    ``BATCH_SIZE``, ``LEARNING_RATE``, ``WEIGHT_DECAY``, ``NUM_EPOCHS`` and ``WARMUP_EPOCHS``;
    ``LOG_PATH`` is optional and turns on file logging for global rank 0.
    """
    args = colossalai.get_default_parser().parse_args()
    # Launch from torchrun. For a manual per-worker launch use instead:
    # colossalai.launch(config=args.config,
    #                   rank=args.rank,
    #                   world_size=args.world_size,
    #                   local_rank=args.local_rank,
    #                   host=args.host,
    #                   port=args.port)
    colossalai.launch_from_torch(config=args.config)
    logger = get_dist_logger()
    # only global rank 0 writes a log file
    if hasattr(gpc.config, 'LOG_PATH') and gpc.get_global_rank() == 0:
        log_path = gpc.config.LOG_PATH
        # makedirs(exist_ok=True) replaces the exists()/mkdir() pair: no check-then-create race,
        # and missing parent directories are created as well
        os.makedirs(log_path, exist_ok=True)
        logger.log_to_file(log_path)
    model = vit_small_patch16_224(num_classes=100, init_method='jax')
    # BATCH_SIZE is split evenly over the data-parallel ranks
    train_dataloader = build_dali_train(gpc.config.BATCH_SIZE // gpc.data_parallel_size)
    test_dataloader = build_dali_test(gpc.config.BATCH_SIZE // gpc.data_parallel_size)
    criterion = CrossEntropyLoss(label_smoothing=0.1)
    optimizer = torch.optim.AdamW(model.parameters(), lr=gpc.config.LEARNING_RATE, weight_decay=gpc.config.WEIGHT_DECAY)
    # the scheduler is stepped once per epoch (by_epoch=True below), so its horizon is counted in epochs
    lr_scheduler = CosineAnnealingWarmupLR(optimizer=optimizer,
                                           total_steps=gpc.config.NUM_EPOCHS,
                                           warmup_steps=gpc.config.WARMUP_EPOCHS)
    engine, train_dataloader, test_dataloader, _ = colossalai.initialize(model=model,
                                                                         optimizer=optimizer,
                                                                         criterion=criterion,
                                                                         train_dataloader=train_dataloader,
                                                                         test_dataloader=test_dataloader)
    logger.info("Engine is built", ranks=[0])
    timer = MultiTimer()
    trainer = Trainer(engine=engine, logger=logger, timer=timer)
    logger.info("Trainer is built", ranks=[0])
    hook_list = [
        hooks.LogMetricByEpochHook(logger=logger),
        hooks.LogMetricByStepHook(),
        # optional diagnostics, disabled by default:
        # hooks.LogTimingByEpochHook(timer=timer, logger=logger),
        # hooks.LogMemoryByEpochHook(logger=logger),
        hooks.AccuracyHook(accuracy_func=Accuracy()),
        hooks.LossHook(),
        hooks.ThroughputHook(),
        hooks.LRSchedulerHook(lr_scheduler=lr_scheduler, by_epoch=True)
    ]
    logger.info("Train start", ranks=[0])
    # evaluate on the validation split after every epoch
    trainer.fit(train_dataloader=train_dataloader,
                test_dataloader=test_dataloader,
                epochs=gpc.config.NUM_EPOCHS,
                hooks=hook_list,
                display_progress=True,
                test_interval=1)
 # Script entry point: start ImageNet training only when the file is executed directly.
 if __name__ == '__main__':
    train_imagenet()
--- a/benchmark/imagenet1k/configs/vit_1d.py
+++ b/benchmark/imagenet1k/configs/vit_1d.py
@ -1,26 +0,0 @@
 from colossalai.amp import AMP_TYPE
 # ViT / ImageNet-1k settings for the '1d' tensor-parallel mode.
 # Every module-level name below is part of the configuration; names and values are fixed.
 TOTAL_BATCH_SIZE = 4096
 LEARNING_RATE = 3e-3
 WEIGHT_DECAY = 0.3
 # Tensor-parallel layout, reused by `parallel` and LOG_PATH below.
 TENSOR_PARALLEL_SIZE = 2
 TENSOR_PARALLEL_MODE = '1d'
 NUM_EPOCHS = 300
 WARMUP_EPOCHS = 32
 parallel = {
    'pipeline': 1,
    'tensor': {'mode': TENSOR_PARALLEL_MODE, 'size': TENSOR_PARALLEL_SIZE},
 }
 fp16 = {'mode': AMP_TYPE.TORCH}
 gradient_accumulation = 2
 # Batch per accumulation step: the total batch divided by the number of accumulation steps.
 BATCH_SIZE = TOTAL_BATCH_SIZE // gradient_accumulation
 clip_grad_norm = 1.0
 # Log directory name encodes the main settings of the run.
 LOG_PATH = f"./vit_{TENSOR_PARALLEL_MODE}_imagenet1k_tp{TENSOR_PARALLEL_SIZE}_bs{BATCH_SIZE}_lr{LEARNING_RATE}_{fp16['mode']}_clip_grad{clip_grad_norm}/"
--- a/benchmark/imagenet1k/configs/vit_2d.py
+++ b/benchmark/imagenet1k/configs/vit_2d.py
@ -1,26 +0,0 @@
 from colossalai.amp import AMP_TYPE
 # ViT / ImageNet-1k settings for the '2d' tensor-parallel mode.
 # Every module-level name below is part of the configuration; names and values are fixed.
 TOTAL_BATCH_SIZE = 4096
 LEARNING_RATE = 3e-3
 WEIGHT_DECAY = 0.3
 # Tensor-parallel layout, reused by `parallel` and LOG_PATH below.
 TENSOR_PARALLEL_SIZE = 4
 TENSOR_PARALLEL_MODE = '2d'
 NUM_EPOCHS = 300
 WARMUP_EPOCHS = 32
 parallel = {
    'pipeline': 1,
    'tensor': {'mode': TENSOR_PARALLEL_MODE, 'size': TENSOR_PARALLEL_SIZE},
 }
 fp16 = {'mode': AMP_TYPE.TORCH}
 gradient_accumulation = 2
 # Batch per accumulation step: the total batch divided by the number of accumulation steps.
 BATCH_SIZE = TOTAL_BATCH_SIZE // gradient_accumulation
 clip_grad_norm = 1.0
 # Log directory name encodes the main settings of the run.
 LOG_PATH = f"./vit_{TENSOR_PARALLEL_MODE}_imagenet1k_tp{TENSOR_PARALLEL_SIZE}_bs{BATCH_SIZE}_lr{LEARNING_RATE}_{fp16['mode']}_clip_grad{clip_grad_norm}/"
--- a/benchmark/imagenet1k/configs/vit_2p5d.py
+++ b/benchmark/imagenet1k/configs/vit_2p5d.py
@ -1,27 +0,0 @@
 from colossalai.amp import AMP_TYPE
 # ViT / ImageNet-1k settings for the '2.5d' tensor-parallel mode.
 # Every module-level name below is part of the configuration; names and values are fixed.
 TOTAL_BATCH_SIZE = 4096
 LEARNING_RATE = 3e-3
 WEIGHT_DECAY = 0.3
 # Tensor-parallel layout, reused by `parallel` and LOG_PATH below.
 TENSOR_PARALLEL_SIZE = 4
 # Extra `depth` entry forwarded in the tensor-parallel settings for this mode.
 DEPTH = 1
 TENSOR_PARALLEL_MODE = '2.5d'
 NUM_EPOCHS = 300
 WARMUP_EPOCHS = 32
 parallel = {
    'pipeline': 1,
    'tensor': {'mode': TENSOR_PARALLEL_MODE, 'size': TENSOR_PARALLEL_SIZE, 'depth': DEPTH},
 }
 fp16 = {'mode': AMP_TYPE.TORCH}
 gradient_accumulation = 2
 # Batch per accumulation step: the total batch divided by the number of accumulation steps.
 BATCH_SIZE = TOTAL_BATCH_SIZE // gradient_accumulation
 clip_grad_norm = 1.0
 # Log directory name encodes the main settings of the run.
 LOG_PATH = f"./vit_{TENSOR_PARALLEL_MODE}_imagenet1k_tp{TENSOR_PARALLEL_SIZE}_bs{BATCH_SIZE}_lr{LEARNING_RATE}_{fp16['mode']}_clip_grad{clip_grad_norm}/"
--- a/benchmark/imagenet1k/configs/vit_3d.py
+++ b/benchmark/imagenet1k/configs/vit_3d.py
@ -1,26 +0,0 @@
 from colossalai.amp import AMP_TYPE
 # ViT / ImageNet-1k settings for the '3d' tensor-parallel mode.
 # Every module-level name below is part of the configuration; names and values are fixed.
 TOTAL_BATCH_SIZE = 4096
 LEARNING_RATE = 3e-3
 WEIGHT_DECAY = 0.3
 # Tensor-parallel layout, reused by `parallel` and LOG_PATH below.
 TENSOR_PARALLEL_SIZE = 8
 TENSOR_PARALLEL_MODE = '3d'
 NUM_EPOCHS = 300
 WARMUP_EPOCHS = 32
 parallel = {
    'pipeline': 1,
    'tensor': {'mode': TENSOR_PARALLEL_MODE, 'size': TENSOR_PARALLEL_SIZE},
 }
 fp16 = {'mode': AMP_TYPE.TORCH}
 gradient_accumulation = 2
 # Batch per accumulation step: the total batch divided by the number of accumulation steps.
 BATCH_SIZE = TOTAL_BATCH_SIZE // gradient_accumulation
 clip_grad_norm = 1.0
 # Log directory name encodes the main settings of the run.
 LOG_PATH = f"./vit_{TENSOR_PARALLEL_MODE}_imagenet1k_tp{TENSOR_PARALLEL_SIZE}_bs{BATCH_SIZE}_lr{LEARNING_RATE}_{fp16['mode']}_clip_grad{clip_grad_norm}/"
--- a/benchmark/imagenet1k/configs/vit_vanilla.py
+++ b/benchmark/imagenet1k/configs/vit_vanilla.py
@ -1,26 +0,0 @@
 from colossalai.amp import AMP_TYPE
 # ViT / ImageNet-1k settings without tensor parallelism (data parallel only).
 # Every module-level name below is part of the configuration; names and values are fixed.
 TOTAL_BATCH_SIZE = 4096
 LEARNING_RATE = 3e-3
 WEIGHT_DECAY = 0.3
 # Tensor parallelism is off: size 1 and no mode.
 TENSOR_PARALLEL_SIZE = 1
 TENSOR_PARALLEL_MODE = None
 NUM_EPOCHS = 300
 WARMUP_EPOCHS = 32
 parallel = {
    'pipeline': 1,
    'tensor': {'mode': TENSOR_PARALLEL_MODE, 'size': TENSOR_PARALLEL_SIZE},
 }
 fp16 = {'mode': AMP_TYPE.TORCH}
 gradient_accumulation = 2
 # Batch per accumulation step: the total batch divided by the number of accumulation steps.
 BATCH_SIZE = TOTAL_BATCH_SIZE // gradient_accumulation
 clip_grad_norm = 1.0
 # Log directory name encodes the main settings; the mode is rendered as the text "None" here.
 LOG_PATH = f"./vit_{TENSOR_PARALLEL_MODE}_imagenet1k_tp{TENSOR_PARALLEL_SIZE}_bs{BATCH_SIZE}_lr{LEARNING_RATE}_{fp16['mode']}_clip_grad{clip_grad_norm}/"
--- a/benchmark/imagenet1k/train.py
+++ b/benchmark/imagenet1k/train.py
@ -1,207 +0,0 @@
 #!/usr/bin/env python
 # -*- encoding: utf-8 -*-
 import glob
 import os
 import colossalai
 import nvidia.dali.fn as fn
 import nvidia.dali.tfrecord as tfrec
 import torch
 from colossalai.builder import *
 from colossalai.context import ParallelMode
 from colossalai.core import global_context as gpc
 from colossalai.logging import get_dist_logger
 from colossalai.nn import Accuracy, CrossEntropyLoss
 from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR
 from colossalai.trainer import Trainer, hooks
 from colossalai.utils import MultiTimer
 from model_zoo.vit import vit_small_patch16_224
 from nvidia.dali import types
 from nvidia.dali.pipeline import Pipeline
 from nvidia.dali.plugin.pytorch import DALIClassificationIterator
 # Dataset root, taken from the DATA environment variable (KeyError at import if it is unset).
 DATASET_PATH = os.environ['DATA']
 # Glob patterns for the TFRecord files of each split ...
 TRAIN_RECS = f'{DATASET_PATH}/train/*'
 VAL_RECS = f'{DATASET_PATH}/validation/*'
 # ... and for the index files handed to the DALI TFRecord reader as `index_path`.
 TRAIN_IDX = f'{DATASET_PATH}/idx_files/train/*'
 VAL_IDX = f'{DATASET_PATH}/idx_files/validation/*'
 class DaliDataloader(DALIClassificationIterator):
    """DALI data loader that reads TFRecord files and yields ``(img, ), (label, )`` batches.

    The constructor assembles a DALI ``Pipeline`` (TFRecord reader, image
    decoding, random-resized-crop or resize, then crop/mirror/normalise) and
    passes it to ``DALIClassificationIterator``.

    Args:
        tfrec_filenames: TFRecord file paths, given to the reader as ``path``.
        tfrec_idx_filenames: index file paths, given to the reader as ``index_path``.
        shard_id: shard index read by this loader.
        num_shards: total number of shards.
        batch_size: batch size of the pipeline.
        num_threads: ``num_threads`` of the pipeline.
        resize: side length used by the resize step in evaluation mode.
        crop: side length of the final square crop (and of the random-resized-crop).
        prefetch: ``prefetch_queue_depth`` of the reader.
        training: if true, shuffle, apply random-resized-crop and a random
            horizontal flip, and use the ``'DROP'`` last-batch policy; otherwise
            resize deterministically and use ``'PARTIAL'``.
        gpu_aug: if true, decode on the ``'mixed'`` device and resize/crop on
            ``'gpu'``; otherwise run these steps on ``'cpu'``.
        cuda: if true, bind the pipeline to the current CUDA device and move
            both outputs to the GPU.
    """

    def __init__(self,
                 tfrec_filenames,
                 tfrec_idx_filenames,
                 shard_id=0,
                 num_shards=1,
                 batch_size=128,
                 num_threads=4,
                 resize=256,
                 crop=224,
                 prefetch=2,
                 training=True,
                 gpu_aug=False,
                 cuda=True):
        pipe = Pipeline(batch_size=batch_size,
                        num_threads=num_threads,
                        device_id=torch.cuda.current_device() if cuda else None,
                        seed=1024)
        # Device names shared by the decode and resize/crop operators below.
        decode_device = 'mixed' if gpu_aug else 'cpu'
        op_device = 'gpu' if gpu_aug else 'cpu'
        with pipe:
            inputs = fn.readers.tfrecord(path=tfrec_filenames,
                                         index_path=tfrec_idx_filenames,
                                         random_shuffle=training,
                                         shard_id=shard_id,
                                         num_shards=num_shards,
                                         initial_fill=10000,
                                         read_ahead=True,
                                         prefetch_queue_depth=prefetch,
                                         name='Reader',
                                         features={
                                             'image/encoded': tfrec.FixedLenFeature((), tfrec.string, ""),
                                             'image/class/label': tfrec.FixedLenFeature([1], tfrec.int64, -1),
                                         })
            # Decoding is identical for training and evaluation, so do it once up front.
            images = fn.decoders.image(inputs["image/encoded"], device=decode_device, output_type=types.RGB)
            if training:
                images = fn.random_resized_crop(images, size=crop, device=op_device)
                flip_lr = fn.random.coin_flip(probability=0.5)
            else:
                # deterministic resize; no mirroring at evaluation time
                images = fn.resize(images,
                                   device=op_device,
                                   resize_x=resize,
                                   resize_y=resize,
                                   dtype=types.FLOAT,
                                   interp_type=types.INTERP_TRIANGULAR)
                flip_lr = False
            # center crop and normalise
            images = fn.crop_mirror_normalize(images,
                                              dtype=types.FLOAT,
                                              crop=(crop, crop),
                                              mean=[127.5],
                                              std=[127.5],
                                              mirror=flip_lr)
            label = inputs["image/class/label"] - 1  # 0-999
            # LSG: element_extract will raise exception, let's flatten outside
            # label = fn.element_extract(label, element_map=0)  # Flatten
            if cuda:  # transfer data to gpu
                pipe.set_outputs(images.gpu(), label.gpu())
            else:
                pipe.set_outputs(images, label)
        pipe.build()
        last_batch_policy = 'DROP' if training else 'PARTIAL'
        super().__init__(pipe, reader_name="Reader", auto_reset=True, last_batch_policy=last_batch_policy)

    def __iter__(self):
        """Return ``self``, resetting first when the previous pass is exhausted."""
        # if not reset (after an epoch), reset; if just initialize, ignore
        # NOTE(review): `_counter` and `_size` come from the DALI base iterator; confirm their meaning there.
        if self._counter >= self._size or self._size < 0:
            self.reset()
        return self

    def __next__(self):
        """Return the next batch as ``(img, ), (label, )`` with labels flattened to 1-D."""
        data = super().__next__()
        img, label = data[0]['data'], data[0]['label']
        # flatten() instead of squeeze(): a bare squeeze() also removes the batch
        # dimension when the batch holds a single sample (possible with the
        # 'PARTIAL' last-batch policy), which would yield a 0-d label tensor.
        label = label.flatten()
        return (img, ), (label, )
 def build_dali_train(batch_size):
    """Create the training ``DaliDataloader`` for the current data-parallel rank.

    Args:
        batch_size: batch size forwarded to ``DaliDataloader``.

    Returns:
        A ``DaliDataloader`` over the sorted files matching ``TRAIN_RECS`` and
        ``TRAIN_IDX``, sharded by data-parallel rank, in training mode with GPU
        augmentation and CUDA output enabled.
    """
    record_files = sorted(glob.glob(TRAIN_RECS))
    index_files = sorted(glob.glob(TRAIN_IDX))
    dp_rank = gpc.get_local_rank(ParallelMode.DATA)
    dp_world_size = gpc.get_world_size(ParallelMode.DATA)
    return DaliDataloader(tfrec_filenames=record_files,
                          tfrec_idx_filenames=index_files,
                          shard_id=dp_rank,
                          num_shards=dp_world_size,
                          batch_size=batch_size,
                          training=True,
                          gpu_aug=True,
                          cuda=True)
 def build_dali_test(batch_size):
    """Create the evaluation ``DaliDataloader`` for the current data-parallel rank.

    Args:
        batch_size: batch size forwarded to ``DaliDataloader``.

    Returns:
        A ``DaliDataloader`` over the sorted files matching ``VAL_RECS`` and
        ``VAL_IDX``, sharded by data-parallel rank, with training mode off and
        GPU augmentation and CUDA output enabled.
    """
    record_files = sorted(glob.glob(VAL_RECS))
    index_files = sorted(glob.glob(VAL_IDX))
    dp_rank = gpc.get_local_rank(ParallelMode.DATA)
    dp_world_size = gpc.get_world_size(ParallelMode.DATA)
    return DaliDataloader(tfrec_filenames=record_files,
                          tfrec_idx_filenames=index_files,
                          shard_id=dp_rank,
                          num_shards=dp_world_size,
                          batch_size=batch_size,
                          training=False,
                          gpu_aug=True,
                          cuda=True)
 def train_imagenet():
    """Train ``vit_small_patch16_224`` on ImageNet-1k with Colossal-AI, launched from torchrun.

    Steps: parse the default Colossal-AI command-line arguments, launch the
    distributed context from ``args.config``, optionally attach a log file on
    global rank 0, build the DALI loaders, model, loss, optimizer and LR
    scheduler, wrap them with ``colossalai.initialize`` and run
    ``Trainer.fit`` for ``gpc.config.NUM_EPOCHS`` epochs with ``test_interval=1``.
    """
    args = colossalai.get_default_parser().parse_args()
    # standard launch
    # colossalai.launch(config=args.config,
    #                   rank=args.rank,
    #                   world_size=args.world_size,
    #                   local_rank=args.local_rank,
    #                   host=args.host,
    #                   port=args.port)
    # launch from torchrun
    colossalai.launch_from_torch(config=args.config)
    logger = get_dist_logger()
    # Only global rank 0 writes a log file, and only when the config defines LOG_PATH.
    if hasattr(gpc.config, 'LOG_PATH') and gpc.get_global_rank() == 0:
        log_path = gpc.config.LOG_PATH
        # makedirs(exist_ok=True) avoids the check-then-create race of
        # exists() + mkdir() and also works when LOG_PATH is nested.
        os.makedirs(log_path, exist_ok=True)
        logger.log_to_file(log_path)
    model = vit_small_patch16_224(num_classes=1000, init_method='jax')
    # Each data-parallel rank loads its share of the configured batch.
    per_rank_batch_size = gpc.config.BATCH_SIZE // gpc.data_parallel_size
    train_dataloader = build_dali_train(per_rank_batch_size)
    test_dataloader = build_dali_test(per_rank_batch_size)
    criterion = CrossEntropyLoss(label_smoothing=0.1)
    optimizer = torch.optim.AdamW(model.parameters(), lr=gpc.config.LEARNING_RATE, weight_decay=gpc.config.WEIGHT_DECAY)
    # The scheduler is stepped per epoch (see LRSchedulerHook below), so its
    # step counts are expressed in epochs.
    lr_scheduler = CosineAnnealingWarmupLR(optimizer=optimizer,
                                           total_steps=gpc.config.NUM_EPOCHS,
                                           warmup_steps=gpc.config.WARMUP_EPOCHS)
    engine, train_dataloader, test_dataloader, _ = colossalai.initialize(model=model,
                                                                         optimizer=optimizer,
                                                                         criterion=criterion,
                                                                         train_dataloader=train_dataloader,
                                                                         test_dataloader=test_dataloader)
    logger.info("Engine is built", ranks=[0])
    timer = MultiTimer()
    trainer = Trainer(engine=engine, logger=logger, timer=timer)
    logger.info("Trainer is built", ranks=[0])
    hook_list = [
        hooks.LogMetricByEpochHook(logger=logger),
        hooks.LogMetricByStepHook(),
        # hooks.LogTimingByEpochHook(timer=timer, logger=logger),
        # hooks.LogMemoryByEpochHook(logger=logger),
        hooks.AccuracyHook(accuracy_func=Accuracy()),
        hooks.LossHook(),
        hooks.ThroughputHook(),
        hooks.LRSchedulerHook(lr_scheduler=lr_scheduler, by_epoch=True)
    ]
    logger.info("Train start", ranks=[0])
    trainer.fit(train_dataloader=train_dataloader,
                test_dataloader=test_dataloader,
                epochs=gpc.config.NUM_EPOCHS,
                hooks=hook_list,
                display_progress=True,
                test_interval=1)
 # Script entry point: start training only when this file is executed directly.
 if __name__ == '__main__':
    train_imagenet()
		`@ -0,0 +1 @@`
							`Subproject commit c319bc2ee9db32aba4a522eccdf89e8d0fb8d9f0`