[example] update roberta with newer ColossalAI (#3472)

* update roberta example

* update roberta example
mandoxzhang 2023-04-07 10:34:51 +08:00 committed by GitHub
parent fb8fae6f29
commit ab5fd127e3
13 changed files with 188 additions and 103 deletions


@ -1,9 +1,9 @@
# Introduction
This repo introduces how to pretrain a Chinese roberta-large from scratch, including preprocessing, pretraining, and fine-tuning. The repo can help you quickly train a high-quality BERT.
This example introduces how to pretrain RoBERTa from scratch, including preprocessing, pretraining, and fine-tuning. It can help you quickly train a high-quality RoBERTa model.
## 0. Prerequisite
- Install Colossal-AI
- Edit the port in /etc/ssh/sshd_config and /etc/ssh/ssh_config so that every host exposes the same ssh port for both server and client. If you are a root user, also set **PermitRootLogin** in /etc/ssh/sshd_config to "yes"
- Edit the port in `/etc/ssh/sshd_config` and `/etc/ssh/ssh_config` so that every host exposes the same ssh port for both server and client. If you are a root user, also set **PermitRootLogin** in `/etc/ssh/sshd_config` to "yes"
- Ensure that every host can log in to every other host without a password. If you have n hosts, this needs to be done n<sup>2</sup> times
```
@ -33,7 +33,7 @@ service ssh restart
```bash
cd preprocessing
```
Following the `README.md`, preprocess the original corpus into h5py+numpy format
Following the `README.md`, preprocess the original corpus into h5py plus numpy format
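If you want to sanity-check the output before pretraining, a minimal sketch for inspecting one generated shard is shown below (the file name and dataset keys are assumptions; they depend on your preprocessing configuration):
```python
import h5py

# Hypothetical shard produced by the preprocessing step; adjust the path
# and key names to whatever your run actually wrote.
with h5py.File("pretrain_shard_0.h5", "r") as f:
    for key in f.keys():
        print(key, f[key].shape, f[key].dtype)
```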
## 2. Pretrain
@ -47,12 +47,4 @@ following the `README.md`, load the h5py generated by preprocess of step 1 to pr
The checkpoint produced by this repo can directly replace `pytorch_model.bin` from [hfl/chinese-roberta-wwm-ext-large](https://huggingface.co/hfl/chinese-roberta-wwm-ext-large/tree/main). Then use Transformers from Hugging Face to fine-tune downstream applications.
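For example, after downloading that repository locally and overwriting its `pytorch_model.bin` with a checkpoint from this repo, the model loads through the usual Transformers API (the local directory name below is just an illustration):
```python
from transformers import AutoModelForMaskedLM, AutoTokenizer

# Local copy of hfl/chinese-roberta-wwm-ext-large whose pytorch_model.bin
# has been replaced by a checkpoint produced by this repo.
local_dir = "./chinese-roberta-wwm-ext-large"
tokenizer = AutoTokenizer.from_pretrained(local_dir)
model = AutoModelForMaskedLM.from_pretrained(local_dir)
```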
## Contributors
The repo is contributed by the AI team from [Moore Threads](https://www.mthreads.com/). If you encounter any problems with pretraining, please file an issue or send an email to yehua.zhang@mthreads.com. Finally, any form of contribution is welcome!
```
@misc{
title={A simple Chinese RoBERTa Example for Whole Word Masked},
author={Yehua Zhang, Chen Zhang},
year={2022}
}
```
The example is contributed by the AI team from [Moore Threads](https://www.mthreads.com/). If you encounter any problems with pretraining, please file an issue or send an email to yehua.zhang@mthreads.com. Finally, any form of contribution is welcome!


@ -1,9 +0,0 @@
from colossalai.nn.optimizer import FusedAdam
try:
from colossalai.zero.shard_utils import TensorShardStrategy
except ImportError:
# colossalai > 0.2.8
from colossalai.zero.legacy import TensorShardStrategy
clip_grad_norm = 1.0


@ -1,37 +0,0 @@
from colossalai.nn.optimizer import FusedAdam
try:
from colossalai.zero.shard_utils import TensorShardStrategy
except ImportError:
# colossalai > 0.2.8
from colossalai.zero.legacy import TensorShardStrategy
# fp16 = dict(
# mode=AMP_TYPE.TORCH,
# )
# seed = 2
zero = dict(model_config=dict(shard_strategy=TensorShardStrategy(),
reduce_scatter_bucket_size_mb=25,
fp32_reduce_scatter=False,
tensor_placement_policy="cuda",
gradient_predivide_factor=1.0,
reuse_fp16_shard=False),
optimizer_config=dict(gpu_margin_mem_ratio=0.8,
initial_scale=2**5,
min_scale=1,
growth_factor=2,
backoff_factor=0.5,
growth_interval=1000,
hysteresis=2,
max_scale=2**32))
# gradient_accumulation = 4
clip_grad_norm = 1.0
optimizer = dict(
type=FusedAdam,
lr=0.00015,
weight_decay=1e-2,
)
# 64433


@ -163,16 +163,15 @@ class PreTrainingDataset():
def get_new_segment(self, segment):
"""
Input a sentence and return a processed sentence: to support Chinese whole-word masking, characters split from the same word are given a special mark ("#") so that later processing modules know which characters belong to the same word.
:param segment: a sentence
:return: a processed sentence
Input a sentence and return a processed sentence: to support Chinese whole-word masking, the characters that were split from the same word are marked with a special mark ("#") so that the subsequent processing module knows which characters belong to the same word.
:param segment: a sentence
"""
seq_cws = jieba.lcut(''.join(segment))
seq_cws_dict = {x: 1 for x in seq_cws}
new_segment = []
i = 0
while i < len(segment):
if len(self.rec.findall(segment[i])) == 0: # not Chinese: keep the original character as-is
if len(self.rec.findall(segment[i])) == 0:
new_segment.append(segment[i])
i += 1
continue

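For intuition, here is a toy sketch (not the repo's code) of the whole-word marking idea that `get_new_segment` implements, assuming `jieba` is installed; non-leading characters of each segmented word receive a special prefix (shown here as "##") so that the masking step can treat the whole word as one unit:
```python
import jieba

sentence = "使用语言模型来预测下一个词"
marked = []
for word in jieba.lcut(sentence):
    marked.append(word[0])                       # first character kept as-is
    marked.extend("##" + ch for ch in word[1:])  # rest of the word gets a mark
print(marked)  # e.g. ['使', '##用', '语', '##言', ...]
```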

@ -10,26 +10,19 @@ import argparse
import functools
def split_sentence(document: str, flag: str = "all", limit: int = 510) -> List[str]:
"""
Args:
document:
flag: Type: str, "all" splits on both Chinese and English punctuation, "zh" splits on Chinese punctuation, "en" splits on English punctuation
limit: maximum length of a single sentence, 510 characters by default
Returns: Type:list
"""
sent_list = []
try:
if flag == "zh":
document = re.sub('(?P<quotation_mark>([。?!…](?![”’"\'])))', r'\g<quotation_mark>\n', document) # single-character sentence terminators
document = re.sub('(?P<quotation_mark>([。?!]|…{1,2})[”’"\'])', r'\g<quotation_mark>\n', document) # special quotation marks
document = re.sub('(?P<quotation_mark>([。?!…](?![”’"\'])))', r'\g<quotation_mark>\n', document)
document = re.sub('(?P<quotation_mark>([。?!]|…{1,2})[”’"\'])', r'\g<quotation_mark>\n', document)
elif flag == "en":
document = re.sub('(?P<quotation_mark>([.?!](?![”’"\'])))', r'\g<quotation_mark>\n', document) # single-character English sentence terminators
document = re.sub('(?P<quotation_mark>([?!.]["\']))', r'\g<quotation_mark>\n', document) # special quotation marks
document = re.sub('(?P<quotation_mark>([.?!](?![”’"\'])))', r'\g<quotation_mark>\n', document)
document = re.sub('(?P<quotation_mark>([?!.]["\']))', r'\g<quotation_mark>\n', document) # Special quotation marks
else:
document = re.sub('(?P<quotation_mark>([。?!….?!](?![”’"\'])))', r'\g<quotation_mark>\n', document) # single-character sentence terminators
document = re.sub('(?P<quotation_mark>([。?!….?!](?![”’"\'])))', r'\g<quotation_mark>\n', document)
document = re.sub('(?P<quotation_mark>(([。?!.!?]|…{1,2})[”’"\']))', r'\g<quotation_mark>\n',
document) # special quotation marks
document) # Special quotation marks
sent_list_ori = document.splitlines()
for sent in sent_list_ori:


@ -15,8 +15,8 @@ from get_mask import PreTrainingDataset
def get_raw_instance(document, max_sequence_length=512):
"""
Get the initial training instances: split the whole document into multiple parts according to max_sequence_length and return them as multiple processed instances.
:param document: a whole document
Get the initial training instances: split the whole document into multiple parts according to max_sequence_length and return them as multiple processed instances.
:param document: a whole document
:param max_sequence_length:
:return: a list; each element is a sequence of text
"""
@ -26,10 +26,9 @@ def get_raw_instance(document, max_sequence_length=512):
sizes = [len(seq) for seq in document]
result_list = []
curr_seq = [] # the sequence currently being built
curr_seq = []
sz_idx = 0
while sz_idx < len(sizes):
# If the current sequence plus the new sentence is within the maximum length, merge them; otherwise the limit is exceeded, so append the current sequence to the result list and start a new one.
if len(curr_seq) + sizes[sz_idx] <= max_sequence_length_allowed: # or len(curr_seq)==0:
curr_seq += document[sz_idx]
@ -43,14 +42,13 @@ def get_raw_instance(document, max_sequence_length=512):
else:
result_list.append(curr_seq)
curr_seq = []
# Handle the last sequence: drop it if it is too short.
if len(curr_seq) > max_sequence_length_allowed / 2: # /2
result_list.append(curr_seq)
# # compute how many instances can be obtained in total
# num_instance=int(len(big_list)/max_sequence_length_allowed)+1
# print("num_instance:",num_instance)
# # split into multiple parts and append them to the list
# result_list=[]
# for j in range(num_instance):
# index=j*max_sequence_length_allowed

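To make the packing loop above concrete, here is a simplified standalone sketch of the same greedy strategy (the `- 2` margin for special tokens and the drop-short-tail rule are assumptions based on the code shown):
```python
def pack_sentences(sentences, max_sequence_length=512):
    """Greedily merge consecutive sentences into chunks of at most
    max_sequence_length items, dropping a too-short final chunk."""
    max_allowed = max_sequence_length - 2  # room for special tokens (assumption)
    result, curr = [], []
    for sent in sentences:
        if len(curr) + len(sent) <= max_allowed:
            curr += list(sent)
        else:
            result.append(curr)
            curr = list(sent)
    if len(curr) > max_allowed / 2:        # keep the tail only if it is long enough
        result.append(curr)
    return result

print([len(c) for c in pack_sentences(["a" * 300, "b" * 300, "c" * 120])])
```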

@ -6,6 +6,30 @@ __all__ = ['parse_args']
def parse_args():
parser = colossalai.get_default_parser()
parser.add_argument(
"--distplan",
type=str,
default='CAI_Gemini',
help="The distributed plan [colossalai, zero1, zero2, torch_ddp, torch_zero].",
)
parser.add_argument(
"--tp_degree",
type=int,
default=1,
help="Tensor Parallelism Degree. Valid when using colossalai as dist plan.",
)
parser.add_argument(
"--placement",
type=str,
default='cpu',
help="Placement Policy for Gemini. Valid when using colossalai as dist plan.",
)
parser.add_argument(
"--shardinit",
action='store_true',
help="Shard the tensors when init the model to shrink peak memory size on the assigned device. Valid when using colossalai as dist plan.",
)
parser.add_argument(
'--lr',

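As a rough, self-contained illustration of the new flags (the real parser extends `colossalai.get_default_parser()`, and the `shardinit`/`distplan` constraint mirrors the check that appears later in `run_pretraining.py`):
```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--distplan", type=str, default="CAI_Gemini")
parser.add_argument("--tp_degree", type=int, default=1)
parser.add_argument("--placement", type=str, default="cpu")
parser.add_argument("--shardinit", action="store_true")
args = parser.parse_args(["--distplan", "CAI_ZeRO2"])  # example invocation

# run_pretraining.py only allows --shardinit together with CAI_Gemini.
if args.shardinit and args.distplan != "CAI_Gemini":
    raise RuntimeError("You can only use shardinit with CAI_Gemini")
print(args)
```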

@ -5,11 +5,11 @@ from tqdm import tqdm
from utils.global_vars import get_timers, get_tensorboard_writer
from nvidia_bert_dataset_provider import NvidiaBertDatasetProvider
def evaluate(engine, args, logger, global_step):
def evaluate(model, args, logger, global_step, criterion):
evaluate_dataset_provider = NvidiaBertDatasetProvider(args, evaluate=True)
start_shard = 0
engine.eval()
model.eval()
timers = get_timers()
eval_step = 0
eval_loss = 0
@ -39,9 +39,9 @@ def evaluate(engine, args, logger, global_step):
mlm_label = batch_data[3].cuda()
# nsp_label = batch_data[5].cuda()
output = engine(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)
output = model(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)
loss = engine.criterion(output.logits, mlm_label)#prediction_scores
loss = criterion(output.logits, mlm_label)#prediction_scores
evaluate_dataset_provider.prefetch_batch()
eval_loss += loss.float().item()
@ -67,5 +67,5 @@ def evaluate(engine, args, logger, global_step):
logger.info('')
evaluate_dataset_provider.release_shard()
engine.train()
return cur_loss
model.train()
return cur_loss


@ -5,7 +5,7 @@ from transformers import get_linear_schedule_with_warmup
from transformers import BertForPreTraining, RobertaForMaskedLM, RobertaConfig
from transformers import GPT2Config, GPT2LMHeadModel
from transformers import AutoTokenizer, AutoModelForMaskedLM
from colossalai.nn.optimizer import FusedAdam
from colossalai.nn.optimizer import FusedAdam, HybridAdam
from torch.optim import AdamW
from colossalai.core import global_context as gpc
import torch
@ -83,7 +83,7 @@ def get_optimizer(model, lr):
'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
'weight_decay': 0.0
}]
optimizer = FusedAdam(optimizer_grouped_parameters, lr=lr, betas=[0.9, 0.95])
optimizer = HybridAdam(optimizer_grouped_parameters, lr=lr, betas=[0.9, 0.95])
return optimizer

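For context: unlike `FusedAdam`, `HybridAdam` can step parameters that Gemini keeps on the CPU as well as those on the GPU, which is why the optimizer is swapped here. A minimal usage sketch, assuming ColossalAI and its fused kernels are installed (the toy model below stands in for the one returned by `get_model`):
```python
import torch
from colossalai.nn.optimizer import HybridAdam

# Toy stand-in for the real RoBERTa model built by get_model().
model = torch.nn.Linear(768, 768)
optimizer = HybridAdam(model.parameters(), lr=1e-4, betas=[0.9, 0.95])
```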

@ -7,7 +7,6 @@ tensorboard_path="$root_path/tensorboard"
log_path="$root_path/exp_log"
ckpt_path="$root_path/ckpt"
colossal_config="$root_path/../configs/colossalai_ddp.py"
mkdir -p $tensorboard_path
mkdir -p $log_path
@ -32,7 +31,6 @@ env OMP_NUM_THREADS=40 colossalai run --hostfile ./hostfile \
--tensorboard_path $tensorboard_path \
--log_path $log_path \
--ckpt_path $ckpt_path \
--colossal_config $colossal_config \
--log_interval 50 \
--mlm bert \
--wandb \


@ -7,7 +7,6 @@ tensorboard_path="$root_path/tensorboard"
log_path="$root_path/exp_log"
ckpt_path="$root_path/ckpt"
colossal_config="$root_path/../configs/colossalai_ddp.py"
mkdir -p $tensorboard_path
mkdir -p $log_path
@ -32,7 +31,6 @@ env OMP_NUM_THREADS=40 colossalai run --hostfile ./hostfile \
--tensorboard_path $tensorboard_path \
--log_path $log_path \
--ckpt_path $ckpt_path \
--colossal_config $colossal_config \
--log_interval 50 \
--mlm bert \
--wandb \


@ -4,9 +4,31 @@ import time
from functools import partial
import torch
<<<<<<< HEAD
from tqdm import tqdm
import os
import time
from functools import partial
from transformers import AutoTokenizer
import colossalai
from colossalai.context import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.nn.parallel import GeminiDDP, zero_model_wrapper, zero_optim_wrapper
from colossalai.utils import get_current_device
from colossalai.utils.model.colo_init_context import ColoInitContext
from colossalai.zero import ZeroOptimizer
from colossalai.tensor import ColoParameter, ComputePattern, ComputeSpec, ProcessGroup, ReplicaSpec, ShardSpec
=======
>>>>>>> 52a933e17509c71811e919b165de38cb3d5d6d41
from arguments import parse_args
from evaluation import evaluate
from loss import LossForPretraining
<<<<<<< HEAD
from nvidia_bert_dataset_provider import NvidiaBertDatasetProvider
=======
from nvidia_bert_dataset_provider import NvidiaBertDatasetProvider
from pretrain_utils import get_lr_scheduler, get_model, get_optimizer, save_ckpt
from tqdm import tqdm
@ -27,6 +49,7 @@ from colossalai.zero import ZeroOptimizer
from colossalai.zero.gemini import ChunkManager, ColoInitContext, GeminiManager
from colossalai.zero.legacy import ShardedModelV2, ShardedOptimizerV2, ZeroInitContext
from colossalai.zero.legacy.shard_utils import TensorShardStrategy
>>>>>>> 52a933e17509c71811e919b165de38cb3d5d6d41
def main():
@ -36,8 +59,13 @@ def main():
tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_path)
<<<<<<< HEAD
# os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
=======
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
>>>>>>> 52a933e17509c71811e919b165de38cb3d5d6d41
logger = Logger(os.path.join(args.log_path, launch_time), cuda=torch.cuda.is_available(), debug=args.vscode_debug)
if args.vscode_debug:
@ -50,7 +78,11 @@ def main():
args.local_rank = -1
args.log_interval = 1
else:
<<<<<<< HEAD
colossalai.launch_from_torch(config={}) #args.colossal_config
=======
colossalai.launch_from_torch(args.colossal_config) # args.colossal_config
>>>>>>> 52a933e17509c71811e919b165de38cb3d5d6d41
args.local_rank = int(os.environ["LOCAL_RANK"])
logger.info(
f'launch_from_torch, world size: {torch.distributed.get_world_size()} | ' +
@ -61,32 +93,94 @@ def main():
args.tokenizer = tokenizer
args.logger = logger
set_global_variables(launch_time, args.tensorboard_path)
<<<<<<< HEAD
=======
use_zero = hasattr(gpc.config, 'zero')
>>>>>>> 52a933e17509c71811e919b165de38cb3d5d6d41
world_size = torch.distributed.get_world_size()
init_dev = get_current_device()
# build model, optimizer and criterion
<<<<<<< HEAD
if args.distplan.startswith("CAI"):
# all param must use the same process group.
world_size = torch.distributed.get_world_size()
shard_pg = ProcessGroup(tp_degree=world_size) if args.shardinit else None
default_dist_spec = ShardSpec([-1], [world_size]) if args.shardinit else None
if args.shardinit and args.distplan != "CAI_Gemini":
raise RuntimeError("You can only use shardinit with CAI_Gemini")
# build GPT model
with ColoInitContext(device=get_current_device(),
dtype=torch.half,
default_dist_spec=default_dist_spec,
default_pg=shard_pg):
=======
if use_zero:
shard_strategy = TensorShardStrategy()
with ZeroInitContext(target_device=torch.cuda.current_device(), shard_strategy=shard_strategy,
shard_param=True):
>>>>>>> 52a933e17509c71811e919b165de38cb3d5d6d41
config, model, numel = get_model(args, logger)
# model = ShardedModelV2(model, shard_strategy, tensor_placement_policy='cpu', reuse_fp16_shard=True)
# assign running configurations
gemini_config = None
if args.distplan.startswith("CAI_ZeRO"):
optim_config = dict(reduce_bucket_size=12 * 1024 * 1024, overlap_communication=True, verbose=True)
elif args.distplan == "CAI_Gemini":
gemini_config = dict(strict_ddp_mode=args.tp_degree == 1,
device=get_current_device(),
placement_policy=args.placement,
pin_memory=True,
hidden_dim=model.config.hidden_size,
search_range_mb=128)
optim_config = dict(gpu_margin_mem_ratio=0.)
else:
raise RuntimeError
# build a highly optimized gpu/cpu optimizer
optimizer = get_optimizer(model, lr=args.lr)
if args.distplan == "CAI_ZeRO1":
zero_stage = 1
elif args.distplan == "CAI_ZeRO2":
zero_stage = 2
elif args.distplan == "CAI_Gemini":
zero_stage = 3
else:
raise RuntimeError
# wrap your model and optimizer
model = zero_model_wrapper(model, zero_stage, gemini_config)
optimizer = zero_optim_wrapper(model, optimizer, optim_config=optim_config)
logger.info(get_mem_info(prefix='After init optim, '))
else:
config, model, numel = get_model(args, logger)
logger.info("no_zero")
if torch.distributed.get_rank() == 0:
os.mkdir(os.path.join(args.ckpt_path, launch_time))
logger.info(f'Model numel: {numel}')
get_tflops_func = partial(get_tflops, numel, args.train_micro_batch_size_per_gpu, args.max_seq_length)
<<<<<<< HEAD
# 144003367 is the length of the entire dataset
steps_per_epoch = 144003367 // world_size // args.train_micro_batch_size_per_gpu // args.gradient_accumulation_steps // args.refresh_bucket_size #len(dataloader)
=======
# len(dataloader)
steps_per_epoch = 144003367 // world_size // args.train_micro_batch_size_per_gpu // args.gradient_accumulation_steps // args.refresh_bucket_size
>>>>>>> 52a933e17509c71811e919b165de38cb3d5d6d41
total_steps = steps_per_epoch * args.epoch
# build optimizer and lr_scheduler
lr_scheduler = get_lr_scheduler(optimizer, total_steps=total_steps, last_epoch=-1)
start_epoch = 0
start_shard = 0
@ -95,7 +189,6 @@ def main():
assert os.path.exists(args.load_optimizer_lr)
o_l_state_dict = torch.load(args.load_optimizer_lr, map_location='cpu')
o_l_state_dict['lr_scheduler']['last_epoch'] = o_l_state_dict['lr_scheduler']['last_epoch'] - 1
optimizer = get_optimizer(model, lr=args.lr)
optimizer.load_state_dict(o_l_state_dict['optimizer'])
# o_l_state_dict['lr_scheduler']['last_epoch']
lr_scheduler = get_lr_scheduler(optimizer,
@ -105,33 +198,38 @@ def main():
for k, v in state.items():
if isinstance(v, torch.Tensor):
state[k] = v.cuda(f"cuda:{torch.cuda.current_device()}")
# if you want delete the above three code, have to move the model to gpu, because in optimizer.step()
# If you delete the three lines above, you must move the model to the GPU yourself, because optimizer.step() expects parameters and optimizer states on the same device.
lr_scheduler.load_state_dict(o_l_state_dict['lr_scheduler'])
start_epoch = o_l_state_dict['epoch']
start_shard = o_l_state_dict['shard'] + 1
# global_step = o_l_state_dict['global_step'] + 1
<<<<<<< HEAD
logger.info(f'resume from epoch {start_epoch} shard {start_shard} step {lr_scheduler.last_epoch} lr {lr_scheduler.get_last_lr()[0]}')
=======
logger.info(
f'resume from epoch {start_epoch} shard {start_shard} step {lr_scheduler.last_epoch} lr {lr_scheduler.get_last_lr()[0]}'
)
else:
optimizer = get_optimizer(model, lr=args.lr)
lr_scheduler = get_lr_scheduler(optimizer, total_steps=total_steps, last_epoch=-1)
>>>>>>> 52a933e17509c71811e919b165de38cb3d5d6d41
# optimizer = gpc.config.optimizer.pop('type')(
# model.parameters(), **gpc.config.optimizer)
# optimizer = ShardedOptimizerV2(model, optimizer, initial_scale=2**5)
criterion = LossForPretraining(config.vocab_size)
# build dataloader
pretrain_dataset_provider = NvidiaBertDatasetProvider(args)
<<<<<<< HEAD
=======
# initialize with colossalai
engine, _, _, lr_scheduelr = colossalai.initialize(model=model,
optimizer=optimizer,
criterion=criterion,
lr_scheduler=lr_scheduler)
>>>>>>> 52a933e17509c71811e919b165de38cb3d5d6d41
logger.info(get_mem_info(prefix='After init model, '))
best_loss = None
@ -156,9 +254,15 @@ def main():
else:
iterator_data = enumerate(dataset_iterator)
<<<<<<< HEAD
model.train()
for step, batch_data in iterator_data:
=======
engine.train()
for step, batch_data in iterator_data:
>>>>>>> 52a933e17509c71811e919b165de38cb3d5d6d41
# batch_data = pretrain_dataset_provider.get_batch(batch_index)
input_ids = batch_data[0].cuda(f"cuda:{torch.cuda.current_device()}")
@ -167,18 +271,31 @@ def main():
mlm_label = batch_data[3].cuda(f"cuda:{torch.cuda.current_device()}")
# nsp_label = batch_data[5].cuda()
<<<<<<< HEAD
output = model(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)
loss = criterion(output.logits, mlm_label)
=======
output = engine(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)
loss = engine.criterion(output.logits, mlm_label)
>>>>>>> 52a933e17509c71811e919b165de38cb3d5d6d41
pretrain_dataset_provider.prefetch_batch()
engine.backward(loss)
optimizer.backward(loss)
train_loss += loss.float().item()
# if (step + 1) % args.accumulation_step == 0:
<<<<<<< HEAD
optimizer.step()
lr_scheduler.step()
optimizer.zero_grad()
=======
engine.step()
lr_scheduelr.step()
engine.zero_grad()
>>>>>>> 52a933e17509c71811e919b165de38cb3d5d6d41
global_step += 1
if global_step % args.log_interval == 0 and global_step != 0 \
@ -189,7 +306,7 @@ def main():
numel, args, config, elapsed_time, global_step, world_size)
cur_loss = train_loss / args.log_interval
current_lr = lr_scheduelr.get_last_lr()[0]
current_lr = lr_scheduler.get_last_lr()[0]
log_str = f'| epoch: {epoch} | shard: {shard} | step: {global_step} | lr {current_lr:.7f} | elapsed_time: {elapsed_time / 60 :.3f} minutes ' + \
f'| mins/batch: {elapsed_time_per_iteration :.3f} seconds | loss: {cur_loss:.7f} | ppl: {math.exp(cur_loss):.3f} | TFLOPS: {get_tflops_func(elapsed_time_per_iteration):.3f} or {tflops:.3f}'
logger.info(log_str, print_=False)
@ -209,11 +326,18 @@ def main():
logger.info(f'epoch {epoch} shard {shard} has cost {timers("shard_time").elapsed() / 60 :.3f} mins')
logger.info('*' * 100)
<<<<<<< HEAD
eval_loss += evaluate(model, args, logger, global_step, criterion)
save_ckpt(model, optimizer, lr_scheduler, os.path.join(args.ckpt_path, launch_time, f'epoch-{epoch}_shard-{shard}_' + launch_time), epoch, shard, global_step)
=======
eval_loss += evaluate(engine, args, logger, global_step)
save_ckpt(engine.model, optimizer, lr_scheduelr,
os.path.join(args.ckpt_path, launch_time, f'epoch-{epoch}_shard-{shard}_' + launch_time), epoch,
shard, global_step)
>>>>>>> 52a933e17509c71811e919b165de38cb3d5d6d41
eval_loss /= len(os.listdir(args.data_path_prefix))
logger.info(
f'epoch {epoch} | shard_length {len(os.listdir(args.data_path_prefix))} | elapsed_time: {timers("epoch_time").elapsed() / 60 :.3f} mins'


@ -1,2 +1,7 @@
colossalai >= 0.1.12
torch >= 1.8.1
tqdm
tensorboard
numpy
h5py
wandb
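A quick way to check that the pinned versions above are satisfied (a minimal sketch):
```python
import colossalai
import torch

# The example expects colossalai >= 0.1.12 and torch >= 1.8.1.
print("colossalai:", colossalai.__version__)
print("torch:", torch.__version__)
```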