diff --git a/examples/language/roberta/README.md b/examples/language/roberta/README.md index a42b1935d..0e080d009 100644 --- a/examples/language/roberta/README.md +++ b/examples/language/roberta/README.md @@ -1,9 +1,9 @@ # Introduction -This repo introduce how to pretrain a chinese roberta-large from scratch, including preprocessing, pretraining, finetune. The repo can help you quickly train a high-quality bert. +This example introduces how to pretrain RoBERTa from scratch, covering preprocessing, pretraining, and fine-tuning. It can help you quickly train a high-quality RoBERTa model. ## 0. Prerequisite - Install Colossal-AI - Editing the port from /etc/ssh/sshd_config and /etc/ssh/ssh_config, every host expose the same ssh port of server and client. If you are a root user, you also set the **PermitRootLogin** from /etc/ssh/sshd_config to "yes" +- Edit the port in `/etc/ssh/sshd_config` and `/etc/ssh/ssh_config` so that every host exposes the same SSH port for both server and client. If you are the root user, also set **PermitRootLogin** in `/etc/ssh/sshd_config` to "yes" - Ensure that each host can log in to each other without password. If you have n hosts, need to execute n2 times ``` @@ -33,7 +33,7 @@ service ssh restart ```bash cd preprocessing ``` -following the `README.md`, preprocess original corpus to h5py+numpy +Following the `README.md`, preprocess the original corpus into h5py plus numpy format ## 2. Pretrain @@ -47,12 +47,4 @@ following the `README.md`, load the h5py generated by preprocess of step 1 to pr The checkpoint produced by this repo can replace `pytorch_model.bin` from [hfl/chinese-roberta-wwm-ext-large](https://huggingface.co/hfl/chinese-roberta-wwm-ext-large/tree/main) directly. Then use transfomers from Hugging Face to finetune downstream application. ## Contributors -The repo is contributed by AI team from [Moore Threads](https://www.mthreads.com/). If you find any problems for pretraining, please file an issue or send an email to yehua.zhang@mthreads.com. At last, welcome any form of contribution! - -``` -@misc{ - title={A simple Chinese RoBERTa Example for Whole Word Masked}, - author={Yehua Zhang, Chen Zhang}, - year={2022} -} -``` +This example is contributed by the AI team from [Moore Threads](https://www.mthreads.com/). If you encounter any problems with pretraining, please file an issue or send an email to yehua.zhang@mthreads.com. Finally, any form of contribution is welcome!
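Reviewer note on the SSH prerequisite above: the README asks for password-less login between every pair of hosts but does not show the commands. A minimal sketch, assuming OpenSSH; the host name `host2`, the user `root`, and port `22` are placeholders for your own values:

```bash
# Run on every host, once per peer host (so n hosts need roughly n^2 exchanges).
ssh-keygen -t rsa -N "" -f ~/.ssh/id_rsa            # generate a key pair if one does not exist yet
ssh-copy-id -i ~/.ssh/id_rsa.pub -p 22 root@host2   # copy the public key to a peer host
ssh -p 22 root@host2 hostname                       # verify password-less login works
```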
diff --git a/examples/language/roberta/configs/colossalai_ddp.py b/examples/language/roberta/configs/colossalai_ddp.py deleted file mode 100644 index 3146ffc45..000000000 --- a/examples/language/roberta/configs/colossalai_ddp.py +++ /dev/null @@ -1,9 +0,0 @@ -from colossalai.nn.optimizer import FusedAdam - -try: - from colossalai.zero.shard_utils import TensorShardStrategy -except ImportError: - # colossalai > 0.2.8 - from colossalai.zero.legacy import TensorShardStrategy - -clip_grad_norm = 1.0 diff --git a/examples/language/roberta/configs/colossalai_zero.py b/examples/language/roberta/configs/colossalai_zero.py deleted file mode 100644 index bae4c723c..000000000 --- a/examples/language/roberta/configs/colossalai_zero.py +++ /dev/null @@ -1,37 +0,0 @@ -from colossalai.nn.optimizer import FusedAdam - -try: - from colossalai.zero.shard_utils import TensorShardStrategy -except ImportError: - # colossalai > 0.2.8 - from colossalai.zero.legacy import TensorShardStrategy - -# fp16 = dict( -# mode=AMP_TYPE.TORCH, -# ) - -# seed = 2 -zero = dict(model_config=dict(shard_strategy=TensorShardStrategy(), - reduce_scatter_bucket_size_mb=25, - fp32_reduce_scatter=False, - tensor_placement_policy="cuda", - gradient_predivide_factor=1.0, - reuse_fp16_shard=False), - optimizer_config=dict(gpu_margin_mem_ratio=0.8, - initial_scale=2**5, - min_scale=1, - growth_factor=2, - backoff_factor=0.5, - growth_interval=1000, - hysteresis=2, - max_scale=2**32)) - -# gradient_accumulation = 4 -clip_grad_norm = 1.0 -optimizer = dict( - type=FusedAdam, - lr=0.00015, - weight_decay=1e-2, -) - -# 64433 diff --git a/examples/language/roberta/preprocessing/get_mask.py b/examples/language/roberta/preprocessing/get_mask.py index da297f98e..869ef2cb3 100644 --- a/examples/language/roberta/preprocessing/get_mask.py +++ b/examples/language/roberta/preprocessing/get_mask.py @@ -163,16 +163,15 @@ class PreTrainingDataset(): def get_new_segment(self, segment): """ - 输入一句话,返回一句经过处理的话: 为了支持中文全称mask,将被分开的词,将上特殊标记("#"),使得后续处理模块,能够知道哪些字是属于同一个词的。 - :param segment: 一句话 - :return: 一句处理过的话 + Input a sentence and return a processed sentence: to support Chinese whole word masking, words that have been split apart are marked with a special mark ("#"), so that the subsequent processing module knows which characters belong to the same word.
+ :param segment: a sentence """ seq_cws = jieba.lcut(''.join(segment)) seq_cws_dict = {x: 1 for x in seq_cws} new_segment = [] i = 0 while i < len(segment): - if len(self.rec.findall(segment[i])) == 0: # 不是中文的,原文加进去。 + if len(self.rec.findall(segment[i])) == 0: # not a Chinese character, keep the original text new_segment.append(segment[i]) i += 1 continue diff --git a/examples/language/roberta/preprocessing/sentence_split.py b/examples/language/roberta/preprocessing/sentence_split.py index 231be152b..f0ed83f90 100644 --- a/examples/language/roberta/preprocessing/sentence_split.py +++ b/examples/language/roberta/preprocessing/sentence_split.py @@ -10,26 +10,19 @@ import argparse import functools def split_sentence(document: str, flag: str = "all", limit: int = 510) -> List[str]: - """ - Args: - document: - flag: Type:str, "all" 中英文标点分句,"zh" 中文标点分句,"en" 英文标点分句 - limit: 默认单句最大长度为510个字符 - Returns: Type:list - """ sent_list = [] try: if flag == "zh": - document = re.sub('(?P<quotation_mark>([。?!…](?![”’"\'])))', r'\g<quotation_mark>\n', document) # 单字符断句符 - document = re.sub('(?P<quotation_mark>([。?!]|…{1,2})[”’"\'])', r'\g<quotation_mark>\n', document) # 特殊引号 + document = re.sub('(?P<quotation_mark>([。?!…](?![”’"\'])))', r'\g<quotation_mark>\n', document) # Single-character sentence terminators + document = re.sub('(?P<quotation_mark>([。?!]|…{1,2})[”’"\'])', r'\g<quotation_mark>\n', document) # Special quotation marks elif flag == "en": - document = re.sub('(?P<quotation_mark>([.?!](?![”’"\'])))', r'\g<quotation_mark>\n', document) # 英文单字符断句符 - document = re.sub('(?P<quotation_mark>([?!.]["\']))', r'\g<quotation_mark>\n', document) # 特殊引号 + document = re.sub('(?P<quotation_mark>([.?!](?![”’"\'])))', r'\g<quotation_mark>\n', document) # English single-character sentence terminators + document = re.sub('(?P<quotation_mark>([?!.]["\']))', r'\g<quotation_mark>\n', document) # Special quotation marks else: - document = re.sub('(?P<quotation_mark>([。?!….?!](?![”’"\'])))', r'\g<quotation_mark>\n', document) # 单字符断句符 + document = re.sub('(?P<quotation_mark>([。?!….?!](?![”’"\'])))', r'\g<quotation_mark>\n', document) # Single-character sentence terminators document = re.sub('(?P<quotation_mark>(([。?!.!?]|…{1,2})[”’"\']))', r'\g<quotation_mark>\n', - document) # 特殊引号 + document) # Special quotation marks sent_list_ori = document.splitlines() for sent in sent_list_ori: diff --git a/examples/language/roberta/preprocessing/tokenize_mask.py b/examples/language/roberta/preprocessing/tokenize_mask.py index b33871d5d..76c74868e 100644 --- a/examples/language/roberta/preprocessing/tokenize_mask.py +++ b/examples/language/roberta/preprocessing/tokenize_mask.py @@ -15,8 +15,8 @@ from get_mask import PreTrainingDataset def get_raw_instance(document, max_sequence_length=512): """ - 获取初步的训练实例,将整段按照max_sequence_length切分成多个部分,并以多个处理好的实例的形式返回。 - :param document: 一整段 + Get the initial training instances: split the whole segment into multiple parts according to max_sequence_length and return them as a list of processed instances. + :param document: a whole segment :param max_sequence_length: :return: a list.
each element is a sequence of text """ @@ -26,10 +26,9 @@ def get_raw_instance(document, max_sequence_length=512): sizes = [len(seq) for seq in document] result_list = [] - curr_seq = [] # 当前处理的序列 + curr_seq = [] # the sequence currently being processed sz_idx = 0 while sz_idx < len(sizes): - # 当前句子加上新的句子,如果长度小于最大限制,则合并当前句子和新句子;否则即超过了最大限制,那么做为一个新的序列加到目标列表中 if len(curr_seq) + sizes[sz_idx] <= max_sequence_length_allowed: # or len(curr_seq)==0: curr_seq += document[sz_idx] @@ -43,14 +42,13 @@ def get_raw_instance(document, max_sequence_length=512): else: result_list.append(curr_seq) curr_seq = [] - # 对最后一个序列进行处理,如果太短的话,丢弃掉。 + if len(curr_seq) > max_sequence_length_allowed / 2: # /2 result_list.append(curr_seq) - # # 计算总共可以得到多少份 # num_instance=int(len(big_list)/max_sequence_length_allowed)+1 # print("num_instance:",num_instance) - # # 切分成多份,添加到列表中 + # result_list=[] # for j in range(num_instance): # index=j*max_sequence_length_allowed diff --git a/examples/language/roberta/pretraining/arguments.py b/examples/language/roberta/pretraining/arguments.py index 3a9370e00..87fa8dd8a 100644 --- a/examples/language/roberta/pretraining/arguments.py +++ b/examples/language/roberta/pretraining/arguments.py @@ -6,6 +6,30 @@ __all__ = ['parse_args'] def parse_args(): parser = colossalai.get_default_parser() + + parser.add_argument( + "--distplan", + type=str, + default='CAI_Gemini', + help="The distributed plan [CAI_ZeRO1, CAI_ZeRO2, CAI_Gemini].", + ) + parser.add_argument( + "--tp_degree", + type=int, + default=1, + help="Tensor Parallelism Degree. Valid when using colossalai as dist plan.", + ) + parser.add_argument( + "--placement", + type=str, + default='cpu', + help="Placement Policy for Gemini. Valid when using colossalai as dist plan.", + ) + parser.add_argument( + "--shardinit", + action='store_true', + help="Shard the tensors when initializing the model to shrink the peak memory size on the assigned device.
Valid when using colossalai as dist plan.", + ) parser.add_argument( '--lr', diff --git a/examples/language/roberta/pretraining/evaluation.py b/examples/language/roberta/pretraining/evaluation.py index 83f94082f..8fc019c12 100644 --- a/examples/language/roberta/pretraining/evaluation.py +++ b/examples/language/roberta/pretraining/evaluation.py @@ -5,11 +5,11 @@ from tqdm import tqdm from utils.global_vars import get_timers, get_tensorboard_writer from nvidia_bert_dataset_provider import NvidiaBertDatasetProvider -def evaluate(engine, args, logger, global_step): +def evaluate(model, args, logger, global_step, criterion): evaluate_dataset_provider = NvidiaBertDatasetProvider(args, evaluate=True) start_shard = 0 - engine.eval() + model.eval() timers = get_timers() eval_step = 0 eval_loss = 0 @@ -39,9 +39,9 @@ def evaluate(engine, args, logger, global_step): mlm_label = batch_data[3].cuda() # nsp_label = batch_data[5].cuda() - output = engine(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask) + output = model(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask) - loss = engine.criterion(output.logits, mlm_label)#prediction_scores + loss = criterion(output.logits, mlm_label)#prediction_scores evaluate_dataset_provider.prefetch_batch() eval_loss += loss.float().item() @@ -67,5 +67,5 @@ def evaluate(engine, args, logger, global_step): logger.info('') evaluate_dataset_provider.release_shard() - engine.train() - return cur_loss + model.train() + return cur_loss \ No newline at end of file diff --git a/examples/language/roberta/pretraining/pretrain_utils.py b/examples/language/roberta/pretraining/pretrain_utils.py index ba17b0f5e..54fc2affe 100644 --- a/examples/language/roberta/pretraining/pretrain_utils.py +++ b/examples/language/roberta/pretraining/pretrain_utils.py @@ -5,7 +5,7 @@ from transformers import get_linear_schedule_with_warmup from transformers import BertForPreTraining, RobertaForMaskedLM, RobertaConfig from transformers import GPT2Config, GPT2LMHeadModel from transformers import AutoTokenizer, AutoModelForMaskedLM -from colossalai.nn.optimizer import FusedAdam +from colossalai.nn.optimizer import FusedAdam, HybridAdam from torch.optim import AdamW from colossalai.core import global_context as gpc import torch @@ -83,7 +83,7 @@ def get_optimizer(model, lr): 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] - optimizer = FusedAdam(optimizer_grouped_parameters, lr=lr, betas=[0.9, 0.95]) + optimizer = HybridAdam(optimizer_grouped_parameters, lr=lr, betas=[0.9, 0.95]) return optimizer diff --git a/examples/language/roberta/pretraining/run_pretrain.sh b/examples/language/roberta/pretraining/run_pretrain.sh index 144cd0ab9..38fdefe0a 100644 --- a/examples/language/roberta/pretraining/run_pretrain.sh +++ b/examples/language/roberta/pretraining/run_pretrain.sh @@ -7,7 +7,6 @@ tensorboard_path="$root_path/tensorboard" log_path="$root_path/exp_log" ckpt_path="$root_path/ckpt" -colossal_config="$root_path/../configs/colossalai_ddp.py" mkdir -p $tensorboard_path mkdir -p $log_path @@ -32,7 +31,6 @@ env OMP_NUM_THREADS=40 colossalai run --hostfile ./hostfile \ --tensorboard_path $tensorboard_path \ --log_path $log_path \ --ckpt_path $ckpt_path \ - --colossal_config $colossal_config \ --log_interval 50 \ --mlm bert \ --wandb \ diff --git a/examples/language/roberta/pretraining/run_pretrain_resume.sh b/examples/language/roberta/pretraining/run_pretrain_resume.sh index a0704cf7c..351c98d3e 
100644 --- a/examples/language/roberta/pretraining/run_pretrain_resume.sh +++ b/examples/language/roberta/pretraining/run_pretrain_resume.sh @@ -7,7 +7,6 @@ tensorboard_path="$root_path/tensorboard" log_path="$root_path/exp_log" ckpt_path="$root_path/ckpt" -colossal_config="$root_path/../configs/colossalai_ddp.py" mkdir -p $tensorboard_path mkdir -p $log_path @@ -32,7 +31,6 @@ env OMP_NUM_THREADS=40 colossalai run --hostfile ./hostfile \ --tensorboard_path $tensorboard_path \ --log_path $log_path \ --ckpt_path $ckpt_path \ - --colossal_config $colossal_config \ --log_interval 50 \ --mlm bert \ --wandb \ diff --git a/examples/language/roberta/pretraining/run_pretraining.py b/examples/language/roberta/pretraining/run_pretraining.py index eef7bb6ad..6e3cae6ec 100644 --- a/examples/language/roberta/pretraining/run_pretraining.py +++ b/examples/language/roberta/pretraining/run_pretraining.py @@ -4,9 +4,31 @@ import time from functools import partial import torch +<<<<<<< HEAD +from tqdm import tqdm +import os +import time +from functools import partial +from transformers import AutoTokenizer + +import colossalai +from colossalai.context import ParallelMode +from colossalai.core import global_context as gpc +from colossalai.nn.parallel import GeminiDDP, zero_model_wrapper, zero_optim_wrapper +from colossalai.utils import get_current_device +from colossalai.utils.model.colo_init_context import ColoInitContext +from colossalai.zero import ZeroOptimizer +from colossalai.tensor import ColoParameter, ComputePattern, ComputeSpec, ProcessGroup, ReplicaSpec, ShardSpec + +======= +>>>>>>> 52a933e17509c71811e919b165de38cb3d5d6d41 from arguments import parse_args from evaluation import evaluate from loss import LossForPretraining +<<<<<<< HEAD + +from nvidia_bert_dataset_provider import NvidiaBertDatasetProvider +======= from nvidia_bert_dataset_provider import NvidiaBertDatasetProvider from pretrain_utils import get_lr_scheduler, get_model, get_optimizer, save_ckpt from tqdm import tqdm @@ -27,6 +49,7 @@ from colossalai.zero import ZeroOptimizer from colossalai.zero.gemini import ChunkManager, ColoInitContext, GeminiManager from colossalai.zero.legacy import ShardedModelV2, ShardedOptimizerV2, ZeroInitContext from colossalai.zero.legacy.shard_utils import TensorShardStrategy +>>>>>>> 52a933e17509c71811e919b165de38cb3d5d6d41 def main(): @@ -36,8 +59,13 @@ def main(): tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_path) +<<<<<<< HEAD + # os.environ['CUDA_LAUNCH_BLOCKING'] = '1' + +======= os.environ['CUDA_LAUNCH_BLOCKING'] = '1' +>>>>>>> 52a933e17509c71811e919b165de38cb3d5d6d41 logger = Logger(os.path.join(args.log_path, launch_time), cuda=torch.cuda.is_available(), debug=args.vscode_debug) if args.vscode_debug: @@ -50,7 +78,11 @@ def main(): args.local_rank = -1 args.log_interval = 1 else: +<<<<<<< HEAD + colossalai.launch_from_torch(config={}) #args.colossal_config +======= colossalai.launch_from_torch(args.colossal_config) # args.colossal_config +>>>>>>> 52a933e17509c71811e919b165de38cb3d5d6d41 args.local_rank = int(os.environ["LOCAL_RANK"]) logger.info( f'launch_from_torch, world size: {torch.distributed.get_world_size()} | ' + @@ -61,32 +93,94 @@ def main(): args.tokenizer = tokenizer args.logger = logger set_global_variables(launch_time, args.tensorboard_path) +<<<<<<< HEAD + +======= use_zero = hasattr(gpc.config, 'zero') +>>>>>>> 52a933e17509c71811e919b165de38cb3d5d6d41 world_size = torch.distributed.get_world_size() + init_dev = get_current_device() # build model, optimizer and criterion 
+<<<<<<< HEAD + if args.distplan.startswith("CAI"): + # all param must use the same process group. + world_size = torch.distributed.get_world_size() + shard_pg = ProcessGroup(tp_degree=world_size) if args.shardinit else None + default_dist_spec = ShardSpec([-1], [world_size]) if args.shardinit else None + + if args.shardinit and args.distplan != "CAI_Gemini": + raise RuntimeError("You can only use shardinit with CAI_Gemini") + + # build GPT model + with ColoInitContext(device=get_current_device(), + dtype=torch.half, + default_dist_spec=default_dist_spec, + default_pg=shard_pg): +======= if use_zero: shard_strategy = TensorShardStrategy() with ZeroInitContext(target_device=torch.cuda.current_device(), shard_strategy=shard_strategy, shard_param=True): +>>>>>>> 52a933e17509c71811e919b165de38cb3d5d6d41 config, model, numel = get_model(args, logger) - # model = ShardedModelV2(model, shard_strategy, tensor_placement_policy='cpu', reuse_fp16_shard=True) + + # asign running configurations + gemini_config = None + if args.distplan.startswith("CAI_ZeRO"): + optim_config = dict(reduce_bucket_size=12 * 1024 * 1024, overlap_communication=True, verbose=True) + elif args.distplan == "CAI_Gemini": + gemini_config = dict(strict_ddp_mode=args.tp_degree == 1, + device=get_current_device(), + placement_policy=args.placement, + pin_memory=True, + hidden_dim=model.config.hidden_size, + search_range_mb=128) + optim_config = dict(gpu_margin_mem_ratio=0.) + else: + raise RuntimeError + + # build a highly optimized gpu/cpu optimizer + optimizer = get_optimizer(model, lr=args.lr) + + if args.distplan == "CAI_ZeRO1": + zero_stage = 1 + elif args.distplan == "CAI_ZeRO2": + zero_stage = 2 + elif args.distplan == "CAI_Gemini": + zero_stage = 3 + else: + raise RuntimeError + + # wrap your model and optimizer + model = zero_model_wrapper(model, zero_stage, gemini_config) + optimizer = zero_optim_wrapper(model, optimizer, optim_config=optim_config) + + logger.info(get_mem_info(prefix='After init optim, ')) + else: config, model, numel = get_model(args, logger) logger.info("no_zero") + if torch.distributed.get_rank() == 0: os.mkdir(os.path.join(args.ckpt_path, launch_time)) logger.info(f'Model numel: {numel}') get_tflops_func = partial(get_tflops, numel, args.train_micro_batch_size_per_gpu, args.max_seq_length) +<<<<<<< HEAD + + # 144003367 is is the length of the entire dataset + steps_per_epoch = 144003367 // world_size // args.train_micro_batch_size_per_gpu // args.gradient_accumulation_steps // args.refresh_bucket_size #len(dataloader) +======= # len(dataloader) steps_per_epoch = 144003367 // world_size // args.train_micro_batch_size_per_gpu // args.gradient_accumulation_steps // args.refresh_bucket_size +>>>>>>> 52a933e17509c71811e919b165de38cb3d5d6d41 total_steps = steps_per_epoch * args.epoch - # build optimizer and lr_scheduler + lr_scheduler = get_lr_scheduler(optimizer, total_steps=total_steps, last_epoch=-1) start_epoch = 0 start_shard = 0 @@ -95,7 +189,6 @@ def main(): assert os.path.exists(args.load_optimizer_lr) o_l_state_dict = torch.load(args.load_optimizer_lr, map_location='cpu') o_l_state_dict['lr_scheduler']['last_epoch'] = o_l_state_dict['lr_scheduler']['last_epoch'] - 1 - optimizer = get_optimizer(model, lr=args.lr) optimizer.load_state_dict(o_l_state_dict['optimizer']) # o_l_state_dict['lr_scheduler']['last_epoch'] lr_scheduler = get_lr_scheduler(optimizer, @@ -105,33 +198,38 @@ def main(): for k, v in state.items(): if isinstance(v, torch.Tensor): state[k] = 
v.cuda(f"cuda:{torch.cuda.current_device()}") - # if you want delete the above three code, have to move the model to gpu, because in optimizer.step() + # if you want delete the above three code, must move the model to gpu. Because in optimizer.step() lr_scheduler.load_state_dict(o_l_state_dict['lr_scheduler']) start_epoch = o_l_state_dict['epoch'] start_shard = o_l_state_dict['shard'] + 1 # global_step = o_l_state_dict['global_step'] + 1 +<<<<<<< HEAD + logger.info(f'resume from epoch {start_epoch} shard {start_shard} step {lr_scheduler.last_epoch} lr {lr_scheduler.get_last_lr()[0]}') +======= logger.info( f'resume from epoch {start_epoch} shard {start_shard} step {lr_scheduler.last_epoch} lr {lr_scheduler.get_last_lr()[0]}' ) else: optimizer = get_optimizer(model, lr=args.lr) lr_scheduler = get_lr_scheduler(optimizer, total_steps=total_steps, last_epoch=-1) +>>>>>>> 52a933e17509c71811e919b165de38cb3d5d6d41 - # optimizer = gpc.config.optimizer.pop('type')( - # model.parameters(), **gpc.config.optimizer) - # optimizer = ShardedOptimizerV2(model, optimizer, initial_scale=2**5) criterion = LossForPretraining(config.vocab_size) # build dataloader pretrain_dataset_provider = NvidiaBertDatasetProvider(args) +<<<<<<< HEAD + +======= # initialize with colossalai engine, _, _, lr_scheduelr = colossalai.initialize(model=model, optimizer=optimizer, criterion=criterion, lr_scheduler=lr_scheduler) +>>>>>>> 52a933e17509c71811e919b165de38cb3d5d6d41 logger.info(get_mem_info(prefix='After init model, ')) best_loss = None @@ -156,9 +254,15 @@ def main(): else: iterator_data = enumerate(dataset_iterator) +<<<<<<< HEAD + model.train() + + for step, batch_data in iterator_data: +======= engine.train() for step, batch_data in iterator_data: +>>>>>>> 52a933e17509c71811e919b165de38cb3d5d6d41 # batch_data = pretrain_dataset_provider.get_batch(batch_index) input_ids = batch_data[0].cuda(f"cuda:{torch.cuda.current_device()}") @@ -167,18 +271,31 @@ def main(): mlm_label = batch_data[3].cuda(f"cuda:{torch.cuda.current_device()}") # nsp_label = batch_data[5].cuda() +<<<<<<< HEAD + output = model(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask) + + loss = criterion(output.logits, mlm_label) +======= output = engine(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask) loss = engine.criterion(output.logits, mlm_label) +>>>>>>> 52a933e17509c71811e919b165de38cb3d5d6d41 pretrain_dataset_provider.prefetch_batch() - engine.backward(loss) + optimizer.backward(loss) train_loss += loss.float().item() # if (step + 1) % args.accumulation_step == 0: +<<<<<<< HEAD + optimizer.step() + lr_scheduler.step() + optimizer.zero_grad() + +======= engine.step() lr_scheduelr.step() engine.zero_grad() +>>>>>>> 52a933e17509c71811e919b165de38cb3d5d6d41 global_step += 1 if global_step % args.log_interval == 0 and global_step != 0 \ @@ -189,7 +306,7 @@ def main(): numel, args, config, elapsed_time, global_step, world_size) cur_loss = train_loss / args.log_interval - current_lr = lr_scheduelr.get_last_lr()[0] + current_lr = lr_scheduler.get_last_lr()[0] log_str = f'| epoch: {epoch} | shard: {shard} | step: {global_step} | lr {current_lr:.7f} | elapsed_time: {elapsed_time / 60 :.3f} minutes ' + \ f'| mins/batch: {elapsed_time_per_iteration :.3f} seconds | loss: {cur_loss:.7f} | ppl: {math.exp(cur_loss):.3f} | TFLOPS: {get_tflops_func(elapsed_time_per_iteration):.3f} or {tflops:.3f}' logger.info(log_str, print_=False) @@ -209,11 +326,18 @@ def main(): logger.info(f'epoch {epoch} shard 
{shard} has cost {timers("shard_time").elapsed() / 60 :.3f} mins') logger.info('*' * 100) +<<<<<<< HEAD + eval_loss += evaluate(model, args, logger, global_step, criterion) + save_ckpt(model, optimizer, lr_scheduler, os.path.join(args.ckpt_path, launch_time, f'epoch-{epoch}_shard-{shard}_' + launch_time), epoch, shard, global_step) + + +======= eval_loss += evaluate(engine, args, logger, global_step) save_ckpt(engine.model, optimizer, lr_scheduelr, os.path.join(args.ckpt_path, launch_time, f'epoch-{epoch}_shard-{shard}_' + launch_time), epoch, shard, global_step) +>>>>>>> 52a933e17509c71811e919b165de38cb3d5d6d41 eval_loss /= len(os.listdir(args.data_path_prefix)) logger.info( f'epoch {epoch} | shard_length {len(os.listdir(args.data_path_prefix))} | elapsed_time: {timers("epoch_time").elapsed() / 60 :.3f} mins' diff --git a/examples/language/roberta/requirements.txt b/examples/language/roberta/requirements.txt index 137a69e80..d351f362f 100644 --- a/examples/language/roberta/requirements.txt +++ b/examples/language/roberta/requirements.txt @@ -1,2 +1,7 @@ colossalai >= 0.1.12 torch >= 1.8.1 +tqdm +tensorboard +numpy +h5py +wandb \ No newline at end of file
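With `--colossal_config` removed from both launch scripts and `--distplan`, `--tp_degree`, `--placement`, and `--shardinit` added in `arguments.py`, a run now selects the distributed plan on the command line. A hypothetical invocation sketch: only the flags touched by this patch are spelled out, the remaining arguments (data/checkpoint/tensorboard paths, learning rate, batch size, `--wandb`, etc.) are assumed unchanged from `run_pretrain.sh`, and the entry script is assumed to be `run_pretraining.py`:

```bash
# Hypothetical launch; every flag not shown here stays exactly as in run_pretrain.sh.
env OMP_NUM_THREADS=40 colossalai run --hostfile ./hostfile \
    run_pretraining.py \
    --distplan CAI_Gemini \
    --tp_degree 1 \
    --placement cpu \
    --log_interval 50 \
    --mlm bert
    # ... plus the unchanged path arguments and training hyperparameters
```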