From ab5fd127e393f7298c6497fd638d0f89c53a9452 Mon Sep 17 00:00:00 2001
From: mandoxzhang <111039218+mandoxzhang@users.noreply.github.com>
Date: Fri, 7 Apr 2023 10:34:51 +0800
Subject: [PATCH] [example] update roberta with newer ColossalAI (#3472)
* update roberta example
* update roberta example
---
examples/language/roberta/README.md | 16 +-
.../roberta/configs/colossalai_ddp.py | 9 --
.../roberta/configs/colossalai_zero.py | 37 -----
.../roberta/preprocessing/get_mask.py | 7 +-
.../roberta/preprocessing/sentence_split.py | 19 +--
.../roberta/preprocessing/tokenize_mask.py | 12 +-
.../language/roberta/pretraining/arguments.py | 24 +++
.../roberta/pretraining/evaluation.py | 12 +-
.../roberta/pretraining/pretrain_utils.py | 4 +-
.../roberta/pretraining/run_pretrain.sh | 2 -
.../pretraining/run_pretrain_resume.sh | 2 -
.../roberta/pretraining/run_pretraining.py | 142 ++++++++++++++++--
examples/language/roberta/requirements.txt | 5 +
13 files changed, 188 insertions(+), 103 deletions(-)
delete mode 100644 examples/language/roberta/configs/colossalai_ddp.py
delete mode 100644 examples/language/roberta/configs/colossalai_zero.py
diff --git a/examples/language/roberta/README.md b/examples/language/roberta/README.md
index a42b1935d..0e080d009 100644
--- a/examples/language/roberta/README.md
+++ b/examples/language/roberta/README.md
@@ -1,9 +1,9 @@
# Introduction
-This repo introduce how to pretrain a chinese roberta-large from scratch, including preprocessing, pretraining, finetune. The repo can help you quickly train a high-quality bert.
+This example shows how to pretrain RoBERTa from scratch, covering preprocessing, pretraining, and fine-tuning. It can help you quickly train a high-quality RoBERTa model.
## 0. Prerequisite
- Install Colossal-AI
-- Editing the port from /etc/ssh/sshd_config and /etc/ssh/ssh_config, every host expose the same ssh port of server and client. If you are a root user, you also set the **PermitRootLogin** from /etc/ssh/sshd_config to "yes"
+- Edit the port in `/etc/ssh/sshd_config` and `/etc/ssh/ssh_config` so that every host exposes the same SSH port for both the server and the client. If you are the root user, also set **PermitRootLogin** in `/etc/ssh/sshd_config` to "yes"
- Ensure that each host can log in to every other host without a password. If you have n hosts, this needs to be done n^2 times
```
@@ -33,7 +33,7 @@ service ssh restart
```bash
cd preprocessing
```
-following the `README.md`, preprocess original corpus to h5py+numpy
+Following the `README.md` there, preprocess the original corpus into h5py + numpy files
## 2. Pretrain
@@ -47,12 +47,4 @@ following the `README.md`, load the h5py generated by preprocess of step 1 to pr
The checkpoint produced by this repo can replace `pytorch_model.bin` from [hfl/chinese-roberta-wwm-ext-large](https://huggingface.co/hfl/chinese-roberta-wwm-ext-large/tree/main) directly. Then use the transformers library from Hugging Face to fine-tune downstream applications.
## Contributors
-The repo is contributed by AI team from [Moore Threads](https://www.mthreads.com/). If you find any problems for pretraining, please file an issue or send an email to yehua.zhang@mthreads.com. At last, welcome any form of contribution!
-
-```
-@misc{
- title={A simple Chinese RoBERTa Example for Whole Word Masked},
- author={Yehua Zhang, Chen Zhang},
- year={2022}
-}
-```
+The example is contributed by the AI team from [Moore Threads](https://www.mthreads.com/). If you encounter any problems during pretraining, please file an issue or send an email to yehua.zhang@mthreads.com. Finally, any form of contribution is welcome!
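For reference, here is a minimal sketch (not part of the patch) of the finetuning hand-off described in the README hunk above: loading a checkpoint produced by this example into a Hugging Face model. The checkpoint path is hypothetical, and the state dict is assumed to match the base model's layout.

```python
# Sketch only: hand a pretraining checkpoint over to transformers for finetuning.
import torch
from transformers import AutoModelForMaskedLM, AutoTokenizer

base = "hfl/chinese-roberta-wwm-ext-large"        # base config/tokenizer named in the README
tokenizer = AutoTokenizer.from_pretrained(base)
model = AutoModelForMaskedLM.from_pretrained(base)

# Replace the weights with the ones produced by pretraining (hypothetical path,
# assuming the checkpoint file holds the model state dict).
state_dict = torch.load("path/to/pretrained_checkpoint.bin", map_location="cpu")
model.load_state_dict(state_dict, strict=False)   # strict=False tolerates head/name mismatches
model.eval()
```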
diff --git a/examples/language/roberta/configs/colossalai_ddp.py b/examples/language/roberta/configs/colossalai_ddp.py
deleted file mode 100644
index 3146ffc45..000000000
--- a/examples/language/roberta/configs/colossalai_ddp.py
+++ /dev/null
@@ -1,9 +0,0 @@
-from colossalai.nn.optimizer import FusedAdam
-
-try:
- from colossalai.zero.shard_utils import TensorShardStrategy
-except ImportError:
- # colossalai > 0.2.8
- from colossalai.zero.legacy import TensorShardStrategy
-
-clip_grad_norm = 1.0
diff --git a/examples/language/roberta/configs/colossalai_zero.py b/examples/language/roberta/configs/colossalai_zero.py
deleted file mode 100644
index bae4c723c..000000000
--- a/examples/language/roberta/configs/colossalai_zero.py
+++ /dev/null
@@ -1,37 +0,0 @@
-from colossalai.nn.optimizer import FusedAdam
-
-try:
- from colossalai.zero.shard_utils import TensorShardStrategy
-except ImportError:
- # colossalai > 0.2.8
- from colossalai.zero.legacy import TensorShardStrategy
-
-# fp16 = dict(
-# mode=AMP_TYPE.TORCH,
-# )
-
-# seed = 2
-zero = dict(model_config=dict(shard_strategy=TensorShardStrategy(),
- reduce_scatter_bucket_size_mb=25,
- fp32_reduce_scatter=False,
- tensor_placement_policy="cuda",
- gradient_predivide_factor=1.0,
- reuse_fp16_shard=False),
- optimizer_config=dict(gpu_margin_mem_ratio=0.8,
- initial_scale=2**5,
- min_scale=1,
- growth_factor=2,
- backoff_factor=0.5,
- growth_interval=1000,
- hysteresis=2,
- max_scale=2**32))
-
-# gradient_accumulation = 4
-clip_grad_norm = 1.0
-optimizer = dict(
- type=FusedAdam,
- lr=0.00015,
- weight_decay=1e-2,
-)
-
-# 64433
diff --git a/examples/language/roberta/preprocessing/get_mask.py b/examples/language/roberta/preprocessing/get_mask.py
index da297f98e..869ef2cb3 100644
--- a/examples/language/roberta/preprocessing/get_mask.py
+++ b/examples/language/roberta/preprocessing/get_mask.py
@@ -163,16 +163,15 @@ class PreTrainingDataset():
def get_new_segment(self, segment):
"""
- 输入一句话,返回一句经过处理的话: 为了支持中文全称mask,将被分开的词,将上特殊标记("#"),使得后续处理模块,能够知道哪些字是属于同一个词的。
- :param segment: 一句话
- :return: 一句处理过的话
+        Input a sentence and return the processed sentence: to support Chinese whole word masking, the characters of each segmented word are tagged with a special mark ("#") so that subsequent processing modules know which characters belong to the same word.
+        :param segment: a sentence
+        :return: the processed sentence
"""
seq_cws = jieba.lcut(''.join(segment))
seq_cws_dict = {x: 1 for x in seq_cws}
new_segment = []
i = 0
while i < len(segment):
- if len(self.rec.findall(segment[i])) == 0: # 不是中文的,原文加进去。
+            if len(self.rec.findall(segment[i])) == 0:    # not a Chinese character, keep the original text
new_segment.append(segment[i])
i += 1
continue
diff --git a/examples/language/roberta/preprocessing/sentence_split.py b/examples/language/roberta/preprocessing/sentence_split.py
index 231be152b..f0ed83f90 100644
--- a/examples/language/roberta/preprocessing/sentence_split.py
+++ b/examples/language/roberta/preprocessing/sentence_split.py
@@ -10,26 +10,19 @@ import argparse
import functools
def split_sentence(document: str, flag: str = "all", limit: int = 510) -> List[str]:
- """
- Args:
- document:
- flag: Type:str, "all" 中英文标点分句,"zh" 中文标点分句,"en" 英文标点分句
- limit: 默认单句最大长度为510个字符
- Returns: Type:list
- """
sent_list = []
try:
if flag == "zh":
-            document = re.sub('(?P<quotation_mark>([。？！…](?![”’"\'])))', r'\g<quotation_mark>\n', document)    # 单字符断句符
-            document = re.sub('(?P<quotation_mark>([。？！]|…{1,2})[”’"\'])', r'\g<quotation_mark>\n', document)    # 特殊引号
+            document = re.sub('(?P<quotation_mark>([。？！…](?![”’"\'])))', r'\g<quotation_mark>\n', document)
+            document = re.sub('(?P<quotation_mark>([。？！]|…{1,2})[”’"\'])', r'\g<quotation_mark>\n', document)
        elif flag == "en":
-            document = re.sub('(?P<quotation_mark>([.?!](?![”’"\'])))', r'\g<quotation_mark>\n', document)    # 英文单字符断句符
-            document = re.sub('(?P<quotation_mark>([?!.]["\']))', r'\g<quotation_mark>\n', document)    # 特殊引号
+            document = re.sub('(?P<quotation_mark>([.?!](?![”’"\'])))', r'\g<quotation_mark>\n', document)
+            document = re.sub('(?P<quotation_mark>([?!.]["\']))', r'\g<quotation_mark>\n', document)    # Special quotation marks
        else:
-            document = re.sub('(?P<quotation_mark>([。？！….?!](?![”’"\'])))', r'\g<quotation_mark>\n', document)    # 单字符断句符
+            document = re.sub('(?P<quotation_mark>([。？！….?!](?![”’"\'])))', r'\g<quotation_mark>\n', document)
            document = re.sub('(?P<quotation_mark>(([。？！.!?]|…{1,2})[”’"\']))', r'\g<quotation_mark>\n',
-                              document)    # 特殊引号
+                              document)    # Special quotation marks
sent_list_ori = document.splitlines()
for sent in sent_list_ori:
diff --git a/examples/language/roberta/preprocessing/tokenize_mask.py b/examples/language/roberta/preprocessing/tokenize_mask.py
index b33871d5d..76c74868e 100644
--- a/examples/language/roberta/preprocessing/tokenize_mask.py
+++ b/examples/language/roberta/preprocessing/tokenize_mask.py
@@ -15,8 +15,8 @@ from get_mask import PreTrainingDataset
def get_raw_instance(document, max_sequence_length=512):
"""
- 获取初步的训练实例,将整段按照max_sequence_length切分成多个部分,并以多个处理好的实例的形式返回。
- :param document: 一整段
+    Get the initial training instances: split the whole document into multiple parts according to max_sequence_length and return them as a list of processed instances.
+    :param document: a whole document, given as a list of sentences
:param max_sequence_length:
:return: a list. each element is a sequence of text
"""
@@ -26,10 +26,9 @@ def get_raw_instance(document, max_sequence_length=512):
sizes = [len(seq) for seq in document]
result_list = []
- curr_seq = [] # 当前处理的序列
+    curr_seq = []    # the sequence currently being built
sz_idx = 0
while sz_idx < len(sizes):
-        # 当前句子加上新的句子，如果长度小于最大限制，则合并当前句子和新句子；否则即超过了最大限制，那么做为一个新的序列加到目标列表中
+        # If the current sequence plus the new sentence still fits within the limit, merge them;
+        # otherwise the limit is exceeded, so the current sequence is appended to the result list and a new one is started
if len(curr_seq) + sizes[sz_idx] <= max_sequence_length_allowed: # or len(curr_seq)==0:
curr_seq += document[sz_idx]
@@ -43,14 +42,13 @@ def get_raw_instance(document, max_sequence_length=512):
else:
result_list.append(curr_seq)
curr_seq = []
-    # 对最后一个序列进行处理，如果太短的话，丢弃掉。
+    # handle the last sequence: if it is too short, discard it
if len(curr_seq) > max_sequence_length_allowed / 2: # /2
result_list.append(curr_seq)
- # # 计算总共可以得到多少份
# num_instance=int(len(big_list)/max_sequence_length_allowed)+1
# print("num_instance:",num_instance)
- # # 切分成多份,添加到列表中
+
# result_list=[]
# for j in range(num_instance):
# index=j*max_sequence_length_allowed
diff --git a/examples/language/roberta/pretraining/arguments.py b/examples/language/roberta/pretraining/arguments.py
index 3a9370e00..87fa8dd8a 100644
--- a/examples/language/roberta/pretraining/arguments.py
+++ b/examples/language/roberta/pretraining/arguments.py
@@ -6,6 +6,30 @@ __all__ = ['parse_args']
def parse_args():
parser = colossalai.get_default_parser()
+
+ parser.add_argument(
+ "--distplan",
+ type=str,
+ default='CAI_Gemini',
+ help="The distributed plan [colossalai, zero1, zero2, torch_ddp, torch_zero].",
+ )
+ parser.add_argument(
+ "--tp_degree",
+ type=int,
+ default=1,
+ help="Tensor Parallelism Degree. Valid when using colossalai as dist plan.",
+ )
+ parser.add_argument(
+ "--placement",
+ type=str,
+ default='cpu',
+ help="Placement Policy for Gemini. Valid when using colossalai as dist plan.",
+ )
+ parser.add_argument(
+ "--shardinit",
+ action='store_true',
+ help="Shard the tensors when init the model to shrink peak memory size on the assigned device. Valid when using colossalai as dist plan.",
+ )
parser.add_argument(
'--lr',
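These new flags are consumed by the Gemini/ZeRO wrapping path added to `run_pretraining.py` later in this patch. As a rough orientation, here is a condensed sketch of that flow (assuming the ColossalAI version targeted by this patch; the tiny RobertaConfig and hyperparameters are illustrative only, and the script would be launched with `colossalai run`/torchrun):

```python
# Condensed from the run_pretraining.py changes below; a sketch, not a drop-in script.
import torch
import colossalai
from transformers import RobertaConfig, RobertaForMaskedLM
from colossalai.nn.optimizer import HybridAdam
from colossalai.nn.parallel import zero_model_wrapper, zero_optim_wrapper
from colossalai.utils import get_current_device
from colossalai.utils.model.colo_init_context import ColoInitContext

colossalai.launch_from_torch(config={})

# Build the model inside ColoInitContext so its parameters become ColoTensors.
with ColoInitContext(device=get_current_device(), dtype=torch.half):
    model = RobertaForMaskedLM(RobertaConfig(hidden_size=256, num_hidden_layers=2,
                                             num_attention_heads=4, intermediate_size=512))

optimizer = HybridAdam(model.parameters(), lr=1e-4)

# --distplan CAI_Gemini maps to ZeRO stage 3 plus a Gemini config;
# --placement selects where Gemini keeps parameters ('cpu' or 'cuda').
gemini_config = dict(device=get_current_device(), placement_policy="cpu", pin_memory=True,
                     hidden_dim=model.config.hidden_size, search_range_mb=128)
model = zero_model_wrapper(model, 3, gemini_config)
optimizer = zero_optim_wrapper(model, optimizer, optim_config=dict(gpu_margin_mem_ratio=0.0))
```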
diff --git a/examples/language/roberta/pretraining/evaluation.py b/examples/language/roberta/pretraining/evaluation.py
index 83f94082f..8fc019c12 100644
--- a/examples/language/roberta/pretraining/evaluation.py
+++ b/examples/language/roberta/pretraining/evaluation.py
@@ -5,11 +5,11 @@ from tqdm import tqdm
from utils.global_vars import get_timers, get_tensorboard_writer
from nvidia_bert_dataset_provider import NvidiaBertDatasetProvider
-def evaluate(engine, args, logger, global_step):
+def evaluate(model, args, logger, global_step, criterion):
evaluate_dataset_provider = NvidiaBertDatasetProvider(args, evaluate=True)
start_shard = 0
- engine.eval()
+ model.eval()
timers = get_timers()
eval_step = 0
eval_loss = 0
@@ -39,9 +39,9 @@ def evaluate(engine, args, logger, global_step):
mlm_label = batch_data[3].cuda()
# nsp_label = batch_data[5].cuda()
- output = engine(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)
+ output = model(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)
- loss = engine.criterion(output.logits, mlm_label)#prediction_scores
+ loss = criterion(output.logits, mlm_label)#prediction_scores
evaluate_dataset_provider.prefetch_batch()
eval_loss += loss.float().item()
@@ -67,5 +67,5 @@ def evaluate(engine, args, logger, global_step):
logger.info('')
evaluate_dataset_provider.release_shard()
- engine.train()
- return cur_loss
+ model.train()
+ return cur_loss
\ No newline at end of file
diff --git a/examples/language/roberta/pretraining/pretrain_utils.py b/examples/language/roberta/pretraining/pretrain_utils.py
index ba17b0f5e..54fc2affe 100644
--- a/examples/language/roberta/pretraining/pretrain_utils.py
+++ b/examples/language/roberta/pretraining/pretrain_utils.py
@@ -5,7 +5,7 @@ from transformers import get_linear_schedule_with_warmup
from transformers import BertForPreTraining, RobertaForMaskedLM, RobertaConfig
from transformers import GPT2Config, GPT2LMHeadModel
from transformers import AutoTokenizer, AutoModelForMaskedLM
-from colossalai.nn.optimizer import FusedAdam
+from colossalai.nn.optimizer import FusedAdam, HybridAdam
from torch.optim import AdamW
from colossalai.core import global_context as gpc
import torch
@@ -83,7 +83,7 @@ def get_optimizer(model, lr):
'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
'weight_decay': 0.0
}]
- optimizer = FusedAdam(optimizer_grouped_parameters, lr=lr, betas=[0.9, 0.95])
+ optimizer = HybridAdam(optimizer_grouped_parameters, lr=lr, betas=[0.9, 0.95])
return optimizer
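On the FusedAdam-to-HybridAdam swap above: HybridAdam can update parameters and optimizer states kept on either CPU or GPU, which fits the Gemini placement policies used elsewhere in this patch, while FusedAdam is GPU-only. A minimal, illustrative construction (the model and hyperparameter values below are placeholders, not the example's defaults):

```python
# Illustrative only; the example itself builds parameter groups with per-group weight decay.
import torch
from colossalai.nn.optimizer import HybridAdam

model = torch.nn.Linear(1024, 1024)    # placeholder model
optimizer = HybridAdam(model.parameters(), lr=1e-4, betas=(0.9, 0.95), weight_decay=1e-2)
```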
diff --git a/examples/language/roberta/pretraining/run_pretrain.sh b/examples/language/roberta/pretraining/run_pretrain.sh
index 144cd0ab9..38fdefe0a 100644
--- a/examples/language/roberta/pretraining/run_pretrain.sh
+++ b/examples/language/roberta/pretraining/run_pretrain.sh
@@ -7,7 +7,6 @@ tensorboard_path="$root_path/tensorboard"
log_path="$root_path/exp_log"
ckpt_path="$root_path/ckpt"
-colossal_config="$root_path/../configs/colossalai_ddp.py"
mkdir -p $tensorboard_path
mkdir -p $log_path
@@ -32,7 +31,6 @@ env OMP_NUM_THREADS=40 colossalai run --hostfile ./hostfile \
--tensorboard_path $tensorboard_path \
--log_path $log_path \
--ckpt_path $ckpt_path \
- --colossal_config $colossal_config \
--log_interval 50 \
--mlm bert \
--wandb \
diff --git a/examples/language/roberta/pretraining/run_pretrain_resume.sh b/examples/language/roberta/pretraining/run_pretrain_resume.sh
index a0704cf7c..351c98d3e 100644
--- a/examples/language/roberta/pretraining/run_pretrain_resume.sh
+++ b/examples/language/roberta/pretraining/run_pretrain_resume.sh
@@ -7,7 +7,6 @@ tensorboard_path="$root_path/tensorboard"
log_path="$root_path/exp_log"
ckpt_path="$root_path/ckpt"
-colossal_config="$root_path/../configs/colossalai_ddp.py"
mkdir -p $tensorboard_path
mkdir -p $log_path
@@ -32,7 +31,6 @@ env OMP_NUM_THREADS=40 colossalai run --hostfile ./hostfile \
--tensorboard_path $tensorboard_path \
--log_path $log_path \
--ckpt_path $ckpt_path \
- --colossal_config $colossal_config \
--log_interval 50 \
--mlm bert \
--wandb \
diff --git a/examples/language/roberta/pretraining/run_pretraining.py b/examples/language/roberta/pretraining/run_pretraining.py
index eef7bb6ad..6e3cae6ec 100644
--- a/examples/language/roberta/pretraining/run_pretraining.py
+++ b/examples/language/roberta/pretraining/run_pretraining.py
@@ -4,9 +4,31 @@ import time
from functools import partial
import torch
+from tqdm import tqdm
+import os
+import time
+from functools import partial
+from transformers import AutoTokenizer
+
+import colossalai
+from colossalai.context import ParallelMode
+from colossalai.core import global_context as gpc
+from colossalai.nn.parallel import GeminiDDP, zero_model_wrapper, zero_optim_wrapper
+from colossalai.utils import get_current_device
+from colossalai.utils.model.colo_init_context import ColoInitContext
+from colossalai.zero import ZeroOptimizer
+from colossalai.tensor import ColoParameter, ComputePattern, ComputeSpec, ProcessGroup, ReplicaSpec, ShardSpec
+
from arguments import parse_args
from evaluation import evaluate
from loss import LossForPretraining
from nvidia_bert_dataset_provider import NvidiaBertDatasetProvider
from pretrain_utils import get_lr_scheduler, get_model, get_optimizer, save_ckpt
from tqdm import tqdm
@@ -27,6 +49,7 @@ from colossalai.zero import ZeroOptimizer
from colossalai.zero.gemini import ChunkManager, ColoInitContext, GeminiManager
from colossalai.zero.legacy import ShardedModelV2, ShardedOptimizerV2, ZeroInitContext
from colossalai.zero.legacy.shard_utils import TensorShardStrategy
def main():
@@ -36,8 +59,13 @@ def main():
tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_path)
-    os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
+    # os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
+
logger = Logger(os.path.join(args.log_path, launch_time), cuda=torch.cuda.is_available(), debug=args.vscode_debug)
if args.vscode_debug:
@@ -50,7 +78,11 @@ def main():
args.local_rank = -1
args.log_interval = 1
else:
-        colossalai.launch_from_torch(args.colossal_config)    # args.colossal_config
+        colossalai.launch_from_torch(config={})    # args.colossal_config
args.local_rank = int(os.environ["LOCAL_RANK"])
logger.info(
f'launch_from_torch, world size: {torch.distributed.get_world_size()} | ' +
@@ -61,32 +93,94 @@ def main():
args.tokenizer = tokenizer
args.logger = logger
set_global_variables(launch_time, args.tensorboard_path)
-    use_zero = hasattr(gpc.config, 'zero')
+
world_size = torch.distributed.get_world_size()
+ init_dev = get_current_device()
# build model, optimizer and criterion
+ if args.distplan.startswith("CAI"):
+        # all parameters must use the same process group.
+ world_size = torch.distributed.get_world_size()
+ shard_pg = ProcessGroup(tp_degree=world_size) if args.shardinit else None
+ default_dist_spec = ShardSpec([-1], [world_size]) if args.shardinit else None
+
+ if args.shardinit and args.distplan != "CAI_Gemini":
+ raise RuntimeError("You can only use shardinit with CAI_Gemini")
+
+        # build the model
+ with ColoInitContext(device=get_current_device(),
+ dtype=torch.half,
+ default_dist_spec=default_dist_spec,
+ default_pg=shard_pg):
-    if use_zero:
-        shard_strategy = TensorShardStrategy()
-        with ZeroInitContext(target_device=torch.cuda.current_device(), shard_strategy=shard_strategy,
-                             shard_param=True):
config, model, numel = get_model(args, logger)
- # model = ShardedModelV2(model, shard_strategy, tensor_placement_policy='cpu', reuse_fp16_shard=True)
+
+    # assign running configurations
+ gemini_config = None
+ if args.distplan.startswith("CAI_ZeRO"):
+ optim_config = dict(reduce_bucket_size=12 * 1024 * 1024, overlap_communication=True, verbose=True)
+ elif args.distplan == "CAI_Gemini":
+ gemini_config = dict(strict_ddp_mode=args.tp_degree == 1,
+ device=get_current_device(),
+ placement_policy=args.placement,
+ pin_memory=True,
+ hidden_dim=model.config.hidden_size,
+ search_range_mb=128)
+ optim_config = dict(gpu_margin_mem_ratio=0.)
+ else:
+ raise RuntimeError
+
+ # build a highly optimized gpu/cpu optimizer
+ optimizer = get_optimizer(model, lr=args.lr)
+
+ if args.distplan == "CAI_ZeRO1":
+ zero_stage = 1
+ elif args.distplan == "CAI_ZeRO2":
+ zero_stage = 2
+ elif args.distplan == "CAI_Gemini":
+ zero_stage = 3
+ else:
+ raise RuntimeError
+
+ # wrap your model and optimizer
+ model = zero_model_wrapper(model, zero_stage, gemini_config)
+ optimizer = zero_optim_wrapper(model, optimizer, optim_config=optim_config)
+
+ logger.info(get_mem_info(prefix='After init optim, '))
+
else:
config, model, numel = get_model(args, logger)
logger.info("no_zero")
+
if torch.distributed.get_rank() == 0:
os.mkdir(os.path.join(args.ckpt_path, launch_time))
logger.info(f'Model numel: {numel}')
get_tflops_func = partial(get_tflops, numel, args.train_micro_batch_size_per_gpu, args.max_seq_length)
-    # len(dataloader)
-    steps_per_epoch = 144003367 // world_size // args.train_micro_batch_size_per_gpu // args.gradient_accumulation_steps // args.refresh_bucket_size
+
+    # 144003367 is the length of the entire dataset
+    steps_per_epoch = 144003367 // world_size // args.train_micro_batch_size_per_gpu // args.gradient_accumulation_steps // args.refresh_bucket_size    # len(dataloader)
total_steps = steps_per_epoch * args.epoch
- # build optimizer and lr_scheduler
+ lr_scheduler = get_lr_scheduler(optimizer, total_steps=total_steps, last_epoch=-1)
start_epoch = 0
start_shard = 0
@@ -95,7 +189,6 @@ def main():
assert os.path.exists(args.load_optimizer_lr)
o_l_state_dict = torch.load(args.load_optimizer_lr, map_location='cpu')
o_l_state_dict['lr_scheduler']['last_epoch'] = o_l_state_dict['lr_scheduler']['last_epoch'] - 1
- optimizer = get_optimizer(model, lr=args.lr)
optimizer.load_state_dict(o_l_state_dict['optimizer'])
# o_l_state_dict['lr_scheduler']['last_epoch']
lr_scheduler = get_lr_scheduler(optimizer,
@@ -105,33 +198,38 @@ def main():
for k, v in state.items():
if isinstance(v, torch.Tensor):
state[k] = v.cuda(f"cuda:{torch.cuda.current_device()}")
- # if you want delete the above three code, have to move the model to gpu, because in optimizer.step()
+            # if you want to remove the three lines above, you must move the model to the GPU first, because optimizer.step() expects the optimizer states and the parameters to be on the same device
lr_scheduler.load_state_dict(o_l_state_dict['lr_scheduler'])
start_epoch = o_l_state_dict['epoch']
start_shard = o_l_state_dict['shard'] + 1
# global_step = o_l_state_dict['global_step'] + 1
-        logger.info(
-            f'resume from epoch {start_epoch} shard {start_shard} step {lr_scheduler.last_epoch} lr {lr_scheduler.get_last_lr()[0]}'
-        )
-    else:
-        optimizer = get_optimizer(model, lr=args.lr)
-        lr_scheduler = get_lr_scheduler(optimizer, total_steps=total_steps, last_epoch=-1)
+        logger.info(f'resume from epoch {start_epoch} shard {start_shard} step {lr_scheduler.last_epoch} lr {lr_scheduler.get_last_lr()[0]}')
- # optimizer = gpc.config.optimizer.pop('type')(
- # model.parameters(), **gpc.config.optimizer)
- # optimizer = ShardedOptimizerV2(model, optimizer, initial_scale=2**5)
criterion = LossForPretraining(config.vocab_size)
# build dataloader
pretrain_dataset_provider = NvidiaBertDatasetProvider(args)
-    # initialize with colossalai
-    engine, _, _, lr_scheduelr = colossalai.initialize(model=model,
-                                                       optimizer=optimizer,
-                                                       criterion=criterion,
-                                                       lr_scheduler=lr_scheduler)
+
logger.info(get_mem_info(prefix='After init model, '))
best_loss = None
@@ -156,9 +254,15 @@ def main():
else:
iterator_data = enumerate(dataset_iterator)
-        engine.train()
+        model.train()
+
for step, batch_data in iterator_data:
# batch_data = pretrain_dataset_provider.get_batch(batch_index)
input_ids = batch_data[0].cuda(f"cuda:{torch.cuda.current_device()}")
@@ -167,18 +271,31 @@ def main():
mlm_label = batch_data[3].cuda(f"cuda:{torch.cuda.current_device()}")
# nsp_label = batch_data[5].cuda()
-            output = engine(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)
-            loss = engine.criterion(output.logits, mlm_label)
+            output = model(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)
+            loss = criterion(output.logits, mlm_label)
pretrain_dataset_provider.prefetch_batch()
- engine.backward(loss)
+ optimizer.backward(loss)
train_loss += loss.float().item()
# if (step + 1) % args.accumulation_step == 0:
-            engine.step()
-            lr_scheduelr.step()
-            engine.zero_grad()
+            optimizer.step()
+            lr_scheduler.step()
+            optimizer.zero_grad()
+
global_step += 1
if global_step % args.log_interval == 0 and global_step != 0 \
@@ -189,7 +306,7 @@ def main():
numel, args, config, elapsed_time, global_step, world_size)
cur_loss = train_loss / args.log_interval
- current_lr = lr_scheduelr.get_last_lr()[0]
+ current_lr = lr_scheduler.get_last_lr()[0]
log_str = f'| epoch: {epoch} | shard: {shard} | step: {global_step} | lr {current_lr:.7f} | elapsed_time: {elapsed_time / 60 :.3f} minutes ' + \
f'| mins/batch: {elapsed_time_per_iteration :.3f} seconds | loss: {cur_loss:.7f} | ppl: {math.exp(cur_loss):.3f} | TFLOPS: {get_tflops_func(elapsed_time_per_iteration):.3f} or {tflops:.3f}'
logger.info(log_str, print_=False)
@@ -209,11 +326,18 @@ def main():
logger.info(f'epoch {epoch} shard {shard} has cost {timers("shard_time").elapsed() / 60 :.3f} mins')
logger.info('*' * 100)
-        eval_loss += evaluate(engine, args, logger, global_step)
-        save_ckpt(engine.model, optimizer, lr_scheduelr,
-                  os.path.join(args.ckpt_path, launch_time, f'epoch-{epoch}_shard-{shard}_' + launch_time), epoch,
-                  shard, global_step)
+        eval_loss += evaluate(model, args, logger, global_step, criterion)
+        save_ckpt(model, optimizer, lr_scheduler,
+                  os.path.join(args.ckpt_path, launch_time, f'epoch-{epoch}_shard-{shard}_' + launch_time), epoch,
+                  shard, global_step)
eval_loss /= len(os.listdir(args.data_path_prefix))
logger.info(
f'epoch {epoch} | shard_length {len(os.listdir(args.data_path_prefix))} | elapsed_time: {timers("epoch_time").elapsed() / 60 :.3f} mins'
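To summarize the engine-to-wrapper migration in the hunks above: the forward pass, backward pass, and update now go through the model and the ZeRO/Gemini-wrapped optimizer directly instead of the legacy engine. A condensed sketch of one training step (the function name and the pre-unpacked tensor arguments are mine, not the script's):

```python
# Sketch of the updated training step, condensed from the hunks above.
def train_step(model, optimizer, lr_scheduler, criterion,
               input_ids, token_type_ids, attention_mask, mlm_label):
    output = model(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)
    loss = criterion(output.logits, mlm_label)
    optimizer.backward(loss)    # replaces engine.backward(loss); the wrapper handles loss scaling
    optimizer.step()            # replaces engine.step()
    lr_scheduler.step()
    optimizer.zero_grad()       # replaces engine.zero_grad()
    return loss.float().item()
```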
diff --git a/examples/language/roberta/requirements.txt b/examples/language/roberta/requirements.txt
index 137a69e80..d351f362f 100644
--- a/examples/language/roberta/requirements.txt
+++ b/examples/language/roberta/requirements.txt
@@ -1,2 +1,7 @@
colossalai >= 0.1.12
torch >= 1.8.1
+tqdm
+tensorboard
+numpy
+h5py
+wandb
\ No newline at end of file