mirror of https://github.com/hpcaitech/ColossalAI
[example] update roberta with newer ColossalAI (#3472)
* update roberta example
parent fb8fae6f29
commit ab5fd127e3
@@ -1,9 +1,9 @@
 # Introduction
-This repo introduce how to pretrain a chinese roberta-large from scratch, including preprocessing, pretraining, finetune. The repo can help you quickly train a high-quality bert.
+This example introduces how to pretrain RoBERTa from scratch, including preprocessing, pretraining and finetuning. The example can help you quickly train a high-quality RoBERTa model.
 
 ## 0. Prerequisite
 - Install Colossal-AI
-- Editing the port from /etc/ssh/sshd_config and /etc/ssh/ssh_config, every host expose the same ssh port of server and client. If you are a root user, you also set the **PermitRootLogin** from /etc/ssh/sshd_config to "yes"
+- Edit the port in `/etc/ssh/sshd_config` and `/etc/ssh/ssh_config` so that every host exposes the same ssh port for both server and client. If you are a root user, also set **PermitRootLogin** in `/etc/ssh/sshd_config` to "yes"
 - Ensure that each host can log in to every other host without a password. If you have n hosts, this needs to be done n<sup>2</sup> times.
 
 ```
@@ -33,7 +33,7 @@ service ssh restart
 ```bash
 cd preprocessing
 ```
-following the `README.md`, preprocess original corpus to h5py+numpy
+Following the `README.md`, preprocess the original corpus into h5py plus numpy format.
 
 ## 2. Pretrain
 
@@ -47,12 +47,4 @@ following the `README.md`, load the h5py generated by preprocess of step 1 to pretrain
 The checkpoint produced by this repo can replace `pytorch_model.bin` from [hfl/chinese-roberta-wwm-ext-large](https://huggingface.co/hfl/chinese-roberta-wwm-ext-large/tree/main) directly. Then use transformers from Hugging Face to finetune downstream applications.
 
 ## Contributors
-The repo is contributed by AI team from [Moore Threads](https://www.mthreads.com/). If you find any problems for pretraining, please file an issue or send an email to yehua.zhang@mthreads.com. At last, welcome any form of contribution!
-
-```
-@misc{
-title={A simple Chinese RoBERTa Example for Whole Word Masked},
-author={Yehua Zhang, Chen Zhang},
-year={2022}
-}
-```
+The example is contributed by the AI team from [Moore Threads](https://www.mthreads.com/). If you find any problems with pretraining, please file an issue or send an email to yehua.zhang@mthreads.com. Any form of contribution is welcome!
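As a quick illustration of the finetuning note above, here is a minimal sketch of loading such a checkpoint with Hugging Face transformers. The local directory name is a placeholder of my own, not something defined by this example:

```python
# Minimal sketch, assuming the pretrained weights were saved as pytorch_model.bin
# inside ./roberta_ckpt together with the config and vocab files taken from
# hfl/chinese-roberta-wwm-ext-large. The directory name is hypothetical.
from transformers import AutoTokenizer, AutoModelForMaskedLM

tokenizer = AutoTokenizer.from_pretrained("hfl/chinese-roberta-wwm-ext-large")
model = AutoModelForMaskedLM.from_pretrained("./roberta_ckpt")

inputs = tokenizer("今天天气很好。", return_tensors="pt")
logits = model(**inputs).logits
print(logits.shape)  # (batch_size, sequence_length, vocab_size)
```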
@@ -1,9 +0,0 @@
-from colossalai.nn.optimizer import FusedAdam
-
-try:
-    from colossalai.zero.shard_utils import TensorShardStrategy
-except ImportError:
-    # colossalai > 0.2.8
-    from colossalai.zero.legacy import TensorShardStrategy
-
-clip_grad_norm = 1.0
@@ -1,37 +0,0 @@
-from colossalai.nn.optimizer import FusedAdam
-
-try:
-    from colossalai.zero.shard_utils import TensorShardStrategy
-except ImportError:
-    # colossalai > 0.2.8
-    from colossalai.zero.legacy import TensorShardStrategy
-
-# fp16 = dict(
-#     mode=AMP_TYPE.TORCH,
-# )
-
-# seed = 2
-zero = dict(model_config=dict(shard_strategy=TensorShardStrategy(),
-                              reduce_scatter_bucket_size_mb=25,
-                              fp32_reduce_scatter=False,
-                              tensor_placement_policy="cuda",
-                              gradient_predivide_factor=1.0,
-                              reuse_fp16_shard=False),
-            optimizer_config=dict(gpu_margin_mem_ratio=0.8,
-                                  initial_scale=2**5,
-                                  min_scale=1,
-                                  growth_factor=2,
-                                  backoff_factor=0.5,
-                                  growth_interval=1000,
-                                  hysteresis=2,
-                                  max_scale=2**32))
-
-# gradient_accumulation = 4
-clip_grad_norm = 1.0
-optimizer = dict(
-    type=FusedAdam,
-    lr=0.00015,
-    weight_decay=1e-2,
-)
-
-# 64433
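With the newer ColossalAI API targeted by this commit, the two legacy ZeRO config files deleted above are superseded by wrapping the model and optimizer directly in the training script. A condensed sketch of that pattern, mirroring the `zero_model_wrapper`/`zero_optim_wrapper` calls that appear later in this diff (the hard-coded placement policy and hyperparameter values below are illustrative assumptions):

```python
# Condensed sketch of the wrapper-based setup used later in this diff.
# Config keys mirror the new run_pretraining code shown in this commit;
# concrete values (placement "cpu", lr, weight decay) are illustrative.
from colossalai.nn.optimizer import HybridAdam
from colossalai.nn.parallel import zero_model_wrapper, zero_optim_wrapper
from colossalai.utils import get_current_device

def wrap_with_gemini(model, lr=0.00015):
    gemini_config = dict(strict_ddp_mode=True,
                         device=get_current_device(),
                         placement_policy="cpu",
                         pin_memory=True,
                         hidden_dim=model.config.hidden_size,  # assumes a Hugging Face model
                         search_range_mb=128)
    optim_config = dict(gpu_margin_mem_ratio=0.0)

    optimizer = HybridAdam(model.parameters(), lr=lr, weight_decay=1e-2)
    model = zero_model_wrapper(model, 3, gemini_config)    # zero stage 3 == Gemini
    optimizer = zero_optim_wrapper(model, optimizer, optim_config=optim_config)
    return model, optimizer
```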
@@ -163,16 +163,15 @@ class PreTrainingDataset():
 
     def get_new_segment(self, segment):
         """
-        输入一句话,返回一句经过处理的话: 为了支持中文全称mask,将被分开的词,将上特殊标记("#"),使得后续处理模块,能够知道哪些字是属于同一个词的。
-        :param segment: 一句话
-        :return: 一句处理过的话
+        Input a sentence and return a processed sentence: to support Chinese whole word masking, the pieces of a segmented word are marked with a special mark ("#") so that subsequent processing modules know which characters belong to the same word.
+        :param segment: a sentence
         """
         seq_cws = jieba.lcut(''.join(segment))
         seq_cws_dict = {x: 1 for x in seq_cws}
         new_segment = []
         i = 0
         while i < len(segment):
-            if len(self.rec.findall(segment[i])) == 0:  # 不是中文的,原文加进去。
+            if len(self.rec.findall(segment[i])) == 0:
                 new_segment.append(segment[i])
                 i += 1
                 continue
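To make the marking scheme described in the docstring concrete, here is a small standalone illustration (my own sketch, not the repo's function; it uses the common "##" prefix, while the exact marker used by the repo's code is not visible in this hunk):

```python
# Illustrative only: a minimal re-implementation of the whole-word marking idea
# described in the docstring above, not the repo's exact function.
# Assumption: non-leading characters of a jieba word get a "##" prefix.
import jieba

def mark_whole_words(chars):
    words = jieba.lcut("".join(chars))    # segment the character list into words
    marked = []
    for word in words:
        for k, ch in enumerate(word):
            marked.append(ch if k == 0 else "##" + ch)
    return marked

print(mark_whole_words(list("今天天气很好")))
# e.g. ['今', '##天', '天', '##气', '很', '好'], depending on jieba's segmentation
```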
@@ -10,26 +10,19 @@ import argparse
 import functools
 
 def split_sentence(document: str, flag: str = "all", limit: int = 510) -> List[str]:
     """
     Args:
         document:
         flag: Type:str, "all" 中英文标点分句,"zh" 中文标点分句,"en" 英文标点分句
        limit: 默认单句最大长度为510个字符
     Returns: Type:list
     """
     sent_list = []
     try:
         if flag == "zh":
-            document = re.sub('(?P<quotation_mark>([。?!…](?![”’"\'])))', r'\g<quotation_mark>\n', document)  # 单字符断句符
-            document = re.sub('(?P<quotation_mark>([。?!]|…{1,2})[”’"\'])', r'\g<quotation_mark>\n', document)  # 特殊引号
+            document = re.sub('(?P<quotation_mark>([。?!…](?![”’"\'])))', r'\g<quotation_mark>\n', document)
+            document = re.sub('(?P<quotation_mark>([。?!]|…{1,2})[”’"\'])', r'\g<quotation_mark>\n', document)
         elif flag == "en":
-            document = re.sub('(?P<quotation_mark>([.?!](?![”’"\'])))', r'\g<quotation_mark>\n', document)  # 英文单字符断句符
-            document = re.sub('(?P<quotation_mark>([?!.]["\']))', r'\g<quotation_mark>\n', document)  # 特殊引号
+            document = re.sub('(?P<quotation_mark>([.?!](?![”’"\'])))', r'\g<quotation_mark>\n', document)
+            document = re.sub('(?P<quotation_mark>([?!.]["\']))', r'\g<quotation_mark>\n', document)  # Special quotation marks
         else:
-            document = re.sub('(?P<quotation_mark>([。?!….?!](?![”’"\'])))', r'\g<quotation_mark>\n', document)  # 单字符断句符
+            document = re.sub('(?P<quotation_mark>([。?!….?!](?![”’"\'])))', r'\g<quotation_mark>\n', document)
 
             document = re.sub('(?P<quotation_mark>(([。?!.!?]|…{1,2})[”’"\']))', r'\g<quotation_mark>\n',
-                              document)  # 特殊引号
+                              document)  # Special quotation marks
 
         sent_list_ori = document.splitlines()
         for sent in sent_list_ori:
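A quick usage sketch for `split_sentence` as defined above (the sample text is my own, and it assumes the rest of the module is imported as in the file):

```python
# Illustrative call of split_sentence; the sample text is made up.
text = "今天天气很好。我们去公园散步吧!It is sunny today. Shall we go for a walk?"
for sentence in split_sentence(text, flag="all"):
    print(sentence)
# Expected behaviour: the document is split on Chinese and English sentence-ending
# punctuation, and each returned piece is capped at `limit` characters.
```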
@@ -15,8 +15,8 @@ from get_mask import PreTrainingDataset
 def get_raw_instance(document, max_sequence_length=512):
 
     """
-    获取初步的训练实例,将整段按照max_sequence_length切分成多个部分,并以多个处理好的实例的形式返回。
-    :param document: 一整段
+    Get the initial training instances: split the whole document into multiple parts according to max_sequence_length and return them as multiple processed instances.
+    :param document: a whole document
     :param max_sequence_length:
     :return: a list. each element is a sequence of text
     """
@@ -26,10 +26,9 @@ def get_raw_instance(document, max_sequence_length=512):
     sizes = [len(seq) for seq in document]
 
     result_list = []
-    curr_seq = []  # 当前处理的序列
+    curr_seq = []
     sz_idx = 0
     while sz_idx < len(sizes):
-        # 当前句子加上新的句子,如果长度小于最大限制,则合并当前句子和新句子;否则即超过了最大限制,那么做为一个新的序列加到目标列表中
 
         if len(curr_seq) + sizes[sz_idx] <= max_sequence_length_allowed:  # or len(curr_seq)==0:
             curr_seq += document[sz_idx]
@@ -43,14 +42,13 @@ def get_raw_instance(document, max_sequence_length=512):
         else:
             result_list.append(curr_seq)
             curr_seq = []
-    # 对最后一个序列进行处理,如果太短的话,丢弃掉。
 
     if len(curr_seq) > max_sequence_length_allowed / 2:  # /2
         result_list.append(curr_seq)
 
     # # 计算总共可以得到多少份
     # num_instance=int(len(big_list)/max_sequence_length_allowed)+1
     # print("num_instance:",num_instance)
     # # 切分成多份,添加到列表中
 
     # result_list=[]
     # for j in range(num_instance):
     #     index=j*max_sequence_length_allowed
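The loop above implements a simple greedy packing strategy: sentences are appended to the current sequence until the next one would exceed the length budget, at which point the sequence is flushed. A self-contained sketch of the same idea (my own simplification; the `- 2` budget for special tokens is an assumption, and the real function re-tries the overflowing sentence via `sz_idx` instead of carrying it over):

```python
# Standalone illustration of the greedy packing idea behind get_raw_instance.
# `document` is a list of tokenized sentences; the "- 2" budget (room for
# [CLS]/[SEP]) is an assumption, not taken from the hunk above.
def pack_greedily(document, max_sequence_length=512):
    budget = max_sequence_length - 2
    result_list, curr_seq = [], []
    for sentence in document:
        if len(curr_seq) + len(sentence) <= budget:
            curr_seq += sentence
        else:
            result_list.append(curr_seq)
            curr_seq = list(sentence)
    # keep the final, possibly short, sequence only if it is long enough
    if len(curr_seq) > budget / 2:
        result_list.append(curr_seq)
    return result_list

print([len(seq) for seq in pack_greedily([["a"] * 300, ["b"] * 300, ["c"] * 300])])  # [300, 300, 300]
```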
@@ -7,6 +7,30 @@ __all__ = ['parse_args']
 def parse_args():
     parser = colossalai.get_default_parser()
 
+    parser.add_argument(
+        "--distplan",
+        type=str,
+        default='CAI_Gemini',
+        help="The distributed plan [CAI_ZeRO1, CAI_ZeRO2, CAI_Gemini, torch_ddp, torch_zero].",
+    )
+    parser.add_argument(
+        "--tp_degree",
+        type=int,
+        default=1,
+        help="Tensor Parallelism Degree. Valid when using colossalai as dist plan.",
+    )
+    parser.add_argument(
+        "--placement",
+        type=str,
+        default='cpu',
+        help="Placement Policy for Gemini. Valid when using colossalai as dist plan.",
+    )
+    parser.add_argument(
+        "--shardinit",
+        action='store_true',
+        help="Shard the tensors when init the model to shrink peak memory size on the assigned device. Valid when using colossalai as dist plan.",
+    )
+
     parser.add_argument(
         '--lr',
         type=float,
@@ -5,11 +5,11 @@ from tqdm import tqdm
 from utils.global_vars import get_timers, get_tensorboard_writer
 from nvidia_bert_dataset_provider import NvidiaBertDatasetProvider
 
-def evaluate(engine, args, logger, global_step):
+def evaluate(model, args, logger, global_step, criterion):
     evaluate_dataset_provider = NvidiaBertDatasetProvider(args, evaluate=True)
     start_shard = 0
 
-    engine.eval()
+    model.eval()
     timers = get_timers()
     eval_step = 0
     eval_loss = 0
@@ -39,9 +39,9 @@ def evaluate(engine, args, logger, global_step):
         mlm_label = batch_data[3].cuda()
         # nsp_label = batch_data[5].cuda()
 
-        output = engine(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)
+        output = model(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)
 
-        loss = engine.criterion(output.logits, mlm_label)  # prediction_scores
+        loss = criterion(output.logits, mlm_label)  # prediction_scores
         evaluate_dataset_provider.prefetch_batch()
 
         eval_loss += loss.float().item()
@@ -67,5 +67,5 @@ def evaluate(engine, args, logger, global_step):
     logger.info('')
 
     evaluate_dataset_provider.release_shard()
-    engine.train()
+    model.train()
     return cur_loss
@@ -5,7 +5,7 @@ from transformers import get_linear_schedule_with_warmup
 from transformers import BertForPreTraining, RobertaForMaskedLM, RobertaConfig
 from transformers import GPT2Config, GPT2LMHeadModel
 from transformers import AutoTokenizer, AutoModelForMaskedLM
-from colossalai.nn.optimizer import FusedAdam
+from colossalai.nn.optimizer import FusedAdam, HybridAdam
 from torch.optim import AdamW
 from colossalai.core import global_context as gpc
 import torch
@@ -83,7 +83,7 @@ def get_optimizer(model, lr):
         'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0
     }]
-    optimizer = FusedAdam(optimizer_grouped_parameters, lr=lr, betas=[0.9, 0.95])
+    optimizer = HybridAdam(optimizer_grouped_parameters, lr=lr, betas=[0.9, 0.95])
    return optimizer
 
 
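The switch from `FusedAdam` to `HybridAdam` above fits the Gemini/ZeRO setup: HybridAdam is designed to update parameters that may live on either CPU or GPU, which is what the offloading placement policy relies on. A minimal sketch of building a grouped optimizer in the same style (the `no_decay` list is the usual BERT convention and an assumption on my part):

```python
# Sketch of get_optimizer-style parameter grouping with HybridAdam.
# Assumption: weight decay is skipped for biases and LayerNorm parameters,
# which is the convention this example appears to follow.
from colossalai.nn.optimizer import HybridAdam

def build_optimizer(model, lr=0.00015, weight_decay=1e-2):
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    params = list(model.named_parameters())
    grouped = [
        {'params': [p for n, p in params if not any(nd in n for nd in no_decay)],
         'weight_decay': weight_decay},
        {'params': [p for n, p in params if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0},
    ]
    return HybridAdam(grouped, lr=lr, betas=[0.9, 0.95])
```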
@@ -7,7 +7,6 @@ tensorboard_path="$root_path/tensorboard"
 log_path="$root_path/exp_log"
 ckpt_path="$root_path/ckpt"
 
-colossal_config="$root_path/../configs/colossalai_ddp.py"
 
 mkdir -p $tensorboard_path
 mkdir -p $log_path
@@ -32,7 +31,6 @@ env OMP_NUM_THREADS=40 colossalai run --hostfile ./hostfile \
        --tensorboard_path $tensorboard_path \
        --log_path $log_path \
        --ckpt_path $ckpt_path \
-       --colossal_config $colossal_config \
        --log_interval 50 \
        --mlm bert \
        --wandb \
@@ -7,7 +7,6 @@ tensorboard_path="$root_path/tensorboard"
 log_path="$root_path/exp_log"
 ckpt_path="$root_path/ckpt"
 
-colossal_config="$root_path/../configs/colossalai_ddp.py"
 
 mkdir -p $tensorboard_path
 mkdir -p $log_path
@@ -32,7 +31,6 @@ env OMP_NUM_THREADS=40 colossalai run --hostfile ./hostfile \
        --tensorboard_path $tensorboard_path \
        --log_path $log_path \
        --ckpt_path $ckpt_path \
-       --colossal_config $colossal_config \
        --log_interval 50 \
        --mlm bert \
        --wandb \
@@ -4,9 +4,31 @@ import time
from functools import partial

import torch
<<<<<<< HEAD
from tqdm import tqdm
import os
import time
from functools import partial
from transformers import AutoTokenizer

import colossalai
from colossalai.context import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.nn.parallel import GeminiDDP, zero_model_wrapper, zero_optim_wrapper
from colossalai.utils import get_current_device
from colossalai.utils.model.colo_init_context import ColoInitContext
from colossalai.zero import ZeroOptimizer
from colossalai.tensor import ColoParameter, ComputePattern, ComputeSpec, ProcessGroup, ReplicaSpec, ShardSpec

=======
>>>>>>> 52a933e17509c71811e919b165de38cb3d5d6d41
from arguments import parse_args
from evaluation import evaluate
from loss import LossForPretraining
<<<<<<< HEAD

from nvidia_bert_dataset_provider import NvidiaBertDatasetProvider
=======
from nvidia_bert_dataset_provider import NvidiaBertDatasetProvider
from pretrain_utils import get_lr_scheduler, get_model, get_optimizer, save_ckpt
from tqdm import tqdm
@@ -27,6 +49,7 @@ from colossalai.zero import ZeroOptimizer
from colossalai.zero.gemini import ChunkManager, ColoInitContext, GeminiManager
from colossalai.zero.legacy import ShardedModelV2, ShardedOptimizerV2, ZeroInitContext
from colossalai.zero.legacy.shard_utils import TensorShardStrategy
>>>>>>> 52a933e17509c71811e919b165de38cb3d5d6d41


def main():
@@ -36,8 +59,13 @@ def main():
    tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_path)

<<<<<<< HEAD
    # os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

=======
    os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

>>>>>>> 52a933e17509c71811e919b165de38cb3d5d6d41
    logger = Logger(os.path.join(args.log_path, launch_time), cuda=torch.cuda.is_available(), debug=args.vscode_debug)

    if args.vscode_debug:
@@ -50,7 +78,11 @@ def main():
        args.local_rank = -1
        args.log_interval = 1
    else:
<<<<<<< HEAD
        colossalai.launch_from_torch(config={})    # args.colossal_config
=======
        colossalai.launch_from_torch(args.colossal_config)    # args.colossal_config
>>>>>>> 52a933e17509c71811e919b165de38cb3d5d6d41
        args.local_rank = int(os.environ["LOCAL_RANK"])
        logger.info(
            f'launch_from_torch, world size: {torch.distributed.get_world_size()} | ' +
@@ -61,32 +93,94 @@ def main():
    args.tokenizer = tokenizer
    args.logger = logger
    set_global_variables(launch_time, args.tensorboard_path)
<<<<<<< HEAD

=======

    use_zero = hasattr(gpc.config, 'zero')
>>>>>>> 52a933e17509c71811e919b165de38cb3d5d6d41
    world_size = torch.distributed.get_world_size()
    init_dev = get_current_device()

    # build model, optimizer and criterion
<<<<<<< HEAD
    if args.distplan.startswith("CAI"):
        # all params must use the same process group.
        world_size = torch.distributed.get_world_size()
        shard_pg = ProcessGroup(tp_degree=world_size) if args.shardinit else None
        default_dist_spec = ShardSpec([-1], [world_size]) if args.shardinit else None

        if args.shardinit and args.distplan != "CAI_Gemini":
            raise RuntimeError("You can only use shardinit with CAI_Gemini")

        # build the model
        with ColoInitContext(device=get_current_device(),
                             dtype=torch.half,
                             default_dist_spec=default_dist_spec,
                             default_pg=shard_pg):
=======
    if use_zero:
        shard_strategy = TensorShardStrategy()
        with ZeroInitContext(target_device=torch.cuda.current_device(), shard_strategy=shard_strategy,
                             shard_param=True):

>>>>>>> 52a933e17509c71811e919b165de38cb3d5d6d41
            config, model, numel = get_model(args, logger)
        # model = ShardedModelV2(model, shard_strategy, tensor_placement_policy='cpu', reuse_fp16_shard=True)

        # assign running configurations
        gemini_config = None
        if args.distplan.startswith("CAI_ZeRO"):
            optim_config = dict(reduce_bucket_size=12 * 1024 * 1024, overlap_communication=True, verbose=True)
        elif args.distplan == "CAI_Gemini":
            gemini_config = dict(strict_ddp_mode=args.tp_degree == 1,
                                 device=get_current_device(),
                                 placement_policy=args.placement,
                                 pin_memory=True,
                                 hidden_dim=model.config.hidden_size,
                                 search_range_mb=128)
            optim_config = dict(gpu_margin_mem_ratio=0.)
        else:
            raise RuntimeError

        # build a highly optimized gpu/cpu optimizer
        optimizer = get_optimizer(model, lr=args.lr)

        if args.distplan == "CAI_ZeRO1":
            zero_stage = 1
        elif args.distplan == "CAI_ZeRO2":
            zero_stage = 2
        elif args.distplan == "CAI_Gemini":
            zero_stage = 3
        else:
            raise RuntimeError

        # wrap your model and optimizer
        model = zero_model_wrapper(model, zero_stage, gemini_config)
        optimizer = zero_optim_wrapper(model, optimizer, optim_config=optim_config)

        logger.info(get_mem_info(prefix='After init optim, '))

    else:
        config, model, numel = get_model(args, logger)
        logger.info("no_zero")

    if torch.distributed.get_rank() == 0:
        os.mkdir(os.path.join(args.ckpt_path, launch_time))

    logger.info(f'Model numel: {numel}')

    get_tflops_func = partial(get_tflops, numel, args.train_micro_batch_size_per_gpu, args.max_seq_length)
<<<<<<< HEAD

    # 144003367 is the length of the entire dataset
    steps_per_epoch = 144003367 // world_size // args.train_micro_batch_size_per_gpu // args.gradient_accumulation_steps // args.refresh_bucket_size    # len(dataloader)
=======
    # len(dataloader)
    steps_per_epoch = 144003367 // world_size // args.train_micro_batch_size_per_gpu // args.gradient_accumulation_steps // args.refresh_bucket_size
>>>>>>> 52a933e17509c71811e919b165de38cb3d5d6d41
    total_steps = steps_per_epoch * args.epoch

    # build optimizer and lr_scheduler
    lr_scheduler = get_lr_scheduler(optimizer, total_steps=total_steps, last_epoch=-1)

    start_epoch = 0
    start_shard = 0
@@ -95,7 +189,6 @@ def main():
        assert os.path.exists(args.load_optimizer_lr)
        o_l_state_dict = torch.load(args.load_optimizer_lr, map_location='cpu')
        o_l_state_dict['lr_scheduler']['last_epoch'] = o_l_state_dict['lr_scheduler']['last_epoch'] - 1
        optimizer = get_optimizer(model, lr=args.lr)
        optimizer.load_state_dict(o_l_state_dict['optimizer'])
        # o_l_state_dict['lr_scheduler']['last_epoch']
        lr_scheduler = get_lr_scheduler(optimizer,
@@ -105,33 +198,38 @@ def main():
            for k, v in state.items():
                if isinstance(v, torch.Tensor):
                    state[k] = v.cuda(f"cuda:{torch.cuda.current_device()}")
        # if you want delete the above three code, have to move the model to gpu, because in optimizer.step()
        # if you want delete the above three code, must move the model to gpu. Because in optimizer.step()
        lr_scheduler.load_state_dict(o_l_state_dict['lr_scheduler'])

        start_epoch = o_l_state_dict['epoch']
        start_shard = o_l_state_dict['shard'] + 1
        # global_step = o_l_state_dict['global_step'] + 1
<<<<<<< HEAD
        logger.info(f'resume from epoch {start_epoch} shard {start_shard} step {lr_scheduler.last_epoch} lr {lr_scheduler.get_last_lr()[0]}')
=======
        logger.info(
            f'resume from epoch {start_epoch} shard {start_shard} step {lr_scheduler.last_epoch} lr {lr_scheduler.get_last_lr()[0]}'
        )
    else:
        optimizer = get_optimizer(model, lr=args.lr)
        lr_scheduler = get_lr_scheduler(optimizer, total_steps=total_steps, last_epoch=-1)
>>>>>>> 52a933e17509c71811e919b165de38cb3d5d6d41

    # optimizer = gpc.config.optimizer.pop('type')(
    #     model.parameters(), **gpc.config.optimizer)
    # optimizer = ShardedOptimizerV2(model, optimizer, initial_scale=2**5)
    criterion = LossForPretraining(config.vocab_size)

    # build dataloader
    pretrain_dataset_provider = NvidiaBertDatasetProvider(args)

<<<<<<< HEAD

=======
    # initialize with colossalai
    engine, _, _, lr_scheduelr = colossalai.initialize(model=model,
                                                       optimizer=optimizer,
                                                       criterion=criterion,
                                                       lr_scheduler=lr_scheduler)

>>>>>>> 52a933e17509c71811e919b165de38cb3d5d6d41
    logger.info(get_mem_info(prefix='After init model, '))

    best_loss = None
@@ -156,9 +254,15 @@ def main():
            else:
                iterator_data = enumerate(dataset_iterator)

<<<<<<< HEAD
            model.train()

            for step, batch_data in iterator_data:
=======
            engine.train()

            for step, batch_data in iterator_data:
>>>>>>> 52a933e17509c71811e919b165de38cb3d5d6d41

                # batch_data = pretrain_dataset_provider.get_batch(batch_index)
                input_ids = batch_data[0].cuda(f"cuda:{torch.cuda.current_device()}")
@@ -167,18 +271,31 @@ def main():
                mlm_label = batch_data[3].cuda(f"cuda:{torch.cuda.current_device()}")
                # nsp_label = batch_data[5].cuda()

<<<<<<< HEAD
                output = model(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)

                loss = criterion(output.logits, mlm_label)
=======
                output = engine(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)

                loss = engine.criterion(output.logits, mlm_label)
>>>>>>> 52a933e17509c71811e919b165de38cb3d5d6d41
                pretrain_dataset_provider.prefetch_batch()

                engine.backward(loss)
                optimizer.backward(loss)
                train_loss += loss.float().item()
                # if (step + 1) % args.accumulation_step == 0:
<<<<<<< HEAD
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()

=======
                engine.step()
                lr_scheduelr.step()
                engine.zero_grad()

>>>>>>> 52a933e17509c71811e919b165de38cb3d5d6d41
                global_step += 1

                if global_step % args.log_interval == 0 and global_step != 0 \
@@ -189,7 +306,7 @@ def main():
                                                            numel, args, config, elapsed_time, global_step, world_size)

                    cur_loss = train_loss / args.log_interval
                    current_lr = lr_scheduelr.get_last_lr()[0]
                    current_lr = lr_scheduler.get_last_lr()[0]
                    log_str = f'| epoch: {epoch} | shard: {shard} | step: {global_step} | lr {current_lr:.7f} | elapsed_time: {elapsed_time / 60 :.3f} minutes ' + \
                              f'| mins/batch: {elapsed_time_per_iteration :.3f} seconds | loss: {cur_loss:.7f} | ppl: {math.exp(cur_loss):.3f} | TFLOPS: {get_tflops_func(elapsed_time_per_iteration):.3f} or {tflops:.3f}'
                    logger.info(log_str, print_=False)
@@ -209,11 +326,18 @@ def main():
            logger.info(f'epoch {epoch} shard {shard} has cost {timers("shard_time").elapsed() / 60 :.3f} mins')
            logger.info('*' * 100)

<<<<<<< HEAD
            eval_loss += evaluate(model, args, logger, global_step, criterion)
            save_ckpt(model, optimizer, lr_scheduler, os.path.join(args.ckpt_path, launch_time, f'epoch-{epoch}_shard-{shard}_' + launch_time), epoch, shard, global_step)


=======
            eval_loss += evaluate(engine, args, logger, global_step)
            save_ckpt(engine.model, optimizer, lr_scheduelr,
                      os.path.join(args.ckpt_path, launch_time, f'epoch-{epoch}_shard-{shard}_' + launch_time), epoch,
                      shard, global_step)

>>>>>>> 52a933e17509c71811e919b165de38cb3d5d6d41
        eval_loss /= len(os.listdir(args.data_path_prefix))
        logger.info(
            f'epoch {epoch} | shard_length {len(os.listdir(args.data_path_prefix))} | elapsed_time: {timers("epoch_time").elapsed() / 60 :.3f} mins'
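On the HEAD side of the conflict blocks above, the engine-based loop is replaced by direct calls on the wrapped model and optimizer. A condensed outline of one training step under that scheme (the batch index layout is assumed from the dataset-provider usage shown above; this is an outline, not the full script):

```python
# Condensed outline of one training step with the wrapper-based API shown above.
# model/optimizer are assumed to be already wrapped by zero_model_wrapper /
# zero_optim_wrapper; the batch layout [input_ids, token_type_ids,
# attention_mask, mlm_label] is an assumption based on the indices used above.
def train_step(model, optimizer, lr_scheduler, criterion, batch_data):
    input_ids, token_type_ids, attention_mask, mlm_label = (t.cuda() for t in batch_data[:4])

    output = model(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)
    loss = criterion(output.logits, mlm_label)

    optimizer.backward(loss)   # the ZeRO/Gemini optimizer owns backward and loss scaling
    optimizer.step()
    lr_scheduler.step()
    optimizer.zero_grad()
    return loss.float().item()
```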
@@ -1,2 +1,7 @@
 colossalai >= 0.1.12
 torch >= 1.8.1
+tqdm
+tensorboard
+numpy
+h5py
+wandb