# ColossalAI/examples/community/roberta/pretraining/pretrain_utils.py

import os
import sys

import torch
import transformers
from transformers import get_linear_schedule_with_warmup

from colossalai.legacy.core import global_context as gpc
from colossalai.nn.optimizer import HybridAdam

sys.path.append(os.getcwd())
from collections import OrderedDict

import torch.nn as nn
from model.bert import BertForMaskedLM
from model.deberta_v2 import DebertaV2ForMaskedLM

# Public API of this module. Every name listed here must be defined in this
# file; a missing name makes ``from pretrain_utils import *`` raise
# AttributeError. ``get_dataloader_for_pretraining`` was listed but is not
# defined here, and the public ``save_ckpt`` helper was missing.
__all__ = ["get_model", "get_optimizer", "get_lr_scheduler", "save_ckpt"]


def get_new_state_dict(state_dict, start_index=13):
    """Build a new ``OrderedDict`` whose keys have their leading characters removed.

    Every key is sliced as ``key[start_index:]``. Values are carried over as-is
    (not copied) and the iteration order of ``state_dict`` is kept.

    Args:
        state_dict: Mapping from parameter names to values.
        start_index: Number of leading characters dropped from each key.
            NOTE(review): the default of 13 equals ``len("module.model.")``,
            presumably the wrapper prefix this helper targets -- confirm.

    Returns:
        An ``OrderedDict`` keyed by the shortened names.
    """
    return OrderedDict((key[start_index:], value) for key, value in state_dict.items())


class LMModel(nn.Module):
    """Thin ``nn.Module`` wrapper around a language-model instance.

    The wrapper stores the model together with its config and, when requested
    through ``args.checkpoint_activations``, switches on gradient checkpointing
    on the wrapped model at construction time.
    """

    def __init__(self, model, config, args):
        """Store the wrapped model and optionally enable gradient checkpointing.

        Args:
            model: Module to wrap; must provide ``gradient_checkpointing_enable()``
                when checkpointing is requested.
            config: Configuration object kept on ``self.config``.
            args: Object exposing a ``checkpoint_activations`` attribute.
        """
        super().__init__()

        use_checkpointing = args.checkpoint_activations
        self.model = model
        self.config = config
        self.checkpoint = use_checkpointing
        if use_checkpointing:
            model.gradient_checkpointing_enable()

    def forward(self, input_ids, token_type_ids=None, attention_mask=None):
        """Forward the inputs by keyword to the wrapped model and return its output unchanged."""
        model_inputs = {
            "input_ids": input_ids,
            "token_type_ids": token_type_ids,
            "attention_mask": attention_mask,
        }
        return self.model(**model_inputs)


def get_model(args, logger):
    """Build the masked-language model selected by ``args.mlm``.

    Args:
        args: Namespace-like object. Attributes read here:
            ``mlm`` (``"bert"`` or ``"deberta_v2"``), ``bert_config`` (path to
            the JSON config file, used for both architectures),
            ``load_pretrain_model`` (checkpoint path; an empty string or
            ``None`` skips loading) and ``checkpoint_activations``.
        logger: Object with an ``info`` method, used to report a successful
            checkpoint load.

    Returns:
        Tuple ``(config, model, numel)`` where ``numel`` is the total number of
        elements across ``model.parameters()``.

    Raises:
        ValueError: ``args.mlm`` is not one of the supported architectures.
        FileNotFoundError: ``args.load_pretrain_model`` is set but the path
            does not exist.
    """
    if args.mlm == "bert":
        config = transformers.BertConfig.from_json_file(args.bert_config)
        model = BertForMaskedLM(config)
    elif args.mlm == "deberta_v2":
        config = transformers.DebertaV2Config.from_json_file(args.bert_config)
        model = DebertaV2ForMaskedLM(config)
    else:
        # ValueError is still an Exception, so existing broad handlers keep working.
        raise ValueError(f"Invalid mlm! Got {args.mlm!r}, expected 'bert' or 'deberta_v2'.")

    pretrained_path = args.load_pretrain_model
    # Truthiness also covers ``None``, where ``len(...)`` would raise TypeError.
    if pretrained_path:
        # Explicit check instead of ``assert``: asserts are stripped under ``python -O``.
        if not os.path.exists(pretrained_path):
            raise FileNotFoundError(f"Pretrained model checkpoint not found: {pretrained_path}")
        # NOTE(security): torch.load unpickles the file; only load trusted checkpoints.
        m_state_dict = torch.load(
            pretrained_path, map_location=torch.device(f"cuda:{torch.cuda.current_device()}")
        )
        # strict=True: every process must end up with identical parameters.
        model.load_state_dict(m_state_dict, strict=True)
        logger.info("load model success")

    numel = sum(p.numel() for p in model.parameters())
    if args.checkpoint_activations:
        model.gradient_checkpointing_enable()

    return config, model, numel


def get_optimizer(model, lr):
    """Create a ``HybridAdam`` optimizer with two weight-decay groups.

    Parameters whose name contains any of ``"bias"``, ``"gamma"``, ``"beta"``
    or ``"LayerNorm"`` get a weight decay of 0.0; all others get 0.1.

    Args:
        model: Object exposing ``named_parameters()``.
        lr: Learning rate passed to the optimizer.

    Returns:
        The constructed ``HybridAdam`` instance (betas ``[0.9, 0.95]``).
    """
    no_decay_markers = ("bias", "gamma", "beta", "LayerNorm")

    # Split the parameters in one pass, keeping their original relative order.
    decayed, exempt = [], []
    for name, param in model.named_parameters():
        is_exempt = any(marker in name for marker in no_decay_markers)
        (exempt if is_exempt else decayed).append(param)

    # configure the weight decay for bert models
    param_groups = [
        {"params": decayed, "weight_decay": 0.1},
        {"params": exempt, "weight_decay": 0.0},
    ]
    return HybridAdam(param_groups, lr=lr, betas=[0.9, 0.95])


def get_lr_scheduler(optimizer, total_steps, warmup_steps=2000, last_epoch=-1):
    """Create a scheduler via transformers' ``get_linear_schedule_with_warmup``.

    Args:
        optimizer: Optimizer whose learning rate is scheduled.
        total_steps: Forwarded as ``num_training_steps``.
        warmup_steps: Forwarded as ``num_warmup_steps``.
        last_epoch: Forwarded as ``last_epoch``.

    Returns:
        The scheduler object produced by ``get_linear_schedule_with_warmup``.
    """
    schedule_kwargs = {
        "num_warmup_steps": warmup_steps,
        "num_training_steps": total_steps,
        "last_epoch": last_epoch,
    }
    return get_linear_schedule_with_warmup(optimizer, **schedule_kwargs)


def save_ckpt(model, optimizer, lr_scheduler, path, epoch, shard, global_step):
    """Write a training checkpoint as two files derived from ``path``.

    ``path + "_pytorch_model.bin"`` receives ``model.state_dict()`` and
    ``path + ".op_lrs"`` receives a dict with the optimizer state, the
    scheduler state, ``epoch``, ``shard`` and ``global_step``. All state dicts
    are collected on every process, but only global rank 0 writes to disk.

    Args:
        model: Object exposing ``state_dict()``.
        optimizer: Object exposing ``state_dict()``.
        lr_scheduler: Object exposing ``state_dict()``.
        path: String prefix used to build both output file names.
        epoch: Value stored under ``"epoch"``.
        shard: Value stored under ``"shard"``.
        global_step: Value stored under ``"global_step"``.
    """
    weights_file = path + "_pytorch_model.bin"
    train_state_file = path + ".op_lrs"

    train_state = {
        "optimizer": optimizer.state_dict(),
        "lr_scheduler": lr_scheduler.state_dict(),
        "epoch": epoch,
        "shard": shard,
        "global_step": global_step,
    }
    # each process must run model.state_dict(), so collect it before the rank check
    weights = model.state_dict()

    if gpc.get_global_rank() != 0:
        return
    torch.save(train_state, train_state_file)
    torch.save(weights, weights_file)
add RoBERTa (#1980) * update roberta * update roberta & readme * update roberta & readme * update roberta & readme 2022-11-18 06:04:49 +00:00			`import os`
			`import sys`

[example] reorganize for community examples (#3557) 2023-04-14 08:27:48 +00:00			`import torch`
			`import transformers`
[misc] update pre-commit and run all files (#4752) * [misc] update pre-commit * [misc] run pre-commit * [misc] remove useless configuration files * [misc] ignore cuda for clang-format 2023-09-19 06:20:26 +00:00			`from transformers import get_linear_schedule_with_warmup`
[example] reorganize for community examples (#3557) 2023-04-14 08:27:48 +00:00
[legacy] clean up legacy code (#4743) * [legacy] remove outdated codes of pipeline (#4692) * [legacy] remove cli of benchmark and update optim (#4690) * [legacy] remove cli of benchmark and update optim * [doc] fix cli doc test * [legacy] fix engine clip grad norm * [legacy] remove outdated colo tensor (#4694) * [legacy] remove outdated colo tensor * [test] fix test import * [legacy] move outdated zero to legacy (#4696) * [legacy] clean up utils (#4700) * [legacy] clean up utils * [example] update examples * [legacy] clean up amp * [legacy] fix amp module * [legacy] clean up gpc (#4742) * [legacy] clean up context * [legacy] clean core, constants and global vars * [legacy] refactor initialize * [example] fix examples ci * [example] fix examples ci * [legacy] fix tests * [example] fix gpt example * [example] fix examples ci * [devops] fix ci installation * [example] fix examples ci 2023-09-18 08:31:06 +00:00			`from colossalai.legacy.core import global_context as gpc`
[misc] update pre-commit and run all files (#4752) * [misc] update pre-commit * [misc] run pre-commit * [misc] remove useless configuration files * [misc] ignore cuda for clang-format 2023-09-19 06:20:26 +00:00			`from colossalai.nn.optimizer import HybridAdam`
[example] reorganize for community examples (#3557) 2023-04-14 08:27:48 +00:00
			`sys.path.append(os.getcwd())`
add RoBERTa (#1980) * update roberta * update roberta & readme * update roberta & readme * update roberta & readme 2022-11-18 06:04:49 +00:00			`from collections import OrderedDict`

[example] reorganize for community examples (#3557) 2023-04-14 08:27:48 +00:00			`import torch.nn as nn`
			`from model.bert import BertForMaskedLM`
			`from model.deberta_v2 import DebertaV2ForMaskedLM`

[misc] update pre-commit and run all files (#4752) * [misc] update pre-commit * [misc] run pre-commit * [misc] remove useless configuration files * [misc] ignore cuda for clang-format 2023-09-19 06:20:26 +00:00			`__all__ = ["get_model", "get_optimizer", "get_lr_scheduler", "get_dataloader_for_pretraining"]`
add RoBERTa (#1980) * update roberta * update roberta & readme * update roberta & readme * update roberta & readme 2022-11-18 06:04:49 +00:00

			`def get_new_state_dict(state_dict, start_index=13):`
[example] reorganize for community examples (#3557) 2023-04-14 08:27:48 +00:00			`new_state_dict = OrderedDict()`
add RoBERTa (#1980) * update roberta * update roberta & readme * update roberta & readme * update roberta & readme 2022-11-18 06:04:49 +00:00			`for k, v in state_dict.items():`
			`name = k[start_index:]`
[example] reorganize for community examples (#3557) 2023-04-14 08:27:48 +00:00			`new_state_dict[name] = v`
add RoBERTa (#1980) * update roberta * update roberta & readme * update roberta & readme * update roberta & readme 2022-11-18 06:04:49 +00:00			`return new_state_dict`


			`class LMModel(nn.Module):`
			`def __init__(self, model, config, args):`
			`super().__init__()`

			`self.checkpoint = args.checkpoint_activations`
			`self.config = config`
			`self.model = model`
			`if self.checkpoint:`
			`self.model.gradient_checkpointing_enable()`

			`def forward(self, input_ids, token_type_ids=None, attention_mask=None):`
			`# Only return lm_logits`
			`return self.model(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)`


			`def get_model(args, logger):`
[misc] update pre-commit and run all files (#4752) * [misc] update pre-commit * [misc] run pre-commit * [misc] remove useless configuration files * [misc] ignore cuda for clang-format 2023-09-19 06:20:26 +00:00			`if args.mlm == "bert":`
add RoBERTa (#1980) * update roberta * update roberta & readme * update roberta & readme * update roberta & readme 2022-11-18 06:04:49 +00:00			`config = transformers.BertConfig.from_json_file(args.bert_config)`
			`model = BertForMaskedLM(config)`
[misc] update pre-commit and run all files (#4752) * [misc] update pre-commit * [misc] run pre-commit * [misc] remove useless configuration files * [misc] ignore cuda for clang-format 2023-09-19 06:20:26 +00:00			`elif args.mlm == "deberta_v2":`
add RoBERTa (#1980) * update roberta * update roberta & readme * update roberta & readme * update roberta & readme 2022-11-18 06:04:49 +00:00			`config = transformers.DebertaV2Config.from_json_file(args.bert_config)`
			`model = DebertaV2ForMaskedLM(config)`
			`else:`
			`raise Exception("Invalid mlm!")`

			`if len(args.load_pretrain_model) > 0:`
			`assert os.path.exists(args.load_pretrain_model)`
			`# load_checkpoint(args.load_pretrain_model, model, strict=False)`
[misc] update pre-commit and run all files (#4752) * [misc] update pre-commit * [misc] run pre-commit * [misc] remove useless configuration files * [misc] ignore cuda for clang-format 2023-09-19 06:20:26 +00:00			`m_state_dict = torch.load(`
			`args.load_pretrain_model, map_location=torch.device(f"cuda:{torch.cuda.current_device()}")`
			`)`
add RoBERTa (#1980) * update roberta * update roberta & readme * update roberta & readme * update roberta & readme 2022-11-18 06:04:49 +00:00			`# new_state_dict = get_new_state_dict(m_state_dict)`
[misc] update pre-commit and run all files (#4752) * [misc] update pre-commit * [misc] run pre-commit * [misc] remove useless configuration files * [misc] ignore cuda for clang-format 2023-09-19 06:20:26 +00:00			`model.load_state_dict(`
			`m_state_dict, strict=True`
			`) # must insure that every process have identical parameters !!!!!!!`
add RoBERTa (#1980) * update roberta * update roberta & readme * update roberta & readme * update roberta & readme 2022-11-18 06:04:49 +00:00			`logger.info("load model success")`
[example] reorganize for community examples (#3557) 2023-04-14 08:27:48 +00:00
add RoBERTa (#1980) * update roberta * update roberta & readme * update roberta & readme * update roberta & readme 2022-11-18 06:04:49 +00:00			`numel = sum([p.numel() for p in model.parameters()])`
			`if args.checkpoint_activations:`
			`model.gradient_checkpointing_enable()`
			`# model = LMModel(model, config, args)`
[example] reorganize for community examples (#3557) 2023-04-14 08:27:48 +00:00
add RoBERTa (#1980) * update roberta * update roberta & readme * update roberta & readme * update roberta & readme 2022-11-18 06:04:49 +00:00			`return config, model, numel`


			`def get_optimizer(model, lr):`
			`param_optimizer = list(model.named_parameters())`
[misc] update pre-commit and run all files (#4752) * [misc] update pre-commit * [misc] run pre-commit * [misc] remove useless configuration files * [misc] ignore cuda for clang-format 2023-09-19 06:20:26 +00:00			`no_decay = ["bias", "gamma", "beta", "LayerNorm"]`
add RoBERTa (#1980) * update roberta * update roberta & readme * update roberta & readme * update roberta & readme 2022-11-18 06:04:49 +00:00
			`# configure the weight decay for bert models`
[misc] update pre-commit and run all files (#4752) * [misc] update pre-commit * [misc] run pre-commit * [misc] remove useless configuration files * [misc] ignore cuda for clang-format 2023-09-19 06:20:26 +00:00			`optimizer_grouped_parameters = [`
			`{"params": [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], "weight_decay": 0.1},`
			`{"params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], "weight_decay": 0.0},`
			`]`
[example] update roberta with newer ColossalAI (#3472) * update roberta example * update roberta example 2023-04-07 02:34:51 +00:00			`optimizer = HybridAdam(optimizer_grouped_parameters, lr=lr, betas=[0.9, 0.95])`
add RoBERTa (#1980) * update roberta * update roberta & readme * update roberta & readme * update roberta & readme 2022-11-18 06:04:49 +00:00			`return optimizer`


			`def get_lr_scheduler(optimizer, total_steps, warmup_steps=2000, last_epoch=-1):`
			`# warmup_steps = int(total_steps * warmup_ratio)`
[misc] update pre-commit and run all files (#4752) * [misc] update pre-commit * [misc] run pre-commit * [misc] remove useless configuration files * [misc] ignore cuda for clang-format 2023-09-19 06:20:26 +00:00			`lr_scheduler = get_linear_schedule_with_warmup(`
			`optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps, last_epoch=last_epoch`
			`)`
add RoBERTa (#1980) * update roberta * update roberta & readme * update roberta & readme * update roberta & readme 2022-11-18 06:04:49 +00:00			`# lr_scheduler = LinearWarmupLR(optimizer, total_steps=total_steps, warmup_steps=warmup_steps)`
			`return lr_scheduler`


			`def save_ckpt(model, optimizer, lr_scheduler, path, epoch, shard, global_step):`
[misc] update pre-commit and run all files (#4752) * [misc] update pre-commit * [misc] run pre-commit * [misc] remove useless configuration files * [misc] ignore cuda for clang-format 2023-09-19 06:20:26 +00:00			`model_path = path + "_pytorch_model.bin"`
			`optimizer_lr_path = path + ".op_lrs"`
add RoBERTa (#1980) * update roberta * update roberta & readme * update roberta & readme * update roberta & readme 2022-11-18 06:04:49 +00:00			`checkpoint = {}`
[misc] update pre-commit and run all files (#4752) * [misc] update pre-commit * [misc] run pre-commit * [misc] remove useless configuration files * [misc] ignore cuda for clang-format 2023-09-19 06:20:26 +00:00			`checkpoint["optimizer"] = optimizer.state_dict()`
			`checkpoint["lr_scheduler"] = lr_scheduler.state_dict()`
			`checkpoint["epoch"] = epoch`
			`checkpoint["shard"] = shard`
			`checkpoint["global_step"] = global_step`
			`model_state = model.state_dict() # each process must run model.state_dict()`
add RoBERTa (#1980) * update roberta * update roberta & readme * update roberta & readme * update roberta & readme 2022-11-18 06:04:49 +00:00			`if gpc.get_global_rank() == 0:`
			`torch.save(checkpoint, optimizer_lr_path)`
			`torch.save(model_state, model_path)`