ColossalAI/tests/test_auto_parallel/test_offload/model_utils.py

import torch
import torch.nn as nn
from transformers import BertConfig, BertLMHeadModel, GPT2Config, GPT2LMHeadModel

# from tests.components_to_test.registry import non_distributed_component_funcs


class GPTLMModel(nn.Module):
    """Wrapper that builds a GPT-2 LM-head model from a fresh ``GPT2Config``.

    No pretrained checkpoint is loaded here; the model is constructed directly
    from the config assembled out of the constructor arguments.

    Args:
        hidden_size: Forwarded to ``GPT2Config`` as ``n_embd``.
        num_layers: Forwarded as ``n_layer``.
        num_attention_heads: Forwarded as ``n_head``.
        max_seq_len: Forwarded as both ``n_positions`` and ``n_ctx``.
        vocab_size: Forwarded as ``vocab_size``.
    """

    def __init__(self, hidden_size=768, num_layers=12, num_attention_heads=12, max_seq_len=1024, vocab_size=50257):
        super().__init__()
        config = GPT2Config(
            n_embd=hidden_size,
            n_layer=num_layers,
            n_head=num_attention_heads,
            n_positions=max_seq_len,
            n_ctx=max_seq_len,
            vocab_size=vocab_size,
        )
        self.model = GPT2LMHeadModel(config)

    def forward(self, input_ids, attention_mask):
        """Run the wrapped model and return only the LM logits.

        The wrapped model is called with ``use_cache=True``; everything except
        the first element of its output is discarded.
        """
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, use_cache=True)
        # Index 0 of the model output holds the lm_logits.
        return outputs[0]


class LMLoss(nn.Module):
    """Shifted cross-entropy loss for language modelling.

    The logits at position ``t`` are scored against the label at position
    ``t + 1``: the last step of ``logits`` and the first step of ``labels``
    are dropped before the loss is computed.
    """

    def __init__(self):
        super().__init__()
        self.loss_fn = nn.CrossEntropyLoss()

    def forward(self, logits, labels):
        """Return the cross-entropy between shifted ``logits`` and ``labels``.

        Args:
            logits: Tensor whose last axis is the class axis and whose
                second-to-last axis is the sequence axis.
            labels: Tensor of class indices whose last axis is the sequence axis.
        """
        # Align predictions with the token that follows them.
        predictions = logits[..., :-1, :].contiguous()
        targets = labels[..., 1:].contiguous()

        # Collapse every leading axis so the loss sees (N, classes) vs (N,).
        num_classes = predictions.size(-1)
        flat_predictions = predictions.view(-1, num_classes)
        flat_targets = targets.view(-1)
        return self.loss_fn(flat_predictions, flat_targets)


class BertLMModel(nn.Module):
    """Wrapper that builds a BERT LM-head model from a fresh ``BertConfig``.

    No pretrained checkpoint is loaded here; the model is constructed directly
    from the config assembled out of the constructor arguments.

    Args:
        hidden_size: Forwarded to ``BertConfig`` as ``hidden_size``.
        num_layers: Forwarded as ``num_hidden_layers``.
        num_attention_heads: Forwarded as ``num_attention_heads``.
        vocab_size: Forwarded as ``vocab_size``.
        max_seq_len: Forwarded as ``max_position_embeddings``. When ``None``
            (the default) it falls back to ``hidden_size``, which is what this
            wrapper has always used, so existing callers build the same model.
    """

    def __init__(self, hidden_size=768, num_layers=12, num_attention_heads=32, vocab_size=30522, max_seq_len=None):
        super().__init__()
        # Historically the position table was sized by hidden_size; keep that
        # as the default so callers that never pass max_seq_len are unaffected.
        if max_seq_len is None:
            max_seq_len = hidden_size
        # NOTE: the GPT-2 style ``n_embd`` key is intentionally not passed;
        # the BERT width is configured through ``hidden_size``.
        config = BertConfig(
            hidden_size=hidden_size,
            num_hidden_layers=num_layers,
            num_attention_heads=num_attention_heads,
            max_position_embeddings=max_seq_len,
            vocab_size=vocab_size,
        )
        self.model = BertLMHeadModel(config)

    def forward(self, input_ids, attention_mask):
        """Run the wrapped model and return only the LM logits.

        The wrapped model is called with ``use_cache=True``; everything except
        the first element of its output is discarded.
        """
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, use_cache=True)
        # Index 0 of the model output holds the lm_logits.
        return outputs[0]


# @non_distributed_component_funcs.register(name="bert_")
def get_bert_components():
    """Return a ``(model_builder, data_gen)`` pair for the BERT test model.

    Both callables close over the same fixed sizes: a vocabulary of 1024
    tokens and batches of 64 sequences, each 64 tokens long.
    """
    vocab_size, seq_len, batch_size = 1024, 64, 64

    def bert_model_builder():
        """Construct the ``BertLMModel`` used by the test."""
        return BertLMModel(hidden_size=8192, num_layers=4, num_attention_heads=32, vocab_size=vocab_size)

    def bert_data_gen(device="meta"):
        """Create random token ids and an all-ones attention mask on ``device``."""
        token_ids = torch.randint(0, vocab_size, (batch_size, seq_len), device=device)
        mask = torch.ones_like(token_ids, device=device)
        return {"input_ids": token_ids, "attention_mask": mask}

    return bert_model_builder, bert_data_gen


# @non_distributed_component_funcs.register(name="gpt2_")
def get_gpt2_components():
    """Return a ``(model_builder, data_gen)`` pair for the GPT-2 test model.

    Both callables close over the same fixed sizes: a vocabulary of 1024
    tokens and batches of 64 sequences, each 8 tokens long.
    """
    vocab_size, seq_len, batch_size = 1024, 8, 64

    def gpt2_model_builder():
        """Construct the ``GPTLMModel`` used by the test."""
        return GPTLMModel(hidden_size=8192, num_layers=2, num_attention_heads=32, vocab_size=vocab_size)

    def gpt2_data_gen(device="meta"):
        """Create random token ids and an all-ones attention mask on ``device``."""
        token_ids = torch.randint(0, vocab_size, (batch_size, seq_len), device=device)
        mask = torch.ones_like(token_ids, device=device)
        return {"input_ids": token_ids, "attention_mask": mask}

    return gpt2_model_builder, gpt2_data_gen
[auto-parallel] add auto-offload feature (#3154) * add auto-offload feature * polish code * fix syn offload runtime pass bug * add offload example * fix offload testing bug * fix example testing bug 2 years ago			`import torch`
			`import torch.nn as nn`
[misc] update pre-commit and run all files (#4752) * [misc] update pre-commit * [misc] run pre-commit * [misc] remove useless configuration files * [misc] ignore cuda for clang-format 1 year ago			`from transformers import BertConfig, BertLMHeadModel, GPT2Config, GPT2LMHeadModel`

[test] Fix/fix testcase (#5770) * [fix] branch for fix testcase; * [fix] fix test_analyzer & test_auto_parallel; * [fix] remove local change about moe; * [fix] rm local change moe; 6 months ago			`# from tests.components_to_test.registry import non_distributed_component_funcs`
[auto-parallel] add auto-offload feature (#3154) * add auto-offload feature * polish code * fix syn offload runtime pass bug * add offload example * fix offload testing bug * fix example testing bug 2 years ago

[misc] update pre-commit and run all files (#4752) * [misc] update pre-commit * [misc] run pre-commit * [misc] remove useless configuration files * [misc] ignore cuda for clang-format 1 year ago			`class GPTLMModel(nn.Module):`
			`def __init__(self, hidden_size=768, num_layers=12, num_attention_heads=12, max_seq_len=1024, vocab_size=50257):`
[auto-parallel] add auto-offload feature (#3154) * add auto-offload feature * polish code * fix syn offload runtime pass bug * add offload example * fix offload testing bug * fix example testing bug 2 years ago			`super().__init__()`
			`self.model = GPT2LMHeadModel(`
[misc] update pre-commit and run all files (#4752) * [misc] update pre-commit * [misc] run pre-commit * [misc] remove useless configuration files * [misc] ignore cuda for clang-format 1 year ago			`GPT2Config(`
			`n_embd=hidden_size,`
			`n_layer=num_layers,`
			`n_head=num_attention_heads,`
			`n_positions=max_seq_len,`
			`n_ctx=max_seq_len,`
			`vocab_size=vocab_size,`
			`)`
			`)`
[auto-parallel] add auto-offload feature (#3154) * add auto-offload feature * polish code * fix syn offload runtime pass bug * add offload example * fix offload testing bug * fix example testing bug 2 years ago
			`def forward(self, input_ids, attention_mask):`
			`# Only return lm_logits`
			`return self.model(input_ids=input_ids, attention_mask=attention_mask, use_cache=True)[0]`


			`class LMLoss(nn.Module):`
			`def __init__(self):`
			`super().__init__()`
			`self.loss_fn = nn.CrossEntropyLoss()`

			`def forward(self, logits, labels):`
			`shift_logits = logits[..., :-1, :].contiguous()`
			`shift_labels = labels[..., 1:].contiguous()`
			`# Flatten the tokens`
			`return self.loss_fn(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))`

[misc] update pre-commit and run all files (#4752) * [misc] update pre-commit * [misc] run pre-commit * [misc] remove useless configuration files * [misc] ignore cuda for clang-format 1 year ago
[auto-parallel] add auto-offload feature (#3154) * add auto-offload feature * polish code * fix syn offload runtime pass bug * add offload example * fix offload testing bug * fix example testing bug 2 years ago			`class BertLMModel(nn.Module):`
			`def __init__(self, hidden_size=768, num_layers=12, num_attention_heads=32, vocab_size=30522):`
			`super().__init__()`
[misc] update pre-commit and run all files (#4752) * [misc] update pre-commit * [misc] run pre-commit * [misc] remove useless configuration files * [misc] ignore cuda for clang-format 1 year ago			`self.model = BertLMHeadModel(`
			`BertConfig(`
			`n_embd=hidden_size,`
			`num_hidden_layers=num_layers,`
			`hidden_size=hidden_size,`
			`num_attention_heads=num_attention_heads,`
			`max_position_embeddings=hidden_size,`
			`vocab_size=vocab_size,`
			`)`
			`)`
[auto-parallel] add auto-offload feature (#3154) * add auto-offload feature * polish code * fix syn offload runtime pass bug * add offload example * fix offload testing bug * fix example testing bug 2 years ago
			`def forward(self, input_ids, attention_mask):`
			`# Only return lm_logits`
			`return self.model(input_ids=input_ids, attention_mask=attention_mask, use_cache=True)[0]`

[misc] update pre-commit and run all files (#4752) * [misc] update pre-commit * [misc] run pre-commit * [misc] remove useless configuration files * [misc] ignore cuda for clang-format 1 year ago
[test] Fix/fix testcase (#5770) * [fix] branch for fix testcase; * [fix] fix test_analyzer & test_auto_parallel; * [fix] remove local change about moe; * [fix] rm local change moe; 6 months ago			`# @non_distributed_component_funcs.register(name="bert_")`
[auto-parallel] add auto-offload feature (#3154) * add auto-offload feature * polish code * fix syn offload runtime pass bug * add offload example * fix offload testing bug * fix example testing bug 2 years ago			`def get_bert_components():`
			`vocab_size = 1024`
			`seq_len = 64`
			`batchSize = 64`

			`def bert_model_builder():`
			`model = BertLMModel(hidden_size=8192, num_layers=4, num_attention_heads=32, vocab_size=vocab_size)`
			`return model`

			`def bert_data_gen(device="meta"):`
			`input_ids = torch.randint(0, vocab_size, (batchSize, seq_len), device=device)`
			`attention_mask = torch.ones_like(input_ids, device=device)`
			`kwargs = dict(input_ids=input_ids, attention_mask=attention_mask)`
			`return kwargs`

			`return bert_model_builder, bert_data_gen`

[misc] update pre-commit and run all files (#4752) * [misc] update pre-commit * [misc] run pre-commit * [misc] remove useless configuration files * [misc] ignore cuda for clang-format 1 year ago
[test] Fix/fix testcase (#5770) * [fix] branch for fix testcase; * [fix] fix test_analyzer & test_auto_parallel; * [fix] remove local change about moe; * [fix] rm local change moe; 6 months ago			`# @non_distributed_component_funcs.register(name="gpt2_")`
[auto-parallel] add auto-offload feature (#3154) * add auto-offload feature * polish code * fix syn offload runtime pass bug * add offload example * fix offload testing bug * fix example testing bug 2 years ago			`def get_gpt2_components():`
			`vocab_size = 1024`
			`seq_len = 8`
			`batchSize = 64`

			`def gpt2_model_builder():`
			`model = GPTLMModel(hidden_size=8192, num_layers=2, num_attention_heads=32, vocab_size=vocab_size)`
			`return model`

			`def gpt2_data_gen(device="meta"):`
			`input_ids = torch.randint(0, vocab_size, (batchSize, seq_len), device=device)`
			`attention_mask = torch.ones_like(input_ids, device=device)`
			`kwargs = dict(input_ids=input_ids, attention_mask=attention_mask)`
			`return kwargs`

[misc] update pre-commit and run all files (#4752) * [misc] update pre-commit * [misc] run pre-commit * [misc] remove useless configuration files * [misc] ignore cuda for clang-format 1 year ago			`return gpt2_model_builder, gpt2_data_gen`