|
|
|
import torch
|
|
|
|
import torch.nn as nn
|
|
|
|
from transformers import BertConfig, BertLMHeadModel, GPT2Config, GPT2LMHeadModel
|
|
|
|
|
|
|
|
# from tests.components_to_test.registry import non_distributed_component_funcs
|
|
|
|
|
|
|
|
|
|
|
|
class GPTLMModel(nn.Module):
    """Wrap ``GPT2LMHeadModel`` so that ``forward`` returns only the LM logits.

    Args:
        hidden_size: Forwarded to ``GPT2Config`` as ``n_embd``.
        num_layers: Forwarded to ``GPT2Config`` as ``n_layer``.
        num_attention_heads: Forwarded to ``GPT2Config`` as ``n_head``.
        max_seq_len: Forwarded to ``GPT2Config`` as both ``n_positions``
            and ``n_ctx``.
        vocab_size: Forwarded to ``GPT2Config`` as ``vocab_size``.
    """

    def __init__(self, hidden_size=768, num_layers=12, num_attention_heads=12, max_seq_len=1024, vocab_size=50257):
        super().__init__()
        config = GPT2Config(
            n_embd=hidden_size,
            n_layer=num_layers,
            n_head=num_attention_heads,
            n_positions=max_seq_len,
            n_ctx=max_seq_len,
            vocab_size=vocab_size,
        )
        self.model = GPT2LMHeadModel(config)

    def forward(self, input_ids, attention_mask):
        """Run the wrapped model and return the first element of its output."""
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, use_cache=True)
        # Only return lm_logits
        return outputs[0]
|
|
|
|
|
|
|
|
|
|
|
|
class LMLoss(nn.Module):
    """Next-token cross-entropy loss for language-model logits.

    The logits at position ``t`` are scored against the label at position
    ``t + 1``: the last logit step and the first label are dropped before
    the loss is computed with ``nn.CrossEntropyLoss`` (default settings).
    """

    def __init__(self):
        super().__init__()
        self.loss_fn = nn.CrossEntropyLoss()

    def forward(self, logits, labels):
        """Return the cross-entropy between shifted ``logits`` and ``labels``.

        Args:
            logits: Tensor whose last axis indexes classes and whose
                second-to-last axis indexes sequence positions.
            labels: Tensor of class indices whose last axis indexes
                sequence positions.
        """
        # Drop the final prediction step; it has no following token to predict.
        shift_logits = logits[..., :-1, :].contiguous()
        # Drop the first label; no earlier position predicts it.
        shift_labels = labels[..., 1:].contiguous()
        # Flatten the tokens
        flat_logits = shift_logits.view(-1, shift_logits.size(-1))
        flat_labels = shift_labels.view(-1)
        return self.loss_fn(flat_logits, flat_labels)
|
|
|
|
|
|
|
|
|
|
|
|
class BertLMModel(nn.Module):
    """Wrap ``BertLMHeadModel`` so that ``forward`` returns only the LM logits.

    Args:
        hidden_size: Forwarded to ``BertConfig`` as ``hidden_size`` (and as
            the extra ``n_embd`` keyword, see note in ``__init__``).
        num_layers: Forwarded to ``BertConfig`` as ``num_hidden_layers``.
        num_attention_heads: Forwarded to ``BertConfig`` as
            ``num_attention_heads``.
        vocab_size: Forwarded to ``BertConfig`` as ``vocab_size``.
        max_seq_len: Forwarded to ``BertConfig`` as
            ``max_position_embeddings``. When ``None`` (the default) it falls
            back to ``hidden_size``, which is what this class always did
            before the parameter existed.
    """

    def __init__(self, hidden_size=768, num_layers=12, num_attention_heads=32, vocab_size=30522, max_seq_len=None):
        super().__init__()
        # Historical behaviour tied the position-table size to the model width.
        # Keep that as the default so existing callers build the same model,
        # but let new callers choose the sequence capacity independently.
        if max_seq_len is None:
            max_seq_len = hidden_size
        config = BertConfig(
            # NOTE(review): `n_embd` is a GPT-2 style name and is not a documented
            # BertConfig argument; it is kept exactly as before — presumably it is
            # only stored as an extra config attribute. Confirm before removing.
            n_embd=hidden_size,
            num_hidden_layers=num_layers,
            hidden_size=hidden_size,
            num_attention_heads=num_attention_heads,
            max_position_embeddings=max_seq_len,
            vocab_size=vocab_size,
        )
        self.model = BertLMHeadModel(config)

    def forward(self, input_ids, attention_mask):
        """Run the wrapped model and return the first element of its output."""
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, use_cache=True)
        # Only return lm_logits
        return outputs[0]
|
|
|
|
|
|
|
|
|
|
|
|
# @non_distributed_component_funcs.register(name="bert_")
|
|
|
|
def get_bert_components():
    """Return a ``(model_builder, data_gen)`` pair for the BERT test model.

    ``model_builder()`` constructs a ``BertLMModel`` with hidden size 8192,
    4 layers, 32 attention heads and a 1024-token vocabulary.
    ``data_gen(device="meta")`` returns a dict with ``input_ids`` (random
    integers in ``[0, vocab_size)``, shape ``(64, 64)``) and an all-ones
    ``attention_mask`` of the same shape, both on ``device``.
    """
    vocab_size = 1024
    seq_len = 64
    batch_size = 64

    def bert_model_builder():
        return BertLMModel(hidden_size=8192, num_layers=4, num_attention_heads=32, vocab_size=vocab_size)

    def bert_data_gen(device="meta"):
        input_ids = torch.randint(0, vocab_size, (batch_size, seq_len), device=device)
        attention_mask = torch.ones_like(input_ids, device=device)
        return dict(input_ids=input_ids, attention_mask=attention_mask)

    return bert_model_builder, bert_data_gen
|
|
|
|
|
|
|
|
|
|
|
|
# @non_distributed_component_funcs.register(name="gpt2_")
|
|
|
|
def get_gpt2_components():
    """Return a ``(model_builder, data_gen)`` pair for the GPT-2 test model.

    ``model_builder()`` constructs a ``GPTLMModel`` with hidden size 8192,
    2 layers, 32 attention heads and a 1024-token vocabulary.
    ``data_gen(device="meta")`` returns a dict with ``input_ids`` (random
    integers in ``[0, vocab_size)``, shape ``(64, 8)``) and an all-ones
    ``attention_mask`` of the same shape, both on ``device``.
    """
    vocab_size = 1024
    seq_len = 8
    batch_size = 64

    def gpt2_model_builder():
        return GPTLMModel(hidden_size=8192, num_layers=2, num_attention_heads=32, vocab_size=vocab_size)

    def gpt2_data_gen(device="meta"):
        input_ids = torch.randint(0, vocab_size, (batch_size, seq_len), device=device)
        attention_mask = torch.ones_like(input_ids, device=device)
        return dict(input_ids=input_ids, attention_mask=attention_mask)

    return gpt2_model_builder, gpt2_data_gen
|