[Gemini] add albert in test models. (#2075)

pull/2076/head^2
Jiarui Fang 2022-12-05 14:09:34 +08:00 committed by GitHub
parent 616ed91ecd
commit 40b7d55bf3
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 64 additions and 3 deletions

View File

@ -9,3 +9,5 @@ from . import (
simple_net,
)
from .utils import run_fwd_bwd
from . import albert # isort:skip

View File

@ -0,0 +1,59 @@
import torch
import transformers
from packaging import version
from transformers import AlbertConfig, AlbertForSequenceClassification
from .bert import get_bert_data_loader
from .registry import non_distributed_component_funcs
@non_distributed_component_funcs.register(name='albert')
def get_training_components():
    """Assemble the training components for a tiny ALBERT test model.

    The function is registered under the name ``'albert'`` in
    ``non_distributed_component_funcs``.

    Returns:
        A 5-tuple ``(model_builder, trainloader, testloader, optimizer_class,
        criterion)`` where

        * ``model_builder(checkpoint=False)`` builds a fresh model whose
          ``forward(input_ids, labels)`` returns element 0 of the Hugging Face
          output (the loss, per the adaptor's docstring);
        * ``trainloader`` / ``testloader`` are two separately constructed
          loaders created with identical arguments;
        * ``optimizer_class`` is ``torch.optim.Adam`` (the class, not an
          instance);
        * ``criterion`` is ``None``.
    """
    hidden_dim = 8
    num_head = 4
    sequence_length = 12
    num_layer = 2
    vocab_size = 32

    def albert_model_builder(checkpoint: bool = False):
        """Build the small ALBERT classifier used by the unit tests.

        Args:
            checkpoint: forwarded to ``AlbertConfig`` as
                ``gradient_checkpointing``.
        """
        config = AlbertConfig(
            vocab_size=vocab_size,
            gradient_checkpointing=checkpoint,
            hidden_size=hidden_dim,
            intermediate_size=hidden_dim * 4,
            num_attention_heads=num_head,
            max_position_embeddings=sequence_length,
            num_hidden_layers=num_layer,
            # Dropout is set to zero in both places.
            hidden_dropout_prob=0.,
            attention_probs_dropout_prob=0.,
        )
        print('building AlbertForSequenceClassification model')

        # Adapt Hugging Face's AlbertForSequenceClassification to the single
        # (data, label) -> loss calling interface used by the unit tests.
        class ModelAdaptor(AlbertForSequenceClassification):

            def forward(self, input_ids, labels):
                """
                inputs: data, label
                outputs: loss
                """
                return super().forward(input_ids=input_ids, labels=labels)[0]

        model = ModelAdaptor(config)
        # NOTE(review): an explicit `model.gradient_checkpointing_enable()`
        # call (guarded by transformers >= 4.11.0) was left disabled in the
        # original, so `checkpoint` only reaches the model through the config
        # above. Confirm whether ALBERT honours it before re-enabling.
        return model

    is_distributed = torch.distributed.is_initialized()

    def _make_loader():
        # Train and test loaders use the same settings. The `is_distrbuted`
        # keyword is spelled exactly as the original call passes it.
        return get_bert_data_loader(n_class=vocab_size,
                                    batch_size=2,
                                    total_samples=10000,
                                    sequence_length=sequence_length,
                                    is_distrbuted=is_distributed)

    trainloader = _make_loader()
    testloader = _make_loader()

    criterion = None
    return albert_model_builder, trainloader, testloader, torch.optim.Adam, criterion

View File

@ -22,7 +22,7 @@ def run_fwd_bwd(model, data, label, criterion, enable_autocast=False, dtype=torc
def test_runtime_mem_tracer():
test_models = ['gpt2', 'bert', 'simple_net', 'repeated_computed_layers', 'nested_model']
test_models = ['gpt2', 'bert', 'simple_net', 'repeated_computed_layers', 'nested_model', 'albert']
for model_name in test_models:
get_components_func = non_distributed_component_funcs.get_callable(model_name)

View File

@ -36,7 +36,7 @@ def check_grad(model: ZeroDDP, torch_model: torch.nn.Module):
@parameterize('placement_policy', ['cuda', 'cpu', 'auto', 'const'])
@parameterize('keep_gather', [False, True])
@parameterize('model_name', ['gpt2', 'bert'])
@parameterize('model_name', ['gpt2', 'bert', 'albert'])
@parameterize('use_grad_checkpoint', [False, True])
def exam_gpt_fwd_bwd(placement_policy, keep_gather, model_name: str, use_grad_checkpoint: bool = False):
set_seed(42)

View File

@ -27,7 +27,7 @@ from tests.test_tensor.common_utils import debug_print, set_seed
# this model is large enough to slice to chunks
TEST_MODELS = ['gpt2']
# these models are too small, all parameters in these models are compacted into one chunk
EXAMPLE_MODELS = ['hanging_param_model', 'bert', 'simple_net', 'nested_model', 'repeated_computed_layers']
EXAMPLE_MODELS = ['albert', 'hanging_param_model', 'bert', 'simple_net', 'nested_model', 'repeated_computed_layers']
def check_param(model: ZeroDDP, torch_model: torch.nn.Module):