diff --git a/tests/test_shardformer/test_model/test_shard_bloom.py b/tests/test_shardformer/test_model/test_shard_bloom.py
index d5a4ce083..145ccf97c 100644
--- a/tests/test_shardformer/test_model/test_shard_bloom.py
+++ b/tests/test_shardformer/test_model/test_shard_bloom.py
@@ -36,11 +36,14 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
 
     # check last hidden state & loss
     if stage_manager is None or stage_manager.is_last_stage():
-
+        if test_config['precision'] == 'fp32':
+            atol, rtol = 1e-5, 1e-3
+        else:
+            atol, rtol = 5e-3, 5e-3
         if org_model.__class__.__name__ == 'BloomModel':
-            check_output_hidden_state(org_output, sharded_output, stage_manager, atol=1e-5, rtol=1e-3)
+            check_output_hidden_state(org_output, sharded_output, stage_manager, atol=atol, rtol=rtol)
 
-        check_loss(org_loss, sharded_loss, atol=1e-6, rtol=1e-3)
+        check_loss(org_loss, sharded_loss, atol=atol, rtol=rtol)
 
     # unwrap model
     if org_model.__class__.__name__ == 'BloomModel':
@@ -54,14 +57,22 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
     row_layer_for_check = ['h[0].self_attention.query_key_value', 'word_embeddings']
     col_layer_for_check = ['h[0].self_attention.dense']
     if stage_manager is None or stage_manager.is_first_stage():
-        check_grad(bloom, sharded_bloom, row_layer_for_check, tp_group, atol=1e-6, rtol=1e-5, dim=0, verbose=False)
-        check_grad(bloom, sharded_bloom, col_layer_for_check, tp_group, atol=1e-6, rtol=1e-5, dim=1, verbose=False)
+        if test_config['precision'] == 'fp32':
+            atol, rtol = 1e-6, 1e-5
+        else:
+            atol, rtol = 5e-3, 5e-3
+        check_grad(bloom, sharded_bloom, row_layer_for_check, tp_group, atol=atol, rtol=rtol, dim=0, verbose=False)
+        check_grad(bloom, sharded_bloom, col_layer_for_check, tp_group, atol=atol, rtol=rtol, dim=1, verbose=False)
 
     # check weights after optimizer.step()
     org_optimizer.step()
     sharded_optimizer.step()
     if stage_manager is None or stage_manager.is_first_stage():
-        check_weight(bloom, sharded_bloom, col_layer_for_check, tp_group, atol=1e-4, rtol=1e-3, dim=1, verbose=False)
+        if test_config['precision'] == 'fp32':
+            atol, rtol = 1e-4, 1e-3
+        else:
+            atol, rtol = 5e-3, 5e-3
+        check_weight(bloom, sharded_bloom, col_layer_for_check, tp_group, atol=atol, rtol=rtol, dim=1, verbose=False)
 
     torch.cuda.empty_cache()
 
@@ -70,29 +81,29 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
     'tp_size': 2,
     'pp_size': 2,
     'num_microbatches': 4,
-    'enable_fused_normalization': True,
-    'use_lazy_init': True
+    'enable_all_optimization': True,
+    'use_lazy_init': True,
+    'precision': 'fp16',
+    'initial_scale': 1,
 }, {
     'tp_size': 1,
     'pp_size': 2,
     'num_microbatches': 4,
-    'enable_fused_normalization': False,
-    'use_lazy_init': False
+    'enable_all_optimization': False,
+    'use_lazy_init': False,
+    'precision': 'fp32',
 }, {
     'tp_size': 4,
     'pp_size': 1,
-    'enable_fused_normalization': True,
-    'use_lazy_init': False
+    'enable_all_optimization': True,
+    'use_lazy_init': False,
+    'precision': 'fp32',
 }])
 def run_bloom_test(test_config):
 
     # TODO: add test_config for TP+DP after supporting & debugging it
-    # {'tp_size': 2, 'pp_size': 1, 'enable_fused_normalization': True}
-
-    # TODO: add test_config for flash attention & jit operator after supporting
 
     sub_model_zoo = model_zoo.get_sub_registry('transformers_bloom')
-    test_config['precision'] = 'float'    # Do not use fp16/bf16 in testing
 
     for name, (model_fn, data_gen_fn, output_transform_fn, loss_fn, _) in sub_model_zoo.items():
         check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn, test_config)
diff --git a/tests/test_shardformer/test_model/test_shard_chatglm.py b/tests/test_shardformer/test_model/test_shard_chatglm.py
index 69e63ffc8..e9c74b300 100644
--- a/tests/test_shardformer/test_model/test_shard_chatglm.py
+++ b/tests/test_shardformer/test_model/test_shard_chatglm.py
@@ -37,11 +37,15 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
 
     # check last hidden state & loss
     if stage_manager is None or stage_manager.is_last_stage():
+        if test_config['precision'] == 'fp32':
+            atol, rtol = 1e-5, 1e-3
+        else:
+            atol, rtol = 5e-3, 5e-3
 
         if org_model.__class__.__name__ == 'ChatGLMModel':
-            check_output_hidden_state(org_output, sharded_output, stage_manager, atol=1e-5, rtol=1e-3, dim=1)
+            check_output_hidden_state(org_output, sharded_output, stage_manager, atol=atol, rtol=rtol, dim=1)
 
-        check_loss(org_loss, sharded_loss, atol=1e-6, rtol=1e-3)
+        check_loss(org_loss, sharded_loss, atol=atol, rtol=rtol)
 
     # unwrap model
     if org_model.__class__.__name__ == 'ChatGLMModel':
@@ -55,12 +59,16 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
     row_layer_for_check = ['encoder.layers[0].self_attention.query_key_value', 'embedding.word_embeddings']
     col_layer_for_check = ['encoder.layers[0].self_attention.dense']
     if stage_manager is None or stage_manager.is_first_stage():
+        if test_config['precision'] == 'fp32':
+            atol, rtol = 1e-6, 1e-3
+        else:
+            atol, rtol = 5e-3, 5e-3
         check_grad(chatglm_model,
                    shard_chatglm_model,
                    row_layer_for_check,
                    tp_group,
-                   atol=1e-6,
-                   rtol=1e-3,
+                   atol=atol,
+                   rtol=rtol,
                    dim=0,
                    verbose=False)
 
@@ -68,8 +76,8 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
                    shard_chatglm_model,
                    col_layer_for_check,
                    tp_group,
-                   atol=1e-6,
-                   rtol=1e-3,
+                   atol=atol,
+                   rtol=rtol,
                    dim=1,
                    verbose=False)
 
@@ -77,12 +85,16 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
     org_optimizer.step()
     sharded_optimizer.step()
     if stage_manager is None or stage_manager.is_first_stage():
+        if test_config['precision'] == 'fp32':
+            atol, rtol = 1e-4, 1e-3
+        else:
+            atol, rtol = 5e-3, 5e-3
         check_weight(chatglm_model,
                      shard_chatglm_model,
                      col_layer_for_check,
                      tp_group,
-                     atol=1e-4,
-                     rtol=1e-3,
+                     atol=atol,
+                     rtol=rtol,
                      dim=1,
                      verbose=False)
 
@@ -93,29 +105,29 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
     'tp_size': 2,
     'pp_size': 2,
     'num_microbatches': 4,
-    'enable_fused_normalization': True,
-    'use_lazy_init': True
+    'enable_all_optimization': True,
+    'use_lazy_init': True,
+    'precision': 'fp16',
+    'initial_scale': 1,
 }, {
     'tp_size': 1,
     'pp_size': 2,
     'num_microbatches': 4,
-    'enable_fused_normalization': False,
-    'use_lazy_init': False
+    'enable_all_optimization': False,
+    'use_lazy_init': False,
+    'precision': 'fp32',
 }, {
     'tp_size': 4,
     'pp_size': 1,
-    'enable_fused_normalization': True,
-    'use_lazy_init': False
+    'enable_all_optimization': True,
+    'use_lazy_init': False,
+    'precision': 'fp32',
 }])
 def run_chatglm_test(test_config):
 
     # TODO: add test_config for TP+DP after supporting & debugging it
-    # {'tp_size': 2, 'pp_size': 1, 'enable_fused_normalization': True}
-
-    # TODO: add test_config for flash attention & jit operator after supporting
 
     sub_model_zoo = model_zoo.get_sub_registry('transformers_chatglm')
-    test_config['precision'] = 'float'    # Do not use fp16/bf16 in testing
 
     for name, (model_fn, data_gen_fn, output_transform_fn, loss_fn, _) in sub_model_zoo.items():
         check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn, test_config)
diff --git a/tests/test_shardformer/test_model/test_shard_gpt2.py b/tests/test_shardformer/test_model/test_shard_gpt2.py
index 274cfaa39..8b7a6bf29 100644
--- a/tests/test_shardformer/test_model/test_shard_gpt2.py
+++ b/tests/test_shardformer/test_model/test_shard_gpt2.py
@@ -63,22 +63,22 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
     row_layer_for_check = ['wte', 'h[0].mlp.c_proj']
 
     # check grad
-    if test_config['precision'] == 'fp32':
-        atol, rtol = 1e-4, 1e-3
-    else:
-        atol, rtol = 5e-3, 5e-3
     if stage_manager is None or stage_manager.is_first_stage():
+        if test_config['precision'] == 'fp32':
+            atol, rtol = 1e-4, 1e-3
+        else:
+            atol, rtol = 5e-3, 5e-3
         check_grad(gpt2, sharded_gpt2, col_layer_for_check, tp_group, atol=atol, rtol=rtol, dim=1, verbose=False)
         check_grad(gpt2, sharded_gpt2, row_layer_for_check, tp_group, atol=atol, rtol=rtol, dim=0, verbose=False)
 
     # check weights after optimizer.step()
     org_optimizer.step()
     sharded_optimizer.step()
-    if test_config['precision'] == 'fp32':
-        atol, rtol = 5e-3, 1e-3
-    else:
-        atol, rtol = 5e-3, 5e-3
     if stage_manager is None or stage_manager.is_first_stage():
+        if test_config['precision'] == 'fp32':
+            atol, rtol = 5e-3, 1e-3
+        else:
+            atol, rtol = 5e-3, 5e-3
         check_weight(gpt2, sharded_gpt2, col_layer_for_check, tp_group, atol=atol, rtol=rtol, dim=1, verbose=False)
 
     torch.cuda.empty_cache()
diff --git a/tests/test_shardformer/test_model/test_shard_llama.py b/tests/test_shardformer/test_model/test_shard_llama.py
index c5f8d22f1..fa4ee43e3 100644
--- a/tests/test_shardformer/test_model/test_shard_llama.py
+++ b/tests/test_shardformer/test_model/test_shard_llama.py
@@ -41,11 +41,15 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
 
     # check last hidden state & loss
     if stage_manager is None or stage_manager.is_last_stage():
+        if test_config['precision'] == 'fp32':
+            atol, rtol = 1e-5, 1e-3
+        else:
+            atol, rtol = 5e-3, 5e-3
 
         if org_model.__class__.__name__ == 'LlamaModel':
-            check_output_hidden_state(org_output, sharded_output, stage_manager, atol=1e-5, rtol=1e-3)
+            check_output_hidden_state(org_output, sharded_output, stage_manager, atol=atol, rtol=rtol)
 
-        check_loss(org_loss, sharded_loss, atol=1e-6, rtol=1e-3)
+        check_loss(org_loss, sharded_loss, atol=atol, rtol=rtol)
 
     # unwrap model
     if org_model.__class__.__name__ == 'LlamaModel':
@@ -59,20 +63,24 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
     row_layer_for_check = ['layers[0].self_attn.q_proj', 'embed_tokens']
     col_layer_for_check = ['layers[0].self_attn.o_proj']
     if stage_manager is None or stage_manager.is_first_stage():
+        if test_config['precision'] == 'fp32':
+            atol, rtol = 1e-6, 1e-4
+        else:
+            atol, rtol = 5e-3, 5e-3
         check_grad(llama_model,
                    shard_llama_model,
                    row_layer_for_check,
                    tp_group,
-                   atol=1e-6,
-                   rtol=1e-4,
+                   atol=atol,
+                   rtol=rtol,
                    dim=0,
                    verbose=False)
         check_grad(llama_model,
                    shard_llama_model,
                    col_layer_for_check,
                    tp_group,
-                   atol=1e-6,
-                   rtol=1e-4,
+                   atol=atol,
+                   rtol=rtol,
                    dim=1,
                    verbose=False)
 
@@ -80,12 +88,16 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
     org_optimizer.step()
     sharded_optimizer.step()
     if stage_manager is None or stage_manager.is_first_stage():
+        if test_config['precision'] == 'fp32':
+            atol, rtol = 1e-4, 1e-3
+        else:
+            atol, rtol = 5e-3, 5e-3
         check_weight(llama_model,
                      shard_llama_model,
                      col_layer_for_check,
                      tp_group,
-                     atol=1e-4,
-                     rtol=1e-3,
+                     atol=atol,
+                     rtol=rtol,
                      dim=1,
                      verbose=False)
 
@@ -96,33 +108,34 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
     'tp_size': 2,
     'pp_size': 2,
     'num_microbatches': 2,
-    'enable_fused_normalization': True,
-    'use_lazy_init': True
+    'enable_all_optimization': True,
+    'use_lazy_init': True,
+    'precision': 'fp16',
+    'initial_scale': 1,
 }, {
     'tp_size': 1,
     'pp_size': 2,
     'num_microbatches': 4,
-    'use_lazy_init': False
+    'use_lazy_init': False,
+    'precision': 'fp32',
 }, {
     'tp_size': 4,
     'pp_size': 1,
-    'enable_fused_normalization': True,
-    'use_lazy_init': False
+    'enable_all_optimization': True,
+    'use_lazy_init': False,
+    'precision': 'fp32',
 }, {
     'tp_size': 1,
     'pp_size': 4,
     'num_microbatches': 4,
-    'use_lazy_init': False
+    'use_lazy_init': False,
+    'precision': 'fp32',
 }])
 def run_llama_test(test_config):
 
     # TODO: add test_config for TP+DP after supporting & debugging it
-    # {'tp_size': 2, 'pp_size': 1, 'enable_fused_normalization': True}
-
-    # TODO: add test_config for flash attention & jit operator after supporting
 
     sub_model_zoo = model_zoo.get_sub_registry('transformers_llama')
-    test_config['precision'] = 'float'    # Do not use fp16/bf16 in testing
 
     for name, (model_fn, data_gen_fn, output_transform_fn, loss_fn, _) in sub_model_zoo.items():
         check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn, test_config)
diff --git a/tests/test_shardformer/test_model/test_shard_opt.py b/tests/test_shardformer/test_model/test_shard_opt.py
index d8fa8104b..403c3e75f 100644
--- a/tests/test_shardformer/test_model/test_shard_opt.py
+++ b/tests/test_shardformer/test_model/test_shard_opt.py
@@ -41,11 +41,14 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
 
     # check last hidden state & loss
     if stage_manager is None or stage_manager.is_last_stage():
-
+        if test_config['precision'] == 'fp32':
+            atol, rtol = 1e-5, 1e-3
+        else:
+            atol, rtol = 5e-3, 5e-3
         if org_model.__class__.__name__ == 'OPTModel':
-            check_output_hidden_state(org_output, sharded_output, stage_manager, atol=1e-5, rtol=1e-3)
+            check_output_hidden_state(org_output, sharded_output, stage_manager, atol=atol, rtol=rtol)
 
-        check_loss(org_loss, sharded_loss, atol=1e-6, rtol=1e-3)
+        check_loss(org_loss, sharded_loss, atol=atol, rtol=rtol)
 
     # unwrap model
     if org_model.__class__.__name__ == 'OPTModel':
@@ -56,23 +59,27 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
     shard_opt_model = sharded_model.unwrap().model
 
     # check grad
-    row_layer_for_check = ['decoder.layers[0].self_attn.q_proj', 'decoder.embed_tokens']
+    row_layer_for_check = ['decoder.layers[0].self_attn.q_proj', 'decoder.embed_tokens']    # 'decoder.embed_tokens'
     col_layer_for_check = ['decoder.layers[0].self_attn.out_proj']
     if stage_manager is None or stage_manager.is_first_stage():
+        if test_config['precision'] == 'fp32':
+            atol, rtol = 1e-6, 1e-3
+        else:
+            atol, rtol = 3e-2, 3e-2
         check_grad(opt_model,
                    shard_opt_model,
                    row_layer_for_check,
                    tp_group,
-                   atol=1e-6,
-                   rtol=1e-3,
+                   atol=atol,
+                   rtol=rtol,
                    dim=0,
                    verbose=False)
         check_grad(opt_model,
                    shard_opt_model,
                    col_layer_for_check,
                    tp_group,
-                   atol=1e-6,
-                   rtol=1e-3,
+                   atol=atol,
+                   rtol=rtol,
                    dim=1,
                    verbose=False)
 
@@ -80,12 +87,16 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
     org_optimizer.step()
     sharded_optimizer.step()
     if stage_manager is None or stage_manager.is_first_stage():
+        if test_config['precision'] == 'fp32':
+            atol, rtol = 1e-3, 1e-3
+        else:
+            atol, rtol = 5e-3, 5e-3
         check_weight(opt_model,
                      shard_opt_model,
                      col_layer_for_check,
                      tp_group,
-                     atol=1e-3,
-                     rtol=1e-3,
+                     atol=atol,
+                     rtol=rtol,
                      dim=1,
                      verbose=False)
 
@@ -96,29 +107,29 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
     'tp_size': 2,
     'pp_size': 2,
     'num_microbatches': 4,
-    'enable_fused_normalization': True,
-    'use_lazy_init': True
+    'enable_all_optimization': True,
+    'use_lazy_init': True,
+    'precision': 'fp16',
+    'initial_scale': 1,
 }, {
     'tp_size': 1,
     'pp_size': 2,
     'num_microbatches': 4,
-    'enable_fused_normalization': False,
-    'use_lazy_init': False
+    'enable_all_optimization': False,
+    'use_lazy_init': False,
+    'precision': 'fp32',
 }, {
     'tp_size': 4,
     'pp_size': 1,
-    'enable_fused_normalization': True,
-    'use_lazy_init': False
+    'enable_all_optimization': True,
+    'use_lazy_init': False,
+    'precision': 'fp32',
 }])
 def run_opt_test(test_config):
 
     # TODO: add test_config for TP+DP after supporting & debugging it
-    # {'tp_size': 2, 'pp_size': 1, 'enable_fused_normalization': True}
-
-    # TODO: add test_config for flash attention & jit operator after supporting
 
     sub_model_zoo = model_zoo.get_sub_registry('transformers_opt')
-    test_config['precision'] = 'float'    # Do not use fp16/bf16 in testing
 
     for name, (model_fn, data_gen_fn, output_transform_fn, loss_fn, _) in sub_model_zoo.items():
         check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn, test_config)
diff --git a/tests/test_shardformer/test_model/test_shard_vit.py b/tests/test_shardformer/test_model/test_shard_vit.py
index 8a78d7c2b..919bceffc 100644
--- a/tests/test_shardformer/test_model/test_shard_vit.py
+++ b/tests/test_shardformer/test_model/test_shard_vit.py
@@ -37,11 +37,15 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
 
     # check last hidden state & loss
     if stage_manager is None or stage_manager.is_last_stage():
+        if test_config['precision'] == 'fp32':
+            atol, rtol = 1e-5, 1e-3
+        else:
+            atol, rtol = 5e-3, 5e-3
 
         if org_model.__class__.__name__ == 'ViTModel':
-            check_output_hidden_state(org_output, sharded_output, stage_manager, atol=1e-5, rtol=1e-3)
+            check_output_hidden_state(org_output, sharded_output, stage_manager, atol=atol, rtol=rtol)
 
-        check_loss(org_loss, sharded_loss, atol=1e-6, rtol=1e-3)
+        check_loss(org_loss, sharded_loss, atol=atol, rtol=rtol)
 
     # unwrap model
     if org_model.__class__.__name__ == 'ViTModel':
@@ -55,20 +59,24 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
     row_layer_for_check = ['encoder.layer[0].attention.attention.query', 'embeddings.patch_embeddings.projection']
     col_layer_for_check = ['encoder.layer[0].attention.output.dense']
     if stage_manager is None or stage_manager.is_first_stage():
+        if test_config['precision'] == 'fp32':
+            atol, rtol = 1e-5, 1e-3
+        else:
+            atol, rtol = 5e-3, 5e-3
         check_grad(vit_model,
                    shard_vit_model,
                    row_layer_for_check,
                    tp_group,
-                   atol=1e-5,
-                   rtol=1e-3,
+                   atol=atol,
+                   rtol=rtol,
                    dim=0,
                    verbose=False)
         check_grad(vit_model,
                    shard_vit_model,
                    col_layer_for_check,
                    tp_group,
-                   atol=1e-5,
-                   rtol=1e-3,
+                   atol=atol,
+                   rtol=rtol,
                    dim=1,
                    verbose=False)
 
@@ -76,12 +84,16 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
     org_optimizer.step()
     sharded_optimizer.step()
     if stage_manager is None or stage_manager.is_first_stage():
+        if test_config['precision'] == 'fp32':
+            atol, rtol = 5e-3, 1e-3
+        else:
+            atol, rtol = 5e-3, 5e-3
         check_weight(vit_model,
                      shard_vit_model,
                      col_layer_for_check,
                      tp_group,
-                     atol=5e-3,
-                     rtol=1e-3,
+                     atol=atol,
+                     rtol=rtol,
                      dim=1,
                      verbose=False)
 
@@ -92,30 +104,30 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
     'tp_size': 2,
     'pp_size': 2,
     'num_microbatches': 4,
-    'enable_fused_normalization': True,
-    'use_lazy_init': False
+    'enable_all_optimization': True,
+    'use_lazy_init': False,
+    'precision': 'fp16',
+    'initial_scale': 1,
 }, {
     'tp_size': 1,
     'pp_size': 2,
     'num_microbatches': 4,
-    'enable_fused_normalization': False,
-    'use_lazy_init': False
+    'enable_all_optimization': False,
+    'use_lazy_init': False,
+    'precision': 'fp32',
 }, {
     'tp_size': 4,
     'pp_size': 1,
-    'enable_fused_normalization': True,
-    'use_lazy_init': False
+    'enable_all_optimization': True,
+    'use_lazy_init': False,
+    'precision': 'fp32',
 }])
 def run_vit_test(test_config):
 
     # TODO: add test_config for TP+DP after supporting & debugging it
-    # {'tp_size': 2, 'pp_size': 1, 'enable_fused_normalization': True}
-
-    # TODO: add test_config for flash attention & jit operator after supporting
 
     # TODO: fix bug when settign lazy_init for Conv2D Layers in ViT models
     sub_model_zoo = model_zoo.get_sub_registry('transformers_vit')
-    test_config['precision'] = 'float'    # Do not use fp16/bf16 in testing
 
     for name, (model_fn, data_gen_fn, output_transform_fn, loss_fn, _) in sub_model_zoo.items():
        check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn, test_config)
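Every file in this patch repeats the same tolerance-selection pattern: tight atol/rtol values when the test runs in fp32, looser ones (typically 5e-3) when it runs in fp16. The sketch below is illustrative only and is not part of the patch; the helper name get_tolerance and its defaults are hypothetical, written to mirror those branches.

# Illustrative sketch only; not part of the patch above.
# `get_tolerance` is a hypothetical helper mirroring the repeated
# "if test_config['precision'] == 'fp32': ... else: ..." blocks.
from typing import Dict, Tuple


def get_tolerance(test_config: Dict,
                  fp32_tols: Tuple[float, float],
                  low_prec_tols: Tuple[float, float] = (5e-3, 5e-3)) -> Tuple[float, float]:
    """Return (atol, rtol): tight tolerances for fp32, looser ones for fp16/bf16."""
    if test_config['precision'] == 'fp32':
        return fp32_tols
    return low_prec_tols


# Example usage, mirroring the bloom gradient check in the patch:
#   atol, rtol = get_tolerance(test_config, fp32_tols=(1e-6, 1e-5))
#   check_grad(bloom, sharded_bloom, row_layer_for_check, tp_group,
#              atol=atol, rtol=rtol, dim=0, verbose=False)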