ColossalAI/tests/test_shardformer/test_model/test_shard_whisper.py

import pytest
import torch

import colossalai
from colossalai.logging import disable_existing_loggers
from colossalai.shardformer.layer.utils import Randomizer
from colossalai.tensor.d_tensor.api import clear_layout_converter
from colossalai.testing import clear_cache_before_run, parameterize, rerun_if_address_is_in_use, spawn
from tests.kit.model_zoo import model_zoo
from tests.test_shardformer.test_model._utils import (
    build_model_from_hybrid_plugin,
    check_all_grad_tensors,
    check_loss,
    check_output_hidden_state,
    check_weight,
    get_grad_tensors_for_check,
    run_forward_backward_with_hybrid_plugin,
)
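

# Build an unsharded reference model and a sharded model from the same model_fn via the
# hybrid parallel plugin, run one forward/backward pass on both, then compare loss,
# hidden states, gradients, and post-step weights within precision-dependent tolerances.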
def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn, test_config):
    # check forward
    org_model, org_optimizer, sharded_model, sharded_optimizer, criterion, booster = build_model_from_hybrid_plugin(
        model_fn, loss_fn, test_config
    )

    org_loss, org_output, sharded_loss, sharded_output = run_forward_backward_with_hybrid_plugin(
        org_model, sharded_model, sharded_optimizer, data_gen_fn, output_transform_fn, criterion, booster
    )

    stage_manager = booster.plugin.stage_manager
    tp_group = booster.plugin.tp_group

    # unwrap the model: WhisperForConditionalGeneration wraps the base WhisperModel in `.model`,
    # so compare the inner modules; the other variants are compared directly.
    if org_model.__class__.__name__ == "WhisperForConditionalGeneration":
        whisper = org_model.model
        sharded_whisper = sharded_model.unwrap().model
    else:
        whisper = org_model
        sharded_whisper = sharded_model.unwrap()
    # check grad
    if org_model.__class__.__name__ == "WhisperForAudioClassification":
        col_layer_for_check = ["encoder.layers[0].self_attn.q_proj"]
        row_layer_for_check = ["encoder.layers[0].self_attn.out_proj"]
    else:
        col_layer_for_check = [
            "encoder.layers[0].self_attn.q_proj",
            # "decoder.layers[0].self_attn.q_proj",
        ]
        row_layer_for_check = [
            "encoder.layers[0].self_attn.out_proj",
            # "decoder.layers[0].self_attn.out_proj",
        ]
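
    # q_proj is column-parallel (weight sharded along dim 0) and out_proj is row-parallel
    # (weight sharded along dim 1), hence the dim arguments in the grad/weight checks below.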
    # Save gradient tensors for comparison between the original model and the sharded model before the optimizer step.
    grads_to_check = {}
    if test_config["precision"] == "fp32":
        atol, rtol = 2e-4, 2e-4
    else:
        atol, rtol = 5e-3, 5e-3
    if stage_manager is None or stage_manager.is_first_stage():
        row_layer_grads = get_grad_tensors_for_check(
            whisper, sharded_whisper, row_layer_for_check, tp_group, atol=atol, rtol=rtol, dim=1
        )
        col_layer_grads = get_grad_tensors_for_check(
            whisper, sharded_whisper, col_layer_for_check, tp_group, atol=atol, rtol=rtol, dim=0
        )
        grads_to_check.update(col_layer_grads)
        grads_to_check.update(row_layer_grads)
    # optimizer executes step
    org_optimizer.step()
    sharded_optimizer.step()

    # check last hidden state & loss
    if stage_manager is None or stage_manager.is_last_stage():
        if test_config["precision"] == "fp32":
            atol, rtol = 2e-4, 2e-4
        else:
            atol, rtol = 5e-3, 5e-3

        if org_model.__class__.__name__ == "WhisperModel":
            check_output_hidden_state(org_output, sharded_output, stage_manager, atol=atol, rtol=rtol)

        check_loss(org_loss, sharded_loss, atol=atol, rtol=rtol)

    # check weights
    if test_config["precision"] == "fp32":
        atol, rtol = 1e-3, 1e-3
    else:
        atol, rtol = 5e-3, 5e-3
    if stage_manager is None or stage_manager.is_first_stage():
        check_weight(
            whisper, sharded_whisper, row_layer_for_check, tp_group, atol=atol, rtol=rtol, dim=1, verbose=False
        )
        check_weight(
            whisper, sharded_whisper, col_layer_for_check, tp_group, atol=atol, rtol=rtol, dim=0, verbose=False
        )

    # check grads
    check_all_grad_tensors(grads_to_check)

    torch.cuda.empty_cache()


# TODO: fix WhisperForConditionalGeneration to enable the jit fused operator
# TODO (jianghai): fix fp16
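# The configs below cover hybrid TP+PP (2x2), PP-only (pp=2 and pp=4), and TP-only (tp=4),
# all in fp32 since fp16 is not supported for Whisper yet.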
@parameterize(
    "test_config",
    [
        {
            "tp_size": 2,
            "pp_size": 2,
            "num_microbatches": 2,
            "enable_metadata_cache": False,
            "enable_all_optimization": True,
"use_lazy_init": False,
"precision": "fp32",
"initial_scale": 1,
},
{
"tp_size": 1,
"pp_size": 2,
"num_microbatches": 4,
"enable_metadata_cache": False,
"use_lazy_init": False,
"precision": "fp32",
"initial_scale": 1,
},
{
"tp_size": 4,
"pp_size": 1,
"enable_all_optimization": True,
"use_lazy_init": False,
"precision": "fp32",
},
{
"tp_size": 1,
"pp_size": 4,
"num_microbatches": 4,
"enable_metadata_cache": False,
"use_lazy_init": False,
"precision": "fp32",
},
        # Whisper does not support fp16 for now.
    ],
)
def run_whisper_test(test_config):
    sub_model_zoo = model_zoo.get_sub_registry("transformers_whisper")
    for name, (model_fn, data_gen_fn, output_transform_fn, loss_fn, _) in sub_model_zoo.items():
        if test_config["pp_size"] > 2 and name == "transformers_whisper_for_audio_classification":
            continue
        check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn, test_config)

    clear_layout_converter()
    Randomizer.reset_index()
    torch.cuda.empty_cache()
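

# The 3D configs below keep tp=2 and pp=2 but run on 8 ranks (see test_whisper_3d), so
# the remaining ranks form a data-parallel group on top of tensor and pipeline parallelism.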
@parameterize(
    "test_config",
    [
        {
            "tp_size": 2,
            "pp_size": 2,
            "num_microbatches": 4,
            "enable_metadata_cache": False,
            "enable_all_optimization": False,
            "use_lazy_init": False,
            "precision": "fp32",
            "initial_scale": 1,
        },
        {
            "tp_size": 2,
            "pp_size": 2,
            "num_microbatches": 2,
            "enable_metadata_cache": False,
            "enable_all_optimization": False,
            "use_lazy_init": False,
            "precision": "fp32",
            "initial_scale": 1,
        },
    ],
)
def run_whisper_3d_test(test_config):
    sub_model_zoo = model_zoo.get_sub_registry("transformers_whisper")
    for name, (model_fn, data_gen_fn, output_transform_fn, loss_fn, _) in sub_model_zoo.items():
        check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn, test_config)

    clear_layout_converter()
    torch.cuda.empty_cache()


def check_whisper(rank, world_size, port):
    disable_existing_loggers()
    colossalai.launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
    run_whisper_test()


def check_whisper_3d(rank, world_size, port):
    disable_existing_loggers()
    colossalai.launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
    run_whisper_3d_test()


@pytest.mark.dist
@rerun_if_address_is_in_use()
@clear_cache_before_run()
def test_whisper():
    spawn(check_whisper, 4)


@pytest.mark.largedist
@rerun_if_address_is_in_use()
@clear_cache_before_run()
def test_whisper_3d():
    spawn(check_whisper_3d, 8)


if __name__ == "__main__":
    test_whisper()
    test_whisper_3d()