ColossalAI/tests/test_pipeline/test_policy/test_bert_model.py

import pytest
import torch
import torch.distributed as dist
from transformers.models.bert.modeling_bert import BertModel

import colossalai
from colossalai.cluster import ProcessGroupMesh
from colossalai.pipeline.policy.bert import BertModelPolicy, bert_model_forward
from colossalai.pipeline.stage_manager import PipelineStageManager
from colossalai.testing import rerun_if_address_is_in_use, spawn


def check_bert_model_forward():
    """Run the pipeline-aware BERT forward on this rank and check the output shape.

    Builds a 2 x 2 process-group mesh (so it expects to be called after the
    distributed environment has been launched with 4 processes) and calls
    ``bert_model_forward`` with inputs that depend on the pipeline stage:

    * stage 0 is fed token ids and must return an object whose
      ``'hidden_states'`` entry has shape ``(batch_size, seq_len, hidden_size)``;
    * any other stage is fed hidden states and must return an object whose
      first element has that same shape.
    """
    model = BertModel.from_pretrained('bert-base-uncased')

    # Mesh layout: axis 0 is the data-parallel axis, axis 1 the pipeline axis.
    PP_DIM = 1
    DP_SIZE, PP_SIZE = 2, 2
    pg_mesh = ProcessGroupMesh(DP_SIZE, PP_SIZE)
    stage_manager = PipelineStageManager(pg_mesh, PP_DIM)

    batch_size, seq_len = 2, 3
    hidden_size = model.config.hidden_size    # 768 for bert-base-uncased
    expected_shape = (batch_size, seq_len, hidden_size)

    if stage_manager.stage == 0:
        # First stage: consumes token ids and hands hidden states to the next stage.
        input_ids = torch.randint(0, 1000, (batch_size, seq_len))
        attention_mask = torch.ones_like(input_ids)
        output = bert_model_forward(self=model,
                                    input_ids=input_ids,
                                    attention_mask=attention_mask,
                                    stage_manager=stage_manager)
        print(output['hidden_states'].shape)
        assert output['hidden_states'].shape == expected_shape
        print('start the training')
    else:
        # Later stage: random values stand in for the hidden states a previous
        # stage would have produced; only the output shape is checked.
        hidden_states = torch.randint(0, 1000, expected_shape).to(torch.float32)
        attention_mask = torch.ones((batch_size, seq_len))
        output = bert_model_forward(self=model,
                                    hidden_states=hidden_states,
                                    attention_mask=attention_mask,
                                    stage_manager=stage_manager)
        print(output[0].shape)
        assert output[0].shape == expected_shape
        print('end the training')
        print(output)

    # NOTE(review): the original kept a disabled check on the second output,
    # `assert output[1].shape == (2, 768)`; confirm whether it should be restored.


def check_bert_model_policy():
    """Check how ``BertModelPolicy`` distributes BERT encoder layers over pipeline stages.

    Builds a 2 x 2 process-group mesh (so it expects to be called after the
    distributed environment has been launched with 4 processes), creates the
    policy for the encoder of ``bert-base-uncased`` and asserts that the layers
    are split as ``[6, 6]`` over the two pipeline stages. The layers held by the
    current stage are printed for manual inspection.
    """
    model = BertModel.from_pretrained('bert-base-uncased')

    # Mesh layout: axis 0 is the data-parallel axis, axis 1 the pipeline axis.
    PP_DIM = 1
    DP_SIZE, PP_SIZE = 2, 2
    pg_mesh = ProcessGroupMesh(DP_SIZE, PP_SIZE)
    stage_manager = PipelineStageManager(pg_mesh, PP_DIM)

    model_policy = BertModelPolicy(stage_manager, len(model.encoder.layer))
    assert model_policy.layers_per_stage == [6, 6]

    for layer in model_policy.get_hold_layers(model):
        print(layer)


def run_dist_model(rank, world_size, port):
    """Per-process entry point: launch ColossalAI on localhost, then run the forward check.

    Args:
        rank: rank of the calling process, forwarded to ``colossalai.launch``.
        world_size: total number of processes, forwarded to ``colossalai.launch``.
        port: port forwarded to ``colossalai.launch``.
    """
    colossalai.launch(
        config={},
        rank=rank,
        world_size=world_size,
        port=port,
        host='localhost',
    )
    check_bert_model_forward()


def run_dist_policy(rank, world_size, port):
    """Per-process entry point: launch ColossalAI on localhost, then run the policy check.

    Args:
        rank: rank of the calling process, forwarded to ``colossalai.launch``.
        world_size: total number of processes, forwarded to ``colossalai.launch``.
        port: port forwarded to ``colossalai.launch``.
    """
    colossalai.launch(
        config={},
        rank=rank,
        world_size=world_size,
        port=port,
        host='localhost',
    )
    check_bert_model_policy()


@pytest.mark.dist
@rerun_if_address_is_in_use()
def test_bert_model_forward():
    """Spawn four processes that each run the pipeline BERT forward check."""
    # The check builds a 2 x 2 process-group mesh, hence four processes.
    num_processes = 4
    spawn(run_dist_model, num_processes)


@pytest.mark.dist
@rerun_if_address_is_in_use()
def test_bert_model_policy():
    """Spawn four processes that each run the BERT pipeline policy check."""
    # The check builds a 2 x 2 process-group mesh, hence four processes.
    num_processes = 4
    spawn(run_dist_policy, num_processes)


if __name__ == "__main__":
    # Run both checks when the file is executed directly as a script:
    # the BERT model forward test and the BERT model policy test.
    test_bert_model_forward()
    test_bert_model_policy()
[pipeline]add pipeline policy and bert forward (#4130) * add pipeline policy and bert forward to be done * add bertmodel pipeline forward and make tests * add Bert_Policy and test for policy * update formatting * update formatting * update the code * fix bugs * fix name confilt 2023-07-04 05:46:16 +00:00			`import pytest`
			`import torch`
			`import torch.distributed as dist`
			`from transformers.models.bert.modeling_bert import BertModel`

			`import colossalai`
			`from colossalai.cluster import ProcessGroupMesh`
			`from colossalai.pipeline.policy.bert import BertModelPolicy, bert_model_forward`
			`from colossalai.pipeline.stage_manager import PipelineStageManager`
			`from colossalai.testing import rerun_if_address_is_in_use, spawn`


			`def check_bert_model_forward():`
			`model = BertModel.from_pretrained('bert-base-uncased')`
			`DP_DIM, PP_DIM = 0, 1`
			`DP_SIZE, PP_SIZE = 2, 2`
			`RANK_TO_COORDINATE = {`
			`0: (0, 0),`
			`1: (0, 1),`
			`2: (1, 0),`
			`3: (1, 1),`
			`}`
			`PP_RANKS_IN_GROUP = {`
			`0: [0, 1],`
			`1: [0, 1],`
			`2: [2, 3],`
			`3: [2, 3],`
			`}`
			`pg_mesh = ProcessGroupMesh(DP_SIZE, PP_SIZE)`
[pipeline] build bloom model and policy , revise the base class of policy (#4161) * add pipeline policy and bert forward to be done * add bertmodel pipeline forward and make tests * add Bert_Policy and test for policy * update formatting * update formatting * update the code * fix bugs * fix name confilt * add bloom model and policy ,revise the base class of policy * revise * revision * add bert_for_pretraining 2023-07-05 02:52:53 +00:00
			`# print(pg_mesh)`
[pipeline]add pipeline policy and bert forward (#4130) * add pipeline policy and bert forward to be done * add bertmodel pipeline forward and make tests * add Bert_Policy and test for policy * update formatting * update formatting * update the code * fix bugs * fix name confilt 2023-07-04 05:46:16 +00:00
			`stage_manager = PipelineStageManager(pg_mesh, PP_DIM)`
			`rank = dist.get_rank()`
			`# print(rank)`

			`x = torch.randint(0, 1000, (2, 3))`
			`hidden_states = torch.randint(0, 1000, (2, 3, 768)).to(torch.float32)`
			`if stage_manager.stage == 0:`
			`attention_mask = torch.ones_like(x)`
			`output = bert_model_forward(self=model, input_ids=x, attention_mask=attention_mask, stage_manager=stage_manager)`
[pipeline] add bert_for_pretraining bert_lmhead forward and policy (#4172) * add pipeline policy and bert forward to be done * add bertmodel pipeline forward and make tests * add Bert_Policy and test for policy * update formatting * update formatting * update the code * fix bugs * fix name confilt * add bloom model and policy ,revise the base class of policy * revise * revision * add bert_for_pretraining * add bert_for_pretraining forward and policy * fix typos * cancel warning * change the imediate output to default dict * change the default output of get_shared_params 2023-07-06 06:49:10 +00:00			`print(output['hidden_states'].shape)`
			`assert output['hidden_states'].shape == (2, 3, 768)`
[pipeline]add pipeline policy and bert forward (#4130) * add pipeline policy and bert forward to be done * add bertmodel pipeline forward and make tests * add Bert_Policy and test for policy * update formatting * update formatting * update the code * fix bugs * fix name confilt 2023-07-04 05:46:16 +00:00			`print('start the training')`
			`else:`
[pipeline] add bert_for_pretraining bert_lmhead forward and policy (#4172) * add pipeline policy and bert forward to be done * add bertmodel pipeline forward and make tests * add Bert_Policy and test for policy * update formatting * update formatting * update the code * fix bugs * fix name confilt * add bloom model and policy ,revise the base class of policy * revise * revision * add bert_for_pretraining * add bert_for_pretraining forward and policy * fix typos * cancel warning * change the imediate output to default dict * change the default output of get_shared_params 2023-07-06 06:49:10 +00:00			`attention_mask = torch.ones((2, 3))`
[pipeline]add pipeline policy and bert forward (#4130) * add pipeline policy and bert forward to be done * add bertmodel pipeline forward and make tests * add Bert_Policy and test for policy * update formatting * update formatting * update the code * fix bugs * fix name confilt 2023-07-04 05:46:16 +00:00			`output = bert_model_forward(self=model,`
			`hidden_states=hidden_states,`
			`attention_mask=attention_mask,`
			`stage_manager=stage_manager)`
			`print(output[0].shape)`
			`assert output[0].shape == (2, 3, 768)`
			`print('end the training')`
			`print(output)`

			`# assert output[1].shape == (2, 768)`


			`def check_bert_model_policy():`
			`model = BertModel.from_pretrained('bert-base-uncased')`
			`DP_DIM, PP_DIM = 0, 1`
			`DP_SIZE, PP_SIZE = 2, 2`
			`RANK_TO_COORDINATE = {`
			`0: (0, 0),`
			`1: (0, 1),`
			`2: (1, 0),`
			`3: (1, 1),`
			`}`
			`PP_RANKS_IN_GROUP = {`
			`0: [0, 1],`
			`1: [0, 1],`
			`2: [2, 3],`
			`3: [2, 3],`
			`}`
			`pg_mesh = ProcessGroupMesh(DP_SIZE, PP_SIZE)`
[pipeline] build bloom model and policy , revise the base class of policy (#4161) * add pipeline policy and bert forward to be done * add bertmodel pipeline forward and make tests * add Bert_Policy and test for policy * update formatting * update formatting * update the code * fix bugs * fix name confilt * add bloom model and policy ,revise the base class of policy * revise * revision * add bert_for_pretraining 2023-07-05 02:52:53 +00:00			`# print(pg_mesh)`
[pipeline]add pipeline policy and bert forward (#4130) * add pipeline policy and bert forward to be done * add bertmodel pipeline forward and make tests * add Bert_Policy and test for policy * update formatting * update formatting * update the code * fix bugs * fix name confilt 2023-07-04 05:46:16 +00:00
			`stage_manager = PipelineStageManager(pg_mesh, PP_DIM)`
			`rank = dist.get_rank()`

[pipeline] add bert_for_pretraining bert_lmhead forward and policy (#4172) * add pipeline policy and bert forward to be done * add bertmodel pipeline forward and make tests * add Bert_Policy and test for policy * update formatting * update formatting * update the code * fix bugs * fix name confilt * add bloom model and policy ,revise the base class of policy * revise * revision * add bert_for_pretraining * add bert_for_pretraining forward and policy * fix typos * cancel warning * change the imediate output to default dict * change the default output of get_shared_params 2023-07-06 06:49:10 +00:00			`model_policy = BertModelPolicy(stage_manager, len(model.encoder.layer))`
[pipeline]add pipeline policy and bert forward (#4130) * add pipeline policy and bert forward to be done * add bertmodel pipeline forward and make tests * add Bert_Policy and test for policy * update formatting * update formatting * update the code * fix bugs * fix name confilt 2023-07-04 05:46:16 +00:00			`assert model_policy.layers_per_stage == [6, 6]`
			`layers = model_policy.get_hold_layers(model)`
			`for layer in layers:`
			`print(layer)`


			`def run_dist_model(rank, world_size, port):`
			`colossalai.launch(config={}, rank=rank, world_size=world_size, port=port, host='localhost')`
			`check_bert_model_forward()`


			`def run_dist_policy(rank, world_size, port):`
			`colossalai.launch(config={}, rank=rank, world_size=world_size, port=port, host='localhost')`
			`check_bert_model_policy()`


			`@pytest.mark.dist`
			`@rerun_if_address_is_in_use()`
			`def test_bert_model_forward():`
			`spawn(run_dist_model, 4)`


			`@pytest.mark.dist`
			`@rerun_if_address_is_in_use()`
			`def test_bert_model_policy():`
			`spawn(run_dist_policy, 4)`


			`if __name__ == "__main__":`
			`"""test the bert model forward and bert model policy"""`
			`test_bert_model_forward()`
			`test_bert_model_policy()`