ColossalAI/applications/Chat/tests/test_checkpoint.py

import os
import tempfile
from contextlib import nullcontext

import pytest
import torch
import torch.distributed as dist
from coati.models.gpt import GPTActor
from coati.models.utils import calc_action_log_probs
from coati.trainer.strategies import ColossalAIStrategy, DDPStrategy
from transformers.models.gpt2.configuration_gpt2 import GPT2Config

from colossalai.nn.optimizer import HybridAdam
from colossalai.testing import rerun_if_address_is_in_use, spawn

# GPT-2 configuration shared by every strategy under test: 128-dim embeddings,
# 4 layers and 4 attention heads (presumably kept small so the test is cheap).
GPT_CONFIG = GPT2Config(n_embd=128, n_layer=4, n_head=4)


def get_data(batch_size: int, seq_len: int = 10, vocab_size: int = 50257, device='cuda') -> dict:
    """Build a random batch of token ids with an all-ones attention mask.

    Args:
        batch_size: Number of sequences in the batch.
        seq_len: Number of tokens per sequence.
        vocab_size: Exclusive upper bound for the sampled token ids. The
            default (50257) is the value this helper previously hard-coded.
        device: Device on which the tensors are created (anything accepted
            by ``torch.randint``'s ``device`` argument). Defaults to
            ``'cuda'``, the previous hard-coded value.

    Returns:
        A dict with ``input_ids`` (random integers in ``[0, vocab_size)``)
        and ``attention_mask`` (all ones), both of shape
        ``(batch_size, seq_len)``.
    """
    input_ids = torch.randint(0, vocab_size, (batch_size, seq_len), device=device)
    # Every position is attended to: the mask mirrors input_ids' shape, dtype and device.
    attention_mask = torch.ones_like(input_ids)
    return dict(input_ids=input_ids, attention_mask=attention_mask)


def run_test_checkpoint(strategy):
    """Smoke-test saving and reloading actor/optimizer checkpoints.

    The sequence is: build the strategy, prepare a GPT actor and its
    optimizer, run one training step, save both to a temporary directory
    created by rank 0, load them back, and run one more training step.
    Nothing is asserted; the test passes if no step raises.

    Args:
        strategy: Name of the strategy to exercise. One of ``'ddp'``,
            ``'colossalai_gemini'`` or ``'colossalai_zero2'``.

    Raises:
        ValueError: If ``strategy`` is not one of the supported names.
    """
    batch_size = 2

    def build_strategy(name):
        # Translate the parametrized name into a configured strategy object.
        if name == 'ddp':
            return DDPStrategy()
        if name == 'colossalai_gemini':
            return ColossalAIStrategy(stage=3, placement_policy='cuda', initial_scale=2**5)
        if name == 'colossalai_zero2':
            return ColossalAIStrategy(stage=2, placement_policy='cuda')
        raise ValueError(f'Unsupported strategy "{name}"')

    strat = build_strategy(strategy)

    # The model is constructed inside the strategy's init context, then
    # model and optimizer are handed to the strategy together.
    with strat.model_init_context():
        actor = GPTActor(config=GPT_CONFIG).cuda()
    actor_optim = HybridAdam(actor.parameters())
    actor, actor_optim = strat.prepare((actor, actor_optim))

    def train_one_step():
        # One forward/backward/optimizer step on a fresh random batch.
        batch = get_data(batch_size)
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        action_mask = torch.ones_like(attention_mask, dtype=torch.bool)
        output = actor(input_ids, attention_mask)
        # The third argument is the action-mask length, i.e. the full sequence here.
        log_probs = calc_action_log_probs(output, input_ids, action_mask.size(1))
        strat.backward(log_probs.sum(), actor, actor_optim)
        strat.optimizer_step(actor_optim)

    # Step once before saving (presumably so the optimizer has state to checkpoint).
    train_one_step()

    # Only rank 0 creates (and later deletes) the temporary directory; the
    # other ranks enter a no-op context that yields None.
    if dist.get_rank() == 0:
        tmp_ctx = tempfile.TemporaryDirectory()
    else:
        tmp_ctx = nullcontext()

    with tmp_ctx as local_dir:
        # Share rank 0's directory name with every rank.
        shared = [local_dir]
        dist.broadcast_object_list(shared)
        ckpt_dir = shared[0]

        model_path = os.path.join(ckpt_dir, 'model.pt')
        optim_path = os.path.join(ckpt_dir, 'optim.pt')

        strat.save_model(actor, model_path, only_rank0=True)
        strat.save_optimizer(actor_optim, optim_path, only_rank0=True)

        # FIXME(cwher): Sharded optimizer checkpoint is not supported yet.
        #  at "ColossalAI/colossalai/checkpoint_io/general_checkpoint_io.py", line 62
        # optim_path = os.path.join(rank0_dirname, f'optim-r{dist.get_rank()}.pt')
        # strategy.save_optimizer(actor_optim, optim_path, only_rank0=False)

        # All ranks wait here so that no rank loads before the save calls have returned.
        dist.barrier()

        strat.load_model(actor, model_path, strict=False)
        strat.load_optimizer(actor_optim, optim_path)

        # Keep rank 0 inside the `with` block (which removes the directory
        # on exit) until every rank has finished loading.
        dist.barrier()

    # Training must still work with the reloaded model and optimizer state.
    train_one_step()


def run_dist(rank, world_size, port, strategy):
    """Per-process entry point: export the rendezvous variables, then run the test.

    Args:
        rank: Rank of this process; also exported as ``LOCAL_RANK``.
        world_size: Total number of processes taking part.
        port: Port exported as ``MASTER_PORT`` (the master address is
            always ``localhost``).
        strategy: Strategy name forwarded to ``run_test_checkpoint``.
    """
    os.environ.update({
        'RANK': str(rank),
        'LOCAL_RANK': str(rank),
        'WORLD_SIZE': str(world_size),
        'MASTER_ADDR': 'localhost',
        'MASTER_PORT': str(port),
    })
    run_test_checkpoint(strategy)


@pytest.mark.dist
@pytest.mark.parametrize('world_size', [2])
@pytest.mark.parametrize('strategy', ['ddp', 'colossalai_zero2', 'colossalai_gemini'])
@rerun_if_address_is_in_use()
def test_checkpoint(world_size, strategy):
    """Run the checkpoint save/load smoke test for one strategy.

    Parametrized over the three supported strategy names with a world size
    of 2. The work itself happens in ``run_dist``, launched through
    ``spawn``.

    Args:
        world_size: Number of processes passed to ``spawn``.
        strategy: Strategy name forwarded to ``run_dist`` as a keyword.
    """
    # NOTE(review): spawn presumably supplies rank, world_size and port to
    # run_dist for each process -- confirm against colossalai.testing.spawn.
    spawn(run_dist, world_size, strategy=strategy)


if __name__ == '__main__':
    # Direct invocation without pytest: run only the ZeRO stage-2 case on two processes.
    test_checkpoint(2, 'colossalai_zero2')
[Coati] first commit (#3283) 2023-03-28 12:25:36 +00:00			`import os`
			`import tempfile`
			`from contextlib import nullcontext`

			`import pytest`
			`import torch`
			`import torch.distributed as dist`
			`from coati.models.gpt import GPTActor`
[chat] refactor actor class (#3968) * refactor: separate log_probs fn from Actor forward fn * refactor: separate generate fn from Actor class * feat: update unwrap_model and get_base_model * unwrap_model returns model not wrapped by Strategy * get_base_model returns HF model for Actor, Critic and RewardModel * feat: simplify Strategy.prepare * style: remove get_base_model method of Actor * perf: tokenize text in batches * refactor: move calc_action_log_probs to utils of model * test: update test with new forward fn * style: rename forward fn args * fix: do not unwrap model in save_model fn of naive strategy * test: add gemini test for train_prompts * fix: fix _set_default_generate_kwargs 2023-06-13 05:31:56 +00:00			`from coati.models.utils import calc_action_log_probs`
[Coati] first commit (#3283) 2023-03-28 12:25:36 +00:00			`from coati.trainer.strategies import ColossalAIStrategy, DDPStrategy`
			`from transformers.models.gpt2.configuration_gpt2 import GPT2Config`

			`from colossalai.nn.optimizer import HybridAdam`
[test] refactor tests with spawn (#3452) * [test] added spawn decorator * polish code * polish code * polish code * polish code * polish code * polish code 2023-04-06 06:51:35 +00:00			`from colossalai.testing import rerun_if_address_is_in_use, spawn`
[Coati] first commit (#3283) 2023-03-28 12:25:36 +00:00
			`GPT_CONFIG = GPT2Config(n_embd=128, n_layer=4, n_head=4)`


			`def get_data(batch_size: int, seq_len: int = 10) -> dict:`
			`input_ids = torch.randint(0, 50257, (batch_size, seq_len), device='cuda')`
			`attention_mask = torch.ones_like(input_ids)`
			`return dict(input_ids=input_ids, attention_mask=attention_mask)`


			`def run_test_checkpoint(strategy):`
			`BATCH_SIZE = 2`

			`if strategy == 'ddp':`
			`strategy = DDPStrategy()`
			`elif strategy == 'colossalai_gemini':`
			`strategy = ColossalAIStrategy(stage=3, placement_policy='cuda', initial_scale=2**5)`
			`elif strategy == 'colossalai_zero2':`
			`strategy = ColossalAIStrategy(stage=2, placement_policy='cuda')`
			`else:`
			`raise ValueError(f'Unsupported strategy "{strategy}"')`

			`with strategy.model_init_context():`
			`actor = GPTActor(config=GPT_CONFIG).cuda()`

			`actor_optim = HybridAdam(actor.parameters())`

			`actor, actor_optim = strategy.prepare((actor, actor_optim))`

			`def run_step():`
			`data = get_data(BATCH_SIZE)`
			`action_mask = torch.ones_like(data['attention_mask'], dtype=torch.bool)`
[chat] refactor actor class (#3968) * refactor: separate log_probs fn from Actor forward fn * refactor: separate generate fn from Actor class * feat: update unwrap_model and get_base_model * unwrap_model returns model not wrapped by Strategy * get_base_model returns HF model for Actor, Critic and RewardModel * feat: simplify Strategy.prepare * style: remove get_base_model method of Actor * perf: tokenize text in batches * refactor: move calc_action_log_probs to utils of model * test: update test with new forward fn * style: rename forward fn args * fix: do not unwrap model in save_model fn of naive strategy * test: add gemini test for train_prompts * fix: fix _set_default_generate_kwargs 2023-06-13 05:31:56 +00:00			`actor_output = actor(data['input_ids'], data['attention_mask'])`
			`action_log_probs = calc_action_log_probs(actor_output, data['input_ids'], action_mask.size(1))`
[Coati] first commit (#3283) 2023-03-28 12:25:36 +00:00			`loss = action_log_probs.sum()`
			`strategy.backward(loss, actor, actor_optim)`
			`strategy.optimizer_step(actor_optim)`

			`run_step()`

			`ctx = tempfile.TemporaryDirectory() if dist.get_rank() == 0 else nullcontext()`

			`with ctx as dirname:`
			`rank0_dirname = [dirname]`
			`dist.broadcast_object_list(rank0_dirname)`
			`rank0_dirname = rank0_dirname[0]`

			`model_path = os.path.join(rank0_dirname, 'model.pt')`
			`strategy.save_model(actor, model_path, only_rank0=True)`
[chat] refactor strategy class with booster api (#3987) * refactor: adapt boost API in base and naive strategies * fix: initialize plugin after setup_distributed * fix: fix save_pretrained fn * refactor: adapt boost API in DDPStrategy * to: add _post_init check * to: fix ddp backward, modify ddp dataloader and unwrap * feat: adapt boost API in ColossalAIStrategy * fix: call setup_distributed before use get_current_device * fix: fix save_model and save_optimizer * test: remove save_sharded_optimizer test * style: apply formatter * fix: fix stage check and add comments * feat: allow dict type arg in strategy.prepare * to: temporarily remove lr_scheduler for testing * style: simplify init of ColossalAIStrategy * fix: fix lr_scheduler in sft and rm * style: modify comments * test: add train_prompts tests * fix: fix inference only case and use in train_prompts * test: skip failed tests in ci * style: fix CodeFactor check * fix: do not use model.to('cpu') with GeminiPlugin * test: enable colossalai_gemini tests * test: set CUDA_VISIBLE_DEVICES in ci * docs: add note 2023-06-25 09:36:21 +00:00
			`optim_path = os.path.join(rank0_dirname, f'optim.pt')`
			`strategy.save_optimizer(actor_optim, optim_path, only_rank0=True)`

			`# FIXME(cwher): Sharded optimizer checkpoint is not supported yet.`
			`# at "ColossalAI/colossalai/checkpoint_io/general_checkpoint_io.py", line 62`
			`# optim_path = os.path.join(rank0_dirname, f'optim-r{dist.get_rank()}.pt')`
			`# strategy.save_optimizer(actor_optim, optim_path, only_rank0=False)`
[Coati] first commit (#3283) 2023-03-28 12:25:36 +00:00
			`dist.barrier()`

			`strategy.load_model(actor, model_path, strict=False)`
			`strategy.load_optimizer(actor_optim, optim_path)`

			`dist.barrier()`

			`run_step()`


			`def run_dist(rank, world_size, port, strategy):`
			`os.environ['RANK'] = str(rank)`
			`os.environ['LOCAL_RANK'] = str(rank)`
			`os.environ['WORLD_SIZE'] = str(world_size)`
			`os.environ['MASTER_ADDR'] = 'localhost'`
			`os.environ['MASTER_PORT'] = str(port)`
			`run_test_checkpoint(strategy)`


			`@pytest.mark.dist`
			`@pytest.mark.parametrize('world_size', [2])`
			`@pytest.mark.parametrize('strategy', ['ddp', 'colossalai_zero2', 'colossalai_gemini'])`
			`@rerun_if_address_is_in_use()`
			`def test_checkpoint(world_size, strategy):`
[test] refactor tests with spawn (#3452) * [test] added spawn decorator * polish code * polish code * polish code * polish code * polish code * polish code 2023-04-06 06:51:35 +00:00			`spawn(run_dist, world_size, strategy=strategy)`
[Coati] first commit (#3283) 2023-03-28 12:25:36 +00:00

			`if __name__ == '__main__':`
			`test_checkpoint(2, 'colossalai_zero2')`