ColossalAI/tests/test_checkpoint_io/test_gemini_torch_compabili...

import pytest
import torch
import torch.distributed as dist
from torch.optim import Adam
from utils import shared_tempdir

import colossalai
from colossalai.booster import Booster
from colossalai.booster.plugin import GeminiPlugin, TorchDDPPlugin
from colossalai.nn.optimizer import HybridAdam
from colossalai.testing import (
    check_state_dict_equal,
    clear_cache_before_run,
    parameterize,
    rerun_if_address_is_in_use,
    spawn,
)
from tests.kit.model_zoo import model_zoo


@clear_cache_before_run()
@parameterize('shard', [False, True])
@parameterize('model_name', ['transformers_gpt'])
def exam_torch_load_from_gemini(shard: bool, model_name: str):

    (model_fn, data_gen_fn, output_transform_fn, _, _) = next(iter(model_zoo.get_sub_registry(model_name).values()))
    criterion = lambda x: x.mean()
    plugin = GeminiPlugin(precision="fp16", initial_scale=(2**14))
    booster = Booster(plugin=plugin)

    model = model_fn()
    optimizer = HybridAdam(model.parameters(), lr=0.001)
    model, optimizer, criterion, _, _ = booster.boost(model, optimizer, criterion)

    data = data_gen_fn()
    data = {k: v.to('cuda') if torch.is_tensor(v) or 'Tensor' in v.__class__.__name__ else v for k, v in data.items()}
    output = model(**data)
    output = output_transform_fn(output)
    output_key = list(output.keys())[0]
    loss = criterion(output[output_key])

    booster.backward(loss, optimizer)
    optimizer.step()

    with shared_tempdir() as tempdir:
        model_ckpt_path = f"{tempdir}/model"
        optimizer_ckpt_path = f"{tempdir}/optimizer"

        booster.save_model(model, model_ckpt_path, shard=shard)
        booster.save_optimizer(optimizer, optimizer_ckpt_path, shard=shard)
        dist.barrier()

        new_model = model_fn()
        new_optimizer = Adam(new_model.parameters(), lr=0.001)
        new_plugin = TorchDDPPlugin()
        new_booster = Booster(plugin=new_plugin)
        new_model, new_optimizer, criterion, _, _ = new_booster.boost(new_model, new_optimizer, criterion)

        # Loading HybridAdam states to torch.Adam
        new_booster.load_model(new_model, model_ckpt_path, strict=True)

        # Add prefix to get aligned with pytorch parameter names.
        check_state_dict_equal(model.state_dict(only_rank_0=False, prefix='module.module.', dtype=torch.float32),
                               new_model.state_dict(), False)

        new_booster.load_optimizer(new_optimizer, optimizer_ckpt_path)
        check_state_dict_equal(optimizer.state_dict(only_rank_0=False), new_optimizer.state_dict(), False)

        # Check the new model/optimizer can successfully run.
        data = data_gen_fn()
        data = {
            k: v.to('cuda') if torch.is_tensor(v) or 'Tensor' in v.__class__.__name__ else v for k, v in data.items()
        }
        output = new_model(**data)
        output = output_transform_fn(output)
        output_key = list(output.keys())[0]
        loss = criterion(output[output_key])
        new_booster.backward(loss, new_optimizer)
        new_optimizer.step()
        new_booster.save_model(new_model, model_ckpt_path, shard=shard)
        new_booster.save_optimizer(new_optimizer, optimizer_ckpt_path, shard=shard)


@clear_cache_before_run()
@parameterize('shard', [False, True])
@parameterize('model_name', ['transformers_gpt'])
def exam_gemini_load_from_torch(shard: bool, model_name: str):

    (model_fn, data_gen_fn, output_transform_fn, _, _) = next(iter(model_zoo.get_sub_registry(model_name).values()))
    criterion = lambda x: x.mean()
    plugin = TorchDDPPlugin()
    booster = Booster(plugin=plugin)

    model = model_fn()
    optimizer = Adam(model.parameters(), lr=0.001)
    model, optimizer, criterion, _, _ = booster.boost(model, optimizer, criterion)

    data = data_gen_fn()
    data = {k: v.to('cuda') if torch.is_tensor(v) or 'Tensor' in v.__class__.__name__ else v for k, v in data.items()}
    output = model(**data)
    output = output_transform_fn(output)
    output_key = list(output.keys())[0]
    loss = criterion(output[output_key])

    booster.backward(loss, optimizer)
    optimizer.step()

    with shared_tempdir() as tempdir:
        model_ckpt_path = f"{tempdir}/model"
        optimizer_ckpt_path = f"{tempdir}/optimizer"

        booster.save_model(model, model_ckpt_path, shard=shard)
        booster.save_optimizer(optimizer, optimizer_ckpt_path, shard=shard)
        dist.barrier()

        new_model = model_fn()
        new_optimizer = HybridAdam(new_model.parameters(), lr=0.001)
        new_plugin = GeminiPlugin()
        new_booster = Booster(plugin=new_plugin)
        new_model, new_optimizer, criterion, _, _ = new_booster.boost(new_model, new_optimizer, criterion)

        # Loading torch.Adam states to HybridAdam
        new_booster.load_model(new_model, model_ckpt_path, strict=True)

        # Add prefix to get aligned with pytorch parameter names.
        check_state_dict_equal(new_model.state_dict(only_rank_0=False, prefix='module.module.', dtype=torch.float32),
                               model.state_dict(), False)

        new_booster.load_optimizer(new_optimizer, optimizer_ckpt_path)
        old_state_dict = optimizer.state_dict()
        new_state_dict = new_optimizer.state_dict(only_rank_0=False)

        # Comparison of param_groups needs special care here,
        # since not all hyperparameters in Adam are used by HybridAdam
        hyperparameters_to_examine = ['params', 'lr', 'betas', 'eps', 'weight_decay']
        for old_group, new_group in zip(old_state_dict['param_groups'], new_state_dict['param_groups']):
            for k in hyperparameters_to_examine:
                assert k in old_group and k in new_group, \
                    f"Old group's keys: {list(old_group.keys())}, New group's keys: {list(new_group.keys())}"
                assert old_group[k] == new_group[k]
        check_state_dict_equal(old_state_dict['state'], new_state_dict['state'], False)

        # Check the new model/optimizer can successfully run.
        data = data_gen_fn()
        data = {
            k: v.to('cuda') if torch.is_tensor(v) or 'Tensor' in v.__class__.__name__ else v for k, v in data.items()
        }
        output = new_model(**data)
        output = output_transform_fn(output)
        output_key = list(output.keys())[0]
        loss = criterion(output[output_key])
        new_booster.backward(loss, new_optimizer)
        new_optimizer.step()
        new_booster.save_model(new_model, model_ckpt_path, shard=shard)
        new_booster.save_optimizer(new_optimizer, optimizer_ckpt_path, shard=shard)


def run_dist(rank, world_size, port):
    config = {}
    colossalai.launch(config=config, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
    exam_torch_load_from_gemini()
    exam_gemini_load_from_torch()


@pytest.mark.dist
@pytest.mark.parametrize('world_size', [2])
@rerun_if_address_is_in_use()
def test_gemini_ckpIO(world_size):
    spawn(run_dist, world_size)
Next commit [checkpointio] Unsharded Optimizer Checkpoint for Gemini Plugin (#4141) * [checkpointio] unsharded optimizer checkpoint for Gemini plugin * [checkpointio] unsharded optimizer checkpoint for Gemini using all_gather 2023-07-07 08:33:06 +00:00			`import pytest`
			`import torch`
			`import torch.distributed as dist`
			`from torch.optim import Adam`
			`from utils import shared_tempdir`

			`import colossalai`
			`from colossalai.booster import Booster`
			`from colossalai.booster.plugin import GeminiPlugin, TorchDDPPlugin`
			`from colossalai.nn.optimizer import HybridAdam`
			`from colossalai.testing import (`
			`check_state_dict_equal,`
			`clear_cache_before_run,`
			`parameterize,`
			`rerun_if_address_is_in_use,`
			`spawn,`
			`)`
			`from tests.kit.model_zoo import model_zoo`


			`@clear_cache_before_run()`
[checkpointio] Sharded Optimizer Checkpoint for Gemini Plugin (#4302) * sharded optimizer checkpoint for gemini plugin * modify test to reduce testing time * update doc * fix bug when keep_gatherd is true under GeminiPlugin 2023-07-21 06:39:01 +00:00			`@parameterize('shard', [False, True])`
Next commit [checkpointio] Unsharded Optimizer Checkpoint for Gemini Plugin (#4141) * [checkpointio] unsharded optimizer checkpoint for Gemini plugin * [checkpointio] unsharded optimizer checkpoint for Gemini using all_gather 2023-07-07 08:33:06 +00:00			`@parameterize('model_name', ['transformers_gpt'])`
			`def exam_torch_load_from_gemini(shard: bool, model_name: str):`

			`(model_fn, data_gen_fn, output_transform_fn, _, _) = next(iter(model_zoo.get_sub_registry(model_name).values()))`
			`criterion = lambda x: x.mean()`
			`plugin = GeminiPlugin(precision="fp16", initial_scale=(2**14))`
			`booster = Booster(plugin=plugin)`

			`model = model_fn()`
			`optimizer = HybridAdam(model.parameters(), lr=0.001)`
			`model, optimizer, criterion, _, _ = booster.boost(model, optimizer, criterion)`

			`data = data_gen_fn()`
			`data = {k: v.to('cuda') if torch.is_tensor(v) or 'Tensor' in v.__class__.__name__ else v for k, v in data.items()}`
			`output = model(**data)`
			`output = output_transform_fn(output)`
			`output_key = list(output.keys())[0]`
			`loss = criterion(output[output_key])`

			`booster.backward(loss, optimizer)`
			`optimizer.step()`

			`with shared_tempdir() as tempdir:`
			`model_ckpt_path = f"{tempdir}/model"`
			`optimizer_ckpt_path = f"{tempdir}/optimizer"`

			`booster.save_model(model, model_ckpt_path, shard=shard)`
			`booster.save_optimizer(optimizer, optimizer_ckpt_path, shard=shard)`
			`dist.barrier()`

			`new_model = model_fn()`
			`new_optimizer = Adam(new_model.parameters(), lr=0.001)`
			`new_plugin = TorchDDPPlugin()`
			`new_booster = Booster(plugin=new_plugin)`
			`new_model, new_optimizer, criterion, _, _ = new_booster.boost(new_model, new_optimizer, criterion)`

			`# Loading HybridAdam states to torch.Adam`
			`new_booster.load_model(new_model, model_ckpt_path, strict=True)`

			`# Add prefix to get aligned with pytorch parameter names.`
[gemini] improve compatibility and add static placement policy (#4479) * [gemini] remove distributed-related part from colotensor (#4379) * [gemini] remove process group dependency * [gemini] remove tp part from colo tensor * [gemini] patch inplace op * [gemini] fix param op hook and update tests * [test] remove useless tests * [test] remove useless tests * [misc] fix requirements * [test] fix model zoo * [test] fix model zoo * [test] fix model zoo * [test] fix model zoo * [test] fix model zoo * [misc] update requirements * [gemini] refactor gemini optimizer and gemini ddp (#4398) * [gemini] update optimizer interface * [gemini] renaming gemini optimizer * [gemini] refactor gemini ddp class * [example] update gemini related example * [example] update gemini related example * [plugin] fix gemini plugin args * [test] update gemini ckpt tests * [gemini] fix checkpoint io * [example] fix opt example requirements * [example] fix opt example * [example] fix opt example * [example] fix opt example * [gemini] add static placement policy (#4443) * [gemini] add static placement policy * [gemini] fix param offload * [test] update gemini tests * [plugin] update gemini plugin * [plugin] update gemini plugin docstr * [misc] fix flash attn requirement * [test] fix gemini checkpoint io test * [example] update resnet example result (#4457) * [example] update bert example result (#4458) * [doc] update gemini doc (#4468) * [example] update gemini related examples (#4473) * [example] update gpt example * [example] update dreambooth example * [example] update vit * [example] update opt * [example] update palm * [example] update vit and opt benchmark * [hotfix] fix bert in model zoo (#4480) * [hotfix] fix bert in model zoo * [test] remove chatglm gemini test * [test] remove sam gemini test * [test] remove vit gemini test * [hotfix] fix opt tutorial example (#4497) * [hotfix] fix opt tutorial example * [hotfix] fix opt tutorial example 2023-08-24 01:29:25 +00:00			`check_state_dict_equal(model.state_dict(only_rank_0=False, prefix='module.module.', dtype=torch.float32),`
			`new_model.state_dict(), False)`
Next commit [checkpointio] Unsharded Optimizer Checkpoint for Gemini Plugin (#4141) * [checkpointio] unsharded optimizer checkpoint for Gemini plugin * [checkpointio] unsharded optimizer checkpoint for Gemini using all_gather 2023-07-07 08:33:06 +00:00
			`new_booster.load_optimizer(new_optimizer, optimizer_ckpt_path)`
[gemini] improve compatibility and add static placement policy (#4479) * [gemini] remove distributed-related part from colotensor (#4379) * [gemini] remove process group dependency * [gemini] remove tp part from colo tensor * [gemini] patch inplace op * [gemini] fix param op hook and update tests * [test] remove useless tests * [test] remove useless tests * [misc] fix requirements * [test] fix model zoo * [test] fix model zoo * [test] fix model zoo * [test] fix model zoo * [test] fix model zoo * [misc] update requirements * [gemini] refactor gemini optimizer and gemini ddp (#4398) * [gemini] update optimizer interface * [gemini] renaming gemini optimizer * [gemini] refactor gemini ddp class * [example] update gemini related example * [example] update gemini related example * [plugin] fix gemini plugin args * [test] update gemini ckpt tests * [gemini] fix checkpoint io * [example] fix opt example requirements * [example] fix opt example * [example] fix opt example * [example] fix opt example * [gemini] add static placement policy (#4443) * [gemini] add static placement policy * [gemini] fix param offload * [test] update gemini tests * [plugin] update gemini plugin * [plugin] update gemini plugin docstr * [misc] fix flash attn requirement * [test] fix gemini checkpoint io test * [example] update resnet example result (#4457) * [example] update bert example result (#4458) * [doc] update gemini doc (#4468) * [example] update gemini related examples (#4473) * [example] update gpt example * [example] update dreambooth example * [example] update vit * [example] update opt * [example] update palm * [example] update vit and opt benchmark * [hotfix] fix bert in model zoo (#4480) * [hotfix] fix bert in model zoo * [test] remove chatglm gemini test * [test] remove sam gemini test * [test] remove vit gemini test * [hotfix] fix opt tutorial example (#4497) * [hotfix] fix opt tutorial example * [hotfix] fix opt tutorial example 2023-08-24 01:29:25 +00:00			`check_state_dict_equal(optimizer.state_dict(only_rank_0=False), new_optimizer.state_dict(), False)`
Next commit [checkpointio] Unsharded Optimizer Checkpoint for Gemini Plugin (#4141) * [checkpointio] unsharded optimizer checkpoint for Gemini plugin * [checkpointio] unsharded optimizer checkpoint for Gemini using all_gather 2023-07-07 08:33:06 +00:00
			`# Check the new model/optimizer can successfully run.`
			`data = data_gen_fn()`
			`data = {`
			`k: v.to('cuda') if torch.is_tensor(v) or 'Tensor' in v.__class__.__name__ else v for k, v in data.items()`
			`}`
			`output = new_model(**data)`
			`output = output_transform_fn(output)`
			`output_key = list(output.keys())[0]`
			`loss = criterion(output[output_key])`
			`new_booster.backward(loss, new_optimizer)`
			`new_optimizer.step()`
			`new_booster.save_model(new_model, model_ckpt_path, shard=shard)`
			`new_booster.save_optimizer(new_optimizer, optimizer_ckpt_path, shard=shard)`


			`@clear_cache_before_run()`
[checkpointio] Sharded Optimizer Checkpoint for Gemini Plugin (#4302) * sharded optimizer checkpoint for gemini plugin * modify test to reduce testing time * update doc * fix bug when keep_gatherd is true under GeminiPlugin 2023-07-21 06:39:01 +00:00			`@parameterize('shard', [False, True])`
Next commit [checkpointio] Unsharded Optimizer Checkpoint for Gemini Plugin (#4141) * [checkpointio] unsharded optimizer checkpoint for Gemini plugin * [checkpointio] unsharded optimizer checkpoint for Gemini using all_gather 2023-07-07 08:33:06 +00:00			`@parameterize('model_name', ['transformers_gpt'])`
			`def exam_gemini_load_from_torch(shard: bool, model_name: str):`

			`(model_fn, data_gen_fn, output_transform_fn, _, _) = next(iter(model_zoo.get_sub_registry(model_name).values()))`
			`criterion = lambda x: x.mean()`
			`plugin = TorchDDPPlugin()`
			`booster = Booster(plugin=plugin)`

			`model = model_fn()`
			`optimizer = Adam(model.parameters(), lr=0.001)`
			`model, optimizer, criterion, _, _ = booster.boost(model, optimizer, criterion)`

			`data = data_gen_fn()`
			`data = {k: v.to('cuda') if torch.is_tensor(v) or 'Tensor' in v.__class__.__name__ else v for k, v in data.items()}`
			`output = model(**data)`
			`output = output_transform_fn(output)`
			`output_key = list(output.keys())[0]`
			`loss = criterion(output[output_key])`

			`booster.backward(loss, optimizer)`
			`optimizer.step()`

			`with shared_tempdir() as tempdir:`
			`model_ckpt_path = f"{tempdir}/model"`
			`optimizer_ckpt_path = f"{tempdir}/optimizer"`

			`booster.save_model(model, model_ckpt_path, shard=shard)`
			`booster.save_optimizer(optimizer, optimizer_ckpt_path, shard=shard)`
			`dist.barrier()`

			`new_model = model_fn()`
			`new_optimizer = HybridAdam(new_model.parameters(), lr=0.001)`
			`new_plugin = GeminiPlugin()`
			`new_booster = Booster(plugin=new_plugin)`
			`new_model, new_optimizer, criterion, _, _ = new_booster.boost(new_model, new_optimizer, criterion)`

			`# Loading torch.Adam states to HybridAdam`
			`new_booster.load_model(new_model, model_ckpt_path, strict=True)`

			`# Add prefix to get aligned with pytorch parameter names.`
[gemini] improve compatibility and add static placement policy (#4479) * [gemini] remove distributed-related part from colotensor (#4379) * [gemini] remove process group dependency * [gemini] remove tp part from colo tensor * [gemini] patch inplace op * [gemini] fix param op hook and update tests * [test] remove useless tests * [test] remove useless tests * [misc] fix requirements * [test] fix model zoo * [test] fix model zoo * [test] fix model zoo * [test] fix model zoo * [test] fix model zoo * [misc] update requirements * [gemini] refactor gemini optimizer and gemini ddp (#4398) * [gemini] update optimizer interface * [gemini] renaming gemini optimizer * [gemini] refactor gemini ddp class * [example] update gemini related example * [example] update gemini related example * [plugin] fix gemini plugin args * [test] update gemini ckpt tests * [gemini] fix checkpoint io * [example] fix opt example requirements * [example] fix opt example * [example] fix opt example * [example] fix opt example * [gemini] add static placement policy (#4443) * [gemini] add static placement policy * [gemini] fix param offload * [test] update gemini tests * [plugin] update gemini plugin * [plugin] update gemini plugin docstr * [misc] fix flash attn requirement * [test] fix gemini checkpoint io test * [example] update resnet example result (#4457) * [example] update bert example result (#4458) * [doc] update gemini doc (#4468) * [example] update gemini related examples (#4473) * [example] update gpt example * [example] update dreambooth example * [example] update vit * [example] update opt * [example] update palm * [example] update vit and opt benchmark * [hotfix] fix bert in model zoo (#4480) * [hotfix] fix bert in model zoo * [test] remove chatglm gemini test * [test] remove sam gemini test * [test] remove vit gemini test * [hotfix] fix opt tutorial example (#4497) * [hotfix] fix opt tutorial example * [hotfix] fix opt tutorial example 2023-08-24 01:29:25 +00:00			`check_state_dict_equal(new_model.state_dict(only_rank_0=False, prefix='module.module.', dtype=torch.float32),`
			`model.state_dict(), False)`
Next commit [checkpointio] Unsharded Optimizer Checkpoint for Gemini Plugin (#4141) * [checkpointio] unsharded optimizer checkpoint for Gemini plugin * [checkpointio] unsharded optimizer checkpoint for Gemini using all_gather 2023-07-07 08:33:06 +00:00
			`new_booster.load_optimizer(new_optimizer, optimizer_ckpt_path)`
			`old_state_dict = optimizer.state_dict()`
[gemini] improve compatibility and add static placement policy (#4479) * [gemini] remove distributed-related part from colotensor (#4379) * [gemini] remove process group dependency * [gemini] remove tp part from colo tensor * [gemini] patch inplace op * [gemini] fix param op hook and update tests * [test] remove useless tests * [test] remove useless tests * [misc] fix requirements * [test] fix model zoo * [test] fix model zoo * [test] fix model zoo * [test] fix model zoo * [test] fix model zoo * [misc] update requirements * [gemini] refactor gemini optimizer and gemini ddp (#4398) * [gemini] update optimizer interface * [gemini] renaming gemini optimizer * [gemini] refactor gemini ddp class * [example] update gemini related example * [example] update gemini related example * [plugin] fix gemini plugin args * [test] update gemini ckpt tests * [gemini] fix checkpoint io * [example] fix opt example requirements * [example] fix opt example * [example] fix opt example * [example] fix opt example * [gemini] add static placement policy (#4443) * [gemini] add static placement policy * [gemini] fix param offload * [test] update gemini tests * [plugin] update gemini plugin * [plugin] update gemini plugin docstr * [misc] fix flash attn requirement * [test] fix gemini checkpoint io test * [example] update resnet example result (#4457) * [example] update bert example result (#4458) * [doc] update gemini doc (#4468) * [example] update gemini related examples (#4473) * [example] update gpt example * [example] update dreambooth example * [example] update vit * [example] update opt * [example] update palm * [example] update vit and opt benchmark * [hotfix] fix bert in model zoo (#4480) * [hotfix] fix bert in model zoo * [test] remove chatglm gemini test * [test] remove sam gemini test * [test] remove vit gemini test * [hotfix] fix opt tutorial example (#4497) * [hotfix] fix opt tutorial example * [hotfix] fix opt tutorial example 2023-08-24 01:29:25 +00:00			`new_state_dict = new_optimizer.state_dict(only_rank_0=False)`
Next commit [checkpointio] Unsharded Optimizer Checkpoint for Gemini Plugin (#4141) * [checkpointio] unsharded optimizer checkpoint for Gemini plugin * [checkpointio] unsharded optimizer checkpoint for Gemini using all_gather 2023-07-07 08:33:06 +00:00
			`# Comparison of param_groups needs special care here,`
			`# since not all hyperparameters in Adam are used by HybridAdam`
			`hyperparameters_to_examine = ['params', 'lr', 'betas', 'eps', 'weight_decay']`
			`for old_group, new_group in zip(old_state_dict['param_groups'], new_state_dict['param_groups']):`
			`for k in hyperparameters_to_examine:`
			`assert k in old_group and k in new_group, \`
[gemini] improve compatibility and add static placement policy (#4479) * [gemini] remove distributed-related part from colotensor (#4379) * [gemini] remove process group dependency * [gemini] remove tp part from colo tensor * [gemini] patch inplace op * [gemini] fix param op hook and update tests * [test] remove useless tests * [test] remove useless tests * [misc] fix requirements * [test] fix model zoo * [test] fix model zoo * [test] fix model zoo * [test] fix model zoo * [test] fix model zoo * [misc] update requirements * [gemini] refactor gemini optimizer and gemini ddp (#4398) * [gemini] update optimizer interface * [gemini] renaming gemini optimizer * [gemini] refactor gemini ddp class * [example] update gemini related example * [example] update gemini related example * [plugin] fix gemini plugin args * [test] update gemini ckpt tests * [gemini] fix checkpoint io * [example] fix opt example requirements * [example] fix opt example * [example] fix opt example * [example] fix opt example * [gemini] add static placement policy (#4443) * [gemini] add static placement policy * [gemini] fix param offload * [test] update gemini tests * [plugin] update gemini plugin * [plugin] update gemini plugin docstr * [misc] fix flash attn requirement * [test] fix gemini checkpoint io test * [example] update resnet example result (#4457) * [example] update bert example result (#4458) * [doc] update gemini doc (#4468) * [example] update gemini related examples (#4473) * [example] update gpt example * [example] update dreambooth example * [example] update vit * [example] update opt * [example] update palm * [example] update vit and opt benchmark * [hotfix] fix bert in model zoo (#4480) * [hotfix] fix bert in model zoo * [test] remove chatglm gemini test * [test] remove sam gemini test * [test] remove vit gemini test * [hotfix] fix opt tutorial example (#4497) * [hotfix] fix opt tutorial example * [hotfix] fix opt tutorial example 2023-08-24 01:29:25 +00:00			`f"Old group's keys: {list(old_group.keys())}, New group's keys: {list(new_group.keys())}"`
Next commit [checkpointio] Unsharded Optimizer Checkpoint for Gemini Plugin (#4141) * [checkpointio] unsharded optimizer checkpoint for Gemini plugin * [checkpointio] unsharded optimizer checkpoint for Gemini using all_gather 2023-07-07 08:33:06 +00:00			`assert old_group[k] == new_group[k]`
			`check_state_dict_equal(old_state_dict['state'], new_state_dict['state'], False)`

			`# Check the new model/optimizer can successfully run.`
			`data = data_gen_fn()`
			`data = {`
			`k: v.to('cuda') if torch.is_tensor(v) or 'Tensor' in v.__class__.__name__ else v for k, v in data.items()`
			`}`
			`output = new_model(**data)`
			`output = output_transform_fn(output)`
			`output_key = list(output.keys())[0]`
			`loss = criterion(output[output_key])`
			`new_booster.backward(loss, new_optimizer)`
			`new_optimizer.step()`
			`new_booster.save_model(new_model, model_ckpt_path, shard=shard)`
			`new_booster.save_optimizer(new_optimizer, optimizer_ckpt_path, shard=shard)`


			`def run_dist(rank, world_size, port):`
			`config = {}`
			`colossalai.launch(config=config, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')`
			`exam_torch_load_from_gemini()`
			`exam_gemini_load_from_torch()`


			`@pytest.mark.dist`
[checkpointio] Sharded Optimizer Checkpoint for Gemini Plugin (#4302) * sharded optimizer checkpoint for gemini plugin * modify test to reduce testing time * update doc * fix bug when keep_gatherd is true under GeminiPlugin 2023-07-21 06:39:01 +00:00			`@pytest.mark.parametrize('world_size', [2])`
Next commit [checkpointio] Unsharded Optimizer Checkpoint for Gemini Plugin (#4141) * [checkpointio] unsharded optimizer checkpoint for Gemini plugin * [checkpointio] unsharded optimizer checkpoint for Gemini using all_gather 2023-07-07 08:33:06 +00:00			`@rerun_if_address_is_in_use()`
			`def test_gemini_ckpIO(world_size):`
			`spawn(run_dist, world_size)`