mirror of https://github.com/hpcaitech/ColossalAI
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
86 lines
2.3 KiB
86 lines
2.3 KiB
2 years ago
|
from typing import Callable, List, Tuple, Union
|
||
|
|
||
|
import torch
|
||
|
import torch.distributed as dist
|
||
|
import torch.nn as nn
|
||
|
from torch.optim import Optimizer
|
||
|
from torch.optim.lr_scheduler import _LRScheduler as LRScheduler
|
||
|
from torch.utils.data import DataLoader, TensorDataset
|
||
|
|
||
|
import colossalai
|
||
|
from colossalai.booster.plugin.dp_plugin_base import DPPluginBase
|
||
|
from colossalai.checkpoint_io import CheckpointIO
|
||
|
from colossalai.interface import OptimizerWrapper
|
||
|
from colossalai.testing import rerun_if_address_is_in_use, spawn
|
||
|
|
||
|
|
||
|
class DPPluginWrapper(DPPluginBase):
|
||
|
"""This is a wrapper class for testing DP plugin initialization and dataloader creation.
|
||
|
"""
|
||
|
|
||
|
def configure(
|
||
|
self,
|
||
|
model: nn.Module,
|
||
|
optimizer: Optimizer,
|
||
|
criterion: Callable = None,
|
||
|
dataloader: DataLoader = None,
|
||
|
lr_scheduler: LRScheduler = None,
|
||
|
) -> Tuple[Union[nn.Module, OptimizerWrapper, LRScheduler, DataLoader]]:
|
||
|
pass
|
||
|
|
||
|
def control_checkpoint_io(self) -> bool:
|
||
|
pass
|
||
|
|
||
|
def control_device(self) -> bool:
|
||
|
pass
|
||
|
|
||
|
def control_precision(self) -> bool:
|
||
|
pass
|
||
|
|
||
|
def get_checkpoint_io(self) -> CheckpointIO:
|
||
|
pass
|
||
|
|
||
|
def support_no_sync(self) -> bool:
|
||
|
pass
|
||
|
|
||
|
def supported_devices(self) -> List[str]:
|
||
|
pass
|
||
|
|
||
|
def supported_precisions(self) -> List[str]:
|
||
|
pass
|
||
|
|
||
|
|
||
|
def check_dataloader_sharding():
|
||
|
plugin = DPPluginWrapper()
|
||
|
|
||
|
# create a custom dasetset with 0 to 10
|
||
|
dataset = TensorDataset(torch.arange(0, 10))
|
||
|
train_dataloader = plugin.prepare_train_dataloader(dataset, batch_size=2)
|
||
|
|
||
|
# get the first batch of data
|
||
|
batch = next(iter(train_dataloader))[0].cuda()
|
||
|
is_rank_0 = dist.get_rank() == 0
|
||
|
|
||
|
if is_rank_0:
|
||
|
batch_to_compare = batch.clone()
|
||
|
else:
|
||
|
batch_to_compare = batch
|
||
|
# pass to the rank 1 value to rank 0
|
||
|
dist.broadcast(batch_to_compare, src=1)
|
||
|
|
||
|
# compare on rank 0
|
||
|
if is_rank_0:
|
||
|
assert not torch.equal(batch,
|
||
|
batch_to_compare), 'Same number was found across ranks but expected it to be different'
|
||
|
|
||
|
|
||
|
def run_dist(rank, world_size, port):
|
||
|
# init dist env
|
||
|
colossalai.launch(config=dict(), rank=rank, world_size=world_size, port=port, host='localhost')
|
||
|
check_dataloader_sharding()
|
||
|
|
||
|
|
||
|
@rerun_if_address_is_in_use()
|
||
|
def test_dp_plugin_dataloader():
|
||
|
spawn(run_dist, 2)
|