ColossalAI/colossalai/legacy/utils/checkpointing.py

283 lines
11 KiB
Python

from collections import OrderedDict
from itertools import chain
import torch
import torch.distributed as dist
from colossalai.legacy.constants import IS_TENSOR_PARALLEL
from colossalai.legacy.context.parallel_mode import ParallelMode
from colossalai.legacy.core import global_context as gpc
try:
from torch.nn.modules.module import _EXTRA_STATE_KEY_SUFFIX
except ImportError:
_EXTRA_STATE_KEY_SUFFIX = "_extra_state"
from .common import is_using_pp
__all__ = ["save_checkpoint", "load_checkpoint"]
def broadcast_state_dict(state_dict, parallel_mode):
state_dict = [state_dict.copy() if isinstance(state_dict, dict) else state_dict]
src_rank = gpc.get_ranks_in_group(parallel_mode)[0]
dist.broadcast_object_list(state_dict, src=src_rank, group=gpc.get_cpu_group(parallel_mode))
return state_dict[0]
def partition_tensor_parallel_state_dict(
state_dict: OrderedDict, parallel_mode: ParallelMode, dims: dict = dict(), partition_states: dict = dict()
):
src_rank = gpc.get_ranks_in_group(parallel_mode)[0]
depth = gpc.get_world_size(parallel_mode)
group = gpc.get_cpu_group(parallel_mode)
is_rank0 = gpc.get_local_rank(parallel_mode) == 0
partition_info = [None]
if is_rank0:
partition_info_dict = OrderedDict()
for key, param in state_dict.items():
dim = dims[key]
is_partitioned = partition_states[key]
shape = list(param.shape)
if is_partitioned:
shape[dim] = shape[dim] // depth
partition_info_dict[key] = (is_partitioned, param.dtype, shape, dim)
partition_info[0] = partition_info_dict
dist.broadcast_object_list(partition_info, src_rank, group=group)
partitioned_state = OrderedDict()
for key, (is_partitioned, dtype, shape, dim) in partition_info[0].items():
if is_partitioned:
output = torch.empty(shape, dtype=dtype)
if is_rank0:
scatter_list = [t.contiguous() for t in state_dict[key].chunk(depth, dim)]
else:
scatter_list = None
dist.scatter(output, scatter_list, src_rank, group=group)
else:
if is_rank0:
output = state_dict[key]
else:
output = torch.empty(shape, dtype=dtype)
dist.broadcast(output, src_rank, group=group)
partitioned_state[key] = output
return partitioned_state
def gather_tensor_parallel_state_dict(
state_dict: OrderedDict,
parallel_mode: ParallelMode,
dims: dict = dict(),
partition_states: dict = dict(),
keep_vars: bool = False,
):
dst_rank = gpc.get_ranks_in_group(parallel_mode)[0]
depth = gpc.get_world_size(parallel_mode)
for key in list(state_dict.keys()):
param = state_dict.pop(key)
param = param if keep_vars else param.detach()
dim = dims.get(key, 0)
do_partition = partition_states.get(key, True)
if do_partition:
temp = param.transpose(0, dim).contiguous()
gather_list = None
if gpc.get_local_rank(parallel_mode) == 0:
shape = list(param.shape)
shape[0], shape[dim] = shape[dim], shape[0]
shape[0] *= depth
param = torch.empty(shape, dtype=param.dtype, device=param.device)
gather_list = list(torch.chunk(param, depth, dim=0))
dist.gather(temp, gather_list, dst=dst_rank, group=gpc.get_cpu_group(parallel_mode))
param = torch.transpose(param, 0, dim)
# update params in state_dict only on local rank 0
if gpc.get_local_rank(parallel_mode) == 0:
state_dict[key] = param
return state_dict
def _send_state_dict(state_dict, dst, parallel_mode):
state_tensor, state_size = dist.distributed_c10d._object_to_tensor(state_dict)
dist.send(state_size, dst, group=gpc.get_cpu_group(parallel_mode))
dist.send(state_tensor, dst, group=gpc.get_cpu_group(parallel_mode))
def _recv_state_dict(src, parallel_mode):
state_size = torch.tensor([0], dtype=torch.long)
dist.recv(state_size, src, group=gpc.get_cpu_group(parallel_mode))
state_tensor = torch.empty(state_size.item(), dtype=torch.uint8)
dist.recv(state_tensor, src, group=gpc.get_cpu_group(parallel_mode))
state_dict = dist.distributed_c10d._tensor_to_object(state_tensor, state_size)
return state_dict
def partition_pipeline_parallel_state_dict(model, state_dict):
pipeline_state = OrderedDict()
if gpc.get_local_rank(ParallelMode.TENSOR) == 0:
# receive all states from prev stage
if not gpc.is_first_rank(ParallelMode.PIPELINE):
state_dict = _recv_state_dict(gpc.get_prev_global_rank(ParallelMode.PIPELINE), ParallelMode.PIPELINE)
# move states to output
for name, _ in model.named_parameters(recurse=True):
if name in state_dict:
pipeline_state[name] = state_dict.pop(name)
for name, _ in model.named_buffers(recurse=True):
if name in state_dict:
pipeline_state[name] = state_dict.pop(name)
for name, _ in model.named_modules():
extra_state_key = name + "." + _EXTRA_STATE_KEY_SUFFIX
if extra_state_key in state_dict:
pipeline_state[extra_state_key] = state_dict.pop(extra_state_key)
# send rest states to next stage
if not gpc.is_last_rank(ParallelMode.PIPELINE):
_send_state_dict(state_dict, gpc.get_next_global_rank(ParallelMode.PIPELINE), ParallelMode.PIPELINE)
return pipeline_state
def gather_pipeline_parallel_state_dict(state_dict):
gathered_states = (
[None for _ in range(gpc.get_world_size(ParallelMode.PIPELINE))]
if gpc.get_local_rank(ParallelMode.PIPELINE) == 0
else None
)
dist.gather_object(
state_dict,
gathered_states,
dst=gpc.get_ranks_in_group(ParallelMode.PIPELINE)[0],
group=gpc.get_cpu_group(ParallelMode.PIPELINE),
)
state_dict = (
OrderedDict(chain.from_iterable(state.items() for state in gathered_states))
if gpc.get_local_rank(ParallelMode.PIPELINE) == 0
else OrderedDict()
)
return state_dict
def save_checkpoint(
file,
epoch: int,
model: torch.nn.Module,
optimizer: torch.optim.Optimizer = None,
lr_scheduler: torch.optim.lr_scheduler._LRScheduler = None,
**kwargs,
):
"""Stores the checkpoint to disk. Saves all the training components' parameters or buffers, such as model, optimizer,
lr_scheduler etc. into a checkpoint dictionary.
Args:
file: a file-like object (has to implement write and flush) or a string or os.PathLike object containing a
file name.
epoch (int): Epoch number (indicates how many epochs have you trained this model).
model (:class:`torch.nn.Module`): Model to be saved.
optimizer (Union[:class:`torch.optim.Optimizer`, :class:`colossalai.nn.optimizer`]): Optimizer to be saved.
lr_scheduler (Union[:class:`torch.optim.lr_scheduler`, :class:`colossalai.nn.lr_scheduler`], optional):
lr_scheduler to be saved, defaults to None.
pickle_module: module used for pickling metadata and objects
pickle_protocol: can be specified to override the default protocol
"""
# ckpt container
checkpoint = {"epoch": epoch}
model_state = model.state_dict()
if is_using_pp() and gpc.get_local_rank(ParallelMode.TENSOR) == 0:
model_state = gather_pipeline_parallel_state_dict(model_state)
if gpc.get_global_rank() == 0:
checkpoint["model"] = model_state
# if optimizer is not None:
# checkpoint['optimizer'] = optimizer.state_dict()
# if lr_scheduler is not None:
# checkpoint['lr_scheduler'] = lr_scheduler.state_dict()
torch.save(checkpoint, file, **kwargs)
def broadcast_model(model: torch.nn.Module):
src_rank = gpc.get_ranks_in_group(ParallelMode.TENSOR)[0]
for p in model.parameters():
if not getattr(p, IS_TENSOR_PARALLEL, False) and p.storage().size() > 0:
group = (
gpc.get_group(ParallelMode.TENSOR)
if p.device.type == "cuda"
else gpc.get_cpu_group(ParallelMode.TENSOR)
)
dist.broadcast(p, src_rank, group=group)
def load_checkpoint(
file,
model: torch.nn.Module,
optimizer: torch.optim.Optimizer = None,
lr_scheduler: torch.optim.lr_scheduler._LRScheduler = None,
strict: bool = True,
):
"""Loads training states from a checkpoint file.
Args:
file: a file-like object (has to implement read(), readline(), tell(), and seek()), or a string or os.PathLike
object containing a file name.
model (:class:`torch.nn.Module`): Model to load saved weights and buffers.
optimizer (Union[:class:`torch.optim.Optimizer`, :class:`colossalai.nn.optimizer`]): Optimizer to recuperate.
lr_scheduler (:class:`torch.optim.lr_scheduler._LRScheduler`, optional):
lr_scheduler to recuperate, defaults to None.
strict (bool, optional): Whether to strictly enforce that the keys in :attr:`state_dict`
of the checkpoint match the names of parameters and buffers in model, defaults to True.
Returns:
int: The saved epoch number.
Raises:
RuntimeError: Raise error if the model/optimizer cannot successfully be recuperated
"""
state_dict = (
torch.load(file, map_location=torch.device("cpu")) if gpc.get_local_rank(ParallelMode.MODEL) == 0 else None
)
# model states
model_state = state_dict.pop("model") if state_dict is not None else dict()
# pipeline
if is_using_pp():
model_state = partition_pipeline_parallel_state_dict(model, model_state)
try:
model.load_state_dict(model_state, strict=strict)
broadcast_model(model)
except RuntimeError as e:
error_msgs = str(e)
if error_msgs.startswith("Error(s) in loading state_dict for "):
error_msgs = error_msgs.split("\n\t")[1:]
dst_rank = gpc.get_ranks_in_group(ParallelMode.MODEL)[0]
all_error_msgs = [None for _ in range(gpc.get_world_size(ParallelMode.MODEL))]
dist.gather_object(error_msgs, all_error_msgs, dst=dst_rank, group=gpc.get_cpu_group(ParallelMode.MODEL))
if gpc.get_global_rank() == 0:
all_error_msgs = list(chain.from_iterable(all_error_msgs))
raise RuntimeError(
"Error(s) in loading state_dict for {}:\n\t{}".format(
model.__class__.__name__, "\n\t".join(all_error_msgs)
)
)
else:
raise e
# broadcast the rest states
state_dict = broadcast_state_dict(state_dict, ParallelMode.MODEL)
# # optimizer states
# if optimizer is not None and 'optimizer' in state_dict:
# optimizer.load_state_dict(state_dict['optimizer'])
# # lr scheduler states
# if lr_scheduler is not None and 'lr_scheduler' in state_dict:
# lr_scheduler.load_state_dict(state_dict['lr_scheduler'])
# last epoch
last_epoch = state_dict.pop("epoch", -1)
return last_epoch