@@ -29,12 +29,12 @@ from colossalai.global_variables import moe_env
 
 
 def get_default_parser():
-    '''Reads user command line and uses an argument parser to parse the input arguments.
+    """Reads user command line and uses an argument parser to parse the input arguments.
     Input arguments include configuration, host, port, world size, local rank, backend for torch.distributed.
 
-    :return: returns the parser with the default arguments, the user may add customized arguments into this parser
+    :return: Returns the parser with the default arguments, the user may add customized arguments into this parser
     :rtype: Namespace
-    '''
+    """
     parser = argparse.ArgumentParser()
     parser.add_argument('--config', type=str, help='path to the config file')
     parser.add_argument('--host',
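
For context, a minimal usage sketch of the parser helper above (not part of the patch); the --batch_size flag and the train.py entry point are hypothetical user additions:

    # train.py -- extend the default parser with a custom argument
    import colossalai

    parser = colossalai.get_default_parser()                    # preset: --config, --host, --port, ...
    parser.add_argument('--batch_size', type=int, default=32)   # user-defined extra argument
    args = parser.parse_args()
    print(args.config, args.batch_size)
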
@@ -64,28 +64,30 @@ def launch(config: Union[str, Path, Config, Dict],
            local_rank: int = None,
            seed: int = 1024,
            verbose: bool = True):
-    '''This function first parses the configuration arguments, using :func:parse_args() in case one of the input arguments are not given.
-    Then initialize and set distributed environment by calling global_context's functions.
+    """This function first parses the configuration arguments, using :func:`parse_args()` in case one of the input
+    arguments is not given. Then initialize and set distributed environment by calling global_context's functions.
 
-    :param config: config file or config file path are both acceptable
+    :param config: Config file or config file path are both acceptable
     :type config: Union[str, dict, Config]
-    :param rank: rank for the default process group
+    :param rank: Rank for the default process group
     :type rank: int
-    :param world_size: world size of the default process group
+    :param world_size: World size of the default process group
     :type world_size: int
-    :param host: the master address for distributed training
+    :param host: The master address for distributed training
     :type host: str
-    :param port: the master port for distributed training
+    :param port: The master port for distributed training
     :type port: str
-    :param backend: backend for torch.distributed
-    :type backend: str
-    :param local_rank: rank for the process on the node and is used to set the default CUDA device,
-        defaults to None. If local_rank = None, the default device ordinal will be calculated automatically
+    :param backend: Backend for torch.distributed
+    :type backend: str, optional
+    :param local_rank: Rank for the process on the node and is used to set the default CUDA device, defaults to None.
+        If local_rank = None, the default device ordinal will be calculated automatically
     :type local_rank: int, optional
-    :param verbose: whether to print logs
-    :type verbose: bool
-    :raises Exception: raise exception when config type is wrong
-    '''
+    :param seed: Specified random seed for every process
+    :type seed: int, optional
+    :param verbose: Whether to print logs
+    :type verbose: bool, optional
+    :raises Exception: Raise exception when config type is wrong
+    """
     gpc.verbose = verbose
 
     # set config
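
A sketch of calling launch directly with the arguments documented in the docstring above, for a single process on the local machine (the config path and port values are illustrative, not part of the patch):

    # one process, world size 1, master on localhost
    import colossalai

    colossalai.launch(config='./config.py',   # config file path, dict or Config object
                      rank=0,
                      world_size=1,
                      host='127.0.0.1',
                      port=29500,
                      backend='nccl',
                      seed=1024)
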
@@ -123,20 +125,22 @@ def launch_from_slurm(config: Union[str, Path, Config, Dict],
                       backend: str = 'nccl',
                       seed: int = 1024,
                       verbose: bool = True):
-    '''A wrapper for colossalai.launch for SLURM launcher by reading rank and world size from the environment variables
+    """A wrapper for colossalai.launch for SLURM launcher by reading rank and world size from the environment variables
     set by SLURM
 
-    :param config: config file or config file path are both acceptable
+    :param config: Config file or config file path are both acceptable
     :type config: Union[str, dict, Config]
-    :param host: the master address for distributed training
+    :param host: The master address for distributed training
     :type host: str
-    :param port: the master port for distributed training
+    :param port: The master port for distributed training
     :type port: str
-    :param backend: backend for torch.distributed
-    :type backend: str
-    :param verbose: whether to print logs
-    :type verbose: bool
-    '''
+    :param backend: Backend for torch.distributed
+    :type backend: str, optional
+    :param seed: Specified random seed for every process
+    :type seed: int, optional
+    :param verbose: Whether to print logs
+    :type verbose: bool, optional
+    """
     rank = int(os.environ['SLURM_PROCID'])
     world_size = int(os.environ['SLURM_NPROCS'])
     launch(config=config,
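
A sketch of the SLURM path documented above: SLURM_PROCID and SLURM_NPROCS supply rank and world size, so the script only passes the config plus the master endpoint (hostname and port are illustrative):

    # started under SLURM, e.g. `srun python train.py`
    import colossalai

    colossalai.launch_from_slurm(config='./config.py',
                                 host='node001',   # master node address
                                 port=29500)       # master port
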
@@ -155,20 +159,22 @@ def launch_from_openmpi(config: Union[str, Path, Config, Dict],
                         backend: str = 'nccl',
                         seed: int = 1024,
                         verbose: bool = True):
-    '''A wrapper for colossalai.launch for OpenMPI launcher by reading rank and world size from the environment variables
+    """A wrapper for colossalai.launch for OpenMPI launcher by reading rank and world size from the environment variables
     set by OpenMPI
 
-    :param config: config file or config file path are both acceptable
+    :param config: Config file or config file path are both acceptable
     :type config: Union[str, dict, Config]
-    :param host: the master address for distributed training
+    :param host: The master address for distributed training
     :type host: str
-    :param port: the master port for distributed training
+    :param port: The master port for distributed training
     :type port: str
-    :param backend: backend for torch.distributed
-    :type backend: str
-    :param verbose: whether to print logs
-    :type verbose: bool
-    '''
+    :param backend: Backend for torch.distributed
+    :type backend: str, optional
+    :param seed: Specified random seed for every process
+    :type seed: int, optional
+    :param verbose: Whether to print logs
+    :type verbose: bool, optional
+    """
     rank = int(os.environ['OMPI_COMM_WORLD_RANK'])
     local_rank = int(os.environ['OMPI_COMM_WORLD_LOCAL_RANK'])
     world_size = int(os.environ['OMPI_COMM_WORLD_SIZE'])
@@ -187,20 +193,18 @@ def launch_from_torch(config: Union[str, Path, Config, Dict],
                       backend: str = 'nccl',
                       seed: int = 1024,
                       verbose: bool = True):
-    '''A wrapper for colossalai.launch for torchrun or torch.distributed.launch by reading rank and world size
+    """A wrapper for colossalai.launch for torchrun or torch.distributed.launch by reading rank and world size
     from the environment variables set by PyTorch
 
-    :param config: config file or config file path are both acceptable
+    :param config: Config file or config file path are both acceptable
     :type config: Union[str, dict, Config]
-    :param host: the master address for distributed training
-    :type host: str
-    :param port: the master port for distributed training
-    :type port: str
-    :param backend: backend for torch.distributed
-    :type backend: str
-    :param verbose: whether to print logs
-    :type verbose: bool
-    '''
+    :param backend: Backend for torch.distributed
+    :type backend: str, optional
+    :param seed: Specified random seed for every process
+    :type seed: int, optional
+    :param verbose: Whether to print logs
+    :type verbose: bool, optional
+    """
     rank = int(os.environ['RANK'])
     local_rank = int(os.environ['LOCAL_RANK'])
     world_size = int(os.environ['WORLD_SIZE'])
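
A sketch of the torchrun path, assuming (as the environment-variable reads above suggest) that this wrapper needs only the config, with rank, local rank, world size and the master endpoint taken from the variables exported by torchrun or torch.distributed.launch:

    # started via e.g. `torchrun --nproc_per_node=2 train.py`
    import colossalai

    colossalai.launch_from_torch(config='./config.py')
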
@@ -225,25 +229,26 @@ def initialize(model: Union[nn.Module, List[nn.Module]],
                lr_scheduler: _LRScheduler = None,
                verbose: bool = True
                ) -> Tuple[Engine, DataLoader, DataLoader, _LRScheduler]:
-    '''Core function to wrap the essential training components with our functionality based on the config which is loaded into gpc.config.
+    """Core function to wrap the essential training components with our functionality based on the config which is
+    loaded into gpc.config.
 
-    :param model: your model instance
+    :param model: Your model instance
     :type model: :class:`torch.nn.Module`
-    :param optimizer: your optimizer instance
+    :param optimizer: Your optimizer instance
     :type optimizer: :class:`torch.optim.optimizer.Optimizer`
-    :param criterion: your criterion instance
+    :param criterion: Your criterion instance
     :type criterion: :class:`torch.nn.modules.loss._Loss`
-    :param train_dataloader: dataloader for training data
-    :type train_dataloader: :class:`torch.utils.data.DataLoader`
-    :param train_dataloader: dataloader for testing data
-    :type train_dataloader: :class:`torch.utils.data.DataLoader`
-    :param lr_scheduler: your lr scheduler instance
-    :type lr_scheduler: :class:`torch.nn.lr_scheduler._LRScheduler`
-    :param verbose: whether to print logs
-    :type verbose: bool
+    :param train_dataloader: Dataloader for training
+    :type train_dataloader: :class:`torch.utils.data.DataLoader`, optional
+    :param test_dataloader: Dataloader for testing
+    :type test_dataloader: :class:`torch.utils.data.DataLoader`, optional
+    :param lr_scheduler: Your lr scheduler instance
+    :type lr_scheduler: :class:`torch.optim.lr_scheduler._LRScheduler`, optional
+    :param verbose: Whether to print logs
+    :type verbose: bool, optional
     :return: (engine, train_dataloader, test_dataloader, lr_scheduler)
-    :rtype: tuple
-    '''
+    :rtype: Tuple
+    """
     # get logger
     logger = get_dist_logger()
     gpc.verbose = verbose
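
Finally, a sketch of how the components documented in this docstring are handed to initialize after one of the launch_* calls; the model, optimizer, criterion and dataloader below are placeholders, not part of the patch:

    import torch
    import torch.nn as nn
    from torch.utils.data import DataLoader, TensorDataset

    import colossalai

    colossalai.launch_from_torch(config='./config.py')   # or any other launch_* variant

    model = nn.Linear(32, 2)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
    criterion = nn.CrossEntropyLoss()
    train_loader = DataLoader(TensorDataset(torch.randn(64, 32),
                                            torch.randint(0, 2, (64,))),
                              batch_size=16)

    # returns (engine, train_dataloader, test_dataloader, lr_scheduler) as documented above
    engine, train_loader, _, _ = colossalai.initialize(model=model,
                                                       optimizer=optimizer,
                                                       criterion=criterion,
                                                       train_dataloader=train_loader)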