ColossalAI/colossalai/legacy/nn/parallel/layers/module_utils.py

from typing import Dict

import torch

from colossalai.tensor import ColoParameter, ComputeSpec, ProcessGroup, distspec

from . import ColoModule

# Registry mapping a module type to the ColoModule handler registered for it.
# Written by register_colo_module(); the lookup helpers in this file search it
# with isinstance(), so an instance of a subclass matches its registered base type.
_COLOSSAL_MODULES: Dict[type, ColoModule] = {}


def register_colo_module(module_type: type, colo_module: ColoModule):
    """Associate ``module_type`` with ``colo_module`` in the module-level registry.

    Registering the same ``module_type`` again replaces the previous handler.

    Args:
        module_type: The type used as the registry key.
        colo_module: The handler stored for that type.
    """
    # The registry dict is mutated in place, so no ``global`` declaration is needed.
    _COLOSSAL_MODULES.update({module_type: colo_module})


def is_colo_module(module: torch.nn.Module):
    """Report whether ``module`` is an instance of any registered module type.

    Args:
        module: The object to test against every key of the registry.

    Returns:
        ``True`` if ``isinstance(module, t)`` holds for at least one registered
        type ``t``, otherwise ``False`` (including when the registry is empty).
    """
    return any(isinstance(module, registered_type) for registered_type in _COLOSSAL_MODULES)


def get_colo_module(module: torch.nn.Module):
    """Return the handler registered for the first type that ``module`` is an instance of.

    The registry is scanned once, in insertion order, using ``isinstance``, so
    an instance of a subclass resolves to the handler of its registered base type.

    Args:
        module: The object to look up.

    Returns:
        The handler stored for the first matching registered type, or ``None``
        when no registered type matches.
    """
    # Single pass with an explicit default: no preliminary membership scan and
    # no reliance on falling off the end of the function to produce None.
    return next(
        (handler for registered_type, handler in _COLOSSAL_MODULES.items()
         if isinstance(module, registered_type)),
        None,
    )


def _common_compute_pattern(module: torch.nn.Module, colo_module):
    """Return the compute pattern shared by the handler's parameters of ``module``.

    Returns ``None`` when none of the parameters has a compute spec. Raises
    ``Exception`` if a parameter is not a ``ColoParameter`` or if two parameters
    carry different compute patterns.
    """
    compute_pattern = None
    for param_name in colo_module.get_param_names():
        param = module.get_parameter(param_name)
        if not isinstance(param, ColoParameter):
            raise Exception(f'Invalid ColoParameter spec: {param} in {module} is not a ColoParameter.')
        if not param.has_compute_spec():
            continue
        cur_compute_pattern = param.compute_spec.compute_pattern
        if compute_pattern is None:
            # The first parameter with a compute spec fixes the expected pattern.
            compute_pattern = cur_compute_pattern
        elif cur_compute_pattern != compute_pattern:
            raise Exception(
                f'Invalid ColoParameter spec: Params in {module} have different compute_pattern.')
    return compute_pattern


def _params_match_specs(module: torch.nn.Module, param_specs) -> bool:
    """Return whether every parameter of ``module`` agrees with one candidate spec mapping.

    A parameter with a compute spec must have a ``dist_spec`` equal to the
    candidate's entry; a parameter without one only matches a ``None`` entry.
    """
    for param_name, dist_spec in param_specs.items():
        param = module.get_parameter(param_name)
        if param.has_compute_spec():
            if dist_spec != param.dist_spec:
                return False
        elif dist_spec is not None:
            return False
    return True


def check_colo_module(module: torch.nn.Module, pg: ProcessGroup, recursive=True):
    """Validate the parameter specs of ``module`` against its registered handler.

    For a module with a registered handler, every parameter named by the handler
    must be a ``ColoParameter``, all parameters that have a compute spec must
    share one compute pattern, and the parameters' dist specs must match at
    least one of the spec mappings the handler returns for that pattern.
    Modules without a registered handler, and modules in which no parameter has
    a compute spec, are not validated further.

    Args:
        module: The module to validate.
        pg: Process group forwarded to the handler's ``register`` call.
        recursive: When truthy, the direct children of ``module`` are validated
            the same way (and their children in turn).

    Raises:
        Exception: If a parameter is not a ``ColoParameter``, the parameters
            disagree on the compute pattern, the handler does not have the
            compute pattern, or no allowed spec mapping matches the parameters.
    """
    if is_colo_module(module):
        colo_module = get_colo_module(module)
        compute_pattern = _common_compute_pattern(module, colo_module)

        if compute_pattern is not None:
            # Registration happens before the handler is queried for the pattern.
            colo_module.register(compute_pattern, pg)
            if not colo_module.has_compute_pattern(compute_pattern):
                raise Exception(
                    f'Invalid ColoParameter spec: ComputePattern {compute_pattern} in {module} is not allowed.')

            allowed_specs = colo_module.get_dist_specs(compute_pattern)
            # any() stops at the first matching mapping, like the original break.
            if not any(_params_match_specs(module, param_specs) for param_specs in allowed_specs.values()):
                raise Exception(f'Invalid ColoParameter spec: Params in {module} are incorrectly sharded.')

    if recursive:
        for submodule in module.children():
            check_colo_module(submodule, pg=pg, recursive=True)


def init_colo_module(module: torch.nn.Module,
                     compute_spec: ComputeSpec,
                     pg: ProcessGroup,
                     recursive=True,
                     mode='default'):
    """Apply ``compute_spec`` and the handler's dist specs to the parameters of ``module``.

    For a module with a registered handler, the handler is registered for the
    compute pattern of ``compute_spec``. Each parameter for which the handler
    returns a non-``None`` dist spec under ``mode``, and which is a
    ``ColoParameter``, then gets its process group, dist spec and compute spec
    set. Afterwards ``module`` and every module listed in the updated
    parameters' ``shared_param_modules`` are re-validated with
    ``check_colo_module`` (non-recursively).

    Args:
        module: The module whose parameters are initialised.
        compute_spec: Compute spec assigned to each updated parameter; its
            ``compute_pattern`` selects the handler's dist specs.
        pg: Process group set on each updated parameter and forwarded to the
            handler's ``register`` call.
        recursive: When truthy, the direct children of ``module`` are
            initialised the same way (and their children in turn).
        mode: Mode name forwarded to the handler's ``*_with_mode`` queries.

    Raises:
        NotImplementedError: If the handler reports that it does not have the
            compute pattern for the requested ``mode``.
        Exception: Propagated from ``check_colo_module`` when the resulting
            parameter specs are invalid.
    """
    compute_pattern = compute_spec.compute_pattern
    if is_colo_module(module):
        colo_module = get_colo_module(module)
        colo_module.register(compute_pattern, pg)
        if not colo_module.has_compute_pattern_with_mode(compute_pattern, mode=mode):
            raise NotImplementedError(
                f'Compute pattern {compute_pattern} with mode {mode!r} is not available '
                f'for {type(module).__name__}.')

        # Modules that had at least one parameter updated here. Each must be
        # re-checked so that all of its params still match an allowed spec.
        modules_update_param = {module}
        dist_specs = colo_module.get_dist_specs_with_mode(compute_pattern, mode=mode)
        for param_name, dist_spec in dist_specs.items():
            if dist_spec is None:
                continue
            param = module.get_parameter(param_name)
            if isinstance(param, ColoParameter):
                # Set process group, dist spec and compute spec, in that order.
                param.set_process_group(pg)
                param.set_dist_spec(dist_spec)
                param.compute_spec = compute_spec
                modules_update_param.update(param.shared_param_modules)

        for mod in modules_update_param:
            check_colo_module(mod, pg, recursive=False)

    if recursive:
        for submodule in module.children():
            init_colo_module(submodule, compute_spec, pg=pg, recursive=True, mode=mode)
[Tensor] add module handler for linear (#1021) * add module spec for linear * polish * polish * polish 2022-05-26 03:50:44 +00:00			`from typing import Dict`
[legacy] move communication and nn to legacy and refactor logger (#4671) * [legacy] move communication to legacy (#4640) * [legacy] refactor logger and clean up legacy codes (#4654) * [legacy] make logger independent to gpc * [legacy] make optim independent to registry * [legacy] move test engine to legacy * [legacy] move nn to legacy (#4656) * [legacy] move nn to legacy * [checkpointio] fix save hf config * [test] remove useledd rpc pp test * [legacy] fix nn init * [example] skip tutorial hybriad parallel example * [devops] test doc check * [devops] test doc check 2023-09-11 08:24:28 +00:00
[Tensor] add module handler for linear (#1021) * add module spec for linear * polish * polish * polish 2022-05-26 03:50:44 +00:00			`import torch`

[legacy] move communication and nn to legacy and refactor logger (#4671) * [legacy] move communication to legacy (#4640) * [legacy] refactor logger and clean up legacy codes (#4654) * [legacy] make logger independent to gpc * [legacy] make optim independent to registry * [legacy] move test engine to legacy * [legacy] move nn to legacy (#4656) * [legacy] move nn to legacy * [checkpointio] fix save hf config * [test] remove useledd rpc pp test * [legacy] fix nn init * [example] skip tutorial hybriad parallel example * [devops] test doc check * [devops] test doc check 2023-09-11 08:24:28 +00:00			`from colossalai.tensor import ColoParameter, ComputeSpec, ProcessGroup, distspec`

			`from . import ColoModule`

[Tensor] add module handler for linear (#1021) * add module spec for linear * polish * polish * polish 2022-05-26 03:50:44 +00:00			`_COLOSSAL_MODULES: Dict[type, ColoModule] = {}`


			`def register_colo_module(module_type: type, colo_module: ColoModule):`
			`global _COLOSSAL_MODULES`
			`_COLOSSAL_MODULES[module_type] = colo_module`

reorgnize colotensor directory (#1062) * reorgnize colotensor directory * polish code 2022-06-03 10:04:22 +00:00
[Tensor] add module handler for linear (#1021) * add module spec for linear * polish * polish * polish 2022-05-26 03:50:44 +00:00			`def is_colo_module(module: torch.nn.Module):`
			`global _COLOSSAL_MODULES`
[Tensor] add hybrid device demo and fix bugs (#1059) 2022-06-03 04:09:49 +00:00			`for module_type in _COLOSSAL_MODULES.keys():`
fix module utils bug (#1066) 2022-06-06 04:11:48 +00:00			`if isinstance(module, module_type):`
[Tensor] add hybrid device demo and fix bugs (#1059) 2022-06-03 04:09:49 +00:00			`return True`
			`return False`
[Tensor] add module handler for linear (#1021) * add module spec for linear * polish * polish * polish 2022-05-26 03:50:44 +00:00
reorgnize colotensor directory (#1062) * reorgnize colotensor directory * polish code 2022-06-03 10:04:22 +00:00
[Tensor] add module handler for linear (#1021) * add module spec for linear * polish * polish * polish 2022-05-26 03:50:44 +00:00			`def get_colo_module(module: torch.nn.Module):`
			`global _COLOSSAL_MODULES`
			`if is_colo_module(module):`
[Tensor] add hybrid device demo and fix bugs (#1059) 2022-06-03 04:09:49 +00:00			`for module_type, colo_module in _COLOSSAL_MODULES.items():`
fix module utils bug (#1066) 2022-06-06 04:11:48 +00:00			`if isinstance(module, module_type):`
[Tensor] add hybrid device demo and fix bugs (#1059) 2022-06-03 04:09:49 +00:00			`return colo_module`
[Tensor] add module handler for linear (#1021) * add module spec for linear * polish * polish * polish 2022-05-26 03:50:44 +00:00			`else:`
			`return None`

reorgnize colotensor directory (#1062) * reorgnize colotensor directory * polish code 2022-06-03 10:04:22 +00:00
[refactor] remove gpc dependency in colotensor's _ops (#1189) 2022-07-04 10:54:37 +00:00			`def check_colo_module(module: torch.nn.Module, pg: ProcessGroup, recursive=True):`
[Tensor] add module handler for linear (#1021) * add module spec for linear * polish * polish * polish 2022-05-26 03:50:44 +00:00			`if is_colo_module(module):`
			`colo_module = get_colo_module(module)`
			`param_names = colo_module.get_param_names()`
			`compute_pattern = None`
			`for param_name in param_names:`
			`param = module.get_parameter(param_name)`
			`if not isinstance(param, ColoParameter):`
			`raise Exception(f'Invalid ColoParameter spec: {param} in {module} is not a ColoParameter.')`
[ColoTensor] rename APIs and add output_replicate to ComputeSpec (#1168) 2022-06-24 05:08:54 +00:00			`if param.has_compute_spec():`
[refactor] move process group from _DistSpec to ColoTensor. (#1203) 2022-07-06 08:15:16 +00:00			`cur_compute_pattern = param.compute_spec.compute_pattern`
[Tensor] add module handler for linear (#1021) * add module spec for linear * polish * polish * polish 2022-05-26 03:50:44 +00:00			`if compute_pattern is None:`
			`compute_pattern = cur_compute_pattern`
			`else:`
			`if cur_compute_pattern != compute_pattern:`
reorgnize colotensor directory (#1062) * reorgnize colotensor directory * polish code 2022-06-03 10:04:22 +00:00			`raise Exception(`
			`f'Invalid ColoParameter spec: Params in {module} have different compute_pattern.')`
[Tensor] add module handler for linear (#1021) * add module spec for linear * polish * polish * polish 2022-05-26 03:50:44 +00:00			`else:`
			`continue`
reorgnize colotensor directory (#1062) * reorgnize colotensor directory * polish code 2022-06-03 10:04:22 +00:00
[Tensor] add module handler for linear (#1021) * add module spec for linear * polish * polish * polish 2022-05-26 03:50:44 +00:00			`if compute_pattern is not None:`
[refactor] remove gpc dependency in colotensor's _ops (#1189) 2022-07-04 10:54:37 +00:00			`colo_module.register(compute_pattern, pg)`
[Tensor] add module handler for linear (#1021) * add module spec for linear * polish * polish * polish 2022-05-26 03:50:44 +00:00			`if not colo_module.has_compute_pattern(compute_pattern):`
reorgnize colotensor directory (#1062) * reorgnize colotensor directory * polish code 2022-06-03 10:04:22 +00:00			`raise Exception(`
			`f'Invalid ColoParameter spec: ComputePattern {compute_pattern} in {module} is not allowed.')`
[Tensor] add module handler for linear (#1021) * add module spec for linear * polish * polish * polish 2022-05-26 03:50:44 +00:00
			`match_specs = False`
			`allowed_specs = colo_module.get_dist_specs(compute_pattern)`
			`for _, param_specs in allowed_specs.items():`
			`cur_match = True`
			`for param_name, dist_spec in param_specs.items():`
			`param = module.get_parameter(param_name)`
[ColoTensor] rename APIs and add output_replicate to ComputeSpec (#1168) 2022-06-24 05:08:54 +00:00			`if param.has_compute_spec():`
[refactor] move process group from _DistSpec to ColoTensor. (#1203) 2022-07-06 08:15:16 +00:00			`if dist_spec != param.dist_spec:`
[Tensor] add module handler for linear (#1021) * add module spec for linear * polish * polish * polish 2022-05-26 03:50:44 +00:00			`cur_match = False`
			`break`
			`else:`
			`if dist_spec is not None:`
			`cur_match = False`
			`break`
			`if cur_match == True:`
			`match_specs = True`
			`break`
			`if match_specs == False:`
			`raise Exception(f'Invalid ColoParameter spec: Params in {module} are incorrectly sharded.')`
			`if recursive == True:`
			`for submodule in module.children():`
[refactor] remove gpc dependency in colotensor's _ops (#1189) 2022-07-04 10:54:37 +00:00			`check_colo_module(submodule, pg=pg, recursive=True)`
[Tensor] add module handler for linear (#1021) * add module spec for linear * polish * polish * polish 2022-05-26 03:50:44 +00:00
reorgnize colotensor directory (#1062) * reorgnize colotensor directory * polish code 2022-06-03 10:04:22 +00:00
[refactor] remove gpc dependency in colotensor's _ops (#1189) 2022-07-04 10:54:37 +00:00			`def init_colo_module(module: torch.nn.Module,`
			`compute_spec: ComputeSpec,`
			`pg: ProcessGroup,`
			`recursive=True,`
			`mode='default'):`
[Tensor] rename parallel_action (#1174) * rename parallel_action * polish 2022-06-27 02:04:45 +00:00			`compute_pattern = compute_spec.compute_pattern`
[Tensor] add module handler for linear (#1021) * add module spec for linear * polish * polish * polish 2022-05-26 03:50:44 +00:00			`if is_colo_module(module):`
			`# for each param`
[hotfix] fix unit test test_module_spec (#1321) 2022-07-15 06:02:32 +00:00			`# set its process_group, dist_spec and compute_spec`
[Tensor] add module handler for linear (#1021) * add module spec for linear * polish * polish * polish 2022-05-26 03:50:44 +00:00			`colo_module = get_colo_module(module)`
[refactor] remove gpc dependency in colotensor's _ops (#1189) 2022-07-04 10:54:37 +00:00			`colo_module.register(compute_pattern, pg)`
[Tensor] add module check and bert test (#1031) * add Embedding * Add bert test * polish * add check module test * polish * polish * polish * polish 2022-05-26 10:15:42 +00:00			`if not colo_module.has_compute_pattern_with_mode(compute_pattern, mode=mode):`
[Tensor] add module handler for linear (#1021) * add module spec for linear * polish * polish * polish 2022-05-26 03:50:44 +00:00			`raise NotImplementedError`
[Tensor] add module check and bert test (#1031) * add Embedding * Add bert test * polish * add check module test * polish * polish * polish * polish 2022-05-26 10:15:42 +00:00			`# a set for modules which update at least one param in the init process.`
			`# these modules need to be checked whether all params still match one of the valid compute pattern.`
			`modules_update_param = {module}`
			`for param_name, dist_spec in colo_module.get_dist_specs_with_mode(compute_pattern, mode=mode).items():`
[Tensor] add module handler for linear (#1021) * add module spec for linear * polish * polish * polish 2022-05-26 03:50:44 +00:00			`if dist_spec is None:`
			`continue`
			`param = module.get_parameter(param_name)`
			`if isinstance(param, ColoParameter):`
[hotfix] fix unit test test_module_spec (#1321) 2022-07-15 06:02:32 +00:00			`param.set_process_group(pg)`
[refactor] move process group from _DistSpec to ColoTensor. (#1203) 2022-07-06 08:15:16 +00:00			`param.set_dist_spec(dist_spec)`
			`param.compute_spec = compute_spec`
[Tensor] add module check and bert test (#1031) * add Embedding * Add bert test * polish * add check module test * polish * polish * polish * polish 2022-05-26 10:15:42 +00:00			`for mod in param.shared_param_modules:`
			`modules_update_param.add(mod)`
			`for mod in modules_update_param:`
[refactor] remove gpc dependency in colotensor's _ops (#1189) 2022-07-04 10:54:37 +00:00			`check_colo_module(mod, pg, recursive=False)`
[Tensor] add module handler for linear (#1021) * add module spec for linear * polish * polish * polish 2022-05-26 03:50:44 +00:00			`if recursive == True:`
			`for submodule in module.children():`
[refactor] remove gpc dependency in colotensor's _ops (#1189) 2022-07-04 10:54:37 +00:00			`init_colo_module(submodule, compute_spec, pg=pg, recursive=True, mode=mode)`