From d182b0bd4749635222cbf83b1fa80308ddcd6147 Mon Sep 17 00:00:00 2001
From: YuliangLiu0306 <72588413+YuliangLiu0306@users.noreply.github.com>
Date: Mon, 23 May 2022 14:02:28 +0800
Subject: [PATCH] [hotfix] fix some bugs caused by size mismatch. (#1011)

* [CLI] add CLI launcher

* Revert "[CLI] add CLI launcher"

This reverts commit df7e6506d4500af6a9220ef7fe4d3c7b1daebd4c.

* [hotfix] fix some bugs caused by size mismatch.

* add warning logs

* polish
---
 colossalai/cli/benchmark/benchmark.py | 9 +++++++++
 colossalai/cli/benchmark/utils.py     | 8 +++++++-
 2 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/colossalai/cli/benchmark/benchmark.py b/colossalai/cli/benchmark/benchmark.py
index 5bf09aa4e..43632b150 100644
--- a/colossalai/cli/benchmark/benchmark.py
+++ b/colossalai/cli/benchmark/benchmark.py
@@ -23,6 +23,8 @@ def run_benchmark(args: Config) -> None:
     if args.gpus is None:
         click.echo("Error: --num_gpus is not given")
         exit()
+    if args.gpus <= 1:
+        click.echo("Warning: tensor parallelism requires at least 2 devices to be activated.")

     click.echo("=== Benchmarking Parameters ===")
     for k, v in args.items():
@@ -63,6 +65,13 @@ def run_dist_profiling(rank: int, world_size: int, port_list: List[int], config_
         colossalai.launch(config=config, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
         timer = MultiTimer()

+        # Skip 1D parallel if in_features or out_features is not divisible by the 1D parallel size.
+        if config.parallel.tensor.mode == '1d' and hyperparams.dimension % config.parallel.tensor.size != 0:
+            click.echo(
+                "1D parallel will be skipped because in_features or out_features is not divisible by the 1D parallel size."
+            )
+            continue
+
         if hyperparams.model == 'mlp':
             model = MLP(dim=hyperparams.dimension, layers=hyperparams.layers)
         else:
diff --git a/colossalai/cli/benchmark/utils.py b/colossalai/cli/benchmark/utils.py
index 41ac38b82..825b795f2 100644
--- a/colossalai/cli/benchmark/utils.py
+++ b/colossalai/cli/benchmark/utils.py
@@ -48,9 +48,15 @@ def find_all_configs(device_cnt: int) -> List[Dict]:
     """

     def _is_square(num):
+        # 2D parallel requires at least 2 devices, so exclude 1 even though it is a perfect square.
+        if num <= 1:
+            return False
         return math.floor(math.sqrt(num))**2 == num

     def _is_cube(num):
+        # 3D parallel requires at least 2 devices, so exclude 1 even though it is a perfect cube.
+        if num <= 1:
+            return False
         return math.floor(num**(1. / 3.))**3 == num

     config_list = []
@@ -63,7 +69,7 @@ def find_all_configs(device_cnt: int) -> List[Dict]:
     config = dict(parallel=dict(tensor=dict(size=device_cnt, mode='1d')))
     config_list.append(config)

-    # add 1D config only if device_cnt is a square
+    # add 2D config only if device_cnt is a square
     if _is_square(device_cnt):
         config = dict(parallel=dict(tensor=dict(size=device_cnt, mode='2d')))
         config_list.append(config)
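
For context, the selection rules this fix enforces can be summarized in a short standalone sketch. This is not part of the patch and not a ColossalAI API; the helper name eligible_tp_modes and its parameters are illustrative assumptions, and it only mirrors the guards added above: tensor parallelism needs more than one device, 1D is skipped when the benchmarked dimension is not divisible by the parallel size, and 2D/3D require a square or cube device count.

import math
from typing import List


def eligible_tp_modes(device_cnt: int, dimension: int) -> List[str]:
    """Illustrative helper (not a ColossalAI API): tensor-parallel modes the
    benchmark would profile after this fix, given a device count and layer dimension."""
    modes = []
    # 1D parallel: needs >= 2 devices and a dimension divisible by the parallel size,
    # otherwise the benchmark now skips it instead of hitting a size mismatch.
    if device_cnt > 1 and dimension % device_cnt == 0:
        modes.append('1d')
    # 2D parallel: device count must be a perfect square greater than 1.
    if device_cnt > 1 and math.isqrt(device_cnt) ** 2 == device_cnt:
        modes.append('2d')
    # 3D parallel: device count must be a perfect cube greater than 1 (round() is used
    # here to sidestep float error in cube roots, e.g. 64 ** (1. / 3.) == 3.999...).
    if device_cnt > 1 and round(device_cnt ** (1. / 3.)) ** 3 == device_cnt:
        modes.append('3d')
    return modes


if __name__ == '__main__':
    print(eligible_tp_modes(4, 1024))    # ['1d', '2d']
    print(eligible_tp_modes(1, 1024))    # []   -- a single device only triggers the new warning
    print(eligible_tp_modes(4, 1023))    # ['2d'] -- 1023 % 4 != 0, so 1D is skipped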