diff --git a/colossalai/cli/launcher/run.py b/colossalai/cli/launcher/run.py
index e078a57c1..6411b4302 100644
--- a/colossalai/cli/launcher/run.py
+++ b/colossalai/cli/launcher/run.py
@@ -1,13 +1,16 @@
-import click
-import sys
 import os
-import torch
-from colossalai.context import Config
-from .multinode_runner import MultiNodeRunner
-from .hostinfo import HostInfo, HostInfoList
+import sys
 from typing import List
+
+import click
+import torch
 from packaging import version
 
+from colossalai.context import Config
+
+from .hostinfo import HostInfo, HostInfoList
+from .multinode_runner import MultiNodeRunner
+
 # Constants that define our syntax
 NODE_SEP = ','
 
@@ -15,7 +18,7 @@ NODE_SEP = ','
 def fetch_hostfile(hostfile_path: str, ssh_port: int) -> HostInfoList:
     """
     Parse the hostfile to obtain a list of hosts.
-    
+
     A hostfile should look like:
     worker-0
     worker-1
@@ -63,7 +66,7 @@ def parse_device_filter(device_pool: HostInfoList, include_str=None, exclude_str
         device_pool (HostInfoList): a list of HostInfo objects
         include_str (str): --include option passed by user, default None
         exclude_str (str): --exclude option passed by user, default None
-    
+
     Returns:
         filtered_hosts (HostInfoList): filtered hosts after inclusion/exclusion
     '''
@@ -192,7 +195,7 @@ def launch_multi_processes(args: Config) -> None:
     Launch multiple processes on a single node or multiple nodes.
 
     The overall logic can be summarized as the pseudo code below:
-    
+
     if hostfile given:
         hostinfo = parse_hostfile(hostfile)
         hostinfo = include_or_exclude_hosts(hostinfo)
@@ -202,7 +205,7 @@ def launch_multi_processes(args: Config) -> None:
         launch_on_multi_nodes(hostinfo)
     else:
         launch_on_current_node()
-    
+
     Args:
         args (Config): the arguments taken from command line
 
@@ -276,6 +279,33 @@ def launch_multi_processes(args: Config) -> None:
                                  extra_launch_args=args.extra_launch_args)
         runner.send(hostinfo=hostinfo, cmd=cmd)
 
-    runner.recv_from_all()
+    # start training
+    msg_from_node = runner.recv_from_all()
+    has_error = False
+
+    # print node status
+    click.echo("\n====== Training on All Nodes =====")
+    for hostname, msg in msg_from_node.items():
+        click.echo(f"{hostname}: {msg}")
+
+        # check if a process failed
+        if msg == "failure":
+            has_error = True
+
+    # stop all nodes
     runner.stop_all()
-    runner.recv_from_all()
+
+    # receive the stop status
+    msg_from_node = runner.recv_from_all()
+
+    # print node status
+    click.echo("\n====== Stopping All Nodes =====")
+    for hostname, msg in msg_from_node.items():
+        click.echo(f"{hostname}: {msg}")
+
+    # give the process an exit code
+    # so that it behaves like a normal process
+    if has_error:
+        sys.exit(1)
+    else:
+        sys.exit(0)
diff --git a/examples/tutorial/hybrid_parallel/README.md b/examples/tutorial/hybrid_parallel/README.md
index 6f975e863..1b5e54f92 100644
--- a/examples/tutorial/hybrid_parallel/README.md
+++ b/examples/tutorial/hybrid_parallel/README.md
@@ -1,45 +1,40 @@
 # Multi-dimensional Parallelism with Colossal-AI
+## Table of contents
 
-## 🚀Quick Start
-1. Install our model zoo.
-```bash
-pip install titans
-```
-2. Run with synthetic data which is of similar shape to CIFAR10 with the `-s` flag.
-```bash
-colossalai run --nproc_per_node 4 train.py --config config.py -s
+- [Overview](#-overview)
+- [Quick Start](#-quick-start)
+
+## 📚 Overview
+
+This example lets you quickly try out the hybrid parallelism provided by Colossal-AI.
+You can change the parameters below in `config.py` to try out different settings.
+
+```python
+# parallel setting
+TENSOR_PARALLEL_SIZE = 2
+TENSOR_PARALLEL_MODE = '1d'
+
+parallel = dict(
+    pipeline=2,
+    tensor=dict(mode=TENSOR_PARALLEL_MODE, size=TENSOR_PARALLEL_SIZE),
+)
 ```
-3. Modify the config file to play with different types of tensor parallelism, for example, change tensor parallel size to be 4 and mode to be 2d and run on 8 GPUs.
 
+## 🚀 Quick Start
+1. Install PyTorch
 
-## Install Titans Model Zoo
+2. Install the dependencies.
 ```bash
-pip install titans
+pip install -r requirements.txt
 ```
 
-
-## Prepare Dataset
-
-We use CIFAR10 dataset in this example. You should invoke the `donwload_cifar10.py` in the tutorial root directory or directly run the `auto_parallel_with_resnet.py`.
-The dataset will be downloaded to `colossalai/examples/tutorials/data` by default.
-If you wish to use customized directory for the dataset. You can set the environment variable `DATA` via the following command.
+3. Run the training scripts with synthetic data.
 ```bash
-export DATA=/path/to/data
-```
-
-
-## Run on 2*2 device mesh
-
-Current configuration setting on `config.py` is TP=2, PP=2.
-
-```bash
-# train with cifar10
 colossalai run --nproc_per_node 4 train.py --config config.py
-
-# train with synthetic data
-colossalai run --nproc_per_node 4 train.py --config config.py -s
 ```
+
+4. Modify the config file to play with different types of tensor parallelism. For example, change the tensor parallel size to 4 and the mode to `2d`, and run on 8 GPUs.
diff --git a/examples/tutorial/hybrid_parallel/config.py b/examples/tutorial/hybrid_parallel/config.py
index ac273c305..fe9abf2f1 100644
--- a/examples/tutorial/hybrid_parallel/config.py
+++ b/examples/tutorial/hybrid_parallel/config.py
@@ -3,7 +3,7 @@ from colossalai.amp import AMP_TYPE
 # hyperparameters
 # BATCH_SIZE is as per GPU
 # global batch size = BATCH_SIZE x data parallel size
-BATCH_SIZE = 256
+BATCH_SIZE = 4
 LEARNING_RATE = 3e-3
 WEIGHT_DECAY = 0.3
 NUM_EPOCHS = 2
@@ -12,11 +12,11 @@ WARMUP_EPOCHS = 1
 # model config
 IMG_SIZE = 224
 PATCH_SIZE = 16
-HIDDEN_SIZE = 512
+HIDDEN_SIZE = 128
 DEPTH = 4
 NUM_HEADS = 4
 MLP_RATIO = 2
-NUM_CLASSES = 1000
+NUM_CLASSES = 10
 CHECKPOINT = False
 SEQ_LENGTH = (IMG_SIZE // PATCH_SIZE)**2 + 1    # add 1 for cls token
 
diff --git a/examples/tutorial/hybrid_parallel/requirements.txt b/examples/tutorial/hybrid_parallel/requirements.txt
index dbf6aaf3e..99b7ecfe1 100644
--- a/examples/tutorial/hybrid_parallel/requirements.txt
+++ b/examples/tutorial/hybrid_parallel/requirements.txt
@@ -1,3 +1,3 @@
-colossalai >= 0.1.12
-torch >= 1.8.1
-titans
\ No newline at end of file
+torch
+colossalai
+titans
diff --git a/examples/tutorial/hybrid_parallel/test_ci.sh b/examples/tutorial/hybrid_parallel/test_ci.sh
index 8860b72a2..e0dbef354 100644
--- a/examples/tutorial/hybrid_parallel/test_ci.sh
+++ b/examples/tutorial/hybrid_parallel/test_ci.sh
@@ -2,4 +2,4 @@
 set -euxo pipefail
 
 pip install -r requirements.txt
-torchrun --standalone --nproc_per_node 4 train.py --config config.py -s
+colossalai run --nproc_per_node 4 train.py --config config.py
diff --git a/examples/tutorial/hybrid_parallel/train.py b/examples/tutorial/hybrid_parallel/train.py
index 2a8576db7..4953d5350 100644
--- a/examples/tutorial/hybrid_parallel/train.py
+++ b/examples/tutorial/hybrid_parallel/train.py
@@ -1,7 +1,6 @@
 import os
 
 import torch
-from titans.dataloader.cifar10 import build_cifar
 from titans.model.vit.vit import _create_vit_model
 from tqdm import tqdm
 
@@ -12,7 +11,7 @@ from colossalai.logging import get_dist_logger
 from colossalai.nn import CrossEntropyLoss
 from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR
 from colossalai.pipeline.pipelinable import PipelinableContext
-from colossalai.utils import get_dataloader, is_using_pp
+from colossalai.utils import is_using_pp
 
 
 class DummyDataloader():
@@ -42,12 +41,9 @@ class DummyDataloader():
 
 
 def main():
-    # initialize distributed setting
-    parser = colossalai.get_default_parser()
-    parser.add_argument('-s', '--synthetic', action="store_true", help="whether use synthetic data")
-    args = parser.parse_args()
-
     # launch from torch
+    parser = colossalai.get_default_parser()
+    args = parser.parse_args()
     colossalai.launch_from_torch(config=args.config)
 
     # get logger
@@ -94,15 +90,10 @@ def main():
         pipeline_stage = gpc.get_local_rank(ParallelMode.PIPELINE)
     logger.info(f"number of parameters: {total_numel} on pipeline stage {pipeline_stage}")
 
-    # create dataloaders
-    root = os.environ.get('DATA', '../data')
-    if args.synthetic:
-        # if we use synthetic dataset
-        # we train for 10 steps and eval for 5 steps per epoch
-        train_dataloader = DummyDataloader(length=10, batch_size=gpc.config.BATCH_SIZE)
-        test_dataloader = DummyDataloader(length=5, batch_size=gpc.config.BATCH_SIZE)
-    else:
-        train_dataloader, test_dataloader = build_cifar(gpc.config.BATCH_SIZE, root, pad_if_needed=True)
+    # use synthetic dataset
+    # we train for 10 steps and eval for 5 steps per epoch
+    train_dataloader = DummyDataloader(length=10, batch_size=gpc.config.BATCH_SIZE)
+    test_dataloader = DummyDataloader(length=5, batch_size=gpc.config.BATCH_SIZE)
 
     # create loss function
     criterion = CrossEntropyLoss(label_smoothing=0.1)
@@ -139,6 +130,7 @@ def main():
             engine.execute_schedule(data_iter, return_output_label=False)
             engine.step()
             lr_scheduler.step()
+    gpc.destroy()
 
 
 if __name__ == '__main__':
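
As a quick sanity check on the parallel settings touched above: in Colossal-AI, the world size must be divisible by pipeline size x tensor parallel size, and the quotient is used as the data parallel size. Below is a minimal sketch of that arithmetic; the helper name `data_parallel_size` is hypothetical and not a Colossal-AI API.

```python
# Hypothetical helper: relates the parallel sizes in config.py to the
# GPU count passed via --nproc_per_node (times the number of nodes).
def data_parallel_size(world_size: int, pipeline: int, tensor: int) -> int:
    # world size = data parallel size x pipeline size x tensor parallel size
    assert world_size % (pipeline * tensor) == 0, \
        "world size must be divisible by pipeline size x tensor parallel size"
    return world_size // (pipeline * tensor)

# TP=2, PP=2 as in config.py -> 4 GPUs, no data parallelism left over
assert data_parallel_size(world_size=4, pipeline=2, tensor=2) == 1
# TP=4 with '2d' mode, PP=2 as suggested in README step 4 -> needs 8 GPUs
assert data_parallel_size(world_size=8, pipeline=2, tensor=4) == 1
```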