[misc] refactor launch API and tensor constructor (#5666)

* [misc] remove config arg from initialize

* [misc] remove old tensor constructor

* [plugin] add npu support for ddp

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* [devops] fix doc test ci

* [test] fix test launch

* [doc] update launch doc

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Hongxin Liu 2024-04-29 10:40:11 +08:00 committed by GitHub
parent 91fa553775
commit 7f8b16635b
223 changed files with 294 additions and 403 deletions
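The diffs below all follow the same two patterns: the deprecated `config` argument is dropped from the `colossalai.launch*` entry points, and CUDA-only tensor constructors are replaced with device-agnostic ones. A minimal before/after sketch of the new launch call under `torchrun` (the keyword arguments shown are the ones kept by this refactor; the seed value is illustrative):

```python
import colossalai

# Before this commit: a (now unused) config dict had to be passed explicitly.
# colossalai.launch_from_torch({}, seed=42)

# After this commit: rank, world size, host and port are read from the
# environment variables set by torchrun; only backend/seed/verbose remain.
colossalai.launch_from_torch(seed=42)
```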

@@ -56,7 +56,7 @@ jobs:
 needs: detect-changed-doc
 runs-on: [self-hosted, gpu]
 container:
-  image: hpcaitech/pytorch-cuda:2.0.0-11.7.0
+  image: hpcaitech/pytorch-cuda:2.1.0-12.1.0
   options: --gpus all --rm
 timeout-minutes: 20
 defaults:

@@ -136,7 +136,7 @@ def main() -> None:
 # ==============================
 # Initialize Distributed Training
 # ==============================
-colossalai.launch_from_torch({})
+colossalai.launch_from_torch()
 accelerator = get_accelerator()
 coordinator = DistCoordinator()

@@ -66,7 +66,7 @@ def benchmark_train(args):
 # ==============================
 # Initialize Distributed Training
 # ==============================
-colossalai.launch_from_torch({})
+colossalai.launch_from_torch()
 coordinator = DistCoordinator()
 # ======================================================

@@ -37,7 +37,7 @@ def train(args):
 # ==============================
 # Initialize Distributed Training
 # ==============================
-colossalai.launch_from_torch({})
+colossalai.launch_from_torch()
 coordinator = DistCoordinator()
 # ==============================

@@ -39,7 +39,7 @@ def train(args):
 # ==============================
 # Initialize Distributed Training
 # ==============================
-colossalai.launch_from_torch({})
+colossalai.launch_from_torch()
 coordinator = DistCoordinator()
 # ======================================================

@@ -34,7 +34,7 @@ def train(args):
 # ==============================
 # Initialize Distributed Training
 # ==============================
-colossalai.launch_from_torch({})
+colossalai.launch_from_torch()
 coordinator = DistCoordinator()
 # ======================================================

@@ -29,7 +29,7 @@ def train(args):
 # ==============================
 # Initialize Distributed Training
 # ==============================
-colossalai.launch_from_torch({})
+colossalai.launch_from_torch()
 coordinator = DistCoordinator()
 # ==============================

@@ -81,7 +81,7 @@ def rm_and_merge(
 def main(args):
-colossalai.launch_from_torch(config={}, seed=42)
+colossalai.launch_from_torch(seed=42)
 accelerator = get_accelerator()
 world_size = dist.get_world_size()

@@ -81,7 +81,7 @@ def rm_and_merge(
 def main(args):
-colossalai.launch_from_torch(config={}, seed=42)
+colossalai.launch_from_torch(seed=42)
 world_size = dist.get_world_size()
 rank = dist.get_rank()

@@ -57,7 +57,7 @@ def main():
 args = parse_args()
 # Launch ColossalAI
-colossalai.launch_from_torch(config={}, seed=args.seed)
+colossalai.launch_from_torch(seed=args.seed)
 coordinator = DistCoordinator()
 config = MixtralConfig.from_pretrained(args.model_name)
@@ -96,7 +96,11 @@ def main():
 if coordinator.rank == 0:
 text = ["Hello my name is"]
 else:
-text = ["What's the largest country in the world?", "How many people live in China?", "帮我续写这首诗:离离原上草"]
+text = [
+    "What's the largest country in the world?",
+    "How many people live in China?",
+    "帮我续写这首诗:离离原上草",
+]
 tokenizer.pad_token = tokenizer.unk_token
 inputs = tokenizer(text, return_tensors="pt", padding=True).to(torch.cuda.current_device())

@@ -50,7 +50,7 @@ def check_mixtral_moe_layer():
 def run_dist(rank: int, world_size: int, port: int):
-colossalai.launch({}, rank, world_size, "localhost", port)
+colossalai.launch(rank, world_size, "localhost", port)
 check_mixtral_moe_layer()

@@ -133,7 +133,7 @@ def check_mixtral_moe_layer():
 def run_dist(rank: int, world_size: int, port: int):
-colossalai.launch({}, rank, world_size, "localhost", port)
+colossalai.launch(rank, world_size, "localhost", port)
 check_mixtral_moe_layer()

@@ -145,7 +145,7 @@ def main():
 args = parse_args()
 # Launch ColossalAI
-colossalai.launch_from_torch(config={}, seed=args.seed)
+colossalai.launch_from_torch(seed=args.seed)
 coordinator = DistCoordinator()
 # Set plugin
@@ -195,9 +195,9 @@ def main():
 lr_scheduler = CosineAnnealingWarmupLR(
     optimizer=optimizer,
     total_steps=args.num_epochs * len(dataloader),
-    warmup_steps=args.warmup_steps
-    if args.warmup_steps is not None
-    else int(args.num_epochs * len(dataloader) * 0.025),
+    warmup_steps=(
+        args.warmup_steps if args.warmup_steps is not None else int(args.num_epochs * len(dataloader) * 0.025)
+    ),
     eta_min=0.1 * args.lr,
 )

@@ -126,7 +126,7 @@ class AMPOptimizer(OptimizerWrapper):
 return self.grad_scaler.scale.item()
 def zero_grad(self, *args, **kwargs):
-self.module.overflow_counter = torch.cuda.IntTensor([0])
+self.module.overflow_counter = torch.tensor([0], dtype=torch.int, device=get_accelerator().get_current_device())
 return self.optim.zero_grad(set_to_none=True)
 def step(self, *args, **kwargs):

@@ -4,7 +4,7 @@ from typing import Optional, Set
 import torch
 import torch.nn as nn
-from colossalai.utils import _cast_float
+from colossalai.utils import _cast_float, get_current_device
 from colossalai.utils.common import free_storage
 from .region_manager import RegionManager
@@ -25,7 +25,7 @@ class BaseOffloadModule:
 self.model = model
 self.region_manager = region_manager
 self.grad_hook_list = []
-self.overflow_counter = torch.cuda.IntTensor([0])
+self.overflow_counter = torch.tensor([0], dtype=torch.int, device=get_current_device())
 self.grad_offload_stream = torch.cuda.current_stream() if is_sync else GlobalRuntimeInfo.d2h_stream
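The hunks above replace the legacy `torch.cuda.IntTensor` constructor with a plain `torch.tensor` call placed on the current accelerator device. A small sketch of the pattern (the variable name is illustrative; `get_current_device` comes from `colossalai.utils` as in the diff):

```python
import torch
from colossalai.utils import get_current_device

# Old, CUDA-only constructor:
#   overflow_counter = torch.cuda.IntTensor([0])
# New, device-agnostic form: the target device is resolved at runtime,
# so the same code can also run on NPU or other accelerator backends.
overflow_counter = torch.tensor([0], dtype=torch.int, device=get_current_device())
```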

@@ -10,6 +10,7 @@ from colossalai.checkpoint_io import CheckpointIO, GeneralCheckpointIO
 from colossalai.cluster import DistCoordinator
 from colossalai.interface import ModelWrapper, OptimizerWrapper
 from colossalai.quantization import BnbQuantizationConfig, quantize_model
+from colossalai.utils import get_current_device
 from .dp_plugin_base import DPPluginBase
@@ -203,7 +204,7 @@ class TorchDDPPlugin(DPPluginBase):
 return True
 def supported_devices(self) -> List[str]:
-return ["cuda"]
+return ["cuda", "npu"]
 def configure(
 self,
@@ -214,7 +215,7 @@ class TorchDDPPlugin(DPPluginBase):
 lr_scheduler: Optional[LRScheduler] = None,
 ) -> Tuple[nn.Module, OptimizerWrapper, Callable, DataLoader, LRScheduler]:
 # cast model to cuda
-model = model.cuda()
+model = model.to(get_current_device())
 # convert model to sync bn
 model = nn.SyncBatchNorm.convert_sync_batchnorm(model, None)
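With `supported_devices()` extended and the explicit `.cuda()` call replaced by `.to(get_current_device())`, the DDP plugin can be used unchanged on NPU machines. A rough usage sketch under the usual Booster workflow (the toy model and optimizer are placeholders, not part of this commit):

```python
import torch
import torch.nn as nn

import colossalai
from colossalai.booster import Booster
from colossalai.booster.plugin import TorchDDPPlugin

colossalai.launch_from_torch()        # new launch API, no config dict
plugin = TorchDDPPlugin()             # supported_devices() -> ["cuda", "npu"]
booster = Booster(plugin=plugin)

model = nn.Linear(16, 4)              # placeholder model
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
# boost() moves the model to get_current_device() before wrapping it in DDP,
# so the script no longer assumes a CUDA device.
model, optimizer, *_ = booster.boost(model, optimizer)
```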

@@ -114,7 +114,7 @@ import colossalai
 from transformers import LlamaForCausalLM, LlamaTokenizer
 #launch distributed environment
-colossalai.launch_from_torch(config={})
+colossalai.launch_from_torch()
 # load original model and tokenizer
 model = LlamaForCausalLM.from_pretrained("/path/to/model")

@@ -2,20 +2,15 @@
 # -*- encoding: utf-8 -*-
 import os
-import warnings
-from pathlib import Path
-from typing import Dict, Union
 import torch.distributed as dist
 from colossalai.accelerator import get_accelerator
-from colossalai.context import Config
 from colossalai.logging import get_dist_logger
 from colossalai.utils import set_seed
 def launch(
-    config: Union[str, Path, Config, Dict],
     rank: int,
     world_size: int,
     host: str,
@@ -44,8 +39,6 @@ def launch(
 Raises:
     Exception: Raise exception when config type is wrong
 """
-if rank == 0:
-    warnings.warn("`config` is deprecated and will be removed soon.")
 cur_accelerator = get_accelerator()
@@ -68,7 +61,6 @@ def launch(
 def launch_from_slurm(
-    config: Union[str, Path, Config, Dict],
     host: str,
     port: int,
     backend: str = "nccl",
@@ -95,7 +87,6 @@ def launch_from_slurm(
 )
 launch(
-    config=config,
     rank=rank,
     world_size=world_size,
     host=host,
@@ -107,7 +98,6 @@ def launch_from_slurm(
 def launch_from_openmpi(
-    config: Union[str, Path, Config, Dict],
     host: str,
     port: int,
     backend: str = "nccl",
@@ -135,7 +125,6 @@ def launch_from_openmpi(
 )
 launch(
-    config=config,
     local_rank=local_rank,
     rank=rank,
     world_size=world_size,
@@ -147,9 +136,7 @@ def launch_from_openmpi(
 )
-def launch_from_torch(
-    config: Union[str, Path, Config, Dict], backend: str = "nccl", seed: int = 1024, verbose: bool = True
-):
+def launch_from_torch(backend: str = "nccl", seed: int = 1024, verbose: bool = True):
 """A wrapper for colossalai.launch for torchrun or torch.distributed.launch by reading rank and world size
 from the environment variables set by PyTorch
@@ -171,7 +158,6 @@ def launch_from_torch(
 )
 launch(
-    config=config,
     local_rank=local_rank,
     rank=rank,
     world_size=world_size,

@@ -56,7 +56,7 @@ class Worker:
 # initialize a ray collective group, otherwise colossalai distributed env won't be built successfully
 collective.init_collective_group(world_size, rank, "nccl", "default")
 # initialize and set distributed environment
-colossalai.launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
+colossalai.launch(rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
 ray_serve_logger.info(f"Worker with rank {rank} (world size {world_size}) setting up..")
 log_cuda_info("Worker.setup")

@@ -42,7 +42,7 @@ class CaiInferEngine:
 import colossalai
 from transformers import LlamaForCausalLM, LlamaTokenizer
-colossalai.launch_from_torch(config={})
+colossalai.launch_from_torch()
 model = LlamaForCausalLM.from_pretrained("your_path_to_model")
 tokenizer = LlamaTokenizer.from_pretrained("/home/lczyh/share/models/llama-7b-hf")

@@ -36,7 +36,7 @@ from colossalai.inference.pipeline.policies import LlamaModelInferPolicy
 import colossalai
 from transformers import LlamaForCausalLM, LlamaTokenizer
-colossalai.launch_from_torch(config={})
+colossalai.launch_from_torch()
 model = LlamaForCausalLM.from_pretrained("/path/to/model")
 tokenizer = LlamaTokenizer.from_pretrained("/path/to/model")
@@ -57,27 +57,27 @@ We conducted multiple benchmark tests to evaluate the performance. We compared t
 ### Llama Throughput (tokens/s) | input length=1024, output length=128
 #### A10 7b, fp16
 | batch_size(micro_batch size) | 2(1) | 4(2) | 8(4) | 16(8) | 32(8) | 32(16) |
 |:---:|:---:|:---:|:---:|:---:|:---:|:---:|
 | Pipeline Inference | 40.35 | 77.1 | 139.03 | 232.7 | 257.81 | OOM |
 | Hugging Face | 41.43 | 65.30 | 91.93 | 114.62 | OOM | OOM |
 #### A10 13b, fp16
 | batch_size(micro_batch size) | 2(1) | 4(2) | 8(4) | 16(4) |
 |:---:|:---:|:---:|:---:|:---:|
 | Pipeline Inference | 25.39 | 47.09 | 83.7 | 89.46 |
 | Hugging Face | 23.48 | 37.59 | 53.44 | OOM |
 #### A800 7b, fp16
 | batch_size(micro_batch size) | 2(1) | 4(2) | 8(4) | 16(8) | 32(16) |
 |:---:|:---:|:---:|:---:|:---:|:---:|
 | Pipeline Inference | 57.97 | 110.13 | 213.33 | 389.86 | 670.12 |
 | Hugging Face | 42.44 | 76.5 | 151.97 | 212.88 | 256.13 |
 #### A800 13b, fp16
 | batch_size(micro_batch size) | 2(1) | 4(2) | 8(4) | 16(8) | 32(16) |
 |:---:|:---:|:---:|:---:|:---:|:---:|
 | Pipeline Inference | 41.78 | 94.18 | 172.67 | 310.75 | 470.15 |
 | Hugging Face | 36.57 | 68.4 | 105.81 | 139.51 | 166.34 |

@@ -12,7 +12,7 @@ from colossalai.inference.pipeline.policies import LlamaModelInferPolicy
 GIGABYTE = 1024**3
 MEGABYTE = 1024 * 1024
-colossalai.launch_from_torch(config={})
+colossalai.launch_from_torch()
 def data_gen(batch_size: int = 4, seq_len: int = 512):

@@ -56,7 +56,7 @@ class Worker:
 # initialize a ray collective group, otherwise colossalai distributed env won't be built successfully
 collective.init_collective_group(world_size, rank, "nccl", "default")
 # initialize and set distributed environment
-colossalai.launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
+colossalai.launch(rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
 ray_serve_logger.info(f"Worker with rank {rank} (world size {world_size}) setting up..")
 log_cuda_info("Worker.setup")

@@ -98,7 +98,7 @@ class ColossalInferenceHandler(BaseHandler, ABC):
 self.model.cuda()
 self.model.eval()
-colossalai.launch(config={}, rank=rank, world_size=world_size, host=host, port=port, backend="nccl")
+colossalai.launch(rank=rank, world_size=world_size, host=host, port=port, backend="nccl")
 logger.info("Initializing TPInferEngine ...")
 shard_config = ShardConfig(
     enable_tensor_parallelism=True if self.tp_size > 1 else False, extra_kwargs={"inference_only": True}

@@ -114,7 +114,7 @@ def run_worker(rank, args, master_func):
 port = args.master_port
 backend = "nccl" if device == "cuda" else "gloo"
-launch(dict(), rank, world_size, host, int(port), backend, verbose=False)
+launch(rank, world_size, host, int(port), backend, verbose=False)
 ppg.set_global_info(
     rank=rank,
     world_size=world_size,

@@ -8,7 +8,7 @@ Licensed under the MIT License.
 """
 import torch
-from colossalai.utils import multi_tensor_applier
+from colossalai.utils import get_current_device, multi_tensor_applier
 class FusedAdam(torch.optim.Optimizer):
@@ -75,7 +75,7 @@ class FusedAdam(torch.optim.Optimizer):
 fused_optim = FusedOptimizerLoader().load()
 # Skip buffer
-self._dummy_overflow_buf = torch.cuda.IntTensor([0])
+self._dummy_overflow_buf = torch.tensor([0], dtype=torch.int, device=get_current_device())
 self.multi_tensor_adam = fused_optim.multi_tensor_adam
 else:
 raise RuntimeError("FusedAdam requires cuda extensions")

@@ -3,7 +3,7 @@ from typing import Any, Optional
 import torch
 from colossalai.kernel.kernel_loader import FusedOptimizerLoader
-from colossalai.utils import multi_tensor_applier
+from colossalai.utils import get_current_device, multi_tensor_applier
 from .cpu_adam import CPUAdam
@@ -87,7 +87,7 @@ class HybridAdam(CPUAdam):
 if torch.cuda.is_available():
 fused_optim = FusedOptimizerLoader().load()
 self.gpu_adam_op = fused_optim.multi_tensor_adam
-self._dummy_overflow_buf = torch.cuda.IntTensor([0])
+self._dummy_overflow_buf = torch.tensor([0], dtype=torch.int, device=get_current_device())
 @torch.no_grad()
 def step(self, closure=None, div_scale: float = -1):

@@ -38,7 +38,7 @@ from transformers import BertForMaskedLM
 import colossalai
 # launch colossalai
-colossalai.launch_from_torch(config={})
+colossalai.launch_from_torch()
 # create model
 config = BertConfig.from_pretrained('bert-base-uncased')

@@ -28,7 +28,7 @@ def to_device(x: Any, device: torch.device) -> Any:
 def train(args):
-colossalai.launch_from_torch(config={}, seed=42)
+colossalai.launch_from_torch(seed=42)
 coordinator = DistCoordinator()
 # prepare for data and dataset

@@ -1,6 +1,7 @@
 """
 Shardformer Benchmark
 """
+
 import torch
 import torch.distributed as dist
 import transformers
@@ -84,5 +85,5 @@ def bench_shardformer(BATCH, N_CTX, provider, model_func, dtype=torch.float32, d
 # start benchmark, command:
 # torchrun --standalone --nproc_per_node=2 performance_benchmark.py
 if __name__ == "__main__":
-    colossalai.launch_from_torch({})
+    colossalai.launch_from_torch()
     bench_shardformer.run(save_path=".", print_data=dist.get_rank() == 0)

@@ -26,7 +26,7 @@ class ShardFormer:
 import colossalai
 import torch
-colossalai.launch_from_torch(config={})
+colossalai.launch_from_torch()
 org_model = BertForMaskedLM.from_pretrained('bert-base-uncased')
 shard_config = ShardConfig()

@@ -69,7 +69,7 @@ import colossalai
 from colossalai.device.device_mesh import DeviceMesh
 from colossalai.tensor.d_tensor import DTensor, ShardingSpec
-colossalai.launch_from_torch(config={})
+colossalai.launch_from_torch()
 # define your device mesh
 # assume you have 4 GPUs

@@ -75,7 +75,7 @@ WARMUP_FRACTION = 0.1
 we create a distributed environment.
 ```python
 # Launch ColossalAI
-colossalai.launch_from_torch(config={}, seed=42)
+colossalai.launch_from_torch(seed=42)
 coordinator = DistCoordinator()
 ```
 prepare the dataset. You can use `plugin.prepare_dataloader` to generate a dataloader or customize your own dataloader.

@@ -71,7 +71,7 @@ PP_SIZE = 2
 Create a distributed environment.
 ```python
 # Launch ColossalAI
-colossalai.launch_from_torch(config={}, seed=SEED)
+colossalai.launch_from_torch(seed=SEED)
 coordinator = DistCoordinator()
 world_size = coordinator.world_size
 ```

@@ -55,7 +55,7 @@ from colossalai.booster.plugin import TorchDDPPlugin
 def train():
 # launch colossalai
-colossalai.launch(config=dict(), rank=rank, world_size=world_size, port=port, host='localhost')
+colossalai.launch(rank=rank, world_size=world_size, port=port, host='localhost')
 # create plugin and objects for training
 plugin = TorchDDPPlugin()

@@ -87,8 +87,7 @@ import colossalai
 args = colossalai.get_default_parser().parse_args()
 # launch distributed environment
-colossalai.launch(config=args.config,
-                  rank=args.rank,
+colossalai.launch(rank=args.rank,
                   world_size=args.world_size,
                   host=args.host,
                   port=args.port,
@@ -106,20 +105,11 @@ First, we need to set the launch method in our code. As this is a wrapper of the
 use `colossalai.launch_from_torch`. The arguments required for distributed environment such as rank, world size, host and port are all set by the PyTorch
 launcher and can be read from the environment variable directly.
-config.py
-```python
-BATCH_SIZE = 512
-LEARNING_RATE = 3e-3
-WEIGHT_DECAY = 0.3
-NUM_EPOCHS = 2
-```
 train.py
 ```python
 import colossalai
-colossalai.launch_from_torch(
-    config="./config.py",
-)
+colossalai.launch_from_torch()
 ...
 ```
@@ -203,7 +193,6 @@ Do this in your training script:
 import colossalai
 colossalai.launch_from_slurm(
-    config=<CONFIG>,
     host=args.host,
     port=args.port
 )
@@ -224,7 +213,6 @@ use them to start the distributed backend.
 Do this in your train.py:
 ```python
 colossalai.launch_from_openmpi(
-    config=<CONFIG>,
     host=args.host,
     port=args.port
 )
@@ -238,3 +226,5 @@ mpirun --hostfile <my_hostfile> -np <num_process> python train.py --host <node n
 - --hostfile: use this option to specify a list of hosts on which to run
 - --np: set the number of processes (GPUs) to launch in total. For example, if --np 4, 4 python processes will be initialized to run train.py.
+
+<!-- doc-test-command: echo -->

@@ -45,7 +45,7 @@ We then need to initialize distributed environment. For demo purpose, we uses `l
 parser = colossalai.get_default_parser()
 args = parser.parse_args()
 # launch from torch
-colossalai.launch_from_torch(config=dict())
+colossalai.launch_from_torch()
 ```
 ### Step 3. Create training components

@@ -61,7 +61,7 @@ We then need to initialize distributed environment. For demo purpose, we uses `l
 for other initialization methods.
 ```python
-colossalai.launch_from_torch(config=dict())
+colossalai.launch_from_torch()
 logger = get_dist_logger()
 ```

@@ -29,7 +29,7 @@ from colossalai.booster.plugin import GeminiPlugin
 from transformers import LlamaForCausalLM, LlamaConfig, BertForPreTraining
-colossalai.launch({})
+colossalai.launch()
 plugin = GeminiPlugin()
 booster = Booster(plugin)

@@ -20,10 +20,10 @@ In Colossal-AI, we have incorporated different implementations of mixed precisio
 3. naive amp
 | Colossal-AI | support tensor parallel | support pipeline parallel | fp16 extent |
 | --- | --- | --- | --- |
 | AMP_TYPE.TORCH | ✅ | ❌ | Model parameters, activation, gradients are downcast to fp16 during forward and backward propagation |
 | AMP_TYPE.APEX | ❌ | ❌ | More fine-grained, we can choose opt_level O0, O1, O2, O3 |
 | AMP_TYPE.NAIVE | ✅ | ✅ | Model parameters, forward and backward operations are all downcast to fp16 |
 The first two rely on the original implementation of PyTorch (version 1.6 and above) and NVIDIA Apex.
 The last method is similar to Apex O2 level.
@@ -164,7 +164,7 @@ parser = colossalai.get_default_parser()
 args = parser.parse_args()
 # launch from torch
-colossalai.launch_from_torch(config=dict())
+colossalai.launch_from_torch()
 ```

@@ -185,7 +185,7 @@ Then we can train GPT model with Gemini. The placement policy of Gemini should b
 ```python
 def train_gemini_cpu(nvme_offload_fraction: float = 0.0):
-colossalai.launch_from_torch({})
+colossalai.launch_from_torch()
 config = GPT2Config()
 with ColoInitContext(device=torch.cuda.current_device()):
 model = GPT2LMHeadModel(config)

@@ -174,7 +174,7 @@ def main():
 SEQ_LEN = 1024
 VOCAB_SIZE = 50257
 NUM_STEPS = 10
-colossalai.launch_from_torch(config={})
+colossalai.launch_from_torch()
 # build criterion
 criterion = GPTLMLoss()

@@ -62,7 +62,7 @@ plugin = HybridParallelPlugin(
 ## Create the distributed environment
 ```python
 # Launch ColossalAI
-colossalai.launch_from_torch(config={}, seed=42)
+colossalai.launch_from_torch(seed=42)
 coordinator = DistCoordinator()
 ```
 ## Define the training components for the GPT-2 model

@@ -70,7 +70,7 @@ PP_SIZE = 2
 First, we create a distributed environment
 ```python
 # Launch ColossalAI
-colossalai.launch_from_torch(config={}, seed=SEED)
+colossalai.launch_from_torch(seed=SEED)
 coordinator = DistCoordinator()
 world_size = coordinator.world_size
 ```

@@ -60,7 +60,7 @@ from colossalai.booster.plugin import TorchDDPPlugin
 def train():
 # launch colossalai
-colossalai.launch(config=dict(), rank=rank, world_size=world_size, port=port, host='localhost')
+colossalai.launch(rank=rank, world_size=world_size, port=port, host='localhost')
 # create plugin and objects for training
 plugin = TorchDDPPlugin()

@@ -74,8 +74,7 @@ import colossalai
 args = colossalai.get_default_parser().parse_args()
 # launch distributed environment
-colossalai.launch(config=args.config,
-                  rank=args.rank,
+colossalai.launch(rank=args.rank,
                   world_size=args.world_size,
                   host=args.host,
                   port=args.port,
@@ -93,20 +92,11 @@ PyTorch's built-in launcher requires the command to be run on every node in order to start multi
 First, we need to specify the launch method in our code. Since this launcher is a wrapper around the PyTorch launcher, we naturally use `colossalai.launch_from_torch`.
 The parameters required for the distributed environment, such as rank, world size, host and port, are all set by the PyTorch launcher and can be read directly from the environment variables.
-config.py
-```python
-BATCH_SIZE = 512
-LEARNING_RATE = 3e-3
-WEIGHT_DECAY = 0.3
-NUM_EPOCHS = 2
-```
 train.py
 ```python
 import colossalai
-colossalai.launch_from_torch(
-    config="./config.py",
-)
+colossalai.launch_from_torch()
 ...
 ```
@@ -186,7 +176,6 @@ colossalai run --nproc_per_node 4 --hostfile ./hostfile --master_addr host1 --e
 import colossalai
 colossalai.launch_from_slurm(
-    config=<CONFIG>,
     host=args.host,
     port=args.port
 )
@@ -206,7 +195,6 @@ srun python train.py --host <master_node> --port 29500
 You can try the following in your training script.
 ```python
 colossalai.launch_from_openmpi(
-    config=<CONFIG>,
     host=args.host,
     port=args.port
 )
@@ -219,3 +207,5 @@ mpirun --hostfile <my_hostfile> -np <num_process> python train.py --host <node n
 - --hostfile: specify a list of hosts on which to run
 - --np: set the total number of processes (GPUs) to launch. For example, with --np 4, four Python processes will be initialized to run train.py.
+
+<!-- doc-test-command: echo -->

@@ -46,7 +46,7 @@ parser = colossalai.get_default_parser()
 args = parser.parse_args()
 # launch from torch
-colossalai.launch_from_torch(config=dict())
+colossalai.launch_from_torch()
 ```

@@ -61,7 +61,7 @@ from colossalai.nn.lr_scheduler import CosineAnnealingLR
 We need to initialize the distributed environment. For a quick demo we use `launch_from_torch`. You can refer to [Launch Colossal-AI](../basics/launch_colossalai.md)
 ```python
-colossalai.launch_from_torch(config=dict())
+colossalai.launch_from_torch()
 logger = get_dist_logger()
 ```

@@ -29,7 +29,7 @@ from colossalai.booster.plugin import GeminiPlugin
 from transformers import LlamaForCausalLM, LlamaConfig, BertForPreTraining
-colossalai.launch({})
+colossalai.launch()
 plugin = GeminiPlugin()
 booster = Booster(plugin)

@@ -19,11 +19,11 @@ AMP stands for automatic mixed precision training.
 2. apex.amp
 3. naive amp
 | Colossal-AI | supports tensor parallelism | supports pipeline parallelism | fp16 scope |
 | --- | --- | --- | --- |
 | AMP_TYPE.TORCH | ✅ | ❌ | Model parameters, activations and gradients are downcast to fp16 during forward and backward propagation |
 | AMP_TYPE.APEX | ❌ | ❌ | More fine-grained; we can choose opt_level O0, O1, O2, O3 |
 | AMP_TYPE.NAIVE | ✅ | ✅ | Model parameters and the forward and backward operations are all downcast to fp16 |
 The first two rely on the original implementations in PyTorch (1.6 and above) and NVIDIA Apex. The last method is similar to Apex O2. Among these methods, Apex-AMP is incompatible with tensor parallelism, because tensors are split across devices in tensor parallelism, so communication between processes is required to check whether inf or nan appears in the weights of the whole model. We modified the torch amp implementation so that it is now compatible with tensor parallelism.
@@ -153,7 +153,7 @@ parser = colossalai.get_default_parser()
 args = parser.parse_args()
 # launch from torch
-colossalai.launch_from_torch(config=dict())
+colossalai.launch_from_torch()
 ```

@@ -175,7 +175,7 @@ Mem usage: 4968.016 MB
 ```python
 def train_gemini_cpu(nvme_offload_fraction: float = 0.0):
-colossalai.launch_from_torch({})
+colossalai.launch_from_torch()
 config = GPT2Config()
 with ColoInitContext(device=torch.cuda.current_device()):
 model = GPT2LMHeadModel(config)

@@ -174,7 +174,7 @@ def main():
 SEQ_LEN = 1024
 VOCAB_SIZE = 50257
 NUM_STEPS = 10
-colossalai.launch_from_torch(config={})
+colossalai.launch_from_torch()
 # build criterion
 criterion = GPTLMLoss()

@@ -35,12 +35,12 @@ def main():
 if args.vscode_debug:
 colossalai.launch(
-    config={}, rank=args.rank, world_size=args.world_size, host=args.host, port=args.port, backend=args.backend
+    rank=args.rank, world_size=args.world_size, host=args.host, port=args.port, backend=args.backend
 )
 args.local_rank = -1
 args.log_interval = 1
 else:
-colossalai.launch_from_torch(config={})  # args.colossal_config
+colossalai.launch_from_torch()  # args.colossal_config
 args.local_rank = int(os.environ["LOCAL_RANK"])
 logger.info(
     f"launch_from_torch, world size: {torch.distributed.get_world_size()} | "

@@ -9,7 +9,7 @@ from colossalai.zero import ColoInitContext
 path = "/data/scratch/diffuser/stable-diffusion-v1-4"
-colossalai.launch_from_torch(config={})
+colossalai.launch_from_torch()
 with ColoInitContext(device="cpu"):
 vae = AutoencoderKL.from_pretrained(
     path,

@@ -372,9 +372,9 @@ def get_full_repo_name(model_id: str, organization: Optional[str] = None, token:
 def main(args):
 if args.seed is None:
-colossalai.launch_from_torch(config={})
+colossalai.launch_from_torch()
 else:
-colossalai.launch_from_torch(config={}, seed=args.seed)
+colossalai.launch_from_torch(seed=args.seed)
 local_rank = dist.get_rank()
 world_size = dist.get_world_size()

@@ -371,9 +371,9 @@ def get_full_repo_name(model_id: str, organization: Optional[str] = None, token:
 def main(args):
 if args.seed is None:
-colossalai.launch_from_torch(config={})
+colossalai.launch_from_torch()
 else:
-colossalai.launch_from_torch(config={}, seed=args.seed)
+colossalai.launch_from_torch(seed=args.seed)
 local_rank = gpc.get_local_rank(ParallelMode.DATA)
 world_size = gpc.get_world_size(ParallelMode.DATA)

@@ -128,7 +128,7 @@ def main():
 # ==============================
 # Launch Distributed Environment
 # ==============================
-colossalai.launch_from_torch(config={})
+colossalai.launch_from_torch()
 coordinator = DistCoordinator()
 # update the learning rate with linear scaling

@@ -46,7 +46,7 @@ def main():
 args = parse_benchmark_args()
 # Launch ColossalAI
-colossalai.launch_from_torch(config={}, seed=args.seed)
+colossalai.launch_from_torch(seed=args.seed)
 coordinator = DistCoordinator()
 world_size = coordinator.world_size

@@ -137,7 +137,7 @@ def main():
 args = parse_demo_args()
 # Launch ColossalAI
-colossalai.launch_from_torch(config={}, seed=args.seed)
+colossalai.launch_from_torch(seed=args.seed)
 coordinator = DistCoordinator()
 world_size = coordinator.world_size

@@ -136,7 +136,7 @@ def benchmark_inference(args):
 def hybrid_inference(rank, world_size, port, args):
-colossalai.launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
+colossalai.launch(rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
 benchmark_inference(args)

@@ -68,7 +68,7 @@ def run_inference(args):
 def run_tp_pipeline_inference(rank, world_size, port, args):
-colossalai.launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
+colossalai.launch(rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
 run_inference(args)

@@ -81,7 +81,7 @@ def main():
 # ==============================
 # Launch Distributed Environment
 # ==============================
-colossalai.launch_from_torch(config={}, seed=42)
+colossalai.launch_from_torch(seed=42)
 coordinator = DistCoordinator()
 # local_batch_size = BATCH_SIZE // coordinator.world_size

@@ -202,7 +202,7 @@ def main():
 # ==============================
 # Launch Distributed Environment
 # ==============================
-colossalai.launch_from_torch(config={}, seed=42)
+colossalai.launch_from_torch(seed=42)
 coordinator = DistCoordinator()
 lr = LEARNING_RATE * coordinator.world_size

@@ -94,8 +94,7 @@ def train_gpt(args):
 def run(rank, world_size, port, args):
-config = {}
-colossalai.launch(config=config, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
+colossalai.launch(rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
 train_gpt(args)

@@ -47,7 +47,7 @@ def get_data(batch_size, seq_len, vocab_size):
 def main():
 disable_existing_loggers()
-launch_from_torch(config={})
+launch_from_torch()
 logger = get_dist_logger()
 config = transformers.GPT2Config(n_position=SEQ_LENGTH, n_layer=NUM_LAYERS, n_head=NUM_HEADS, n_embd=HIDDEN_DIM)
 if FP16:

@@ -132,7 +132,7 @@ def main():
 PROF_FLAG = False  # The flag of profiling, False by default
 disable_existing_loggers()
-colossalai.launch_from_torch(config={})
+colossalai.launch_from_torch()
 logger = get_dist_logger()
 logger.info(f"{args.model_type}, {args.distplan}, batch size {BATCH_SIZE}", ranks=[0])

@@ -67,7 +67,7 @@ def main():
 parser.add_argument("--cpu_offload", action="store_true", help="Use gradient checkpointing")
 args = parser.parse_args()
-colossalai.launch_from_torch({})
+colossalai.launch_from_torch()
 coordinator = DistCoordinator()
 def empty_init():

@@ -196,7 +196,7 @@ def main():
 # ==============================
 # Launch Distributed Environment
 # ==============================
-colossalai.launch_from_torch(config={}, seed=42)
+colossalai.launch_from_torch(seed=42)
 coordinator = DistCoordinator()
 # local_batch_size = BATCH_SIZE // coordinator.world_size

@@ -36,9 +36,9 @@ def main():
 args = parser.parse_args()
 disable_existing_loggers()
 if args.from_torch:
-colossalai.launch_from_torch(config=args.config)
+colossalai.launch_from_torch()
 else:
-colossalai.launch_from_slurm(config=args.config, host=args.host, port=29500, seed=42)
+colossalai.launch_from_slurm(host=args.host, port=29500, seed=42)
 logger = get_dist_logger()
 data_path = None if args.use_dummy_dataset else os.environ["DATA"]

@@ -16,7 +16,7 @@ if __name__ == "__main__":
 parser = get_default_parser()
 args = parser.parse_args()
 start = time.time()
-colossalai.launch_from_torch({})
+colossalai.launch_from_torch()
 coordinator = DistCoordinator()
 plugin = HybridParallelPlugin(
     tp_size=coordinator.world_size,

@@ -78,7 +78,7 @@ def main():
 parser.add_argument("--custom-ckpt", action="store_true", help="Customize checkpoint", default=False)
 args = parser.parse_args()
-colossalai.launch_from_torch({})
+colossalai.launch_from_torch()
 coordinator = DistCoordinator()
 def empty_init():

@@ -146,7 +146,7 @@ def main():
 args = parse_args()
 # Launch ColossalAI
-colossalai.launch_from_torch(config={}, seed=args.seed)
+colossalai.launch_from_torch(seed=args.seed)
 coordinator = DistCoordinator()
 # Set plugin

@@ -207,7 +207,7 @@ def main():
 args = parse_args()
 # Launch ColossalAI
-colossalai.launch_from_torch(config={}, seed=args.seed)
+colossalai.launch_from_torch(seed=args.seed)
 coordinator = DistCoordinator()
 test_mode = args.model_name == "test"

@@ -46,7 +46,7 @@ def main():
 args = parse_benchmark_args()
 # Launch ColossalAI
-colossalai.launch_from_torch(config={}, seed=args.seed)
+colossalai.launch_from_torch(seed=args.seed)
 coordinator = DistCoordinator()
 world_size = coordinator.world_size

@@ -64,7 +64,7 @@ def main():
 args = parse_demo_args()
 # Launch ColossalAI
-colossalai.launch_from_torch(config={}, seed=args.seed)
+colossalai.launch_from_torch(seed=args.seed)
 coordinator = DistCoordinator()
 world_size = coordinator.world_size

@@ -102,7 +102,7 @@ args = parse_args()
 if args.distplan not in ["colossalai", "pytorch"]:
 raise TypeError(f"{args.distplan} is error")
 disable_existing_loggers()
-colossalai.launch_from_torch(config={})
+colossalai.launch_from_torch()
 logger = get_dist_logger()

@@ -20,7 +20,7 @@ def _benchmark(rank, world_size, port):
 only result in minor performance drop. So at last we might be able to find better training batch size for our
 model (combine with large batch training optimizer such as LAMB).
 """
-colossalai.launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
+colossalai.launch(rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
 model = tm.resnet152()
 gm = symbolic_trace(model)
 raw_graph = deepcopy(gm.graph)

@@ -17,7 +17,7 @@ def _benchmark(rank, world_size, port, args):
 The benchmark will sample in a range of memory budget for each model and output the benchmark summary and
 data visualization of peak memory vs. budget memory and relative step time vs. peak memory.
 """
-colossalai.launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
+colossalai.launch(rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
 if args.model == "resnet50":
 model = tm.resnet50()
 data_gen = partial(data_gen_resnet, batch_size=128, shape=(3, 224, 224))

@@ -128,7 +128,7 @@ def main():
 # ==============================
 # Launch Distributed Environment
 # ==============================
-colossalai.launch_from_torch(config={})
+colossalai.launch_from_torch()
 coordinator = DistCoordinator()
 # update the learning rate with linear scaling

@@ -148,7 +148,7 @@ def main():
 # ==============================
 # Launch Distributed Environment
 # ==============================
-colossalai.launch_from_torch(config={})
+colossalai.launch_from_torch()
 coordinator = DistCoordinator()
 # update the learning rate with linear scaling

@@ -125,7 +125,7 @@ def main():
 # ==============================
 # Launch Distributed Environment
 # ==============================
-colossalai.launch_from_torch(config={}, seed=42)
+colossalai.launch_from_torch(seed=42)
 coordinator = DistCoordinator()
 # local_batch_size = BATCH_SIZE // coordinator.world_size

@@ -289,7 +289,7 @@ class DummyDataloader:
 def main():
 args = parse_args()
 disable_existing_loggers()
-colossalai.legacy.launch_from_torch(config=dict())
+colossalai.legacy.launch_from_torch()
 logger = get_dist_logger()
 is_main_process = dist.get_rank() == 0

@@ -27,7 +27,7 @@ except:
 def _run_C_solver_consistency_test(rank, world_size, port):
-colossalai.launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
+colossalai.launch(rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
 for M, mem_budget in [(tm.resnet50, 4000), (tm.densenet121, 8080)]:
 model = M()

@@ -75,7 +75,7 @@ def check_backward_consistency(
 def _run_ckpt_solver(rank, world_size, port):
-colossalai.launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
+colossalai.launch(rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
 MODEL_LIST = [tm.densenet121]
 torch.backends.cudnn.deterministic = True
@@ -111,7 +111,7 @@ def test_ckpt_solver():
 def _run_ckpt_solver_torch11(rank, world_size, port):
-colossalai.launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
+colossalai.launch(rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
 MODEL_LIST = [tm.densenet121]
 torch.backends.cudnn.deterministic = True

@@ -141,8 +141,7 @@ def exam_fwd_bwd(model_name: str, memory_budget: float, solver_name: str):
 def run_dist(rank, world_size, port):
-config = {}
-colossalai.launch(config=config, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
+colossalai.launch(rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
 exam_fwd_bwd()

View File

@ -42,7 +42,7 @@ class ConvModel(torch.nn.Module):
def check_linear_module(rank, world_size, port): def check_linear_module(rank, world_size, port):
disable_existing_loggers() disable_existing_loggers()
launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl") launch(rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
model = LinearModel(4, 8).cuda() model = LinearModel(4, 8).cuda()
input = torch.rand(4, 4).cuda() input = torch.rand(4, 4).cuda()
output_compare = model(input) output_compare = model(input)
@ -59,7 +59,7 @@ def check_linear_module(rank, world_size, port):
def check_conv_module(rank, world_size, port): def check_conv_module(rank, world_size, port):
disable_existing_loggers() disable_existing_loggers()
launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl") launch(rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
model = ConvModel(3, 6, 2).cuda() model = ConvModel(3, 6, 2).cuda()
input = torch.rand(4, 3, 64, 64).cuda() input = torch.rand(4, 3, 64, 64).cuda()
output_compare = model(input) output_compare = model(input)

View File

@ -39,7 +39,7 @@ class GPT2MLPWithCkpt(nn.Module):
def check_act_ckpt(rank, world_size, port): def check_act_ckpt(rank, world_size, port):
disable_existing_loggers() disable_existing_loggers()
launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl") launch(rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
model = GPT2MLPWithCkpt(intermediate_size=4 * HIDDEN_SIZE, hidden_size=HIDDEN_SIZE) model = GPT2MLPWithCkpt(intermediate_size=4 * HIDDEN_SIZE, hidden_size=HIDDEN_SIZE)
torch.rand(1, 64, HIDDEN_SIZE) torch.rand(1, 64, HIDDEN_SIZE)
input_sample = { input_sample = {

View File

@ -32,7 +32,7 @@ class MLP(torch.nn.Module):
def check_compatibility_with_ddp(rank, world_size, port): def check_compatibility_with_ddp(rank, world_size, port):
disable_existing_loggers() disable_existing_loggers()
launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl") launch(rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
model = MLP(4).cuda() model = MLP(4).cuda()
if rank in [0, 1]: if rank in [0, 1]:
input = torch.arange(0, 16, dtype=torch.float).reshape(4, 4).cuda() input = torch.arange(0, 16, dtype=torch.float).reshape(4, 4).cuda()

View File

@ -34,7 +34,7 @@ class MLP(torch.nn.Module):
def check_auto_parallel_with_gemini(rank, world_size, port): def check_auto_parallel_with_gemini(rank, world_size, port):
disable_existing_loggers() disable_existing_loggers()
launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl") launch(rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
model = MLP(4).half().cuda() model = MLP(4).half().cuda()
if rank in [0, 1]: if rank in [0, 1]:
input = torch.arange(0, 16).reshape(4, 4).half().cuda() input = torch.arange(0, 16).reshape(4, 4).half().cuda()

View File

@ -73,7 +73,7 @@ def _check_module_grad(
def check_attention_layer(rank, model_cls, world_size, port): def check_attention_layer(rank, model_cls, world_size, port):
disable_existing_loggers() disable_existing_loggers()
launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl") launch(rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
config = transformers.GPT2Config(n_position=64, n_layer=2, n_head=16, n_embd=HIDDEN_DIM) config = transformers.GPT2Config(n_position=64, n_layer=2, n_head=16, n_embd=HIDDEN_DIM)

View File

@ -31,7 +31,7 @@ def _binary_elementwise_mem_test(rank, world_size, port):
port: port for initializing process group port: port for initializing process group
""" """
disable_existing_loggers() disable_existing_loggers()
launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl") launch(rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
model = BinaryElementwiseOpModule(token=torch.add, shape=1024).cuda() model = BinaryElementwiseOpModule(token=torch.add, shape=1024).cuda()
input = torch.rand(32, 1024).cuda() input = torch.rand(32, 1024).cuda()
input.requires_grad = True input.requires_grad = True

View File

@ -31,7 +31,7 @@ def _conv_module_mem_test(rank, world_size, port, bias):
port: port for initializing process group port: port for initializing process group
""" """
disable_existing_loggers() disable_existing_loggers()
launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl") launch(rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
model = nn.Sequential(nn.Conv2d(4, 64, 3, padding=1, bias=bias)).cuda() model = nn.Sequential(nn.Conv2d(4, 64, 3, padding=1, bias=bias)).cuda()
input = torch.rand(4, 4, 64, 64).cuda() input = torch.rand(4, 4, 64, 64).cuda()
input.requires_grad = True input.requires_grad = True
@ -72,7 +72,7 @@ def _conv_function_mem_test(rank, world_size, port):
port: port for initializing process group port: port for initializing process group
""" """
disable_existing_loggers() disable_existing_loggers()
launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl") launch(rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
model = ConvFunctionModule().cuda() model = ConvFunctionModule().cuda()
input = torch.rand(4, 4, 64, 64).cuda() input = torch.rand(4, 4, 64, 64).cuda()
input.requires_grad = True input.requires_grad = True

View File

@ -30,7 +30,7 @@ def _linear_module_mem_test(rank, world_size, port):
port: port for initializing process group port: port for initializing process group
""" """
disable_existing_loggers() disable_existing_loggers()
launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl") launch(rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
model = nn.Sequential(nn.Linear(64, 128, bias=False)).cuda() model = nn.Sequential(nn.Linear(64, 128, bias=False)).cuda()
input = torch.rand(8, 8, 16, 64).cuda() input = torch.rand(8, 8, 16, 64).cuda()
input.requires_grad = True input.requires_grad = True
@ -68,7 +68,7 @@ def _linear_function_mem_test(rank, world_size, port):
port: port for initializing process group port: port for initializing process group
""" """
disable_existing_loggers() disable_existing_loggers()
launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl") launch(rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
model = MyModule().cuda() model = MyModule().cuda()
input = torch.rand(8, 8, 16, 64).cuda() input = torch.rand(8, 8, 16, 64).cuda()
input.requires_grad = True input.requires_grad = True

View File

@ -25,7 +25,7 @@ def _batchnorm_module_mem_test(rank, world_size, port):
port: port for initializing process group port: port for initializing process group
""" """
disable_existing_loggers() disable_existing_loggers()
launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl") launch(rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
model = nn.Sequential(nn.BatchNorm2d(128)).cuda() model = nn.Sequential(nn.BatchNorm2d(128)).cuda()
input = torch.rand(4, 128, 64, 64).cuda() input = torch.rand(4, 128, 64, 64).cuda()
input.requires_grad = True input.requires_grad = True

View File

@ -21,7 +21,7 @@ def _adaptiveavgpool_module_mem_test(rank, world_size, port):
port: port for initializing process group port: port for initializing process group
""" """
disable_existing_loggers() disable_existing_loggers()
launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl") launch(rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
model = nn.Sequential(nn.AdaptiveAvgPool2d((16, 16))).cuda() model = nn.Sequential(nn.AdaptiveAvgPool2d((16, 16))).cuda()
input = torch.rand(4, 128, 64, 64).cuda() input = torch.rand(4, 128, 64, 64).cuda()
input.requires_grad = True input.requires_grad = True
@ -62,7 +62,7 @@ def _maxpool_module_mem_test(rank, world_size, port):
port: port for initializing process group port: port for initializing process group
""" """
disable_existing_loggers() disable_existing_loggers()
launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl") launch(rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
model = nn.Sequential(nn.MaxPool2d((16, 16))).cuda() model = nn.Sequential(nn.MaxPool2d((16, 16))).cuda()
input = torch.rand(4, 128, 64, 64).cuda() input = torch.rand(4, 128, 64, 64).cuda()
input.requires_grad = True input.requires_grad = True

View File

@ -40,7 +40,7 @@ class AddBMMTorchFunctionModule(nn.Module):
def check_2d_device_mesh(rank, world_size, port, module, bias_shape, using_kwargs): def check_2d_device_mesh(rank, world_size, port, module, bias_shape, using_kwargs):
disable_existing_loggers() disable_existing_loggers()
launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl") launch(rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
model = module(using_kwargs).cuda() model = module(using_kwargs).cuda()
physical_mesh_id = torch.arange(0, 4) physical_mesh_id = torch.arange(0, 4)
mesh_shape = (2, 2) mesh_shape = (2, 2)
@ -150,7 +150,7 @@ def check_2d_device_mesh(rank, world_size, port, module, bias_shape, using_kwarg
def check_1d_device_mesh(rank, module, bias_shape, using_kwargs, world_size, port): def check_1d_device_mesh(rank, module, bias_shape, using_kwargs, world_size, port):
disable_existing_loggers() disable_existing_loggers()
launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl") launch(rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
physical_mesh_id = torch.arange(0, 4) physical_mesh_id = torch.arange(0, 4)
mesh_shape = (1, 4) mesh_shape = (1, 4)
device_mesh = DeviceMesh(physical_mesh_id, mesh_shape, init_process_group=True) device_mesh = DeviceMesh(physical_mesh_id, mesh_shape, init_process_group=True)

View File

@ -40,7 +40,7 @@ class AddmmModel_with_param(nn.Module):
def check_addmm_function_handler(rank, world_size, port, input_shape, model_cls): def check_addmm_function_handler(rank, world_size, port, input_shape, model_cls):
disable_existing_loggers() disable_existing_loggers()
launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl") launch(rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
if model_cls == AddmmModel: if model_cls == AddmmModel:
model = AddmmModel().cuda() model = AddmmModel().cuda()
else: else:

View File

@ -16,7 +16,7 @@ from tests.test_auto_parallel.test_tensor_shard.test_node_handler.utils import n
def check_bn_module_handler(rank, world_size, port): def check_bn_module_handler(rank, world_size, port):
disable_existing_loggers() disable_existing_loggers()
launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl") launch(rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
model = nn.Sequential(nn.BatchNorm2d(16)).cuda() model = nn.Sequential(nn.BatchNorm2d(16)).cuda()
physical_mesh_id = torch.arange(0, 4) physical_mesh_id = torch.arange(0, 4)

View File

@ -34,7 +34,7 @@ class LinearModule(torch.nn.Module):
def check_linear_module_handler(rank, world_size, port): def check_linear_module_handler(rank, world_size, port):
disable_existing_loggers() disable_existing_loggers()
launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl") launch(rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
model = LinearModule(weight_shape=WEIGHT_SHAPE).cuda() model = LinearModule(weight_shape=WEIGHT_SHAPE).cuda()
physical_mesh_id = torch.arange(0, 4) physical_mesh_id = torch.arange(0, 4)

View File

@ -30,7 +30,7 @@ class LinearModule(torch.nn.Module):
def check_linear_module_handler(rank, world_size, port, bias): def check_linear_module_handler(rank, world_size, port, bias):
disable_existing_loggers() disable_existing_loggers()
launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl") launch(rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
model = LinearModule(16, 32, bias=bias).cuda() model = LinearModule(16, 32, bias=bias).cuda()
physical_mesh_id = torch.arange(0, 4) physical_mesh_id = torch.arange(0, 4)

Some files were not shown because too many files have changed in this diff.