added CI for unit testing (#69)

pull/73/head
Frank Lee 2021-12-16 10:32:08 +08:00 committed by GitHub
parent 45355a62f7
commit cd9c28e055
68 changed files with 1089 additions and 766 deletions

.github/workflows/build.yml (new file)

@@ -0,0 +1,40 @@
name: Build

on:
  pull_request:
    types: [review_requested]
    branches:
      - "*"

jobs:
  build:
    name: Build and test Colossal-AI
    runs-on: [self-hosted, gpu]
    container:
      image: nvcr.io/nvidia/pytorch:21.07-py3
      options: --gpus all --rm --ipc=host -v /data/cifar-10:/data/cifar-10
    timeout-minutes: 1200
    if: github.event.pull_request.draft == false && github.base_ref == 'main' && github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI'
    steps:
      - name: Setup Environment
        run: |
          export https_proxy=http://172.17.0.1:7890 http_proxy=http://172.17.0.1:7890 all_proxy=socks5://172.17.0.1:7890
      - name: Install dependencies
        run: |
          python3 -m pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
          python3 -m pip install -U pip setuptools wheel --user
          pip install pytest tensorboard deepspeed apex
      - uses: actions/checkout@v2
      - name: Install Colossal-AI
        run: |
          pip install -v --no-cache-dir --global-option="--cuda_ext" .
      - name: Unit Testing
        run: |
          pytest tests
        env:
          DATA: /data/cifar-10
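Note: the Unit Testing step above exports DATA=/data/cifar-10, which the tests use to locate the CIFAR-10 dataset. A minimal sketch of how a test can consume that variable (the helper name and transform here are illustrative, not taken from the repository):

# Illustrative sketch: read the dataset root from the DATA env var set by the CI workflow.
import os
from pathlib import Path

from torchvision import transforms
from torchvision.datasets import CIFAR10

def build_cifar10(train: bool = True) -> CIFAR10:
    # DATA points at /data/cifar-10 inside the CI container
    root = Path(os.environ['DATA'])
    return CIFAR10(root=root, train=train, download=True,
                   transform=transforms.ToTensor())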

@@ -5,7 +5,6 @@ import os
 import os.path as osp
 import torch
-from torch.utils.tensorboard import SummaryWriter
 from typing import List
 from decimal import Decimal
 from colossalai.context import ParallelMode
@@ -100,6 +99,7 @@ class TensorboardHook(BaseHook):
                  priority: int = 10,
                  ) -> None:
         super().__init__(priority=priority)
+        from torch.utils.tensorboard import SummaryWriter
         # create log dir
         if not gpc.is_initialized(ParallelMode.GLOBAL) or gpc.get_global_rank() == 0:

@@ -1,15 +1,15 @@
 #!/usr/bin/env python
 # -*- encoding: utf-8 -*-
+from functools import partial
+from pathlib import Path
 import pytest
+import torch
 import torch.multiprocessing as mp
 from colossalai import launch
 from colossalai.context.parallel_mode import ParallelMode
 from colossalai.core import global_context as gpc
-from functools import partial
-from pathlib import Path

 CONFIG_PATH = Path(__file__).parent.joinpath('configs/parallel_2d_init.py').absolute()
@@ -75,6 +75,7 @@ def init_2d(rank, world_size, backend, port, host):
     check_2d_parallel_rank(rank)
     check_pipeline_parallel_rank(rank)
     gpc.destroy()
+    torch.cuda.empty_cache()

 @pytest.mark.cpu
@@ -86,7 +87,7 @@ def test_2d_init():
     test_fn = partial(init_2d,
                       world_size=world_size,
                       backend='gloo',
-                      port='29500',
+                      port='29900',
                       host='localhost'
                       )
     mp.spawn(test_fn, nprocs=world_size)
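Note: the recurring change in this commit is that each distributed test now spawns its own worker processes with torch.multiprocessing instead of relying on an external launcher script. A minimal sketch of that pattern (the worker body, port, and world size here are illustrative, not the repository's):

# Illustrative sketch of the spawn pattern these tests adopt.
from functools import partial
import torch.multiprocessing as mp

def worker(rank, world_size, port):
    # each spawned process would call colossalai.launch(rank=rank, world_size=world_size,
    # host='localhost', port=port, backend='nccl' or 'gloo') and then run its checks
    ...

def test_spawn_pattern():
    world_size = 4
    run_func = partial(worker, world_size=world_size, port=29500)
    # mp.spawn passes the process index as the first positional argument (rank)
    mp.spawn(run_func, nprocs=world_size)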

@@ -5,6 +5,7 @@ from functools import partial
 from pathlib import Path
 import pytest
+import torch
 import torch.multiprocessing as mp
 from colossalai.context.parallel_mode import ParallelMode
@@ -98,6 +99,7 @@ def init_2halfd(rank, world_size, backend, port, host):
     check_tensor_parallel_rank(rank)
     check_2p5d_parallel_rank(rank)
     gpc.destroy()
+    torch.cuda.empty_cache()

 @pytest.mark.cpu
@@ -109,7 +111,7 @@ def test_2halfd_init():
     test_fn = partial(init_2halfd,
                       world_size=world_size,
                       backend='gloo',
-                      port='29501',
+                      port='29901',
                       host='localhost'
                       )
     mp.spawn(test_fn, nprocs=world_size)

@@ -5,8 +5,10 @@ from functools import partial
 from pathlib import Path
 import pytest
+import torch
 import torch.multiprocessing as mp
 from colossalai.context.parallel_mode import ParallelMode
 from colossalai.core import global_context as gpc
 from colossalai.initialize import launch
@@ -90,6 +92,7 @@ def init_3d(rank, world_size, backend, port, host):
     check_data_parallel_rank(rank)
     check_pipeline_parallel_rank(rank)
     gpc.destroy()
+    torch.cuda.empty_cache()

 @pytest.mark.cpu
@@ -101,7 +104,7 @@ def test_3d_init():
     test_fn = partial(init_3d,
                       world_size=world_size,
                       backend='gloo',
-                      port='29502',
+                      port='29902',
                       host='localhost'
                       )
     mp.spawn(test_fn, nprocs=world_size)

@@ -6,7 +6,7 @@ from functools import partial
 from pathlib import Path
 import pytest
-import torch.cuda
+import torch
 import torch.distributed as dist
 import torch.multiprocessing as mp
 from torch.utils.data import DataLoader
@@ -49,7 +49,7 @@ def run_data_sampler(rank, world_size):
         rank=rank,
         world_size=world_size,
         backend='gloo',
-        port='29503',
+        port='29903',
         host='localhost'
     )
     colossalai.launch(**dist_args)
@@ -73,6 +73,7 @@ def run_data_sampler(rank, world_size):
     if gpc.get_local_rank(ParallelMode.DATA) != 0:
         assert not torch.equal(img,
                                img_to_compare), 'Same image was distributed across ranks but expected it to be different'
+    torch.cuda.empty_cache()

 @pytest.mark.cpu

@@ -6,7 +6,7 @@ from functools import partial
 from pathlib import Path
 import pytest
-import torch.cuda
+import torch
 import torch.distributed as dist
 import torch.multiprocessing as mp
 from torchvision import transforms
@@ -52,11 +52,10 @@ def run_data_sampler(rank, world_size):
         rank=rank,
         world_size=world_size,
         backend='gloo',
-        port='29499',
+        port='29904',
         host='localhost'
     )
     colossalai.launch(**dist_args)
-    print('finished initialization')

     dataset_cfg = gpc.config.train_data.dataset
     dataloader_cfg = gpc.config.train_data.dataloader
@@ -88,6 +87,7 @@ def run_data_sampler(rank, world_size):
     # this should be false if data parallel sampler to given to the dataloader
     assert torch.equal(img,
                        img_to_compare), 'Same image was distributed across ranks and expected it to be the same'
+    torch.cuda.empty_cache()

 @pytest.mark.cpu

@@ -1,3 +1,4 @@
+import pytest
 from pathlib import Path
 from colossalai.amp.amp_type import AMP_TYPE
 from colossalai.context.parallel_mode import ParallelMode
@@ -34,7 +35,9 @@ CONFIG = dict(
 )

-def main():
+@pytest.mark.dist
+@pytest.mark.skip("This test requires more than 8 GPUs, you should invoke this test script using test.sh provided manually")
+def test_hybrid_parallel():
     parser = colossalai.get_default_parser()
     args = parser.parse_args()
     colossalai.launch_from_slurm(config=CONFIG,

@@ -1,4 +0,0 @@
#!/usr/bin/env sh
test_file=$1
python $test_file --world_size $SLURM_NPROCS --host $HOST --port 29500 --rank $SLURM_PROCID

@@ -8,6 +8,7 @@ import torch
 import os.path as osp
 from pathlib import Path
 import torch.nn as nn
+import torch.multiprocessing as mp
 from torchvision import transforms
 from torch.optim import Adam
@@ -15,9 +16,9 @@ from colossalai.core import global_context as gpc
 from colossalai.amp import AMP_TYPE
 from colossalai.logging import get_dist_logger
 from colossalai.utils import report_memory_usage, get_dataloader
-from colossalai.initialize import get_default_parser
 from torchvision.models import resnet18
 from torchvision.datasets import CIFAR10
+from functools import partial

 # Config
@@ -37,18 +38,15 @@ CONFIG = dict(
 )

-def run_no_pipeline():
-    parser = get_default_parser()
-    args = parser.parse_args()
+def run_engine(rank, world_size):
     # init dist env
     colossalai.launch(
         config=CONFIG,
-        rank=args.rank,
-        world_size=args.world_size,
-        host=args.host,
-        port=args.port,
-        backend=args.backend
+        rank=rank,
+        world_size=world_size,
+        host='localhost',
+        port=29910,
+        backend='nccl'
     )

     # build model
@@ -69,8 +67,6 @@ def run_no_pipeline():
     train_dataloader = get_dataloader(dataset=train_dataset,
                                       shuffle=True,
                                       batch_size=BATCH_SIZE,
-                                      num_workers=1,
-                                      pin_memory=True,
                                       drop_last=True)

     # build optimizer
@@ -102,12 +98,14 @@ def run_no_pipeline():
     gpc.destroy()
     logger.info('Test engine finished')
     report_memory_usage("After testing")
+    torch.cuda.empty_cache()

-@pytest.mark.skip("This test should be invoked using the test.sh provided")
 @pytest.mark.dist
 def test_engine():
-    run_no_pipeline()
+    world_size = 4
+    run_func = partial(run_engine, world_size=world_size)
+    mp.spawn(run_func, nprocs=world_size)

 if __name__ == '__main__':

@@ -5,6 +5,7 @@ import torch
 import os.path as osp
 from pathlib import Path
 import torch.nn as nn
+import torch.multiprocessing as mp
 from torchvision import transforms
 from torch.optim import Adam
@@ -15,6 +16,7 @@ from colossalai.utils import report_memory_usage, get_dataloader
 from colossalai.initialize import get_default_parser
 from torchvision.models import resnet18
 from torchvision.datasets import CIFAR10
+from functools import partial

 # Config
@@ -36,18 +38,15 @@ CONFIG = dict(
 )

-def run_no_pipeline():
-    parser = get_default_parser()
-    args = parser.parse_args()
+def run_engine(rank, world_size):
     # init dist env
     colossalai.launch(
         config=CONFIG,
-        rank=args.rank,
-        world_size=args.world_size,
-        host=args.host,
-        port=args.port,
-        backend=args.backend
+        rank=rank,
+        world_size=world_size,
+        host='localhost',
+        port=29911,
+        backend='nccl'
     )

     # build model
@@ -68,8 +67,6 @@ def run_no_pipeline():
     train_dataloader = get_dataloader(dataset=train_dataset,
                                       shuffle=True,
                                       batch_size=BATCH_SIZE,
-                                      num_workers=1,
-                                      pin_memory=True,
                                       drop_last=True)

     # build optimizer
@@ -101,12 +98,14 @@ def run_no_pipeline():
     gpc.destroy()
     logger.info('Test engine finished')
     report_memory_usage("After testing")
+    torch.cuda.empty_cache()

-@pytest.mark.skip("This test should be invoked using the test.sh provided")
 @pytest.mark.dist
 def test_engine():
-    run_no_pipeline()
+    world_size = 4
+    run_func = partial(run_engine, world_size=world_size)
+    mp.spawn(run_func, nprocs=world_size)

 if __name__ == '__main__':

@@ -5,6 +5,7 @@ import torch
 import os.path as osp
 from pathlib import Path
 import torch.nn as nn
+import torch.multiprocessing as mp
 from torchvision import transforms
 from torch.optim import Adam
@@ -15,6 +16,7 @@ from colossalai.utils import report_memory_usage, get_dataloader
 from colossalai.initialize import get_default_parser
 from torchvision.models import resnet18
 from torchvision.datasets import CIFAR10
+from functools import partial

 # Config
@@ -33,18 +35,15 @@ CONFIG = dict(
 )

-def run_no_pipeline():
-    parser = get_default_parser()
-    args = parser.parse_args()
+def run_engine(rank, world_size):
     # init dist env
     colossalai.launch(
         config=CONFIG,
-        rank=args.rank,
-        world_size=args.world_size,
-        host=args.host,
-        port=args.port,
-        backend=args.backend
+        rank=rank,
+        world_size=world_size,
+        host='localhost',
+        port=29912,
+        backend='nccl'
     )

     # build model
@@ -65,8 +64,6 @@ def run_no_pipeline():
     train_dataloader = get_dataloader(dataset=train_dataset,
                                       shuffle=True,
                                       batch_size=BATCH_SIZE,
-                                      num_workers=1,
-                                      pin_memory=True,
                                       drop_last=True)

     # build optimizer
@@ -98,12 +95,14 @@ def run_no_pipeline():
     gpc.destroy()
     logger.info('Test engine finished')
     report_memory_usage("After testing")
+    torch.cuda.empty_cache()

-@pytest.mark.skip("This test should be invoked using the test.sh provided")
 @pytest.mark.dist
 def test_engine():
-    run_no_pipeline()
+    world_size = 4
+    run_func = partial(run_engine, world_size=world_size)
+    mp.spawn(run_func, nprocs=world_size)

 if __name__ == '__main__':

@@ -5,6 +5,7 @@ import torch
 import os.path as osp
 from pathlib import Path
 import torch.nn as nn
+import torch.multiprocessing as mp
 from torchvision import transforms
 from torch.optim import Adam
@@ -15,6 +16,7 @@ from colossalai.utils import report_memory_usage, get_dataloader
 from colossalai.initialize import get_default_parser
 from torchvision.models import resnet18
 from torchvision.datasets import CIFAR10
+from functools import partial

 # Config
@@ -34,18 +36,15 @@ CONFIG = dict(
 )

-def run_no_pipeline():
-    parser = get_default_parser()
-    args = parser.parse_args()
+def run_engine(rank, world_size):
     # init dist env
     colossalai.launch(
         config=CONFIG,
-        rank=args.rank,
-        world_size=args.world_size,
-        host=args.host,
-        port=args.port,
-        backend=args.backend
+        rank=rank,
+        world_size=world_size,
+        host='localhost',
+        port=29913,
+        backend='nccl'
     )

     # build model
@@ -66,8 +65,6 @@ def run_no_pipeline():
     train_dataloader = get_dataloader(dataset=train_dataset,
                                       shuffle=True,
                                       batch_size=BATCH_SIZE,
-                                      num_workers=1,
-                                      pin_memory=True,
                                       drop_last=True)

     # build optimizer
@@ -99,12 +96,14 @@ def run_no_pipeline():
     gpc.destroy()
     logger.info('Test engine finished')
     report_memory_usage("After testing")
+    torch.cuda.empty_cache()

-@pytest.mark.skip("This test should be invoked using the test.sh provided")
 @pytest.mark.dist
 def test_engine():
-    run_no_pipeline()
+    world_size = 4
+    run_func = partial(run_engine, world_size=world_size)
+    mp.spawn(run_func, nprocs=world_size)

 if __name__ == '__main__':

@@ -1,4 +0,0 @@
#!/usr/bin/env sh
test_file=$1
python $test_file --rank $SLURM_PROCID --world_size $SLURM_NPROCS --host $HOST --port 29500

@@ -1,4 +1,3 @@
-from tests.test_layers.test_3d.common import IMG_SIZE
 import torch
 import torch.distributed as dist
 from torch.nn import Parameter
@@ -7,7 +6,7 @@ from colossalai.context.parallel_mode import ParallelMode
 from colossalai.core import global_context as gpc
 from colossalai.nn import Linear1D_Col, Linear1D_Row, TransformerMLP1D, TransformerSelfAttention1D, ViTMLP1D, ViTSelfAttention1D, ViTPatchEmbedding1D, ViTHead1D, ViTTokenFuser1D
 from colossalai.utils import get_current_device, print_rank_0
-from common import HIDDEN_SIZE, DEPTH, BATCH_SIZE, SEQ_LENGTH, NUM_CLASSES, check_equal, IMG_SIZE
+from .common import HIDDEN_SIZE, DEPTH, BATCH_SIZE, SEQ_LENGTH, NUM_CLASSES, check_equal, IMG_SIZE

 def check_linear_col():

@@ -2,10 +2,13 @@
 # -*- encoding: utf-8 -*-
 import pytest
+import torch
+import torch.multiprocessing as mp
 from colossalai.core import global_context as gpc
 from colossalai.initialize import launch, get_default_parser
-from test_layer import *
+from functools import partial
+from checks_1d.check_layer_1d import *

 CONFIG = dict(
     parallel=dict(
@@ -18,8 +21,14 @@ CONFIG = dict(
 )

-def check_layer():
-    # print_rank_0('start check_linear_col')
+def check_layer(rank, world_size):
+    launch(config=CONFIG,
+           rank=rank,
+           world_size=world_size,
+           host='localhost',
+           port=29920,
+           backend='nccl')
     check_linear_col()
     check_linear_row()
     check_attention()
@@ -28,21 +37,15 @@ def check_layer():
     check_embed()
     check_head()
+    gpc.destroy()
+    torch.cuda.empty_cache()

 @pytest.mark.dist
-@pytest.mark.skip("This test should be invoked by test.sh in the same folder as it runs on multiple gpus")
 def test_1d():
-    parser = get_default_parser()
-    args = parser.parse_args()
-    launch(config=CONFIG,
-           rank=args.rank,
-           world_size=args.world_size,
-           host=args.host,
-           port=args.port,
-           backend=args.backend)
-    check_layer()
-    gpc.destroy()
+    world_size = 2
+    run_func = partial(check_layer, world_size=world_size)
+    mp.spawn(run_func, nprocs=world_size)

 if __name__ == '__main__':

@@ -5,7 +5,7 @@ from colossalai.context.parallel_mode import ParallelMode
 from colossalai.core import global_context as gpc
 from colossalai.nn import Linear2D, LayerNorm2D, TransformerSelfAttention2D, TransformerMLP2D, TransformerLayer2D
 from colossalai.utils import get_current_device, print_rank_0
-from common import HIDDEN_SIZE, DEPTH, BATCH_SIZE, SEQ_LENGTH, check_equal
+from .common import HIDDEN_SIZE, DEPTH, BATCH_SIZE, SEQ_LENGTH, check_equal

 def check_linear():

@@ -8,7 +8,7 @@ from colossalai.core import global_context as gpc
 from colossalai.nn.layer.parallel_2d import Matmul_AB_2D, Matmul_ABT_2D, Matmul_ATB_2D
 from colossalai.utils import get_current_device
 from colossalai.utils import print_rank_0
-from common import check_equal, BATCH_SIZE, SEQ_LENGTH, HIDDEN_SIZE, DEPTH
+from .common import check_equal, BATCH_SIZE, SEQ_LENGTH, HIDDEN_SIZE, DEPTH

 def check_AB():

@@ -2,11 +2,15 @@
 # -*- encoding: utf-8 -*-
 import pytest
+import torch
+import torch.multiprocessing as mp
 from colossalai.core import global_context as gpc
 from colossalai.initialize import launch, get_default_parser
-from test_layer import check_linear, check_layernorm, check_attention, check_mlp, check_transformerlayer
-from test_operation import check_AB, check_ABT, check_ATB
+from checks_2d.check_layer_2d import check_linear, check_layernorm, check_attention, check_mlp, check_transformerlayer
+from checks_2d.check_operation_2d import check_AB, check_ABT, check_ATB
+from functools import partial

 CONFIG = dict(
     parallel=dict(
@@ -33,20 +37,25 @@ def check_layer():
     check_transformerlayer()

-@pytest.mark.dist
-@pytest.mark.skip("This test should be invoked by test.sh in the same folder as it runs on multiple gpus")
-def test_2d():
-    parser = get_default_parser()
-    args = parser.parse_args()
+def check_layer_and_operation(rank, world_size):
     launch(config=CONFIG,
-           rank=args.rank,
-           world_size=args.world_size,
-           host=args.host,
-           port=args.port,
-           backend=args.backend)
+           rank=rank,
+           world_size=world_size,
+           host='localhost',
+           port=29921,
+           backend='nccl')
     check_operations()
     check_layer()
     gpc.destroy()
+    torch.cuda.empty_cache()

+@pytest.mark.dist
+def test_2d():
+    world_size = 4
+    run_func = partial(check_layer_and_operation, world_size=world_size)
+    mp.spawn(run_func, nprocs=world_size)

 if __name__ == '__main__':

@@ -6,7 +6,7 @@ from colossalai.nn import (Linear2p5D, LayerNorm2p5D, TransformerSelfAttention2p
                            TransformerLayer2p5D)
 from colossalai.utils import get_current_device
 from colossalai.utils import print_rank_0
-from common import *
+from .common import *

 def check_linear():

@@ -6,7 +6,7 @@ from colossalai.nn.layer.parallel_2p5d._operation import Matmul_AB_2p5D, Matmul_
                                                          Matmul_ATB_2p5D
 from colossalai.utils import get_current_device
 from colossalai.utils import print_rank_0
-from common import *
+from .common import *

 def check_AB():

@@ -1,3 +0,0 @@
#!/bin/bash
python -m torch.distributed.launch test_2p5d.py --nproc_per_node 8 --host $HOST --port 29516 --world_size 8

@@ -1,9 +1,13 @@
 import pytest
+import torch
+import torch.multiprocessing as mp
 from colossalai.core import global_context as gpc
-from colossalai.initialize import launch, get_default_parser
-from test_layer import check_linear, check_layernorm, check_attention, check_mlp, check_transformerlayer
-from test_operation import check_AB, check_ABT, check_ATB
+from colossalai.initialize import launch
+from checks_2p5d.check_layer_2p5d import check_linear, check_layernorm, check_attention, check_mlp, check_transformerlayer
+from checks_2p5d.check_operation_2p5d import check_AB, check_ABT, check_ATB
+from functools import partial

 CONFIG = dict(
     parallel=dict(
@@ -27,20 +31,25 @@ def check_layer():
     check_transformerlayer()

-@pytest.mark.dist
-@pytest.mark.skip("This test should be invoked by test.sh in the same folder as it runs on multiple gpus")
-def test_2p5d():
-    parser = get_default_parser()
-    args = parser.parse_args()
+def check_layer_and_operation(rank, world_size):
     launch(config=CONFIG,
-           rank=args.rank,
-           world_size=args.world_size,
-           host=args.host,
-           port=args.port,
-           backend=args.backend)
-    check_layer()
+           rank=rank,
+           world_size=world_size,
+           host='localhost',
+           port=29922,
+           backend='nccl')
     check_operations()
+    check_layer()
     gpc.destroy()
+    torch.cuda.empty_cache()

+@pytest.mark.dist
+def test_2p5d():
+    world_size = 8
+    run_func = partial(check_layer_and_operation, world_size=world_size)
+    mp.spawn(run_func, nprocs=world_size)

 if __name__ == '__main__':

@@ -13,7 +13,7 @@ from colossalai.utils import get_current_device, print_rank_0
 from colossalai.nn.layer.parallel_3d._utils import get_parallel_mode_from_env
 from colossalai.constants import INPUT_GROUP_3D, WEIGHT_GROUP_3D, OUTPUT_GROUP_3D
-from common import *
+from .common import *

 def check_linear():

@@ -7,7 +7,7 @@ from colossalai.logging import get_dist_logger
 from colossalai.nn.layer.parallel_3d._operation import *
 from colossalai.utils import get_current_device
-from common import *
+from .common import *

 def check_AB():

@@ -1,22 +0,0 @@
#!/bin/bash
python -m torch.distributed.launch test_2d.py --nproc_per_node 8 test_3d.py --host $HOST --port 29516 --world_size 8
# expected test output
# distributed environment initialized
# AB forward: pass
# AB backward: pass
# ABT forward: pass
# ABT backward: pass
# ATB forward: pass
# ATB backward: pass
# linear backward: pass
# linear backward: pass
# layer norm forward: pass
# layer norm backward: pass
# self attention forward: pass
# self attention backward: pass
# mlp forward: pass
# mlp backward: pass
# transformerlayer forward: pass
# transformerlayer backward: pass

@@ -1,11 +1,14 @@
 #!/usr/bin/env python
 # -*- encoding: utf-8 -*-
+import pytest
+import torch
+import torch.multiprocessing as mp
 from colossalai.initialize import launch, get_default_parser
-from test_layer import *
-from test_operation import *
+from checks_3d.check_layer_3d import *
+from checks_3d.check_operation_3d import *
 from colossalai.logging import get_dist_logger
+from functools import partial

 CONFIG = dict(parallel=dict(pipeline=1, tensor=dict(mode='3d', size=8)),
               seed=0)
@@ -38,26 +41,25 @@ def check_layer():
                 ranks=[0])

-def _test_main():
-    # init dist
-    parser = get_default_parser()
-    args = parser.parse_args()
+def check_layer_and_operation(rank, world_size):
     launch(config=CONFIG,
-           rank=args.rank,
-           world_size=args.world_size,
-           host=args.host,
-           port=args.port,
-           backend=args.backend)
-    logger = get_dist_logger()
-    logger.info('Distributed environment is initialzied.', ranks=[0])
-    torch.backends.cudnn.benchmark = True
-
-    # check operation
-    # check_operations()
-
-    # check layers
+           rank=rank,
+           world_size=world_size,
+           host='localhost',
+           port=29923,
+           backend='nccl')
     check_layer()
+    gpc.destroy()
+    torch.cuda.empty_cache()

+@pytest.mark.dist
+def test_3d():
+    world_size = 8
+    run_func = partial(check_layer_and_operation, world_size=world_size)
+    mp.spawn(run_func, nprocs=world_size)

 if __name__ == '__main__':
-    _test_main()
+    test_3d()

@@ -1,9 +1,14 @@
 #!/usr/bin/env python
 # -*- encoding: utf-8 -*-
+import pytest
+import torch
+import torch.multiprocessing as mp
 from colossalai.initialize import launch, get_default_parser
 from colossalai.logging import get_dist_logger
-from test_layer import *
+from checks_seq.check_layer_seq import *
+from functools import partial

 CONFIG = dict(
     parallel=dict(
@@ -17,24 +22,28 @@ def check_layer():
     check_selfattention()

-def _test_main():
+def run_check_sequence(rank, world_size):
     # init dist
-    parser = get_default_parser()
-    args = parser.parse_args()
     launch(config=CONFIG,
-           rank=args.rank,
-           world_size=args.world_size,
-           host=args.host,
-           port=args.port,
-           backend=args.backend)
+           rank=rank,
+           world_size=world_size,
+           host='localhost',
+           port=29924,
+           backend='nccl')
     logger = get_dist_logger()
     logger.info('Distributed environment is initialzied.', ranks=[0])
-    torch.backends.cudnn.benchmark = True

     # check layers
     check_layer()
+    torch.cuda.empty_cache()

+@pytest.mark.dist
+def test_sequence():
+    world_size = 4
+    run_func = partial(run_check_sequence, world_size=world_size)
+    mp.spawn(run_func, nprocs=world_size)

 if __name__ == '__main__':
-    _test_main()
+    test_sequence()

@@ -1,19 +0,0 @@
import os
from pathlib import Path
hooks = [
dict(type='LogMetricByEpochHook'),
dict(type='AccuracyHook'),
dict(type='LossHook'),
dict(type='TensorboardHook', log_dir='./tfb_logs'),
dict(
type='LRSchedulerHook',
by_epoch=True,
lr_scheduler_cfg=dict(
type='CosineAnnealingLR',
warmup_steps=5
)
),
dict(type='SaveCheckpointHook', interval=5, checkpoint_dir='./ckpt'),
]

@@ -1,133 +0,0 @@
import os
from pathlib import Path
from colossalai.engine import AMP_TYPE
BATCH_SIZE = 512
IMG_SIZE = 32
PATCH_SIZE = 4
DIM = 512
NUM_ATTENTION_HEADS = 8
SUMMA_DIM = 2
NUM_CLASSES = 10
DEPTH = 6
num_epochs = 60
train_data = dict(
dataset=dict(type='CIFAR10Dataset',
root=Path(os.environ['DATA']),
transform_pipeline=[
dict(type='Resize', size=IMG_SIZE),
dict(type='RandomCrop', size=IMG_SIZE, padding=4),
dict(type='RandomHorizontalFlip'),
dict(type='ToTensor'),
dict(type='Normalize',
mean=[0.4914, 0.4822, 0.4465],
std=[0.2023, 0.1994, 0.2010]),
]),
dataloader=dict(
batch_size=BATCH_SIZE,
pin_memory=True,
# num_workers=1,
shuffle=True,
))
test_data = dict(
dataset=dict(type='CIFAR10Dataset',
root=Path(os.environ['DATA']),
train=False,
transform_pipeline=[
dict(type='Resize', size=IMG_SIZE),
dict(type='ToTensor'),
dict(type='Normalize',
mean=[0.4914, 0.4822, 0.4465],
std=[0.2023, 0.1994, 0.2010]),
]),
dataloader=dict(
batch_size=400,
pin_memory=True,
# num_workers=1,
))
optimizer = dict(type='Adam', lr=0.001, weight_decay=0)
loss = dict(type='CrossEntropyLoss2D', )
model = dict(
type='VisionTransformerFromConfig',
tensor_splitting_cfg=dict(type='ViTInputSplitter2D', ),
embedding_cfg=dict(
type='ViTPatchEmbedding2D',
img_size=IMG_SIZE,
patch_size=PATCH_SIZE,
embed_dim=DIM,
),
token_fusion_cfg=dict(type='ViTTokenFuser2D',
img_size=IMG_SIZE,
patch_size=PATCH_SIZE,
embed_dim=DIM,
drop_rate=0.1),
norm_cfg=dict(
type='LayerNorm2D',
normalized_shape=DIM,
eps=1e-6,
),
block_cfg=dict(
type='ViTBlock',
attention_cfg=dict(
type='ViTSelfAttention2D',
hidden_size=DIM,
num_attention_heads=NUM_ATTENTION_HEADS,
attention_dropout_prob=0.,
hidden_dropout_prob=0.1,
),
droppath_cfg=dict(type='VanillaViTDropPath', ),
mlp_cfg=dict(type='ViTMLP2D',
in_features=DIM,
dropout_prob=0.1,
mlp_ratio=1),
norm_cfg=dict(
type='LayerNorm2D',
normalized_shape=DIM,
eps=1e-6,
),
),
head_cfg=dict(
type='ViTHead2D',
hidden_size=DIM,
num_classes=NUM_CLASSES,
),
embed_dim=DIM,
depth=DEPTH,
drop_path_rate=0.,
)
hooks = [
dict(type='LogMetricByEpochHook'),
dict(type='LogTimingByEpochHook'),
dict(type='Accuracy2DHook'),
dict(type='LossHook'),
dict(type='TensorboardHook', log_dir='./tfb_logs'),
dict(
type='LRSchedulerHook',
by_epoch=True,
lr_scheduler_cfg=dict(
type='LinearWarmupLR',
warmup_steps=5
)
),
dict(type='SaveCheckpointHook', interval=5, checkpoint_dir='./ckpt'),
]
parallel = dict(
pipeline=dict(size=1),
tensor=dict(size=4, mode='2d'),
)
fp16 = dict(mode=AMP_TYPE.PARALLEL, initial_scale=2 ** 8)
engine = dict(
schedule=dict(num_microbatches=1)
)
logging = dict(root_path='./logs')

@@ -1,4 +0,0 @@
#!/usr/bin/env sh
test_file=$1
python $test_file --rank $SLURM_PROCID --world_size $SLURM_NPROCS --host $HOST --port 29500

@@ -1,232 +0,0 @@
# referenced from Megatron and used to testify communication
import os.path as osp
import pytest
import torch
from torch.utils.data import DataLoader
from colossalai.builder import ModelInitializer, build_dataset, build_optimizer, build_loss
from colossalai.communication import p2p as p2p_communication
from colossalai.communication.utils import send_tensor_meta, recv_tensor_meta
from colossalai.context.parallel_mode import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.initialize import initialize
from colossalai.utils import print_rank_0, get_current_device
NUM_BATCH = 128
NUM_MICRO = 6
def get_num_microbatches():
return NUM_MICRO
def to_cuda(data):
if isinstance(data, (tuple, list)):
data = data[0].to(get_current_device())
else:
data = data.to(get_current_device())
return data
def step_func(loss):
def _step_func(input_tensor, model):
output = model(input_tensor)
if isinstance(output, (tuple, list)):
if len(output) > 1:
raise NotImplementedError("Multiple output!!!")
else:
output = output[0]
return output, loss
return _step_func
def forward_step(forward_step_func, data_iterator, model, input_tensor, losses_reduced):
"""Forward step for passed-in model.
If first stage, input tensor is obtained from data_iterator, otherwise
passed-in input_tensor is used.
Returns output tensor."""
if input_tensor is None:
data, label = data_iterator.next()
input_tensor = to_cuda(data)
output_tensor, loss_func = forward_step_func(input_tensor, model)
if gpc.is_last_rank(ParallelMode.PIPELINE):
data, label = data_iterator.next()
label = to_cuda(label)
output_tensor = loss_func(output_tensor, label) / get_num_microbatches()
losses_reduced.append(output_tensor)
return output_tensor
def backward_step(optimizer, input_tensor, output_tensor, output_tensor_grad):
"""Backward step through passed-in output tensor.
If last stage, output_tensor_grad is None, otherwise gradient of loss
with respect to stage's output tensor.
Returns gradient of loss with respect to input tensor (None if first
stage)."""
# Retain the grad on the input_tensor.
if input_tensor is not None:
input_tensor.retain_grad()
# Backward pass.
torch.autograd.backward(output_tensor, grad_tensors=output_tensor_grad)
# Collect the grad of the input_tensor.
input_tensor_grad = None
if input_tensor is not None:
input_tensor_grad = input_tensor.grad
return input_tensor_grad
def forward_backward_pipelining_without_interleaving(forward_step_func, data_iterator,
model, optimizer, forward_only):
"""Run non-interleaved 1F1B schedule, with communication between pipeline
stages.
Returns dictionary with losses if the last stage, empty dict otherwise."""
# Compute number of warmup microbatches.
num_microbatches = get_num_microbatches()
num_warmup_microbatches = \
(gpc.get_world_size(ParallelMode.PIPELINE) -
gpc.get_local_rank(ParallelMode.PIPELINE) - 1)
num_warmup_microbatches = min(
num_warmup_microbatches,
num_microbatches)
num_microbatches_remaining = \
num_microbatches - num_warmup_microbatches
# Input, output tensors only need to be saved when doing backward passes
input_tensors = None
output_tensors = None
if not forward_only:
input_tensors = []
output_tensors = []
losses_reduced = []
# Used for tensor meta information communication
ft_shape = None
bt_shape = None
fs_checker = True
# Run warmup forward passes.
for i in range(num_warmup_microbatches):
if not gpc.is_first_rank(ParallelMode.PIPELINE):
ft_shape = recv_tensor_meta(ft_shape)
input_tensor = p2p_communication.recv_forward(ft_shape)
output_tensor = forward_step(forward_step_func, data_iterator, model,
input_tensor, losses_reduced)
if not gpc.is_last_rank(ParallelMode.PIPELINE):
bt_shape = output_tensor.shape
fs_checker = send_tensor_meta(output_tensor, fs_checker)
p2p_communication.send_forward(output_tensor)
if not forward_only:
input_tensors.append(input_tensor)
output_tensors.append(output_tensor)
# Before running 1F1B, need to receive first forward tensor.
# If all microbatches are run in warmup / cooldown phase, then no need to
# receive this tensor here.
if num_microbatches_remaining > 0:
if not gpc.is_first_rank(ParallelMode.PIPELINE):
ft_shape = recv_tensor_meta(ft_shape)
input_tensor = p2p_communication.recv_forward(ft_shape)
# Run 1F1B in steady state.
for i in range(num_microbatches_remaining):
last_iteration = (i == (num_microbatches_remaining - 1))
output_tensor = forward_step(forward_step_func, data_iterator, model,
input_tensor, losses_reduced)
if forward_only:
p2p_communication.send_forward(output_tensor)
if not last_iteration:
input_tensor = p2p_communication.recv_forward(ft_shape)
else:
output_tensor_grad = \
p2p_communication.send_forward_recv_backward(output_tensor, bt_shape)
# Add input_tensor and output_tensor to end of list.
input_tensors.append(input_tensor)
output_tensors.append(output_tensor)
# Pop input_tensor and output_tensor from the start of the list for
# the backward pass.
input_tensor = input_tensors.pop(0)
output_tensor = output_tensors.pop(0)
input_tensor_grad = \
backward_step(optimizer, input_tensor, output_tensor,
output_tensor_grad)
if last_iteration:
input_tensor = None
p2p_communication.send_backward(input_tensor_grad)
else:
input_tensor = \
p2p_communication.send_backward_recv_forward(input_tensor_grad, ft_shape)
# Run cooldown backward passes.
if not forward_only:
for i in range(num_warmup_microbatches):
input_tensor = input_tensors.pop(0)
output_tensor = output_tensors.pop(0)
output_tensor_grad = p2p_communication.recv_backward(bt_shape)
input_tensor_grad = \
backward_step(optimizer, input_tensor, output_tensor,
output_tensor_grad)
p2p_communication.send_backward(input_tensor_grad)
return losses_reduced
DIR_PATH = osp.dirname(osp.realpath(__file__))
CONFIG_PATH = osp.join(DIR_PATH, '../configs/pipeline_vanilla_vit.py')
@pytest.mark.skip(reason="This is only for debugging purpose, please ignore this test")
@pytest.mark.dist
def test_schedule():
initialize(CONFIG_PATH)
# build model
model = ModelInitializer(gpc.config.model, 1).model_initialize()
print_rank_0('model is created')
# keep the same sampler for all process
torch.manual_seed(1331)
dataset = build_dataset(gpc.config.data.dataset)
dataloader = DataLoader(dataset=dataset, **gpc.config.data.dataloader)
print_rank_0('train data is created')
# build optimizer and loss
optim = build_optimizer(gpc.config.optimizer, model)
loss = build_loss(gpc.config.loss)
print_rank_0('optim and loss is created')
forward_backward_pipelining_without_interleaving(
step_func(loss),
iter(dataloader),
model,
optim,
False
)
gpc.destroy()
print_rank_0('training finished')
if __name__ == '__main__':
test_schedule()

@@ -0,0 +1,2 @@
from .layers import *
from .resnet import VanillaResNet

@@ -0,0 +1,3 @@
from .basic_block import ResNetBasicBlock
from .bottleneck import ResNetBottleneck
from .reslayer import ResLayer

@@ -0,0 +1,64 @@
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
from typing import Optional, Callable
import torch.nn as nn
from torch import Tensor
from colossalai.registry import LAYERS
from .conv import conv3x3
@LAYERS.register_module
class ResNetBasicBlock(nn.Module):
"""Basic ResNet block
"""
expansion: int = 1
def __init__(
self,
inplanes: int,
planes: int,
stride: int = 1,
downsample: Optional[nn.Module] = None,
groups: int = 1,
base_width: int = 64,
dilation: int = 1,
norm_layer: Optional[Callable[..., nn.Module]] = None
) -> None:
super().__init__()
if norm_layer is None:
norm_layer = nn.BatchNorm2d
if groups != 1 or base_width != 64:
raise ValueError(
'BasicBlock only supports groups=1 and base_width=64')
if dilation > 1:
raise NotImplementedError(
"Dilation > 1 not supported in BasicBlock")
# Both self.conv1 and self.downsample layers downsample the input when stride != 1
self.conv1 = conv3x3(inplanes, planes, stride)
self.bn1 = norm_layer(planes)
self.relu = nn.ReLU(inplace=True)
self.conv2 = conv3x3(planes, planes)
self.bn2 = norm_layer(planes)
self.downsample = downsample
self.stride = stride
def forward(self, x: Tensor) -> Tensor:
identity = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
out = self.bn2(out)
if self.downsample is not None:
identity = self.downsample(x)
out += identity
out = self.relu(out)
return out
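Note: a quick usage sketch for the block above, assuming ResNetBasicBlock from this new test-model package is in scope (the tensor shape is illustrative):

# Illustrative usage of ResNetBasicBlock; shapes chosen arbitrarily.
import torch

block = ResNetBasicBlock(inplanes=64, planes=64)   # stride=1, no downsample
x = torch.randn(2, 64, 56, 56)
out = block(x)
assert out.shape == x.shape  # stride=1 and expansion=1 keep the shape unchanged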

@@ -0,0 +1,69 @@
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
from typing import Optional, Callable
import torch.nn as nn
from torch import Tensor
from colossalai.registry import LAYERS
from .conv import conv3x3, conv1x1
@LAYERS.register_module
class ResNetBottleneck(nn.Module):
# Bottleneck in torchvision places the stride for downsampling at 3x3 convolution(self.conv2)
# while original implementation places the stride at the first 1x1 convolution(self.conv1)
# according to "Deep residual learning for image recognition"https://arxiv.org/abs/1512.03385.
# This variant is also known as ResNet V1.5 and improves accuracy according to
# https://ngc.nvidia.com/catalog/model-scripts/nvidia:resnet_50_v1_5_for_pytorch.
expansion: int = 4
def __init__(
self,
inplanes: int,
planes: int,
stride: int = 1,
downsample: Optional[nn.Module] = None,
groups: int = 1,
base_width: int = 64,
dilation: int = 1,
norm_layer: Optional[Callable[..., nn.Module]] = None
) -> None:
super().__init__()
if norm_layer is None:
norm_layer = nn.BatchNorm2d
width = int(planes * (base_width / 64.)) * groups
# Both self.conv2 and self.downsample layers downsample the input when stride != 1
self.conv1 = conv1x1(inplanes, width)
self.bn1 = norm_layer(width)
self.conv2 = conv3x3(width, width, stride, groups, dilation)
self.bn2 = norm_layer(width)
self.conv3 = conv1x1(width, planes * self.expansion)
self.bn3 = norm_layer(planes * self.expansion)
self.relu = nn.ReLU(inplace=True)
self.downsample = downsample
self.stride = stride
def forward(self, x: Tensor) -> Tensor:
identity = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
out = self.bn2(out)
out = self.relu(out)
out = self.conv3(out)
out = self.bn3(out)
if self.downsample is not None:
identity = self.downsample(x)
out += identity
out = self.relu(out)
return out

@@ -0,0 +1,15 @@
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
import torch.nn as nn

def conv3x3(in_planes: int, out_planes: int, stride: int = 1, groups: int = 1, dilation: int = 1) -> nn.Conv2d:
    """3x3 convolution with padding"""
    return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
                     padding=dilation, groups=groups, bias=False, dilation=dilation)

def conv1x1(in_planes: int, out_planes: int, stride: int = 1) -> nn.Conv2d:
    """1x1 convolution"""
    return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False)
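Note: an illustrative check of the helpers above, assuming they are importable; at stride 1 the 3x3 convolution preserves spatial size because padding equals dilation:

# Illustrative: conv3x3 wraps nn.Conv2d with "same"-style padding at stride 1.
import torch

conv = conv3x3(in_planes=3, out_planes=16)
x = torch.randn(1, 3, 32, 32)
print(conv(x).shape)  # torch.Size([1, 16, 32, 32])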

@@ -0,0 +1,63 @@
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
import torch.nn as nn
from colossalai.registry import LAYERS
from .conv import conv1x1
@LAYERS.register_module
class ResLayer(nn.Module):
def __init__(self,
block_type: str,
norm_layer_type: str,
inplanes: int,
planes: int,
blocks: int,
groups: int,
base_width: int,
stride: int = 1,
dilation: int = 1,
dilate: bool = False,
):
super().__init__()
self.block = LAYERS.get_module(block_type)
self.norm_layer = LAYERS.get_module(norm_layer_type)
self.inplanes = inplanes
self.planes = planes
self.blocks = blocks
self.groups = groups
self.dilation = dilation
self.base_width = base_width
self.dilate = dilate
self.stride = stride
self.layer = self._make_layer()
def _make_layer(self):
norm_layer = self.norm_layer
downsample = None
previous_dilation = self.dilation
if self.dilate:
self.dilation *= self.stride
self.stride = 1
if self.stride != 1 or self.inplanes != self.planes * self.block.expansion:
downsample = nn.Sequential(
conv1x1(self.inplanes, self.planes * self.block.expansion, self.stride),
norm_layer(self.planes * self.block.expansion),
)
layers = []
layers.append(self.block(self.inplanes, self.planes, self.stride, downsample, self.groups,
self.base_width, previous_dilation, norm_layer))
self.inplanes = self.planes * self.block.expansion
for _ in range(1, self.blocks):
layers.append(self.block(self.inplanes, self.planes, groups=self.groups,
base_width=self.base_width, dilation=self.dilation,
norm_layer=norm_layer))
return nn.Sequential(*layers)
def forward(self, x):
return self.layer(x)

@@ -0,0 +1,163 @@
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
from typing import List, Optional
import torch
import torch.nn as nn
from torch import Tensor
from colossalai.registry import LAYERS
from colossalai.registry import MODELS
from colossalai.nn.model import ModelFromConfig
@MODELS.register_module
class VanillaResNet(ModelFromConfig):
"""ResNet from
`"Deep Residual Learning for Image Recognition" <https://arxiv.org/pdf/1512.03385.pdf>`_.
"""
def __init__(
self,
num_cls: int,
block_type: str,
layers: List[int],
norm_layer_type: str = 'BatchNorm2d',
in_channels: int = 3,
groups: int = 1,
width_per_group: int = 64,
zero_init_residual: bool = False,
replace_stride_with_dilation: Optional[List[bool]] = None,
dilations=(1, 1, 1, 1)
) -> None:
super().__init__()
self.inplanes = 64
self.zero_init_residual = zero_init_residual
self.blocks = layers
self.block_expansion = LAYERS.get_module(block_type).expansion
self.dilations = dilations
self.reslayer_common_cfg = dict(
type='ResLayer',
block_type=block_type,
norm_layer_type=norm_layer_type,
groups=groups,
base_width=width_per_group
)
if replace_stride_with_dilation is None:
# each element in the tuple indicates if we should replace
# the 2x2 stride with a dilated convolution instead
replace_stride_with_dilation = [False, False, False]
if len(replace_stride_with_dilation) != 3:
raise ValueError("replace_stride_with_dilation should be None "
"or a 3-element tuple, got {}".format(replace_stride_with_dilation))
self.layers_cfg = [
# conv1
dict(type='Conv2d',
in_channels=in_channels,
out_channels=self.inplanes,
kernel_size=7,
stride=2,
padding=3,
bias=False),
# bn1
dict(
type=norm_layer_type,
num_features=self.inplanes
),
# relu
dict(
type='ReLU',
inplace=True
),
# maxpool
dict(
type='MaxPool2d',
kernel_size=3,
stride=2,
padding=1
),
# layer 1
dict(
inplanes=self.inplanes,
planes=64,
blocks=self.blocks[0],
dilation=self.dilations[0],
**self.reslayer_common_cfg
),
# layer 2
dict(
inplanes=64 * self.block_expansion,
planes=128,
blocks=self.blocks[1],
stride=2,
dilate=replace_stride_with_dilation[0],
dilation=self.dilations[1],
**self.reslayer_common_cfg
),
# layer 3
dict(
inplanes=128 * self.block_expansion,
planes=256,
blocks=layers[2],
stride=2,
dilate=replace_stride_with_dilation[1],
dilation=self.dilations[2],
**self.reslayer_common_cfg
),
# layer 4
dict(
inplanes=256 * self.block_expansion,
planes=512,
blocks=layers[3], stride=2,
dilate=replace_stride_with_dilation[2],
dilation=self.dilations[3],
**self.reslayer_common_cfg
),
# avg pool
dict(
type='AdaptiveAvgPool2d',
output_size=(1, 1)
),
# flatten
dict(
type='LambdaWrapper',
func=lambda mod, x: torch.flatten(x, 1)
),
# linear
dict(
type='Linear',
in_features=512 * self.block_expansion,
out_features=num_cls
)
]
def forward(self, x: Tensor):
for layer in self.layers:
x = layer(x)
return x,
def init_weights(self):
for m in self.modules():
if isinstance(m, nn.Conv2d):
nn.init.kaiming_normal_(
m.weight, mode='fan_out', nonlinearity='relu')
elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
nn.init.constant_(m.weight, 1)
nn.init.constant_(m.bias, 0)
# Zero-initialize the last BN in each residual branch,
# so that the residual branch starts with zeros, and each residual block behaves like an identity.
# This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677
if self.zero_init_residual:
for m in self.modules():
if isinstance(m, LAYERS.get_module('ResNetBottleneck')):
# type: ignore[arg-type]
nn.init.constant_(m.bn3.weight, 0)
elif isinstance(m, LAYERS.get_module('ResNetBasicBlock')):
# type: ignore[arg-type]
nn.init.constant_(m.bn2.weight, 0)

@@ -0,0 +1,19 @@
import os
from pathlib import Path

BATCH_SIZE = 128
IMG_SIZE = 224
DIM = 768
NUM_CLASSES = 10
NUM_ATTN_HEADS = 12

# resnet 18
model = dict(type='VanillaResNet',
             block_type='ResNetBasicBlock',
             layers=[2, 2, 2, 2],
             num_cls=10)

parallel = dict(
    pipeline=dict(size=4),
    tensor=dict(size=1, mode=None)
)

@@ -4,6 +4,7 @@
 import pytest
 import torch
 import torch.distributed as dist
+import torch.multiprocessing as mp

 from colossalai.communication import (recv_backward, recv_forward,
                                       recv_tensor_meta, send_backward,
@@ -12,13 +13,14 @@ from colossalai.communication import (recv_backward, recv_forward,
                                       send_tensor_meta)
 from colossalai.context.parallel_mode import ParallelMode
 from colossalai.core import global_context as gpc
-from colossalai.initialize import init_dist, parse_args
+from colossalai.initialize import launch
 from colossalai.logging import get_dist_logger
 from colossalai.utils import get_current_device
+from functools import partial

-BATCH_SIZE = 32
-SEQ_LENGTH = 128
-HIDDEN_SIZE = 512
+BATCH_SIZE = 16
+SEQ_LENGTH = 64
+HIDDEN_SIZE = 128

 CONFIG = dict(
     parallel=dict(
@@ -106,7 +108,7 @@ def check_op(size, rank, prev_rank, next_rank, up_group, down_group, logger):
                 rank, check_equal(tensor, out)))

-def test_comm(size, rank, prev_rank, next_rank, up_group, down_group, logger):
+def check_comm(size, rank, prev_rank, next_rank, up_group, down_group, logger):
     dtype = torch.float32
     device = get_current_device()
     tensor_shape = (BATCH_SIZE, SEQ_LENGTH, HIDDEN_SIZE)
@@ -121,13 +123,15 @@ def test_comm(size, rank, prev_rank, next_rank, up_group, down_group, logger):
     check_forward_backward(tensor, grad, rank, logger)

-@pytest.mark.skip("This test should be invoked using the test.sh provided")
-@pytest.mark.dist
-def test_main():
-    args = parse_args()
-    world_size = args.world_size
-
-    init_dist(CONFIG)
+def run_check(rank, world_size):
+    launch(
+        config=CONFIG,
+        rank=rank,
+        world_size=world_size,
+        host='localhost',
+        port=29932,
+        backend='nccl'
+    )
     logger = get_dist_logger()
     rank = gpc.get_global_rank()
     prev_rank = gpc.get_prev_global_rank(ParallelMode.PIPELINE)
@@ -141,9 +145,18 @@ def test_main():
                 rank, prev_rank, up_ranks, next_rank, down_ranks))
     logger.info('Distributed environment is initialzied.')

-    test_comm(world_size, rank, prev_rank, next_rank, up_group, down_group,
-              logger)
+    check_comm(world_size, rank, prev_rank, next_rank, up_group, down_group,
+               logger)
+    gpc.destroy()
+    torch.cuda.empty_cache()

+@pytest.mark.dist
+def test_p2p():
+    world_size = 4
+    run_func = partial(run_check, world_size=world_size)
+    mp.spawn(run_func, nprocs=world_size)

 if __name__ == '__main__':
-    test_main()
+    test_p2p()

@@ -2,35 +2,46 @@ import os.path as osp

 import pytest
 import torch
+import torch.multiprocessing as mp
 from torch.utils.data import DataLoader

-from colossalai.builder import build_dataset, ModelInitializer
+from colossalai.builder.pipeline import PipelineModelInitializer
 from colossalai.core import global_context
-from colossalai.initialize import init_dist
+from colossalai.initialize import launch
 from colossalai.logging import get_dist_logger
+from functools import partial
+import model

 DIR_PATH = osp.dirname(osp.realpath(__file__))
-CONFIG_PATH = osp.join(DIR_PATH, '../configs/pipeline_vanilla_resnet.py')
+CONFIG_PATH = osp.join(DIR_PATH, 'resnet_config.py')

-@pytest.mark.skip("This test should be invoked using the test.sh provided")
-@pytest.mark.dist
-def test_partition():
-    init_dist(CONFIG_PATH)
+def run_partition(rank, world_size):
+    launch(config=CONFIG_PATH,
+           rank=rank,
+           world_size=world_size,
+           host='localhost',
+           port=29933,
+           backend='nccl'
+           )
     logger = get_dist_logger()
     logger.info('finished initialization')

     # build model
-    model = ModelInitializer(global_context.config.model, 1, verbose=True).model_initialize()
+    model = PipelineModelInitializer(global_context.config.model, 1, verbose=True).initialize()
+    assert isinstance(model, torch.nn.Module)
     logger.info('model is created')

-    dataset = build_dataset(global_context.config.train_data.dataset)
-    dataloader = DataLoader(dataset=dataset, **global_context.config.train_data.dataloader)
-    logger.info('train data is created')

     global_context.destroy()
+    torch.cuda.synchronize()
     logger.info('training finished')
+    torch.cuda.empty_cache()

+@pytest.mark.dist
+def test_partition():
+    world_size = 4
+    run_func = partial(run_partition, world_size=world_size)
+    mp.spawn(run_func, nprocs=world_size)

 if __name__ == '__main__':

@@ -0,0 +1,94 @@
# referenced from Megatron and used to testify communication
import colossalai
import os
import os.path as osp
import pytest
import torch
import torch.multiprocessing as mp
import model
from colossalai.builder import PipelineModelInitializer
from colossalai.communication import p2p as p2p_communication
from colossalai.communication.utils import send_tensor_meta, recv_tensor_meta
from colossalai.context.parallel_mode import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.initialize import launch
from colossalai.utils import print_rank_0, get_current_device, get_dataloader
from colossalai.engine.schedule import PipelineSchedule
from torchvision.datasets import CIFAR10
from torchvision import transforms
from pathlib import Path
from functools import partial
BATCH_SIZE = 32
NUM_MICRO = 8
DIR_PATH = osp.dirname(osp.realpath(__file__))
CONFIG_PATH = osp.join(DIR_PATH, './resnet_config.py')
def run_schedule(rank, world_size):
launch(config=CONFIG_PATH,
rank=rank,
world_size=world_size,
host='localhost',
port=29934,
backend='nccl')
# build model
model = PipelineModelInitializer(gpc.config.model, 1).initialize()
print_rank_0('model is created')
train_dataset = CIFAR10(
root=Path(os.environ['DATA']),
download=True,
transform=transforms.Compose(
[
transforms.RandomCrop(size=32, padding=4),
transforms.RandomHorizontalFlip(),
transforms.ToTensor(),
transforms.Normalize(mean=[0.4914, 0.4822, 0.4465], std=[
0.2023, 0.1994, 0.2010]),
]
)
)
train_dataloader = get_dataloader(dataset=train_dataset,
shuffle=True,
add_sampler=True,
batch_size=BATCH_SIZE,
pin_memory=True,
)
# build criterion
criterion = torch.nn.CrossEntropyLoss()
# optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=0)
# initialize
engine, train_dataloader, _, _ = colossalai.initialize(model, optimizer, criterion, train_dataloader)
# build pipeline schedule
schedule = PipelineSchedule(num_microbatches=NUM_MICRO)
# run schedule
data_iter = iter(train_dataloader)
schedule.forward_backward_step(engine, data_iter)
gpc.destroy()
torch.cuda.empty_cache()
@pytest.mark.dist
def test_pipeline_schedule():
world_size = 4
run_func = partial(run_schedule, world_size=world_size)
mp.spawn(run_func, nprocs=world_size)
if __name__ == '__main__':
test_pipeline_schedule()
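One relationship worth noting in the schedule test above: as I read it, the pipeline schedule splits each global batch into NUM_MICRO micro-batches along the batch dimension, so BATCH_SIZE must divide evenly. A quick sanity check of the values used here (my reading of the config, not part of the test):

BATCH_SIZE = 32
NUM_MICRO = 8
assert BATCH_SIZE % NUM_MICRO == 0
micro_batch_size = BATCH_SIZE // NUM_MICRO  # 4 samples flow through each pipeline stage at a time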


@ -1,51 +0,0 @@
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
import os.path as osp
import pytest
from colossalai.context import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.initialize import initialize
from colossalai.logging import get_dist_logger
NUM_BATCH = 128
BATCH_SIZE = 32
SEQ_LENGTH = 128
HIDDEN_SIZE = 512
DIR_PATH = osp.dirname(osp.realpath(__file__))
CONFIG_PATH = osp.join(DIR_PATH, '../configs/pipeline_vanilla_resnet.py')
@pytest.mark.skip("This test should be invoked using the test.sh provided")
@pytest.mark.dist
def test_schedule():
engine, train_dataloader, test_dataloader = initialize(CONFIG_PATH)
logger = get_dist_logger()
model = engine.model
optimizer = engine.optimizer
criterion = engine.criterion
schedule = engine._schedule
output, label, loss = schedule.forward_backward_step(
data_iter=iter(train_dataloader),
model=model,
optimizer=optimizer,
criterion=criterion,
forward_only=False
)
schedule.optimizer_step(model, optimizer)
if gpc.is_last_rank(ParallelMode.PIPELINE):
logger.info('losses: {}'.format(loss))
gpc.destroy()
logger.info('training finished')
if __name__ == '__main__':
test_schedule()


@ -1,20 +1,23 @@
import colossalai import colossalai
import os import os
from colossalai.amp.amp_type import AMP_TYPE import pytest
import torch
import torch.nn as nn import torch.nn as nn
import torch.multiprocessing as mp
from pathlib import Path from pathlib import Path
from torchvision import transforms from torchvision import transforms
from torch.optim import Adam from torch.optim import Adam
from colossalai.initialize import get_default_parser from colossalai.amp.amp_type import AMP_TYPE
from colossalai.core import global_context as gpc from colossalai.core import global_context as gpc
from colossalai.logging import get_dist_logger from colossalai.logging import get_dist_logger
from colossalai.trainer import Trainer from colossalai.trainer import Trainer
from colossalai.utils import get_dataloader from colossalai.utils import get_dataloader
from torchvision.models import resnet18 from torchvision.models import resnet18
from torchvision.datasets import CIFAR10 from torchvision.datasets import CIFAR10
from functools import partial
BATCH_SIZE = 128 BATCH_SIZE = 16
IMG_SIZE = 32 IMG_SIZE = 32
NUM_EPOCHS = 200 NUM_EPOCHS = 200
@ -26,16 +29,14 @@ CONFIG = dict(
) )
def test_trainer(): def run_trainer_no_pipeline(rank, world_size):
parser = get_default_parser()
args = parser.parse_args()
colossalai.launch( colossalai.launch(
config=CONFIG, config=CONFIG,
rank=args.rank, rank=rank,
world_size=args.world_size, world_size=world_size,
host=args.host, host='localhost',
port=args.port, port=29930,
backend=args.backend backend='nccl'
) )
# build model # build model
@ -70,13 +71,11 @@ def test_trainer():
train_dataloader = get_dataloader(dataset=train_dataset, train_dataloader = get_dataloader(dataset=train_dataset,
shuffle=True, shuffle=True,
batch_size=BATCH_SIZE, batch_size=BATCH_SIZE,
num_workers=1,
pin_memory=True, pin_memory=True,
drop_last=True) drop_last=True)
test_dataloader = get_dataloader(dataset=test_dataset, test_dataloader = get_dataloader(dataset=test_dataset,
batch_size=BATCH_SIZE, batch_size=BATCH_SIZE,
num_workers=1,
pin_memory=True, pin_memory=True,
drop_last=True) drop_last=True)
@ -107,7 +106,16 @@ def test_trainer():
display_progress=True, display_progress=True,
test_interval=5 test_interval=5
) )
gpc.destroy()
torch.cuda.empty_cache()
@pytest.mark.dist
def test_trainer_no_pipeline():
world_size = 4
run_func = partial(run_trainer_no_pipeline, world_size=world_size)
mp.spawn(run_func, nprocs=world_size)
if __name__ == '__main__': if __name__ == '__main__':
test_trainer() test_trainer_no_pipeline()


@ -1,14 +1,14 @@
import colossalai import colossalai
import os import os
import pytest
import torch import torch
from colossalai.amp.amp_type import AMP_TYPE
from colossalai.context.parallel_mode import ParallelMode
import torch.nn as nn import torch.nn as nn
import torch.multiprocessing as mp
from pathlib import Path from pathlib import Path
from torchvision import transforms from torchvision import transforms
from torch.optim import Adam from torch.optim import Adam
from colossalai.initialize import get_default_parser from colossalai.context.parallel_mode import ParallelMode
from colossalai.core import global_context as gpc from colossalai.core import global_context as gpc
from colossalai.logging import get_dist_logger from colossalai.logging import get_dist_logger
from colossalai.trainer import Trainer from colossalai.trainer import Trainer
@ -16,8 +16,10 @@ from colossalai.utils import get_dataloader
from colossalai.engine.schedule import PipelineSchedule from colossalai.engine.schedule import PipelineSchedule
from torchvision.models import resnet18 from torchvision.models import resnet18
from torchvision.datasets import CIFAR10 from torchvision.datasets import CIFAR10
from functools import partial
BATCH_SIZE = 32
BATCH_SIZE = 16
IMG_SIZE = 32 IMG_SIZE = 32
NUM_EPOCHS = 200 NUM_EPOCHS = 200
@ -25,23 +27,17 @@ CONFIG = dict(
parallel=dict( parallel=dict(
pipeline=2, pipeline=2,
), ),
# Config
fp16=dict(
mode=AMP_TYPE.TORCH
)
) )
def test_trainer(): def run_trainer_with_pipeline(rank, world_size):
parser = get_default_parser()
args = parser.parse_args()
colossalai.launch( colossalai.launch(
config=CONFIG, config=CONFIG,
rank=args.rank, rank=rank,
world_size=args.world_size, world_size=world_size,
host=args.host, host='localhost',
port=args.port, port=29931,
backend=args.backend backend='nccl'
) )
# build model # build model
@ -101,13 +97,11 @@ def test_trainer():
train_dataloader = get_dataloader(dataset=train_dataset, train_dataloader = get_dataloader(dataset=train_dataset,
shuffle=True, shuffle=True,
batch_size=BATCH_SIZE, batch_size=BATCH_SIZE,
num_workers=1,
pin_memory=True, pin_memory=True,
drop_last=True) drop_last=True)
test_dataloader = get_dataloader(dataset=test_dataset, test_dataloader = get_dataloader(dataset=test_dataset,
batch_size=BATCH_SIZE, batch_size=BATCH_SIZE,
num_workers=1,
pin_memory=True, pin_memory=True,
drop_last=True) drop_last=True)
@ -140,7 +134,16 @@ def test_trainer():
display_progress=True, display_progress=True,
test_interval=5 test_interval=5
) )
gpc.destroy()
torch.cuda.empty_cache()
@pytest.mark.dist
def test_trainer_with_pipeline():
world_size = 4
run_func = partial(run_trainer_with_pipeline, world_size=world_size)
mp.spawn(run_func, nprocs=world_size)
if __name__ == '__main__': if __name__ == '__main__':
test_trainer() test_trainer_with_pipeline()

View File

@ -54,6 +54,7 @@ def test_activation_checkpointing():
loss.backward() loss.backward()
assert torch.all(data.grad == data_.grad), 'Gradient of the input does not match' assert torch.all(data.grad == data_.grad), 'Gradient of the input does not match'
torch.cuda.empty_cache()
if __name__ == '__main__': if __name__ == '__main__':


@ -104,13 +104,14 @@ def run_no_pipeline(rank, world_size):
'param should be the same in the first few iterations and only changed in the last iteration' 'param should be the same in the first few iterations and only changed in the last iteration'
gpc.destroy() gpc.destroy()
torch.cuda.empty_cache()
@pytest.mark.skip("This test should be invoked using the test.sh provided")
@pytest.mark.dist @pytest.mark.dist
def test_engine(): def test_engine():
-    func = partial(run_no_pipeline, world_size=4)
-    mp.spawn(func, nprocs=4)
+    world_size = 4
+    func = partial(run_no_pipeline, world_size=world_size)
+    mp.spawn(func, nprocs=world_size)
if __name__ == '__main__': if __name__ == '__main__':

View File

@ -1,4 +0,0 @@
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
import os
from pathlib import Path


@ -1,4 +0,0 @@
#!/bin/bash
test_file="test_zero.py"
python $test_file --rank $SLURM_PROCID --world_size $SLURM_NPROCS --host $HOST --port 29500


@ -0,0 +1,102 @@
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
import os
import pytest
import torch
import torch.multiprocessing as mp
from pathlib import Path
import colossalai
from colossalai.core import global_context as gpc
from colossalai.utils import get_dataloader
from torchvision import transforms
from torchvision.models import resnet18
from torchvision.datasets import CIFAR10
from functools import partial
BATCH_SIZE = 16
IMG_SIZE = 224
CONFIG = dict(
fp16=dict(
mode=None,
),
zero=dict(
level=2,
cpu_offload=True,
verbose=False,
),
parallel=dict(
pipeline=dict(size=1),
tensor=dict(size=1, mode=None)
)
)
def run_dist(rank, world_size):
colossalai.launch(config=CONFIG,
rank=rank,
world_size=world_size,
host='localhost',
port=29940,
backend='nccl')
# build model
model = resnet18(num_classes=10)
# build dataloaders
train_dataset = CIFAR10(
root=Path(os.environ['DATA']),
download=True,
transform=transforms.Compose(
[
transforms.Resize(size=(IMG_SIZE, IMG_SIZE)),
transforms.ToTensor(),
transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5))
]
)
)
train_dataloader = get_dataloader(dataset=train_dataset,
shuffle=True,
batch_size=BATCH_SIZE,
pin_memory=True,
drop_last=True)
# build optimizer and loss
# optimizer = build_optimizer(global_context.config.optimizer, model)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = torch.nn.CrossEntropyLoss()
engine, train_dataloader, *args = colossalai.initialize(model=model,
optimizer=optimizer,
criterion=criterion,
train_dataloader=train_dataloader)
# train
model.train()
for idx, (data, label) in enumerate(train_dataloader):
engine.zero_grad()
data = data.cuda()
label = label.cuda()
output = engine(data)
loss = engine.criterion(output, label)
engine.backward(loss)
engine.step()
break
gpc.destroy()
torch.cuda.empty_cache()
@pytest.mark.dist
def test_zero_level_2():
world_size = 4
run_func = partial(run_dist, world_size=world_size)
mp.spawn(run_func, nprocs=world_size)
if __name__ == '__main__':
test_zero_level_2()
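A note on the zero block in CONFIG above, hedged since the exact semantics live in the underlying ZeRO implementation: level 2 shards optimizer states and gradients across the data-parallel ranks while keeping a full parameter replica on each rank, and cpu_offload=True moves the optimizer-state shard to host memory. The back-of-envelope accounting from the ZeRO paper, illustrative only and not measured from this test:

def zero2_bytes_per_rank(num_params, world_size):
    # mixed-precision Adam accounting: fp16 params replicated, fp16 grads and
    # fp32 master weights + Adam moments sharded across the data-parallel group
    fp16_params = 2 * num_params
    fp16_grads = 2 * num_params / world_size
    optimizer_states = 12 * num_params / world_size
    return fp16_params + fp16_grads + optimizer_states

# example: resnet18 (~11.7M params) spread over the 4 ranks spawned above
print(zero2_bytes_per_rank(11_700_000, 4) / 2**20, 'MiB per rank')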


@ -4,36 +4,25 @@
import os import os
import pytest import pytest
import torch import torch
import torch.multiprocessing as mp
from pathlib import Path from pathlib import Path
import colossalai import colossalai
from colossalai.initialize import get_default_parser
from colossalai.core import global_context as gpc from colossalai.core import global_context as gpc
from colossalai.utils import get_dataloader from colossalai.utils import get_dataloader
from torchvision import transforms from torchvision import transforms
from torchvision.models import resnet18 from torchvision.models import resnet18
from torchvision.datasets import CIFAR10 from torchvision.datasets import CIFAR10
from functools import partial
BATCH_SIZE = 128 BATCH_SIZE = 16
IMG_SIZE = 224 IMG_SIZE = 224
NUM_CLS = 1000
CONFIG = dict( CONFIG = dict(
fp16=dict( fp16=dict(
mode=None, mode=None,
), ),
zero=dict( zero=dict(
# ==============
# level 2 config
# ==============
# level=2,
# cpu_offload=True,
# verbose=False,
# ==============
# level 3 config
# ==============
level=3, level=3,
verbose=False, verbose=False,
offload_optimizer_config=dict( offload_optimizer_config=dict(
@ -57,16 +46,13 @@ CONFIG = dict(
) )
def run_dist(): def run_dist(rank, world_size):
parser = get_default_parser()
args = parser.parse_args()
colossalai.launch(config=CONFIG, colossalai.launch(config=CONFIG,
rank=args.rank, rank=rank,
world_size=args.world_size, world_size=world_size,
host=args.host, host='localhost',
port=args.port, port=29941,
backend=args.backend) backend='nccl')
# build model # build model
model = resnet18(num_classes=10) model = resnet18(num_classes=10)
@ -86,7 +72,6 @@ def run_dist():
train_dataloader = get_dataloader(dataset=train_dataset, train_dataloader = get_dataloader(dataset=train_dataset,
shuffle=True, shuffle=True,
batch_size=BATCH_SIZE, batch_size=BATCH_SIZE,
num_workers=1,
pin_memory=True, pin_memory=True,
drop_last=True) drop_last=True)
@ -104,22 +89,27 @@ def run_dist():
model.train() model.train()
for idx, (data, label) in enumerate(train_dataloader): for idx, (data, label) in enumerate(train_dataloader):
engine.zero_grad() engine.zero_grad()
data = data.cuda() data = data.cuda().half()
label = label.cuda() label = label.cuda()
output = engine(data) output = engine(data).float()
loss = engine.criterion(output, label) loss = engine.criterion(output, label)
engine.backward(loss) engine.backward(loss)
engine.step() engine.step()
break break
gpc.destroy()
torch.cuda.empty_cache()
@pytest.mark.skip("This test should be invoked manually using the script provided")
@pytest.mark.dist @pytest.mark.dist
-def test_zero():
-    run_dist()
+@pytest.mark.skip("Level 3 has unknown bug so skip this test for now")
+def test_zero_level_3():
+    world_size = 4
+    run_func = partial(run_dist, world_size=world_size)
+    mp.spawn(run_func, nprocs=world_size)
if __name__ == '__main__': if __name__ == '__main__':
test_zero() test_zero_level_3()
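The functional change inside the level-3 training loop above is the dtype handling: inputs are cast to half, presumably to match fp16 parameters under ZeRO level 3, and the output is cast back to float before the fp32 criterion. A standalone sketch of that pattern in plain PyTorch, not ColossalAI-specific, assuming a GPU is available:

import torch
import torch.nn.functional as F

model = torch.nn.Linear(8, 4).cuda().half()   # fp16 parameters
data = torch.randn(2, 8).cuda().half()        # cast the input to match the weight dtype
target = torch.tensor([0, 1]).cuda()
output = model(data).float()                  # back to fp32 before the loss
loss = F.cross_entropy(output, target)
loss.backward()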


@ -1,12 +0,0 @@
parallel = dict(
pipeline=dict(size=1),
tensor=dict(size=4, mode='2d'),
)
fp16 = dict(
mode=None,
)
zero = dict(
level=2
)


@ -1,12 +0,0 @@
parallel = dict(
pipeline=dict(size=1),
tensor=dict(size=4, mode='2d'),
)
fp16 = dict(
mode=None,
)
zero = dict(
level=3
)


@ -1,4 +0,0 @@
#!/usr/bin/env sh
test_file=$1
python $test_file --rank $SLURM_PROCID --world_size $SLURM_NPROCS --host $HOST --port 29500


@ -6,12 +6,11 @@ from pathlib import Path
import pytest import pytest
import torch.autograd import torch.autograd
import torch.multiprocessing as mp
import colossalai import colossalai
import torch import torch
from colossalai.initialize import get_default_parser
from colossalai.builder import build_model from colossalai.builder import build_model
from colossalai.context.parallel_mode import ParallelMode
from colossalai.core import global_context as gpc from colossalai.core import global_context as gpc
from colossalai.logging import get_dist_logger from colossalai.logging import get_dist_logger
from colossalai.utils import get_dataloader from colossalai.utils import get_dataloader
@ -20,9 +19,20 @@ from colossalai.nn import CrossEntropyLoss2D
from torchvision import transforms from torchvision import transforms
from torchvision.datasets import CIFAR10 from torchvision.datasets import CIFAR10
from components import * from components import *
from functools import partial
-level = os.environ['LEVEL']
-CONFIG_PATH = Path(__file__).parent.parent.joinpath(f'configs/vit_2d_zero{level}.py')
+CONFIG = dict(
+    parallel=dict(
+        pipeline=dict(size=1),
+        tensor=dict(size=4, mode='2d'),
+    ),
+    fp16=dict(
+        mode=None,
+    ),
+    zero=dict(
+        level=2
+    )
+)
def train_epoch(engine, train_dataloader): def train_epoch(engine, train_dataloader):
@ -37,18 +47,14 @@ def train_epoch(engine, train_dataloader):
return avg_loss return avg_loss
-@pytest.mark.dist
-@pytest.mark.skip("This test should be invoked by test.sh in the same folder as it runs on multiple gpus")
-def test_2d_parallel_vision_transformer():
-    parser = get_default_parser()
-    args = parser.parse_args()
+def run_2d_parallel_vision_transformer_level_2(rank, world_size):
colossalai.launch( colossalai.launch(
config=CONFIG_PATH, config=CONFIG,
rank=args.rank, rank=rank,
world_size=args.world_size, world_size=world_size,
host=args.host, host='localhost',
port=args.port, port=29950,
backend=args.backend backend='nccl'
) )
# build model # build model
@ -70,7 +76,6 @@ def test_2d_parallel_vision_transformer():
train_dataloader = get_dataloader(dataset=train_dataset, train_dataloader = get_dataloader(dataset=train_dataset,
shuffle=True, shuffle=True,
batch_size=BATCH_SIZE, batch_size=BATCH_SIZE,
num_workers=1,
pin_memory=True, pin_memory=True,
drop_last=True) drop_last=True)
@ -97,6 +102,16 @@ def test_2d_parallel_vision_transformer():
engine.step() engine.step()
break break
gpc.destroy()
torch.cuda.empty_cache()
@pytest.mark.dist
def test_2d_vit_zero_level_2():
world_size = 8
run_func = partial(run_2d_parallel_vision_transformer_level_2, world_size=world_size)
mp.spawn(run_func, nprocs=world_size)
if __name__ == '__main__': if __name__ == '__main__':
test_2d_parallel_vision_transformer() test_2d_vit_zero_level_2()
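The world size of 8 chosen for the 2D ViT tests follows from the topology in CONFIG: with the assumed relationship world_size = data_parallel * pipeline * tensor, a tensor-parallel size of 4 in '2d' mode (laid out as a 2 x 2 grid) and no pipeline splitting leaves a data-parallel degree of 2. A quick check:

world_size = 8
pipeline = 1
tensor = 4   # '2d' mode arranges these 4 ranks as a 2 x 2 grid
data_parallel = world_size // (pipeline * tensor)
assert data_parallel == 2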


@ -0,0 +1,119 @@
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
import os
from pathlib import Path
import pytest
import torch.autograd
import torch.multiprocessing as mp
import colossalai
import torch
from colossalai.core import global_context as gpc
from colossalai.builder import build_model
from colossalai.logging import get_dist_logger
from colossalai.utils import get_dataloader
from colossalai.nn.layer._parallel_utilities import _gather
from colossalai.nn import CrossEntropyLoss2D
from torchvision import transforms
from torchvision.datasets import CIFAR10
from functools import partial
from components import *
CONFIG = dict(
parallel=dict(
pipeline=dict(size=1),
tensor=dict(size=4, mode='2d'),
),
fp16=dict(
mode=None,
),
zero=dict(
level=3
)
)
def train_epoch(engine, train_dataloader):
engine.train()
accumulated_loss = 0
num_steps = len(train_dataloader)
data_iter = iter(train_dataloader)
for i in range(num_steps):
output, label, loss = engine.step(data_iter)
accumulated_loss += loss.detach().cpu().numpy()
avg_loss = accumulated_loss / num_steps
return avg_loss
def run_2d_parallel_vision_transformer_level_3(rank, world_size):
colossalai.launch(
config=CONFIG,
rank=rank,
world_size=world_size,
host='localhost',
port=29951,
backend='nccl'
)
# build model
model = build_model(model_cfg)
model.build_from_cfg()
# build dataloaders
train_dataset = CIFAR10(
root=Path(os.environ['DATA']),
download=True,
transform=transforms.Compose(
[
transforms.Resize(size=(IMG_SIZE, IMG_SIZE)),
transforms.ToTensor(),
transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5))
]
)
)
train_dataloader = get_dataloader(dataset=train_dataset,
shuffle=True,
batch_size=BATCH_SIZE,
pin_memory=True,
drop_last=True)
# build optimizer and loss
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = CrossEntropyLoss2D()
engine, train_dataloader, *args = colossalai.initialize(model=model,
optimizer=optimizer,
criterion=criterion,
train_dataloader=train_dataloader)
logger = get_dist_logger()
logger.info('start training')
engine.train()
for img, label in train_dataloader:
engine.zero_grad()
img = img.cuda()
label = label.cuda()
out = engine(img)
loss = engine.criterion(out, label)
engine.backward(loss)
engine.step()
break
gpc.destroy()
torch.cuda.empty_cache()
@pytest.mark.dist
@pytest.mark.skip("Level 3 has unknown bug so skip this test for now")
def test_3d_vit_zero_level_3():
world_size = 8
run_func = partial(run_2d_parallel_vision_transformer_level_3, world_size=world_size)
mp.spawn(run_func, nprocs=world_size)
if __name__ == '__main__':
test_3d_vit_zero_level_3()