mirror of https://github.com/hpcaitech/ColossalAI
[legacy] move trainer to legacy (#4545)
* [legacy] move trainer to legacy
* [doc] update docs related to trainer
* [test] ignore legacy test

parent 807e01a4ba
commit 89fe027787
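Every hunk in this commit follows the same pattern: imports from `colossalai.trainer` move to `colossalai.legacy.trainer` (and `colossalai.trainer.hooks` to `colossalai.legacy.trainer.hooks`), while the `Trainer`, hook, and `fit` APIs themselves are untouched. Below is a minimal sketch of what calling code looks like after the move, pieced together from the documentation snippets in this diff; `build_components()` is a hypothetical placeholder for the usual model/optimizer/criterion/dataloader setup and is not part of this commit.

```python
import colossalai
from colossalai.legacy.trainer import Trainer, hooks   # was: from colossalai.trainer import Trainer, hooks
from colossalai.logging import get_dist_logger

# Build model/optimizer/criterion/dataloader as usual (hypothetical helper, elided here),
# then hand them to colossalai.initialize as in the docs being updated in this diff.
model, optimizer, criterion, train_dataloader = build_components()

engine, train_dataloader, *_ = colossalai.initialize(model=model,
                                                     optimizer=optimizer,
                                                     criterion=criterion,
                                                     train_dataloader=train_dataloader)

logger = get_dist_logger()
trainer = Trainer(engine=engine, logger=logger)

hook_list = [
    hooks.LossHook(),
    hooks.LogMetricByEpochHook(logger),
]

trainer.fit(train_dataloader=train_dataloader,
            epochs=10,
            hooks=hook_list,
            display_progress=True)
```

The only line existing user code has to touch when migrating is the first `Trainer`/`hooks` import.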
@@ -1,14 +1,13 @@
-from typing import Union, List, Any
+from typing import Any, List, Union

 import torch
 from torch.utils.data import DataLoader
 from tqdm import tqdm

 from colossalai.engine import Engine
+from colossalai.legacy.trainer.hooks import BaseHook
 from colossalai.logging import DistributedLogger
-from colossalai.utils import MultiTimer
-from colossalai.utils import is_dp_rank_0, is_tp_rank_0, is_no_pp_or_last_stage
-from colossalai.trainer.hooks import BaseHook
+from colossalai.utils import MultiTimer, is_dp_rank_0, is_no_pp_or_last_stage, is_tp_rank_0


 class Trainer:
@@ -1,7 +1,12 @@
 from ._base_hook import BaseHook
 from ._checkpoint_hook import SaveCheckpointHook
-from ._log_hook import (LogMemoryByEpochHook, LogMetricByEpochHook, LogMetricByStepHook, LogTimingByEpochHook,
-                        TensorboardHook)
+from ._log_hook import (
+    LogMemoryByEpochHook,
+    LogMetricByEpochHook,
+    LogMetricByStepHook,
+    LogTimingByEpochHook,
+    TensorboardHook,
+)
 from ._lr_scheduler_hook import LRSchedulerHook
 from ._metric_hook import AccuracyHook, LossHook, MetricHook, ThroughputHook

@@ -1,11 +1,12 @@
 #!/usr/bin/env python
 # -*- encoding: utf-8 -*-
 import torch
-from colossalai.logging import get_dist_logger
+
+from colossalai.legacy.trainer.hooks import BaseHook
+from colossalai.logging import get_dist_logger
 from colossalai.registry import HOOKS
-from colossalai.trainer.hooks import BaseHook
 from colossalai.utils.checkpointing import save_checkpoint

 from ._lr_scheduler_hook import LRSchedulerHook

@@ -3,17 +3,17 @@

 import os
 import os.path as osp

 from typing import List

 from colossalai.context import ParallelMode
 from colossalai.core import global_context as gpc
-from colossalai.registry import HOOKS
+from colossalai.legacy.trainer.hooks._metric_hook import ThroughputMetric
 from colossalai.logging import DistributedLogger
-from colossalai.utils import report_memory_usage, is_dp_rank_0, \
-    is_tp_rank_0, is_no_pp_or_last_stage, MultiTimer
+from colossalai.registry import HOOKS
+from colossalai.utils import MultiTimer, is_dp_rank_0, is_no_pp_or_last_stage, is_tp_rank_0, report_memory_usage

 from ._base_hook import BaseHook
 from ._commons_ import _format_number
-from colossalai.trainer.hooks._metric_hook import ThroughputMetric


 class LogByEpochHook(BaseHook):
@@ -1,6 +1,7 @@
-from colossalai.registry import HOOKS
 from torch import Tensor

+from colossalai.registry import HOOKS
+
 from ._metric_hook import LearningRateMetric, MetricHook

@@ -6,6 +6,7 @@ from typing import Callable

 import torch
 import torch.distributed as dist
+
 from colossalai.communication import all_reduce
 from colossalai.context import ParallelMode
 from colossalai.core import global_context as gpc
@@ -43,7 +43,7 @@ from colossalai.engine.schedule import (InterleavedPipelineSchedule,
                                         PipelineSchedule)
 from colossalai.logging import disable_existing_loggers, get_dist_logger
 from colossalai.nn.layer.wrapper import PipelineSharedModuleWrapper
-from colossalai.trainer import Trainer, hooks
+from colossalai.legacy.trainer import Trainer, hooks
 from colossalai.utils.timer import MultiTimer
 from model_zoo.gpt import GPTLMLoss
 from torch.nn import functional as F
@@ -268,3 +268,4 @@ def train():
         return_output_label=False,
     )
 ```
+<!-- doc-test-command: echo -->
@@ -38,7 +38,7 @@ from colossalai.builder import build_pipeline_model
 from colossalai.engine.schedule import (InterleavedPipelineSchedule,
                                         PipelineSchedule)
 from colossalai.logging import disable_existing_loggers, get_dist_logger
-from colossalai.trainer import Trainer, hooks
+from colossalai.legacy.trainer import Trainer, hooks
 from colossalai.utils import MultiTimer, get_dataloader
 from timm.models import vision_transformer as vit
 from torchvision import transforms
@@ -245,3 +245,4 @@ def train():
                 hooks=hook_list,
                 display_progress=True)
 ```
+<!-- doc-test-command: echo -->
@@ -79,7 +79,7 @@ from colossalai.core import global_context as gpc
 from colossalai.logging import disable_existing_loggers, get_dist_logger
 from colossalai.nn.lr_scheduler import LinearWarmupLR
 from colossalai.nn.metric import Accuracy
-from colossalai.trainer import Trainer, hooks
+from colossalai.legacy.trainer import Trainer, hooks
 ```

 - Other modules
@@ -644,3 +644,4 @@ torchrun --standalone --nproc_per_node <NUM_GPUs> train_hybrid.py --config ./co
 # If your torch >= 1.9.0
 # python -m torch.distributed.run --standalone --nproc_per_node= <NUM_GPUs> train_hybrid.py --config ./configs/config_hybrid_parallel.py
 ```
+<!-- doc-test-command: echo -->
@@ -64,7 +64,7 @@ Trainer is a more high-level wrapper for the user to execute training with fewer

 ```python
 from colossalai.logging import get_dist_logger
-from colossalai.trainer import Trainer, hooks
+from colossalai.legacy.trainer import Trainer, hooks

 # build components and initialize with colossalai.initialize
 ...
@@ -107,7 +107,7 @@ If you want to customize your own hook class, you can inherit `hooks.BaseHook` a

 ```python
 from colossalai.logging import get_dist_logger
-from colossalai.trainer import hooks
+from colossalai.legacy.trainer import hooks

 class LogMessageHook(hooks.BaseHook):

@@ -345,7 +345,7 @@ If you wish to train with a trainer object, you can follow the code snippet belo

 ```python
 from colossalai.nn.metric import Accuracy
-from colossalai.trainer import Trainer, hooks
+from colossalai.legacy.trainer import Trainer, hooks


 # create a trainer object
@@ -387,3 +387,4 @@ python -m torch.distributed.launch --nproc_per_node <num_gpus> --master_addr loc
 # with trainer
 python -m torch.distributed.launch --nproc_per_node <num_gpus> --master_addr localhost --master_port 29500 run_resnet_cifar10_with_trainer.py
 ```
+<!-- doc-test-command: echo -->
@@ -41,7 +41,7 @@ for epoch in range(num_epochs):

 #### Save when using trainer
 ```python
-from colossalai.trainer import Trainer, hooks
+from colossalai.legacy.trainer import Trainer, hooks
 model = ...
 engine, _, _, _ = colossalai.initialize(model=model, ...)
 trainer = Trainer(engine, ...)
@@ -61,3 +61,4 @@ model = ...
 load_checkpoint('xxx.pt', model)
 ... # train or test
 ```
+<!-- doc-test-command: echo -->
@@ -267,7 +267,7 @@ from pathlib import Path
 from colossalai.core import global_context as gpc
 from colossalai.logging import get_dist_logger
 from colossalai.utils import get_dataloader
-from colossalai.trainer import Trainer, hooks
+from colossalai.legacy.trainer import Trainer, hooks
 from colossalai.nn.lr_scheduler import LinearWarmupLR
 from timm.models import vit_base_patch16_224
 from torchvision import datasets, transforms
@@ -79,7 +79,7 @@ import colossalai.nn as col_nn

 from colossalai.core import global_context as gpc
 from colossalai.logging import disable_existing_loggers, get_dist_logger
-from colossalai.trainer import Trainer, hooks
+from colossalai.legacy.trainer import Trainer, hooks
 from colossalai.utils import MultiTimer, get_dataloader
 from colossalai.context import ParallelMode
 from colossalai.pipeline.pipelinable import PipelinableContext
@@ -157,3 +157,4 @@ trainer.fit(train_dataloader=train_dataloader,
 ```

 We use `2` pipeline stages and the batch will be split into `4` micro batches.
+<!-- doc-test-command: echo -->
@@ -43,7 +43,7 @@ from colossalai.engine.schedule import (InterleavedPipelineSchedule,
                                         PipelineSchedule)
 from colossalai.logging import disable_existing_loggers, get_dist_logger
 from colossalai.nn.layer.wrapper import PipelineSharedModuleWrapper
-from colossalai.trainer import Trainer, hooks
+from colossalai.legacy.trainer import Trainer, hooks
 from colossalai.utils.timer import MultiTimer
 from model_zoo.gpt import GPTLMLoss
 from torch.nn import functional as F
@@ -273,3 +273,4 @@ def train():
         return_output_label=False,
     )
 ```
+<!-- doc-test-command: echo -->
@@ -36,7 +36,7 @@ from colossalai.builder import build_pipeline_model
 from colossalai.engine.schedule import (InterleavedPipelineSchedule,
                                         PipelineSchedule)
 from colossalai.logging import disable_existing_loggers, get_dist_logger
-from colossalai.trainer import Trainer, hooks
+from colossalai.legacy.trainer import Trainer, hooks
 from colossalai.utils import MultiTimer, get_dataloader
 from timm.models import vision_transformer as vit
 from torchvision import transforms
@@ -244,3 +244,4 @@ def train():
                 hooks=hook_list,
                 display_progress=True)
 ```
+<!-- doc-test-command: echo -->
@@ -74,7 +74,7 @@ from colossalai.core import global_context as gpc
 from colossalai.logging import disable_existing_loggers, get_dist_logger
 from colossalai.nn.lr_scheduler import LinearWarmupLR
 from colossalai.nn.metric import Accuracy
-from colossalai.trainer import Trainer, hooks
+from colossalai.legacy.trainer import Trainer, hooks
 ```

 - 其他模块
@@ -589,3 +589,4 @@ torchrun --standalone --nproc_per_node <NUM_GPUs> train_hybrid.py --config ./co
 # If your torch >= 1.9.0
 # python -m torch.distributed.run --standalone --nproc_per_node= <NUM_GPUs> train_hybrid.py --config ./configs/config_hybrid_parallel.py
 ```
+<!-- doc-test-command: echo -->
@@ -61,7 +61,7 @@ Trainer 的参数 `schedule` 默认值是 `None` 。在大多数情况下,除

 ```python
 from colossalai.logging import get_dist_logger
-from colossalai.trainer import Trainer, hooks
+from colossalai.legacy.trainer import Trainer, hooks

 # build components and initialize with colossalai.initialize
 ...
@@ -104,7 +104,7 @@ trainer.fit(

 ```python
 from colossalai.logging import get_dist_logger
-from colossalai.trainer import hooks
+from colossalai.legacy.trainer import hooks

 class LogMessageHook(hooks.BaseHook):

@@ -341,7 +341,7 @@ for epoch in range(gpc.config.NUM_EPOCHS):

 ```python
 from colossalai.nn.metric import Accuracy
-from colossalai.trainer import Trainer, hooks
+from colossalai.legacy.trainer import Trainer, hooks


 # create a trainer object
@@ -384,3 +384,4 @@ python -m torch.distributed.launch --nproc_per_node <num_gpus> --master_addr loc
 # with trainer
 python -m torch.distributed.launch --nproc_per_node <num_gpus> --master_addr localhost --master_port 29500 run_resnet_cifar10_with_trainer.py
 ```
+<!-- doc-test-command: echo -->
@@ -41,7 +41,7 @@ for epoch in range(num_epochs):

 #### 用 trainer 保存
 ```python
-from colossalai.trainer import Trainer, hooks
+from colossalai.legacy.trainer import Trainer, hooks
 model = ...
 engine, _, _, _ = colossalai.initialize(model=model, ...)
 trainer = Trainer(engine, ...)
@@ -61,3 +61,4 @@ model = ...
 load_checkpoint('xxx.pt', model)
 ... # train or test
 ```
+<!-- doc-test-command: echo -->
@@ -245,7 +245,7 @@ from pathlib import Path
 from colossalai.core import global_context as gpc
 from colossalai.logging import get_dist_logger
 from colossalai.utils import get_dataloader
-from colossalai.trainer import Trainer, hooks
+from colossalai.legacy.trainer import Trainer, hooks
 from colossalai.nn.lr_scheduler import LinearWarmupLR
 from timm.models import vit_base_patch16_224
 from torchvision import datasets, transforms
@@ -78,7 +78,7 @@ import colossalai.nn as col_nn

 from colossalai.core import global_context as gpc
 from colossalai.logging import disable_existing_loggers, get_dist_logger
-from colossalai.trainer import Trainer, hooks
+from colossalai.legacy.trainer import Trainer, hooks
 from colossalai.utils import MultiTimer, get_dataloader
 from colossalai.context import ParallelMode
 from colossalai.pipeline.pipelinable import PipelinableContext
@@ -156,3 +156,4 @@ trainer.fit(train_dataloader=train_dataloader,
 ```

 我们使用 `2` 个流水段,并且 batch 将被切分为 `4` 个 micro batches。
+<!-- doc-test-command: echo -->
@@ -10,9 +10,9 @@ import colossalai
 import colossalai.utils as utils
 from colossalai.context.parallel_mode import ParallelMode
 from colossalai.core import global_context as gpc
+from colossalai.legacy.trainer import Trainer, hooks
 from colossalai.logging import disable_existing_loggers, get_dist_logger
 from colossalai.nn import LinearWarmupLR
-from colossalai.trainer import Trainer, hooks
 from colossalai.utils import colo_set_process_memory_fraction, is_using_pp
 from colossalai.utils.timer import MultiTimer
 from colossalai.zero.legacy.init_ctx import ZeroInitContext
@@ -4,4 +4,4 @@ markers =
     gpu: tests which requires a single GPU
     dist: tests which are run in a multi-GPU or multi-machine environment
     experiment: tests for experimental features
-addopts = --ignore=tests/test_analyzer --ignore=tests/test_auto_parallel --ignore=tests/test_autochunk --ignore=tests/test_moe --ignore=tests/test_fx
+addopts = --ignore=tests/test_analyzer --ignore=tests/test_auto_parallel --ignore=tests/test_autochunk --ignore=tests/test_moe --ignore=tests/test_fx --ignore=tests/test_legacy
@@ -1,100 +0,0 @@
-import os
-from pathlib import Path
-
-import pytest
-import torch
-from torchvision import transforms
-from torchvision.datasets import CIFAR10
-
-import colossalai
-from colossalai.amp import AMP_TYPE
-from colossalai.context import ParallelMode
-from colossalai.core import global_context as gpc
-from colossalai.logging import get_dist_logger
-from colossalai.nn import CrossEntropyLoss
-from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR
-from colossalai.pipeline.pipelinable import PipelinableContext
-from colossalai.testing import rerun_if_address_is_in_use, skip_if_not_enough_gpus, spawn
-from colossalai.trainer import Trainer, hooks
-from colossalai.utils import get_dataloader
-
-BATCH_SIZE = 4
-NUM_EPOCHS = 60
-WARMUP_EPOCHS = 5
-CONFIG = dict(NUM_MICRO_BATCHES=2,
-              parallel=dict(pipeline=2, tensor=dict(size=2, mode='1d')),
-              fp16=dict(mode=AMP_TYPE.NAIVE),
-              gradient_accumulation=2)
-
-
-def run_trainer(rank, world_size, port):
-    colossalai.launch(config=CONFIG, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
-
-    logger = get_dist_logger()
-
-    # get logger
-    logger = get_dist_logger()
-
-    pipelinable = PipelinableContext()
-    try:
-        from titans.model.vit import vit_tiny_patch4_32
-    except ImportError:
-        logger.warning('skip the test_cifar_with_data_pipeline_tensor test because titan is not installed')
-        logger.warning('please install titan from https://github.com/hpcaitech/Titans')
-        return
-    with pipelinable:
-        model = vit_tiny_patch4_32()
-    pipelinable.to_layer_list()
-    pipelinable.policy = "uniform"
-    model = pipelinable.partition(1, gpc.pipeline_parallel_size, gpc.get_local_rank(ParallelMode.PIPELINE))
-
-    # create dataloaders
-    root = Path(os.environ['DATA'])
-    transform_train = transforms.Compose([
-        transforms.RandomCrop(32, padding=4, pad_if_needed=True),
-        transforms.AutoAugment(policy=transforms.AutoAugmentPolicy.CIFAR10),
-        transforms.ToTensor(),
-        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
-    ])
-    train_dataset = CIFAR10(root=root, train=True, download=True, transform=transform_train)
-    train_dataloader = get_dataloader(dataset=train_dataset, shuffle=True, batch_size=BATCH_SIZE, pin_memory=True)
-
-    # create loss function
-    criterion = CrossEntropyLoss(label_smoothing=0.1)
-
-    # create optimizer
-    optimizer = torch.optim.AdamW(model.parameters(), lr=0.001, weight_decay=0)
-
-    # create lr scheduler
-    lr_scheduler = CosineAnnealingWarmupLR(optimizer=optimizer, total_steps=NUM_EPOCHS, warmup_steps=WARMUP_EPOCHS)
-
-    # initialize
-    engine, train_dataloader, *_ = colossalai.initialize(model=model,
-                                                         optimizer=optimizer,
-                                                         criterion=criterion,
-                                                         train_dataloader=train_dataloader)
-
-    logger = get_dist_logger()
-
-    trainer = Trainer(engine=engine, logger=logger)
-
-    hook_list = [
-        hooks.LRSchedulerHook(lr_scheduler=lr_scheduler, by_epoch=False),
-    ]
-
-    trainer.fit(train_dataloader=train_dataloader,
-                epochs=NUM_EPOCHS,
-                max_steps=2,
-                hooks=hook_list,
-                display_progress=True)
-
-
-@pytest.mark.dist
-@skip_if_not_enough_gpus(min_gpus=8)
-@rerun_if_address_is_in_use()
-def test_hybrid_parallel():
-    spawn(run_trainer, 8)
-
-
-if __name__ == '__main__':
-    test_hybrid_parallel()
@@ -3,9 +3,9 @@ import torch

 import colossalai
 from colossalai.amp.amp_type import AMP_TYPE
+from colossalai.legacy.trainer import Trainer
 from colossalai.logging import get_dist_logger
 from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn
-from colossalai.trainer import Trainer
 from colossalai.utils import MultiTimer
 from tests.components_to_test.registry import non_distributed_component_funcs

@@ -12,9 +12,9 @@ from torchvision.models import resnet18
 import colossalai
 from colossalai.context.parallel_mode import ParallelMode
 from colossalai.core import global_context as gpc
+from colossalai.legacy.trainer import Trainer
 from colossalai.logging import get_dist_logger
 from colossalai.testing import rerun_if_address_is_in_use, spawn
-from colossalai.trainer import Trainer
 from colossalai.utils import MultiTimer, get_dataloader

 BATCH_SIZE = 4
@@ -1,25 +1,16 @@
 import os
-from typing import Callable, List, Optional, Type, Union
 import time

 import pytest
 import torch
 import torch.nn as nn
+from rpc_test_utils import parse_args, rpc_run
 from titans.dataloader.cifar10 import build_cifar
 from torchvision.models import resnet50
-from torchvision.models.resnet import BasicBlock, Bottleneck, conv1x1
 from tqdm import tqdm

-from rpc_test_utils import rpc_run, parse_args
-import colossalai
-import colossalai.nn as col_nn
-from colossalai.logging import disable_existing_loggers, get_dist_logger
-from colossalai.trainer import Trainer, hooks
-from colossalai.utils import MultiTimer, get_dataloader
-from colossalai.context import ParallelMode
-from colossalai.pipeline.pipelinable import PipelinableContext, PipelinableModel
-from colossalai.pipeline.rpc import OneFOneBPipelineEngine, ChimeraPipelineEngine
-from colossalai.pipeline.pipeline_process_group import ppg
+from colossalai.pipeline.pipelinable import PipelinableContext
+from colossalai.pipeline.rpc import OneFOneBPipelineEngine


 def flatten(x):