[tutorial] added synthetic data for hybrid parallel (#1919)

pull/1921/head
Frank Lee 2022-11-12 17:49:48 +08:00 committed by GitHub
parent 1b0dd05940
commit 3c42fdbedc
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 153 additions and 119 deletions

View File

@ -2,16 +2,23 @@
## Prepare Dataset
We use CIFAR10 dataset in this example. The dataset will be downloaded to `../data` by default.
We use CIFAR10 dataset in this example. You should invoke the `donwload_cifar10.py` in the tutorial root directory or directly run the `auto_parallel_with_resnet.py`.
The dataset will be downloaded to `colossalai/examples/tutorials/data` by default.
If you wish to use customized directory for the dataset. You can set the environment variable `DATA` via the following command.
```bash
export DATA=/path/to/data
```
You can also use synthetic data for this tutorial if you don't wish to download the `CIFAR10` dataset by adding the `-s` or `--synthetic` flag to the command.
## Run on 2*2 device mesh
```bash
# run with cifar10
colossalai run --nproc_per_node 4 train.py --config config.py
# run with synthetic dataset
colossalai run --nproc_per_node 4 train.py --config config.py -s
```

View File

@ -1,23 +1,51 @@
import os
import colossalai
import torch
import torch
from titans.dataloader.cifar10 import build_cifar
from titans.model.vit.vit import _create_vit_model
from tqdm import tqdm
import colossalai
from colossalai.context import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.logging import get_dist_logger
from colossalai.nn import CrossEntropyLoss
from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR
from colossalai.nn.optimizer import Lars, Lamb
from colossalai.utils import is_using_pp, get_dataloader
from colossalai.nn.optimizer import Lamb, Lars
from colossalai.pipeline.pipelinable import PipelinableContext
from titans.model.vit.vit import _create_vit_model
from titans.dataloader.cifar10 import build_cifar
from colossalai.utils import get_dataloader, is_using_pp
class DummyDataloader():
def __init__(self, length, batch_size):
self.length = length
self.batch_size = batch_size
def generate(self):
data = torch.rand(self.batch_size, 3, 224, 224)
label = torch.randint(low=0, high=10, size=(self.batch_size,))
return data, label
def __iter__(self):
self.step = 0
return self
def __next__(self):
if self.step < self.length:
self.step += 1
return self.generate()
else:
raise StopIteration
def __len__(self):
return self.length
def main():
# initialize distributed setting
parser = colossalai.get_default_parser()
parser.add_argument('-s', '--synthetic', action="store_true", help="whether use synthetic data")
args = parser.parse_args()
# launch from torch
@ -53,8 +81,7 @@ def main():
model = _create_vit_model(**model_kwargs)
pipelinable.to_layer_list()
pipelinable.policy = "uniform"
model = pipelinable.partition(
1, gpc.pipeline_parallel_size, gpc.get_local_rank(ParallelMode.PIPELINE))
model = pipelinable.partition(1, gpc.pipeline_parallel_size, gpc.get_local_rank(ParallelMode.PIPELINE))
else:
model = _create_vit_model(**model_kwargs)
@ -66,20 +93,21 @@ def main():
pipeline_stage = 0
else:
pipeline_stage = gpc.get_local_rank(ParallelMode.PIPELINE)
logger.info(
f"number of parameters: {total_numel} on pipeline stage {pipeline_stage}")
logger.info(f"number of parameters: {total_numel} on pipeline stage {pipeline_stage}")
# create dataloaders
root = os.environ.get('DATA', '../data/cifar10')
train_dataloader, test_dataloader = build_cifar(
gpc.config.BATCH_SIZE, root, pad_if_needed=True)
root = os.environ.get('DATA', '../data/')
if args.synthetic:
train_dataloader = DummyDataloader(length=30, batch_size=gpc.config.BATCH_SIZE)
test_dataloader = DummyDataloader(length=10, batch_size=gpc.config.BATCH_SIZE)
else:
train_dataloader, test_dataloader = build_cifar(gpc.config.BATCH_SIZE, root, pad_if_needed=True)
# create loss function
criterion = CrossEntropyLoss(label_smoothing=0.1)
# create optimizer
optimizer = Lars(model.parameters(), lr=gpc.config.LEARNING_RATE,
weight_decay=gpc.config.WEIGHT_DECAY)
optimizer = Lars(model.parameters(), lr=gpc.config.LEARNING_RATE, weight_decay=gpc.config.WEIGHT_DECAY)
# create lr scheduler
lr_scheduler = CosineAnnealingWarmupLR(optimizer=optimizer,
@ -95,11 +123,10 @@ def main():
logger.info("Engine is built", ranks=[0])
data_iter = iter(train_dataloader)
for epoch in range(gpc.config.NUM_EPOCHS):
# training
engine.train()
data_iter = iter(train_dataloader)
if gpc.get_global_rank() == 0:
description = 'Epoch {} / {}'.format(epoch, gpc.config.NUM_EPOCHS)