From 6642cebdbe7a052fcc288fa176ae373283452131 Mon Sep 17 00:00:00 2001
From: BlueRum <70618399+ht-zhou@users.noreply.github.com>
Date: Mon, 26 Dec 2022 15:22:20 +0800
Subject: [PATCH] [example] Change some training settings for diffusion (#2195)

---
Reviewer notes (free text between `---` and the diffstat; not part of the
commit message):
* train_colossalai.sh / train_ddp.sh `export` the *_OFFLINE variables. A bare
  `VAR=1` line only sets a shell variable and is never seen by the `python`
  child process. With the export in place offline mode really is on, so the
  required Hugging Face assets must already be cached locally.
* The config passed to `-b` is a relative path (`configs/...`), resolved from
  the working directory exactly like the command shown in README.md; the
  previous `/configs/...` pointed at the filesystem root.
* NOTE(review): this patch was restored from a whitespace-flattened copy.
  Indentation and blank context lines were reconstructed to satisfy the hunk
  line counts - confirm context against the tree (or apply with
  `git apply --ignore-whitespace`). The post-image blob ids on the `index`
  lines were not recomputed after the wording/script fixes.

 examples/images/diffusion/README.md           |  8 ++++---
 .../diffusion/configs/train_colossalai.yaml   |  9 +++++---
 .../images/diffusion/configs/train_ddp.yaml   | 22 ++++++++-----------
 examples/images/diffusion/train.sh            |  5 -----
 examples/images/diffusion/train_colossalai.sh |  5 +++++
 examples/images/diffusion/train_ddp.sh        |  5 +++++
 6 files changed, 30 insertions(+), 24 deletions(-)
 delete mode 100755 examples/images/diffusion/train.sh
 create mode 100755 examples/images/diffusion/train_colossalai.sh
 create mode 100644 examples/images/diffusion/train_ddp.sh

diff --git a/examples/images/diffusion/README.md b/examples/images/diffusion/README.md
index eb899c563..324337426 100644
--- a/examples/images/diffusion/README.md
+++ b/examples/images/diffusion/README.md
@@ -87,14 +87,15 @@ you should the change the `data.file_path` in the `config/train_colossalai.yaml`
 
 ## Training
 
-We provide the script `train.sh` to run the training task , and two Stategy in `configs`:`train_colossalai.yaml` and `train_ddp.yaml`
+We provide the script `train_colossalai.sh` to run the training task with colossalai,
+and you can also use `train_ddp.sh` to run the training task with ddp for comparison.
 
-For example, you can run the training from colossalai by
+In `train_colossalai.sh` the main command is:
 ```
 python main.py --logdir /tmp/ -t -b configs/train_colossalai.yaml
 ```
 
-- you can change the `--logdir` the save the log information and the last checkpoint
+- you can change the `--logdir` to decide where to save the log information and the last checkpoint.
 
 ### Training config
 
@@ -155,6 +156,7 @@ optional arguments:
   --config CONFIG       path to config which constructs model
   --ckpt CKPT           path to checkpoint of model
   --seed SEED           the seed (for reproducible sampling)
+  --use_int8            whether to use quantization method
   --precision {full,autocast}
                         evaluate at this precision
 ```
diff --git a/examples/images/diffusion/configs/train_colossalai.yaml b/examples/images/diffusion/configs/train_colossalai.yaml
index 873308f8c..e8df63bf6 100644
--- a/examples/images/diffusion/configs/train_colossalai.yaml
+++ b/examples/images/diffusion/configs/train_colossalai.yaml
@@ -80,19 +80,22 @@ model:
 data:
   target: main.DataModuleFromConfig
   params:
-    batch_size: 64
+    batch_size: 128
     wrap: False
+    # num_workers should be 2 * batch_size, and total num less than 1024
+    # e.g. if use 8 devices, no more than 128
+    num_workers: 128
     train:
       target: ldm.data.base.Txt2ImgIterableBaseDataset
       params:
-        file_path: "/data/scratch/diffuser/laion_part0/"
+        file_path: # YOUR DATASET_PATH
         world_size: 1
         rank: 0
 
 lightning:
   trainer:
     accelerator: 'gpu'
-    devices: 4
+    devices: 8
     log_gpu_memory: all
     max_epochs: 2
     precision: 16
diff --git a/examples/images/diffusion/configs/train_ddp.yaml b/examples/images/diffusion/configs/train_ddp.yaml
index 4308998f4..a63df887e 100644
--- a/examples/images/diffusion/configs/train_ddp.yaml
+++ b/examples/images/diffusion/configs/train_ddp.yaml
@@ -80,25 +80,21 @@ model:
 data:
   target: main.DataModuleFromConfig
   params:
-    batch_size: 16
-    num_workers: 4
+    batch_size: 128
+    # num_workers should be 2 * batch_size, and the total num less than 1024
+    # e.g. if use 8 devices, no more than 128
+    num_workers: 128
     train:
-      target: ldm.data.teyvat.hf_dataset
+      target: ldm.data.base.Txt2ImgIterableBaseDataset
       params:
-        path: Fazzie/Teyvat
-        image_transforms:
-        - target: torchvision.transforms.Resize
-          params:
-            size: 512
-        - target: torchvision.transforms.RandomCrop
-          params:
-            size: 512
-        - target: torchvision.transforms.RandomHorizontalFlip
+        file_path: # YOUR DATASET_PATH
+        world_size: 1
+        rank: 0
 
 lightning:
   trainer:
     accelerator: 'gpu'
-    devices: 2
+    devices: 8
     log_gpu_memory: all
     max_epochs: 2
     precision: 16
diff --git a/examples/images/diffusion/train.sh b/examples/images/diffusion/train.sh
deleted file mode 100755
index ed9ae4b75..000000000
--- a/examples/images/diffusion/train.sh
+++ /dev/null
@@ -1,5 +0,0 @@
-# HF_DATASETS_OFFLINE=1
-# TRANSFORMERS_OFFLINE=1
-# DIFFUSERS_OFFLINE=1
-
-python main.py --logdir /tmp/ -t -b configs/Teyvat/train_colossalai_teyvat.yaml
diff --git a/examples/images/diffusion/train_colossalai.sh b/examples/images/diffusion/train_colossalai.sh
new file mode 100755
index 000000000..4223a6941
--- /dev/null
+++ b/examples/images/diffusion/train_colossalai.sh
@@ -0,0 +1,5 @@
+export HF_DATASETS_OFFLINE=1
+export TRANSFORMERS_OFFLINE=1
+export DIFFUSERS_OFFLINE=1
+
+python main.py --logdir /tmp -t -b configs/train_colossalai.yaml
diff --git a/examples/images/diffusion/train_ddp.sh b/examples/images/diffusion/train_ddp.sh
new file mode 100644
index 000000000..78fe76548
--- /dev/null
+++ b/examples/images/diffusion/train_ddp.sh
@@ -0,0 +1,5 @@
+export HF_DATASETS_OFFLINE=1
+export TRANSFORMERS_OFFLINE=1
+export DIFFUSERS_OFFLINE=1
+
+python main.py --logdir /tmp -t -b configs/train_ddp.yaml