From 6642cebdbe7a052fcc288fa176ae373283452131 Mon Sep 17 00:00:00 2001
From: BlueRum <70618399+ht-zhou@users.noreply.github.com>
Date: Mon, 26 Dec 2022 15:22:20 +0800
Subject: [PATCH] [example] Change some training settings for diffusion (#2195)

---
 examples/images/diffusion/README.md           |  8 ++++---
 .../diffusion/configs/train_colossalai.yaml   |  9 +++++---
 .../images/diffusion/configs/train_ddp.yaml   | 22 ++++++++-----------
 examples/images/diffusion/train.sh            |  5 -----
 examples/images/diffusion/train_colossalai.sh |  5 +++++
 examples/images/diffusion/train_ddp.sh        |  5 +++++
 6 files changed, 30 insertions(+), 24 deletions(-)
 delete mode 100755 examples/images/diffusion/train.sh
 create mode 100755 examples/images/diffusion/train_colossalai.sh
 create mode 100644 examples/images/diffusion/train_ddp.sh

diff --git a/examples/images/diffusion/README.md b/examples/images/diffusion/README.md
index eb899c563..324337426 100644
--- a/examples/images/diffusion/README.md
+++ b/examples/images/diffusion/README.md
@@ -87,14 +87,15 @@ you should the change the `data.file_path` in the `config/train_colossalai.yaml`
 
 ## Training
 
-We provide the script `train.sh` to run the training task , and two Stategy in `configs`:`train_colossalai.yaml` and `train_ddp.yaml`
+We provide the script `train_colossalai.sh` to run the training task with Colossal-AI;
+you can also use `train_ddp.sh` to run the same training task with DDP for comparison.
 
-For example, you can run the training from colossalai by
+In `train_colossalai.sh` the main command is:
 ```
 python main.py --logdir /tmp/ -t -b configs/train_colossalai.yaml
 ```
 
-- you can change the `--logdir` the save the log information and the last checkpoint
+- you can change the `--logdir` to decide where to save the log information and the last checkpoint.
 
 ### Training config
 
@@ -155,6 +156,7 @@ optional arguments:
   --config CONFIG       path to config which constructs model
   --ckpt CKPT           path to checkpoint of model
   --seed SEED           the seed (for reproducible sampling)
+  --use_int8            whether to use quantization method
   --precision {full,autocast}
                         evaluate at this precision
 ```
diff --git a/examples/images/diffusion/configs/train_colossalai.yaml b/examples/images/diffusion/configs/train_colossalai.yaml
index 873308f8c..e8df63bf6 100644
--- a/examples/images/diffusion/configs/train_colossalai.yaml
+++ b/examples/images/diffusion/configs/train_colossalai.yaml
@@ -80,19 +80,22 @@ model:
 data:
   target: main.DataModuleFromConfig
   params:
-    batch_size: 64
+    batch_size: 128
     wrap: False
+    # num_workers should be 2 * batch_size, and the total number should be less than 1024
+    # e.g. if using 8 devices, use no more than 128
+    num_workers: 128
     train:
       target: ldm.data.base.Txt2ImgIterableBaseDataset
       params:
-        file_path: "/data/scratch/diffuser/laion_part0/"
+        file_path: # YOUR DATASET_PATH
         world_size: 1
         rank: 0
 
 lightning:
   trainer:
     accelerator: 'gpu'
-    devices: 4
+    devices: 8
     log_gpu_memory: all
     max_epochs: 2
     precision: 16
diff --git a/examples/images/diffusion/configs/train_ddp.yaml b/examples/images/diffusion/configs/train_ddp.yaml
index 4308998f4..a63df887e 100644
--- a/examples/images/diffusion/configs/train_ddp.yaml
+++ b/examples/images/diffusion/configs/train_ddp.yaml
@@ -80,25 +80,21 @@ model:
 data:
   target: main.DataModuleFromConfig
   params:
-    batch_size: 16
-    num_workers: 4
+    batch_size: 128
+    # num_workers should be 2 * batch_size, and the total number should be less than 1024
+    # e.g. if using 8 devices, use no more than 128
+    num_workers: 128
     train:
-      target: ldm.data.teyvat.hf_dataset
+      target: ldm.data.base.Txt2ImgIterableBaseDataset
       params:
-        path: Fazzie/Teyvat
-        image_transforms:
-        - target: torchvision.transforms.Resize
-          params:
-            size: 512
-        - target: torchvision.transforms.RandomCrop
-          params:
-            size: 512
-        - target: torchvision.transforms.RandomHorizontalFlip
+        file_path: # YOUR DATAPATH
+        world_size: 1
+        rank: 0
 
 lightning:
   trainer:
     accelerator: 'gpu' 
-    devices: 2
+    devices: 8
     log_gpu_memory: all
     max_epochs: 2
     precision: 16
diff --git a/examples/images/diffusion/train.sh b/examples/images/diffusion/train.sh
deleted file mode 100755
index ed9ae4b75..000000000
--- a/examples/images/diffusion/train.sh
+++ /dev/null
@@ -1,5 +0,0 @@
-# HF_DATASETS_OFFLINE=1
-# TRANSFORMERS_OFFLINE=1
-# DIFFUSERS_OFFLINE=1
-
-python main.py --logdir /tmp/ -t -b configs/Teyvat/train_colossalai_teyvat.yaml
diff --git a/examples/images/diffusion/train_colossalai.sh b/examples/images/diffusion/train_colossalai.sh
new file mode 100755
index 000000000..4223a6941
--- /dev/null
+++ b/examples/images/diffusion/train_colossalai.sh
@@ -0,0 +1,5 @@
+# Offline-mode switches for the Hugging Face libraries; exported so the python child process inherits them.
+export HF_DATASETS_OFFLINE=1 TRANSFORMERS_OFFLINE=1 DIFFUSERS_OFFLINE=1
+
+# The config path is relative: run this script from examples/images/diffusion.
+python main.py --logdir /tmp -t -b configs/train_colossalai.yaml
diff --git a/examples/images/diffusion/train_ddp.sh b/examples/images/diffusion/train_ddp.sh
new file mode 100644
index 000000000..78fe76548
--- /dev/null
+++ b/examples/images/diffusion/train_ddp.sh
@@ -0,0 +1,5 @@
+# Offline-mode switches for the Hugging Face libraries; exported so the python child process inherits them.
+export HF_DATASETS_OFFLINE=1 TRANSFORMERS_OFFLINE=1 DIFFUSERS_OFFLINE=1
+
+# The config path is relative: run this script from examples/images/diffusion.
+python main.py --logdir /tmp -t -b configs/train_ddp.yaml