mirror of https://github.com/hpcaitech/ColossalAI

Polish Code

parent 152239bbfa
commit de84c0311a
@@ -1,6 +1,5 @@
 model:
   base_learning_rate: 1.0e-4
-  target: ldm.models.diffusion.ddpm.LatentDiffusion
   params:
     parameterization: "v"
     linear_start: 0.00085

@@ -19,50 +18,42 @@ model:
     use_ema: False # we set this to false because this is an inference only config

     unet_config:
-      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
-      params:
       use_checkpoint: True
       use_fp16: True
       image_size: 32 # unused
       in_channels: 4
       out_channels: 4
       model_channels: 320
       attention_resolutions: [ 4, 2, 1 ]
       num_res_blocks: 2
       channel_mult: [ 1, 2, 4, 4 ]
       num_head_channels: 64 # need to fix for flash-attn
       use_spatial_transformer: True
       use_linear_in_transformer: True
       transformer_depth: 1
       context_dim: 1024
       legacy: False

     first_stage_config:
-      target: ldm.models.autoencoder.AutoencoderKL
-      params:
       embed_dim: 4
       monitor: val/rec_loss
       ddconfig:
         #attn_type: "vanilla-xformers"
         double_z: true
         z_channels: 4
         resolution: 256
         in_channels: 3
         out_ch: 3
         ch: 128
         ch_mult:
         - 1
         - 2
         - 4
         - 4
         num_res_blocks: 2
         attn_resolutions: []
         dropout: 0.0
-      lossconfig:
-        target: torch.nn.Identity

     cond_stage_config:
-      target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder
-      params:
       freeze: True
       layer: "penultimate"
@@ -1,6 +1,5 @@
 model:
   base_learning_rate: 1.0e-4
-  target: ldm.models.diffusion.ddpm.LatentDiffusion
   params:
     linear_start: 0.00085
     linear_end: 0.0120

@@ -18,50 +17,42 @@ model:
     use_ema: False # we set this to false because this is an inference only config

     unet_config:
-      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
-      params:
       use_checkpoint: True
       use_fp16: True
       image_size: 32 # unused
       in_channels: 4
       out_channels: 4
       model_channels: 320
       attention_resolutions: [ 4, 2, 1 ]
       num_res_blocks: 2
       channel_mult: [ 1, 2, 4, 4 ]
       num_head_channels: 64 # need to fix for flash-attn
       use_spatial_transformer: True
       use_linear_in_transformer: True
       transformer_depth: 1
       context_dim: 1024
       legacy: False

     first_stage_config:
-      target: ldm.models.autoencoder.AutoencoderKL
-      params:
       embed_dim: 4
       monitor: val/rec_loss
       ddconfig:
         #attn_type: "vanilla-xformers"
         double_z: true
         z_channels: 4
         resolution: 256
         in_channels: 3
         out_ch: 3
         ch: 128
         ch_mult:
         - 1
         - 2
         - 4
         - 4
         num_res_blocks: 2
         attn_resolutions: []
         dropout: 0.0
-      lossconfig:
-        target: torch.nn.Identity

     cond_stage_config:
-      target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder
-      params:
       freeze: True
       layer: "penultimate"
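The linear_start/linear_end pair in the configs above parameterizes the diffusion beta schedule. As a minimal sketch of how ldm's "linear" schedule is conventionally computed from these two values (the helper itself is not part of this diff, so treat the exact body as an assumption):

    import torch

    def linear_betas(linear_start=0.00085, linear_end=0.0120, n_timestep=1000):
        # interpolate in sqrt space, then square -- yields betas rising
        # smoothly from linear_start to linear_end over n_timestep steps
        return torch.linspace(linear_start ** 0.5, linear_end ** 0.5,
                              n_timestep, dtype=torch.float64) ** 2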
@@ -19,106 +19,97 @@ model:
     use_ema: False

     unet_config:
-      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
-      params:
       use_checkpoint: True
       image_size: 32 # unused
       in_channels: 9
       out_channels: 4
       model_channels: 320
       attention_resolutions: [ 4, 2, 1 ]
       num_res_blocks: 2
       channel_mult: [ 1, 2, 4, 4 ]
       num_head_channels: 64 # need to fix for flash-attn
       use_spatial_transformer: True
       use_linear_in_transformer: True
       transformer_depth: 1
       context_dim: 1024
       legacy: False

     first_stage_config:
-      target: ldm.models.autoencoder.AutoencoderKL
-      params:
       embed_dim: 4
       monitor: val/rec_loss
       ddconfig:
         #attn_type: "vanilla-xformers"
         double_z: true
         z_channels: 4
         resolution: 256
         in_channels: 3
         out_ch: 3
         ch: 128
         ch_mult:
         - 1
         - 2
         - 4
         - 4
         num_res_blocks: 2
         attn_resolutions: [ ]
         dropout: 0.0
-      lossconfig:
-        target: torch.nn.Identity

     cond_stage_config:
-      target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder
-      params:
       freeze: True
       layer: "penultimate"

 data:
-  target: ldm.data.laion.WebDataModuleFromConfig
-  params:
   tar_base: null # for concat as in LAION-A
   p_unsafe_threshold: 0.1
   filter_word_list: "data/filters.yaml"
   max_pwatermark: 0.45
   batch_size: 8
   num_workers: 6
   multinode: True
   min_size: 512
   train:
     shards:
       - "pipe:aws s3 cp s3://stability-aws/laion-a-native/part-0/{00000..18699}.tar -"
       - "pipe:aws s3 cp s3://stability-aws/laion-a-native/part-1/{00000..18699}.tar -"
       - "pipe:aws s3 cp s3://stability-aws/laion-a-native/part-2/{00000..18699}.tar -"
       - "pipe:aws s3 cp s3://stability-aws/laion-a-native/part-3/{00000..18699}.tar -"
       - "pipe:aws s3 cp s3://stability-aws/laion-a-native/part-4/{00000..18699}.tar -" #{00000-94333}.tar"
     shuffle: 10000
     image_key: jpg
     image_transforms:
     - target: torchvision.transforms.Resize
       params:
         size: 512
         interpolation: 3
     - target: torchvision.transforms.RandomCrop
       params:
         size: 512
     postprocess:
       target: ldm.data.laion.AddMask
       params:
         mode: "512train-large"
         p_drop: 0.25
   # NOTE use enough shards to avoid empty validation loops in workers
   validation:
     shards:
       - "pipe:aws s3 cp s3://deep-floyd-s3/datasets/laion_cleaned-part5/{93001..94333}.tar - "
     shuffle: 0
     image_key: jpg
     image_transforms:
     - target: torchvision.transforms.Resize
       params:
         size: 512
         interpolation: 3
     - target: torchvision.transforms.CenterCrop
       params:
         size: 512
     postprocess:
       target: ldm.data.laion.AddMask
       params:
         mode: "512train-large"
         p_drop: 0.25

 lightning:
   find_unused_parameters: True

@@ -132,8 +123,6 @@ lightning:
         every_n_train_steps: 10000

     image_logger:
-      target: main.ImageLogger
-      params:
       enable_autocast: False
       disabled: False
       batch_frequency: 1000
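The UNet's in_channels of 9 in the inpainting config above is conventionally the 4-channel noisy latent concatenated with the 4-channel encoded masked image and a 1-channel mask (the AddMask postprocess supplies the mask). A quick sketch of that assumed layout:

    import torch

    z = torch.randn(1, 4, 64, 64)         # noisy latent
    masked_z = torch.randn(1, 4, 64, 64)  # VAE-encoded masked image
    mask = torch.zeros(1, 1, 64, 64)      # downsampled inpainting mask
    unet_input = torch.cat([z, masked_z, mask], dim=1)  # (1, 9, 64, 64)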
@@ -19,54 +19,45 @@ model:
     use_ema: False

     depth_stage_config:
-      target: ldm.modules.midas.api.MiDaSInference
-      params:
       model_type: "dpt_hybrid"

     unet_config:
-      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
-      params:
       use_checkpoint: True
       image_size: 32 # unused
       in_channels: 5
       out_channels: 4
       model_channels: 320
       attention_resolutions: [ 4, 2, 1 ]
       num_res_blocks: 2
       channel_mult: [ 1, 2, 4, 4 ]
       num_head_channels: 64 # need to fix for flash-attn
       use_spatial_transformer: True
       use_linear_in_transformer: True
       transformer_depth: 1
       context_dim: 1024
       legacy: False

     first_stage_config:
-      target: ldm.models.autoencoder.AutoencoderKL
-      params:
       embed_dim: 4
       monitor: val/rec_loss
       ddconfig:
         #attn_type: "vanilla-xformers"
         double_z: true
         z_channels: 4
         resolution: 256
         in_channels: 3
         out_ch: 3
         ch: 128
         ch_mult:
         - 1
         - 2
         - 4
         - 4
         num_res_blocks: 2
         attn_resolutions: [ ]
         dropout: 0.0
-      lossconfig:
-        target: torch.nn.Identity

     cond_stage_config:
-      target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder
-      params:
       freeze: True
       layer: "penultimate"
@@ -20,56 +20,47 @@ model:
     use_ema: False

     low_scale_config:
-      target: ldm.modules.diffusionmodules.upscaling.ImageConcatWithNoiseAugmentation
-      params:
       noise_schedule_config: # image space
         linear_start: 0.0001
         linear_end: 0.02
       max_noise_level: 350

     unet_config:
-      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
-      params:
       use_checkpoint: True
       num_classes: 1000 # timesteps for noise conditioning (here constant, just need one)
       image_size: 128
       in_channels: 7
       out_channels: 4
       model_channels: 256
       attention_resolutions: [ 2,4,8]
       num_res_blocks: 2
       channel_mult: [ 1, 2, 2, 4]
       disable_self_attentions: [True, True, True, False]
       disable_middle_self_attn: False
       num_heads: 8
       use_spatial_transformer: True
       transformer_depth: 1
       context_dim: 1024
       legacy: False
       use_linear_in_transformer: True

     first_stage_config:
-      target: ldm.models.autoencoder.AutoencoderKL
-      params:
       embed_dim: 4
       ddconfig:
         # attn_type: "vanilla-xformers" this model needs efficient attention to be feasible on HR data, also the decoder seems to break in half precision (UNet is fine though)
         double_z: True
         z_channels: 4
         resolution: 256
         in_channels: 3
         out_ch: 3
         ch: 128
         ch_mult: [ 1,2,4 ] # num_down = len(ch_mult)-1
         num_res_blocks: 2
         attn_resolutions: [ ]
         dropout: 0.0
       lossconfig:
         target: torch.nn.Identity

     cond_stage_config:
-      target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder
-      params:
       freeze: True
       layer: "penultimate"
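max_noise_level caps the image-space noise the upscaler's low-resolution input receives before it is concatenated onto the UNet input; the sampled level is also fed to the model through the num_classes conditioning above. A rough sketch of the assumed behavior of ImageConcatWithNoiseAugmentation (its implementation is not part of this diff):

    import torch

    def augment_low_res(low_res, sqrt_ac, sqrt_omac, max_noise_level=350):
        # sqrt_ac / sqrt_omac: 1-D tensors of sqrt(alpha_cumprod) and
        # sqrt(1 - alpha_cumprod) from the image-space noise schedule
        b = low_res.shape[0]
        t = torch.randint(0, max_noise_level, (b,), device=low_res.device)
        noise = torch.randn_like(low_res)
        noisy = (sqrt_ac[t].view(b, 1, 1, 1) * low_res
                 + sqrt_omac[t].view(b, 1, 1, 1) * noise)
        return noisy, t  # t doubles as the class label for conditioning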
@@ -1,6 +1,5 @@
 model:
   base_learning_rate: 1.0e-4
-  target: ldm.models.diffusion.ddpm.LatentDiffusion
   params:
     parameterization: "v"
     linear_start: 0.00085

@@ -20,81 +19,70 @@ model:
     use_ema: False

     scheduler_config: # 10000 warmup steps
-      target: ldm.lr_scheduler.LambdaLinearScheduler
-      params:
       warm_up_steps: [ 1 ] # NOTE for resuming. use 10000 if starting from scratch
       cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases
       f_start: [ 1.e-6 ]
       f_max: [ 1.e-4 ]
       f_min: [ 1.e-10 ]

     unet_config:
-      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
-      params:
       use_checkpoint: True
       use_fp16: True
       image_size: 32 # unused
       in_channels: 4
       out_channels: 4
       model_channels: 320
       attention_resolutions: [ 4, 2, 1 ]
       num_res_blocks: 2
       channel_mult: [ 1, 2, 4, 4 ]
       num_head_channels: 64 # need to fix for flash-attn
       use_spatial_transformer: True
       use_linear_in_transformer: True
       transformer_depth: 1
       context_dim: 1024
       legacy: False

     first_stage_config:
-      target: ldm.models.autoencoder.AutoencoderKL
-      params:
       embed_dim: 4
       monitor: val/rec_loss
       ddconfig:
         #attn_type: "vanilla-xformers"
         double_z: true
         z_channels: 4
         resolution: 256
         in_channels: 3
         out_ch: 3
         ch: 128
         ch_mult:
         - 1
         - 2
         - 4
         - 4
         num_res_blocks: 2
         attn_resolutions: []
         dropout: 0.0
-      lossconfig:
-        target: torch.nn.Identity

     cond_stage_config:
-      target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder
-      params:
       freeze: True
       layer: "penultimate"

 data:
-  target: main.DataModuleFromConfig
-  params:
   batch_size: 16
   num_workers: 4
   train:
     target: ldm.data.teyvat.hf_dataset
     params:
       path: Fazzie/Teyvat
       image_transforms:
       - target: torchvision.transforms.Resize
         params:
           size: 512
       - target: torchvision.transforms.RandomCrop
         params:
           size: 512
       - target: torchvision.transforms.RandomHorizontalFlip

 lightning:
   trainer:

@@ -105,13 +93,11 @@ lightning:
     precision: 16
     auto_select_gpus: False
     strategy:
-      target: strategies.ColossalAIStrategy
-      params:
       use_chunk: True
       enable_distributed_storage: True
       placement_policy: cuda
       force_outputs_fp32: true
       min_chunk_size: 64

     log_every_n_steps: 2
     logger: True

@@ -120,9 +106,7 @@ lightning:

     logger_config:
       wandb:
-        target: loggers.WandbLogger
-        params:
         name: nowname
         save_dir: "/tmp/diff_log/"
         offline: opt.debug
         id: nowname
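The scheduler_config above drives a LambdaLR through LambdaLinearScheduler's schedule(step) multiplier (that is exactly how ddpm.py wires it up later in this commit). A minimal sketch of such a warmup-then-linear-interpolation lambda, with the shape assumed from the warm_up_steps/f_start/f_max/f_min/cycle_lengths keys:

    from torch.optim.lr_scheduler import LambdaLR

    def make_schedule(warm_up_steps=10000, f_start=1.e-6, f_max=1.e-4,
                      f_min=1.e-10, cycle_length=10000000000000):
        def schedule(step):
            if step < warm_up_steps:
                # linear warmup from f_start to f_max
                return f_start + (f_max - f_start) * step / max(warm_up_steps, 1)
            # linear interpolation from f_max toward f_min over the cycle
            frac = (step - warm_up_steps) / max(cycle_length - warm_up_steps, 1)
            return f_max + (f_min - f_max) * frac
        return schedule

    # usage: LambdaLR(optimizer, lr_lambda=make_schedule())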
@@ -1,6 +1,5 @@
 model:
   base_learning_rate: 1.0e-4
-  target: ldm.models.diffusion.ddpm.LatentDiffusion
   params:
     parameterization: "v"
     linear_start: 0.00085

@@ -19,95 +18,83 @@ model:
     use_ema: False # we set this to false because this is an inference only config

     scheduler_config: # 10000 warmup steps
-      target: ldm.lr_scheduler.LambdaLinearScheduler
-      params:
       warm_up_steps: [ 1 ] # NOTE for resuming. use 10000 if starting from scratch
       cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases
       f_start: [ 1.e-6 ]
       f_max: [ 1.e-4 ]
       f_min: [ 1.e-10 ]

     unet_config:
-      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
-      params:
       use_checkpoint: True
       use_fp16: True
       image_size: 32 # unused
       in_channels: 4
       out_channels: 4
       model_channels: 320
       attention_resolutions: [ 4, 2, 1 ]
       num_res_blocks: 2
       channel_mult: [ 1, 2, 4, 4 ]
       num_head_channels: 64 # need to fix for flash-attn
       use_spatial_transformer: True
       use_linear_in_transformer: True
       transformer_depth: 1
       context_dim: 1024
       legacy: False

     first_stage_config:
-      target: ldm.models.autoencoder.AutoencoderKL
-      params:
       embed_dim: 4
       monitor: val/rec_loss
       ddconfig:
         #attn_type: "vanilla-xformers"
         double_z: true
         z_channels: 4
         resolution: 256
         in_channels: 3
         out_ch: 3
         ch: 128
         ch_mult:
         - 1
         - 2
         - 4
         - 4
         num_res_blocks: 2
         attn_resolutions: []
         dropout: 0.0
-      lossconfig:
-        target: torch.nn.Identity

     cond_stage_config:
-      target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder
-      params:
       freeze: True
       layer: "penultimate"

 data:
-  target: main.DataModuleFromConfig
-  params:
   batch_size: 128
   wrap: False
   # num_workers should be 2 * batch_size, and total num less than 1024
   # e.g. if use 8 devices, no more than 128
   num_workers: 128
   train:
     target: ldm.data.base.Txt2ImgIterableBaseDataset
     params:
       file_path: # YOUR DATASET_PATH
       world_size: 1
       rank: 0

 lightning:
   trainer:
     accelerator: 'gpu'
-    devices: 8
+    devices: 2
     log_gpu_memory: all
     max_epochs: 2
     precision: 16
     auto_select_gpus: False
     strategy:
-      target: strategies.ColossalAIStrategy
-      params:
       use_chunk: True
       enable_distributed_storage: True
       placement_policy: cuda
       force_outputs_fp32: true
       min_chunk_size: 64

     log_every_n_steps: 2
     logger: True

@@ -116,9 +103,7 @@ lightning:

     logger_config:
       wandb:
-        target: loggers.WandbLogger
-        params:
         name: nowname
         save_dir: "/tmp/diff_log/"
         offline: opt.debug
         id: nowname
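For reference, the strategy block above maps directly onto the ColossalAIStrategy constructor that the new main.py (further down) imports from lightning.pytorch.strategies; the argument names come from the config itself, while the per-argument comments are my reading of the knobs, not documentation from this diff:

    from lightning.pytorch.strategies import ColossalAIStrategy

    strategy = ColossalAIStrategy(
        use_chunk=True,                   # manage parameters in fused chunks
        enable_distributed_storage=True,  # shard states across ranks (ZeRO-style)
        placement_policy="cuda",          # keep shards on GPU rather than offloading
        force_outputs_fp32=True,
        min_chunk_size=64,
    )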
@@ -1,6 +1,5 @@
 model:
   base_learning_rate: 1.0e-4
-  target: ldm.models.diffusion.ddpm.LatentDiffusion
   params:
     parameterization: "v"
     linear_start: 0.00085

@@ -19,82 +18,71 @@ model:
     use_ema: False # we set this to false because this is an inference only config

     scheduler_config: # 10000 warmup steps
-      target: ldm.lr_scheduler.LambdaLinearScheduler
-      params:
       warm_up_steps: [ 1 ] # NOTE for resuming. use 10000 if starting from scratch
       cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases
       f_start: [ 1.e-6 ]
       f_max: [ 1.e-4 ]
       f_min: [ 1.e-10 ]

     unet_config:
-      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
-      params:
       use_checkpoint: True
       use_fp16: True
       image_size: 32 # unused
       in_channels: 4
       out_channels: 4
       model_channels: 320
       attention_resolutions: [ 4, 2, 1 ]
       num_res_blocks: 2
       channel_mult: [ 1, 2, 4, 4 ]
       num_head_channels: 64 # need to fix for flash-attn
       use_spatial_transformer: True
       use_linear_in_transformer: True
       transformer_depth: 1
       context_dim: 1024
       legacy: False

     first_stage_config:
-      target: ldm.models.autoencoder.AutoencoderKL
-      params:
       embed_dim: 4
       monitor: val/rec_loss
       ddconfig:
         #attn_type: "vanilla-xformers"
         double_z: true
         z_channels: 4
         resolution: 256
         in_channels: 3
         out_ch: 3
         ch: 128
         ch_mult:
         - 1
         - 2
         - 4
         - 4
         num_res_blocks: 2
         attn_resolutions: []
         dropout: 0.0
-      lossconfig:
-        target: torch.nn.Identity

     cond_stage_config:
-      target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder
-      params:
       freeze: True
       layer: "penultimate"

 data:
-  target: main.DataModuleFromConfig
-  params:
   batch_size: 4
   num_workers: 4
   train:
     target: ldm.data.cifar10.hf_dataset
     params:
       name: cifar10
       image_transforms:
       - target: torchvision.transforms.Resize
         params:
           size: 512
           interpolation: 3
       - target: torchvision.transforms.RandomCrop
         params:
           size: 512
       - target: torchvision.transforms.RandomHorizontalFlip

 lightning:
   trainer:

@@ -105,13 +93,11 @@ lightning:
     precision: 16
     auto_select_gpus: False
     strategy:
-      target: strategies.ColossalAIStrategy
-      params:
       use_chunk: True
       enable_distributed_storage: True
       placement_policy: cuda
       force_outputs_fp32: true
       min_chunk_size: 64

     log_every_n_steps: 2
     logger: True

@@ -120,9 +106,7 @@ lightning:

     logger_config:
       wandb:
-        target: loggers.WandbLogger
-        params:
         name: nowname
         save_dir: "/tmp/diff_log/"
         offline: opt.debug
         id: nowname
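The image_transforms list above composes, in plain torchvision terms, to roughly the following (interpolation: 3 is bicubic in torchvision's integer convention):

    from torchvision import transforms

    preprocess = transforms.Compose([
        transforms.Resize(512, interpolation=transforms.InterpolationMode.BICUBIC),
        transforms.RandomCrop(512),
        transforms.RandomHorizontalFlip(),
    ])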
@@ -1,6 +1,5 @@
 model:
   base_learning_rate: 1.0e-4
-  target: ldm.models.diffusion.ddpm.LatentDiffusion
   params:
     parameterization: "v"
     linear_start: 0.00085

@@ -19,77 +18,65 @@ model:
     use_ema: False # we set this to false because this is an inference only config

     scheduler_config: # 10000 warmup steps
-      target: ldm.lr_scheduler.LambdaLinearScheduler
-      params:
       warm_up_steps: [ 1 ] # NOTE for resuming. use 10000 if starting from scratch
       cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases
       f_start: [ 1.e-6 ]
       f_max: [ 1.e-4 ]
       f_min: [ 1.e-10 ]

     unet_config:
-      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
-      params:
       use_checkpoint: True
       use_fp16: True
       image_size: 32 # unused
       in_channels: 4
       out_channels: 4
       model_channels: 320
       attention_resolutions: [ 4, 2, 1 ]
       num_res_blocks: 2
       channel_mult: [ 1, 2, 4, 4 ]
       num_head_channels: 64 # need to fix for flash-attn
       use_spatial_transformer: True
       use_linear_in_transformer: True
       transformer_depth: 1
       context_dim: 1024
       legacy: False

     first_stage_config:
-      target: ldm.models.autoencoder.AutoencoderKL
-      params:
       embed_dim: 4
       monitor: val/rec_loss
       ddconfig:
         #attn_type: "vanilla-xformers"
         double_z: true
         z_channels: 4
         resolution: 256
         in_channels: 3
         out_ch: 3
         ch: 128
         ch_mult:
         - 1
         - 2
         - 4
         - 4
         num_res_blocks: 2
         attn_resolutions: []
         dropout: 0.0
-      lossconfig:
-        target: torch.nn.Identity

     cond_stage_config:
-      target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder
-      params:
       freeze: True
       layer: "penultimate"

 data:
-  target: main.DataModuleFromConfig
-  params:
   batch_size: 128
   # num_workers should be 2 * batch_size, and the total num less than 1024
   # e.g. if use 8 devices, no more than 128
   num_workers: 128
   train:
     target: ldm.data.base.Txt2ImgIterableBaseDataset
     params:
       file_path: # YOUR DATAPATH
       world_size: 1
       rank: 0

 lightning:
   trainer:

@@ -100,9 +87,7 @@ lightning:
     precision: 16
     auto_select_gpus: False
     strategy:
-      target: strategies.DDPStrategy
-      params:
-        find_unused_parameters: False
+      find_unused_parameters: False
     log_every_n_steps: 2
     # max_steps: 60
     logger: True

@@ -111,9 +96,7 @@ lightning:

     logger_config:
       wandb:
-        target: loggers.WandbLogger
-        params:
         name: nowname
         save_dir: "/data2/tmp/diff_log/"
         offline: opt.debug
         id: nowname
@@ -1,16 +1,13 @@
 import torch
-try:
-    import lightning.pytorch as pl
-except:
-    import pytorch_lightning as pl
+import lightning.pytorch as pl

-import torch.nn.functional as F
 from torch import nn
+from torch.nn import functional as F
+from torch.nn import Identity
 from contextlib import contextmanager

 from ldm.modules.diffusionmodules.model import Encoder, Decoder
 from ldm.modules.distributions.distributions import DiagonalGaussianDistribution

 from ldm.util import instantiate_from_config
 from ldm.modules.ema import LitEma

@@ -32,7 +29,7 @@ class AutoencoderKL(pl.LightningModule):
         self.image_key = image_key
         self.encoder = Encoder(**ddconfig)
         self.decoder = Decoder(**ddconfig)
-        self.loss = instantiate_from_config(lossconfig)
+        self.loss = Identity()
         assert ddconfig["double_z"]
         self.quant_conv = torch.nn.Conv2d(2*ddconfig["z_channels"], 2*embed_dim, 1)
         self.post_quant_conv = torch.nn.Conv2d(embed_dim, ddconfig["z_channels"], 1)
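This is the first of many hunks in this commit that replace ldm's dynamic instantiate_from_config helper with a direct constructor call. For orientation, the helper conventionally resolves a {"target": ..., "params": ...} dict to a class and instantiates it; a sketch along the lines of the latent-diffusion codebase (exact body assumed, not shown in this diff):

    import importlib

    def get_obj_from_str(string):
        # "pkg.module.Class" -> the class object
        module, cls = string.rsplit(".", 1)
        return getattr(importlib.import_module(module), cls)

    def instantiate_from_config(config):
        if "target" not in config:
            raise KeyError("Expected key `target` to instantiate.")
        return get_obj_from_str(config["target"])(**config.get("params", dict()))

After the commit, call sites name the class explicitly (here, self.loss = Identity()), trading config-driven flexibility for static, analyzable imports; that is also why the YAML configs above lose their target:/params: indirection.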
@@ -9,9 +9,10 @@ from copy import deepcopy
 from einops import rearrange
 from glob import glob
 from natsort import natsorted

+from ldm.models.diffusion.ddpm import LatentDiffusion
+from ldm.lr_scheduler import LambdaLinearScheduler
 from ldm.modules.diffusionmodules.openaimodel import EncoderUNetModel, UNetModel
-from ldm.util import log_txt_as_img, default, ismap, instantiate_from_config
+from ldm.util import log_txt_as_img, default, ismap

 __models__ = {
     'class_label': EncoderUNetModel,

@@ -86,7 +87,7 @@ class NoisyLatentImageClassifier(pl.LightningModule):
         print(f"Unexpected Keys: {unexpected}")

     def load_diffusion(self):
-        model = instantiate_from_config(self.diffusion_config)
+        model = LatentDiffusion(**self.diffusion_config.get('params', dict()))
         self.diffusion_model = model.eval()
         self.diffusion_model.train = disabled_train
         for param in self.diffusion_model.parameters():

@@ -221,7 +222,7 @@ class NoisyLatentImageClassifier(pl.LightningModule):
         optimizer = AdamW(self.model.parameters(), lr=self.learning_rate, weight_decay=self.weight_decay)

         if self.use_scheduler:
-            scheduler = instantiate_from_config(self.scheduler_config)
+            scheduler = LambdaLinearScheduler(**self.scheduler_config.get('params', dict()))

             print("Setting up LambdaLR scheduler...")
             scheduler = [
@@ -22,19 +22,22 @@ from contextlib import contextmanager, nullcontext
 from functools import partial

 from einops import rearrange, repeat
+from ldm.lr_scheduler import LambdaLinearScheduler
-from ldm.models.autoencoder import *
+from ldm.models.autoencoder import AutoencoderKL, IdentityFirstStage
-from ldm.models.diffusion.ddim import *
+from ldm.models.diffusion.ddim import DDIMSampler
+from ldm.modules.midas.api import MiDaSInference
-from ldm.modules.diffusionmodules.model import *
+from ldm.modules.diffusionmodules.model import Decoder, Encoder, Model
-from ldm.modules.diffusionmodules.openaimodel import *
-from ldm.modules.diffusionmodules.openaimodel import AttentionPool2d
+from ldm.modules.diffusionmodules.openaimodel import AttentionPool2d, UNetModel
 from ldm.modules.diffusionmodules.util import extract_into_tensor, make_beta_schedule, noise_like
 from ldm.modules.distributions.distributions import DiagonalGaussianDistribution, normal_kl
+from ldm.modules.diffusionmodules.upscaling import ImageConcatWithNoiseAugmentation
 from ldm.modules.ema import LitEma
 from ldm.modules.encoders.modules import *
-from ldm.util import count_params, default, exists, instantiate_from_config, isimage, ismap, log_txt_as_img, mean_flat
+from ldm.util import count_params, default, exists, isimage, ismap, log_txt_as_img, mean_flat
 from omegaconf import ListConfig
 from torch.optim.lr_scheduler import LambdaLR
 from torchvision.utils import make_grid

@@ -690,7 +693,7 @@ class LatentDiffusion(DDPM):
             self.make_cond_schedule()

     def instantiate_first_stage(self, config):
-        model = instantiate_from_config(config)
+        model = AutoencoderKL(**config)
         self.first_stage_model = model.eval()
         self.first_stage_model.train = disabled_train
         for param in self.first_stage_model.parameters():

@@ -706,15 +709,13 @@ class LatentDiffusion(DDPM):
                 self.cond_stage_model = None
                 # self.be_unconditional = True
             else:
-                model = instantiate_from_config(config)
+                model = FrozenOpenCLIPEmbedder(**config)
                 self.cond_stage_model = model.eval()
                 self.cond_stage_model.train = disabled_train
                 for param in self.cond_stage_model.parameters():
                     param.requires_grad = False
         else:
             assert config != '__is_first_stage__'
             assert config != '__is_unconditional__'
-            model = instantiate_from_config(config)
+            model = FrozenOpenCLIPEmbedder(**config)
             self.cond_stage_model = model

     def _get_denoise_row_from_list(self, samples, desc='', force_no_decoder_quantization=False):

@@ -1479,8 +1480,7 @@ class LatentDiffusion(DDPM):

         # opt = torch.optim.AdamW(params, lr=lr)
         if self.use_scheduler:
-            assert 'target' in self.scheduler_config
-            scheduler = instantiate_from_config(self.scheduler_config)
+            scheduler = LambdaLinearScheduler(**self.scheduler_config)

             rank_zero_info("Setting up LambdaLR scheduler...")
             scheduler = [{'scheduler': LambdaLR(opt, lr_lambda=scheduler.schedule), 'interval': 'step', 'frequency': 1}]

@@ -1502,7 +1502,7 @@ class DiffusionWrapper(pl.LightningModule):
     def __init__(self, diff_model_config, conditioning_key):
         super().__init__()
         self.sequential_cross_attn = diff_model_config.pop("sequential_crossattn", False)
-        self.diffusion_model = instantiate_from_config(diff_model_config)
+        self.diffusion_model = UNetModel(**diff_model_config)
         self.conditioning_key = conditioning_key
         assert self.conditioning_key in [None, 'concat', 'crossattn', 'hybrid', 'adm', 'hybrid-adm', 'crossattn-adm']

@@ -1551,7 +1551,7 @@ class LatentUpscaleDiffusion(LatentDiffusion):
         self.noise_level_key = noise_level_key

     def instantiate_low_stage(self, config):
-        model = instantiate_from_config(config)
+        model = ImageConcatWithNoiseAugmentation(**config)
         self.low_scale_model = model.eval()
         self.low_scale_model.train = disabled_train
         for param in self.low_scale_model.parameters():

@@ -1933,7 +1933,7 @@ class LatentDepth2ImageDiffusion(LatentFinetuneDiffusion):

     def __init__(self, depth_stage_config, concat_keys=("midas_in",), *args, **kwargs):
         super().__init__(concat_keys=concat_keys, *args, **kwargs)
-        self.depth_model = instantiate_from_config(depth_stage_config)
+        self.depth_model = MiDaSInference(**depth_stage_config)
         self.depth_stage_key = concat_keys[0]

     @torch.no_grad()

@@ -2006,7 +2006,7 @@ class LatentUpscaleFinetuneDiffusion(LatentFinetuneDiffusion):
         self.low_scale_key = low_scale_key

     def instantiate_low_stage(self, config):
-        model = instantiate_from_config(config)
+        model = ImageConcatWithNoiseAugmentation(**config)
         self.low_scale_model = model.eval()
         self.low_scale_model.train = disabled_train
         for param in self.low_scale_model.parameters():
@@ -10,11 +10,8 @@ import time
 import numpy as np
 import torch
 import torchvision
-
-try:
-    import lightning.pytorch as pl
-except:
-    import pytorch_lightning as pl
+import lightning.pytorch as pl

 from functools import partial

@@ -23,19 +20,15 @@ from packaging import version
 from PIL import Image
 from prefetch_generator import BackgroundGenerator
 from torch.utils.data import DataLoader, Dataset, Subset, random_split
+from ldm.models.diffusion.ddpm import LatentDiffusion

-try:
-    from lightning.pytorch import seed_everything
-    from lightning.pytorch.callbacks import Callback, LearningRateMonitor, ModelCheckpoint
-    from lightning.pytorch.trainer import Trainer
-    from lightning.pytorch.utilities import rank_zero_info, rank_zero_only
-    LIGHTNING_PACK_NAME = "lightning.pytorch."
-except:
-    from pytorch_lightning import seed_everything
-    from pytorch_lightning.callbacks import Callback, LearningRateMonitor, ModelCheckpoint
-    from pytorch_lightning.trainer import Trainer
-    from pytorch_lightning.utilities import rank_zero_info, rank_zero_only
-    LIGHTNING_PACK_NAME = "pytorch_lightning."
+from lightning.pytorch import seed_everything
+from lightning.pytorch.callbacks import Callback, LearningRateMonitor, ModelCheckpoint
+from lightning.pytorch.trainer import Trainer
+from lightning.pytorch.utilities import rank_zero_info, rank_zero_only
+from lightning.pytorch.loggers import WandbLogger, TensorBoardLogger
+from lightning.pytorch.strategies import ColossalAIStrategy, DDPStrategy
+LIGHTNING_PACK_NAME = "lightning.pytorch."

 from ldm.data.base import Txt2ImgIterableBaseDataset
 from ldm.util import instantiate_from_config

@@ -687,153 +680,114 @@ if __name__ == "__main__":
             config.model["params"].update({"ckpt": ckpt})
             rank_zero_info("Using ckpt_path = {}".format(config.model["params"]["ckpt"]))

-        model = instantiate_from_config(config.model)
+        model = LatentDiffusion(**config.model.get("params", dict()))
         # trainer and callbacks
         trainer_kwargs = dict()

         # Configure the logger
         # Default logger configs to log training metrics during the training process.
         # These loggers are specified as targets in the dictionary, along with the configuration settings specific to each logger.
         default_logger_cfgs = {
             "wandb": {
-                "target": LIGHTNING_PACK_NAME + "loggers.WandbLogger",
-                "params": {
                 "name": nowname,
                 "save_dir": logdir,
                 "offline": opt.debug,
                 "id": nowname,
-                }
             },
             "tensorboard": {
-                "target": LIGHTNING_PACK_NAME + "loggers.TensorBoardLogger",
-                "params": {
                 "save_dir": logdir,
                 "name": "diff_tb",
                 "log_graph": True
-                }
             }
         }

         # Set up the logger for TensorBoard
         default_logger_cfg = default_logger_cfgs["tensorboard"]
         if "logger" in lightning_config:
             logger_cfg = lightning_config.logger
+            trainer_kwargs["logger"] = WandbLogger(**logger_cfg)
         else:
             logger_cfg = default_logger_cfg
-            logger_cfg = OmegaConf.merge(default_logger_cfg, logger_cfg)
-            trainer_kwargs["logger"] = instantiate_from_config(logger_cfg)
+            trainer_kwargs["logger"] = TensorBoardLogger(**logger_cfg)

         # Configure the strategy; default is ddp
         if "strategy" in trainer_config:
             strategy_cfg = trainer_config["strategy"]
-            strategy_cfg["target"] = LIGHTNING_PACK_NAME + strategy_cfg["target"]
+            trainer_kwargs["strategy"] = ColossalAIStrategy(**strategy_cfg)
         else:
-            strategy_cfg = {
-                "target": LIGHTNING_PACK_NAME + "strategies.DDPStrategy",
-                "params": {
-                    "find_unused_parameters": False
-                }
-            }
-            trainer_kwargs["strategy"] = instantiate_from_config(strategy_cfg)
+            strategy_cfg = {"find_unused_parameters": False}
+            trainer_kwargs["strategy"] = DDPStrategy(**strategy_cfg)

         # Set up ModelCheckpoint callback to save best models
         # modelcheckpoint - use TrainResult/EvalResult(checkpoint_on=metric) to
         # specify which metric is used to determine best models
         default_modelckpt_cfg = {
-            "target": LIGHTNING_PACK_NAME + "callbacks.ModelCheckpoint",
-            "params": {
             "dirpath": ckptdir,
             "filename": "{epoch:06}",
             "verbose": True,
             "save_last": True,
-            }
         }
         if hasattr(model, "monitor"):
-            default_modelckpt_cfg["params"]["monitor"] = model.monitor
-            default_modelckpt_cfg["params"]["save_top_k"] = 3
+            default_modelckpt_cfg["monitor"] = model.monitor
+            default_modelckpt_cfg["save_top_k"] = 3

         if "modelcheckpoint" in lightning_config:
-            modelckpt_cfg = lightning_config.modelcheckpoint
+            modelckpt_cfg = lightning_config.modelcheckpoint["params"]
         else:
             modelckpt_cfg = OmegaConf.create()
         modelckpt_cfg = OmegaConf.merge(default_modelckpt_cfg, modelckpt_cfg)
-        if version.parse(pl.__version__) < version.parse('1.4.0'):
-            trainer_kwargs["checkpoint_callback"] = instantiate_from_config(modelckpt_cfg)
+        trainer_kwargs["checkpoint_callback"] = ModelCheckpoint(**modelckpt_cfg)

         # Set up various callbacks, including logging, learning rate monitoring, and CUDA management
         # add callback which sets up log directory
-        default_callbacks_cfg = {
-            "setup_callback": { # callback to set up the training
-                "target": "main.SetupCallback",
-                "params": {
-                    "resume": opt.resume, # resume training if applicable
-                    "now": now,
-                    "logdir": logdir, # directory to save the log file
-                    "ckptdir": ckptdir, # directory to save the checkpoint file
-                    "cfgdir": cfgdir, # directory to save the configuration file
-                    "config": config, # configuration dictionary
-                    "lightning_config": lightning_config, # LightningModule configuration
-                }
-            },
-            "image_logger": { # callback to log image data
-                "target": "main.ImageLogger",
-                "params": {
-                    "batch_frequency": 750, # how frequently to log images
-                    "max_images": 4, # maximum number of images to log
-                    "clamp": True # whether to clamp pixel values to [0,1]
-                }
-            },
-            "learning_rate_logger": { # callback to log learning rate
-                "target": "main.LearningRateMonitor",
-                "params": {
-                    "logging_interval": "step", # logging frequency (either 'step' or 'epoch')
-                    # "log_momentum": True # whether to log momentum (currently commented out)
-                }
-            },
-            "cuda_callback": { # callback to handle CUDA-related operations
-                "target": "main.CUDACallback"
-            },
-        }
-
-        # If the LightningModule configuration has specified callbacks, use those
-        # Otherwise, create an empty OmegaConf configuration object
-        if "callbacks" in lightning_config:
-            callbacks_cfg = lightning_config.callbacks
-        else:
-            callbacks_cfg = OmegaConf.create()
-
-        # If the 'metrics_over_trainsteps_checkpoint' callback is specified in the
-        # LightningModule configuration, update the default callbacks configuration
-        if 'metrics_over_trainsteps_checkpoint' in callbacks_cfg:
-            print(
-                'Caution: Saving checkpoints every n train steps without deleting. This might require some free space.')
-            default_metrics_over_trainsteps_ckpt_dict = {
-                'metrics_over_trainsteps_checkpoint': {
-                    "target": LIGHTNING_PACK_NAME + 'callbacks.ModelCheckpoint',
-                    'params': {
-                        "dirpath": os.path.join(ckptdir, 'trainstep_checkpoints'),
-                        "filename": "{epoch:06}-{step:09}",
-                        "verbose": True,
-                        'save_top_k': -1,
-                        'every_n_train_steps': 10000,
-                        'save_weights_only': True
-                    }
-                }
-            }
-            default_callbacks_cfg.update(default_metrics_over_trainsteps_ckpt_dict)
-
-        # Merge the default callbacks configuration with the specified callbacks configuration, and instantiate the callbacks
-        callbacks_cfg = OmegaConf.merge(default_callbacks_cfg, callbacks_cfg)
-        trainer_kwargs["callbacks"] = [instantiate_from_config(callbacks_cfg[k]) for k in callbacks_cfg]
+        # Create an empty OmegaConf configuration object
+        callbacks_cfg = OmegaConf.create()
+
+        # Instantiate items according to the configs
+        trainer_kwargs.setdefault("callbacks", [])
+
+        setup_callback_config = {
+            "resume": opt.resume, # resume training if applicable
+            "now": now,
+            "logdir": logdir, # directory to save the log file
+            "ckptdir": ckptdir, # directory to save the checkpoint file
+            "cfgdir": cfgdir, # directory to save the configuration file
+            "config": config, # configuration dictionary
+            "lightning_config": lightning_config, # LightningModule configuration
+        }
+        trainer_kwargs["callbacks"].append(SetupCallback(**setup_callback_config))
+
+        image_logger_config = {
+            "batch_frequency": 750, # how frequently to log images
+            "max_images": 4, # maximum number of images to log
+            "clamp": True # whether to clamp pixel values to [0,1]
+        }
+        trainer_kwargs["callbacks"].append(ImageLogger(**image_logger_config))
+
+        learning_rate_logger_config = {
+            "logging_interval": "step", # logging frequency (either 'step' or 'epoch')
+            # "log_momentum": True # whether to log momentum (currently commented out)
+        }
+        trainer_kwargs["callbacks"].append(LearningRateMonitor(**learning_rate_logger_config))
+
+        metrics_over_trainsteps_checkpoint_config = {
+            "dirpath": os.path.join(ckptdir, 'trainstep_checkpoints'),
+            "filename": "{epoch:06}-{step:09}",
+            "verbose": True,
+            'save_top_k': -1,
+            'every_n_train_steps': 10000,
+            'save_weights_only': True
+        }
+        trainer_kwargs["callbacks"].append(ModelCheckpoint(**metrics_over_trainsteps_checkpoint_config))
+        trainer_kwargs["callbacks"].append(CUDACallback())

         # Create a Trainer object with the specified command-line arguments and keyword arguments, and set the log directory
         trainer = Trainer.from_argparse_args(trainer_opt, **trainer_kwargs)
         trainer.logdir = logdir

         # Create a data module based on the configuration file
-        data = instantiate_from_config(config.data)
+        data = DataModuleFromConfig(**config.data)

         # NOTE according to https://pytorch-lightning.readthedocs.io/en/latest/datamodules.html
         # calling these ourselves should not be necessary but it is.
         # lightning still takes care of proper multiprocessing though

@@ -846,7 +800,7 @@ if __name__ == "__main__":

         # Configure learning rate based on the batch size, base learning rate and number of GPUs
         # If scale_lr is true, calculate the learning rate based on additional factors
-        bs, base_lr = config.data.params.batch_size, config.model.base_learning_rate
+        bs, base_lr = config.data.batch_size, config.model.base_learning_rate
         if not cpu:
             ngpu = trainer_config["devices"]
         else:
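The scaling the two comments above describe follows the usual latent-diffusion convention; the formula itself sits below this hunk, so treat the exact expression as an assumption:

    def scaled_lr(base_lr, bs, ngpu, accumulate_grad_batches=1, scale_lr=True):
        # lr = accumulate_grad_batches * n_gpus * batch_size * base_learning_rate
        return accumulate_grad_batches * ngpu * bs * base_lr if scale_lr else base_lr

    # e.g. scaled_lr(1.0e-4, bs=128, ngpu=2) == 2.56e-2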
@@ -7,8 +7,9 @@ from datetime import datetime

 from diffusers import StableDiffusionPipeline
 import torch
-from ldm.util import instantiate_from_config

 from main import get_parser
+from ldm.modules.diffusionmodules.openaimodel import UNetModel

 if __name__ == "__main__":
     with torch.no_grad():

@@ -17,7 +18,7 @@ if __name__ == "__main__":
         config = f.read()
         base_config = yaml.load(config, Loader=yaml.FullLoader)
         unet_config = base_config['model']['params']['unet_config']
-        diffusion_model = instantiate_from_config(unet_config).to("cuda:0")
+        diffusion_model = UNetModel(**unet_config).to("cuda:0")

         pipe = StableDiffusionPipeline.from_pretrained(
             "/data/scratch/diffuser/stable-diffusion-v1-4"
@@ -3,3 +3,4 @@ TRANSFORMERS_OFFLINE=1
 DIFFUSERS_OFFLINE=1

+python main.py --logdir /tmp --train --base configs/Teyvat/train_colossalai_teyvat.yaml --ckpt diffuser_root_dir/512-base-ema.ckpt