mirror of https://github.com/hpcaitech/ColossalAI
fix typo examples and docs (#3932)
parent
407aa48461
commit
33eef714db
|
@ -141,16 +141,16 @@ for mn, module in model.named_modules():
|
||||||
|
|
||||||
if 'mlp.c_fc' in mn:
|
if 'mlp.c_fc' in mn:
|
||||||
if 'weight' in pn or 'bias' in pn:
|
if 'weight' in pn or 'bias' in pn:
|
||||||
split_param_col_tp1d(param, pg) # colmn slice
|
split_param_col_tp1d(param, pg) # column slice
|
||||||
# keep the shape of the output from c_fc
|
# keep the shape of the output from c_fc
|
||||||
param.compute_spec.set_output_replicate(False)
|
param.compute_spec.set_output_replicate(False)
|
||||||
elif 'mlp.c_proj' in mn:
|
elif 'mlp.c_proj' in mn:
|
||||||
if 'weight' in pn:
|
if 'weight' in pn:
|
||||||
split_param_row_tp1d(param, pg) # row slice
|
split_param_row_tp1d(param, pg) # row slice
|
||||||
elif 'wte' in mn or 'wpe' in mn:
|
elif 'wte' in mn or 'wpe' in mn:
|
||||||
split_param_col_tp1d(param, pg) # colmn slice
|
split_param_col_tp1d(param, pg) # column slice
|
||||||
elif 'c_attn' in mn or 'c_proj' in mn:
|
elif 'c_attn' in mn or 'c_proj' in mn:
|
||||||
split_param_col_tp1d(param, pg) # colmn slice
|
split_param_col_tp1d(param, pg) # column slice
|
||||||
```
|
```
|
||||||
|
|
||||||
The modified model is illustrated below.
|
The modified model is illustrated below.
|
||||||
|
|
|
@ -126,16 +126,16 @@ for mn, module in model.named_modules():
|
||||||
|
|
||||||
if 'mlp.c_fc' in mn:
|
if 'mlp.c_fc' in mn:
|
||||||
if 'weight' in pn or 'bias' in pn:
|
if 'weight' in pn or 'bias' in pn:
|
||||||
split_param_col_tp1d(param, pg) # colmn slice
|
split_param_col_tp1d(param, pg) # column slice
|
||||||
# keep the shape of the output from c_fc
|
# keep the shape of the output from c_fc
|
||||||
param.compute_spec.set_output_replicate(False)
|
param.compute_spec.set_output_replicate(False)
|
||||||
elif 'mlp.c_proj' in mn:
|
elif 'mlp.c_proj' in mn:
|
||||||
if 'weight' in pn:
|
if 'weight' in pn:
|
||||||
split_param_row_tp1d(param, pg) # row slice
|
split_param_row_tp1d(param, pg) # row slice
|
||||||
elif 'wte' in mn or 'wpe' in mn:
|
elif 'wte' in mn or 'wpe' in mn:
|
||||||
split_param_col_tp1d(param, pg) # colmn slice
|
split_param_col_tp1d(param, pg) # column slice
|
||||||
elif 'c_attn' in mn or 'c_proj' in mn:
|
elif 'c_attn' in mn or 'c_proj' in mn:
|
||||||
split_param_col_tp1d(param, pg) # colmn slice
|
split_param_col_tp1d(param, pg) # column slice
|
||||||
```
|
```
|
||||||
|
|
||||||
修改后的模型如下图所示。
|
修改后的模型如下图所示。
|
||||||
|
|
|
@ -37,7 +37,7 @@ The `text` include the tag `Teyvat`, `Name`,`Element`, `Weapon`, `Region`, `Mode
|
||||||
|
|
||||||
## Training
|
## Training
|
||||||
|
|
||||||
We provide the script `colossalai.sh` to run the training task with colossalai. Meanwhile, we also provided traditional training process of dreambooth, `dreambooth.sh`, for possible comparation. For instance, the script of training process for [stable-diffusion-v1-4] model can be modified into:
|
We provide the script `colossalai.sh` to run the training task with colossalai. Meanwhile, we also provide the traditional training process of dreambooth, `dreambooth.sh`, for possible comparison. For instance, the training script for the [stable-diffusion-v1-4] model can be modified into:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
export MODEL_NAME="CompVis/stable-diffusion-v1-4"
|
export MODEL_NAME="CompVis/stable-diffusion-v1-4"
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
## Overview
|
## Overview
|
||||||
|
|
||||||
This directory includes two parts: Using the Booster API fintune Huggingface Bert and AlBert models and benchmarking Bert and AlBert models with different Booster Plugin.
|
This directory includes two parts: using the Booster API to finetune Huggingface Bert and AlBert models, and benchmarking Bert and AlBert models with different Booster plugins.
|
||||||
|
|
||||||
## Finetune
|
## Finetune
|
||||||
```
|
```
|
||||||
|
|
|
@ -162,7 +162,7 @@ def tensor_parallelize(model: torch.nn.Module, pg: ProcessGroup):
|
||||||
# shard it w.r.t tp pattern
|
# shard it w.r.t tp pattern
|
||||||
if 'mlp.c_fc' in mn:
|
if 'mlp.c_fc' in mn:
|
||||||
if 'weight' in pn or 'bias' in pn:
|
if 'weight' in pn or 'bias' in pn:
|
||||||
split_param_col_tp1d(param, pg) # colmn slice
|
split_param_col_tp1d(param, pg) # column slice
|
||||||
# keep the shape of the output from c_fc
|
# keep the shape of the output from c_fc
|
||||||
param.compute_spec.set_output_replicate(False)
|
param.compute_spec.set_output_replicate(False)
|
||||||
else:
|
else:
|
||||||
|
@ -173,9 +173,9 @@ def tensor_parallelize(model: torch.nn.Module, pg: ProcessGroup):
|
||||||
else:
|
else:
|
||||||
param.set_dist_spec(ReplicaSpec())
|
param.set_dist_spec(ReplicaSpec())
|
||||||
elif 'wte' in mn or 'wpe' in mn:
|
elif 'wte' in mn or 'wpe' in mn:
|
||||||
split_param_col_tp1d(param, pg) # colmn slice
|
split_param_col_tp1d(param, pg) # column slice
|
||||||
elif 'c_attn' in mn or 'c_proj' in mn:
|
elif 'c_attn' in mn or 'c_proj' in mn:
|
||||||
split_param_col_tp1d(param, pg) # colmn slice
|
split_param_col_tp1d(param, pg) # column slice
|
||||||
else:
|
else:
|
||||||
param.set_dist_spec(ReplicaSpec())
|
param.set_dist_spec(ReplicaSpec())
|
||||||
param.visited = True
|
param.visited = True
|
||||||
|
@ -237,7 +237,7 @@ def main():
|
||||||
if args.tp_degree > 1:
|
if args.tp_degree > 1:
|
||||||
tensor_parallelize(model, tp_pg)
|
tensor_parallelize(model, tp_pg)
|
||||||
|
|
||||||
# asign running configurations
|
# assign running configurations
|
||||||
if args.distplan == "CAI_ZeRO1":
|
if args.distplan == "CAI_ZeRO1":
|
||||||
zero_stage = 1
|
zero_stage = 1
|
||||||
elif args.distplan == "CAI_ZeRO2":
|
elif args.distplan == "CAI_ZeRO2":
|
||||||
|
|
|
@ -305,7 +305,7 @@ class _VocabParallelCrossEntropy(torch.autograd.Function):
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def backward(ctx, grad_output):
|
def backward(ctx, grad_output):
|
||||||
|
|
||||||
# Retreive tensors from the forward path.
|
# Retrieve tensors from the forward path.
|
||||||
softmax, target_mask, masked_target_1d = ctx.saved_tensors
|
softmax, target_mask, masked_target_1d = ctx.saved_tensors
|
||||||
|
|
||||||
# All the inputs have softmax as their gradient.
|
# All the inputs have softmax as their gradient.
|
||||||
|
|
|
@ -38,7 +38,7 @@ def train_epoch(epoch, model, optimizer, lr_scheduler, dataloader, booster, coor
|
||||||
|
|
||||||
for batch in pbar:
|
for batch in pbar:
|
||||||
|
|
||||||
# Foward
|
# Forward
|
||||||
optimizer.zero_grad()
|
optimizer.zero_grad()
|
||||||
batch = move_to_cuda(batch, torch.cuda.current_device())
|
batch = move_to_cuda(batch, torch.cuda.current_device())
|
||||||
|
|
||||||
|
|
|
@ -140,15 +140,15 @@ def tensor_parallelize(model: torch.nn.Module, pg: ProcessGroup):
|
||||||
continue
|
continue
|
||||||
param.set_dist_spec(ReplicaSpec())
|
param.set_dist_spec(ReplicaSpec())
|
||||||
if 'net.0' in mn:
|
if 'net.0' in mn:
|
||||||
split_param_col_tp1d(param, pg) # colmn slice
|
split_param_col_tp1d(param, pg) # column slice
|
||||||
elif 'to_q' in mn:
|
elif 'to_q' in mn:
|
||||||
split_param_col_tp1d(param, pg) # colmn slice
|
split_param_col_tp1d(param, pg) # column slice
|
||||||
elif 'to_kv' in mn:
|
elif 'to_kv' in mn:
|
||||||
split_param_row_tp1d(param, pg) # row slice
|
split_param_row_tp1d(param, pg) # row slice
|
||||||
elif 'to_out' in mn:
|
elif 'to_out' in mn:
|
||||||
split_param_row_tp1d(param, pg) # row slice
|
split_param_row_tp1d(param, pg) # row slice
|
||||||
elif '1.1' in mn:
|
elif '1.1' in mn:
|
||||||
split_param_col_tp1d(param, pg) # colmn slice
|
split_param_col_tp1d(param, pg) # column slice
|
||||||
elif '1.2' in mn:
|
elif '1.2' in mn:
|
||||||
split_param_row_tp1d(param, pg) # row slice
|
split_param_row_tp1d(param, pg) # row slice
|
||||||
else:
|
else:
|
||||||
|
|
Loading…
Reference in New Issue