mirror of https://github.com/hpcaitech/ColossalAI
[chatgpt]update ci (#3087)
* [chatgpt]update ci * Update test_ci.sh * Update test_ci.sh * Update test_ci.sh * test * Update train_prompts.py * Update train_dummy.py * add save_path * polish * add save path * polish * add save path * polish * delete bloom-560m test: delete bloom-560m test because of OOM * add ddp test
pull/3131/head
parent
169ed4d24e
commit
23cd5e2ccf
|
@ -15,11 +15,57 @@ export OMP_NUM_THREADS=8
|
|||
pip install -r ${BASE}/requirements.txt
|
||||
|
||||
# train dummy
|
||||
for strategy in ddp colossalai_gemini colossalai_zero2; do
|
||||
torchrun --standalone --nproc_per_node=2 ${BASE}/train_dummy.py --strategy ${strategy} --num_episodes 2 --max_timesteps 3 --update_timesteps 3 --max_epochs 3 --experience_batch_size 4 --train_batch_size 4
|
||||
done
|
||||
python ${BASE}/train_dummy.py --strategy naive --num_episodes 1 \
|
||||
--max_timesteps 2 --update_timesteps 2 \
|
||||
--max_epochs 1 --train_batch_size 2 --lora_rank 4
|
||||
|
||||
torchrun --standalone --nproc_per_node=2 ${BASE}/train_dummy.py \
|
||||
--strategy colossalai_gemini --num_episodes 1 --max_timesteps 2 \
|
||||
--update_timesteps 2 --max_epochs 1 --train_batch_size 2\
|
||||
--pretrain 'facebook/opt-350m' --model opt --lora_rank 4\
|
||||
--save_path ${BASE}/actor_checkpoint_dummy.pt
|
||||
python ${BASE}/inference.py --model_path ${BASE}/actor_checkpoint_dummy.pt --pretrain 'facebook/opt-350m' --model opt
|
||||
|
||||
torchrun --standalone --nproc_per_node=2 ${BASE}/train_dummy.py \
|
||||
--strategy ddp --num_episodes 1 --max_timesteps 2 \
|
||||
--update_timesteps 2 --max_epochs 1 --train_batch_size 2\
|
||||
--pretrain 'facebook/opt-350m' --model opt --lora_rank 4\
|
||||
--save_path ${BASE}/actor_checkpoint_dummy.pt
|
||||
python ${BASE}/inference.py --model_path ${BASE}/actor_checkpoint_dummy.pt --pretrain 'facebook/opt-350m' --model opt
|
||||
|
||||
torchrun --standalone --nproc_per_node=2 ${BASE}/train_dummy.py \
|
||||
--strategy colossalai_zero2 --num_episodes 1 --max_timesteps 2 \
|
||||
--update_timesteps 2 --max_epochs 1 --train_batch_size 2\
|
||||
--pretrain 'gpt2' --model gpt2 --lora_rank 4\
|
||||
--save_path ${BASE}/actor_checkpoint_dummy.pt
|
||||
python ${BASE}/inference.py --model_path ${BASE}/actor_checkpoint_dummy.pt --pretrain 'gpt2' --model gpt2
|
||||
|
||||
rm -rf ${BASE}/actor_checkpoint_dummy.pt
|
||||
|
||||
# train prompts
|
||||
for strategy in ddp colossalai_gemini colossalai_zero2; do
|
||||
torchrun --standalone --nproc_per_node=2 ${BASE}/train_prompts.py $PROMPT_PATH --strategy ${strategy} --num_episodes 2 --max_timesteps 3 --update_timesteps 3 --max_epochs 3
|
||||
done
|
||||
python ${BASE}/train_prompts.py $PROMPT_PATH --strategy naive --num_episodes 1 \
|
||||
--max_timesteps 2 --update_timesteps 2 \
|
||||
--max_epochs 1 --train_batch_size 2 --lora_rank 4
|
||||
|
||||
torchrun --standalone --nproc_per_node=2 ${BASE}/train_prompts.py $PROMPT_PATH \
|
||||
--strategy colossalai_zero2 --num_episodes 1 --max_timesteps 2 \
|
||||
--update_timesteps 2 --max_epochs 1 --train_batch_size 2\
|
||||
--pretrain 'facebook/opt-350m' --model opt --lora_rank 4\
|
||||
--save_path ${BASE}/actor_checkpoint_prompts.pt
|
||||
python ${BASE}/inference.py --model_path ${BASE}/actor_checkpoint_prompts.pt --pretrain 'facebook/opt-350m' --model opt
|
||||
|
||||
torchrun --standalone --nproc_per_node=2 ${BASE}/train_prompts.py $PROMPT_PATH \
|
||||
--strategy ddp --num_episodes 1 --max_timesteps 2 \
|
||||
--update_timesteps 2 --max_epochs 1 --train_batch_size 2\
|
||||
--pretrain 'gpt2' --model gpt2 --lora_rank 4\
|
||||
--save_path ${BASE}/actor_checkpoint_prompts.pt
|
||||
python ${BASE}/inference.py --model_path ${BASE}/actor_checkpoint_prompts.pt --pretrain 'gpt2' --model gpt2
|
||||
|
||||
torchrun --standalone --nproc_per_node=2 ${BASE}/train_prompts.py $PROMPT_PATH \
|
||||
--strategy colossalai_gemini --num_episodes 1 --max_timesteps 2 \
|
||||
--update_timesteps 2 --max_epochs 1 --train_batch_size 2\
|
||||
--pretrain 'gpt2' --model gpt2 --lora_rank 4\
|
||||
--save_path ${BASE}/actor_checkpoint_prompts.pt
|
||||
python ${BASE}/inference.py --model_path ${BASE}/actor_checkpoint_prompts.pt --pretrain 'gpt2' --model gpt2
|
||||
|
||||
rm -rf ${BASE}/actor_checkpoint_prompts.pt
|
||||
|
|
|
@ -115,7 +115,7 @@ def main(args):
|
|||
update_timesteps=args.update_timesteps)
|
||||
|
||||
# save model checkpoint after fitting
|
||||
strategy.save_model(actor, 'actor_checkpoint_dummy.pt', only_rank0=True)
|
||||
strategy.save_model(actor, args.save_path, only_rank0=True)
|
||||
# save optimizer checkpoint on all ranks
|
||||
if args.need_optim_ckpt:
|
||||
strategy.save_optimizer(actor_optim,
|
||||
|
@ -130,6 +130,7 @@ if __name__ == '__main__':
|
|||
default='naive')
|
||||
parser.add_argument('--model', type=str, default='gpt2', choices=['gpt2', 'bloom', 'opt'])
|
||||
parser.add_argument('--pretrain', type=str, default=None)
|
||||
parser.add_argument('--save_path', type=str, default='actor_checkpoint_dummy.pt')
|
||||
parser.add_argument('--need_optim_ckpt', type=bool, default=False)
|
||||
parser.add_argument('--num_episodes', type=int, default=50)
|
||||
parser.add_argument('--max_timesteps', type=int, default=10)
|
||||
|
|
|
@ -102,7 +102,7 @@ def main(args):
|
|||
max_timesteps=args.max_timesteps,
|
||||
update_timesteps=args.update_timesteps)
|
||||
# save model checkpoint after fitting
|
||||
strategy.save_model(actor, 'actor_checkpoint_prompts.pt', only_rank0=True)
|
||||
strategy.save_model(actor, args.save_path, only_rank0=True)
|
||||
# save optimizer checkpoint on all ranks
|
||||
if args.need_optim_ckpt:
|
||||
strategy.save_optimizer(actor_optim,
|
||||
|
@ -118,6 +118,7 @@ if __name__ == '__main__':
|
|||
default='naive')
|
||||
parser.add_argument('--model', default='gpt2', choices=['gpt2', 'bloom', 'opt'])
|
||||
parser.add_argument('--pretrain', type=str, default=None)
|
||||
parser.add_argument('--save_path', type=str, default='actor_checkpoint_prompts.pt')
|
||||
parser.add_argument('--need_optim_ckpt', type=bool, default=False)
|
||||
parser.add_argument('--num_episodes', type=int, default=10)
|
||||
parser.add_argument('--max_timesteps', type=int, default=10)
|
||||
|
|
Loading…
Reference in New Issue