mirror of https://github.com/InternLM/InternLM

test(ci_scripts): add load ckpt cases (#208)

* fix format
* add scripts for load ckpt case
* update test config
* debug: use var in json
* fix syntax error
* export pythonpath
* use absolute path
* use parent path of workspace
* debug load new ckpt
* change data path
* add train folder
* fix code format
* fix pylint warning

---------

Co-authored-by: wangmengke <wangmengke@pjlab.org.cn>

parent 0600b42c01
commit cc3c48ae47
CI workflow (e2e test job):

@@ -66,9 +66,23 @@ jobs:
       - uses: actions/checkout@v3

       - name: slurm-train
+        id: basic_train
         run: |
           source activate internlm-env-test
           sh ./ci_scripts/train/slurm_train.sh ${GITHUB_RUN_ID}-${GITHUB_JOB}
+
+      - name: load_preset_ckpt
+        if: ${{ failure() && steps.basic_train.conclusion == 'failure' }}
+        run: |
+          source activate internlm-env-test
+          export PYTHONPATH=$PWD:$PYTHONPATH
+          sh ./ci_scripts/train/load_ckpt.sh 7B_load_preset_ckpt ${GITHUB_RUN_ID}-${GITHUB_JOB}
+
+      - name: load_new_ckpt
+        run: |
+          source activate internlm-env-test
+          export PYTHONPATH=$PWD:$PYTHONPATH
+          sh ./ci_scripts/train/load_ckpt.sh 7B_load_new_ckpt ${GITHUB_RUN_ID}-${GITHUB_JOB}
           rm -rf $GITHUB_WORKSPACE/llm_ckpts

       - name: torchrun-train
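Note the step gating: load_preset_ckpt runs only when basic_train has failed (failure() plus the explicit conclusion check), exercising resume-from-a-preset-checkpoint as a fallback diagnostic, while load_new_ckpt carries no if: condition and so runs on the normal success path, resuming from the checkpoint that slurm-train just wrote; the trailing rm -rf clears $GITHUB_WORKSPACE/llm_ckpts at the end of that step.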
ci_scripts/common/com_func.py (new file):

@@ -0,0 +1,29 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+
+
+def merge_dicts(dict_a: dict, dict_b: dict):
+    for key in dict_b.keys():
+        if isinstance(dict_b[key], dict):
+            dict_b[key] = {**dict_a[key], **dict_b[key]}
+            merge_dicts(dict_a[key], dict_b[key])
+    dict_c = {**dict_a, **dict_b}
+    return dict_c
+
+
+def format_dict_to_py_string(data: dict, indent=0, is_nested=False):
+    result = ""
+    for key, value in data.items():
+        if isinstance(value, dict):
+            result += f"{' ' * indent}{key} = dict(\n"
+            result += format_dict_to_py_string(value, indent + 4, is_nested=True)
+            result += f"{' ' * indent})"
+        else:
+            result += f"{' ' * indent}{key} = {repr(value)}"
+        if is_nested:
+            result += ","
+        result += "\n"
+    result = f"""\
+{result}
+"""
+    return result
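A minimal sketch of how these two helpers compose, with toy values rather than the real CI config: merge_dicts gives dict_b's leaves priority, merges nested dicts key by key (mutating dict_b in place), and format_dict_to_py_string renders the result as a Python config file body.

from ci_scripts.common.com_func import format_dict_to_py_string, merge_dicts

base = {"SEQ_LEN": 2048, "ckpt": {"checkpoint_every": 50, "load_optimizer": True}}
override = {"SEQ_LEN": 1024, "ckpt": {"checkpoint_every": 20}}

merged = merge_dicts(base, override)
# merged == {"SEQ_LEN": 1024, "ckpt": {"checkpoint_every": 20, "load_optimizer": True}}

# Prints (modulo blank lines):
#   SEQ_LEN = 1024
#   ckpt = dict(
#       checkpoint_every = 20,
#       load_optimizer = True,
#   )
print(format_dict_to_py_string(merged))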
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 # -*- encoding: utf-8 -*-
-from transformers import AutoTokenizer, AutoModelForCausalLM
+from transformers import AutoModelForCausalLM, AutoTokenizer

 tokenizer = AutoTokenizer.from_pretrained("internlm/internlm-chat-7b", trust_remote_code=True)
 model = AutoModelForCausalLM.from_pretrained("internlm/internlm-chat-7b", trust_remote_code=True).cuda()
ci_scripts/train/ci_7B_sft.py:

@@ -10,7 +10,7 @@ VOCAB_SIZE = 103168
 # Ckpt folder format:
 # fs: 'local:/mnt/nfs/XXX'
 # oss: 'boto3:s3://model_weights/XXX'
-MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
 # SAVE_CKPT_FOLDER = "local:llm_ckpts"
 SAVE_CKPT_FOLDER = "local:llm_ckpts"
 # LOAD_CKPT_FOLDER = "local:llm_ckpts/49"

@@ -26,7 +26,7 @@ ckpt = dict(
     load_optimizer=True,
 )

-TRAIN_FOLDER = "/mnt/petrelfs/qa-caif-cicd/data/lm_data/alpaca_data/train/en"
+TRAIN_FOLDER = "local:../lm_data/alpaca_data/train/en"
 data = dict(
     seq_len=SEQ_LEN,
     # micro_num means the number of micro_batch contained in one gradient update
ci_scripts/train/generate_config.py (new file):

@@ -0,0 +1,49 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+import argparse
+import json
+import os
+
+from ci_scripts.common import com_func
+from internlm.core.context import Config
+
+
+def generate_new_config(config_py_file, test_config_json, case_name):
+    # generate the path of the new config py
+    config_path = os.path.split(config_py_file)
+    new_config_py_file = os.path.join(config_path[0], case_name + ".py")
+
+    # merge the base config with the case overrides
+    origin_config = Config.from_file(config_py_file)
+    with open(test_config_json) as f:
+        test_config = json.load(f)
+    if test_config:
+        if case_name not in test_config.keys():
+            raise KeyError(f"the case {case_name} doesn't exist. Please check {test_config_json} again!")
+        new_config = com_func.merge_dicts(origin_config, test_config[case_name])
+        print(f"new config is:\n{new_config}")
+
+        # write the new config to a py file
+        file_content = com_func.format_dict_to_py_string(new_config)
+        with open(new_config_py_file, "w") as f:
+            f.write(file_content)
+    print(f"The new test train config file is {new_config_py_file}")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--origin_config",
+        type=str,
+        default="./ci_scripts/train/ci_7B_sft.py",
+        help="path to the origin train config file",
+    )
+    parser.add_argument(
+        "--test_config",
+        type=str,
+        default="./ci_scripts/train/test_config.json",
+        help="path to the test train config file",
+    )
+    parser.add_argument("--case_name", type=str, help="name of the case which will be run")
+    args = parser.parse_args()
+    generate_new_config(args.origin_config, args.test_config, args.case_name)
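The generator can be exercised directly for a quick sanity check (run from the repo root, with PYTHONPATH set the way the workflow sets it; the case name must exist in test_config.json):

from ci_scripts.train.generate_config import generate_new_config

# Equivalent to: python ./ci_scripts/train/generate_config.py --case_name 7B_load_new_ckpt
generate_new_config(
    "./ci_scripts/train/ci_7B_sft.py",
    "./ci_scripts/train/test_config.json",
    "7B_load_new_ckpt",
)
# Writes ./ci_scripts/train/7B_load_new_ckpt.py containing the merged settings.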
ci_scripts/train/load_ckpt.sh (new file):

@@ -0,0 +1,38 @@
+#!/bin/bash
+set -x
+
+[[ -n ${GITHUB_WORKSPACE} ]] || { echo "should set GITHUB_WORKSPACE first before ci, exit."; exit 1; }
+readonly CKPTS_PATH="$GITHUB_WORKSPACE/llm_ckpts"
+readonly CKPTS40_PATH="$GITHUB_WORKSPACE/llm_ckpts/40"
+readonly CKPTS40_OUTPUT="${CKPTS40_PATH}/*.pt"
+expected_num=21
+exit_code=0
+
+source ./ci_scripts/common/basic_func.sh
+
+echo "start to test slurm training with loading checkpoint."
+
+python ./ci_scripts/train/generate_config.py --case_name $1
+file="./ci_scripts/train/$1.py"
+if [[ ! -f ${file} ]]; then
+    echo "expect: ${file} exists, actual: not exist."
+    exit_code=$(($exit_code + 1))
+fi
+
+srun -p ${SLURM_PARTITION} --exclusive --job-name=$2 -n 8 --ntasks-per-node=8 --gpus-per-task=1 python train.py --config ${file}
+[[ $? -ne 0 ]] && { echo "test slurm training failed."; exit_code=$(($exit_code + 1)); }
+
+
+num=$(num_files "${CKPTS40_OUTPUT}")
+if [[ ${num} -ne ${expected_num} ]]; then
+    echo "expect: ${expected_num} files, actual: ${num} files."
+    exit_code=$(($exit_code + 1))
+fi
+
+# clean the test files.
+if ! rm -rf ${CKPTS_PATH}/*; then
+    echo "cleaning cached file in ${CKPTS_PATH} failed."
+    exit_code=$(($exit_code + 1))
+fi
+
+exit $exit_code
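num_files comes from basic_func.sh, which is outside this diff; in spirit the checkpoint assertion is just a glob count over the step-40 snapshot. A hypothetical standalone equivalent, not the CI helper itself:

import glob

expected_num = 21  # number of .pt files the 8-GPU run is expected to write at step 40
num = len(glob.glob("llm_ckpts/40/*.pt"))
if num != expected_num:
    print(f"expect: {expected_num} files, actual: {num} files.")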
ci_scripts/train/slurm_train.sh (the cleanup moves out of this script so the load cases can reuse the checkpoints it writes; the workflow's rm -rf step now does the final cleanup):

@@ -28,10 +28,4 @@ if [[ ${num} -ne ${expected_num} ]]; then
     exit_code=$(($exit_code + 1))
 fi

-# clean the test files.
-if ! rm -rf ${CKPTS_PATH}/*; then
-    echo "cleaning cached file in ${CKPTS_PATH} failed."
-    exit_code=$(($exit_code + 1))
-fi
-
 exit $exit_code
ci_scripts/train/test_config.json (new file):

@@ -0,0 +1,45 @@
+{
+    "7B_basic_train": {
+        "SEQ_LEN": 1024,
+        "HIDDEN_SIZE": 2048,
+        "NUM_ATTENTION_HEAD": 16,
+        "NUM_LAYER": 16,
+        "TRAIN_FOLDER": "local:../lm_data/alpaca_data/train/en",
+        "ckpt": {
+            "checkpoint_every": 20
+        },
+        "data": {
+            "total_steps": 20
+        }
+    },
+    "7B_load_new_ckpt": {
+        "SEQ_LEN": 1024,
+        "HIDDEN_SIZE": 2048,
+        "NUM_ATTENTION_HEAD": 16,
+        "NUM_LAYER": 16,
+        "TRAIN_FOLDER": "local:../lm_data/alpaca_data/train/en",
+        "LOAD_CKPT_FOLDER": "local:llm_ckpts/20",
+        "ckpt": {
+            "load_ckpt_folder": "local:llm_ckpts/20",
+            "checkpoint_every": 20
+        },
+        "data": {
+            "total_steps": 40
+        }
+    },
+    "7B_load_preset_ckpt": {
+        "SEQ_LEN": 1024,
+        "HIDDEN_SIZE": 2048,
+        "NUM_ATTENTION_HEAD": 16,
+        "NUM_LAYER": 16,
+        "TRAIN_FOLDER": "local:../lm_data/alpaca_data/train/en",
+        "LOAD_CKPT_FOLDER": "local:../lm_data/alpaca_data/llm_ckpts/20",
+        "ckpt": {
+            "load_ckpt_folder": "local:../lm_data/alpaca_data/llm_ckpts/20",
+            "checkpoint_every": 20
+        },
+        "data": {
+            "total_steps": 40
+        }
+    }
+}
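Reading the three cases together: 7B_basic_train trains to step 20 with checkpoint_every=20, leaving a checkpoint at llm_ckpts/20; 7B_load_new_ckpt resumes from that freshly written checkpoint, 7B_load_preset_ckpt resumes from a preset copy kept under the data folder, and both run to total_steps=40, producing the llm_ckpts/40 snapshot that load_ckpt.sh counts. Each case is folded into ci_7B_sft.py via merge_dicts, so nested keys override individually; a minimal sketch with assumed base values:

# The real ckpt block in ci_7B_sft.py has more fields; checkpoint_every=50 is an assumed placeholder.
base_ckpt = {"checkpoint_every": 50, "load_optimizer": True}
case_ckpt = {"load_ckpt_folder": "local:llm_ckpts/20", "checkpoint_every": 20}

merged = {**base_ckpt, **case_ckpt}
# -> {"checkpoint_every": 20, "load_optimizer": True, "load_ckpt_folder": "local:llm_ckpts/20"}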