mirror of https://github.com/InternLM/InternLM
test(ci_scripts): add load ckpt cases (#208)
* fix format
* add scripts for load ckpt case
* update test config
* debug: use var in json
* fix syntax error
* export pythonpath
* use absolute path
* use parent path of workspace
* debug load new ckpt
* change data path
* add train folder
* fix code format
* fix pylint warning

Co-authored-by: wangmengke <wangmengke@pjlab.org.cn>
parent 0600b42c01
commit cc3c48ae47
@@ -66,9 +66,23 @@ jobs:
      - uses: actions/checkout@v3

      - name: slurm-train
        id: basic_train
        run: |
          source activate internlm-env-test
          sh ./ci_scripts/train/slurm_train.sh ${GITHUB_RUN_ID}-${GITHUB_JOB}

      - name: load_preset_ckpt
        if: ${{ failure() && steps.basic_train.conclusion == 'failure' }}
        run: |
          source activate internlm-env-test
          export PYTHONPATH=$PWD:$PYTHONPATH
          sh ./ci_scripts/train/load_ckpt.sh 7B_load_preset_ckpt ${GITHUB_RUN_ID}-${GITHUB_JOB}

      - name: load_new_ckpt
        run: |
          source activate internlm-env-test
          export PYTHONPATH=$PWD:$PYTHONPATH
          sh ./ci_scripts/train/load_ckpt.sh 7B_load_new_ckpt ${GITHUB_RUN_ID}-${GITHUB_JOB}
          rm -rf $GITHUB_WORKSPACE/llm_ckpts

      - name: torchrun-train
@@ -0,0 +1,29 @@
#!/usr/bin/env python
# -*- encoding: utf-8 -*-


def merge_dicts(dict_a: dict, dict_b: dict):
    for key in dict_b.keys():
        if isinstance(dict_b[key], dict):
            dict_b[key] = {**dict_a[key], **dict_b[key]}
            merge_dicts(dict_a[key], dict_b[key])
    dict_c = {**dict_a, **dict_b}
    return dict_c


def format_dict_to_py_string(data: dict, indent=0, is_nested=False):
    result = ""
    for key, value in data.items():
        if isinstance(value, dict):
            result += f"{' ' * indent}{key} = dict(\n"
            result += format_dict_to_py_string(value, indent + 4, is_nested=True)
            result += f"{' ' * indent})"
        else:
            result += f"{' ' * indent}{key} = {repr(value)}"
        if is_nested:
            result += ","
        result += "\n"
    result = f"""\
{result}
"""
    return result
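For context (an illustrative aside, not one of the changed files): the two helpers above are meant to compose. merge_dicts overlays a per-case override dict onto the base training config, and format_dict_to_py_string renders the merged dict back into key = value / key = dict(...) lines that can be written out as a Python config file. A minimal usage sketch, assuming the helpers live in ci_scripts/common/com_func.py as the import in generate_config.py below suggests; the toy dicts are invented purely for illustration:

    from ci_scripts.common.com_func import format_dict_to_py_string, merge_dicts

    # Toy stand-ins for the base config and a per-case override (not real values).
    base = {"SEQ_LEN": 2048, "ckpt": {"checkpoint_every": 50, "load_optimizer": True}}
    override = {"SEQ_LEN": 1024, "ckpt": {"checkpoint_every": 20}}

    merged = merge_dicts(base, override)
    # merged == {"SEQ_LEN": 1024, "ckpt": {"checkpoint_every": 20, "load_optimizer": True}}

    print(format_dict_to_py_string(merged))
    # Prints roughly:
    # SEQ_LEN = 1024
    # ckpt = dict(
    #     checkpoint_every = 20,
    #     load_optimizer = True,
    # )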
@@ -1,6 +1,6 @@
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("internlm/internlm-chat-7b", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained("internlm/internlm-chat-7b", trust_remote_code=True).cuda()
@@ -10,7 +10,7 @@ VOCAB_SIZE = 103168
# Ckpt folder format:
# fs: 'local:/mnt/nfs/XXX'
# oss: 'boto3:s3://model_weights/XXX'
MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
# MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
# SAVE_CKPT_FOLDER = "local:llm_ckpts"
SAVE_CKPT_FOLDER = "local:llm_ckpts"
# LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
@@ -26,7 +26,7 @@ ckpt = dict(
    load_optimizer=True,
)

TRAIN_FOLDER = "/mnt/petrelfs/qa-caif-cicd/data/lm_data/alpaca_data/train/en"
TRAIN_FOLDER = "local:../lm_data/alpaca_data/train/en"
data = dict(
    seq_len=SEQ_LEN,
    # micro_num means the number of micro_batch contained in one gradient update
@@ -0,0 +1,49 @@
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
import argparse
import json
import os

from ci_scripts.common import com_func
from internlm.core.context import Config


def generate_new_config(config_py_file, test_config_json, case_name):
    # generate path of the new config py
    config_path = os.path.split(config_py_file)
    new_config_py_file = os.path.join(config_path[0], case_name + ".py")

    # merge dict
    origin_config = Config.from_file(config_py_file)
    with open(test_config_json) as f:
        test_config = json.load(f)
    if test_config:
        if case_name not in test_config.keys():
            raise KeyError(f"the {case_name} doesn't exist. Please check {test_config} again!")
        new_config = com_func.merge_dicts(origin_config, test_config[case_name])
        print(f"new config is:\n{new_config}")

        # write new config to py file
        file_content = com_func.format_dict_to_py_string(new_config)
        with open(new_config_py_file, "w") as f:
            f.write(file_content)
        print(f"The new test train config file is {new_config_py_file}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--origin_config",
        type=str,
        default="./ci_scripts/train/ci_7B_sft.py",
        help="path to the origin train config file",
    )
    parser.add_argument(
        "--test_config",
        type=str,
        default="./ci_scripts/train/test_config.json",
        help="path to the test train config file",
    )
    parser.add_argument("--case_name", type=str, help="name of the case which will be run")
    args = parser.parse_args()
    generate_new_config(args.origin_config, args.test_config, args.case_name)
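For orientation (not part of the diff): the load_ckpt.sh script below drives this file through its CLI, python ./ci_scripts/train/generate_config.py --case_name <case>, relying on the argparse defaults for the two config paths. The same merge can be exercised directly from Python; a minimal sketch, assuming it is run from the repository root with the ci_scripts and internlm packages importable (the workflow exports PYTHONPATH=$PWD:$PYTHONPATH for exactly this reason):

    from ci_scripts.train.generate_config import generate_new_config

    # Overlay the "7B_load_new_ckpt" case from test_config.json onto the base
    # ci_7B_sft.py config; the merged result is written alongside the origin
    # config as ./ci_scripts/train/7B_load_new_ckpt.py.
    generate_new_config(
        config_py_file="./ci_scripts/train/ci_7B_sft.py",
        test_config_json="./ci_scripts/train/test_config.json",
        case_name="7B_load_new_ckpt",
    )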
@@ -0,0 +1,38 @@
#!/bin/bash
set -x

[[ -n ${GITHUB_WORKSPACE} ]] || { echo "should set GITHUB_WORKSPACE first before ci, exit."; exit 1; }
readonly CKPTS_PATH="$GITHUB_WORKSPACE/llm_ckpts"
readonly CKPTS40_PATH="$GITHUB_WORKSPACE/llm_ckpts/40"
readonly CKPTS40_OUTPUT="${CKPTS40_PATH}/*.pt"
expected_num=21
exit_code=0

source ./ci_scripts/common/basic_func.sh

echo "start to test slurm training with loading checkpoint."

python ./ci_scripts/train/generate_config.py --case_name $1
file="./ci_scripts/train/$1.py"
if [[ ! -f ${file} ]]; then
    echo "expect: ${file} exists, actual: not exist."
    exit_code=$(($exit_code + 1))
fi

srun -p ${SLURM_PARTITION} --exclusive --job-name=$2 -n 8 --ntasks-per-node=8 --gpus-per-task=1 python train.py --config ${file}
[[ $? -ne 0 ]] && { echo "test slurm training failed."; exit_code=$(($exit_code + 1)); }

num=$(num_files "${CKPTS40_OUTPUT}")
if [[ ${num} -ne ${expected_num} ]]; then
    echo "expect: ${expected_num} files, actual: ${num} files."
    exit_code=$(($exit_code + 1))
fi

# clean the test files.
if ! rm -rf ${CKPTS_PATH}/*; then
    echo "cleaning cached file in ${CKPTS_PATH} failed."
    exit_code=$(($exit_code + 1))
fi

exit $exit_code
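Note that num_files is defined in ci_scripts/common/basic_func.sh, which this diff does not touch. As a rough illustration only (the glob-based helper below is an assumption, not the CI code), the checkpoint-count check amounts to the following Python sketch: after loading a step-20 checkpoint and training to total_steps 40 with checkpoint_every 20, the run is expected to leave 21 *.pt files under $GITHUB_WORKSPACE/llm_ckpts/40.

    import glob
    import os

    # Hypothetical stand-in for the num_files helper from basic_func.sh:
    # count the checkpoint files written for step 40 and compare with the expected 21.
    ckpt_dir = os.path.join(os.environ["GITHUB_WORKSPACE"], "llm_ckpts", "40")
    num = len(glob.glob(os.path.join(ckpt_dir, "*.pt")))
    if num != 21:
        print(f"expect: 21 files, actual: {num} files.")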
@@ -28,10 +28,4 @@ if [[ ${num} -ne ${expected_num} ]]; then
    exit_code=$(($exit_code + 1))
fi

# clean the test files.
if ! rm -rf ${CKPTS_PATH}/*; then
    echo "cleaning cached file in ${CKPTS_PATH} failed."
    exit_code=$(($exit_code + 1))
fi

exit $exit_code
@@ -0,0 +1,45 @@
{
    "7B_basic_train": {
        "SEQ_LEN": 1024,
        "HIDDEN_SIZE": 2048,
        "NUM_ATTENTION_HEAD": 16,
        "NUM_LAYER": 16,
        "TRAIN_FOLDER": "local:../lm_data/alpaca_data/train/en",
        "ckpt": {
            "checkpoint_every": 20
        },
        "data": {
            "total_steps": 20
        }
    },
    "7B_load_new_ckpt": {
        "SEQ_LEN": 1024,
        "HIDDEN_SIZE": 2048,
        "NUM_ATTENTION_HEAD": 16,
        "NUM_LAYER": 16,
        "TRAIN_FOLDER": "local:../lm_data/alpaca_data/train/en",
        "LOAD_CKPT_FOLDER": "local:llm_ckpts/20",
        "ckpt": {
            "load_ckpt_folder": "local:llm_ckpts/20",
            "checkpoint_every": 20
        },
        "data": {
            "total_steps": 40
        }
    },
    "7B_load_preset_ckpt": {
        "SEQ_LEN": 1024,
        "HIDDEN_SIZE": 2048,
        "NUM_ATTENTION_HEAD": 16,
        "NUM_LAYER": 16,
        "TRAIN_FOLDER": "local:../lm_data/alpaca_data/train/en",
        "LOAD_CKPT_FOLDER": "local:../lm_data/alpaca_data/llm_ckpts/20",
        "ckpt": {
            "load_ckpt_folder": "local:../lm_data/alpaca_data/llm_ckpts/20",
            "checkpoint_every": 20
        },
        "data": {
            "total_steps": 40
        }
    }
}
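To make the override mechanism concrete: generate_config.py merges each top-level case in this JSON over ci_7B_sft.py and writes the result as ./ci_scripts/train/<case>.py. For 7B_load_new_ckpt the generated file would therefore contain a fragment roughly like the sketch below (the load_optimizer field is visible in the ci_7B_sft.py hunk above; other base-config fields are omitted here, exact ordering and layout follow merge_dicts and format_dict_to_py_string):

    SEQ_LEN = 1024
    HIDDEN_SIZE = 2048
    NUM_ATTENTION_HEAD = 16
    NUM_LAYER = 16
    TRAIN_FOLDER = 'local:../lm_data/alpaca_data/train/en'
    ckpt = dict(
        load_optimizer = True,
        load_ckpt_folder = 'local:llm_ckpts/20',
        checkpoint_every = 20,
    )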