test(ci_scripts): add load ckpt cases (#208)

* fix format

* add scripts for load ckpt case

* update test config

* debug: use var in json

* fix syntax error

* export pythonpath

* use absolute path

* use parent path of workspace

* debug load new ckpt

* change data path

* add train folder

* fix code format

* fix pylint warning

---------

Co-authored-by: wangmengke <wangmengke@pjlab.org.cn>
kkscilife 2023-08-21 15:24:43 +08:00 committed by GitHub
parent 0600b42c01
commit cc3c48ae47
14 changed files with 193 additions and 24 deletions


@@ -1,5 +1,5 @@
name: demo-in-readme
on:
pull_request:
branches:
- "main"
@@ -39,7 +39,7 @@ jobs:
needs: check-requirements
runs-on: [lmtest]
steps:
- name: mask env
run: |
echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
- uses: actions/checkout@v3
@@ -60,15 +60,29 @@
runs-on: [lmtest]
timeout-minutes: 30
steps:
- name: mask env
run: |
echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
- uses: actions/checkout@v3
- name: slurm-train
id: basic_train
run: |
source activate internlm-env-test
sh ./ci_scripts/train/slurm_train.sh ${GITHUB_RUN_ID}-${GITHUB_JOB}
- name: load_preset_ckpt
if: ${{ failure() && steps.basic_train.conclusion == 'failure' }}
run: |
source activate internlm-env-test
export PYTHONPATH=$PWD:$PYTHONPATH
sh ./ci_scripts/train/load_ckpt.sh 7B_load_preset_ckpt ${GITHUB_RUN_ID}-${GITHUB_JOB}
- name: load_new_ckpt
run: |
source activate internlm-env-test
export PYTHONPATH=$PWD:$PYTHONPATH
sh ./ci_scripts/train/load_ckpt.sh 7B_load_new_ckpt ${GITHUB_RUN_ID}-${GITHUB_JOB}
rm -rf $GITHUB_WORKSPACE/llm_ckpts
- name: torchrun-train
@@ -91,18 +105,18 @@
run: |
source activate internlm-env-test
export PYTHONPATH=$PWD:$PYTHONPATH
sh ./ci_scripts/model/convert_to_hf.sh
cd ./hf_ckpt
srun -p ${SLURM_PARTITION} --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} --gpus-per-task=2 python ../ci_scripts/model/loaded_as_transformer.py
cd ..
rm -rf $GITHUB_WORKSPACE/hf_ckpt
load-chat-model-in-hf:
if: ${{ always() }}
needs: check-requirements
runs-on: [lmtest]
steps:
- name: mask env
run: |
echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
- uses: actions/checkout@v3


@@ -1,6 +1,6 @@
name: lint-check
on:
push:
pull_request:
branches:


@@ -1,7 +1,7 @@
name: Sonarqube
on:
workflow_dispatch:
jobs:
sonarqube:
name: SonarQube Scan
@@ -13,4 +13,4 @@ jobs:
- uses: sonarsource/sonarqube-scan-action@master
env:
SONAR_TOKEN: ${{ secrets.SONAR_TOKEN }}
SONAR_HOST_URL: ${{ secrets.SONAR_HOST_URL }}


@@ -0,0 +1,29 @@
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
def merge_dicts(dict_a: dict, dict_b: dict):
for key in dict_b.keys():
if isinstance(dict_b[key], dict):
dict_b[key] = {**dict_a[key], **dict_b[key]}
merge_dicts(dict_a[key], dict_b[key])
dict_c = {**dict_a, **dict_b}
return dict_c
def format_dict_to_py_string(data: dict, indent=0, is_nested=False):
result = ""
for key, value in data.items():
if isinstance(value, dict):
result += f"{' ' * indent}{key} = dict(\n"
result += format_dict_to_py_string(value, indent + 4, is_nested=True)
result += f"{' ' * indent})"
else:
result += f"{' ' * indent}{key} = {repr(value)}"
if is_nested:
result += ","
result += "\n"
result = f"""\
{result}
"""
return result
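
As a rough illustration (not part of the commit), this is how the two helpers above are meant to compose: per-case overrides, such as the ones defined in test_config.json later in this diff, are deep-merged into the base config dict and the result is rendered back into Python config-file syntax. The values below are made up for the example; only merge_dicts and format_dict_to_py_string from above are assumed to be in scope.

base = {"SEQ_LEN": 2048, "ckpt": {"checkpoint_every": 50, "load_optimizer": True}}
overrides = {"SEQ_LEN": 1024, "ckpt": {"checkpoint_every": 20}}

merged = merge_dicts(base, overrides)
# Nested dicts are merged key by key and scalars from `overrides` win:
# {"SEQ_LEN": 1024, "ckpt": {"checkpoint_every": 20, "load_optimizer": True}}

print(format_dict_to_py_string(merged))
# Prints, roughly:
# SEQ_LEN = 1024
# ckpt = dict(
#     checkpoint_every = 20,
#     load_optimizer = True,
# )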


@@ -16,7 +16,7 @@ exit_code=0
source ./ci_scripts/common/basic_func.sh
echo "start to test alpaca_tokenizer.py."
echo "start to test alpaca_tokenizer.py."
if [[ -d ${RESULTS} ]]; then
if ! rm -rf ${RESULTS}/*; then


@@ -12,7 +12,7 @@ exit_code=0
source ./ci_scripts/common/basic_func.sh
echo "start to test tokenizer.py."
echo "start to test tokenizer.py."
num=$(num_files "${RESULTS}")
if [[ ${num} -gt 0 ]]; then


@@ -40,7 +40,7 @@ num=$(num_files "${CKPTS_OUTPUT}")
if [[ ${num} -ne ${expected_num} ]]; then
echo "expect: ${expected_num} files, actual: ${num} files."
exit_code=$(($exit_code + 1))
fi
# NOTICE: should not remove the cached files, because the cached files will be used in the next test case.


@@ -1,6 +1,6 @@
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
-from transformers import AutoTokenizer, AutoModelForCausalLM
+from transformers import AutoModelForCausalLM, AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("internlm/internlm-chat-7b", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained("internlm/internlm-chat-7b", trust_remote_code=True).cuda()


@@ -10,10 +10,10 @@ VOCAB_SIZE = 103168
# Ckpt folder format:
# fs: 'local:/mnt/nfs/XXX'
# oss: 'boto3:s3://model_weights/XXX'
-MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
-#SAVE_CKPT_FOLDER = "local:llm_ckpts"
+# MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# SAVE_CKPT_FOLDER = "local:llm_ckpts"
SAVE_CKPT_FOLDER = "local:llm_ckpts"
-#LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+# LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
ckpt = dict(
# Path to save training ckpt.
save_ckpt_folder=SAVE_CKPT_FOLDER,
@@ -26,7 +26,7 @@ ckpt = dict(
load_optimizer=True,
)
TRAIN_FOLDER = "/mnt/petrelfs/qa-caif-cicd/data/lm_data/alpaca_data/train/en"
TRAIN_FOLDER = "local:../lm_data/alpaca_data/train/en"
data = dict(
seq_len=SEQ_LEN,
# micro_num means the number of micro_batch contained in one gradient update


@@ -0,0 +1,49 @@
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
import argparse
import json
import os
from ci_scripts.common import com_func
from internlm.core.context import Config
def generate_new_config(config_py_file, test_config_json, case_name):
# generate path of the new config py
config_path = os.path.split(config_py_file)
new_config_py_file = os.path.join(config_path[0], case_name + ".py")
# merge dict
origin_config = Config.from_file(config_py_file)
with open(test_config_json) as f:
test_config = json.load(f)
if test_config:
if case_name not in test_config.keys():
raise KeyError(f"the {case_name} doesn't exist. Please check {test_config} again!")
new_config = com_func.merge_dicts(origin_config, test_config[case_name])
print(f"new config is:\n{new_config}")
# write new config to py file
file_content = com_func.format_dict_to_py_string(new_config)
with open(new_config_py_file, "w") as f:
f.write(file_content)
print(f"The new test train config file is {new_config_py_file}")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--origin_config",
type=str,
default="./ci_scripts/train/ci_7B_sft.py",
help="path to the origin train config file",
)
parser.add_argument(
"--test_config",
type=str,
default="./ci_scripts/train/test_config.json",
help="path to the test train config file",
)
parser.add_argument("--case_name", type=str, help="name of the case which will be runned ")
args = parser.parse_args()
generate_new_config(args.origin_config, args.test_config, args.case_name)
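
For reference, a hypothetical invocation mirroring what load_ckpt.sh below does: the case name must be a key of test_config.json, and the merged config is written next to the origin config as ./ci_scripts/train/<case_name>.py.

# From the repository root, with PYTHONPATH including the repo (as the workflow exports):
#   python ./ci_scripts/train/generate_config.py --case_name 7B_load_new_ckpt
# Equivalently, calling the function directly with its defaults spelled out:
generate_new_config(
    config_py_file="./ci_scripts/train/ci_7B_sft.py",
    test_config_json="./ci_scripts/train/test_config.json",
    case_name="7B_load_new_ckpt",
)
# Writes ./ci_scripts/train/7B_load_new_ckpt.py, which load_ckpt.sh then passes to train.py.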


@@ -0,0 +1,38 @@
#!/bin/bash
set -x
[[ -n ${GITHUB_WORKSPACE} ]] || { echo "should set GITHUB_WORKSPACE first before ci, exit."; exit 1; }
readonly CKPTS_PATH="$GITHUB_WORKSPACE/llm_ckpts"
readonly CKPTS40_PATH="$GITHUB_WORKSPACE/llm_ckpts/40"
readonly CKPTS40_OUTPUT="${CKPTS40_PATH}/*.pt"
expected_num=21
exit_code=0
source ./ci_scripts/common/basic_func.sh
echo "start to test slurm training with loading checkpoint."
python ./ci_scripts/train/generate_config.py --case_name $1
file="./ci_scripts/train/$1.py"
if [[ ! -f ${file} ]]; then
echo "expect: ${file} exists, actual: not exist."
exit_code=$(($exit_code + 1))
fi
srun -p ${SLURM_PARTITION} --exclusive --job-name=$2 -n 8 --ntasks-per-node=8 --gpus-per-task=1 python train.py --config ${file}
[[ $? -ne 0 ]] && { echo "test slurm training failed."; exit_code=$(($exit_code + 1)); }
num=$(num_files "${CKPTS40_OUTPUT}")
if [[ ${num} -ne ${expected_num} ]]; then
echo "expect: ${expected_num} files, actual: ${num} files."
exit_code=$(($exit_code + 1))
fi
# clean the test files.
if ! rm -rf ${CKPTS_PATH}/*; then
echo "cleaning cached file in ${CKPTS_PATH} failed."
exit_code=$(($exit_code + 1))
fi
exit $exit_code


@@ -25,12 +25,6 @@ srun -p ${SLURM_PARTITION} --exclusive --job-name=$1 -n 8 --ntasks-per-node=8 --
num=$(num_files "${CKPTS20_OUTPUT}")
if [[ ${num} -ne ${expected_num} ]]; then
echo "expect: ${expected_num} files, actual: ${num} files."
exit_code=$(($exit_code + 1))
fi
-# clean the test files.
-if ! rm -rf ${CKPTS_PATH}/*; then
-    echo "cleaning cached file in ${CKPTS_PATH} failed."
-    exit_code=$(($exit_code + 1))
-fi


@@ -0,0 +1,45 @@
{
"7B_basic_train": {
"SEQ_LEN": 1024,
"HIDDEN_SIZE": 2048,
"NUM_ATTENTION_HEAD": 16,
"NUM_LAYER": 16,
"TRAIN_FOLDER":"local:../lm_data/alpaca_data/train/en",
"ckpt": {
"checkpoint_every": 20
},
"data": {
"total_steps": 20
}
},
"7B_load_new_ckpt": {
"SEQ_LEN": 1024,
"HIDDEN_SIZE": 2048,
"NUM_ATTENTION_HEAD": 16,
"NUM_LAYER": 16,
"TRAIN_FOLDER":"local:../lm_data/alpaca_data/train/en",
"LOAD_CKPT_FOLDER": "local:llm_ckpts/20",
"ckpt": {
"load_ckpt_folder": "local:llm_ckpts/20",
"checkpoint_every": 20
},
"data": {
"total_steps": 40
}
},
"7B_load_preset_ckpt": {
"SEQ_LEN": 1024,
"HIDDEN_SIZE": 2048,
"NUM_ATTENTION_HEAD": 16,
"NUM_LAYER": 16,
"TRAIN_FOLDER":"local:../lm_data/alpaca_data/train/en",
"LOAD_CKPT_FOLDER": "local:../lm_data/alpaca_data/llm_ckpts/20",
"ckpt": {
"load_ckpt_folder": "local:../lm_data/alpaca_data/llm_ckpts/20",
"checkpoint_every": 20
},
"data": {
"total_steps": 40
}
}
}
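
To make the override semantics concrete: the upper-case scalar keys replace the module-level constants of ci_7B_sft.py, while the nested "ckpt" and "data" objects are deep-merged into the corresponding dicts, so fields not listed here keep their base values. For the 7B_load_new_ckpt case, the generated 7B_load_new_ckpt.py should therefore look roughly like the sketch below (only fields visible in this diff are shown; everything else carries over from ci_7B_sft.py).

SEQ_LEN = 1024
HIDDEN_SIZE = 2048
NUM_ATTENTION_HEAD = 16
NUM_LAYER = 16
TRAIN_FOLDER = 'local:../lm_data/alpaca_data/train/en'
LOAD_CKPT_FOLDER = 'local:llm_ckpts/20'
ckpt = dict(
    # resume from the step-20 checkpoint written by the preceding basic_train run
    load_ckpt_folder = 'local:llm_ckpts/20',
    checkpoint_every = 20,
    # save_ckpt_folder, load_optimizer, ... carried over from ci_7B_sft.py
)
data = dict(
    total_steps = 40,
    # seq_len, micro_num, ... carried over from ci_7B_sft.py
)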


@@ -25,7 +25,7 @@ srun -p ${SLURM_PARTITION} --exclusive --job-name=$1 -N 1 torchrun --nnodes=1 --
num=$(num_files "${CKPTS_OUTPUT}")
if [[ ${num} -ne ${expected_num} ]]; then
echo "expect: ${expected_num} files, actual: ${num} files."
exit_code=$(($exit_code + 1))
fi
# clean the test files.