mirror of https://github.com/InternLM/InternLM

test(ci_scripts): add load ckpt cases (#208)

* fix format
* add scripts for load ckpt case
* update test config
* debug: use var in json
* fix syntax error
* export pythonpath
* use absolute path
* use parent path of workspace
* debug load new ckpt
* change data path
* add train folder
* fix code format
* fix pylint warning

---------

Co-authored-by: wangmengke <wangmengke@pjlab.org.cn>

parent 0600b42c01
commit cc3c48ae47
CI workflow (e2e test job):

@@ -66,9 +66,23 @@ jobs:
       - uses: actions/checkout@v3

       - name: slurm-train
+        id: basic_train
         run: |
           source activate internlm-env-test
           sh ./ci_scripts/train/slurm_train.sh ${GITHUB_RUN_ID}-${GITHUB_JOB}
+
+      - name: load_preset_ckpt
+        if: ${{ failure() && steps.basic_train.conclusion == 'failure' }}
+        run: |
+          source activate internlm-env-test
+          export PYTHONPATH=$PWD:$PYTHONPATH
+          sh ./ci_scripts/train/load_ckpt.sh 7B_load_preset_ckpt ${GITHUB_RUN_ID}-${GITHUB_JOB}
+
+      - name: load_new_ckpt
+        run: |
+          source activate internlm-env-test
+          export PYTHONPATH=$PWD:$PYTHONPATH
+          sh ./ci_scripts/train/load_ckpt.sh 7B_load_new_ckpt ${GITHUB_RUN_ID}-${GITHUB_JOB}
           rm -rf $GITHUB_WORKSPACE/llm_ckpts

       - name: torchrun-train
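Note the step gating: load_preset_ckpt runs only when basic_train has failed (failure() plus the explicit conclusion check), exercising resume-from-a-preset-checkpoint as a fallback diagnostic, while load_new_ckpt carries no if: condition and so runs on the normal success path, resuming from the checkpoint that slurm-train just wrote; the trailing rm -rf clears $GITHUB_WORKSPACE/llm_ckpts at the end of that step.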
ci_scripts/common/com_func.py (new file):

@@ -0,0 +1,29 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+
+
+def merge_dicts(dict_a: dict, dict_b: dict):
+    for key in dict_b.keys():
+        if isinstance(dict_b[key], dict):
+            dict_b[key] = {**dict_a[key], **dict_b[key]}
+            merge_dicts(dict_a[key], dict_b[key])
+    dict_c = {**dict_a, **dict_b}
+    return dict_c
+
+
+def format_dict_to_py_string(data: dict, indent=0, is_nested=False):
+    result = ""
+    for key, value in data.items():
+        if isinstance(value, dict):
+            result += f"{' ' * indent}{key} = dict(\n"
+            result += format_dict_to_py_string(value, indent + 4, is_nested=True)
+            result += f"{' ' * indent})"
+        else:
+            result += f"{' ' * indent}{key} = {repr(value)}"
+        if is_nested:
+            result += ","
+        result += "\n"
+    result = f"""\
+{result}
+"""
+    return result
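A minimal sketch of how these two helpers compose, with toy values rather than the real CI config: merge_dicts gives dict_b's leaves priority, merges nested dicts key by key (mutating dict_b in place), and format_dict_to_py_string renders the result as a Python config file body.

from ci_scripts.common.com_func import format_dict_to_py_string, merge_dicts

base = {"SEQ_LEN": 2048, "ckpt": {"checkpoint_every": 50, "load_optimizer": True}}
override = {"SEQ_LEN": 1024, "ckpt": {"checkpoint_every": 20}}

merged = merge_dicts(base, override)
# merged == {"SEQ_LEN": 1024, "ckpt": {"checkpoint_every": 20, "load_optimizer": True}}

# Prints (modulo blank lines):
#   SEQ_LEN = 1024
#   ckpt = dict(
#       checkpoint_every = 20,
#       load_optimizer = True,
#   )
print(format_dict_to_py_string(merged))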
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 # -*- encoding: utf-8 -*-
-from transformers import AutoTokenizer, AutoModelForCausalLM
+from transformers import AutoModelForCausalLM, AutoTokenizer

 tokenizer = AutoTokenizer.from_pretrained("internlm/internlm-chat-7b", trust_remote_code=True)
 model = AutoModelForCausalLM.from_pretrained("internlm/internlm-chat-7b", trust_remote_code=True).cuda()
ci_scripts/train/ci_7B_sft.py:

@@ -10,7 +10,7 @@ VOCAB_SIZE = 103168
 # Ckpt folder format:
 # fs: 'local:/mnt/nfs/XXX'
 # oss: 'boto3:s3://model_weights/XXX'
-MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
 # SAVE_CKPT_FOLDER = "local:llm_ckpts"
 SAVE_CKPT_FOLDER = "local:llm_ckpts"
 # LOAD_CKPT_FOLDER = "local:llm_ckpts/49"

@@ -26,7 +26,7 @@ ckpt = dict(
     load_optimizer=True,
 )

-TRAIN_FOLDER = "/mnt/petrelfs/qa-caif-cicd/data/lm_data/alpaca_data/train/en"
+TRAIN_FOLDER = "local:../lm_data/alpaca_data/train/en"
 data = dict(
     seq_len=SEQ_LEN,
     # micro_num means the number of micro_batch contained in one gradient update
ci_scripts/train/generate_config.py (new file):

@@ -0,0 +1,49 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+import argparse
+import json
+import os
+
+from ci_scripts.common import com_func
+from internlm.core.context import Config
+
+
+def generate_new_config(config_py_file, test_config_json, case_name):
+    # generate the path of the new config py
+    config_path = os.path.split(config_py_file)
+    new_config_py_file = os.path.join(config_path[0], case_name + ".py")
+
+    # merge the base config with the case overrides
+    origin_config = Config.from_file(config_py_file)
+    with open(test_config_json) as f:
+        test_config = json.load(f)
+    if test_config:
+        if case_name not in test_config.keys():
+            raise KeyError(f"the case {case_name} doesn't exist. Please check {test_config_json} again!")
+        new_config = com_func.merge_dicts(origin_config, test_config[case_name])
+        print(f"new config is:\n{new_config}")
+
+        # write the new config to a py file
+        file_content = com_func.format_dict_to_py_string(new_config)
+        with open(new_config_py_file, "w") as f:
+            f.write(file_content)
+    print(f"The new test train config file is {new_config_py_file}")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--origin_config",
+        type=str,
+        default="./ci_scripts/train/ci_7B_sft.py",
+        help="path to the origin train config file",
+    )
+    parser.add_argument(
+        "--test_config",
+        type=str,
+        default="./ci_scripts/train/test_config.json",
+        help="path to the test train config file",
+    )
+    parser.add_argument("--case_name", type=str, help="name of the case which will be run")
+    args = parser.parse_args()
+    generate_new_config(args.origin_config, args.test_config, args.case_name)
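The generator can be exercised directly for a quick sanity check (run from the repo root, with PYTHONPATH set the way the workflow sets it; the case name must exist in test_config.json):

from ci_scripts.train.generate_config import generate_new_config

# Equivalent to: python ./ci_scripts/train/generate_config.py --case_name 7B_load_new_ckpt
generate_new_config(
    "./ci_scripts/train/ci_7B_sft.py",
    "./ci_scripts/train/test_config.json",
    "7B_load_new_ckpt",
)
# Writes ./ci_scripts/train/7B_load_new_ckpt.py containing the merged settings.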
ci_scripts/train/load_ckpt.sh (new file):

@@ -0,0 +1,38 @@
+#!/bin/bash
+set -x
+
+[[ -n ${GITHUB_WORKSPACE} ]] || { echo "should set GITHUB_WORKSPACE first before ci, exit."; exit 1; }
+readonly CKPTS_PATH="$GITHUB_WORKSPACE/llm_ckpts"
+readonly CKPTS40_PATH="$GITHUB_WORKSPACE/llm_ckpts/40"
+readonly CKPTS40_OUTPUT="${CKPTS40_PATH}/*.pt"
+expected_num=21
+exit_code=0
+
+source ./ci_scripts/common/basic_func.sh
+
+echo "start to test slurm training with loading checkpoint."
+
+python ./ci_scripts/train/generate_config.py --case_name $1
+file="./ci_scripts/train/$1.py"
+if [[ ! -f ${file} ]]; then
+    echo "expect: ${file} exists, actual: not exist."
+    exit_code=$(($exit_code + 1))
+fi
+
+srun -p ${SLURM_PARTITION} --exclusive --job-name=$2 -n 8 --ntasks-per-node=8 --gpus-per-task=1 python train.py --config ${file}
+[[ $? -ne 0 ]] && { echo "test slurm training failed."; exit_code=$(($exit_code + 1)); }
+
+
+num=$(num_files "${CKPTS40_OUTPUT}")
+if [[ ${num} -ne ${expected_num} ]]; then
+    echo "expect: ${expected_num} files, actual: ${num} files."
+    exit_code=$(($exit_code + 1))
+fi
+
+# clean the test files.
+if ! rm -rf ${CKPTS_PATH}/*; then
+    echo "cleaning cached file in ${CKPTS_PATH} failed."
+    exit_code=$(($exit_code + 1))
+fi
+
+exit $exit_code
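num_files comes from basic_func.sh, which is outside this diff; in spirit the checkpoint assertion is just a glob count over the step-40 snapshot. A hypothetical standalone equivalent, not the CI helper itself:

import glob

expected_num = 21  # number of .pt files the 8-GPU run is expected to write at step 40
num = len(glob.glob("llm_ckpts/40/*.pt"))
if num != expected_num:
    print(f"expect: {expected_num} files, actual: {num} files.")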
ci_scripts/train/slurm_train.sh (the cleanup moves out of this script so the load cases can reuse the checkpoints it writes; the workflow's rm -rf step now does the final cleanup):

@@ -28,10 +28,4 @@ if [[ ${num} -ne ${expected_num} ]]; then
     exit_code=$(($exit_code + 1))
 fi

-# clean the test files.
-if ! rm -rf ${CKPTS_PATH}/*; then
-    echo "cleaning cached file in ${CKPTS_PATH} failed."
-    exit_code=$(($exit_code + 1))
-fi
-
 exit $exit_code
ci_scripts/train/test_config.json (new file):

@@ -0,0 +1,45 @@
+{
+    "7B_basic_train": {
+        "SEQ_LEN": 1024,
+        "HIDDEN_SIZE": 2048,
+        "NUM_ATTENTION_HEAD": 16,
+        "NUM_LAYER": 16,
+        "TRAIN_FOLDER": "local:../lm_data/alpaca_data/train/en",
+        "ckpt": {
+            "checkpoint_every": 20
+        },
+        "data": {
+            "total_steps": 20
+        }
+    },
+    "7B_load_new_ckpt": {
+        "SEQ_LEN": 1024,
+        "HIDDEN_SIZE": 2048,
+        "NUM_ATTENTION_HEAD": 16,
+        "NUM_LAYER": 16,
+        "TRAIN_FOLDER": "local:../lm_data/alpaca_data/train/en",
+        "LOAD_CKPT_FOLDER": "local:llm_ckpts/20",
+        "ckpt": {
+            "load_ckpt_folder": "local:llm_ckpts/20",
+            "checkpoint_every": 20
+        },
+        "data": {
+            "total_steps": 40
+        }
+    },
+    "7B_load_preset_ckpt": {
+        "SEQ_LEN": 1024,
+        "HIDDEN_SIZE": 2048,
+        "NUM_ATTENTION_HEAD": 16,
+        "NUM_LAYER": 16,
+        "TRAIN_FOLDER": "local:../lm_data/alpaca_data/train/en",
+        "LOAD_CKPT_FOLDER": "local:../lm_data/alpaca_data/llm_ckpts/20",
+        "ckpt": {
+            "load_ckpt_folder": "local:../lm_data/alpaca_data/llm_ckpts/20",
+            "checkpoint_every": 20
+        },
+        "data": {
+            "total_steps": 40
+        }
+    }
+}
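Reading the three cases together: 7B_basic_train trains to step 20 with checkpoint_every=20, leaving a checkpoint at llm_ckpts/20; 7B_load_new_ckpt resumes from that freshly written checkpoint, 7B_load_preset_ckpt resumes from a preset copy kept under the data folder, and both run to total_steps=40, producing the llm_ckpts/40 snapshot that load_ckpt.sh counts. Each case is folded into ci_7B_sft.py via merge_dicts, so nested keys override individually; a minimal sketch with assumed base values:

# The real ckpt block in ci_7B_sft.py has more fields; checkpoint_every=50 is an assumed placeholder.
base_ckpt = {"checkpoint_every": 50, "load_optimizer": True}
case_ckpt = {"load_ckpt_folder": "local:llm_ckpts/20", "checkpoint_every": 20}

merged = {**base_ckpt, **case_ckpt}
# -> {"checkpoint_every": 20, "load_optimizer": True, "load_ckpt_folder": "local:llm_ckpts/20"}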