mirror of https://github.com/InternLM/InternLM
test(ci_scripts): add load ckpt cases (#208)
* fix format
* add scripts for load ckpt case
* update test config
* debug: use var in json
* fix syntax error
* export pythonpath
* use absolute path
* use parent path of workspace
* debug load new ckpt
* change data path
* add train folder
* fix code format
* fix pylint warning

Co-authored-by: wangmengke <wangmengke@pjlab.org.cn>
parent 0600b42c01
commit cc3c48ae47
@@ -66,9 +66,23 @@ jobs:
      - uses: actions/checkout@v3

      - name: slurm-train
        id: basic_train
        run: |
          source activate internlm-env-test
          sh ./ci_scripts/train/slurm_train.sh ${GITHUB_RUN_ID}-${GITHUB_JOB}

      - name: load_preset_ckpt
        if: ${{ failure() && steps.basic_train.conclusion == 'failure' }}
        run: |
          source activate internlm-env-test
          export PYTHONPATH=$PWD:$PYTHONPATH
          sh ./ci_scripts/train/load_ckpt.sh 7B_load_preset_ckpt ${GITHUB_RUN_ID}-${GITHUB_JOB}

      - name: load_new_ckpt
        run: |
          source activate internlm-env-test
          export PYTHONPATH=$PWD:$PYTHONPATH
          sh ./ci_scripts/train/load_ckpt.sh 7B_load_new_ckpt ${GITHUB_RUN_ID}-${GITHUB_JOB}
          rm -rf $GITHUB_WORKSPACE/llm_ckpts

      - name: torchrun-train
@@ -0,0 +1,29 @@
#!/usr/bin/env python
# -*- encoding: utf-8 -*-


def merge_dicts(dict_a: dict, dict_b: dict):
    for key in dict_b.keys():
        if isinstance(dict_b[key], dict):
            dict_b[key] = {**dict_a[key], **dict_b[key]}
            merge_dicts(dict_a[key], dict_b[key])
    dict_c = {**dict_a, **dict_b}
    return dict_c


def format_dict_to_py_string(data: dict, indent=0, is_nested=False):
    result = ""
    for key, value in data.items():
        if isinstance(value, dict):
            result += f"{' ' * indent}{key} = dict(\n"
            result += format_dict_to_py_string(value, indent + 4, is_nested=True)
            result += f"{' ' * indent})"
        else:
            result += f"{' ' * indent}{key} = {repr(value)}"
        if is_nested:
            result += ","
        result += "\n"
    result = f"""\
{result}
"""
    return result
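For context (an illustrative aside, not one of the changed files): the two helpers above are meant to compose. merge_dicts overlays a per-case override dict onto the base training config, and format_dict_to_py_string renders the merged dict back into key = value / key = dict(...) lines that can be written out as a Python config file. A minimal usage sketch, assuming the helpers live in ci_scripts/common/com_func.py as the import in generate_config.py below suggests; the toy dicts are invented purely for illustration:

    from ci_scripts.common.com_func import format_dict_to_py_string, merge_dicts

    # Toy stand-ins for the base config and a per-case override (not real values).
    base = {"SEQ_LEN": 2048, "ckpt": {"checkpoint_every": 50, "load_optimizer": True}}
    override = {"SEQ_LEN": 1024, "ckpt": {"checkpoint_every": 20}}

    merged = merge_dicts(base, override)
    # merged == {"SEQ_LEN": 1024, "ckpt": {"checkpoint_every": 20, "load_optimizer": True}}

    print(format_dict_to_py_string(merged))
    # Prints roughly:
    # SEQ_LEN = 1024
    # ckpt = dict(
    #     checkpoint_every = 20,
    #     load_optimizer = True,
    # )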
@@ -1,6 +1,6 @@
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("internlm/internlm-chat-7b", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained("internlm/internlm-chat-7b", trust_remote_code=True).cuda()
@@ -10,7 +10,7 @@ VOCAB_SIZE = 103168
# Ckpt folder format:
# fs: 'local:/mnt/nfs/XXX'
# oss: 'boto3:s3://model_weights/XXX'
MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
# MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
# SAVE_CKPT_FOLDER = "local:llm_ckpts"
SAVE_CKPT_FOLDER = "local:llm_ckpts"
# LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
@@ -26,7 +26,7 @@ ckpt = dict(
    load_optimizer=True,
)

TRAIN_FOLDER = "/mnt/petrelfs/qa-caif-cicd/data/lm_data/alpaca_data/train/en"
TRAIN_FOLDER = "local:../lm_data/alpaca_data/train/en"
data = dict(
    seq_len=SEQ_LEN,
    # micro_num means the number of micro_batch contained in one gradient update
@@ -0,0 +1,49 @@
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
import argparse
import json
import os

from ci_scripts.common import com_func
from internlm.core.context import Config


def generate_new_config(config_py_file, test_config_json, case_name):
    # generate path of the new config py
    config_path = os.path.split(config_py_file)
    new_config_py_file = os.path.join(config_path[0], case_name + ".py")

    # merge dict
    origin_config = Config.from_file(config_py_file)
    with open(test_config_json) as f:
        test_config = json.load(f)
    if test_config:
        if case_name not in test_config.keys():
            raise KeyError(f"the {case_name} doesn't exist. Please check {test_config} again!")
        new_config = com_func.merge_dicts(origin_config, test_config[case_name])
        print(f"new config is:\n{new_config}")

        # write new config to py file
        file_content = com_func.format_dict_to_py_string(new_config)
        with open(new_config_py_file, "w") as f:
            f.write(file_content)
        print(f"The new test train config file is {new_config_py_file}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--origin_config",
        type=str,
        default="./ci_scripts/train/ci_7B_sft.py",
        help="path to the origin train config file",
    )
    parser.add_argument(
        "--test_config",
        type=str,
        default="./ci_scripts/train/test_config.json",
        help="path to the test train config file",
    )
    parser.add_argument("--case_name", type=str, help="name of the case which will be run")
    args = parser.parse_args()
    generate_new_config(args.origin_config, args.test_config, args.case_name)
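For orientation (not part of the diff): the load_ckpt.sh script below drives this file through its CLI, python ./ci_scripts/train/generate_config.py --case_name <case>, relying on the argparse defaults for the two config paths. The same merge can be exercised directly from Python; a minimal sketch, assuming it is run from the repository root with the ci_scripts and internlm packages importable (the workflow exports PYTHONPATH=$PWD:$PYTHONPATH for exactly this reason):

    from ci_scripts.train.generate_config import generate_new_config

    # Overlay the "7B_load_new_ckpt" case from test_config.json onto the base
    # ci_7B_sft.py config; the merged result is written alongside the origin
    # config as ./ci_scripts/train/7B_load_new_ckpt.py.
    generate_new_config(
        config_py_file="./ci_scripts/train/ci_7B_sft.py",
        test_config_json="./ci_scripts/train/test_config.json",
        case_name="7B_load_new_ckpt",
    )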
@@ -0,0 +1,38 @@
#!/bin/bash
set -x

[[ -n ${GITHUB_WORKSPACE} ]] || { echo "should set GITHUB_WORKSPACE first before ci, exit."; exit 1; }
readonly CKPTS_PATH="$GITHUB_WORKSPACE/llm_ckpts"
readonly CKPTS40_PATH="$GITHUB_WORKSPACE/llm_ckpts/40"
readonly CKPTS40_OUTPUT="${CKPTS40_PATH}/*.pt"
expected_num=21
exit_code=0

source ./ci_scripts/common/basic_func.sh

echo "start to test slurm training with loading checkpoint."

python ./ci_scripts/train/generate_config.py --case_name $1
file="./ci_scripts/train/$1.py"
if [[ ! -f ${file} ]]; then
    echo "expect: ${file} exists, actual: not exist."
    exit_code=$(($exit_code + 1))
fi

srun -p ${SLURM_PARTITION} --exclusive --job-name=$2 -n 8 --ntasks-per-node=8 --gpus-per-task=1 python train.py --config ${file}
[[ $? -ne 0 ]] && { echo "test slurm training failed."; exit_code=$(($exit_code + 1)); }

num=$(num_files "${CKPTS40_OUTPUT}")
if [[ ${num} -ne ${expected_num} ]]; then
    echo "expect: ${expected_num} files, actual: ${num} files."
    exit_code=$(($exit_code + 1))
fi

# clean the test files.
if ! rm -rf ${CKPTS_PATH}/*; then
    echo "cleaning cached file in ${CKPTS_PATH} failed."
    exit_code=$(($exit_code + 1))
fi

exit $exit_code
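Note that num_files is defined in ci_scripts/common/basic_func.sh, which this diff does not touch. As a rough illustration only (the glob-based helper below is an assumption, not the CI code), the checkpoint-count check amounts to the following Python sketch: after loading a step-20 checkpoint and training to total_steps 40 with checkpoint_every 20, the run is expected to leave 21 *.pt files under $GITHUB_WORKSPACE/llm_ckpts/40.

    import glob
    import os

    # Hypothetical stand-in for the num_files helper from basic_func.sh:
    # count the checkpoint files written for step 40 and compare with the expected 21.
    ckpt_dir = os.path.join(os.environ["GITHUB_WORKSPACE"], "llm_ckpts", "40")
    num = len(glob.glob(os.path.join(ckpt_dir, "*.pt")))
    if num != 21:
        print(f"expect: 21 files, actual: {num} files.")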
@@ -28,10 +28,4 @@ if [[ ${num} -ne ${expected_num} ]]; then
    exit_code=$(($exit_code + 1))
fi

# clean the test files.
if ! rm -rf ${CKPTS_PATH}/*; then
    echo "cleaning cached file in ${CKPTS_PATH} failed."
    exit_code=$(($exit_code + 1))
fi

exit $exit_code
@@ -0,0 +1,45 @@
{
    "7B_basic_train": {
        "SEQ_LEN": 1024,
        "HIDDEN_SIZE": 2048,
        "NUM_ATTENTION_HEAD": 16,
        "NUM_LAYER": 16,
        "TRAIN_FOLDER": "local:../lm_data/alpaca_data/train/en",
        "ckpt": {
            "checkpoint_every": 20
        },
        "data": {
            "total_steps": 20
        }
    },
    "7B_load_new_ckpt": {
        "SEQ_LEN": 1024,
        "HIDDEN_SIZE": 2048,
        "NUM_ATTENTION_HEAD": 16,
        "NUM_LAYER": 16,
        "TRAIN_FOLDER": "local:../lm_data/alpaca_data/train/en",
        "LOAD_CKPT_FOLDER": "local:llm_ckpts/20",
        "ckpt": {
            "load_ckpt_folder": "local:llm_ckpts/20",
            "checkpoint_every": 20
        },
        "data": {
            "total_steps": 40
        }
    },
    "7B_load_preset_ckpt": {
        "SEQ_LEN": 1024,
        "HIDDEN_SIZE": 2048,
        "NUM_ATTENTION_HEAD": 16,
        "NUM_LAYER": 16,
        "TRAIN_FOLDER": "local:../lm_data/alpaca_data/train/en",
        "LOAD_CKPT_FOLDER": "local:../lm_data/alpaca_data/llm_ckpts/20",
        "ckpt": {
            "load_ckpt_folder": "local:../lm_data/alpaca_data/llm_ckpts/20",
            "checkpoint_every": 20
        },
        "data": {
            "total_steps": 40
        }
    }
}
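To make the override mechanism concrete: generate_config.py merges each top-level case in this JSON over ci_7B_sft.py and writes the result as ./ci_scripts/train/<case>.py. For 7B_load_new_ckpt the generated file would therefore contain a fragment roughly like the sketch below (the load_optimizer field is visible in the ci_7B_sft.py hunk above; other base-config fields are omitted here, exact ordering and layout follow merge_dicts and format_dict_to_py_string):

    SEQ_LEN = 1024
    HIDDEN_SIZE = 2048
    NUM_ATTENTION_HEAD = 16
    NUM_LAYER = 16
    TRAIN_FOLDER = 'local:../lm_data/alpaca_data/train/en'
    ckpt = dict(
        load_optimizer = True,
        load_ckpt_folder = 'local:llm_ckpts/20',
        checkpoint_every = 20,
    )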