diff --git a/.github/workflows/demo_in_readme.yaml b/.github/workflows/demo_in_readme.yaml
index 7a330ed..a3d4cd9 100644
--- a/.github/workflows/demo_in_readme.yaml
+++ b/.github/workflows/demo_in_readme.yaml
@@ -9,11 +9,11 @@ on:
       - "**.md"
 env:
   WORKSPACE_PREFIX: $(echo $GITHUB_WORKSPACE |cut -d '/' -f 1-4)
-  SLURM_PARTITION: llm
+  SLURM_PARTITION: llm_s

 jobs:
   check-requirements:
-    runs-on: [lmtest]
+    runs-on: [t_cluster]
     steps:
     - name: mask env
       run: |
@@ -37,7 +37,7 @@ jobs:
   dataset-preparation:
     if: ${{ always() }}
     needs: check-requirements
-    runs-on: [lmtest]
+    runs-on: [t_cluster]
     steps:
     - name: mask env
       run: |
@@ -57,7 +57,7 @@ jobs:
   train:
     if: ${{ always() }}
     needs: check-requirements
-    runs-on: [lmtest]
+    runs-on: [t_cluster]
     timeout-minutes: 30
     steps:
     - name: mask env
@@ -83,18 +83,19 @@ jobs:
         source activate internlm-env-test
         export PYTHONPATH=$PWD:$PYTHONPATH
         sh ./ci_scripts/train/load_ckpt.sh 7B_load_new_ckpt ${GITHUB_RUN_ID}-${GITHUB_JOB}
-        rm -rf $GITHUB_WORKSPACE/llm_ckpts
+        rsync -av --remove-source-files $GITHUB_WORKSPACE/llm_ckpts ${{env.WORKSPACE_PREFIX}}/ci_clean_bak

     - name: torchrun-train
       run: |
         source activate internlm-env-test
         sh ./ci_scripts/train/torchrun.sh ${GITHUB_RUN_ID}-${GITHUB_JOB}
-        rm -rf $GITHUB_WORKSPACE/llm_ckpts
+        rsync -av --remove-source-files $GITHUB_WORKSPACE/llm_ckpts ${{env.WORKSPACE_PREFIX}}/ci_clean_bak

   convert-model-then-load:
     if: ${{ always() }}
     needs: check-requirements
-    runs-on: [lmtest]
+    runs-on: [t_cluster]
+    timeout-minutes: 15
     steps:
     - name: mask env
       run: |
@@ -107,13 +108,14 @@ jobs:
         export PYTHONPATH=$PWD:$PYTHONPATH
         sh ./ci_scripts/model/convert_to_hf.sh
         cd ./hf_ckpt
-        srun -p ${SLURM_PARTITION} --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} --gpus-per-task=2 python ../ci_scripts/model/loaded_as_transformer.py
+        srun -p ${SLURM_PARTITION} --quotatype=spot --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} --gpus-per-task=2 python ../ci_scripts/model/loaded_as_transformer.py
         cd ..
-        rm -rf $GITHUB_WORKSPACE/hf_ckpt
+        rsync -av --remove-source-files $GITHUB_WORKSPACE/hf_ckpt ${{env.WORKSPACE_PREFIX}}/ci_clean_bak
   load-chat-model-in-hf:
     if: ${{ always() }}
     needs: check-requirements
-    runs-on: [lmtest]
+    runs-on: [t_cluster]
+    timeout-minutes: 15
     steps:
     - name: mask env
       run: |
@@ -123,4 +125,4 @@ jobs:
     - name: chat-model-in-hf
       run: |
         source activate internlm-env-test
-        srun -p ${SLURM_PARTITION} --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} --gpus-per-task=2 python ./ci_scripts/model/demo_load_7B_chat_model.py
+        srun -p ${SLURM_PARTITION} --quotatype=spot --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} --gpus-per-task=2 python ./ci_scripts/model/demo_load_7B_chat_model.py
diff --git a/ci_scripts/common/variables.sh b/ci_scripts/common/variables.sh
index cc1b0e0..077fee4 100644
--- a/ci_scripts/common/variables.sh
+++ b/ci_scripts/common/variables.sh
@@ -1,3 +1,4 @@
 #!/bin/bash

 readonly DATA_VOLUME=$(echo $GITHUB_WORKSPACE | cut -d '/' -f 1-4)/data
+readonly CLEAN_PATH=$(echo $GITHUB_WORKSPACE | cut -d '/' -f 1-4)/ci_clean_bak
diff --git a/ci_scripts/data/tokenizer_alpaca.sh b/ci_scripts/data/tokenizer_alpaca.sh
index 456921c..db43d80 100644
--- a/ci_scripts/data/tokenizer_alpaca.sh
+++ b/ci_scripts/data/tokenizer_alpaca.sh
@@ -3,6 +3,7 @@ set -x

 source ./ci_scripts/common/variables.sh
 [[ -n ${DATA_VOLUME} ]] || { echo "should set DATA_VOLUME first before ci, exit."; exit 1; }
+[[ -n ${CLEAN_PATH} ]] || { echo "should set CLEAN_PATH first before ci, exit."; exit 1; }

 readonly SRC_DATASET_META=${DATA_VOLUME}/lm_data/alpaca_data/alpaca_data.json
 readonly RESULTS=${DATA_VOLUME}/lm_data/alpaca_data/result
@@ -19,7 +20,7 @@ source ./ci_scripts/common/basic_func.sh
 echo "start to test alpaca_tokenizer.py."

 if [[ -d ${RESULTS} ]]; then
-    if ! rm -rf ${RESULTS}/*; then
+    if ! rsync -av --remove-source-files ${RESULTS} ${CLEAN_PATH}; then
         echo "cleaning test data in ${RESULTS} failed, exit."
         exit 1
     fi
@@ -41,8 +42,8 @@ for file in ${file_list[@]}; do
     fi
 done

-# clean the test files.
-if ! rm -rf ${RESULTS}/*; then
+# move the test files.
+if ! rsync -av --remove-source-files ${RESULTS} ${CLEAN_PATH}; then
     echo "cleaning test data in ${RESULTS} failed."
     exit_code=$(($exit_code + 1))
 fi
diff --git a/ci_scripts/data/tokenizer_chinese.sh b/ci_scripts/data/tokenizer_chinese.sh
index 6b68df8..81a5198 100644
--- a/ci_scripts/data/tokenizer_chinese.sh
+++ b/ci_scripts/data/tokenizer_chinese.sh
@@ -2,7 +2,8 @@
 set -x

 source ./ci_scripts/common/variables.sh
-[[ -n ${DATA_VOLUME} ]] || { echo "should set DATA_VOLUME first before ci."; exit 1; }
+[[ -n ${DATA_VOLUME} ]] || { echo "should set DATA_VOLUME first before ci, exit."; exit 1; }
+[[ -n ${CLEAN_PATH} ]] || { echo "should set CLEAN_PATH first before ci, exit."; exit 1; }

 readonly DATA=${DATA_VOLUME}/lm_data/cn_data/raw_data.txt
 readonly RESULT=${DATA_VOLUME}/lm_data/cn_data/result.bin
@@ -16,13 +17,13 @@ echo "start to test tokenizer.py."

 num=$(num_files "${RESULTS}")
 if [[ ${num} -gt 0 ]]; then
-    if ! rm -rf ${RESULTS}; then
+    if ! rsync -av --remove-source-files ${RESULTS} ${CLEAN_PATH}; then
         echo "cleaning test data ${RESULTS} failed, exit."
         exit 1
     fi
 fi

-srun -p ${SLURM_PARTITION} --job-name=$1 --gpus-per-task=1 python tools/tokenizer.py --text_input_path ${DATA} --bin_output_path ${RESULT}
+srun -p ${SLURM_PARTITION} --quotatype=spot --job-name=$1 --gpus-per-task=1 python tools/tokenizer.py --text_input_path ${DATA} --bin_output_path ${RESULT}
 [[ $? -ne 0 ]] && { echo "test tokenizer.py failed."; exit_code=$(($exit_code + 1)); }

 file_list=($RESULT $RESULT_META)
@@ -33,8 +34,8 @@ for file in ${file_list[@]}; do
     fi
 done

-# clean the test files.
-if ! rm -rf ${RESULTS}/*; then
+# move the test files.
+if ! rsync -av --remove-source-files ${RESULTS} ${CLEAN_PATH}; then
     echo "cleaning cached file in ${RESULTS} failed."
     exit_code=$(($exit_code + 1))
 fi
diff --git a/ci_scripts/model/convert_to_hf.sh b/ci_scripts/model/convert_to_hf.sh
index 7d6536b..d1af389 100644
--- a/ci_scripts/model/convert_to_hf.sh
+++ b/ci_scripts/model/convert_to_hf.sh
@@ -4,6 +4,7 @@ set -x
 source ./ci_scripts/common/variables.sh
 [[ -n ${DATA_VOLUME} ]] || { echo "should set DATA_VOLUME first before ci, exit."; exit 1; }
 [[ -n ${GITHUB_WORKSPACE} ]] || { echo "should set GITHUB_WORKSPACE first before ci, exit."; exit 1; }
+[[ -n ${CLEAN_PATH} ]] || { echo "should set CLEAN_PATH first before ci, exit."; exit 1; }

 readonly CKPTS_INPUT="${DATA_VOLUME}/lm_data/alpaca_data/llm_ckpts/20"
 readonly CKPTS_OUTPUT="${GITHUB_WORKSPACE}/hf_ckpt"
@@ -18,7 +19,7 @@ source ./ci_scripts/common/basic_func.sh
 echo "start to test convert2hf.py."

 if [[ -d ${CKPTS_OUTPUT} ]]; then
-    if ! rm -rf ${CKPTS_OUTPUT}/*; then
+    if ! rsync -av --remove-source-files ${CKPTS_OUTPUT}/* ${CLEAN_PATH}; then
         echo "cleaning cached file in ${CKPTS_OUTPUT} failed, exit."
         exit 1
     fi
diff --git a/ci_scripts/train/load_ckpt.sh b/ci_scripts/train/load_ckpt.sh
index 413dba4..06c6c1e 100644
--- a/ci_scripts/train/load_ckpt.sh
+++ b/ci_scripts/train/load_ckpt.sh
@@ -1,7 +1,10 @@
 #!/bin/bash
 set -x

+source ./ci_scripts/common/variables.sh
 [[ -n ${GITHUB_WORKSPACE} ]] || { echo "should set GITHUB_WORKSPACE first before ci, exit."; exit 1; }
+[[ -n ${CLEAN_PATH} ]] || { echo "should set CLEAN_PATH first before ci, exit."; exit 1; }
+
 readonly CKPTS_PATH="$GITHUB_WORKSPACE/llm_ckpts"
 readonly CKPTS40_PATH="$GITHUB_WORKSPACE/llm_ckpts/40"
 readonly CKPTS40_OUTPUT="${CKPTS40_PATH}/*.pt"
@@ -19,7 +22,7 @@ if [[ ! -f ${file} ]]; then
     exit_code=$(($exit_code + 1))
 fi

-srun -p ${SLURM_PARTITION} --exclusive --job-name=$2 -n 8 --ntasks-per-node=8 --gpus-per-task=1 python train.py --config ${file}
+srun -p ${SLURM_PARTITION} --exclusive --quotatype=spot --job-name=$2 -n 8 --ntasks-per-node=8 --gpus-per-task=1 python train.py --config ${file}
 [[ $? -ne 0 ]] && { echo "test slurm training failed."; exit_code=$(($exit_code + 1)); }

@@ -29,10 +32,12 @@ if [[ ${num} -ne ${expected_num} ]]; then
     exit_code=$(($exit_code + 1))
 fi

-# clean the test files.
-if ! rm -rf ${CKPTS_PATH}/*; then
-    echo "cleaning cached file in ${CKPTS_PATH} failed."
-    exit_code=$(($exit_code + 1))
+# move the test files.
+if [[ -d ${CKPTS_PATH} ]]; then
+    if ! rsync -av --remove-source-files ${CKPTS_PATH} ${CLEAN_PATH}; then
+        echo "cleaning cached file in ${CKPTS_PATH} failed."
+        exit_code=$(($exit_code + 1))
+    fi
 fi

 exit $exit_code
diff --git a/ci_scripts/train/slurm_train.sh b/ci_scripts/train/slurm_train.sh
index 19d7c9b..3871fc4 100644
--- a/ci_scripts/train/slurm_train.sh
+++ b/ci_scripts/train/slurm_train.sh
@@ -1,7 +1,10 @@
 #!/bin/bash
 set -x

+source ./ci_scripts/common/variables.sh
 [[ -n ${GITHUB_WORKSPACE} ]] || { echo "should set GITHUB_WORKSPACE first before ci, exit."; exit 1; }
+[[ -n ${CLEAN_PATH} ]] || { echo "should set CLEAN_PATH first before ci, exit."; exit 1; }
+
 readonly CKPTS_PATH="$GITHUB_WORKSPACE/llm_ckpts"
 readonly CKPTS20_PATH="$GITHUB_WORKSPACE/llm_ckpts/20"
 readonly CKPTS20_OUTPUT="${CKPTS20_PATH}/*.pt"
@@ -13,13 +16,13 @@ source ./ci_scripts/common/basic_func.sh
 echo "start to test slurm training."

 if [[ -d ${CKPTS20_PATH} ]]; then
-    if ! rm -rf ${CKPTS20_PATH}/*; then
+    if ! rsync -av --remove-source-files ${CKPTS20_PATH} ${CLEAN_PATH}; then
         echo "cleaning cached file in ${CKPTS20_PATH} failed, exit."
         exit 1
     fi
 fi

-srun -p ${SLURM_PARTITION} --exclusive --job-name=$1 -n 8 --ntasks-per-node=8 --gpus-per-task=1 python train.py --config ./ci_scripts/train/ci_7B_sft.py
+srun -p ${SLURM_PARTITION} --exclusive --quotatype=spot --job-name=$1 -n 8 --ntasks-per-node=8 --gpus-per-task=1 python train.py --config ./ci_scripts/train/ci_7B_sft.py
 [[ $? -ne 0 ]] && { echo "test slurm training failed."; exit_code=$(($exit_code + 1)); }

 num=$(num_files "${CKPTS20_OUTPUT}")
diff --git a/ci_scripts/train/torchrun.sh b/ci_scripts/train/torchrun.sh
index 8870761..29ed54f 100644
--- a/ci_scripts/train/torchrun.sh
+++ b/ci_scripts/train/torchrun.sh
@@ -1,7 +1,10 @@
 #!/bin/bash
 set -x

+source ./ci_scripts/common/variables.sh
 [[ -n ${GITHUB_WORKSPACE} ]] || { echo "should set GITHUB_WORKSPACE first before ci, exit."; exit 1; }
+[[ -n ${CLEAN_PATH} ]] || { echo "should set CLEAN_PATH first before ci, exit."; exit 1; }
+
 readonly CKPTS_PATH="$GITHUB_WORKSPACE/llm_ckpts"
 readonly CKPTS20_PATH="$GITHUB_WORKSPACE/llm_ckpts/20"
 readonly CKPTS_OUTPUT="${CKPTS20_PATH}/*.pt"
@@ -13,13 +16,13 @@ source ./ci_scripts/common/basic_func.sh
 echo "start to test torch training."

 if [[ -d ${CKPTS20_PATH} ]]; then
-    if ! rm -rf ${CKPTS20_PATH}/*; then
+    if ! rsync -av --remove-source-files ${CKPTS20_PATH} ${CLEAN_PATH}; then
         echo "cleaning cached file in ${CKPTS20_PATH} failed, exit."
         exit 1
     fi
 fi

-srun -p ${SLURM_PARTITION} --exclusive --job-name=$1 -N 1 torchrun --nnodes=1 --nproc_per_node=8 --master_port=29501 train.py --config ./ci_scripts/train/ci_7B_sft.py --launcher torch
+srun -p ${SLURM_PARTITION} --exclusive --quotatype=spot --job-name=$1 -N 1 torchrun --nnodes=1 --nproc_per_node=8 --master_port=29501 train.py --config ./ci_scripts/train/ci_7B_sft.py --launcher torch
 [[ $? -ne 0 ]] && { echo "test torch training failed."; exit_code=$(($exit_code + 1)); }

 num=$(num_files "${CKPTS_OUTPUT}")
@@ -28,8 +31,8 @@ if [[ ${num} -ne ${expected_num} ]]; then
     exit_code=$(($exit_code + 1))
 fi

-# clean the test files.
-if ! rm -rf ${CKPTS_PATH}/*; then
+# move the test files.
+if ! rsync -av --remove-source-files ${CKPTS_PATH}/* ${CLEAN_PATH}; then
     echo "cleaning cached file in ${CKPTS_PATH} failed."
     exit_code=$(($exit_code + 1))
 fi
diff --git a/tools/transformers/modeling_internlm.py b/tools/transformers/modeling_internlm.py
index 5439ba7..1dd31cd 100644
--- a/tools/transformers/modeling_internlm.py
+++ b/tools/transformers/modeling_internlm.py
@@ -28,10 +28,19 @@ from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 from transformers.activations import ACT2FN
-from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast
+from transformers.modeling_outputs import (
+    BaseModelOutputWithPast,
+    CausalLMOutputWithPast,
+    SequenceClassifierOutputWithPast,
+)
 from transformers.modeling_utils import PreTrainedModel
 from transformers.generation.streamers import BaseStreamer
-from transformers.utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
+from transformers.utils import (
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    logging,
+    replace_return_docstrings,
+)

 from configuration_internlm import InternLMConfig

@@ -39,6 +48,7 @@ logger = logging.get_logger(__name__)

 _CONFIG_FOR_DOC = "InternLMConfig"

+
 # Copied from transformers.models.bart.modeling_bart._make_causal_mask
 def _make_causal_mask(
     input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0
@@ -437,6 +447,7 @@ class InternLMModel(InternLMPreTrainedModel):
     Args:
         config: InternLMConfig
     """
+
     _auto_class = "AutoModel"

     def __init__(self, config: InternLMConfig):
@@ -765,7 +776,7 @@ class InternLMForCausalLM(InternLMPreTrainedModel):
         for layer_past in past_key_values:
             reordered_past += (tuple(past_state.index_select(0, beam_idx) for past_state in layer_past),)
         return reordered_past
-    
+
     def build_inputs(self, tokenizer, query: str, history: List[Tuple[str, str]] = []):
         prompt = ""
         for record in history:
@@ -774,43 +785,49 @@ class InternLMForCausalLM(InternLMPreTrainedModel):
             prompt += "<s>"
         prompt += f"""<|User|>:{query}<eoh>\n<|Bot|>:"""
         return tokenizer([prompt], return_tensors="pt")
-    
+
     @torch.no_grad()
-    def chat(self,
-             tokenizer,
-             query: str,
-             history: List[Tuple[str, str]] = [],
-             streamer: Optional[BaseStreamer] = None,
-             max_new_tokens: int = 1024,
-             do_sample: bool = True,
-             temperature: float = 0.8,
-             top_p: float = 0.8,
-             **kwargs):
+    def chat(
+        self,
+        tokenizer,
+        query: str,
+        history: List[Tuple[str, str]] = [],
+        streamer: Optional[BaseStreamer] = None,
+        max_new_tokens: int = 1024,
+        do_sample: bool = True,
+        temperature: float = 0.8,
+        top_p: float = 0.8,
+        **kwargs,
+    ):
         inputs = self.build_inputs(tokenizer, query, history)
         inputs = {k: v.to(self.device) for k, v in inputs.items() if torch.is_tensor(v)}
-        outputs = self.generate(**inputs,
-                                streamer=streamer,
-                                max_new_tokens=max_new_tokens,
-                                do_sample=do_sample,
-                                temperature=temperature,
-                                top_p=top_p,
-                                **kwargs)
-        outputs = outputs[0].cpu().tolist()[len(inputs["input_ids"][0]):]
+        outputs = self.generate(
+            **inputs,
+            streamer=streamer,
+            max_new_tokens=max_new_tokens,
+            do_sample=do_sample,
+            temperature=temperature,
+            top_p=top_p,
+            **kwargs,
+        )
+        outputs = outputs[0].cpu().tolist()[len(inputs["input_ids"][0]) :]
         response = tokenizer.decode(outputs, skip_special_tokens=True)
         response = response.split("<eoa>")[0]
         history = history + [(query, response)]
         return response, history
-    
+
     @torch.no_grad()
-    def stream_chat(self,
-                    tokenizer,
-                    query: str,
-                    history: List[Tuple[str, str]] = [],
-                    max_new_tokens: int = 1024,
-                    do_sample: bool = True,
-                    temperature: float = 0.8,
-                    top_p: float = 0.8,
-                    **kwargs):
+    def stream_chat(
+        self,
+        tokenizer,
+        query: str,
+        history: List[Tuple[str, str]] = [],
+        max_new_tokens: int = 1024,
+        do_sample: bool = True,
+        temperature: float = 0.8,
+        top_p: float = 0.8,
+        **kwargs,
+    ):
         """
         Return a generator in format: (response, history)
         Eg.
@@ -856,12 +873,12 @@ class InternLMForCausalLM(InternLMPreTrainedModel):
                 tokenizer=tokenizer,
                 query=query,
                 streamer=ChatStreamer(tokenizer=tokenizer),
-                history=history, 
+                history=history,
                 max_new_tokens=max_new_tokens,
                 do_sample=do_sample,
                 temperature=temperature,
                 top_p=top_p,
-                **kwargs
+                **kwargs,
             )

         def consumer():
@@ -869,7 +886,7 @@ class InternLMForCausalLM(InternLMPreTrainedModel):
             producer.start()
             while True:
                 res = response_queue.get()
-                if res is None:
+                if res is not None:
                     return
                 yield res