From fd2c8d8156d858545743b5c5c96c7a5f2d378c92 Mon Sep 17 00:00:00 2001
From: Jiarui Fang
Date: Tue, 8 Nov 2022 10:39:13 +0800
Subject: [PATCH] [example] add opt model in language (#1809)

---
 examples/language/opt/README.md          |  49 ++
 examples/language/opt/benchmark.sh       |  21 +
 examples/language/opt/colossalai_zero.py |   6 +
 examples/language/opt/log                |  10 +
 examples/language/opt/requirements.txt   |   5 +
 examples/language/opt/run_clm.py         | 593 +++++++++++++++++++++++
 examples/language/opt/run_clm.sh         |  22 +
 examples/language/opt/utils.py           |  28 ++
 8 files changed, 734 insertions(+)
 create mode 100644 examples/language/opt/README.md
 create mode 100644 examples/language/opt/benchmark.sh
 create mode 100644 examples/language/opt/colossalai_zero.py
 create mode 100644 examples/language/opt/log
 create mode 100644 examples/language/opt/requirements.txt
 create mode 100755 examples/language/opt/run_clm.py
 create mode 100644 examples/language/opt/run_clm.sh
 create mode 100644 examples/language/opt/utils.py

diff --git a/examples/language/opt/README.md b/examples/language/opt/README.md
new file mode 100644
index 000000000..a2a7f8c6a
--- /dev/null
+++ b/examples/language/opt/README.md
@@ -0,0 +1,49 @@
+
+
+## OPT
+Meta recently released [Open Pretrained Transformer (OPT)](https://github.com/facebookresearch/metaseq), a 175-billion-parameter AI language model that enables AI programmers to build various downstream tasks and applications.
+
+The following example of [Colossal-AI](https://github.com/hpcaitech/ColossalAI) demonstrates how to fine-tune a causal language model at low cost.
+
+We use the pre-trained weights of the OPT model provided by the Hugging Face Hub on the raw WikiText-2 dataset (no tokens were replaced before
+the tokenization). This training script is adapted from the [HuggingFace Language Modelling examples](https://github.com/huggingface/transformers/tree/main/examples/pytorch/language-modeling).
+
+## Quick Start
+You can launch training with the following bash script:
+
+```bash
+bash ./run_clm.sh
+```
+
+- batch-size-per-gpu: number of samples fed to each GPU, default is 16
+- mem-cap: limit GPU memory usage to a given value in GB, default is 0 (no limit)
+- model: the size of the OPT model; the default in `run_clm.sh` is `125m`. Acceptable values include `125m`, `350m`, `1.3b`, `2.7b`, `6.7b`, `13b`, `30b`, `66b`. For `175b`, you can request
+the pretrained weights from the [OPT weight downloading page](https://github.com/facebookresearch/metaseq/tree/main/projects/OPT).
+- gpu-num: the number of GPUs to use, default is 1.
+
+A complete example command that passes all four arguments is shown below.
+
+## Remarkable Performance
+On a single GPU, Colossal-AI’s automatic strategy provides remarkable performance gains over the ZeRO Offloading strategy from Microsoft DeepSpeed.
+Users can experience up to a 40% speedup at a variety of model scales. However, when using a traditional deep learning training framework like PyTorch, a single GPU can no longer support the training of models at such a scale.
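+
+For example, the following commands (illustrative values; the four positional arguments are read by `run_clm.sh` in the order batch-size-per-gpu, mem-cap, model, gpu-num, the same order used by `benchmark.sh`) launch typical runs:
+
+```bash
+# 8-GPU run of the 6.7b model with 16 samples per GPU and no memory cap
+bash ./run_clm.sh 16 0 6.7b 8
+
+# single-GPU run with usable GPU memory capped at 40 GB
+bash ./run_clm.sh 16 40 6.7b 1
+```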
+ +Adopting the distributed training strategy with 8 GPUs is as simple as adding a `-nprocs 8` to the training command of Colossal-AI! + +More details about behind the scenes can be found on the corresponding [blog](https://medium.com/@yangyou_berkeley/colossal-ai-seamlessly-accelerates-large-models-at-low-costs-with-hugging-face-4d1a887e500d), +and a detailed tutorial will be added in [Documentation](https://www.colossalai.org/docs/get_started/installation) very soon. diff --git a/examples/language/opt/benchmark.sh b/examples/language/opt/benchmark.sh new file mode 100644 index 000000000..f02f7629a --- /dev/null +++ b/examples/language/opt/benchmark.sh @@ -0,0 +1,21 @@ +export BS=16 +export MEMCAP=0 +export MODEL="6.7b" +export GPUNUM=1 + +for MODEL in "6.7b" "13b" "1.3b" +do +for GPUNUM in 8 1 +do +for BS in 16 24 32 8 +do +for MEMCAP in 0 40 +do +pkill -9 torchrun +pkill -9 python + +bash ./run_clm.sh $BS $MEMCAP $MODEL $GPUNUM +done +done +done +done diff --git a/examples/language/opt/colossalai_zero.py b/examples/language/opt/colossalai_zero.py new file mode 100644 index 000000000..833745f3e --- /dev/null +++ b/examples/language/opt/colossalai_zero.py @@ -0,0 +1,6 @@ +from colossalai.zero.shard_utils import TensorShardStrategy + +zero = dict(model_config=dict(shard_strategy=TensorShardStrategy(), + tensor_placement_policy="auto", + reuse_fp16_shard=True), + optimizer_config=dict(gpu_margin_mem_ratio=0.8, initial_scale=16384)) diff --git a/examples/language/opt/log b/examples/language/opt/log new file mode 100644 index 000000000..4284d0038 --- /dev/null +++ b/examples/language/opt/log @@ -0,0 +1,10 @@ + PID TTY STAT TIME COMMAND +2767195 pts/19 Ss 0:01 -zsh LC_TERMINAL_VERSION=3.4.15 LANG=en_US.UTF-8 LC_TERMINAL=iTerm2 USER=lcfjr LOGNAME=lcfjr HOME=/home/lcfjr PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin SHELL=/usr/bin/zsh TERM=xterm-256color XDG_SESSION_ID=6572 XDG_RUNTIME_DIR=/run/user/1008 DBUS_SESSION_BUS_ADDRESS=unix:path=/run/user/1008/bus XDG_SESSION_TYPE=tty XDG_SESSION_CLASS=user MOTD_SHOWN=pam LC_NUMERIC=en_US.UTF-8 LC_TIME=en_US.UTF-8 LC_MONETARY=en_US.UTF-8 LC_PAPER=en_US.UTF-8 LC_NAME=en_US.UTF-8 LC_ADDRESS=en_US.UTF-8 LC_TELEPHONE=en_US.UTF-8 LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=en_US.UTF-8 SSH_CLIENT=124.14.224.115 17177 10086 SSH_CONNECTION=124.14.224.115 17177 59.108.228.2 10086 SSH_TTY=/dev/pts/19 +2810171 pts/19 T 0:00 \_ bash run_clm.sh LC_TERMINAL_VERSION=3.4.15 LANG=en_US.UTF-8 LC_TERMINAL=iTerm2 USER=lcfjr LOGNAME=lcfjr HOME=/home/lcfjr PATH=/home/lcfjr/miniconda3/envs/cs/bin:/home/lcfjr/miniconda3/condabin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin SHELL=/usr/bin/zsh TERM=xterm-256color XDG_SESSION_ID=6572 XDG_RUNTIME_DIR=/run/user/1008 DBUS_SESSION_BUS_ADDRESS=unix:path=/run/user/1008/bus XDG_SESSION_TYPE=tty XDG_SESSION_CLASS=user MOTD_SHOWN=pam LC_NUMERIC=en_US.UTF-8 LC_TIME=en_US.UTF-8 LC_MONETARY=en_US.UTF-8 LC_PAPER=en_US.UTF-8 LC_NAME=en_US.UTF-8 LC_ADDRESS=en_US.UTF-8 LC_TELEPHONE=en_US.UTF-8 LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=en_US.UTF-8 SSH_CLIENT=124.14.224.115 17177 10086 SSH_CONNECTION=124.14.224.115 17177 59.108.228.2 10086 SSH_TTY=/dev/pts/19 SHLVL=1 PWD=/home/lcfjr/codes/ColossalAI/examples/language/opt OLDPWD=/home/lcfjr/codes/Titans ZSH=/home/lcfjr/.oh-my-zsh PAGER=less LESS=-R LSCOLORS=Gxfxcxdxbxegedabagacad 
LS_COLORS=rs=0:di=01;34:ln=01;36:mh=00:pi=40;33:so=01;35:do=01;35:bd=40;33;01:cd=40;33;01:or=40;31;01:mi=00:su=37;41:sg=30;43:ca=30;41:tw=30;42:ow=34;42:st=37;44:ex=01;32:*.tar=01;31:*.tgz=01;31:*.arc=01;31:*.arj=01;31:*.taz=01;31:*.lha=01;31:*.lz4=01;31:*.lzh=01;31:*.lzma=01;31:*.tlz=01;31:*.txz=01;31:*.tzo=01;31:*.t7z=01;31:*.zip=01;31:*.z=01;31:*.dz=01;31:*.gz=01;31:*.lrz=01;31:*.lz=01;31:*.lzo=01;31:*.xz=01;31:*.zst=01;31:*.tzst=01;31:*.bz2=01;31:*.bz=01;31:*.tbz=01;31:*.tbz2=01;31:*.tz=01;31:*.deb=01;31:*.rpm=01;31:*.jar=01;31:*.war=01;31:*.ear=01;31:*.sar=01;31:*.rar=01;31:*.alz=01;31:*.ace=01;31:*.zoo=01;31:*.cpio=01;31:*.7z=01;31:*.rz=01;31:*.cab=01;31:*.wim=01;31:*.swm=01;31:*.dwm=01;31:*.esd=01;31:*.jpg=01;35:*.jpeg=01;35:*.mjpg=01;35:*.mjpeg=01;35:*.gif=01;35:*.bmp=01;35:*.pbm=01;35:*.pgm=01;35:*.ppm=01;35:*.tga=01;35:*.xbm=01;35:*.xpm=01;35:*.tif=01;35:*.tiff=01;35:*.png=01;35:*.svg=01;35:*.svgz=01;35:*.mng=01;35:*.pcx=01;35:*.mov=01;35:*.mpg=01;35:*.mpeg=01;35:*.m2v=01;35:*.mkv=01;35:*.webm=01;35:*.ogm=01;35:*.mp4=01;35:*.m4v=01;35:*.mp4v=01;35:*.vob=01;35:*.qt=01;35:*.nuv=01;35:*.wmv=01;35:*.asf=01;35:*.rm=01;35:*.rmvb=01;35:*.flc=01;35:*.avi=01;35:*.fli=01;35:*.flv=01;35:*.gl=01;35:*.dl=01;35:*.xcf=01;35:*.xwd=01;35:*.yuv=01;35:*.cgm=01;35:*.emf=01;35:*.ogv=01;35:*.ogx=01;35:*.aac=00;36:*.au=00;36:*.flac=00;36:*.m4a=00;36:*.mid=00;36:*.midi=00;36:*.mka=00;36:*.mp3=00;36:*.mpc=00;36:*.ogg=00;36:*.ra=00;36:*.wav=00;36:*.oga=00;36:*.opus=00;36:*.spx=00;36:*.xspf=00;36: CONDA_EXE=/home/lcfjr/miniconda3/bin/conda _CE_M= _CE_CONDA= CONDA_PYTHON_EXE=/home/lcfjr/miniconda3/bin/python CONDA_SHLVL=3 CONDA_PREFIX=/home/lcfjr/miniconda3/envs/cs CONDA_DEFAULT_ENV=cs CONDA_PROMPT_MODIFIER=(cs) MODULES_CMD=/usr/lib/x86_64-linux-gnu/modulecmd.tcl ENV=/usr/share/modules/init/profile.sh MODULEPATH_modshare=/etc/environment-modules/modules:1:/usr/share/modules/$MODULE_VERSION/modulefiles:1:/usr/share/modules/modulefiles:1:/usr/share/modules/versions:1 BASH_ENV=/usr/share/modules/init/bash MODULESHOME=/usr/share/modules LOADEDMODULES=proxy/0.0.1-gcc-9.3.0 MODULEPATH=/opt/lcsoftware/spack/share/spack/modules/linux-ubuntu20.04-zen2 FPATH=/usr/share/modules/init/zsh-functions:/home/lcfjr/.oh-my-zsh/plugins/git:/home/lcfjr/.oh-my-zsh/functions:/home/lcfjr/.oh-my-zsh/completions:/home/lcfjr/.oh-my-zsh/cache/completions:/usr/local/share/zsh/site-functions:/usr/share/zsh/vendor-functions:/usr/share/zsh/vendor-completions:/usr/share/zsh/functions/Calendar:/usr/share/zsh/functions/Chpwd:/usr/share/zsh/functions/Completion:/usr/share/zsh/functions/Completion/AIX:/usr/share/zsh/functions/Completion/BSD:/usr/share/zsh/functions/Completion/Base:/usr/share/zsh/functions/Completion/Cygwin:/usr/share/zsh/functions/Completion/Darwin:/usr/share/zsh/functions/Completion/Debian:/usr/share/zsh/functions/Completion/Linux:/usr/share/zsh/functions/Completion/Mandriva:/usr/share/zsh/functions/Completion/Redhat:/usr/share/zsh/functions/Completion/Solaris:/usr/share/zsh/functions/Completion/Unix:/usr/share/zsh/functions/Completion/X:/usr/share/zsh/functions/Completion/Zsh:/usr/share/zsh/functions/Completion/openSUSE:/usr/share/zsh/functions/Exceptions:/usr/share/zsh/functions/MIME:/usr/share/zsh/functions/Math:/usr/share/zsh/functions/Misc:/usr/share/zsh/functions/Newuser:/usr/share/zsh/functions/Prompts:/usr/share/zsh/functions/TCP:/usr/share/zsh/functions/VCS_Info:/usr/share/zsh/functions/VCS_Info/Backends:/usr/share/zsh/functions/Zftp:/usr/share/zsh/functions/Zle MANPATH=: 
CUDA_HOME=/opt/lcsoftware/spack/opt/spack/linux-ubuntu20.04-zen2/gcc-9.3.0/cuda-11.3.1-e4ejcraos3skqdcti64yorl6rrk5et47/ GITTOKEN=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 DATA=/data/scratch/cifar-10 PYTHONPATH=/home/lcfjr/codes/ColossalAI: CONDA_PREFIX_1=/home/lcfjr/miniconda3 RSYNC_PROXY=172.17.0.1:7890 all_proxy=socks5://172.17.0.1:7890 _LMFILES_=/opt/lcsoftware/spack/share/spack/modules/linux-ubuntu20.04-zen2/proxy/0.0.1-gcc-9.3.0 https_proxy_modshare=http:1:7890:1://172.17.0.1:1 http_proxy=http://172.17.0.1:7890 RSYNC_PROXY_modshare=7890:1:172.17.0.1:1 http_proxy_modshare=http:1:7890:1://172.17.0.1:1 https_proxy=http://172.17.0.1:7890 all_proxy_modshare=socks5:1:7890:1://172.17.0.1:1 LOADEDMODULES_modshare=proxy/0.0.1-gcc-9.3.0:1 _LMFILES__modshare=/opt/lcsoftware/spack/share/spack/modules/linux-ubuntu20.04-zen2/proxy/0.0.1-gcc-9.3.0:1 CUDA_VISIBLE_DEVICES=6 CONDA_PREFIX_2=/home/lcfjr/miniconda3/envs/dev _=/usr/bin/bash +2810176 pts/19 Tl 0:01 | \_ /home/lcfjr/miniconda3/envs/cs/bin/python /home/lcfjr/miniconda3/envs/cs/bin/torchrun --nproc_per_node 1 --master_port 19198 run_clm.py --dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 --model_name_or_path facebook/opt-1.3b --output_dir /home/lcfjr/codes/ColossalAI/examples/language/opt --mem_cap 0 --per_device_train_batch_size 16 SHELL=/usr/bin/zsh LSCOLORS=Gxfxcxdxbxegedabagacad LESS=-R GPUNUM=1 CONDA_EXE=/home/lcfjr/miniconda3/bin/conda _CE_M= FPATH=/usr/share/modules/init/zsh-functions:/home/lcfjr/.oh-my-zsh/plugins/git:/home/lcfjr/.oh-my-zsh/functions:/home/lcfjr/.oh-my-zsh/completions:/home/lcfjr/.oh-my-zsh/cache/completions:/usr/local/share/zsh/site-functions:/usr/share/zsh/vendor-functions:/usr/share/zsh/vendor-completions:/usr/share/zsh/functions/Calendar:/usr/share/zsh/functions/Chpwd:/usr/share/zsh/functions/Completion:/usr/share/zsh/functions/Completion/AIX:/usr/share/zsh/functions/Completion/BSD:/usr/share/zsh/functions/Completion/Base:/usr/share/zsh/functions/Completion/Cygwin:/usr/share/zsh/functions/Completion/Darwin:/usr/share/zsh/functions/Completion/Debian:/usr/share/zsh/functions/Completion/Linux:/usr/share/zsh/functions/Completion/Mandriva:/usr/share/zsh/functions/Completion/Redhat:/usr/share/zsh/functions/Completion/Solaris:/usr/share/zsh/functions/Completion/Unix:/usr/share/zsh/functions/Completion/X:/usr/share/zsh/functions/Completion/Zsh:/usr/share/zsh/functions/Completion/openSUSE:/usr/share/zsh/functions/Exceptions:/usr/share/zsh/functions/MIME:/usr/share/zsh/functions/Math:/usr/share/zsh/functions/Misc:/usr/share/zsh/functions/Newuser:/usr/share/zsh/functions/Prompts:/usr/share/zsh/functions/TCP:/usr/share/zsh/functions/VCS_Info:/usr/share/zsh/functions/VCS_Info/Backends:/usr/share/zsh/functions/Zftp:/usr/share/zsh/functions/Zle LC_ADDRESS=en_US.UTF-8 LC_NAME=en_US.UTF-8 GITTOKEN=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 _LMFILES__modshare=/opt/lcsoftware/spack/share/spack/modules/linux-ubuntu20.04-zen2/proxy/0.0.1-gcc-9.3.0:1 all_proxy_modshare=socks5:1:7890:1://172.17.0.1:1 LC_MONETARY=en_US.UTF-8 ENV=/usr/share/modules/init/profile.sh PWD=/home/lcfjr/codes/ColossalAI/examples/language/opt LOGNAME=lcfjr XDG_SESSION_TYPE=tty CONDA_PREFIX=/home/lcfjr/miniconda3/envs/cs MODULESHOME=/usr/share/modules MANPATH=: BS=16 MOTD_SHOWN=pam RSYNC_PROXY_modshare=7890:1:172.17.0.1:1 HOME=/home/lcfjr LC_PAPER=en_US.UTF-8 LANG=en_US.UTF-8 
LS_COLORS=rs=0:di=01;34:ln=01;36:mh=00:pi=40;33:so=01;35:do=01;35:bd=40;33;01:cd=40;33;01:or=40;31;01:mi=00:su=37;41:sg=30;43:ca=30;41:tw=30;42:ow=34;42:st=37;44:ex=01;32:*.tar=01;31:*.tgz=01;31:*.arc=01;31:*.arj=01;31:*.taz=01;31:*.lha=01;31:*.lz4=01;31:*.lzh=01;31:*.lzma=01;31:*.tlz=01;31:*.txz=01;31:*.tzo=01;31:*.t7z=01;31:*.zip=01;31:*.z=01;31:*.dz=01;31:*.gz=01;31:*.lrz=01;31:*.lz=01;31:*.lzo=01;31:*.xz=01;31:*.zst=01;31:*.tzst=01;31:*.bz2=01;31:*.bz=01;31:*.tbz=01;31:*.tbz2=01;31:*.tz=01;31:*.deb=01;31:*.rpm=01;31:*.jar=01;31:*.war=01;31:*.ear=01;31:*.sar=01;31:*.rar=01;31:*.alz=01;31:*.ace=01;31:*.zoo=01;31:*.cpio=01;31:*.7z=01;31:*.rz=01;31:*.cab=01;31:*.wim=01;31:*.swm=01;31:*.dwm=01;31:*.esd=01;31:*.jpg=01;35:*.jpeg=01;35:*.mjpg=01;35:*.mjpeg=01;35:*.gif=01;35:*.bmp=01;35:*.pbm=01;35:*.pgm=01;35:*.ppm=01;35:*.tga=01;35:*.xbm=01;35:*.xpm=01;35:*.tif=01;35:*.tiff=01;35:*.png=01;35:*.svg=01;35:*.svgz=01;35:*.mng=01;35:*.pcx=01;35:*.mov=01;35:*.mpg=01;35:*.mpeg=01;35:*.m2v=01;35:*.mkv=01;35:*.webm=01;35:*.ogm=01;35:*.mp4=01;35:*.m4v=01;35:*.mp4v=01;35:*.vob=01;35:*.qt=01;35:*.nuv=01;35:*.wmv=01;35:*.asf=01;35:*.rm=01;35:*.rmvb=01;35:*.flc=01;35:*.avi=01;35:*.fli=01;35:*.flv=01;35:*.gl=01;35:*.dl=01;35:*.xcf=01;35:*.xwd=01;35:*.yuv=01;35:*.cgm=01;35:*.emf=01;35:*.ogv=01;35:*.ogx=01;35:*.aac=00;36:*.au=00;36:*.flac=00;36:*.m4a=00;36:*.mid=00;36:*.midi=00;36:*.mka=00;36:*.mp3=00;36:*.mpc=00;36:*.ogg=00;36:*.ra=00;36:*.wav=00;36:*.oga=00;36:*.opus=00;36:*.spx=00;36:*.xspf=00;36: MODEL=1.3b CONDA_PROMPT_MODIFIER=(cs) LC_TERMINAL=iTerm2 https_proxy=http://172.17.0.1:7890 SSH_CONNECTION=124.14.224.115 17177 59.108.228.2 10086 CUDA_VISIBLE_DEVICES=6 MODULEPATH_modshare=/etc/environment-modules/modules:1:/usr/share/modules/$MODULE_VERSION/modulefiles:1:/usr/share/modules/modulefiles:1:/usr/share/modules/versions:1 XDG_SESSION_CLASS=user LOADEDMODULES_modshare=proxy/0.0.1-gcc-9.3.0:1 PYTHONPATH=/home/lcfjr/codes/ColossalAI: LC_IDENTIFICATION=en_US.UTF-8 TERM=xterm-256color ZSH=/home/lcfjr/.oh-my-zsh _CE_CONDA= DATA=/data/scratch/cifar-10 USER=lcfjr CONDA_SHLVL=3 LOADEDMODULES=proxy/0.0.1-gcc-9.3.0 LC_TERMINAL_VERSION=3.4.15 RSYNC_PROXY=172.17.0.1:7890 SHLVL=1 BASH_ENV=/usr/share/modules/init/bash PAGER=less LC_TELEPHONE=en_US.UTF-8 LC_MEASUREMENT=en_US.UTF-8 XDG_SESSION_ID=6572 http_proxy=http://172.17.0.1:7890 CONDA_PYTHON_EXE=/home/lcfjr/miniconda3/bin/python MEMCAP=0 XDG_RUNTIME_DIR=/run/user/1008 SSH_CLIENT=124.14.224.115 17177 10086 CONDA_DEFAULT_ENV=cs LC_TIME=en_US.UTF-8 CUDA_HOME=/opt/lcsoftware/spack/opt/spack/linux-ubuntu20.04-zen2/gcc-9.3.0/cuda-11.3.1-e4ejcraos3skqdcti64yorl6rrk5et47/ all_proxy=socks5://172.17.0.1:7890 PATH=/home/lcfjr/miniconda3/envs/cs/bin:/home/lcfjr/miniconda3/condabin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin MODULEPATH=/opt/lcsoftware/spack/share/spack/modules/linux-ubuntu20.04-zen2 _LMFILES_=/opt/lcsoftware/spack/share/spack/modules/linux-ubuntu20.04-zen2/proxy/0.0.1-gcc-9.3.0 http_proxy_modshare=http:1:7890:1://172.17.0.1:1 DBUS_SESSION_BUS_ADDRESS=unix:path=/run/user/1008/bus SSH_TTY=/dev/pts/19 CONDA_PREFIX_1=/home/lcfjr/miniconda3 CONDA_PREFIX_2=/home/lcfjr/miniconda3/envs/dev LC_NUMERIC=en_US.UTF-8 https_proxy_modshare=http:1:7890:1://172.17.0.1:1 OLDPWD=/home/lcfjr/codes/Titans MODULES_CMD=/usr/lib/x86_64-linux-gnu/modulecmd.tcl BASH_FUNC_switchml%%=() { typeset swfound=1; if [ "${MODULES_USE_COMPAT_VERSION:-0}" = '1' ]; then typeset swname='main'; if [ -e /usr/lib/x86_64-linux-gnu/modulecmd.tcl 
]; then typeset swfound=0; unset MODULES_USE_COMPAT_VERSION; fi; else typeset swname='compatibility'; if [ -e /usr/lib/x86_64-linux-gnu/modulecmd-compat ]; then typeset swfound=0; MODULES_USE_COMPAT_VERSION=1; export MODULES_USE_COMPAT_VERSION; fi; fi; if [ $swfound -eq 0 ]; then echo "Switching to Modules $swname version"; source /usr/share/modules/init/bash; else echo "Cannot switch to Modules $swname version, command not found"; return 1; fi } BASH_FUNC_module%%=() { _module_raw "$@" 2>&1 } BASH_FUNC__module_raw%%=() { unset _mlshdbg; if [ "${MODULES_SILENT_SHELL_DEBUG:-0}" = '1' ]; then case "$-" in *v*x*) set +vx; _mlshdbg='vx' ;; *v*) set +v; _mlshdbg='v' ;; *x*) set +x; _mlshdbg='x' ;; *) _mlshdbg='' ;; esac; fi; unset _mlre _mlIFS; if [ -n "${IFS+x}" ]; then _mlIFS=$IFS; fi; IFS=' '; for _mlv in ${MODULES_RUN_QUARANTINE:-}; do if [ "${_mlv}" = "${_mlv##*[!A-Za-z0-9_]}" -a "${_mlv}" = "${_mlv#[0-9]}" ]; then if [ -n "`eval 'echo ${'$_mlv'+x}'`" ]; then _mlre="${_mlre:-}${_mlv}_modquar='`eval 'echo ${'$_mlv'}'`' "; fi; _mlrv="MODULES_RUNENV_${_mlv}"; _mlre="${_mlre:-}${_mlv}='`eval 'echo ${'$_mlrv':-}'`' "; fi; done; if [ -n "${_mlre:-}" ]; then eval `eval ${_mlre}/usr/bin/tclsh8.6 /usr/lib/x86_64-linux-gnu/modulecmd.tcl bash '"$@"'`; else eval `/usr/bin/tclsh8.6 /usr/lib/x86_64-linux-gnu/modulecmd.tcl bash "$@"`; fi; _mlstatus=$?; if [ -n "${_mlIFS+x}" ]; then IFS=$_mlIFS; else unset IFS; fi; unset _mlre _mlv _mlrv _mlIFS; if [ -n "${_mlshdbg:-}" ]; then set -$_mlshdbg; fi; unset _mlshdbg; return $_mlstatus } _=/home/lcfjr/miniconda3/envs/cs/bin/torchrun +2810184 pts/19 Z 24:41 | \_ [python] +2813011 pts/19 R+ 0:00 \_ ps ef LC_TERMINAL_VERSION=3.4.15 LANG=en_US.UTF-8 LC_TERMINAL=iTerm2 USER=lcfjr LOGNAME=lcfjr HOME=/home/lcfjr PATH=/home/lcfjr/miniconda3/envs/cs/bin:/home/lcfjr/miniconda3/condabin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin SHELL=/usr/bin/zsh TERM=xterm-256color XDG_SESSION_ID=6572 XDG_RUNTIME_DIR=/run/user/1008 DBUS_SESSION_BUS_ADDRESS=unix:path=/run/user/1008/bus XDG_SESSION_TYPE=tty XDG_SESSION_CLASS=user MOTD_SHOWN=pam LC_NUMERIC=en_US.UTF-8 LC_TIME=en_US.UTF-8 LC_MONETARY=en_US.UTF-8 LC_PAPER=en_US.UTF-8 LC_NAME=en_US.UTF-8 LC_ADDRESS=en_US.UTF-8 LC_TELEPHONE=en_US.UTF-8 LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=en_US.UTF-8 SSH_CLIENT=124.14.224.115 17177 10086 SSH_CONNECTION=124.14.224.115 17177 59.108.228.2 10086 SSH_TTY=/dev/pts/19 SHLVL=1 PWD=/home/lcfjr/codes/ColossalAI/examples/language/opt OLDPWD=/home/lcfjr/codes/Titans ZSH=/home/lcfjr/.oh-my-zsh PAGER=less LESS=-R LSCOLORS=Gxfxcxdxbxegedabagacad 
LS_COLORS=rs=0:di=01;34:ln=01;36:mh=00:pi=40;33:so=01;35:do=01;35:bd=40;33;01:cd=40;33;01:or=40;31;01:mi=00:su=37;41:sg=30;43:ca=30;41:tw=30;42:ow=34;42:st=37;44:ex=01;32:*.tar=01;31:*.tgz=01;31:*.arc=01;31:*.arj=01;31:*.taz=01;31:*.lha=01;31:*.lz4=01;31:*.lzh=01;31:*.lzma=01;31:*.tlz=01;31:*.txz=01;31:*.tzo=01;31:*.t7z=01;31:*.zip=01;31:*.z=01;31:*.dz=01;31:*.gz=01;31:*.lrz=01;31:*.lz=01;31:*.lzo=01;31:*.xz=01;31:*.zst=01;31:*.tzst=01;31:*.bz2=01;31:*.bz=01;31:*.tbz=01;31:*.tbz2=01;31:*.tz=01;31:*.deb=01;31:*.rpm=01;31:*.jar=01;31:*.war=01;31:*.ear=01;31:*.sar=01;31:*.rar=01;31:*.alz=01;31:*.ace=01;31:*.zoo=01;31:*.cpio=01;31:*.7z=01;31:*.rz=01;31:*.cab=01;31:*.wim=01;31:*.swm=01;31:*.dwm=01;31:*.esd=01;31:*.jpg=01;35:*.jpeg=01;35:*.mjpg=01;35:*.mjpeg=01;35:*.gif=01;35:*.bmp=01;35:*.pbm=01;35:*.pgm=01;35:*.ppm=01;35:*.tga=01;35:*.xbm=01;35:*.xpm=01;35:*.tif=01;35:*.tiff=01;35:*.png=01;35:*.svg=01;35:*.svgz=01;35:*.mng=01;35:*.pcx=01;35:*.mov=01;35:*.mpg=01;35:*.mpeg=01;35:*.m2v=01;35:*.mkv=01;35:*.webm=01;35:*.ogm=01;35:*.mp4=01;35:*.m4v=01;35:*.mp4v=01;35:*.vob=01;35:*.qt=01;35:*.nuv=01;35:*.wmv=01;35:*.asf=01;35:*.rm=01;35:*.rmvb=01;35:*.flc=01;35:*.avi=01;35:*.fli=01;35:*.flv=01;35:*.gl=01;35:*.dl=01;35:*.xcf=01;35:*.xwd=01;35:*.yuv=01;35:*.cgm=01;35:*.emf=01;35:*.ogv=01;35:*.ogx=01;35:*.aac=00;36:*.au=00;36:*.flac=00;36:*.m4a=00;36:*.mid=00;36:*.midi=00;36:*.mka=00;36:*.mp3=00;36:*.mpc=00;36:*.ogg=00;36:*.ra=00;36:*.wav=00;36:*.oga=00;36:*.opus=00;36:*.spx=00;36:*.xspf=00;36: CONDA_EXE=/home/lcfjr/miniconda3/bin/conda _CE_M= _CE_CONDA= CONDA_PYTHON_EXE=/home/lcfjr/miniconda3/bin/python CONDA_SHLVL=3 CONDA_PREFIX=/home/lcfjr/miniconda3/envs/cs CONDA_DEFAULT_ENV=cs CONDA_PROMPT_MODIFIER=(cs) MODULES_CMD=/usr/lib/x86_64-linux-gnu/modulecmd.tcl ENV=/usr/share/modules/init/profile.sh MODULEPATH_modshare=/etc/environment-modules/modules:1:/usr/share/modules/$MODULE_VERSION/modulefiles:1:/usr/share/modules/modulefiles:1:/usr/share/modules/versions:1 BASH_ENV=/usr/share/modules/init/bash MODULESHOME=/usr/share/modules LOADEDMODULES=proxy/0.0.1-gcc-9.3.0 MODULEPATH=/opt/lcsoftware/spack/share/spack/modules/linux-ubuntu20.04-zen2 FPATH=/usr/share/modules/init/zsh-functions:/home/lcfjr/.oh-my-zsh/plugins/git:/home/lcfjr/.oh-my-zsh/functions:/home/lcfjr/.oh-my-zsh/completions:/home/lcfjr/.oh-my-zsh/cache/completions:/usr/local/share/zsh/site-functions:/usr/share/zsh/vendor-functions:/usr/share/zsh/vendor-completions:/usr/share/zsh/functions/Calendar:/usr/share/zsh/functions/Chpwd:/usr/share/zsh/functions/Completion:/usr/share/zsh/functions/Completion/AIX:/usr/share/zsh/functions/Completion/BSD:/usr/share/zsh/functions/Completion/Base:/usr/share/zsh/functions/Completion/Cygwin:/usr/share/zsh/functions/Completion/Darwin:/usr/share/zsh/functions/Completion/Debian:/usr/share/zsh/functions/Completion/Linux:/usr/share/zsh/functions/Completion/Mandriva:/usr/share/zsh/functions/Completion/Redhat:/usr/share/zsh/functions/Completion/Solaris:/usr/share/zsh/functions/Completion/Unix:/usr/share/zsh/functions/Completion/X:/usr/share/zsh/functions/Completion/Zsh:/usr/share/zsh/functions/Completion/openSUSE:/usr/share/zsh/functions/Exceptions:/usr/share/zsh/functions/MIME:/usr/share/zsh/functions/Math:/usr/share/zsh/functions/Misc:/usr/share/zsh/functions/Newuser:/usr/share/zsh/functions/Prompts:/usr/share/zsh/functions/TCP:/usr/share/zsh/functions/VCS_Info:/usr/share/zsh/functions/VCS_Info/Backends:/usr/share/zsh/functions/Zftp:/usr/share/zsh/functions/Zle MANPATH=: 
CUDA_HOME=/opt/lcsoftware/spack/opt/spack/linux-ubuntu20.04-zen2/gcc-9.3.0/cuda-11.3.1-e4ejcraos3skqdcti64yorl6rrk5et47/ GITTOKEN=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 DATA=/data/scratch/cifar-10 PYTHONPATH=/home/lcfjr/codes/ColossalAI: CONDA_PREFIX_1=/home/lcfjr/miniconda3 RSYNC_PROXY=172.17.0.1:7890 all_proxy=socks5://172.17.0.1:7890 _LMFILES_=/opt/lcsoftware/spack/share/spack/modules/linux-ubuntu20.04-zen2/proxy/0.0.1-gcc-9.3.0 https_proxy_modshare=http:1:7890:1://172.17.0.1:1 http_proxy=http://172.17.0.1:7890 RSYNC_PROXY_modshare=7890:1:172.17.0.1:1 http_proxy_modshare=http:1:7890:1://172.17.0.1:1 https_proxy=http://172.17.0.1:7890 all_proxy_modshare=socks5:1:7890:1://172.17.0.1:1 LOADEDMODULES_modshare=proxy/0.0.1-gcc-9.3.0:1 _LMFILES__modshare=/opt/lcsoftware/spack/share/spack/modules/linux-ubuntu20.04-zen2/proxy/0.0.1-gcc-9.3.0:1 CUDA_VISIBLE_DEVICES=6 CONDA_PREFIX_2=/home/lcfjr/miniconda3/envs/dev _=/usr/bin/ps +2666493 pts/35 Ss+ 0:00 -zsh LC_TERMINAL_VERSION=3.4.15 LANG=en_US.UTF-8 LC_TERMINAL=iTerm2 USER=lcfjr LOGNAME=lcfjr HOME=/home/lcfjr PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin SHELL=/usr/bin/zsh TERM=xterm-256color XDG_SESSION_ID=6555 XDG_RUNTIME_DIR=/run/user/1008 DBUS_SESSION_BUS_ADDRESS=unix:path=/run/user/1008/bus XDG_SESSION_TYPE=tty XDG_SESSION_CLASS=user MOTD_SHOWN=pam LC_NUMERIC=en_US.UTF-8 LC_TIME=en_US.UTF-8 LC_MONETARY=en_US.UTF-8 LC_PAPER=en_US.UTF-8 LC_NAME=en_US.UTF-8 LC_ADDRESS=en_US.UTF-8 LC_TELEPHONE=en_US.UTF-8 LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=en_US.UTF-8 SSH_CLIENT=124.14.224.115 33038 10086 SSH_CONNECTION=124.14.224.115 33038 59.108.228.2 10086 SSH_TTY=/dev/pts/35 +2656881 pts/24 Ss+ 0:01 -zsh LC_TERMINAL_VERSION=3.4.15 LANG=en_US.UTF-8 LC_TERMINAL=iTerm2 USER=lcfjr LOGNAME=lcfjr HOME=/home/lcfjr PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin SHELL=/usr/bin/zsh TERM=xterm-256color XDG_SESSION_ID=6551 XDG_RUNTIME_DIR=/run/user/1008 DBUS_SESSION_BUS_ADDRESS=unix:path=/run/user/1008/bus XDG_SESSION_TYPE=tty XDG_SESSION_CLASS=user MOTD_SHOWN=pam LC_NUMERIC=en_US.UTF-8 LC_TIME=en_US.UTF-8 LC_MONETARY=en_US.UTF-8 LC_PAPER=en_US.UTF-8 LC_NAME=en_US.UTF-8 LC_ADDRESS=en_US.UTF-8 LC_TELEPHONE=en_US.UTF-8 LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=en_US.UTF-8 SSH_CLIENT=124.14.224.115 12979 10086 SSH_CONNECTION=124.14.224.115 12979 59.108.228.2 10086 SSH_TTY=/dev/pts/24 +2673174 pts/36 Ss+ 0:00 /usr/bin/zsh USER=lcfjr SSH_CLIENT=124.14.224.115 24967 10086 LC_TIME=en_US.UTF-8 XDG_SESSION_TYPE=tty SHLVL=1 MOTD_SHOWN=pam HOME=/home/lcfjr OLDPWD=/home/lcfjr LC_MONETARY=en_US.UTF-8 DBUS_SESSION_BUS_ADDRESS=unix:path=/run/user/1008/bus LOGNAME=lcfjr _=/home/lcfjr/.vscode-server/bin/f80445acd5a3dadef24aa209168452a3d97cc326/node XDG_SESSION_CLASS=user XDG_SESSION_ID=6542 PATH=/home/lcfjr/.vscode-server/bin/f80445acd5a3dadef24aa209168452a3d97cc326/bin/remote-cli:/home/lcfjr/miniconda3/bin:/home/lcfjr/miniconda3/condabin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin LC_ADDRESS=en_US.UTF-8 XDG_RUNTIME_DIR=/run/user/1008 LANG=en_US.UTF-8 LC_TELEPHONE=en_US.UTF-8 SHELL=/usr/bin/zsh LC_NAME=en_US.UTF-8 LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=en_US.UTF-8 PWD=/home/lcfjr/codes/RecSysDemo SSH_CONNECTION=124.14.224.115 24967 59.108.228.2 10086 LC_NUMERIC=en_US.UTF-8 LC_PAPER=en_US.UTF-8 ZSH=/home/lcfjr/.oh-my-zsh PAGER=less LESS=-R LSCOLORS=Gxfxcxdxbxegedabagacad 
CONDA_EXE=/home/lcfjr/miniconda3/bin/conda CONDA_PYTHON_EXE=/home/lcfjr/miniconda3/bin/python CONDA_SHLVL=1 CONDA_PREFIX=/home/lcfjr/miniconda3 CONDA_DEFAULT_ENV=base CONDA_PROMPT_MODIFIER=(base) MODULES_CMD=/usr/lib/x86_64-linux-gnu/modulecmd.tcl ENV=/usr/share/modules/init/profile.sh MODULEPATH_modshare=/etc/environment-modules/modules:1:/usr/share/modules/$MODULE_VERSION/modulefiles:1:/usr/share/modules/modulefiles:1:/usr/share/modules/versions:1 BASH_ENV=/usr/share/modules/init/bash MODULESHOME=/usr/share/modules MODULEPATH=/opt/lcsoftware/spack/share/spack/modules/linux-ubuntu20.04-zen2 FPATH=/usr/share/modules/init/zsh-functions:/home/lcfjr/.oh-my-zsh/plugins/git:/home/lcfjr/.oh-my-zsh/functions:/home/lcfjr/.oh-my-zsh/completions:/home/lcfjr/.oh-my-zsh/cache/completions:/usr/local/share/zsh/site-functions:/usr/share/zsh/vendor-functions:/usr/share/zsh/vendor-completions:/usr/share/zsh/functions/Calendar:/usr/share/zsh/functions/Chpwd:/usr/share/zsh/functions/Completion:/usr/share/zsh/functions/Completion/AIX:/usr/share/zsh/functions/Completion/BSD:/usr/share/zsh/functions/Completion/Base:/usr/share/zsh/functions/Completion/Cygwin:/usr/share/zsh/functions/Completion/Darwin:/usr/share/zsh/functions/Completion/Debian:/usr/share/zsh/functions/Completion/Linux:/usr/share/zsh/functions/Completion/Mandriva:/usr/share/zsh/functions/Completion/Redhat:/usr/share/zsh/functions/Completion/Solaris:/usr/share/zsh/functions/Completion/Unix:/usr/share/zsh/functions/Completion/X:/usr/share/zsh/functions/Completion/Zsh:/usr/share/zsh/functions/Completion/openSUSE:/usr/share/zsh/functions/Exceptions:/usr/share/zsh/functions/MIME:/usr/share/zsh/functions/Math:/usr/share/zsh/functions/Misc:/usr/share/zsh/functions/Newuser:/usr/share/zsh/functions/Prompts:/usr/share/zsh/functions/TCP:/usr/share/zsh/functions/VCS_Info:/usr/share/zsh/functions/VCS_Info/Backends:/usr/share/zsh/functions/Zftp:/usr/share/zsh/functions/Zle MANPATH=: CUDA_HOME=/opt/lcsoftware/spack/opt/spack/linux-ubuntu20.04-zen2/gcc-9.3.0/cuda-11.3.1-e4ejcraos3skqdcti64yorl6rrk5et47/ GITTOKEN=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 DATA=/data/scratch/cifar-10 PYTHONPATH=/home/lcfjr/codes/ColossalAI: BROWSER=/home/lcfjr/.vscode-server/bin/f80445acd5a3dadef24aa209168452a3d97cc326/bin/helpers/browser.sh TERM_PROGRAM=vscode TERM_PROGRAM_VERSION=1.64.2 COLORTERM=truecolor VSCODE_GIT_IPC_HANDLE=/run/user/1008/vscode-git-fba67a188a.sock GIT_ASKPASS=/home/lcfjr/.vscode-server/bin/f80445acd5a3dadef24aa209168452a3d97cc326/extensions/git/dist/askpass.sh VSCODE_GIT_ASKPASS_NODE=/home/lcfjr/.vscode-server/bin/f80445acd5a3dadef24aa209168452a3d97cc326/node VSCODE_GIT_ASKPASS_EXTRA_ARGS= VSCODE_GIT_ASKPASS_MAIN=/home/lcfjr/.vscode-server/bin/f80445acd5a3dadef24aa209168452a3d97cc326/extensions/git/dist/askpass-main.js VSCODE_IPC_HOOK_CLI=/run/user/1008/vscode-ipc-0c9910f5-ef18-4234-ba4e-523ff58da4be.sock TERM=xterm-256color + 303953 pts/11 Ss+ 0:00 -zsh BASH_ENV=/usr/share/modules/init/bash CONDA_DEFAULT_ENV=cs CONDA_EXE=/home/lcfjr/miniconda3/bin/conda CONDA_PREFIX=/home/lcfjr/miniconda3/envs/cs CONDA_PREFIX_1=/home/lcfjr/miniconda3 CONDA_PROMPT_MODIFIER=(cs) CONDA_PYTHON_EXE=/home/lcfjr/miniconda3/bin/python CONDA_SHLVL=2 CUDA_HOME=/opt/lcsoftware/spack/opt/spack/linux-ubuntu20.04-zen2/gcc-9.3.0/cuda-11.3.1-e4ejcraos3skqdcti64yorl6rrk5et47/ CUDA_VISIBLE_DEVICES=5 DATA=/data/scratch/cifar-10 DBUS_SESSION_BUS_ADDRESS=unix:path=/run/user/1008/bus ENV=/usr/share/modules/init/profile.sh 
FPATH=/usr/share/modules/init/zsh-functions:/home/lcfjr/.oh-my-zsh/plugins/git:/home/lcfjr/.oh-my-zsh/functions:/home/lcfjr/.oh-my-zsh/completions:/home/lcfjr/.oh-my-zsh/cache/completions:/usr/local/share/zsh/site-functions:/usr/share/zsh/vendor-functions:/usr/share/zsh/vendor-completions:/usr/share/zsh/functions/Calendar:/usr/share/zsh/functions/Chpwd:/usr/share/zsh/functions/Completion:/usr/share/zsh/functions/Completion/AIX:/usr/share/zsh/functions/Completion/BSD:/usr/share/zsh/functions/Completion/Base:/usr/share/zsh/functions/Completion/Cygwin:/usr/share/zsh/functions/Completion/Darwin:/usr/share/zsh/functions/Completion/Debian:/usr/share/zsh/functions/Completion/Linux:/usr/share/zsh/functions/Completion/Mandriva:/usr/share/zsh/functions/Completion/Redhat:/usr/share/zsh/functions/Completion/Solaris:/usr/share/zsh/functions/Completion/Unix:/usr/share/zsh/functions/Completion/X:/usr/share/zsh/functions/Completion/Zsh:/usr/share/zsh/functions/Completion/openSUSE:/usr/share/zsh/functions/Exceptions:/usr/share/zsh/functions/MIME:/usr/share/zsh/functions/Math:/usr/share/zsh/functions/Misc:/usr/share/zsh/functions/Newuser:/usr/share/zsh/functions/Prompts:/usr/share/zsh/functions/TCP:/usr/share/zsh/functions/VCS_Info:/usr/share/zsh/functions/VCS_Info/Backends:/usr/share/zsh/functions/Zftp:/usr/share/zsh/functions/Zle GITTOKEN=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 HOME=/home/lcfjr LANG=en_US.UTF-8 LC_ADDRESS=en_US.UTF-8 LC_IDENTIFICATION=en_US.UTF-8 LC_MEASUREMENT=en_US.UTF-8 LC_MONETARY=en_US.UTF-8 LC_NAME=en_US.UTF-8 LC_NUMERIC=en_US.UTF-8 LC_PAPER=en_US.UTF-8 LC_TELEPHONE=en_US.UTF-8 LC_TERMINAL=iTerm2 LC_TERMINAL_VERSION=3.4.15 LC_TIME=en_US.UTF-8 LESS=-R LOADEDMODULES= LOGNAME=lcfjr LSCOLORS=Gxfxcxdxbxegedabagacad LS_COLORS=rs=0:di=01;34:ln=01;36:mh=00:pi=40;33:so=01;35:do=01;35:bd=40;33;01:cd=40;33;01:or=40;31;01:mi=00:su=37;41:sg=30;43:ca=30;41:tw=30;42:ow=34;42:st=37;44:ex=01;32:*.tar=01;31:*.tgz=01;31:*.arc=01;31:*.arj=01;31:*.taz=01;31:*.lha=01;31:*.lz4=01;31:*.lzh=01;31:*.lzma=01;31:*.tlz=01;31:*.txz=01;31:*.tzo=01;31:*.t7z=01;31:*.zip=01;31:*.z=01;31:*.dz=01;31:*.gz=01;31:*.lrz=01;31:*.lz=01;31:*.lzo=01;31:*.xz=01;31:*.zst=01;31:*.tzst=01;31:*.bz2=01;31:*.bz=01;31:*.tbz=01;31:*.tbz2=01;31:*.tz=01;31:*.deb=01;31:*.rpm=01;31:*.jar=01;31:*.war=01;31:*.ear=01;31:*.sar=01;31:*.rar=01;31:*.alz=01;31:*.ace=01;31:*.zoo=01;31:*.cpio=01;31:*.7z=01;31:*.rz=01;31:*.cab=01;31:*.wim=01;31:*.swm=01;31:*.dwm=01;31:*.esd=01;31:*.jpg=01;35:*.jpeg=01;35:*.mjpg=01;35:*.mjpeg=01;35:*.gif=01;35:*.bmp=01;35:*.pbm=01;35:*.pgm=01;35:*.ppm=01;35:*.tga=01;35:*.xbm=01;35:*.xpm=01;35:*.tif=01;35:*.tiff=01;35:*.png=01;35:*.svg=01;35:*.svgz=01;35:*.mng=01;35:*.pcx=01;35:*.mov=01;35:*.mpg=01;35:*.mpeg=01;35:*.m2v=01;35:*.mkv=01;35:*.webm=01;35:*.ogm=01;35:*.mp4=01;35:*.m4v=01;35:*.mp4v=01;35:*.vob=01;35:*.qt=01;35:*.nuv=01;35:*.wmv=01;35:*.asf=01;35:*.rm=01;35:*.rmvb=01;35:*.flc=01;35:*.avi=01;35:*.fli=01;35:*.flv=01;35:*.gl=01;35:*.dl=01;35:*.xcf=01;35:*.xwd=01;35:*.yuv=01;35:*.cgm=01;35:*.emf=01;35:*.ogv=01;35:*.ogx=01;35:*.aac=00;36:*.au=00;36:*.flac=00;36:*.m4a=00;36:*.mid=00;36:*.midi=00;36:*.mka=00;36:*.mp3=00;36:*.mpc=00;36:*.ogg=00;36:*.ra=00;36:*.wav=00;36:*.oga=00;36:*.opus=00;36:*.spx=00;36:*.xspf=00;36: MANPATH=: MODULEPATH=/opt/lcsoftware/spack/share/spack/modules/linux-ubuntu20.04-zen2 MODULEPATH_modshare=/etc/environment-modules/modules:1:/usr/share/modules/$MODULE_VERSION/modulefiles:1:/usr/share/modules/modulefiles:1:/usr/share/modules/versions:1 MODULESHOME=/usr/share/modules 
MODULES_CMD=/usr/lib/x86_64-linux-gnu/modulecmd.tcl MOTD_SHOWN=pam OLDPWD=/home/lcfjr/codes/shenggui/OPT-Demo/logs PAGER=less PATH=/home/lcfjr/miniconda3/envs/cs/bin:/home/lcfjr/miniconda3/condabin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin PWD=/home/lcfjr/codes/shenggui/OPT-Demo PYTHONPATH=/home/lcfjr/codes/ColossalAI: SHELL=/usr/bin/zsh SHLVL=1 SSH_CLIENT=113.208.117.206 52011 10086 SSH_CONNECTION=113.208.117.206 52011 59.108.228.2 10086 SSH_TTY=/dev/pts/10 TERM=screen TMUX=/tmp//tmux-1008/default,303952,0 TMUX_PANE=%0 USER=lcfjr XDG_RUNTIME_DIR=/run/user/1008 XDG_SESSION_CLASS=user XDG_SESSION_ID=174 XDG_SESSION_TYPE=tty ZSH=/home/lcfjr/.oh-my-zsh _=/usr/bin/tmux _CE_CONDA= _CE_M= diff --git a/examples/language/opt/requirements.txt b/examples/language/opt/requirements.txt new file mode 100644 index 000000000..47bec60d2 --- /dev/null +++ b/examples/language/opt/requirements.txt @@ -0,0 +1,5 @@ +colossalai +torch >= 1.8.1 +datasets >= 1.8.0 +sentencepiece != 0.1.92 +protobuf diff --git a/examples/language/opt/run_clm.py b/examples/language/opt/run_clm.py new file mode 100755 index 000000000..b9283de08 --- /dev/null +++ b/examples/language/opt/run_clm.py @@ -0,0 +1,593 @@ +#!/usr/bin/env python +# coding=utf-8 +# Copyright 2021 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Fine-tuning the library models for causal language modeling (GPT, GPT-2, CTRL, ...) +on a text file or a dataset without using HuggingFace Trainer. + +Here is the full list of checkpoints on the hub that can be fine-tuned by this script: +https://huggingface.co/models?filter=text-generation +""" +# You can also adapt this script on your own causal language modeling task. Pointers for this are left as comments. 
+ +import math +import os +import random +import time +from itertools import chain + +import datasets +import torch +import torch.distributed as dist +from accelerate.utils import set_seed +from datasets import load_dataset +from packaging import version +from titans.utils import barrier_context +from torch.utils.data import DataLoader +from tqdm.auto import tqdm +from utils import colo_memory_cap + +import colossalai +import transformers +from colossalai.context import ParallelMode +from colossalai.core import global_context as gpc +from colossalai.gemini import ChunkManager, GeminiManager +from colossalai.logging import disable_existing_loggers, get_dist_logger +from colossalai.nn.optimizer import HybridAdam +from colossalai.nn.parallel import ZeroDDP +from colossalai.tensor import ProcessGroup +from colossalai.utils import get_current_device, get_dataloader +from colossalai.utils.checkpoint import load_checkpoint, save_checkpoint +from colossalai.utils.model.colo_init_context import ColoInitContext +from colossalai.zero import ZeroOptimizer +from transformers import ( + CONFIG_MAPPING, + MODEL_MAPPING, + AutoConfig, + AutoTokenizer, + GPT2Tokenizer, + OPTForCausalLM, + SchedulerType, + default_data_collator, + get_scheduler, +) +from transformers.utils.versions import require_version + +require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") + +MODEL_CONFIG_CLASSES = list(MODEL_MAPPING.keys()) +MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES) + + +def get_time_stamp(): + torch.cuda.synchronize() + return time.time() + + +def parse_args(): + parser = colossalai.get_default_parser() + parser.add_argument( + "--dataset_name", + type=str, + default=None, + help="The name of the dataset to use (via the datasets library).", + ) + parser.add_argument( + "--dataset_config_name", + type=str, + default=None, + help="The configuration name of the dataset to use (via the datasets library).", + ) + parser.add_argument("--train_file", + type=str, + default=None, + help="A csv or a json file containing the training data.") + parser.add_argument("--validation_file", + type=str, + default=None, + help="A csv or a json file containing the validation data.") + parser.add_argument( + "--validation_split_percentage", + default=5, + help="The percentage of the train set used as validation set in case there's no validation split", + ) + parser.add_argument( + "--model_name_or_path", + type=str, + help="Path to pretrained model or model identifier from huggingface.co/models.", + required=True, + ) + parser.add_argument( + "--config_name", + type=str, + default=None, + help="Pretrained config name or path if not the same as model_name", + ) + parser.add_argument( + "--tokenizer_name", + type=str, + default=None, + help="Pretrained tokenizer name or path if not the same as model_name", + ) + parser.add_argument( + "--use_slow_tokenizer", + action="store_true", + help="If passed, will use a slow tokenizer (not backed by the 🤗 Tokenizers library).", + ) + parser.add_argument( + "--per_device_train_batch_size", + type=int, + default=8, + help="Batch size (per device) for the training dataloader.", + ) + parser.add_argument( + "--per_device_eval_batch_size", + type=int, + default=8, + help="Batch size (per device) for the evaluation dataloader.", + ) + parser.add_argument( + "--learning_rate", + type=float, + default=5e-5, + help="Initial learning rate (after the potential warmup period) to use.", + ) + 
parser.add_argument("--weight_decay", type=float, default=0.0, help="Weight decay to use.") + parser.add_argument("--num_train_epochs", type=int, default=3, help="Total number of training epochs to perform.") + parser.add_argument( + "--max_train_steps", + type=int, + default=None, + help="Total number of training steps to perform. If provided, overrides num_train_epochs.", + ) + parser.add_argument( + "--gradient_accumulation_steps", + type=int, + default=1, + help="Number of updates steps to accumulate before performing a backward/update pass.", + ) + parser.add_argument( + "--lr_scheduler_type", + type=SchedulerType, + default="linear", + help="The scheduler type to use.", + choices=["linear", "cosine", "cosine_with_restarts", "polynomial", "constant", "constant_with_warmup"], + ) + parser.add_argument("--num_warmup_steps", + type=int, + default=0, + help="Number of steps for the warmup in the lr scheduler.") + parser.add_argument("--output_dir", type=str, default=None, help="Where to store the final model.") + parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.") + parser.add_argument( + "--model_type", + type=str, + default=None, + help="Model type to use if training from scratch.", + choices=MODEL_TYPES, + ) + parser.add_argument( + "--block_size", + type=int, + default=None, + help=("Optional input sequence length after tokenization. The training dataset will be truncated in block of" + " this size for training. Default to the model max input length for single sentence inputs (take into" + " account special tokens)."), + ) + parser.add_argument( + "--preprocessing_num_workers", + type=int, + default=None, + help="The number of processes to use for the preprocessing.", + ) + parser.add_argument("--overwrite_cache", + type=bool, + default=False, + help="Overwrite the cached training and evaluation sets") + parser.add_argument("--no_keep_linebreaks", + action="store_true", + help="Do not keep line breaks when using TXT files.") + parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.") + parser.add_argument("--hub_model_id", + type=str, + help="The name of the repository to keep in sync with the local `output_dir`.") + parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.") + parser.add_argument( + "--checkpointing_steps", + type=str, + default=None, + help="Whether the various states should be saved at the end of every n steps, or 'epoch' for each epoch.", + ) + parser.add_argument( + "--resume_from_checkpoint", + type=str, + default=None, + help="If the training should continue from a checkpoint folder.", + ) + parser.add_argument( + "--with_tracking", + action="store_true", + help="Whether to enable experiment trackers for logging.", + ) + parser.add_argument( + "--report_to", + type=str, + default="all", + help=('The integration to report the results and logs to. Supported platforms are `"tensorboard"`,' + ' `"wandb"` and `"comet_ml"`. Use `"all"` (default) to report to all integrations.' 
+ "Only applicable when `--with_tracking` is passed."), + ) + + parser.add_argument("--mem_cap", type=int, default=0, help="use mem cap") + parser.add_argument("--init_in_cpu", action='store_true', default=False, help="init training model in cpu") + args = parser.parse_args() + + # Sanity checks + if args.dataset_name is None and args.train_file is None and args.validation_file is None: + raise ValueError("Need either a dataset name or a training/validation file.") + else: + if args.train_file is not None: + extension = args.train_file.split(".")[-1] + assert extension in ["csv", "json", "txt"], "`train_file` should be a csv, json or txt file." + if args.validation_file is not None: + extension = args.validation_file.split(".")[-1] + assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, json or txt file." + + if args.push_to_hub: + assert args.output_dir is not None, "Need an `output_dir` to create a repo when `--push_to_hub` is passed." + + return args + + +def main(): + args = parse_args() + disable_existing_loggers() + colossalai.launch_from_torch(config=dict()) + logger = get_dist_logger() + is_main_process = gpc.get_local_rank(ParallelMode.DATA) == 0 + + if is_main_process: + datasets.utils.logging.set_verbosity_warning() + transformers.utils.logging.set_verbosity_info() + else: + datasets.utils.logging.set_verbosity_error() + transformers.utils.logging.set_verbosity_error() + + if args.mem_cap > 0: + colo_memory_cap(args.mem_cap) + + # If passed along, set the training seed now. + if args.seed is not None: + set_seed(args.seed) + logger.info(f"Rank {dist.get_rank()}: random seed is set to {args.seed}") + + # Handle the repository creation + with barrier_context(): + if args.output_dir is not None: + os.makedirs(args.output_dir, exist_ok=True) + + # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) + # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ + # (the dataset will be downloaded automatically from the datasets Hub). + # + # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called + # 'text' is found. You can easily tweak this behavior (see below). + # + # In distributed training, the load_dataset function guarantee that only one local process can concurrently + # download the dataset. + logger.info("Start preparing dataset", ranks=[0]) + if args.dataset_name is not None: + # Downloading and loading a dataset from the hub. + raw_datasets = load_dataset(args.dataset_name, args.dataset_config_name) + if "validation" not in raw_datasets.keys(): + raw_datasets["validation"] = load_dataset( + args.dataset_name, + args.dataset_config_name, + split=f"train[:{args.validation_split_percentage}%]", + ) + raw_datasets["train"] = load_dataset( + args.dataset_name, + args.dataset_config_name, + split=f"train[{args.validation_split_percentage}%:]", + ) + else: + data_files = {} + dataset_args = {} + if args.train_file is not None: + data_files["train"] = args.train_file + if args.validation_file is not None: + data_files["validation"] = args.validation_file + extension = args.train_file.split(".")[-1] + if extension == "txt": + extension = "text" + dataset_args["keep_linebreaks"] = not args.no_keep_linebreaks + raw_datasets = load_dataset(extension, data_files=data_files, **dataset_args) + # If no validation data is there, validation_split_percentage will be used to divide the dataset. 
+ if "validation" not in raw_datasets.keys(): + raw_datasets["validation"] = load_dataset( + extension, + data_files=data_files, + split=f"train[:{args.validation_split_percentage}%]", + **dataset_args, + ) + raw_datasets["train"] = load_dataset( + extension, + data_files=data_files, + split=f"train[{args.validation_split_percentage}%:]", + **dataset_args, + ) + logger.info("Dataset is prepared", ranks=[0]) + + # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at + # https://huggingface.co/docs/datasets/loading_datasets.html. + + # Load pretrained model and tokenizer + # + # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently + # download model & vocab. + if args.config_name: + config = AutoConfig.from_pretrained(args.config_name) + elif args.model_name_or_path: + config = AutoConfig.from_pretrained(args.model_name_or_path) + else: + config = CONFIG_MAPPING[args.model_type]() + logger.warning("You are instantiating a new config instance from scratch.") + logger.info("Model config has been created", ranks=[0]) + + if args.model_name_or_path == 'facebook/opt-13b': + tokenizer = GPT2Tokenizer.from_pretrained(args.model_name_or_path) + else: + print(f'load model from {args.model_name_or_path}') + tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path, use_fast=not args.use_slow_tokenizer) + logger.info(f"{tokenizer.__class__.__name__} has been created", ranks=[0]) + + if args.init_in_cpu: + init_dev = torch.device('cpu') + else: + init_dev = get_current_device() + + # build model + if args.model_name_or_path is None or args.model_name_or_path == 'facebook/opt-13b': + # currently, there has a bug in pretrained opt-13b + # we can not import it until huggingface fix it + logger.info("Train a new model from scratch", ranks=[0]) + with ColoInitContext(device=init_dev): + model = OPTForCausalLM(config) + else: + logger.info("Finetune a pre-trained model", ranks=[0]) + with ColoInitContext(device=init_dev): + model = OPTForCausalLM.from_pretrained(args.model_name_or_path, + from_tf=bool(".ckpt" in args.model_name_or_path), + config=config, + local_files_only=False) + + # enable graident checkpointing + model.gradient_checkpointing_enable() + + PLACEMENT_POLICY = 'auto' + cai_version = colossalai.__version__ + logger.info(f'using Colossal-AI version {cai_version}') + if version.parse(cai_version) > version.parse("0.1.10"): + from colossalai.gemini import GeminiManager + from colossalai.gemini.chunk import init_chunk_manager + chunk_manager = init_chunk_manager(model=model, init_device=get_current_device(), search_range_mb=32) + gemini_manager = GeminiManager(PLACEMENT_POLICY, chunk_manager) + model = ZeroDDP(model, gemini_manager, pin_memory=True) + elif version.parse(cai_version) <= version.parse("0.1.10") and version.parse(cai_version) >= version.parse("0.1.9"): + from colossalai.gemini import ChunkManager, GeminiManager + pg = ProcessGroup() + chunk_size = ChunkManager.search_chunk_size(model, 64 * 1024**2, 32) + chunk_manager = ChunkManager(chunk_size, + pg, + enable_distributed_storage=True, + init_device=GeminiManager.get_default_device(PLACEMENT_POLICY)) + + logger.info(f'{model.__class__.__name__} has been created', ranks=[0]) + + # Preprocessing the datasets. + # First we tokenize all the texts. 
+ column_names = raw_datasets["train"].column_names + text_column_name = "text" if "text" in column_names else column_names[0] + + def tokenize_function(examples): + return tokenizer(examples[text_column_name]) + + with barrier_context(executor_rank=0, parallel_mode=ParallelMode.DATA): + tokenized_datasets = raw_datasets.map( + tokenize_function, + batched=True, + num_proc=args.preprocessing_num_workers, + remove_columns=column_names, + load_from_cache_file=not args.overwrite_cache, + desc="Running tokenizer on dataset", + ) + + if args.block_size is None: + block_size = tokenizer.model_max_length + if block_size > 1024: + logger.warning( + f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). " + "Picking 1024 instead. You can change that default value by passing --block_size xxx.") + block_size = 1024 + else: + if args.block_size > tokenizer.model_max_length: + logger.warning(f"The block_size passed ({args.block_size}) is larger than the maximum length for the model" + f"({tokenizer.model_max_length}). Using block_size={tokenizer.model_max_length}.") + block_size = min(args.block_size, tokenizer.model_max_length) + + # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size. + def group_texts(examples): + # Concatenate all texts. + concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()} + total_length = len(concatenated_examples[list(examples.keys())[0]]) + # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can + # customize this part to your needs. + if total_length >= block_size: + total_length = (total_length // block_size) * block_size + # Split by chunks of max_len. + result = { + k: [t[i:i + block_size] for i in range(0, total_length, block_size) + ] for k, t in concatenated_examples.items() + } + result["labels"] = result["input_ids"].copy() + return result + + # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a remainder + # for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value might be slower + # to preprocess. + # + # To speed up this part, we use multiprocessing. See the documentation of the map method for more information: + # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map + + with barrier_context(executor_rank=0, parallel_mode=ParallelMode.DATA): + lm_datasets = tokenized_datasets.map( + group_texts, + batched=True, + num_proc=args.preprocessing_num_workers, + load_from_cache_file=not args.overwrite_cache, + desc=f"Grouping texts in chunks of {block_size}", + ) + + train_dataset = lm_datasets["train"] + eval_dataset = lm_datasets["validation"] + + # Log a few random samples from the training set: + # for index in random.sample(range(len(train_dataset)), 3): + # logger.info(f"Sample {index} of the training set: {train_dataset[index]}.") + + # DataLoaders creation: + train_dataloader = get_dataloader(train_dataset, + shuffle=True, + add_sampler=True, + collate_fn=default_data_collator, + batch_size=args.per_device_train_batch_size) + eval_dataloader = DataLoader(eval_dataset, + collate_fn=default_data_collator, + batch_size=args.per_device_eval_batch_size) + logger.info("Dataloaders have been created", ranks=[0]) + + # Optimizer + # Split weights in two groups, one with weight decay and the other not. 
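+    # Parameters whose names contain an entry of `no_decay` (biases and LayerNorm weights) receive
+    # weight_decay=0.0; all remaining parameters use args.weight_decay.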
+ no_decay = ["bias", "LayerNorm.weight"] + optimizer_grouped_parameters = [ + { + "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], + "weight_decay": args.weight_decay, + }, + { + "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], + "weight_decay": 0.0, + }, + ] + + optimizer = HybridAdam(optimizer_grouped_parameters, lr=args.learning_rate) + optimizer = ZeroOptimizer(optimizer, model, initial_scale=2**14) + + # Scheduler and math around the number of training steps. + overrode_max_train_steps = False + num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) + if args.max_train_steps is None: + args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch + overrode_max_train_steps = True + + lr_scheduler = get_scheduler( + name=args.lr_scheduler_type, + optimizer=optimizer, + num_warmup_steps=args.num_warmup_steps, + num_training_steps=args.max_train_steps, + ) + + # We need to recalculate our total training steps as the size of the training dataloader may have changed. + num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) + if overrode_max_train_steps: + args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch + # Afterwards we recalculate our number of training epochs + args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) + + # Train! + total_batch_size = args.per_device_train_batch_size * gpc.get_world_size(ParallelMode.DATA) + + logger.info("***** Running training *****", ranks=[0]) + logger.info(f" Num examples = {len(train_dataset)}", ranks=[0]) + logger.info(f" Num Epochs = {args.num_train_epochs}", ranks=[0]) + logger.info(f" Instantaneous batch size per device = {args.per_device_train_batch_size}", ranks=[0]) + logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}", ranks=[0]) + logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}", ranks=[0]) + logger.info(f" Total optimization steps = {args.max_train_steps}", ranks=[0]) + + # Only show the progress bar once on each machine. 
+ progress_bar = tqdm(range(args.max_train_steps), disable=not is_main_process) + completed_steps = 0 + starting_epoch = 0 + global_step = 0 + + for epoch in range(starting_epoch, args.num_train_epochs): + + if completed_steps >= args.max_train_steps: + break + + model.train() + for step, batch in enumerate(train_dataloader): + batch = {k: v.cuda() for k, v in batch.items()} + outputs = model(**batch) + loss = outputs['loss'] + optimizer.backward(loss) + + if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1: + optimizer.step() + lr_scheduler.step() + optimizer.zero_grad() + progress_bar.update(1) + completed_steps += 1 + + global_step += 1 + logger.info("Global step {} finished".format(global_step + 1), ranks=[0]) + + if completed_steps >= args.max_train_steps: + break + + model.eval() + losses = [] + for step, batch in enumerate(eval_dataloader): + with torch.no_grad(): + batch = {k: v.cuda() for k, v in batch.items()} + outputs = model(**batch) + + loss = outputs['loss'].unsqueeze(0) + losses.append(loss) + + losses = torch.cat(losses) + losses = losses[:len(eval_dataset)] + try: + eval_loss = torch.mean(losses) + perplexity = math.exp(eval_loss) + except OverflowError: + perplexity = float("inf") + + logger.info(f"Epoch {epoch}: perplexity: {perplexity} eval_loss: {eval_loss}", ranks=[0]) + + if args.output_dir is not None: + model_state = model.state_dict() + if is_main_process: + torch.save(model_state, args.output_dir + '/epoch_{}_model.pth'.format(completed_steps)) + dist.barrier() + # load_state = torch.load(args.output_dir + '/epoch_{}_model.pth'.format(completed_steps)) + # model.load_state_dict(load_state, strict=False) + + logger.info("Training finished", ranks=[0]) + + +if __name__ == "__main__": + main() diff --git a/examples/language/opt/run_clm.sh b/examples/language/opt/run_clm.sh new file mode 100644 index 000000000..858d3325a --- /dev/null +++ b/examples/language/opt/run_clm.sh @@ -0,0 +1,22 @@ +set -x +export BS=${1:-16} +export MEMCAP=${2:-0} +export MODEL=${3:-"125m"} +export GPUNUM=${4:-1} + +# make directory for logs +mkdir -p ./logs + +export MODLE_PATH="facebook/opt-${MODEL}" + +# HF_DATASETS_OFFLINE=1 TRANSFORMERS_OFFLINE=1 +torchrun \ + --nproc_per_node ${GPUNUM} \ + --master_port 19198 \ + run_clm.py \ + --dataset_name wikitext \ + --dataset_config_name wikitext-2-raw-v1 \ + --output_dir $PWD \ + --mem_cap ${MEMCAP} \ + --model_name_or_path ${MODLE_PATH} \ + --per_device_train_batch_size ${BS} 2>&1 | tee ./logs/colo_${MODEL}_bs_${BS}_cap_${MEMCAP}_gpu_${GPUNUM}.log diff --git a/examples/language/opt/utils.py b/examples/language/opt/utils.py new file mode 100644 index 000000000..a7651e5e4 --- /dev/null +++ b/examples/language/opt/utils.py @@ -0,0 +1,28 @@ +import torch +import torch.distributed as dist + + +def memory_cap(size_in_GB): + print(f"use only {size_in_GB} GB of CUDA memory") + assert dist.is_initialized(), "memory_cap must be used after dist init" + local_rank = dist.get_rank() + cuda_capacity = torch.cuda.get_device_properties(local_rank).total_memory + size_in_B = (size_in_GB * 1024**3) + if size_in_B > cuda_capacity: + print(f'memory_cap is uselsess since {cuda_capacity / 1024**3} less than {size_in_GB}') + return + fraction = (size_in_GB * 1024**3) / cuda_capacity + print(f'mem faction is {fraction}') + torch.cuda.set_per_process_memory_fraction(fraction, local_rank) + + +def colo_memory_cap(size_in_GB): + from colossalai.utils import colo_device_memory_capacity, colo_set_process_memory_fraction, get_current_device 
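+    # Express the cap (given in GB) as a fraction of the device's total memory; it is only applied
+    # when the requested cap is smaller than the physical capacity.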
+ cuda_capacity = colo_device_memory_capacity(get_current_device()) + if size_in_GB * (1024**3) < cuda_capacity: + colo_set_process_memory_fraction(size_in_GB * (1024**3) / cuda_capacity) + print("Using {} GB of GPU memory".format(size_in_GB)) + + +if __name__ == '__main__': + memory_cap(40)