From fd2c8d8156d858545743b5c5c96c7a5f2d378c92 Mon Sep 17 00:00:00 2001
From: Jiarui Fang
Date: Tue, 8 Nov 2022 10:39:13 +0800
Subject: [PATCH] [example] add opt model in language (#1809)

---
 examples/language/opt/README.md          |  49 ++
 examples/language/opt/benchmark.sh       |  21 +
 examples/language/opt/colossalai_zero.py |   6 +
 examples/language/opt/log                |  10 +
 examples/language/opt/requirements.txt   |   5 +
 examples/language/opt/run_clm.py         | 593 +++++++++++++++++++++++
 examples/language/opt/run_clm.sh         |  22 +
 examples/language/opt/utils.py           |  28 ++
 8 files changed, 734 insertions(+)
 create mode 100644 examples/language/opt/README.md
 create mode 100644 examples/language/opt/benchmark.sh
 create mode 100644 examples/language/opt/colossalai_zero.py
 create mode 100644 examples/language/opt/log
 create mode 100644 examples/language/opt/requirements.txt
 create mode 100755 examples/language/opt/run_clm.py
 create mode 100644 examples/language/opt/run_clm.sh
 create mode 100644 examples/language/opt/utils.py

diff --git a/examples/language/opt/README.md b/examples/language/opt/README.md
new file mode 100644
index 000000000..a2a7f8c6a
--- /dev/null
+++ b/examples/language/opt/README.md
@@ -0,0 +1,49 @@
+
+
+## OPT
+Meta recently released [Open Pretrained Transformer (OPT)](https://github.com/facebookresearch/metaseq), a 175-billion-parameter AI language model that enables AI programmers to build various downstream tasks and applications.
+
+The following example of [Colossal-AI](https://github.com/hpcaitech/ColossalAI) demonstrates how to fine-tune a causal language model at low cost.
+
+We use the pre-trained weights of the OPT model provided by the Hugging Face Hub on the raw WikiText-2 dataset (no tokens were replaced before
+the tokenization). This training script is adapted from the [HuggingFace Language Modelling examples](https://github.com/huggingface/transformers/tree/main/examples/pytorch/language-modeling).
+
+## Quick Start
+You can launch training with the following bash script:
+
+```bash
+bash ./run_clm.sh
+```
+
+- batch-size-per-gpu: number of samples fed to each GPU, default is 16
+- mem-cap: limit GPU memory usage to a given value in GB, default is 0 (no limit)
+- model: the size of the OPT model; the default in `run_clm.sh` is `125m`. Acceptable values include `125m`, `350m`, `1.3b`, `2.7b`, `6.7b`, `13b`, `30b`, `66b`. For `175b`, you can request
+the pretrained weights from the [OPT weight downloading page](https://github.com/facebookresearch/metaseq/tree/main/projects/OPT).
+- gpu-num: the number of GPUs to use, default is 1.
+
+A complete example command that passes all four arguments is shown below.
+
+## Remarkable Performance
+On a single GPU, Colossal-AI’s automatic strategy provides remarkable performance gains over the ZeRO Offloading strategy from Microsoft DeepSpeed.
+Users can experience up to a 40% speedup at a variety of model scales. However, when using a traditional deep learning training framework like PyTorch, a single GPU can no longer support the training of models at such a scale.
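+
+For example, the following commands (illustrative values; the four positional arguments are read by `run_clm.sh` in the order batch-size-per-gpu, mem-cap, model, gpu-num, the same order used by `benchmark.sh`) launch typical runs:
+
+```bash
+# 8-GPU run of the 6.7b model with 16 samples per GPU and no memory cap
+bash ./run_clm.sh 16 0 6.7b 8
+
+# single-GPU run with usable GPU memory capped at 40 GB
+bash ./run_clm.sh 16 40 6.7b 1
+```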
+ +Adopting the distributed training strategy with 8 GPUs is as simple as adding a `-nprocs 8` to the training command of Colossal-AI! + +More details about behind the scenes can be found on the corresponding [blog](https://medium.com/@yangyou_berkeley/colossal-ai-seamlessly-accelerates-large-models-at-low-costs-with-hugging-face-4d1a887e500d), +and a detailed tutorial will be added in [Documentation](https://www.colossalai.org/docs/get_started/installation) very soon. diff --git a/examples/language/opt/benchmark.sh b/examples/language/opt/benchmark.sh new file mode 100644 index 000000000..f02f7629a --- /dev/null +++ b/examples/language/opt/benchmark.sh @@ -0,0 +1,21 @@ +export BS=16 +export MEMCAP=0 +export MODEL="6.7b" +export GPUNUM=1 + +for MODEL in "6.7b" "13b" "1.3b" +do +for GPUNUM in 8 1 +do +for BS in 16 24 32 8 +do +for MEMCAP in 0 40 +do +pkill -9 torchrun +pkill -9 python + +bash ./run_clm.sh $BS $MEMCAP $MODEL $GPUNUM +done +done +done +done diff --git a/examples/language/opt/colossalai_zero.py b/examples/language/opt/colossalai_zero.py new file mode 100644 index 000000000..833745f3e --- /dev/null +++ b/examples/language/opt/colossalai_zero.py @@ -0,0 +1,6 @@ +from colossalai.zero.shard_utils import TensorShardStrategy + +zero = dict(model_config=dict(shard_strategy=TensorShardStrategy(), + tensor_placement_policy="auto", + reuse_fp16_shard=True), + optimizer_config=dict(gpu_margin_mem_ratio=0.8, initial_scale=16384)) diff --git a/examples/language/opt/log b/examples/language/opt/log new file mode 100644 index 000000000..4284d0038 --- /dev/null +++ b/examples/language/opt/log @@ -0,0 +1,10 @@ + PID TTY STAT TIME COMMAND +2767195 pts/19 Ss 0:01 -zsh LC_TERMINAL_VERSION=3.4.15 LANG=en_US.UTF-8 LC_TERMINAL=iTerm2 USER=lcfjr LOGNAME=lcfjr HOME=/home/lcfjr PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin SHELL=/usr/bin/zsh TERM=xterm-256color XDG_SESSION_ID=6572 XDG_RUNTIME_DIR=/run/user/1008 DBUS_SESSION_BUS_ADDRESS=unix:path=/run/user/1008/bus XDG_SESSION_TYPE=tty XDG_SESSION_CLASS=user MOTD_SHOWN=pam LC_NUMERIC=en_US.UTF-8 LC_TIME=en_US.UTF-8 LC_MONETARY=en_US.UTF-8 LC_PAPER=en_US.UTF-8 LC_NAME=en_US.UTF-8 LC_ADDRESS=en_US.UTF-8 LC_TELEPHONE=en_US.UTF-8 LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=en_US.UTF-8 SSH_CLIENT=124.14.224.115 17177 10086 SSH_CONNECTION=124.14.224.115 17177 59.108.228.2 10086 SSH_TTY=/dev/pts/19 +2810171 pts/19 T 0:00 \_ bash run_clm.sh LC_TERMINAL_VERSION=3.4.15 LANG=en_US.UTF-8 LC_TERMINAL=iTerm2 USER=lcfjr LOGNAME=lcfjr HOME=/home/lcfjr PATH=/home/lcfjr/miniconda3/envs/cs/bin:/home/lcfjr/miniconda3/condabin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin SHELL=/usr/bin/zsh TERM=xterm-256color XDG_SESSION_ID=6572 XDG_RUNTIME_DIR=/run/user/1008 DBUS_SESSION_BUS_ADDRESS=unix:path=/run/user/1008/bus XDG_SESSION_TYPE=tty XDG_SESSION_CLASS=user MOTD_SHOWN=pam LC_NUMERIC=en_US.UTF-8 LC_TIME=en_US.UTF-8 LC_MONETARY=en_US.UTF-8 LC_PAPER=en_US.UTF-8 LC_NAME=en_US.UTF-8 LC_ADDRESS=en_US.UTF-8 LC_TELEPHONE=en_US.UTF-8 LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=en_US.UTF-8 SSH_CLIENT=124.14.224.115 17177 10086 SSH_CONNECTION=124.14.224.115 17177 59.108.228.2 10086 SSH_TTY=/dev/pts/19 SHLVL=1 PWD=/home/lcfjr/codes/ColossalAI/examples/language/opt OLDPWD=/home/lcfjr/codes/Titans ZSH=/home/lcfjr/.oh-my-zsh PAGER=less LESS=-R LSCOLORS=Gxfxcxdxbxegedabagacad 
LS_COLORS=rs=0:di=01;34:ln=01;36:mh=00:pi=40;33:so=01;35:do=01;35:bd=40;33;01:cd=40;33;01:or=40;31;01:mi=00:su=37;41:sg=30;43:ca=30;41:tw=30;42:ow=34;42:st=37;44:ex=01;32:*.tar=01;31:*.tgz=01;31:*.arc=01;31:*.arj=01;31:*.taz=01;31:*.lha=01;31:*.lz4=01;31:*.lzh=01;31:*.lzma=01;31:*.tlz=01;31:*.txz=01;31:*.tzo=01;31:*.t7z=01;31:*.zip=01;31:*.z=01;31:*.dz=01;31:*.gz=01;31:*.lrz=01;31:*.lz=01;31:*.lzo=01;31:*.xz=01;31:*.zst=01;31:*.tzst=01;31:*.bz2=01;31:*.bz=01;31:*.tbz=01;31:*.tbz2=01;31:*.tz=01;31:*.deb=01;31:*.rpm=01;31:*.jar=01;31:*.war=01;31:*.ear=01;31:*.sar=01;31:*.rar=01;31:*.alz=01;31:*.ace=01;31:*.zoo=01;31:*.cpio=01;31:*.7z=01;31:*.rz=01;31:*.cab=01;31:*.wim=01;31:*.swm=01;31:*.dwm=01;31:*.esd=01;31:*.jpg=01;35:*.jpeg=01;35:*.mjpg=01;35:*.mjpeg=01;35:*.gif=01;35:*.bmp=01;35:*.pbm=01;35:*.pgm=01;35:*.ppm=01;35:*.tga=01;35:*.xbm=01;35:*.xpm=01;35:*.tif=01;35:*.tiff=01;35:*.png=01;35:*.svg=01;35:*.svgz=01;35:*.mng=01;35:*.pcx=01;35:*.mov=01;35:*.mpg=01;35:*.mpeg=01;35:*.m2v=01;35:*.mkv=01;35:*.webm=01;35:*.ogm=01;35:*.mp4=01;35:*.m4v=01;35:*.mp4v=01;35:*.vob=01;35:*.qt=01;35:*.nuv=01;35:*.wmv=01;35:*.asf=01;35:*.rm=01;35:*.rmvb=01;35:*.flc=01;35:*.avi=01;35:*.fli=01;35:*.flv=01;35:*.gl=01;35:*.dl=01;35:*.xcf=01;35:*.xwd=01;35:*.yuv=01;35:*.cgm=01;35:*.emf=01;35:*.ogv=01;35:*.ogx=01;35:*.aac=00;36:*.au=00;36:*.flac=00;36:*.m4a=00;36:*.mid=00;36:*.midi=00;36:*.mka=00;36:*.mp3=00;36:*.mpc=00;36:*.ogg=00;36:*.ra=00;36:*.wav=00;36:*.oga=00;36:*.opus=00;36:*.spx=00;36:*.xspf=00;36: CONDA_EXE=/home/lcfjr/miniconda3/bin/conda _CE_M= _CE_CONDA= CONDA_PYTHON_EXE=/home/lcfjr/miniconda3/bin/python CONDA_SHLVL=3 CONDA_PREFIX=/home/lcfjr/miniconda3/envs/cs CONDA_DEFAULT_ENV=cs CONDA_PROMPT_MODIFIER=(cs) MODULES_CMD=/usr/lib/x86_64-linux-gnu/modulecmd.tcl ENV=/usr/share/modules/init/profile.sh MODULEPATH_modshare=/etc/environment-modules/modules:1:/usr/share/modules/$MODULE_VERSION/modulefiles:1:/usr/share/modules/modulefiles:1:/usr/share/modules/versions:1 BASH_ENV=/usr/share/modules/init/bash MODULESHOME=/usr/share/modules LOADEDMODULES=proxy/0.0.1-gcc-9.3.0 MODULEPATH=/opt/lcsoftware/spack/share/spack/modules/linux-ubuntu20.04-zen2 FPATH=/usr/share/modules/init/zsh-functions:/home/lcfjr/.oh-my-zsh/plugins/git:/home/lcfjr/.oh-my-zsh/functions:/home/lcfjr/.oh-my-zsh/completions:/home/lcfjr/.oh-my-zsh/cache/completions:/usr/local/share/zsh/site-functions:/usr/share/zsh/vendor-functions:/usr/share/zsh/vendor-completions:/usr/share/zsh/functions/Calendar:/usr/share/zsh/functions/Chpwd:/usr/share/zsh/functions/Completion:/usr/share/zsh/functions/Completion/AIX:/usr/share/zsh/functions/Completion/BSD:/usr/share/zsh/functions/Completion/Base:/usr/share/zsh/functions/Completion/Cygwin:/usr/share/zsh/functions/Completion/Darwin:/usr/share/zsh/functions/Completion/Debian:/usr/share/zsh/functions/Completion/Linux:/usr/share/zsh/functions/Completion/Mandriva:/usr/share/zsh/functions/Completion/Redhat:/usr/share/zsh/functions/Completion/Solaris:/usr/share/zsh/functions/Completion/Unix:/usr/share/zsh/functions/Completion/X:/usr/share/zsh/functions/Completion/Zsh:/usr/share/zsh/functions/Completion/openSUSE:/usr/share/zsh/functions/Exceptions:/usr/share/zsh/functions/MIME:/usr/share/zsh/functions/Math:/usr/share/zsh/functions/Misc:/usr/share/zsh/functions/Newuser:/usr/share/zsh/functions/Prompts:/usr/share/zsh/functions/TCP:/usr/share/zsh/functions/VCS_Info:/usr/share/zsh/functions/VCS_Info/Backends:/usr/share/zsh/functions/Zftp:/usr/share/zsh/functions/Zle MANPATH=: 
CUDA_HOME=/opt/lcsoftware/spack/opt/spack/linux-ubuntu20.04-zen2/gcc-9.3.0/cuda-11.3.1-e4ejcraos3skqdcti64yorl6rrk5et47/ GITTOKEN=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 DATA=/data/scratch/cifar-10 PYTHONPATH=/home/lcfjr/codes/ColossalAI: CONDA_PREFIX_1=/home/lcfjr/miniconda3 RSYNC_PROXY=172.17.0.1:7890 all_proxy=socks5://172.17.0.1:7890 _LMFILES_=/opt/lcsoftware/spack/share/spack/modules/linux-ubuntu20.04-zen2/proxy/0.0.1-gcc-9.3.0 https_proxy_modshare=http:1:7890:1://172.17.0.1:1 http_proxy=http://172.17.0.1:7890 RSYNC_PROXY_modshare=7890:1:172.17.0.1:1 http_proxy_modshare=http:1:7890:1://172.17.0.1:1 https_proxy=http://172.17.0.1:7890 all_proxy_modshare=socks5:1:7890:1://172.17.0.1:1 LOADEDMODULES_modshare=proxy/0.0.1-gcc-9.3.0:1 _LMFILES__modshare=/opt/lcsoftware/spack/share/spack/modules/linux-ubuntu20.04-zen2/proxy/0.0.1-gcc-9.3.0:1 CUDA_VISIBLE_DEVICES=6 CONDA_PREFIX_2=/home/lcfjr/miniconda3/envs/dev _=/usr/bin/bash +2810176 pts/19 Tl 0:01 | \_ /home/lcfjr/miniconda3/envs/cs/bin/python /home/lcfjr/miniconda3/envs/cs/bin/torchrun --nproc_per_node 1 --master_port 19198 run_clm.py --dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 --model_name_or_path facebook/opt-1.3b --output_dir /home/lcfjr/codes/ColossalAI/examples/language/opt --mem_cap 0 --per_device_train_batch_size 16 SHELL=/usr/bin/zsh LSCOLORS=Gxfxcxdxbxegedabagacad LESS=-R GPUNUM=1 CONDA_EXE=/home/lcfjr/miniconda3/bin/conda _CE_M= FPATH=/usr/share/modules/init/zsh-functions:/home/lcfjr/.oh-my-zsh/plugins/git:/home/lcfjr/.oh-my-zsh/functions:/home/lcfjr/.oh-my-zsh/completions:/home/lcfjr/.oh-my-zsh/cache/completions:/usr/local/share/zsh/site-functions:/usr/share/zsh/vendor-functions:/usr/share/zsh/vendor-completions:/usr/share/zsh/functions/Calendar:/usr/share/zsh/functions/Chpwd:/usr/share/zsh/functions/Completion:/usr/share/zsh/functions/Completion/AIX:/usr/share/zsh/functions/Completion/BSD:/usr/share/zsh/functions/Completion/Base:/usr/share/zsh/functions/Completion/Cygwin:/usr/share/zsh/functions/Completion/Darwin:/usr/share/zsh/functions/Completion/Debian:/usr/share/zsh/functions/Completion/Linux:/usr/share/zsh/functions/Completion/Mandriva:/usr/share/zsh/functions/Completion/Redhat:/usr/share/zsh/functions/Completion/Solaris:/usr/share/zsh/functions/Completion/Unix:/usr/share/zsh/functions/Completion/X:/usr/share/zsh/functions/Completion/Zsh:/usr/share/zsh/functions/Completion/openSUSE:/usr/share/zsh/functions/Exceptions:/usr/share/zsh/functions/MIME:/usr/share/zsh/functions/Math:/usr/share/zsh/functions/Misc:/usr/share/zsh/functions/Newuser:/usr/share/zsh/functions/Prompts:/usr/share/zsh/functions/TCP:/usr/share/zsh/functions/VCS_Info:/usr/share/zsh/functions/VCS_Info/Backends:/usr/share/zsh/functions/Zftp:/usr/share/zsh/functions/Zle LC_ADDRESS=en_US.UTF-8 LC_NAME=en_US.UTF-8 GITTOKEN=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 _LMFILES__modshare=/opt/lcsoftware/spack/share/spack/modules/linux-ubuntu20.04-zen2/proxy/0.0.1-gcc-9.3.0:1 all_proxy_modshare=socks5:1:7890:1://172.17.0.1:1 LC_MONETARY=en_US.UTF-8 ENV=/usr/share/modules/init/profile.sh PWD=/home/lcfjr/codes/ColossalAI/examples/language/opt LOGNAME=lcfjr XDG_SESSION_TYPE=tty CONDA_PREFIX=/home/lcfjr/miniconda3/envs/cs MODULESHOME=/usr/share/modules MANPATH=: BS=16 MOTD_SHOWN=pam RSYNC_PROXY_modshare=7890:1:172.17.0.1:1 HOME=/home/lcfjr LC_PAPER=en_US.UTF-8 LANG=en_US.UTF-8 
LS_COLORS=rs=0:di=01;34:ln=01;36:mh=00:pi=40;33:so=01;35:do=01;35:bd=40;33;01:cd=40;33;01:or=40;31;01:mi=00:su=37;41:sg=30;43:ca=30;41:tw=30;42:ow=34;42:st=37;44:ex=01;32:*.tar=01;31:*.tgz=01;31:*.arc=01;31:*.arj=01;31:*.taz=01;31:*.lha=01;31:*.lz4=01;31:*.lzh=01;31:*.lzma=01;31:*.tlz=01;31:*.txz=01;31:*.tzo=01;31:*.t7z=01;31:*.zip=01;31:*.z=01;31:*.dz=01;31:*.gz=01;31:*.lrz=01;31:*.lz=01;31:*.lzo=01;31:*.xz=01;31:*.zst=01;31:*.tzst=01;31:*.bz2=01;31:*.bz=01;31:*.tbz=01;31:*.tbz2=01;31:*.tz=01;31:*.deb=01;31:*.rpm=01;31:*.jar=01;31:*.war=01;31:*.ear=01;31:*.sar=01;31:*.rar=01;31:*.alz=01;31:*.ace=01;31:*.zoo=01;31:*.cpio=01;31:*.7z=01;31:*.rz=01;31:*.cab=01;31:*.wim=01;31:*.swm=01;31:*.dwm=01;31:*.esd=01;31:*.jpg=01;35:*.jpeg=01;35:*.mjpg=01;35:*.mjpeg=01;35:*.gif=01;35:*.bmp=01;35:*.pbm=01;35:*.pgm=01;35:*.ppm=01;35:*.tga=01;35:*.xbm=01;35:*.xpm=01;35:*.tif=01;35:*.tiff=01;35:*.png=01;35:*.svg=01;35:*.svgz=01;35:*.mng=01;35:*.pcx=01;35:*.mov=01;35:*.mpg=01;35:*.mpeg=01;35:*.m2v=01;35:*.mkv=01;35:*.webm=01;35:*.ogm=01;35:*.mp4=01;35:*.m4v=01;35:*.mp4v=01;35:*.vob=01;35:*.qt=01;35:*.nuv=01;35:*.wmv=01;35:*.asf=01;35:*.rm=01;35:*.rmvb=01;35:*.flc=01;35:*.avi=01;35:*.fli=01;35:*.flv=01;35:*.gl=01;35:*.dl=01;35:*.xcf=01;35:*.xwd=01;35:*.yuv=01;35:*.cgm=01;35:*.emf=01;35:*.ogv=01;35:*.ogx=01;35:*.aac=00;36:*.au=00;36:*.flac=00;36:*.m4a=00;36:*.mid=00;36:*.midi=00;36:*.mka=00;36:*.mp3=00;36:*.mpc=00;36:*.ogg=00;36:*.ra=00;36:*.wav=00;36:*.oga=00;36:*.opus=00;36:*.spx=00;36:*.xspf=00;36: MODEL=1.3b CONDA_PROMPT_MODIFIER=(cs) LC_TERMINAL=iTerm2 https_proxy=http://172.17.0.1:7890 SSH_CONNECTION=124.14.224.115 17177 59.108.228.2 10086 CUDA_VISIBLE_DEVICES=6 MODULEPATH_modshare=/etc/environment-modules/modules:1:/usr/share/modules/$MODULE_VERSION/modulefiles:1:/usr/share/modules/modulefiles:1:/usr/share/modules/versions:1 XDG_SESSION_CLASS=user LOADEDMODULES_modshare=proxy/0.0.1-gcc-9.3.0:1 PYTHONPATH=/home/lcfjr/codes/ColossalAI: LC_IDENTIFICATION=en_US.UTF-8 TERM=xterm-256color ZSH=/home/lcfjr/.oh-my-zsh _CE_CONDA= DATA=/data/scratch/cifar-10 USER=lcfjr CONDA_SHLVL=3 LOADEDMODULES=proxy/0.0.1-gcc-9.3.0 LC_TERMINAL_VERSION=3.4.15 RSYNC_PROXY=172.17.0.1:7890 SHLVL=1 BASH_ENV=/usr/share/modules/init/bash PAGER=less LC_TELEPHONE=en_US.UTF-8 LC_MEASUREMENT=en_US.UTF-8 XDG_SESSION_ID=6572 http_proxy=http://172.17.0.1:7890 CONDA_PYTHON_EXE=/home/lcfjr/miniconda3/bin/python MEMCAP=0 XDG_RUNTIME_DIR=/run/user/1008 SSH_CLIENT=124.14.224.115 17177 10086 CONDA_DEFAULT_ENV=cs LC_TIME=en_US.UTF-8 CUDA_HOME=/opt/lcsoftware/spack/opt/spack/linux-ubuntu20.04-zen2/gcc-9.3.0/cuda-11.3.1-e4ejcraos3skqdcti64yorl6rrk5et47/ all_proxy=socks5://172.17.0.1:7890 PATH=/home/lcfjr/miniconda3/envs/cs/bin:/home/lcfjr/miniconda3/condabin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin MODULEPATH=/opt/lcsoftware/spack/share/spack/modules/linux-ubuntu20.04-zen2 _LMFILES_=/opt/lcsoftware/spack/share/spack/modules/linux-ubuntu20.04-zen2/proxy/0.0.1-gcc-9.3.0 http_proxy_modshare=http:1:7890:1://172.17.0.1:1 DBUS_SESSION_BUS_ADDRESS=unix:path=/run/user/1008/bus SSH_TTY=/dev/pts/19 CONDA_PREFIX_1=/home/lcfjr/miniconda3 CONDA_PREFIX_2=/home/lcfjr/miniconda3/envs/dev LC_NUMERIC=en_US.UTF-8 https_proxy_modshare=http:1:7890:1://172.17.0.1:1 OLDPWD=/home/lcfjr/codes/Titans MODULES_CMD=/usr/lib/x86_64-linux-gnu/modulecmd.tcl BASH_FUNC_switchml%%=() { typeset swfound=1; if [ "${MODULES_USE_COMPAT_VERSION:-0}" = '1' ]; then typeset swname='main'; if [ -e /usr/lib/x86_64-linux-gnu/modulecmd.tcl 
]; then typeset swfound=0; unset MODULES_USE_COMPAT_VERSION; fi; else typeset swname='compatibility'; if [ -e /usr/lib/x86_64-linux-gnu/modulecmd-compat ]; then typeset swfound=0; MODULES_USE_COMPAT_VERSION=1; export MODULES_USE_COMPAT_VERSION; fi; fi; if [ $swfound -eq 0 ]; then echo "Switching to Modules $swname version"; source /usr/share/modules/init/bash; else echo "Cannot switch to Modules $swname version, command not found"; return 1; fi } BASH_FUNC_module%%=() { _module_raw "$@" 2>&1 } BASH_FUNC__module_raw%%=() { unset _mlshdbg; if [ "${MODULES_SILENT_SHELL_DEBUG:-0}" = '1' ]; then case "$-" in *v*x*) set +vx; _mlshdbg='vx' ;; *v*) set +v; _mlshdbg='v' ;; *x*) set +x; _mlshdbg='x' ;; *) _mlshdbg='' ;; esac; fi; unset _mlre _mlIFS; if [ -n "${IFS+x}" ]; then _mlIFS=$IFS; fi; IFS=' '; for _mlv in ${MODULES_RUN_QUARANTINE:-}; do if [ "${_mlv}" = "${_mlv##*[!A-Za-z0-9_]}" -a "${_mlv}" = "${_mlv#[0-9]}" ]; then if [ -n "`eval 'echo ${'$_mlv'+x}'`" ]; then _mlre="${_mlre:-}${_mlv}_modquar='`eval 'echo ${'$_mlv'}'`' "; fi; _mlrv="MODULES_RUNENV_${_mlv}"; _mlre="${_mlre:-}${_mlv}='`eval 'echo ${'$_mlrv':-}'`' "; fi; done; if [ -n "${_mlre:-}" ]; then eval `eval ${_mlre}/usr/bin/tclsh8.6 /usr/lib/x86_64-linux-gnu/modulecmd.tcl bash '"$@"'`; else eval `/usr/bin/tclsh8.6 /usr/lib/x86_64-linux-gnu/modulecmd.tcl bash "$@"`; fi; _mlstatus=$?; if [ -n "${_mlIFS+x}" ]; then IFS=$_mlIFS; else unset IFS; fi; unset _mlre _mlv _mlrv _mlIFS; if [ -n "${_mlshdbg:-}" ]; then set -$_mlshdbg; fi; unset _mlshdbg; return $_mlstatus } _=/home/lcfjr/miniconda3/envs/cs/bin/torchrun +2810184 pts/19 Z 24:41 | \_ [python] +2813011 pts/19 R+ 0:00 \_ ps ef LC_TERMINAL_VERSION=3.4.15 LANG=en_US.UTF-8 LC_TERMINAL=iTerm2 USER=lcfjr LOGNAME=lcfjr HOME=/home/lcfjr PATH=/home/lcfjr/miniconda3/envs/cs/bin:/home/lcfjr/miniconda3/condabin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin SHELL=/usr/bin/zsh TERM=xterm-256color XDG_SESSION_ID=6572 XDG_RUNTIME_DIR=/run/user/1008 DBUS_SESSION_BUS_ADDRESS=unix:path=/run/user/1008/bus XDG_SESSION_TYPE=tty XDG_SESSION_CLASS=user MOTD_SHOWN=pam LC_NUMERIC=en_US.UTF-8 LC_TIME=en_US.UTF-8 LC_MONETARY=en_US.UTF-8 LC_PAPER=en_US.UTF-8 LC_NAME=en_US.UTF-8 LC_ADDRESS=en_US.UTF-8 LC_TELEPHONE=en_US.UTF-8 LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=en_US.UTF-8 SSH_CLIENT=124.14.224.115 17177 10086 SSH_CONNECTION=124.14.224.115 17177 59.108.228.2 10086 SSH_TTY=/dev/pts/19 SHLVL=1 PWD=/home/lcfjr/codes/ColossalAI/examples/language/opt OLDPWD=/home/lcfjr/codes/Titans ZSH=/home/lcfjr/.oh-my-zsh PAGER=less LESS=-R LSCOLORS=Gxfxcxdxbxegedabagacad 
LS_COLORS=rs=0:di=01;34:ln=01;36:mh=00:pi=40;33:so=01;35:do=01;35:bd=40;33;01:cd=40;33;01:or=40;31;01:mi=00:su=37;41:sg=30;43:ca=30;41:tw=30;42:ow=34;42:st=37;44:ex=01;32:*.tar=01;31:*.tgz=01;31:*.arc=01;31:*.arj=01;31:*.taz=01;31:*.lha=01;31:*.lz4=01;31:*.lzh=01;31:*.lzma=01;31:*.tlz=01;31:*.txz=01;31:*.tzo=01;31:*.t7z=01;31:*.zip=01;31:*.z=01;31:*.dz=01;31:*.gz=01;31:*.lrz=01;31:*.lz=01;31:*.lzo=01;31:*.xz=01;31:*.zst=01;31:*.tzst=01;31:*.bz2=01;31:*.bz=01;31:*.tbz=01;31:*.tbz2=01;31:*.tz=01;31:*.deb=01;31:*.rpm=01;31:*.jar=01;31:*.war=01;31:*.ear=01;31:*.sar=01;31:*.rar=01;31:*.alz=01;31:*.ace=01;31:*.zoo=01;31:*.cpio=01;31:*.7z=01;31:*.rz=01;31:*.cab=01;31:*.wim=01;31:*.swm=01;31:*.dwm=01;31:*.esd=01;31:*.jpg=01;35:*.jpeg=01;35:*.mjpg=01;35:*.mjpeg=01;35:*.gif=01;35:*.bmp=01;35:*.pbm=01;35:*.pgm=01;35:*.ppm=01;35:*.tga=01;35:*.xbm=01;35:*.xpm=01;35:*.tif=01;35:*.tiff=01;35:*.png=01;35:*.svg=01;35:*.svgz=01;35:*.mng=01;35:*.pcx=01;35:*.mov=01;35:*.mpg=01;35:*.mpeg=01;35:*.m2v=01;35:*.mkv=01;35:*.webm=01;35:*.ogm=01;35:*.mp4=01;35:*.m4v=01;35:*.mp4v=01;35:*.vob=01;35:*.qt=01;35:*.nuv=01;35:*.wmv=01;35:*.asf=01;35:*.rm=01;35:*.rmvb=01;35:*.flc=01;35:*.avi=01;35:*.fli=01;35:*.flv=01;35:*.gl=01;35:*.dl=01;35:*.xcf=01;35:*.xwd=01;35:*.yuv=01;35:*.cgm=01;35:*.emf=01;35:*.ogv=01;35:*.ogx=01;35:*.aac=00;36:*.au=00;36:*.flac=00;36:*.m4a=00;36:*.mid=00;36:*.midi=00;36:*.mka=00;36:*.mp3=00;36:*.mpc=00;36:*.ogg=00;36:*.ra=00;36:*.wav=00;36:*.oga=00;36:*.opus=00;36:*.spx=00;36:*.xspf=00;36: CONDA_EXE=/home/lcfjr/miniconda3/bin/conda _CE_M= _CE_CONDA= CONDA_PYTHON_EXE=/home/lcfjr/miniconda3/bin/python CONDA_SHLVL=3 CONDA_PREFIX=/home/lcfjr/miniconda3/envs/cs CONDA_DEFAULT_ENV=cs CONDA_PROMPT_MODIFIER=(cs) MODULES_CMD=/usr/lib/x86_64-linux-gnu/modulecmd.tcl ENV=/usr/share/modules/init/profile.sh MODULEPATH_modshare=/etc/environment-modules/modules:1:/usr/share/modules/$MODULE_VERSION/modulefiles:1:/usr/share/modules/modulefiles:1:/usr/share/modules/versions:1 BASH_ENV=/usr/share/modules/init/bash MODULESHOME=/usr/share/modules LOADEDMODULES=proxy/0.0.1-gcc-9.3.0 MODULEPATH=/opt/lcsoftware/spack/share/spack/modules/linux-ubuntu20.04-zen2 FPATH=/usr/share/modules/init/zsh-functions:/home/lcfjr/.oh-my-zsh/plugins/git:/home/lcfjr/.oh-my-zsh/functions:/home/lcfjr/.oh-my-zsh/completions:/home/lcfjr/.oh-my-zsh/cache/completions:/usr/local/share/zsh/site-functions:/usr/share/zsh/vendor-functions:/usr/share/zsh/vendor-completions:/usr/share/zsh/functions/Calendar:/usr/share/zsh/functions/Chpwd:/usr/share/zsh/functions/Completion:/usr/share/zsh/functions/Completion/AIX:/usr/share/zsh/functions/Completion/BSD:/usr/share/zsh/functions/Completion/Base:/usr/share/zsh/functions/Completion/Cygwin:/usr/share/zsh/functions/Completion/Darwin:/usr/share/zsh/functions/Completion/Debian:/usr/share/zsh/functions/Completion/Linux:/usr/share/zsh/functions/Completion/Mandriva:/usr/share/zsh/functions/Completion/Redhat:/usr/share/zsh/functions/Completion/Solaris:/usr/share/zsh/functions/Completion/Unix:/usr/share/zsh/functions/Completion/X:/usr/share/zsh/functions/Completion/Zsh:/usr/share/zsh/functions/Completion/openSUSE:/usr/share/zsh/functions/Exceptions:/usr/share/zsh/functions/MIME:/usr/share/zsh/functions/Math:/usr/share/zsh/functions/Misc:/usr/share/zsh/functions/Newuser:/usr/share/zsh/functions/Prompts:/usr/share/zsh/functions/TCP:/usr/share/zsh/functions/VCS_Info:/usr/share/zsh/functions/VCS_Info/Backends:/usr/share/zsh/functions/Zftp:/usr/share/zsh/functions/Zle MANPATH=: 
CUDA_HOME=/opt/lcsoftware/spack/opt/spack/linux-ubuntu20.04-zen2/gcc-9.3.0/cuda-11.3.1-e4ejcraos3skqdcti64yorl6rrk5et47/ GITTOKEN=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 DATA=/data/scratch/cifar-10 PYTHONPATH=/home/lcfjr/codes/ColossalAI: CONDA_PREFIX_1=/home/lcfjr/miniconda3 RSYNC_PROXY=172.17.0.1:7890 all_proxy=socks5://172.17.0.1:7890 _LMFILES_=/opt/lcsoftware/spack/share/spack/modules/linux-ubuntu20.04-zen2/proxy/0.0.1-gcc-9.3.0 https_proxy_modshare=http:1:7890:1://172.17.0.1:1 http_proxy=http://172.17.0.1:7890 RSYNC_PROXY_modshare=7890:1:172.17.0.1:1 http_proxy_modshare=http:1:7890:1://172.17.0.1:1 https_proxy=http://172.17.0.1:7890 all_proxy_modshare=socks5:1:7890:1://172.17.0.1:1 LOADEDMODULES_modshare=proxy/0.0.1-gcc-9.3.0:1 _LMFILES__modshare=/opt/lcsoftware/spack/share/spack/modules/linux-ubuntu20.04-zen2/proxy/0.0.1-gcc-9.3.0:1 CUDA_VISIBLE_DEVICES=6 CONDA_PREFIX_2=/home/lcfjr/miniconda3/envs/dev _=/usr/bin/ps +2666493 pts/35 Ss+ 0:00 -zsh LC_TERMINAL_VERSION=3.4.15 LANG=en_US.UTF-8 LC_TERMINAL=iTerm2 USER=lcfjr LOGNAME=lcfjr HOME=/home/lcfjr PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin SHELL=/usr/bin/zsh TERM=xterm-256color XDG_SESSION_ID=6555 XDG_RUNTIME_DIR=/run/user/1008 DBUS_SESSION_BUS_ADDRESS=unix:path=/run/user/1008/bus XDG_SESSION_TYPE=tty XDG_SESSION_CLASS=user MOTD_SHOWN=pam LC_NUMERIC=en_US.UTF-8 LC_TIME=en_US.UTF-8 LC_MONETARY=en_US.UTF-8 LC_PAPER=en_US.UTF-8 LC_NAME=en_US.UTF-8 LC_ADDRESS=en_US.UTF-8 LC_TELEPHONE=en_US.UTF-8 LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=en_US.UTF-8 SSH_CLIENT=124.14.224.115 33038 10086 SSH_CONNECTION=124.14.224.115 33038 59.108.228.2 10086 SSH_TTY=/dev/pts/35 +2656881 pts/24 Ss+ 0:01 -zsh LC_TERMINAL_VERSION=3.4.15 LANG=en_US.UTF-8 LC_TERMINAL=iTerm2 USER=lcfjr LOGNAME=lcfjr HOME=/home/lcfjr PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin SHELL=/usr/bin/zsh TERM=xterm-256color XDG_SESSION_ID=6551 XDG_RUNTIME_DIR=/run/user/1008 DBUS_SESSION_BUS_ADDRESS=unix:path=/run/user/1008/bus XDG_SESSION_TYPE=tty XDG_SESSION_CLASS=user MOTD_SHOWN=pam LC_NUMERIC=en_US.UTF-8 LC_TIME=en_US.UTF-8 LC_MONETARY=en_US.UTF-8 LC_PAPER=en_US.UTF-8 LC_NAME=en_US.UTF-8 LC_ADDRESS=en_US.UTF-8 LC_TELEPHONE=en_US.UTF-8 LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=en_US.UTF-8 SSH_CLIENT=124.14.224.115 12979 10086 SSH_CONNECTION=124.14.224.115 12979 59.108.228.2 10086 SSH_TTY=/dev/pts/24 +2673174 pts/36 Ss+ 0:00 /usr/bin/zsh USER=lcfjr SSH_CLIENT=124.14.224.115 24967 10086 LC_TIME=en_US.UTF-8 XDG_SESSION_TYPE=tty SHLVL=1 MOTD_SHOWN=pam HOME=/home/lcfjr OLDPWD=/home/lcfjr LC_MONETARY=en_US.UTF-8 DBUS_SESSION_BUS_ADDRESS=unix:path=/run/user/1008/bus LOGNAME=lcfjr _=/home/lcfjr/.vscode-server/bin/f80445acd5a3dadef24aa209168452a3d97cc326/node XDG_SESSION_CLASS=user XDG_SESSION_ID=6542 PATH=/home/lcfjr/.vscode-server/bin/f80445acd5a3dadef24aa209168452a3d97cc326/bin/remote-cli:/home/lcfjr/miniconda3/bin:/home/lcfjr/miniconda3/condabin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin LC_ADDRESS=en_US.UTF-8 XDG_RUNTIME_DIR=/run/user/1008 LANG=en_US.UTF-8 LC_TELEPHONE=en_US.UTF-8 SHELL=/usr/bin/zsh LC_NAME=en_US.UTF-8 LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=en_US.UTF-8 PWD=/home/lcfjr/codes/RecSysDemo SSH_CONNECTION=124.14.224.115 24967 59.108.228.2 10086 LC_NUMERIC=en_US.UTF-8 LC_PAPER=en_US.UTF-8 ZSH=/home/lcfjr/.oh-my-zsh PAGER=less LESS=-R LSCOLORS=Gxfxcxdxbxegedabagacad 
CONDA_EXE=/home/lcfjr/miniconda3/bin/conda CONDA_PYTHON_EXE=/home/lcfjr/miniconda3/bin/python CONDA_SHLVL=1 CONDA_PREFIX=/home/lcfjr/miniconda3 CONDA_DEFAULT_ENV=base CONDA_PROMPT_MODIFIER=(base) MODULES_CMD=/usr/lib/x86_64-linux-gnu/modulecmd.tcl ENV=/usr/share/modules/init/profile.sh MODULEPATH_modshare=/etc/environment-modules/modules:1:/usr/share/modules/$MODULE_VERSION/modulefiles:1:/usr/share/modules/modulefiles:1:/usr/share/modules/versions:1 BASH_ENV=/usr/share/modules/init/bash MODULESHOME=/usr/share/modules MODULEPATH=/opt/lcsoftware/spack/share/spack/modules/linux-ubuntu20.04-zen2 FPATH=/usr/share/modules/init/zsh-functions:/home/lcfjr/.oh-my-zsh/plugins/git:/home/lcfjr/.oh-my-zsh/functions:/home/lcfjr/.oh-my-zsh/completions:/home/lcfjr/.oh-my-zsh/cache/completions:/usr/local/share/zsh/site-functions:/usr/share/zsh/vendor-functions:/usr/share/zsh/vendor-completions:/usr/share/zsh/functions/Calendar:/usr/share/zsh/functions/Chpwd:/usr/share/zsh/functions/Completion:/usr/share/zsh/functions/Completion/AIX:/usr/share/zsh/functions/Completion/BSD:/usr/share/zsh/functions/Completion/Base:/usr/share/zsh/functions/Completion/Cygwin:/usr/share/zsh/functions/Completion/Darwin:/usr/share/zsh/functions/Completion/Debian:/usr/share/zsh/functions/Completion/Linux:/usr/share/zsh/functions/Completion/Mandriva:/usr/share/zsh/functions/Completion/Redhat:/usr/share/zsh/functions/Completion/Solaris:/usr/share/zsh/functions/Completion/Unix:/usr/share/zsh/functions/Completion/X:/usr/share/zsh/functions/Completion/Zsh:/usr/share/zsh/functions/Completion/openSUSE:/usr/share/zsh/functions/Exceptions:/usr/share/zsh/functions/MIME:/usr/share/zsh/functions/Math:/usr/share/zsh/functions/Misc:/usr/share/zsh/functions/Newuser:/usr/share/zsh/functions/Prompts:/usr/share/zsh/functions/TCP:/usr/share/zsh/functions/VCS_Info:/usr/share/zsh/functions/VCS_Info/Backends:/usr/share/zsh/functions/Zftp:/usr/share/zsh/functions/Zle MANPATH=: CUDA_HOME=/opt/lcsoftware/spack/opt/spack/linux-ubuntu20.04-zen2/gcc-9.3.0/cuda-11.3.1-e4ejcraos3skqdcti64yorl6rrk5et47/ GITTOKEN=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 DATA=/data/scratch/cifar-10 PYTHONPATH=/home/lcfjr/codes/ColossalAI: BROWSER=/home/lcfjr/.vscode-server/bin/f80445acd5a3dadef24aa209168452a3d97cc326/bin/helpers/browser.sh TERM_PROGRAM=vscode TERM_PROGRAM_VERSION=1.64.2 COLORTERM=truecolor VSCODE_GIT_IPC_HANDLE=/run/user/1008/vscode-git-fba67a188a.sock GIT_ASKPASS=/home/lcfjr/.vscode-server/bin/f80445acd5a3dadef24aa209168452a3d97cc326/extensions/git/dist/askpass.sh VSCODE_GIT_ASKPASS_NODE=/home/lcfjr/.vscode-server/bin/f80445acd5a3dadef24aa209168452a3d97cc326/node VSCODE_GIT_ASKPASS_EXTRA_ARGS= VSCODE_GIT_ASKPASS_MAIN=/home/lcfjr/.vscode-server/bin/f80445acd5a3dadef24aa209168452a3d97cc326/extensions/git/dist/askpass-main.js VSCODE_IPC_HOOK_CLI=/run/user/1008/vscode-ipc-0c9910f5-ef18-4234-ba4e-523ff58da4be.sock TERM=xterm-256color + 303953 pts/11 Ss+ 0:00 -zsh BASH_ENV=/usr/share/modules/init/bash CONDA_DEFAULT_ENV=cs CONDA_EXE=/home/lcfjr/miniconda3/bin/conda CONDA_PREFIX=/home/lcfjr/miniconda3/envs/cs CONDA_PREFIX_1=/home/lcfjr/miniconda3 CONDA_PROMPT_MODIFIER=(cs) CONDA_PYTHON_EXE=/home/lcfjr/miniconda3/bin/python CONDA_SHLVL=2 CUDA_HOME=/opt/lcsoftware/spack/opt/spack/linux-ubuntu20.04-zen2/gcc-9.3.0/cuda-11.3.1-e4ejcraos3skqdcti64yorl6rrk5et47/ CUDA_VISIBLE_DEVICES=5 DATA=/data/scratch/cifar-10 DBUS_SESSION_BUS_ADDRESS=unix:path=/run/user/1008/bus ENV=/usr/share/modules/init/profile.sh 
FPATH=/usr/share/modules/init/zsh-functions:/home/lcfjr/.oh-my-zsh/plugins/git:/home/lcfjr/.oh-my-zsh/functions:/home/lcfjr/.oh-my-zsh/completions:/home/lcfjr/.oh-my-zsh/cache/completions:/usr/local/share/zsh/site-functions:/usr/share/zsh/vendor-functions:/usr/share/zsh/vendor-completions:/usr/share/zsh/functions/Calendar:/usr/share/zsh/functions/Chpwd:/usr/share/zsh/functions/Completion:/usr/share/zsh/functions/Completion/AIX:/usr/share/zsh/functions/Completion/BSD:/usr/share/zsh/functions/Completion/Base:/usr/share/zsh/functions/Completion/Cygwin:/usr/share/zsh/functions/Completion/Darwin:/usr/share/zsh/functions/Completion/Debian:/usr/share/zsh/functions/Completion/Linux:/usr/share/zsh/functions/Completion/Mandriva:/usr/share/zsh/functions/Completion/Redhat:/usr/share/zsh/functions/Completion/Solaris:/usr/share/zsh/functions/Completion/Unix:/usr/share/zsh/functions/Completion/X:/usr/share/zsh/functions/Completion/Zsh:/usr/share/zsh/functions/Completion/openSUSE:/usr/share/zsh/functions/Exceptions:/usr/share/zsh/functions/MIME:/usr/share/zsh/functions/Math:/usr/share/zsh/functions/Misc:/usr/share/zsh/functions/Newuser:/usr/share/zsh/functions/Prompts:/usr/share/zsh/functions/TCP:/usr/share/zsh/functions/VCS_Info:/usr/share/zsh/functions/VCS_Info/Backends:/usr/share/zsh/functions/Zftp:/usr/share/zsh/functions/Zle GITTOKEN=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 HOME=/home/lcfjr LANG=en_US.UTF-8 LC_ADDRESS=en_US.UTF-8 LC_IDENTIFICATION=en_US.UTF-8 LC_MEASUREMENT=en_US.UTF-8 LC_MONETARY=en_US.UTF-8 LC_NAME=en_US.UTF-8 LC_NUMERIC=en_US.UTF-8 LC_PAPER=en_US.UTF-8 LC_TELEPHONE=en_US.UTF-8 LC_TERMINAL=iTerm2 LC_TERMINAL_VERSION=3.4.15 LC_TIME=en_US.UTF-8 LESS=-R LOADEDMODULES= LOGNAME=lcfjr LSCOLORS=Gxfxcxdxbxegedabagacad LS_COLORS=rs=0:di=01;34:ln=01;36:mh=00:pi=40;33:so=01;35:do=01;35:bd=40;33;01:cd=40;33;01:or=40;31;01:mi=00:su=37;41:sg=30;43:ca=30;41:tw=30;42:ow=34;42:st=37;44:ex=01;32:*.tar=01;31:*.tgz=01;31:*.arc=01;31:*.arj=01;31:*.taz=01;31:*.lha=01;31:*.lz4=01;31:*.lzh=01;31:*.lzma=01;31:*.tlz=01;31:*.txz=01;31:*.tzo=01;31:*.t7z=01;31:*.zip=01;31:*.z=01;31:*.dz=01;31:*.gz=01;31:*.lrz=01;31:*.lz=01;31:*.lzo=01;31:*.xz=01;31:*.zst=01;31:*.tzst=01;31:*.bz2=01;31:*.bz=01;31:*.tbz=01;31:*.tbz2=01;31:*.tz=01;31:*.deb=01;31:*.rpm=01;31:*.jar=01;31:*.war=01;31:*.ear=01;31:*.sar=01;31:*.rar=01;31:*.alz=01;31:*.ace=01;31:*.zoo=01;31:*.cpio=01;31:*.7z=01;31:*.rz=01;31:*.cab=01;31:*.wim=01;31:*.swm=01;31:*.dwm=01;31:*.esd=01;31:*.jpg=01;35:*.jpeg=01;35:*.mjpg=01;35:*.mjpeg=01;35:*.gif=01;35:*.bmp=01;35:*.pbm=01;35:*.pgm=01;35:*.ppm=01;35:*.tga=01;35:*.xbm=01;35:*.xpm=01;35:*.tif=01;35:*.tiff=01;35:*.png=01;35:*.svg=01;35:*.svgz=01;35:*.mng=01;35:*.pcx=01;35:*.mov=01;35:*.mpg=01;35:*.mpeg=01;35:*.m2v=01;35:*.mkv=01;35:*.webm=01;35:*.ogm=01;35:*.mp4=01;35:*.m4v=01;35:*.mp4v=01;35:*.vob=01;35:*.qt=01;35:*.nuv=01;35:*.wmv=01;35:*.asf=01;35:*.rm=01;35:*.rmvb=01;35:*.flc=01;35:*.avi=01;35:*.fli=01;35:*.flv=01;35:*.gl=01;35:*.dl=01;35:*.xcf=01;35:*.xwd=01;35:*.yuv=01;35:*.cgm=01;35:*.emf=01;35:*.ogv=01;35:*.ogx=01;35:*.aac=00;36:*.au=00;36:*.flac=00;36:*.m4a=00;36:*.mid=00;36:*.midi=00;36:*.mka=00;36:*.mp3=00;36:*.mpc=00;36:*.ogg=00;36:*.ra=00;36:*.wav=00;36:*.oga=00;36:*.opus=00;36:*.spx=00;36:*.xspf=00;36: MANPATH=: MODULEPATH=/opt/lcsoftware/spack/share/spack/modules/linux-ubuntu20.04-zen2 MODULEPATH_modshare=/etc/environment-modules/modules:1:/usr/share/modules/$MODULE_VERSION/modulefiles:1:/usr/share/modules/modulefiles:1:/usr/share/modules/versions:1 MODULESHOME=/usr/share/modules 
MODULES_CMD=/usr/lib/x86_64-linux-gnu/modulecmd.tcl MOTD_SHOWN=pam OLDPWD=/home/lcfjr/codes/shenggui/OPT-Demo/logs PAGER=less PATH=/home/lcfjr/miniconda3/envs/cs/bin:/home/lcfjr/miniconda3/condabin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin PWD=/home/lcfjr/codes/shenggui/OPT-Demo PYTHONPATH=/home/lcfjr/codes/ColossalAI: SHELL=/usr/bin/zsh SHLVL=1 SSH_CLIENT=113.208.117.206 52011 10086 SSH_CONNECTION=113.208.117.206 52011 59.108.228.2 10086 SSH_TTY=/dev/pts/10 TERM=screen TMUX=/tmp//tmux-1008/default,303952,0 TMUX_PANE=%0 USER=lcfjr XDG_RUNTIME_DIR=/run/user/1008 XDG_SESSION_CLASS=user XDG_SESSION_ID=174 XDG_SESSION_TYPE=tty ZSH=/home/lcfjr/.oh-my-zsh _=/usr/bin/tmux _CE_CONDA= _CE_M= diff --git a/examples/language/opt/requirements.txt b/examples/language/opt/requirements.txt new file mode 100644 index 000000000..47bec60d2 --- /dev/null +++ b/examples/language/opt/requirements.txt @@ -0,0 +1,5 @@ +colossalai +torch >= 1.8.1 +datasets >= 1.8.0 +sentencepiece != 0.1.92 +protobuf diff --git a/examples/language/opt/run_clm.py b/examples/language/opt/run_clm.py new file mode 100755 index 000000000..b9283de08 --- /dev/null +++ b/examples/language/opt/run_clm.py @@ -0,0 +1,593 @@ +#!/usr/bin/env python +# coding=utf-8 +# Copyright 2021 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Fine-tuning the library models for causal language modeling (GPT, GPT-2, CTRL, ...) +on a text file or a dataset without using HuggingFace Trainer. + +Here is the full list of checkpoints on the hub that can be fine-tuned by this script: +https://huggingface.co/models?filter=text-generation +""" +# You can also adapt this script on your own causal language modeling task. Pointers for this are left as comments. 
+ +import math +import os +import random +import time +from itertools import chain + +import datasets +import torch +import torch.distributed as dist +from accelerate.utils import set_seed +from datasets import load_dataset +from packaging import version +from titans.utils import barrier_context +from torch.utils.data import DataLoader +from tqdm.auto import tqdm +from utils import colo_memory_cap + +import colossalai +import transformers +from colossalai.context import ParallelMode +from colossalai.core import global_context as gpc +from colossalai.gemini import ChunkManager, GeminiManager +from colossalai.logging import disable_existing_loggers, get_dist_logger +from colossalai.nn.optimizer import HybridAdam +from colossalai.nn.parallel import ZeroDDP +from colossalai.tensor import ProcessGroup +from colossalai.utils import get_current_device, get_dataloader +from colossalai.utils.checkpoint import load_checkpoint, save_checkpoint +from colossalai.utils.model.colo_init_context import ColoInitContext +from colossalai.zero import ZeroOptimizer +from transformers import ( + CONFIG_MAPPING, + MODEL_MAPPING, + AutoConfig, + AutoTokenizer, + GPT2Tokenizer, + OPTForCausalLM, + SchedulerType, + default_data_collator, + get_scheduler, +) +from transformers.utils.versions import require_version + +require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") + +MODEL_CONFIG_CLASSES = list(MODEL_MAPPING.keys()) +MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES) + + +def get_time_stamp(): + torch.cuda.synchronize() + return time.time() + + +def parse_args(): + parser = colossalai.get_default_parser() + parser.add_argument( + "--dataset_name", + type=str, + default=None, + help="The name of the dataset to use (via the datasets library).", + ) + parser.add_argument( + "--dataset_config_name", + type=str, + default=None, + help="The configuration name of the dataset to use (via the datasets library).", + ) + parser.add_argument("--train_file", + type=str, + default=None, + help="A csv or a json file containing the training data.") + parser.add_argument("--validation_file", + type=str, + default=None, + help="A csv or a json file containing the validation data.") + parser.add_argument( + "--validation_split_percentage", + default=5, + help="The percentage of the train set used as validation set in case there's no validation split", + ) + parser.add_argument( + "--model_name_or_path", + type=str, + help="Path to pretrained model or model identifier from huggingface.co/models.", + required=True, + ) + parser.add_argument( + "--config_name", + type=str, + default=None, + help="Pretrained config name or path if not the same as model_name", + ) + parser.add_argument( + "--tokenizer_name", + type=str, + default=None, + help="Pretrained tokenizer name or path if not the same as model_name", + ) + parser.add_argument( + "--use_slow_tokenizer", + action="store_true", + help="If passed, will use a slow tokenizer (not backed by the 🤗 Tokenizers library).", + ) + parser.add_argument( + "--per_device_train_batch_size", + type=int, + default=8, + help="Batch size (per device) for the training dataloader.", + ) + parser.add_argument( + "--per_device_eval_batch_size", + type=int, + default=8, + help="Batch size (per device) for the evaluation dataloader.", + ) + parser.add_argument( + "--learning_rate", + type=float, + default=5e-5, + help="Initial learning rate (after the potential warmup period) to use.", + ) + 
parser.add_argument("--weight_decay", type=float, default=0.0, help="Weight decay to use.") + parser.add_argument("--num_train_epochs", type=int, default=3, help="Total number of training epochs to perform.") + parser.add_argument( + "--max_train_steps", + type=int, + default=None, + help="Total number of training steps to perform. If provided, overrides num_train_epochs.", + ) + parser.add_argument( + "--gradient_accumulation_steps", + type=int, + default=1, + help="Number of updates steps to accumulate before performing a backward/update pass.", + ) + parser.add_argument( + "--lr_scheduler_type", + type=SchedulerType, + default="linear", + help="The scheduler type to use.", + choices=["linear", "cosine", "cosine_with_restarts", "polynomial", "constant", "constant_with_warmup"], + ) + parser.add_argument("--num_warmup_steps", + type=int, + default=0, + help="Number of steps for the warmup in the lr scheduler.") + parser.add_argument("--output_dir", type=str, default=None, help="Where to store the final model.") + parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.") + parser.add_argument( + "--model_type", + type=str, + default=None, + help="Model type to use if training from scratch.", + choices=MODEL_TYPES, + ) + parser.add_argument( + "--block_size", + type=int, + default=None, + help=("Optional input sequence length after tokenization. The training dataset will be truncated in block of" + " this size for training. Default to the model max input length for single sentence inputs (take into" + " account special tokens)."), + ) + parser.add_argument( + "--preprocessing_num_workers", + type=int, + default=None, + help="The number of processes to use for the preprocessing.", + ) + parser.add_argument("--overwrite_cache", + type=bool, + default=False, + help="Overwrite the cached training and evaluation sets") + parser.add_argument("--no_keep_linebreaks", + action="store_true", + help="Do not keep line breaks when using TXT files.") + parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.") + parser.add_argument("--hub_model_id", + type=str, + help="The name of the repository to keep in sync with the local `output_dir`.") + parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.") + parser.add_argument( + "--checkpointing_steps", + type=str, + default=None, + help="Whether the various states should be saved at the end of every n steps, or 'epoch' for each epoch.", + ) + parser.add_argument( + "--resume_from_checkpoint", + type=str, + default=None, + help="If the training should continue from a checkpoint folder.", + ) + parser.add_argument( + "--with_tracking", + action="store_true", + help="Whether to enable experiment trackers for logging.", + ) + parser.add_argument( + "--report_to", + type=str, + default="all", + help=('The integration to report the results and logs to. Supported platforms are `"tensorboard"`,' + ' `"wandb"` and `"comet_ml"`. Use `"all"` (default) to report to all integrations.' 
+ "Only applicable when `--with_tracking` is passed."), + ) + + parser.add_argument("--mem_cap", type=int, default=0, help="use mem cap") + parser.add_argument("--init_in_cpu", action='store_true', default=False, help="init training model in cpu") + args = parser.parse_args() + + # Sanity checks + if args.dataset_name is None and args.train_file is None and args.validation_file is None: + raise ValueError("Need either a dataset name or a training/validation file.") + else: + if args.train_file is not None: + extension = args.train_file.split(".")[-1] + assert extension in ["csv", "json", "txt"], "`train_file` should be a csv, json or txt file." + if args.validation_file is not None: + extension = args.validation_file.split(".")[-1] + assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, json or txt file." + + if args.push_to_hub: + assert args.output_dir is not None, "Need an `output_dir` to create a repo when `--push_to_hub` is passed." + + return args + + +def main(): + args = parse_args() + disable_existing_loggers() + colossalai.launch_from_torch(config=dict()) + logger = get_dist_logger() + is_main_process = gpc.get_local_rank(ParallelMode.DATA) == 0 + + if is_main_process: + datasets.utils.logging.set_verbosity_warning() + transformers.utils.logging.set_verbosity_info() + else: + datasets.utils.logging.set_verbosity_error() + transformers.utils.logging.set_verbosity_error() + + if args.mem_cap > 0: + colo_memory_cap(args.mem_cap) + + # If passed along, set the training seed now. + if args.seed is not None: + set_seed(args.seed) + logger.info(f"Rank {dist.get_rank()}: random seed is set to {args.seed}") + + # Handle the repository creation + with barrier_context(): + if args.output_dir is not None: + os.makedirs(args.output_dir, exist_ok=True) + + # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) + # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ + # (the dataset will be downloaded automatically from the datasets Hub). + # + # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called + # 'text' is found. You can easily tweak this behavior (see below). + # + # In distributed training, the load_dataset function guarantee that only one local process can concurrently + # download the dataset. + logger.info("Start preparing dataset", ranks=[0]) + if args.dataset_name is not None: + # Downloading and loading a dataset from the hub. + raw_datasets = load_dataset(args.dataset_name, args.dataset_config_name) + if "validation" not in raw_datasets.keys(): + raw_datasets["validation"] = load_dataset( + args.dataset_name, + args.dataset_config_name, + split=f"train[:{args.validation_split_percentage}%]", + ) + raw_datasets["train"] = load_dataset( + args.dataset_name, + args.dataset_config_name, + split=f"train[{args.validation_split_percentage}%:]", + ) + else: + data_files = {} + dataset_args = {} + if args.train_file is not None: + data_files["train"] = args.train_file + if args.validation_file is not None: + data_files["validation"] = args.validation_file + extension = args.train_file.split(".")[-1] + if extension == "txt": + extension = "text" + dataset_args["keep_linebreaks"] = not args.no_keep_linebreaks + raw_datasets = load_dataset(extension, data_files=data_files, **dataset_args) + # If no validation data is there, validation_split_percentage will be used to divide the dataset. 
+ if "validation" not in raw_datasets.keys(): + raw_datasets["validation"] = load_dataset( + extension, + data_files=data_files, + split=f"train[:{args.validation_split_percentage}%]", + **dataset_args, + ) + raw_datasets["train"] = load_dataset( + extension, + data_files=data_files, + split=f"train[{args.validation_split_percentage}%:]", + **dataset_args, + ) + logger.info("Dataset is prepared", ranks=[0]) + + # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at + # https://huggingface.co/docs/datasets/loading_datasets.html. + + # Load pretrained model and tokenizer + # + # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently + # download model & vocab. + if args.config_name: + config = AutoConfig.from_pretrained(args.config_name) + elif args.model_name_or_path: + config = AutoConfig.from_pretrained(args.model_name_or_path) + else: + config = CONFIG_MAPPING[args.model_type]() + logger.warning("You are instantiating a new config instance from scratch.") + logger.info("Model config has been created", ranks=[0]) + + if args.model_name_or_path == 'facebook/opt-13b': + tokenizer = GPT2Tokenizer.from_pretrained(args.model_name_or_path) + else: + print(f'load model from {args.model_name_or_path}') + tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path, use_fast=not args.use_slow_tokenizer) + logger.info(f"{tokenizer.__class__.__name__} has been created", ranks=[0]) + + if args.init_in_cpu: + init_dev = torch.device('cpu') + else: + init_dev = get_current_device() + + # build model + if args.model_name_or_path is None or args.model_name_or_path == 'facebook/opt-13b': + # currently, there has a bug in pretrained opt-13b + # we can not import it until huggingface fix it + logger.info("Train a new model from scratch", ranks=[0]) + with ColoInitContext(device=init_dev): + model = OPTForCausalLM(config) + else: + logger.info("Finetune a pre-trained model", ranks=[0]) + with ColoInitContext(device=init_dev): + model = OPTForCausalLM.from_pretrained(args.model_name_or_path, + from_tf=bool(".ckpt" in args.model_name_or_path), + config=config, + local_files_only=False) + + # enable graident checkpointing + model.gradient_checkpointing_enable() + + PLACEMENT_POLICY = 'auto' + cai_version = colossalai.__version__ + logger.info(f'using Colossal-AI version {cai_version}') + if version.parse(cai_version) > version.parse("0.1.10"): + from colossalai.gemini import GeminiManager + from colossalai.gemini.chunk import init_chunk_manager + chunk_manager = init_chunk_manager(model=model, init_device=get_current_device(), search_range_mb=32) + gemini_manager = GeminiManager(PLACEMENT_POLICY, chunk_manager) + model = ZeroDDP(model, gemini_manager, pin_memory=True) + elif version.parse(cai_version) <= version.parse("0.1.10") and version.parse(cai_version) >= version.parse("0.1.9"): + from colossalai.gemini import ChunkManager, GeminiManager + pg = ProcessGroup() + chunk_size = ChunkManager.search_chunk_size(model, 64 * 1024**2, 32) + chunk_manager = ChunkManager(chunk_size, + pg, + enable_distributed_storage=True, + init_device=GeminiManager.get_default_device(PLACEMENT_POLICY)) + + logger.info(f'{model.__class__.__name__} has been created', ranks=[0]) + + # Preprocessing the datasets. + # First we tokenize all the texts. 
+ column_names = raw_datasets["train"].column_names + text_column_name = "text" if "text" in column_names else column_names[0] + + def tokenize_function(examples): + return tokenizer(examples[text_column_name]) + + with barrier_context(executor_rank=0, parallel_mode=ParallelMode.DATA): + tokenized_datasets = raw_datasets.map( + tokenize_function, + batched=True, + num_proc=args.preprocessing_num_workers, + remove_columns=column_names, + load_from_cache_file=not args.overwrite_cache, + desc="Running tokenizer on dataset", + ) + + if args.block_size is None: + block_size = tokenizer.model_max_length + if block_size > 1024: + logger.warning( + f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). " + "Picking 1024 instead. You can change that default value by passing --block_size xxx.") + block_size = 1024 + else: + if args.block_size > tokenizer.model_max_length: + logger.warning(f"The block_size passed ({args.block_size}) is larger than the maximum length for the model" + f"({tokenizer.model_max_length}). Using block_size={tokenizer.model_max_length}.") + block_size = min(args.block_size, tokenizer.model_max_length) + + # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size. + def group_texts(examples): + # Concatenate all texts. + concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()} + total_length = len(concatenated_examples[list(examples.keys())[0]]) + # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can + # customize this part to your needs. + if total_length >= block_size: + total_length = (total_length // block_size) * block_size + # Split by chunks of max_len. + result = { + k: [t[i:i + block_size] for i in range(0, total_length, block_size) + ] for k, t in concatenated_examples.items() + } + result["labels"] = result["input_ids"].copy() + return result + + # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a remainder + # for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value might be slower + # to preprocess. + # + # To speed up this part, we use multiprocessing. See the documentation of the map method for more information: + # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map + + with barrier_context(executor_rank=0, parallel_mode=ParallelMode.DATA): + lm_datasets = tokenized_datasets.map( + group_texts, + batched=True, + num_proc=args.preprocessing_num_workers, + load_from_cache_file=not args.overwrite_cache, + desc=f"Grouping texts in chunks of {block_size}", + ) + + train_dataset = lm_datasets["train"] + eval_dataset = lm_datasets["validation"] + + # Log a few random samples from the training set: + # for index in random.sample(range(len(train_dataset)), 3): + # logger.info(f"Sample {index} of the training set: {train_dataset[index]}.") + + # DataLoaders creation: + train_dataloader = get_dataloader(train_dataset, + shuffle=True, + add_sampler=True, + collate_fn=default_data_collator, + batch_size=args.per_device_train_batch_size) + eval_dataloader = DataLoader(eval_dataset, + collate_fn=default_data_collator, + batch_size=args.per_device_eval_batch_size) + logger.info("Dataloaders have been created", ranks=[0]) + + # Optimizer + # Split weights in two groups, one with weight decay and the other not. 
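+    # Parameters whose names contain an entry of `no_decay` (biases and LayerNorm weights) receive
+    # weight_decay=0.0; all remaining parameters use args.weight_decay.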
+ no_decay = ["bias", "LayerNorm.weight"] + optimizer_grouped_parameters = [ + { + "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], + "weight_decay": args.weight_decay, + }, + { + "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], + "weight_decay": 0.0, + }, + ] + + optimizer = HybridAdam(optimizer_grouped_parameters, lr=args.learning_rate) + optimizer = ZeroOptimizer(optimizer, model, initial_scale=2**14) + + # Scheduler and math around the number of training steps. + overrode_max_train_steps = False + num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) + if args.max_train_steps is None: + args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch + overrode_max_train_steps = True + + lr_scheduler = get_scheduler( + name=args.lr_scheduler_type, + optimizer=optimizer, + num_warmup_steps=args.num_warmup_steps, + num_training_steps=args.max_train_steps, + ) + + # We need to recalculate our total training steps as the size of the training dataloader may have changed. + num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) + if overrode_max_train_steps: + args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch + # Afterwards we recalculate our number of training epochs + args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) + + # Train! + total_batch_size = args.per_device_train_batch_size * gpc.get_world_size(ParallelMode.DATA) + + logger.info("***** Running training *****", ranks=[0]) + logger.info(f" Num examples = {len(train_dataset)}", ranks=[0]) + logger.info(f" Num Epochs = {args.num_train_epochs}", ranks=[0]) + logger.info(f" Instantaneous batch size per device = {args.per_device_train_batch_size}", ranks=[0]) + logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}", ranks=[0]) + logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}", ranks=[0]) + logger.info(f" Total optimization steps = {args.max_train_steps}", ranks=[0]) + + # Only show the progress bar once on each machine. 
+ progress_bar = tqdm(range(args.max_train_steps), disable=not is_main_process) + completed_steps = 0 + starting_epoch = 0 + global_step = 0 + + for epoch in range(starting_epoch, args.num_train_epochs): + + if completed_steps >= args.max_train_steps: + break + + model.train() + for step, batch in enumerate(train_dataloader): + batch = {k: v.cuda() for k, v in batch.items()} + outputs = model(**batch) + loss = outputs['loss'] + optimizer.backward(loss) + + if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1: + optimizer.step() + lr_scheduler.step() + optimizer.zero_grad() + progress_bar.update(1) + completed_steps += 1 + + global_step += 1 + logger.info("Global step {} finished".format(global_step + 1), ranks=[0]) + + if completed_steps >= args.max_train_steps: + break + + model.eval() + losses = [] + for step, batch in enumerate(eval_dataloader): + with torch.no_grad(): + batch = {k: v.cuda() for k, v in batch.items()} + outputs = model(**batch) + + loss = outputs['loss'].unsqueeze(0) + losses.append(loss) + + losses = torch.cat(losses) + losses = losses[:len(eval_dataset)] + try: + eval_loss = torch.mean(losses) + perplexity = math.exp(eval_loss) + except OverflowError: + perplexity = float("inf") + + logger.info(f"Epoch {epoch}: perplexity: {perplexity} eval_loss: {eval_loss}", ranks=[0]) + + if args.output_dir is not None: + model_state = model.state_dict() + if is_main_process: + torch.save(model_state, args.output_dir + '/epoch_{}_model.pth'.format(completed_steps)) + dist.barrier() + # load_state = torch.load(args.output_dir + '/epoch_{}_model.pth'.format(completed_steps)) + # model.load_state_dict(load_state, strict=False) + + logger.info("Training finished", ranks=[0]) + + +if __name__ == "__main__": + main() diff --git a/examples/language/opt/run_clm.sh b/examples/language/opt/run_clm.sh new file mode 100644 index 000000000..858d3325a --- /dev/null +++ b/examples/language/opt/run_clm.sh @@ -0,0 +1,22 @@ +set -x +export BS=${1:-16} +export MEMCAP=${2:-0} +export MODEL=${3:-"125m"} +export GPUNUM=${4:-1} + +# make directory for logs +mkdir -p ./logs + +export MODLE_PATH="facebook/opt-${MODEL}" + +# HF_DATASETS_OFFLINE=1 TRANSFORMERS_OFFLINE=1 +torchrun \ + --nproc_per_node ${GPUNUM} \ + --master_port 19198 \ + run_clm.py \ + --dataset_name wikitext \ + --dataset_config_name wikitext-2-raw-v1 \ + --output_dir $PWD \ + --mem_cap ${MEMCAP} \ + --model_name_or_path ${MODLE_PATH} \ + --per_device_train_batch_size ${BS} 2>&1 | tee ./logs/colo_${MODEL}_bs_${BS}_cap_${MEMCAP}_gpu_${GPUNUM}.log diff --git a/examples/language/opt/utils.py b/examples/language/opt/utils.py new file mode 100644 index 000000000..a7651e5e4 --- /dev/null +++ b/examples/language/opt/utils.py @@ -0,0 +1,28 @@ +import torch +import torch.distributed as dist + + +def memory_cap(size_in_GB): + print(f"use only {size_in_GB} GB of CUDA memory") + assert dist.is_initialized(), "memory_cap must be used after dist init" + local_rank = dist.get_rank() + cuda_capacity = torch.cuda.get_device_properties(local_rank).total_memory + size_in_B = (size_in_GB * 1024**3) + if size_in_B > cuda_capacity: + print(f'memory_cap is uselsess since {cuda_capacity / 1024**3} less than {size_in_GB}') + return + fraction = (size_in_GB * 1024**3) / cuda_capacity + print(f'mem faction is {fraction}') + torch.cuda.set_per_process_memory_fraction(fraction, local_rank) + + +def colo_memory_cap(size_in_GB): + from colossalai.utils import colo_device_memory_capacity, colo_set_process_memory_fraction, get_current_device 
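+    # Express the cap (given in GB) as a fraction of the device's total memory; it is only applied
+    # when the requested cap is smaller than the physical capacity.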
+ cuda_capacity = colo_device_memory_capacity(get_current_device()) + if size_in_GB * (1024**3) < cuda_capacity: + colo_set_process_memory_fraction(size_in_GB * (1024**3) / cuda_capacity) + print("Using {} GB of GPU memory".format(size_in_GB)) + + +if __name__ == '__main__': + memory_cap(40)