diff --git a/.readthedocs.yml b/.readthedocs.yml new file mode 100644 index 0000000..650ee88 --- /dev/null +++ b/.readthedocs.yml @@ -0,0 +1,28 @@ +# .readthedocs.yaml +# Read the Docs configuration file +# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details + +# Required +version: 2 + +# Set the OS, Python version and other tools you might need +build: + os: ubuntu-22.04 + tools: + python: "3.8" + +# Build documentation in the docs/ directory with Sphinx +sphinx: + configuration: doc/code-docs/source/conf.py + fail_on_warning: false + +# Optionally build your docs in additional formats such as PDF +formats: + - pdf + +# Optional but recommended, declare the Python requirements required +# to build your documentation +# See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html +python: + install: + - requirements: doc/code-docs/requirements.txt diff --git a/doc/code-docs/Makefile b/doc/code-docs/Makefile new file mode 100644 index 0000000..d0c3cbf --- /dev/null +++ b/doc/code-docs/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = source +BUILDDIR = build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
+%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/doc/code-docs/make.bat b/doc/code-docs/make.bat new file mode 100644 index 0000000..747ffb7 --- /dev/null +++ b/doc/code-docs/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=source +set BUILDDIR=build + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.https://www.sphinx-doc.org/ + exit /b 1 +) + +if "%1" == "" goto help + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/doc/code-docs/requirements.txt b/doc/code-docs/requirements.txt new file mode 100644 index 0000000..9a4bb3d --- /dev/null +++ b/doc/code-docs/requirements.txt @@ -0,0 +1,6 @@ +Sphinx +sphinx-autobuild +recommonmark +sphinx_rtd_theme +sphinx_markdown_tables +autodoc_pydantic==1.9 \ No newline at end of file diff --git a/doc/code-docs/source/checkpoint.rst b/doc/code-docs/source/checkpoint.rst new file mode 100644 index 0000000..3ceed08 --- /dev/null +++ b/doc/code-docs/source/checkpoint.rst @@ -0,0 +1,2 @@ +Model Checkpointing +=================== \ No newline at end of file diff --git a/doc/code-docs/source/conf.py b/doc/code-docs/source/conf.py new file mode 100644 index 0000000..5986f06 --- /dev/null +++ b/doc/code-docs/source/conf.py @@ -0,0 +1,62 @@ +# Configuration file for the Sphinx documentation builder. 
+# +# For the full list of built-in configuration values, see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Project information ----------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information + +import os +import sys + +project = "InternLM" +copyright = "2023, InternLM Team" +author = "InternLM Team" +release = "v0.2.0" + +# -- General configuration --------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration + +extensions = [ + "recommonmark", + "sphinx_rtd_theme", + "sphinx.ext.viewcode", + "sphinx.ext.autodoc", + "sphinxcontrib.autodoc_pydantic", + "sphinx.ext.autosectionlabel", + "sphinx.ext.napoleon", +] + +pygments_style = "sphinx" + +# autodoc_pydantic config +autodoc_pydantic_model_show_field_summary = False +autodoc_pydantic_field_signature_prefix = " " +autodoc_pydantic_model_signature_prefix = "class" +autodoc_pydantic_model_show_json = False +autodoc_pydantic_model_show_config_summary = False +autodoc_pydantic_model_show_config_member = False +autodoc_pydantic_model_show_validator_summary = False +autodoc_pydantic_model_show_validator_members = False +autodoc_pydantic_model_summary_list_order = "bysource" +autodoc_pydantic_model_member_order = "bysource" +autodoc_pydantic_field_list_validators = False + +templates_path = ["_templates"] + +exclude_patterns = [] + +# -- Options for HTML output ------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output + +html_theme = "sphinx_rtd_theme" +html_static_path = ["_static"] + +sys.path.insert(0, os.path.abspath("../../../")) + +# Prepend module names to class descriptions +add_module_names = True + +autoclass_content = "init" + +autodoc_mock_imports = ["apex", "torch"] diff --git a/doc/code-docs/source/index.rst 
b/doc/code-docs/source/index.rst new file mode 100644 index 0000000..3011df6 --- /dev/null +++ b/doc/code-docs/source/index.rst @@ -0,0 +1,70 @@ +.. InternLM documentation master file, created by + sphinx-quickstart on Mon Aug 28 17:33:28 2023. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +InternLM +======== + +Environment Setup +------------------- + +.. toctree:: + :maxdepth: 2 + + install + +Model Setup +------------------- + +.. toctree:: + :maxdepth: 2 + + initialize + +Training API +------------------- + +.. toctree:: + :maxdepth: 2 + + training + +Parallel Training +------------------- + +.. toctree:: + :maxdepth: 2 + + parallel + +Model Checkpointing +------------------- + +.. toctree:: + :maxdepth: 2 + + checkpoint + +Profiler +------------------- + +.. toctree:: + :maxdepth: 2 + + profiler + +Monitor +------------------- + +.. toctree:: + :maxdepth: 2 + + monitor + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`modindex` +* :ref:`search` diff --git a/doc/code-docs/source/initialize.rst b/doc/code-docs/source/initialize.rst new file mode 100644 index 0000000..a638c33 --- /dev/null +++ b/doc/code-docs/source/initialize.rst @@ -0,0 +1,35 @@ +Training Setup +============== + +.. _InternLM-args: + +Argument Parsing +---------------- +InternLM uses the `argparse <https://docs.python.org/3/library/argparse.html>`_ library to supply command-line +configuration to the InternLM runtime. Use ``internlm.initialize.get_default_parser()`` to get InternLM's default +parser with some built-in arguments; users can add custom parameters to this parser. + +.. code-block:: python + + # Get InternLM default parser + parser = internlm.initialize.get_default_parser() + # Add new argument + parser.add_argument("--user_arg", type=int, default=-1, help="arguments add by user.") + cmd_args = parser.parse_args() + +.. autofunction:: internlm.initialize.get_default_parser + + +.. 
_InternLM-init: + +Model Initialization +------------------------- + +Optimizer Initialization +------------------------- + +Dataloader Initialization +------------------------- + +Trainer Initialization +------------------------- diff --git a/doc/code-docs/source/install.md b/doc/code-docs/source/install.md new file mode 100644 index 0000000..26f57c0 --- /dev/null +++ b/doc/code-docs/source/install.md @@ -0,0 +1,70 @@ +## Installation + +### Environment Preparation +The required packages and their corresponding versions are shown as follows: +- Python == 3.10 +- GCC == 10.2.0 +- MPFR == 4.1.0 +- CUDA >= 11.7 +- PyTorch >= 1.13.1 +- Transformers >= 4.28.0 +- Flash-Attention >= v1.0.5 +- Apex == 23.05 +- GPU with Ampere or Hopper architecture (such as H100, A100) +- Linux OS + +After installing the above dependencies, some system environment variables need to be updated: +```bash +export CUDA_PATH={path_of_cuda_11.7} +export GCC_HOME={path_of_gcc_10.2.0} +export MPFR_HOME={path_of_mpfr_4.1.0} +export LD_LIBRARY_PATH=${GCC_HOME}/lib64:${MPFR_HOME}/lib:${CUDA_PATH}/lib64:$LD_LIBRARY_PATH +export PATH=${GCC_HOME}/bin:${CUDA_PATH}/bin:$PATH +export CC=${GCC_HOME}/bin/gcc +export CXX=${GCC_HOME}/bin/c++ +``` + +### Environment Installation +Clone the project `internlm` and its dependent submodules from the GitHub repository, as follows: +```bash +git clone git@github.com:InternLM/InternLM.git --recurse-submodules +``` + +It is recommended to build a Python-3.10 virtual environment using conda and install the required dependencies based on the `requirements/` files: +```bash +conda create --name internlm-env python=3.10 -y +conda activate internlm-env +cd internlm +pip install -r requirements/torch.txt +pip install -r requirements/runtime.txt +``` + +Install flash-attention (version v1.0.5): +```bash +cd ./third_party/flash-attention +python setup.py install +cd ./csrc +cd fused_dense_lib && pip install -v . +cd ../xentropy && pip install -v . +cd ../rotary && pip install -v . 
+cd ../layer_norm && pip install -v . +cd ../../../../ +``` + +Install Apex (version 23.05): +```bash +cd ./third_party/apex +pip install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./ +cd ../../ +``` + +### Environment Image +Users can obtain an image with the InternLM runtime environment installed from https://hub.docker.com/r/sunpengsdu/internlm. The commands for pulling the image and starting the container are as follows: + +```bash +# pull image +docker pull sunpengsdu/internlm:torch1.13-cuda11.7-flashatten1.0.5-centos +# start container +docker run --gpus all -d -it --shm-size=2gb --name myinternlm sunpengsdu/internlm:torch1.13-cuda11.7-flashatten1.0.5-centos +docker exec -it myinternlm bash +``` diff --git a/doc/code-docs/source/monitor.rst b/doc/code-docs/source/monitor.rst new file mode 100644 index 0000000..ff8cd1b --- /dev/null +++ b/doc/code-docs/source/monitor.rst @@ -0,0 +1,10 @@ +Monitor and Alert +================= + + +Monitoring +----------------- + + +Alerting +----------------- diff --git a/doc/code-docs/source/parallel.rst b/doc/code-docs/source/parallel.rst new file mode 100644 index 0000000..3515847 --- /dev/null +++ b/doc/code-docs/source/parallel.rst @@ -0,0 +1,23 @@ +Parallel Training +================= + +.. Give an overview of how the parallel configuration is used, then describe each module in detail + +Tensor Parallel +----------------- + + +Pipeline Parallel +----------------- + + +Sequence Parallel +----------------- + + +Data Parallel +----------------- + + +ZeRO1.5 +----------------- \ No newline at end of file diff --git a/doc/code-docs/source/profiler.rst b/doc/code-docs/source/profiler.rst new file mode 100644 index 0000000..c10f425 --- /dev/null +++ b/doc/code-docs/source/profiler.rst @@ -0,0 +1,11 @@ +Profiler +======== + +.. 
This page can introduce the usage of the torch profiler and the memory profiler + +Torch Profiler +----------------- + + +Memory Profiler +----------------- \ No newline at end of file diff --git a/doc/code-docs/source/training.rst b/doc/code-docs/source/training.rst new file mode 100644 index 0000000..e9ee124 --- /dev/null +++ b/doc/code-docs/source/training.rst @@ -0,0 +1,2 @@ +Training API +============ \ No newline at end of file