2023-04-06 08:23:39 +00:00
# coding=utf-8
2023-07-21 06:39:01 +00:00
import os
2023-05-18 12:05:59 +00:00
import re
2023-06-15 07:21:26 +00:00
from collections import abc as container_abcs
from collections import defaultdict
from itertools import chain
2023-04-04 07:23:01 +00:00
from pathlib import Path
2023-05-18 12:05:59 +00:00
from typing import Iterator , List , Mapping , Optional , OrderedDict , Tuple
2023-04-04 07:23:01 +00:00
import torch
2023-04-06 08:23:39 +00:00
import torch . nn as nn
2023-10-07 02:45:52 +00:00
from packaging . version import Version
2023-06-15 07:21:26 +00:00
from torch . optim import Optimizer
[Feature] LoRA rebased to main branch (#5622)
* [Inference]ADD Bench Chatglm2 script (#4963)
* add bench chatglm
* fix bug and make utils
---------
Co-authored-by: CjhHa1 <cjh18671720497outlook.com>
* [Pipeline inference] Combine kvcache with pipeline inference (#4938)
* merge kvcache with pipeline inference and refactor the code structure
* support ppsize > 2
* refactor pipeline code
* do pre-commit
* modify benchmark
* fix bench mark
* polish code
* add docstring and update readme
* refactor the code
* fix some logic bug of ppinfer
* polish readme
* fix typo
* skip infer test
* updated c++17 compiler flags (#4983)
* [Inference] Dynamic Batching Inference, online and offline (#4953)
* [inference] Dynamic Batching for Single and Multiple GPUs (#4831)
* finish batch manager
* 1
* first
* fix
* fix dynamic batching
* llama infer
* finish test
* support different lengths generating
* del prints
* del prints
* fix
* fix bug
---------
Co-authored-by: CjhHa1 <cjh18671720497outlook.com>
* [inference] Async dynamic batching (#4894)
* finish input and output logic
* add generate
* test forward
* 1
* [inference]Re push async dynamic batching (#4901)
* adapt to ray server
* finish async
* finish test
* del test
---------
Co-authored-by: yuehuayingxueluo <867460659@qq.com>
* Revert "[inference]Re push async dynamic batching (#4901)" (#4905)
This reverts commit fbf3c09e673794ed18c91d4bab1a7dfea052e95a.
* Revert "[inference] Async dynamic batching (#4894)"
This reverts commit fced14025043e29ce816b315f440601188f7f79f.
* Revert "[inference] Async dynamic batching (#4894)" (#4909)
This reverts commit fced14025043e29ce816b315f440601188f7f79f.
* Add Ray Distributed Environment Init Scripts
* support DynamicBatchManager base function
* revert _set_tokenizer version
* add driver async generate
* add async test
* fix bugs in test_ray_dist.py
* add get_tokenizer.py
* fix code style
* fix bugs about No module named 'pydantic' in ci test
* fix bugs in ci test
* fix bugs in ci test
* fix bugs in ci test
* [infer]Add Ray Distributed Environment Init Scripts (#4911)
* Revert "[inference] Async dynamic batching (#4894)"
This reverts commit fced14025043e29ce816b315f440601188f7f79f.
* Add Ray Distributed Environment Init Scripts
* support DynamicBatchManager base function
* revert _set_tokenizer version
* add driver async generate
* add async test
* fix bugs in test_ray_dist.py
* add get_tokenizer.py
* fix code style
* fix bugs about No module named 'pydantic' in ci test
* fix bugs in ci test
* fix bugs in ci test
* fix bugs in ci test
* support dynamic batch for bloom model and is_running function
* [Inference]Test for new Async engine (#4935)
* infer engine
* infer engine
* test engine
* test engine
* new manager
* change step
* add
* test
* fix
* fix
* finish test
* finish test
* finish test
* finish test
* add license
---------
Co-authored-by: yuehuayingxueluo <867460659@qq.com>
* add assertion for config (#4947)
* [Inference] Finish dynamic batching offline test (#4948)
* test
* fix test
* fix quant
* add default
* fix
* fix some bugs
* fix some bugs
* fix
* fix bug
* fix bugs
* reset param
---------
Co-authored-by: yuehuayingxueluo <867460659@qq.com>
Co-authored-by: Cuiqing Li <lixx3527@gmail.com>
Co-authored-by: CjhHa1 <cjh18671720497outlook.com>
* [Kernels]Updated Triton kernels into 2.1.0 and adding flash-decoding for llama token attention (#4965)
* adding flash-decoding
* clean
* adding kernel
* adding flash-decoding
* add integration
* add
* adding kernel
* adding kernel
* adding triton 2.1.0 features for inference
* update bloom triton kernel
* remove useless vllm kernels
* clean codes
* fix
* adding files
* fix readme
* update llama flash-decoding
---------
Co-authored-by: cuiqing.li <lixx336@gmail.com>
* fix ColossalEval (#4992)
Co-authored-by: Xu Yuanchen <yuanchen.xu00@gmail.com>
* [doc]Update doc for colossal-inference (#4989)
* update doc
* Update README.md
---------
Co-authored-by: cuiqing.li <lixx336@gmail.com>
* [hotfix] Fix the bug where process groups were not being properly released. (#4940)
* Fix the bug where process groups were not being properly released.
* test
* Revert "test"
This reverts commit 479900c1398637310abf92eefa3cd168038ea02f.
* [hotfix] fix the bug of repeatedly storing param group (#4951)
* [doc] add supported feature diagram for hybrid parallel plugin (#4996)
* [Pipeline Inference] Merge pp with tp (#4993)
* refactor pipeline into new CaiInferEngine
* updata llama modeling forward
* merge tp with pp
* update docstring
* optimize test workflow and example
* fix typo
* add assert and todo
* [release] update version (#4995)
* [release] update version
* [hotfix] fix ci
* [moe] merge moe into main (#4978)
* update moe module
* support openmoe
* [hotfix] fix grad accumulation plus clipping for gemini (#5002)
* [hotfix] Add layer norm gradients all-reduce for sequence parallel (#4926)
* [hotfix] Add layer norm gradients all-reduce for sequence parallel. (#4915)
* Add layer norm gradients all-reduce for sequence parallel.
* skip pipeline inference test
* [hotfix] fixing polices of sequence parallel (#4922)
* Add layer norm gradients all-reduce for sequence parallel.
* fix parameter passing when calling get_autopolicy
---------
Co-authored-by: littsk <1214689160@qq.com>
* Hotfix/add grad all reduce for sequence parallel (#4927)
* Add layer norm gradients all-reduce for sequence parallel.
* fix parameter passing when calling get_autopolicy
* fix bug using wrong variables
---------
Co-authored-by: littsk <1214689160@qq.com>
* fix policy initialization
* fix bloom and chatglm policices
* polish code of handling layernorm
* fix moe module
* polish code of class initializing
---------
Co-authored-by: Zhongkai Zhao <kanezz620@gmail.com>
* [format] applied code formatting on changed files in pull request 4926 (#5007)
Co-authored-by: github-actions <github-actions@github.com>
* [Inference] Fix bug in ChatGLM2 Tensor Parallelism (#5014)
* fix bug
* fix
* fix multiquery
* fix multiquery
---------
Co-authored-by: CjhHa1 <cjh18671720497outlook.com>
* [misc] add code owners (#5024)
* [moe] support optimizer checkpoint (#5015)
* Refactor MoE Manager setup method
* unshard optim ckpt
* optim io
* update transformer version
* update requirements
* update ckpt
* update ckpt
* update ckpt
* fix engine
* fix engine
* Support mtbench (#5025)
Co-authored-by: Xu Yuanchen <yuanchen.xu00@gmail.com>
* [moe]: fix ep/tp tests, add hierarchical all2all (#4982)
* fix: add warning for EP different behavior
* fix: use shard_data in ep & tp model
* to: add used_capacity
* fix: fix router test
* feat: add create_ep_node_group
* feat: add create_ep_hierarchical_group fn
* feat: add HierarchicalAllToAll
* test: add hierarchical all2all test
* fix: fix test errors
* fix: simplify create_ep_hierarchical_group
* fix: add hierarchical_alltoall arg
* fix: fix environ typo
* revert: revert process mesh order
* to: add todo mark
* fix: skip hierarchical_comm if torch < 1.13.1
* [shardformer] Fix serialization error with Tensor Parallel state saving (#5018)
* Fix serialization error with Tensor Parallel state saving
* Refactor state_dict CPU transfer using tree_map
* [gemini] gemini support tensor parallelism. (#4942)
* [colossalai]fix typo
* [inference] Add smmoothquant for llama (#4904)
* [inference] add int8 rotary embedding kernel for smoothquant (#4843)
* [inference] add smoothquant llama attention (#4850)
* add smoothquant llama attention
* remove uselss code
* remove useless code
* fix import error
* rename file name
* [inference] add silu linear fusion for smoothquant llama mlp (#4853)
* add silu linear
* update skip condition
* catch smoothquant cuda lib exception
* prcocess exception for tests
* [inference] add llama mlp for smoothquant (#4854)
* add llama mlp for smoothquant
* fix down out scale
* remove duplicate lines
* add llama mlp check
* delete useless code
* [inference] add smoothquant llama (#4861)
* add smoothquant llama
* fix attention accuracy
* fix accuracy
* add kv cache and save pretrained
* refactor example
* delete smooth
* refactor code
* [inference] add smooth function and delete useless code for smoothquant (#4895)
* add smooth function and delete useless code
* update datasets
* remove duplicate import
* delete useless file
* refactor codes (#4902)
* rafactor code
* add license
* add torch-int and smoothquant license
* Update flash_attention_patch.py
To be compatible with the new change in the Transformers library, where a new argument 'padding_mask' was added to forward function of attention layer.
https://github.com/huggingface/transformers/pull/25598
* [kernel] support pure fp16 for cpu adam and update gemini optim tests (#4921)
* [kernel] support pure fp16 for cpu adam (#4896)
* [kernel] fix cpu adam kernel for pure fp16 and update tests (#4919)
* [kernel] fix cpu adam
* [test] update gemini optim test
* [format] applied code formatting on changed files in pull request 4908 (#4918)
Co-authored-by: github-actions <github-actions@github.com>
* [gemini] support gradient accumulation (#4869)
* add test
* fix no_sync bug in low level zero plugin
* fix test
* add argument for grad accum
* add grad accum in backward hook for gemini
* finish implementation, rewrite tests
* fix test
* skip stuck model in low level zero test
* update doc
* optimize communication & fix gradient checkpoint
* modify doc
* cleaning codes
* update cpu adam fp16 case
* [hotfix] fix torch 2.0 compatibility (#4936)
* [hotfix] fix launch
* [test] fix test gemini optim
* [shardformer] fix vit
* [test] add no master test for low level zero plugin (#4934)
* [format] applied code formatting on changed files in pull request 4820 (#4886)
Co-authored-by: github-actions <github-actions@github.com>
* [nfc] fix some typo with colossalai/ docs/ etc. (#4920)
* [Refactor] Integrated some lightllm kernels into token-attention (#4946)
* add some req for inference
* clean codes
* add codes
* add some lightllm deps
* clean codes
* hello
* delete rms files
* add some comments
* add comments
* add doc
* add lightllm deps
* add lightllm cahtglm2 kernels
* add lightllm cahtglm2 kernels
* replace rotary embedding with lightllm kernel
* add some commnets
* add some comments
* add some comments
* add
* replace fwd kernel att1
* fix a arg
* add
* add
* fix token attention
* add some comments
* clean codes
* modify comments
* fix readme
* fix bug
* fix bug
---------
Co-authored-by: cuiqing.li <lixx336@gmail.com>
Co-authored-by: CjhHa1 <cjh18671720497@outlook.com>
* [test] merge old components to test to model zoo (#4945)
* [test] add custom models in model zoo
* [test] update legacy test
* [test] update model zoo
* [test] update gemini test
* [test] remove components to test
* [inference] add reference and fix some bugs (#4937)
* add reference and fix some bugs
* update gptq init
---------
Co-authored-by: Xu Kai <xukai16@foxamil.com>
* [Inference]ADD Bench Chatglm2 script (#4963)
* add bench chatglm
* fix bug and make utils
---------
Co-authored-by: CjhHa1 <cjh18671720497outlook.com>
* [Pipeline inference] Combine kvcache with pipeline inference (#4938)
* merge kvcache with pipeline inference and refactor the code structure
* support ppsize > 2
* refactor pipeline code
* do pre-commit
* modify benchmark
* fix bench mark
* polish code
* add docstring and update readme
* refactor the code
* fix some logic bug of ppinfer
* polish readme
* fix typo
* skip infer test
* updated c++17 compiler flags (#4983)
* [Inference] Dynamic Batching Inference, online and offline (#4953)
* [inference] Dynamic Batching for Single and Multiple GPUs (#4831)
* finish batch manager
* 1
* first
* fix
* fix dynamic batching
* llama infer
* finish test
* support different lengths generating
* del prints
* del prints
* fix
* fix bug
---------
Co-authored-by: CjhHa1 <cjh18671720497outlook.com>
* [inference] Async dynamic batching (#4894)
* finish input and output logic
* add generate
* test forward
* 1
* [inference]Re push async dynamic batching (#4901)
* adapt to ray server
* finish async
* finish test
* del test
---------
Co-authored-by: yuehuayingxueluo <867460659@qq.com>
* Revert "[inference]Re push async dynamic batching (#4901)" (#4905)
This reverts commit fbf3c09e673794ed18c91d4bab1a7dfea052e95a.
* Revert "[inference] Async dynamic batching (#4894)"
This reverts commit fced14025043e29ce816b315f440601188f7f79f.
* Revert "[inference] Async dynamic batching (#4894)" (#4909)
This reverts commit fced14025043e29ce816b315f440601188f7f79f.
* Add Ray Distributed Environment Init Scripts
* support DynamicBatchManager base function
* revert _set_tokenizer version
* add driver async generate
* add async test
* fix bugs in test_ray_dist.py
* add get_tokenizer.py
* fix code style
* fix bugs about No module named 'pydantic' in ci test
* fix bugs in ci test
* fix bugs in ci test
* fix bugs in ci test
* [infer]Add Ray Distributed Environment Init Scripts (#4911)
* Revert "[inference] Async dynamic batching (#4894)"
This reverts commit fced14025043e29ce816b315f440601188f7f79f.
* Add Ray Distributed Environment Init Scripts
* support DynamicBatchManager base function
* revert _set_tokenizer version
* add driver async generate
* add async test
* fix bugs in test_ray_dist.py
* add get_tokenizer.py
* fix code style
* fix bugs about No module named 'pydantic' in ci test
* fix bugs in ci test
* fix bugs in ci test
* fix bugs in ci test
* support dynamic batch for bloom model and is_running function
* [Inference]Test for new Async engine (#4935)
* infer engine
* infer engine
* test engine
* test engine
* new manager
* change step
* add
* test
* fix
* fix
* finish test
* finish test
* finish test
* finish test
* add license
---------
Co-authored-by: yuehuayingxueluo <867460659@qq.com>
* add assertion for config (#4947)
* [Inference] Finish dynamic batching offline test (#4948)
* test
* fix test
* fix quant
* add default
* fix
* fix some bugs
* fix some bugs
* fix
* fix bug
* fix bugs
* reset param
---------
Co-authored-by: yuehuayingxueluo <867460659@qq.com>
Co-authored-by: Cuiqing Li <lixx3527@gmail.com>
Co-authored-by: CjhHa1 <cjh18671720497outlook.com>
* [Kernels]Updated Triton kernels into 2.1.0 and adding flash-decoding for llama token attention (#4965)
* adding flash-decoding
* clean
* adding kernel
* adding flash-decoding
* add integration
* add
* adding kernel
* adding kernel
* adding triton 2.1.0 features for inference
* update bloom triton kernel
* remove useless vllm kernels
* clean codes
* fix
* adding files
* fix readme
* update llama flash-decoding
---------
Co-authored-by: cuiqing.li <lixx336@gmail.com>
* fix ColossalEval (#4992)
Co-authored-by: Xu Yuanchen <yuanchen.xu00@gmail.com>
* [doc]Update doc for colossal-inference (#4989)
* update doc
* Update README.md
---------
Co-authored-by: cuiqing.li <lixx336@gmail.com>
* [hotfix] Fix the bug where process groups were not being properly released. (#4940)
* Fix the bug where process groups were not being properly released.
* test
* Revert "test"
This reverts commit 479900c1398637310abf92eefa3cd168038ea02f.
* [hotfix] fix the bug of repeatedly storing param group (#4951)
* [doc] add supported feature diagram for hybrid parallel plugin (#4996)
* [Pipeline Inference] Merge pp with tp (#4993)
* refactor pipeline into new CaiInferEngine
* updata llama modeling forward
* merge tp with pp
* update docstring
* optimize test workflow and example
* fix typo
* add assert and todo
* [release] update version (#4995)
* [release] update version
* [hotfix] fix ci
* [gemini] gemini support tp
[gemini] gemini support tp
[gemini] gemini support tp
[gemini] gemini support tp
[gemini] gemini support tp
* fix
fix
fix
* update checkpointIO
update checkpointIO
update checkpointIO
update checkpointIO
update checkpointIO
update checkpointIO
update checkpointIO
update checkpointIO
update checkpointIO
* support fused layernorm
support fused layernorm
support fused layernorm
* update fusedlayernorm
update fusedlayernorm
update fusedlayernorm
* add sequence parallel to gemini
add sequence parallel to gemini
* fix
* fix comments
fix comments
fix comments
* fix
* fix t5
* clear cache
* fix
* activate ci
* activate ci
* fix
* fix
* fix
* fix
* revert
* modify tp gather method
modify tp gather method
modify tp gather method
modify tp gather method
* fix test
---------
Co-authored-by: Xu Kai <xukai16@foxmail.com>
Co-authored-by: Zian(Andy) Zheng <62330719+Orion-Zheng@users.noreply.github.com>
Co-authored-by: Hongxin Liu <lhx0217@gmail.com>
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: github-actions <github-actions@github.com>
Co-authored-by: Baizhou Zhang <eddiezhang@pku.edu.cn>
Co-authored-by: Zhongkai Zhao <kanezz620@gmail.com>
Co-authored-by: digger yu <digger-yu@outlook.com>
Co-authored-by: Cuiqing Li <lixx3527@gmail.com>
Co-authored-by: cuiqing.li <lixx336@gmail.com>
Co-authored-by: CjhHa1 <cjh18671720497@outlook.com>
Co-authored-by: Xu Kai <xukai16@foxamil.com>
Co-authored-by: Jianghai <72591262+CjhHa1@users.noreply.github.com>
Co-authored-by: Bin Jia <45593998+FoolPlayer@users.noreply.github.com>
Co-authored-by: アマデウス <kurisusnowdeng@users.noreply.github.com>
Co-authored-by: yuehuayingxueluo <867460659@qq.com>
Co-authored-by: Yuanchen <70520919+chengeharrison@users.noreply.github.com>
Co-authored-by: Xu Yuanchen <yuanchen.xu00@gmail.com>
Co-authored-by: littsk <1214689160@qq.com>
Co-authored-by: ppt0011 <143150326+ppt0011@users.noreply.github.com>
* [hotfix] Suport extra_kwargs in ShardConfig (#5031)
* [refactor]: replace inference args with extra_kwargs in ShardConfig
* modify shardconfig
* polish code
* fix policy bug in llama
* fix bug in auto policy
* remove setattr in ShardConfig
* fix wrong EOS token in ColossalChat
* [Kernels]Update triton kernels into 2.1.0 (#5046)
* update flash-context-attention
* adding kernels
* fix
* reset
* add build script
* add building process
* add llama2 exmaple
* add colossal-llama2 test
* clean
* fall back test setting
* fix test file
* clean
* clean
* clean
---------
Co-authored-by: cuiqing.li <lixx336@gmail.com>
* [pipeline,shardformer] Fix p2p efficiency in pipeline, allow skipping loading weight not in weight_map when `strict=False`, fix llama flash attention forward, add flop estimation by megatron in llama benchmark (#5017)
* Use p2p
* Cannot bidirectonal send p2p
* Refactor tensor creation and serialization in P2P
communication
* Fix llama forward args in flash attention
* Add flop estimate from megatron
* Support loading weight not in weight_map when strict=False in hybrid_parallel
* Use send_forward_recv_backward, etc in 1f1b
* Use dataclass for metdata
Remove torch.cuda.synchronize() as suggested
* Add comment about the torch.cuda.synchronize for potential error
* Typo
* Update hybrid_parallel_checkpoint_io.py
* Update p2p.py
* Update one_f_one_b.py
* Update p2p.py
---------
Co-authored-by: flybird11111 <1829166702@qq.com>
* [gemini] gemini support extra-dp (#5043)
* support ddp
* fix
* fix
* fix
fix
* support ddp
* fix
* fix
* fix
fix
* simplify tests
* fix
* fix
* fix
fix
fix
* fix
* [shardformer] fix llama error when transformers upgraded. (#5055)
* fix-llama
* Update llama.py
* [hotfix]: modify create_ep_hierarchical_group and add test (#5032)
* feat: modify create_ep_hierarchical_group args
* test: add ep tests
* fix: remove get_process_group_ranks
* fix: fix src_rank
* [exampe] fix llama example' loss error when using gemini plugin (#5060)
fix llama example
* [inference] Refactor inference architecture (#5057)
* [inference] support only TP (#4998)
* support only tp
* enable tp
* add support for bloom (#5008)
* [refactor] refactor gptq and smoothquant llama (#5012)
* refactor gptq and smoothquant llama
* fix import error
* fix linear import torch-int
* fix smoothquant llama import error
* fix import accelerate error
* fix bug
* fix import smooth cuda
* fix smoothcuda
* [Inference Refactor] Merge chatglm2 with pp and tp (#5023)
merge chatglm with pp and tp
* [Refactor] remove useless inference code (#5022)
* remove useless code
* fix quant model
* fix test import bug
* mv original inference legacy
* fix chatglm2
* [Refactor] refactor policy search and quant type controlling in inference (#5035)
* [Refactor] refactor policy search and quant type controling in inference
* [inference] update readme (#5051)
* update readme
* update readme
* fix architecture
* fix table
* fix table
* [inference] udpate example (#5053)
* udpate example
* fix run.sh
* fix rebase bug
* fix some errors
* update readme
* add some features
* update interface
* update readme
* update benchmark
* add requirements-infer
---------
Co-authored-by: Bin Jia <45593998+FoolPlayer@users.noreply.github.com>
Co-authored-by: Zhongkai Zhao <kanezz620@gmail.com>
* [Kernels]added flash-decoidng of triton (#5063)
* added flash-decoidng of triton based on lightllm kernel
* add req
* clean
* clean
* delete build.sh
---------
Co-authored-by: cuiqing.li <lixx336@gmail.com>
* [misc] remove outdated submodule (#5070)
* [npu] add npu support for gemini and zero (#5067)
* [npu] setup device utils (#5047)
* [npu] add npu device support
* [npu] support low level zero
* [test] update npu zero plugin test
* [hotfix] fix import
* [test] recover tests
* [npu] gemini support npu (#5052)
* [npu] refactor device utils
* [gemini] support npu
* [example] llama2+gemini support npu
* [kernel] add arm cpu adam kernel (#5065)
* [kernel] add arm cpu adam
* [optim] update adam optimizer
* [kernel] arm cpu adam remove bf16 support
* [hotfix/hybridengine] fix bug when tp*pp size = 1 (#5069)
* [inference] update examples and engine (#5073)
* update examples and engine
* fix choices
* update example
* [format] applied code formatting on changed files in pull request 5067 (#5072)
Co-authored-by: github-actions <github-actions@github.com>
* [hotfix/hybridengine] Fix init model with random parameters in benchmark (#5074)
* fix init model with random parameters
* fix example
* [inference] refactor examples and fix schedule (#5077)
* [setup] refactor infer setup
* [hotfix] fix infenrece behavior on 1 1 gpu
* [exmaple] refactor inference examples
* fix thrust-transform-reduce error (#5078)
* [nfc] fix typo in docs/ (#4972)
* [nfc] fix typo and author name (#5089)
* [gemini]fix gemini optimzer, saving Shardformer in Gemini got list assignment index out of range (#5085)
* [Hotfix] Fix model policy matching strategy in ShardFormer (#5064)
* hotfix/Fix get model policy strategy in ShardFormer
* fix bug in auto policy
* [shardformer]fix flash attention, when mask is casual, just don't unpad it (#5084)
* fix flash attn
* fix
fix
* [npu] add npu support for hybrid plugin and llama (#5090)
* llama 3d
* update
* fix autocast
* [Feature] Add document retrieval QA (#5020)
* add langchain
* add langchain
* Add files via upload
* add langchain
* fix style
* fix style: remove extra space
* add pytest; modified retriever
* add pytest; modified retriever
* add tests to build_on_pr.yml
* fix build_on_pr.yml
* fix build on pr; fix environ vars
* seperate unit tests for colossalqa from build from pr
* fix container setting; fix environ vars
* commented dev code
* add incremental update
* remove stale code
* fix style
* change to sha3 224
* fix retriever; fix style; add unit test for document loader
* fix ci workflow config
* fix ci workflow config
* add set cuda visible device script in ci
* fix doc string
* fix style; update readme; refactored
* add force log info
* change build on pr, ignore colossalqa
* fix docstring, captitalize all initial letters
* fix indexing; fix text-splitter
* remove debug code, update reference
* reset previous commit
* update LICENSE update README add key-value mode, fix bugs
* add files back
* revert force push
* remove junk file
* add test files
* fix retriever bug, add intent classification
* change conversation chain design
* rewrite prompt and conversation chain
* add ui v1
* ui v1
* fix atavar
* add header
* Refactor the RAG Code and support Pangu
* Refactor the ColossalQA chain to Object-Oriented Programming and the UI demo.
* resolved conversation. tested scripts under examples. web demo still buggy
* fix ci tests
* Some modifications to add ChatGPT api
* modify llm.py and remove unnecessary files
* Delete applications/ColossalQA/examples/ui/test_frontend_input.json
* Remove OpenAI api key
* add colossalqa
* move files
* move files
* move files
* move files
* fix style
* Add Readme and fix some bugs.
* Add something to readme and modify some code
* modify a directory name for clarity
* remove redundant directory
* Correct a type in llm.py
* fix AI prefix
* fix test_memory.py
* fix conversation
* fix some erros and typos
* Fix a missing import in RAG_ChatBot.py
* add colossalcloud LLM wrapper, correct issues in code review
---------
Co-authored-by: YeAnbang <anbangy2@outlook.com>
Co-authored-by: Orion-Zheng <zheng_zian@u.nus.edu>
Co-authored-by: Zian(Andy) Zheng <62330719+Orion-Zheng@users.noreply.github.com>
Co-authored-by: Orion-Zheng <zhengzian@u.nus.edu>
* remove duplicate import (#5100)
* fix typo change lazy_iniy to lazy_init (#5099)
* [nfc] fix typo change directoty to directory (#5111)
* [FEATURE] Add Safety Eval Datasets to ColossalEval (#5095)
* add safetybench and cvalues(responsibility) eval dataset
* Modify code according to review suggestions
---------
Co-authored-by: Orion-Zheng <zhengzian@u.nus.edu>
* [hotfix] fixed memory usage of shardformer module replacement (#5122)
* [shardformer]: support gpt-j, falcon, Mistral and add interleaved pipeline for bert (#5088)
* [shardformer] implement policy for all GPT-J models and test
* [shardformer] support interleaved pipeline parallel for bert finetune
* [shardformer] shardformer support falcon (#4883)
* [shardformer]: fix interleaved pipeline for bert model (#5048)
* [hotfix]: disable seq parallel for gptj and falcon, and polish code (#5093)
* Add Mistral support for Shardformer (#5103)
* [shardformer] add tests to mistral (#5105)
---------
Co-authored-by: Pengtai Xu <henryxu880@gmail.com>
Co-authored-by: ppt0011 <143150326+ppt0011@users.noreply.github.com>
Co-authored-by: flybird11111 <1829166702@qq.com>
Co-authored-by: eric8607242 <e0928021388@gmail.com>
* [doc] add moe news (#5128)
* [doc] add moe news
* [doc] add moe news
* [doc] add moe news
* [doc] updated paper citation (#5131)
* fix typo change JOSNL TO JSONL etc. (#5116)
* [format] applied code formatting on changed files in pull request 5088 (#5127)
Co-authored-by: github-actions <github-actions@github.com>
* [format] applied code formatting on changed files in pull request 5124 (#5125)
Co-authored-by: github-actions <github-actions@github.com>
* [format] applied code formatting on changed files in pull request 5115 (#5118)
Co-authored-by: github-actions <github-actions@github.com>
* [accelerator] init the accelerator module (#5129)
* [accelerator] init the accelerator module
* polish code
* polish code
* polish code
* polish code
* [npu] support triangle attention for llama (#5130)
* update fused attn
* update spda
* tri attn
* update triangle
* import
* fix
* fix
* [plugin]fix 3d checkpoint load when booster boost without optimizer. (#5135)
* fix 3d checkpoint load when booster boost without optimizer
fix 3d checkpoint load when booster boost without optimizer
* test ci
* revert ci
* fix
fix
* [ColossalQA] refactor server and webui & add new feature (#5138)
* refactor server and webui & add new feature
* add requirements
* modify readme and ui
* [doc] fix colossalqa document (#5146)
* fix doc
* modify doc
* fix (#5158)
fix
* [Colossal-Llama-2] Add finetuning Colossal-Llama-2 example (#4878)
* Add finetuning Colossal-Llama-2 example
* Add finetuning Colossal-Llama-2 example 2
* Add finetuning Colossal-Llama-2 example and support NEFTuning
* Add inference example and refine neftune
* Modify readme file
* update the imports
---------
Co-authored-by: Xu Yuanchen <yuanchen.xu00@gmail.com>
Co-authored-by: Camille Zhong <44392324+Camille7777@users.noreply.github.com>
* [gemini] hotfix NaN loss while using Gemini + tensor_parallel (#5150)
* fix
aaa
fix
fix
fix
* fix
* fix
* test ci
* fix ci
fix
* [colossalqa] fix pangu api (#5170)
* fix pangu api
* add comment
* [ColossalEval] Support GSM, Data Leakage Evaluation and Tensor Parallel (#5169)
* Support GSM, Data Leakage Evaluation and Tensor Parallel
* remove redundant code and update inference.py in examples/gpt_evaluation
---------
Co-authored-by: Xu Yuanchen <yuanchen.xu00@gmail.com>
* [shardformer] llama support DistCrossEntropy (#5176)
* fix
aaa
fix
fix
fix
* fix
* fix
* test ci
* fix ci
fix
* llama support dist-cross
fix
fix
fix
fix
fix
fix
fix
fix
* fix
* fix
* fix
fix
* test ci
* test ci
* fix
* [Colossal-Llama-2] Add finetuning Colossal-Llama-2 example (#4878)
* Add finetuning Colossal-Llama-2 example
* Add finetuning Colossal-Llama-2 example 2
* Add finetuning Colossal-Llama-2 example and support NEFTuning
* Add inference example and refine neftune
* Modify readme file
* update the imports
---------
Co-authored-by: Xu Yuanchen <yuanchen.xu00@gmail.com>
Co-authored-by: Camille Zhong <44392324+Camille7777@users.noreply.github.com>
* llama support dist-cross
fix
fix
fix
fix
fix
fix
fix
fix
* fix
* fix
* fix
fix
* test ci
* test ci
* fix
* fix ci
* fix ci
---------
Co-authored-by: Yuanchen <70520919+chengeharrison@users.noreply.github.com>
Co-authored-by: Xu Yuanchen <yuanchen.xu00@gmail.com>
Co-authored-by: Camille Zhong <44392324+Camille7777@users.noreply.github.com>
* Fix ColossalEval (#5186)
Co-authored-by: Xu Yuanchen <yuanchen.xu00@gmail.com>
* [doc] update pytorch version in documents. (#5177)
* fix
aaa
fix
fix
fix
* fix
* fix
* test ci
* fix ci
fix
* update pytorch version in documents
* polish readme in application/chat (#5194)
* [pipeline]: fix p2p comm, add metadata cache and support llama interleaved pp (#5134)
* test: add more p2p tests
* fix: remove send_forward_recv_forward as p2p op list need to use the same group
* fix: make send and receive atomic
* feat: update P2PComm fn
* feat: add metadata cache in 1f1b
* feat: add metadata cache in interleaved pp
* feat: modify is_xx_stage fn
* revert: add _broadcast_object_list
* feat: add interleaved pp in llama policy
* feat: set NCCL_BUFFSIZE in HybridParallelPlugin
* Improve logic for selecting metrics (#5196)
Co-authored-by: Xu <yuanchen.xu00@gmail.com>
* [doc] Update required third-party library list for testing and torch comptibility checking (#5207)
* doc/update requirements-test.txt
* update torch-cuda compatibility check
* support linear accumulation fusion (#5199)
support linear accumulation fusion
support linear accumulation fusion
fix
* [pipeline]: support arbitrary batch size in forward_only mode (#5201)
* fix: remove drop last in val & test dataloader
* feat: add run_forward_only, support arbitrary bs
* chore: modify ci script
* [pipeline]: add p2p fallback order and fix interleaved pp deadlock (#5214)
* fix: add fallback order option and update 1f1b
* fix: fix deadlock comm in interleaved pp
* test: modify p2p test
* [devops] update torch versoin in ci (#5217)
* fix-test (#5210)
fix-test
fix-test
* fix flash attn (#5209)
* [nfc] fix typo colossalai/shardformer/ (#5133)
* [Colossal-LLaMA-2] Release Colossal-LLaMA-2-13b-base model (#5224)
* update readme
* update readme
* update link
* update
* update readme
* update
* update
* update
* update title
* update example
* update example
* fix content
* add conclusion
* add license
* update
* update
* update version
* fix minor
* [doc] Update README.md of Colossal-LLAMA2 (#5233)
* Update README.md
* Update README.md
* [doc] Make leaderboard format more uniform and good-looking (#5231)
* Make leaderboard format more unifeid and good-looking
* Update README.md
* Update README.md
* [doc] add Colossal-LLaMA-2-13B (#5234)
* [doc] add Colossal-LLaMA-2-13B
* [doc] add Colossal-LLaMA-2-13B
* [doc] add Colossal-LLaMA-2-13B
* [format] applied code formatting on changed files in pull request 5234 (#5235)
Co-authored-by: github-actions <github-actions@github.com>
* [doc] SwiftInfer release (#5236)
* [doc] SwiftInfer release
* [doc] SwiftInfer release
* [doc] SwiftInfer release
* [doc] SwiftInfer release
* [doc] SwiftInfer release
* [npu] use extension for op builder (#5172)
* update extension
* update cpu adam
* update is
* add doc for cpu adam
* update kernel
* update commit
* update flash
* update memory efficient
* update flash attn
* update flash attention loader
* update api
* fix
* update doc
* update example time limit
* reverse change
* fix doc
* remove useless kernel
* fix
* not use warning
* update
* update
* [pipeline] A more general _communicate in p2p (#5062)
* A more general _communicate
* feat: finish tree_flatten version p2p
* fix: update p2p api calls
---------
Co-authored-by: Wenhao Chen <cwher@outlook.com>
* [npu] change device to accelerator api (#5239)
* update accelerator
* fix timer
* fix amp
* update
* fix
* update bug
* add error raise
* fix autocast
* fix set device
* remove doc accelerator
* update doc
* update doc
* update doc
* use nullcontext
* update cpu
* update null context
* change time limit for example
* udpate
* update
* update
* update
* [npu] polish accelerator code
---------
Co-authored-by: Xuanlei Zhao <xuanlei.zhao@gmail.com>
Co-authored-by: zxl <43881818+oahzxl@users.noreply.github.com>
* [hotfix] removed unused flag (#5242)
* [doc] fix typo in Colossal-LLaMA-2/README.md (#5247)
* [workflow] fixed build CI (#5240)
* [workflow] fixed build CI
* polish
* polish
* polish
* polish
* polish
* [ci] fixed booster test (#5251)
* [ci] fixed booster test
* [ci] fixed booster test
* [ci] fixed booster test
* [ci] fixed ddp test (#5254)
* [ci] fixed ddp test
* polish
* fix typo in applications/ColossalEval/README.md (#5250)
* [ci] fix shardformer tests. (#5255)
* fix ci
fix
* revert: revert p2p
* feat: add enable_metadata_cache option
* revert: enable t5 tests
---------
Co-authored-by: Wenhao Chen <cwher@outlook.com>
* [doc] fix doc typo (#5256)
* [doc] fix annotation display
* [doc] fix llama2 doc
* [hotfix]: add pp sanity check and fix mbs arg (#5268)
* fix: fix misleading mbs arg
* feat: add pp sanity check
* fix: fix 1f1b sanity check
* [workflow] fixed incomplete bash command (#5272)
* [workflow] fixed oom tests (#5275)
* [workflow] fixed oom tests
* polish
* polish
* polish
* [ci] fix test_hybrid_parallel_plugin_checkpoint_io.py (#5276)
* fix ci
fix
* fix test
* revert: revert p2p
* feat: add enable_metadata_cache option
* revert: enable t5 tests
* fix
---------
Co-authored-by: Wenhao Chen <cwher@outlook.com>
* [shardformer] hybridparallelplugin support gradients accumulation. (#5246)
* support gradients acc
fix
fix
fix
fix
fix
fix
fix
fix
fix
fix
fix
fix
fix
* fix
fix
* fix
fix
fix
* [hotfix] Fix ShardFormer test execution path when using sequence parallelism (#5230)
* fix auto loading gpt2 tokenizer (#5279)
* [doc] add llama2-13B disyplay (#5285)
* Update README.md
* fix 13b typo
---------
Co-authored-by: binmakeswell <binmakeswell@gmail.com>
* fix llama pretrain (#5287)
* [hotfix] fix 3d plugin test (#5292)
* fix bug for mefture (#5299)
* [NFC] polish applications/Colossal-LLaMA-2/colossal_llama2/tokenizer/init_tokenizer.py code style (#5228)
* fix some typo (#5307)
* [feat] refactored extension module (#5298)
* [feat] refactored extension module
* polish
* polish
* polish
* polish
* polish
* polish
* polish
* polish
* polish
* polish
* [workflow] updated CI image (#5318)
* [accelerator] fixed npu api
* [tests] fix t5 test. (#5322)
* [ci] fix shardformer tests. (#5255)
* fix ci
fix
* revert: revert p2p
* feat: add enable_metadata_cache option
* revert: enable t5 tests
---------
Co-authored-by: Wenhao Chen <cwher@outlook.com>
* fix t5 test
---------
Co-authored-by: Wenhao Chen <cwher@outlook.com>
* [doc] added docs for extensions (#5324)
* [doc] added docs for extensions
* polish
* polish
* fix typo under extensions/ (#5330)
* fix typo change dosen't to doesn't (#5308)
* [extension] fixed exception catch (#5342)
* [Chat] fix sft loss nan (#5345)
* fix script
* fix script
* fix chat nan
* fix chat nan
* [checkpointio] fix gemini and hybrid parallel optim checkpoint (#5347)
* [checkpointio] fix hybrid parallel optim checkpoint
* [extension] fix cuda extension
* [checkpointio] fix gemini optimizer checkpoint
* polish code
* [fix] remove unnecessary dp_size assert (#5351)
* fix: remove unnecessary assert
* test: add more 3d plugin tests
* fix: add warning
* [gemini] fix param op hook when output is tuple (#5355)
* [gemini] fix param op hook when output is tuple
* [gemini] fix param op hook
* [llama] fix dataloader for hybrid parallel (#5358)
* [plugin] refactor prepare dataloader
* [plugin] update train script
* [llama] update training script (#5360)
* [llama] update training script
* [doc] polish docstr
* [llama] add flash attn patch for npu (#5362)
* [llama] fix neftune & pbar with start_step (#5364)
* [eval] update llama npu eval (#5366)
* [llama] polish training script and fix optim ckpt (#5368)
* [lr-scheduler] fix load state dict and add test (#5369)
* [llama] fix memory issue (#5371)
* [llama] fix memory issue
* [llama] add comment
* [moe] init mixtral impl
* [moe] update capacity computing (#5253)
* [moe] top2 allow uneven input
* [moe] update capacity computing
* [moe] remove debug info
* [moe] update capacity computing
* [moe] update capacity computing
* [moe] support mixtral (#5309)
* [moe] add mixtral block for single expert
* [moe] mixtral block fwd support uneven ep
* [moe] mixtral block bwd support uneven ep
* [moe] add mixtral moe layer
* [moe] simplify replace
* [meo] support save sharded mixtral
* [meo] support load sharded mixtral
* [meo] support save sharded optim
* [meo] integrate moe manager into plug
* [meo] fix optimizer load
* [meo] fix mixtral layer
* [moe] fix mixtral checkpoint io (#5314)
* [moe] fix mixtral forward default value (#5329)
* [moe] fix mixtral optim checkpoint (#5344)
* [moe] fix tests
* [release] update version (#5380)
* [llama] fix training and inference scripts (#5384)
* [llama] refactor inference example to fit sft
* [llama] fix training script to fit gemini
* [llama] fix inference script
* [doc] Fix typo (#5361)
* [doc] updated installation command (#5389)
* [hotfix] fix variable type for top_p (#5313)
Co-authored-by: binmakeswell <binmakeswell@gmail.com>
* [hotfix] Fix wrong import in meta_registry (#5392)
* [extension] hotfix jit extension setup (#5402)
* [example] reuse flash attn patch (#5400)
* [fsdp] impl save/load shard model/optimizer (#5357)
* [setup] fixed nightly release (#5388)
* [shardformer]gather llama logits (#5398)
* gather llama logits
* fix
* update requirements (#5407)
* [workflow] added pypi channel (#5412)
* [doc] fix blog link
* [doc] fix blog link
* fix sft single turn inference example (#5416)
* [example]add gpt2 benchmark example script. (#5295)
* benchmark gpt2
* fix
fix
fix
fix
* [doc] fix typo in Colossal-LLaMA-2/README.md (#5247)
* [workflow] fixed build CI (#5240)
* [workflow] fixed build CI
* polish
* polish
* polish
* polish
* polish
* [ci] fixed booster test (#5251)
* [ci] fixed booster test
* [ci] fixed booster test
* [ci] fixed booster test
* [ci] fixed ddp test (#5254)
* [ci] fixed ddp test
* polish
* fix typo in applications/ColossalEval/README.md (#5250)
* [ci] fix shardformer tests. (#5255)
* fix ci
fix
* revert: revert p2p
* feat: add enable_metadata_cache option
* revert: enable t5 tests
---------
Co-authored-by: Wenhao Chen <cwher@outlook.com>
* [doc] fix doc typo (#5256)
* [doc] fix annotation display
* [doc] fix llama2 doc
* [hotfix]: add pp sanity check and fix mbs arg (#5268)
* fix: fix misleading mbs arg
* feat: add pp sanity check
* fix: fix 1f1b sanity check
* [workflow] fixed incomplete bash command (#5272)
* [workflow] fixed oom tests (#5275)
* [workflow] fixed oom tests
* polish
* polish
* polish
* [ci] fix test_hybrid_parallel_plugin_checkpoint_io.py (#5276)
* fix ci
fix
* fix test
* revert: revert p2p
* feat: add enable_metadata_cache option
* revert: enable t5 tests
* fix
---------
Co-authored-by: Wenhao Chen <cwher@outlook.com>
* [shardformer] hybridparallelplugin support gradients accumulation. (#5246)
* support gradients acc
fix
fix
fix
fix
fix
fix
fix
fix
fix
fix
fix
fix
fix
* fix
fix
* fix
fix
fix
* [hotfix] Fix ShardFormer test execution path when using sequence parallelism (#5230)
* fix auto loading gpt2 tokenizer (#5279)
* [doc] add llama2-13B disyplay (#5285)
* Update README.md
* fix 13b typo
---------
Co-authored-by: binmakeswell <binmakeswell@gmail.com>
* fix llama pretrain (#5287)
* fix
* fix
* fix
fix
* fix
fix
fix
* fix
fix
* benchmark gpt2
* fix
fix
fix
fix
* [workflow] fixed build CI (#5240)
* [workflow] fixed build CI
* polish
* polish
* polish
* polish
* polish
* [ci] fixed booster test (#5251)
* [ci] fixed booster test
* [ci] fixed booster test
* [ci] fixed booster test
* fix
fix
* fix
fix
fix
* fix
* fix
fix
fix
fix
fix
* fix
* Update shardformer.py
---------
Co-authored-by: digger yu <digger-yu@outlook.com>
Co-authored-by: Frank Lee <somerlee.9@gmail.com>
Co-authored-by: Wenhao Chen <cwher@outlook.com>
Co-authored-by: binmakeswell <binmakeswell@gmail.com>
Co-authored-by: Zhongkai Zhao <kanezz620@gmail.com>
Co-authored-by: Michelle <97082656+MichelleMa8@users.noreply.github.com>
Co-authored-by: Desperado-Jia <502205863@qq.com>
* [doc] sora release (#5425)
* [doc] sora release
* [doc] sora release
* [doc] sora release
* [doc] sora release
* [devops] fix extention building (#5427)
* [hotfix] fix sd vit import error (#5420)
* fix import error
* Update dpt_depth.py
---------
Co-authored-by: binmakeswell <binmakeswell@gmail.com>
* [hotfix] fix typo of openmoe model source (#5403)
* [doc] update some translations with README-zh-Hans.md (#5382)
* [hotfix] fix typo change _descrption to _description (#5331)
* [hotfix] fix typo change enabel to enable under colossalai/shardformer/ (#5317)
* [eval-hotfix] set few_shot_data to None when few shot is disabled (#5422)
* [hotfix] fix typo change MoECheckpintIO to MoECheckpointIO (#5335)
Co-authored-by: binmakeswell <binmakeswell@gmail.com>
* [doc] Fix typo s/infered/inferred/ (#5288)
Signed-off-by: hugo-syn <hugo.vincent@synacktiv.com>
* [hotfix] fix stable diffusion inference bug. (#5289)
* Update train_ddp.yaml
delete "strategy" to fix DDP config loading bug in "main.py"
* Update train_ddp.yaml
fix inference with scripts/txt2img.py config file load bug.
* Update README.md
add pretrain model test code.
* [colossal-llama2] add stream chat examlple for chat version model (#5428)
* add stream chat for chat version
* remove os.system clear
* modify function name
* [release] update version (#5411)
* fix tensor data update for gemini loss caluculation (#5442)
* [hotfix] fix typo s/keywrods/keywords etc. (#5429)
* [devops] fix compatibility (#5444)
* [devops] fix compatibility
* [hotfix] update compatibility test on pr
* [devops] fix compatibility
* [devops] record duration during comp test
* [test] decrease test duration
* fix falcon
* [shardformer] fix gathering output when using tensor parallelism (#5431)
* fix
* padding vocab_size when using pipeline parallellism
padding vocab_size when using pipeline parallellism
fix
fix
* fix
* fix
fix
fix
* fix gather output
* fix
* fix
* fix
fix resize embedding
fix resize embedding
* fix resize embedding
fix
* revert
* revert
* revert
* [doc] release Open-Sora 1.0 with model weights (#5468)
* [doc] release Open-Sora 1.0 with model weights
* [doc] release Open-Sora 1.0 with model weights
* [doc] release Open-Sora 1.0 with model weights
* [doc] update open-sora demo (#5479)
* [doc] update open-sora demo
* [doc] update open-sora demo
* [doc] update open-sora demo
* [example] add grok-1 inference (#5485)
* [misc] add submodule
* remove submodule
* [example] support grok-1 tp inference
* [example] add grok-1 inference script
* [example] refactor code
* [example] add grok-1 readme
* [exmaple] add test ci
* [exmaple] update readme
* [release] grok-1 314b inference (#5490)
* [release] grok-1 inference
* [release] grok-1 inference
* [release] grok-1 inference
* [example] update Grok-1 inference (#5495)
* revise grok-1 example
* remove unused arg in scripts
* prevent re-installing torch
* update readme
* revert modifying colossalai requirements
* add perf
* trivial
* add tokenizer url
* [hotfix] set return_outputs=False in examples and polish code (#5404)
* fix: simplify merge_batch
* fix: use return_outputs=False to eliminate extra memory consumption
* feat: add return_outputs warning
* style: remove `return_outputs=False` as it is the default value
* [release] grok-1 inference benchmark (#5500)
* [release] grok-1 inference benchmark
* [release] grok-1 inference benchmark
* [release] grok-1 inference benchmark
* [release] grok-1 inference benchmark
* [release] grok-1 inference benchmark
* [shardformer]Fix lm parallel. (#5480)
* fix
* padding vocab_size when using pipeline parallellism
padding vocab_size when using pipeline parallellism
fix
fix
* fix
* fix
fix
fix
* fix gather output
* fix
* fix
* fix
fix resize embedding
fix resize embedding
* fix resize embedding
fix
* revert
* revert
* revert
* fix lm forward distribution
* fix
* test ci
* fix
* [fix] fix grok-1 example typo (#5506)
* [devops] fix example test ci (#5504)
* Fix ColoTensorSpec for py11 (#5440)
* fixed layout converter caching and updated tester
* Empty-Commit
* [shardformer] update colo attention to support custom mask (#5510)
* [feature] refactor colo attention (#5462)
* [extension] update api
* [feature] add colo attention
* [feature] update sdpa
* [feature] update npu attention
* [feature] update flash-attn
* [test] add flash attn test
* [test] update flash attn test
* [shardformer] update modeling to fit colo attention (#5465)
* [misc] refactor folder structure
* [shardformer] update llama flash-attn
* [shardformer] fix llama policy
* [devops] update tensornvme install
* [test] update llama test
* [shardformer] update colo attn kernel dispatch
* [shardformer] update blip2
* [shardformer] update chatglm
* [shardformer] update gpt2
* [shardformer] update gptj
* [shardformer] update opt
* [shardformer] update vit
* [shardformer] update colo attention mask prep
* [shardformer] update whisper
* [test] fix shardformer tests (#5514)
* [test] fix shardformer tests
* [test] fix shardformer tests
* [format] applied code formatting on changed files in pull request 5510 (#5517)
Co-authored-by: github-actions <github-actions@github.com>
* [shardformer] fix pipeline forward error if custom layer distribution is used (#5189)
* Use self.[distribute_layers|get_stage_index] to exploit custom layer distribution
* Change static methods for t5 layer distribution to member functions
* Change static methods for whisper layer distribution to member functions
* Replace whisper policy usage with self one
* Fix test case to use non-static layer distribution methods
* fix: fix typo
---------
Co-authored-by: Wenhao Chen <cwher@outlook.com>
* [Fix] Grok-1 use tokenizer from the same pretrained path (#5532)
* [fix] use tokenizer from the same pretrained path
* trust remote code
* [ColossalChat] Update RLHF V2 (#5286)
* Add dpo. Fix sft, ppo, lora. Refactor all
* fix and tested ppo
* 2 nd round refactor
* add ci tests
* fix ci
* fix ci
* fix readme, style
* fix readme style
* fix style, fix benchmark
* reproduce benchmark result, remove useless files
* rename to ColossalChat
* use new image
* fix ci workflow
* fix ci
* use local model/tokenizer for ci tests
* fix ci
* fix ci
* fix ci
* fix ci timeout
* fix rm progress bar. fix ci timeout
* fix ci
* fix ci typo
* remove 3d plugin from ci temporary
* test environment
* cannot save optimizer
* support chat template
* fix readme
* fix path
* test ci locally
* restore build_or_pr
* fix ci data path
* fix benchmark
* fix ci, move ci tests to 3080, disable fast tokenizer
* move ci to 85
* support flash attention 2
* add all-in-one data preparation script. Fix colossal-llama2-chat chat template
* add hardware requirements
* move ci test data
* fix save_model, add unwrap
* fix missing bos
* fix missing bos; support grad accumulation with gemini
* fix ci
* fix ci
* fix ci
* fix llama2 chat template config
* debug sft
* debug sft
* fix colossalai version requirement
* fix ci
* add sanity check to prevent NaN loss
* fix requirements
* add dummy data generation script
* add dummy data generation script
* add dummy data generation script
* add dummy data generation script
* update readme
* update readme
* update readme and ignore
* fix logger bug
* support parallel_output
* modify data preparation logic
* fix tokenization
* update lr
* fix inference
* run pre-commit
---------
Co-authored-by: Tong Li <tong.li352711588@gmail.com>
* [shardformer, pipeline] add `gradient_checkpointing_ratio` and heterogenous shard policy for llama (#5508)
* feat: add `GradientCheckpointConfig` and `PipelineGradientCheckpointConfig`
* feat: apply `GradientCheckpointConfig` to policy and llama_forward
* feat: move `distribute_layer` and `get_stage_index` to PipelineStageManager
* fix: add optional args for `distribute_layer` and `get_stage_index`
* fix: fix changed API calls
* test: update llama tests
* style: polish `GradientCheckpointConfig`
* fix: fix pipeline utils tests
* fix incorrect sharding without zero (#5545)
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [shardformer] Sequence Parallelism Optimization (#5533)
* sequence parallel optimization
* validate sequence parallel in llama (code to be polished)
* shardformer api writing
* integrate sequence parallel in ShardFormer
* fix pp bugs and sp bugs for LlaMa model
* integrating ring-based sequence parallelism into ShardFormer
* [sequence parallelism]: Add fused megatron function
* integrating ring-based sequence parallelism into ShardFormer
---------
Co-authored-by: linsj20 <linsj20@mails.tsinghua.edu.cn>
* fix bugs when useing sp and flashattention together
* fix operation function name
* support flash attention for ulysses-style sp
* clarify sp process group
* fix compatibility bugs in moe plugin
* fix fused linear bugs
* fix linear layer test
* support gpt model all-to-all sp
* modify shard data dimension (meant to be dim=-1)
* support megtron-style sp and distributed attn for llama model
* [shardformer] add megatron sp to llama
* support llama7B 128k with distributed attention
* [shardformer] robustness enhancement
* add block attn
* sp mode 1: keep input as a complete sequence
* fix sp compatability
* finish sp mode 3 support for gpt
* using all_to_all_single when batch size is 1
* support mode 2 sp in gpt2 (#5)
* [shardformer] add megatron sp to llama
* support llama7B 128k with distributed attention
* [shardformer] robustness enhancement
* add block attn
* sp mode 1: keep input as a complete sequence
* fix sp compatability
* refactor ring implementation
* support mode 2 sp in gpt2
* polish code
* enable distributed attn mask when using sp mode 2 and 3 in llama
* automatically enable flash attn when using sp mode 2 and 3 in llama
* inplace attn mask
* add zero2 support for sequence parallel
* polish code
* fix bugs
* fix gemini checkpoint io
* loose tensor checking atol and rtol
* add comment
* fix llama layernorm grad
* fix zero grad
* fix zero grad
* fix conflict
* update split and gather auto grad func
* sequence parallel: inside text split (#6)
* polish code (part 1)
* polish code (part 2)
* polish code (part 2.5)
* polish code (part 3)
* sequence parallel: inside text split
* miscellaneous minor fixes
* polish code
* fix ulysses style ZeRO
* sequence parallel: inside text split
* miscellaneous minor fixes
* disaggregate sp group and dp group for sp
* fix llama and gpt sp
* polish code
* move ulysses grad sync to ddp (#9)
* remove zero_stage and unbind the grad sync for alltoall sp
* add 2d group creation test
* move ulysses grad sync to ddp
* add 2d group creation test
* remove useless code
* change shard config not to enable sp when enable_all_optimizations
* add sp warnings for several model
* remove useless code
---------
Co-authored-by: linsj20 <linsj20@mails.tsinghua.edu.cn>
* [hotfix] quick fixes to make legacy tutorials runnable (#5559)
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [fix] fix typo s/muiti-node /multi-node etc. (#5448)
* [hotfix] fix typo s/get_defualt_parser /get_default_parser (#5548)
* [devops] remove post commit ci (#5566)
* [devops] remove post commit ci
* [misc] run pre-commit on all files
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [doc] fix ColossalMoE readme (#5599)
* fix readme
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [zero] support multiple (partial) backward passes (#5596)
* [zero] support multiple (partial) backward passes
* [misc] update requirements
* [shardformer] refactor embedding resize (#5603)
* [branch rebase] rebase main to Feature/resize_embedding (#5554)
* fix
* [release] update version (#5411)
* [hotfix] fix typo s/keywrods/keywords etc. (#5429)
* [devops] fix compatibility (#5444)
* [devops] fix compatibility
* [hotfix] update compatibility test on pr
* [devops] fix compatibility
* [devops] record duration during comp test
* [test] decrease test duration
* fix falcon
* [shardformer] fix gathering output when using tensor parallelism (#5431)
* fix
* padding vocab_size when using pipeline parallellism
padding vocab_size when using pipeline parallellism
fix
fix
* fix
* fix
fix
fix
* fix gather output
* fix
* fix
* fix
fix resize embedding
fix resize embedding
* fix resize embedding
fix
* revert
* revert
* revert
* [doc] release Open-Sora 1.0 with model weights (#5468)
* [doc] release Open-Sora 1.0 with model weights
* [doc] release Open-Sora 1.0 with model weights
* [doc] release Open-Sora 1.0 with model weights
* [doc] update open-sora demo (#5479)
* [doc] update open-sora demo
* [doc] update open-sora demo
* [doc] update open-sora demo
* [example] add grok-1 inference (#5485)
* [misc] add submodule
* remove submodule
* [example] support grok-1 tp inference
* [example] add grok-1 inference script
* [example] refactor code
* [example] add grok-1 readme
* [exmaple] add test ci
* [exmaple] update readme
---------
Co-authored-by: Hongxin Liu <lhx0217@gmail.com>
Co-authored-by: digger yu <digger-yu@outlook.com>
Co-authored-by: binmakeswell <binmakeswell@gmail.com>
* [CI] run pre-commit (#5577)
* fix
* [release] update version (#5411)
* [hotfix] fix typo s/keywrods/keywords etc. (#5429)
* [devops] fix compatibility (#5444)
* [devops] fix compatibility
* [hotfix] update compatibility test on pr
* [devops] fix compatibility
* [devops] record duration during comp test
* [test] decrease test duration
* fix falcon
* [shardformer] fix gathering output when using tensor parallelism (#5431)
* fix
* padding vocab_size when using pipeline parallellism
padding vocab_size when using pipeline parallellism
fix
fix
* fix
* fix
fix
fix
* fix gather output
* fix
* fix
* fix
fix resize embedding
fix resize embedding
* fix resize embedding
fix
* revert
* revert
* revert
* [doc] release Open-Sora 1.0 with model weights (#5468)
* [doc] release Open-Sora 1.0 with model weights
* [doc] release Open-Sora 1.0 with model weights
* [doc] release Open-Sora 1.0 with model weights
* [doc] update open-sora demo (#5479)
* [doc] update open-sora demo
* [doc] update open-sora demo
* [doc] update open-sora demo
* [example] add grok-1 inference (#5485)
* [misc] add submodule
* remove submodule
* [example] support grok-1 tp inference
* [example] add grok-1 inference script
* [example] refactor code
* [example] add grok-1 readme
* [exmaple] add test ci
* [exmaple] update readme
* run pre-commit
---------
Co-authored-by: Hongxin Liu <lhx0217@gmail.com>
Co-authored-by: digger yu <digger-yu@outlook.com>
Co-authored-by: binmakeswell <binmakeswell@gmail.com>
* [rebase] rebase main to resize-embedding (#5581)
* [release] grok-1 314b inference (#5490)
* [release] grok-1 inference
* [release] grok-1 inference
* [release] grok-1 inference
* [example] update Grok-1 inference (#5495)
* revise grok-1 example
* remove unused arg in scripts
* prevent re-installing torch
* update readme
* revert modifying colossalai requirements
* add perf
* trivial
* add tokenizer url
* [hotfix] set return_outputs=False in examples and polish code (#5404)
* fix: simplify merge_batch
* fix: use return_outputs=False to eliminate extra memory consumption
* feat: add return_outputs warning
* style: remove `return_outputs=False` as it is the default value
* [release] grok-1 inference benchmark (#5500)
* [release] grok-1 inference benchmark
* [release] grok-1 inference benchmark
* [release] grok-1 inference benchmark
* [release] grok-1 inference benchmark
* [release] grok-1 inference benchmark
* [shardformer]Fix lm parallel. (#5480)
* fix
* padding vocab_size when using pipeline parallellism
padding vocab_size when using pipeline parallellism
fix
fix
* fix
* fix
fix
fix
* fix gather output
* fix
* fix
* fix
fix resize embedding
fix resize embedding
* fix resize embedding
fix
* revert
* revert
* revert
* fix lm forward distribution
* fix
* test ci
* fix
* [fix] fix grok-1 example typo (#5506)
* [devops] fix example test ci (#5504)
* Fix ColoTensorSpec for py11 (#5440)
* fixed layout converter caching and updated tester
* Empty-Commit
* [shardformer] update colo attention to support custom mask (#5510)
* [feature] refactor colo attention (#5462)
* [extension] update api
* [feature] add colo attention
* [feature] update sdpa
* [feature] update npu attention
* [feature] update flash-attn
* [test] add flash attn test
* [test] update flash attn test
* [shardformer] update modeling to fit colo attention (#5465)
* [misc] refactor folder structure
* [shardformer] update llama flash-attn
* [shardformer] fix llama policy
* [devops] update tensornvme install
* [test] update llama test
* [shardformer] update colo attn kernel dispatch
* [shardformer] update blip2
* [shardformer] update chatglm
* [shardformer] update gpt2
* [shardformer] update gptj
* [shardformer] update opt
* [shardformer] update vit
* [shardformer] update colo attention mask prep
* [shardformer] update whisper
* [test] fix shardformer tests (#5514)
* [test] fix shardformer tests
* [test] fix shardformer tests
* [format] applied code formatting on changed files in pull request 5510 (#5517)
Co-authored-by: github-actions <github-actions@github.com>
* [shardformer] fix pipeline forward error if custom layer distribution is used (#5189)
* Use self.[distribute_layers|get_stage_index] to exploit custom layer distribution
* Change static methods for t5 layer distribution to member functions
* Change static methods for whisper layer distribution to member functions
* Replace whisper policy usage with self one
* Fix test case to use non-static layer distribution methods
* fix: fix typo
---------
Co-authored-by: Wenhao Chen <cwher@outlook.com>
* [Fix] Grok-1 use tokenizer from the same pretrained path (#5532)
* [fix] use tokenizer from the same pretrained path
* trust remote code
* [ColossalChat] Update RLHF V2 (#5286)
* Add dpo. Fix sft, ppo, lora. Refactor all
* fix and tested ppo
* 2 nd round refactor
* add ci tests
* fix ci
* fix ci
* fix readme, style
* fix readme style
* fix style, fix benchmark
* reproduce benchmark result, remove useless files
* rename to ColossalChat
* use new image
* fix ci workflow
* fix ci
* use local model/tokenizer for ci tests
* fix ci
* fix ci
* fix ci
* fix ci timeout
* fix rm progress bar. fix ci timeout
* fix ci
* fix ci typo
* remove 3d plugin from ci temporary
* test environment
* cannot save optimizer
* support chat template
* fix readme
* fix path
* test ci locally
* restore build_or_pr
* fix ci data path
* fix benchmark
* fix ci, move ci tests to 3080, disable fast tokenizer
* move ci to 85
* support flash attention 2
* add all-in-one data preparation script. Fix colossal-llama2-chat chat template
* add hardware requirements
* move ci test data
* fix save_model, add unwrap
* fix missing bos
* fix missing bos; support grad accumulation with gemini
* fix ci
* fix ci
* fix ci
* fix llama2 chat template config
* debug sft
* debug sft
* fix colossalai version requirement
* fix ci
* add sanity check to prevent NaN loss
* fix requirements
* add dummy data generation script
* add dummy data generation script
* add dummy data generation script
* add dummy data generation script
* update readme
* update readme
* update readme and ignore
* fix logger bug
* support parallel_output
* modify data preparation logic
* fix tokenization
* update lr
* fix inference
* run pre-commit
---------
Co-authored-by: Tong Li <tong.li352711588@gmail.com>
* [shardformer, pipeline] add `gradient_checkpointing_ratio` and heterogenous shard policy for llama (#5508)
* feat: add `GradientCheckpointConfig` and `PipelineGradientCheckpointConfig`
* feat: apply `GradientCheckpointConfig` to policy and llama_forward
* feat: move `distribute_layer` and `get_stage_index` to PipelineStageManager
* fix: add optional args for `distribute_layer` and `get_stage_index`
* fix: fix changed API calls
* test: update llama tests
* style: polish `GradientCheckpointConfig`
* fix: fix pipeline utils tests
* fix incorrect sharding without zero (#5545)
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [shardformer] Sequence Parallelism Optimization (#5533)
* sequence parallel optimization
* validate sequence parallel in llama (code to be polished)
* shardformer api writing
* integrate sequence parallel in ShardFormer
* fix pp bugs and sp bugs for LlaMa model
* integrating ring-based sequence parallelism into ShardFormer
* [sequence parallelism]: Add fused megatron function
* integrating ring-based sequence parallelism into ShardFormer
---------
Co-authored-by: linsj20 <linsj20@mails.tsinghua.edu.cn>
* fix bugs when useing sp and flashattention together
* fix operation function name
* support flash attention for ulysses-style sp
* clarify sp process group
* fix compatibility bugs in moe plugin
* fix fused linear bugs
* fix linear layer test
* support gpt model all-to-all sp
* modify shard data dimension (meant to be dim=-1)
* support megtron-style sp and distributed attn for llama model
* [shardformer] add megatron sp to llama
* support llama7B 128k with distributed attention
* [shardformer] robustness enhancement
* add block attn
* sp mode 1: keep input as a complete sequence
* fix sp compatability
* finish sp mode 3 support for gpt
* using all_to_all_single when batch size is 1
* support mode 2 sp in gpt2 (#5)
* [shardformer] add megatron sp to llama
* support llama7B 128k with distributed attention
* [shardformer] robustness enhancement
* add block attn
* sp mode 1: keep input as a complete sequence
* fix sp compatability
* refactor ring implementation
* support mode 2 sp in gpt2
* polish code
* enable distributed attn mask when using sp mode 2 and 3 in llama
* automatically enable flash attn when using sp mode 2 and 3 in llama
* inplace attn mask
* add zero2 support for sequence parallel
* polish code
* fix bugs
* fix gemini checkpoint io
* loose tensor checking atol and rtol
* add comment
* fix llama layernorm grad
* fix zero grad
* fix zero grad
* fix conflict
* update split and gather auto grad func
* sequence parallel: inside text split (#6)
* polish code (part 1)
* polish code (part 2)
* polish code (part 2.5)
* polish code (part 3)
* sequence parallel: inside text split
* miscellaneous minor fixes
* polish code
* fix ulysses style ZeRO
* sequence parallel: inside text split
* miscellaneous minor fixes
* disaggregate sp group and dp group for sp
* fix llama and gpt sp
* polish code
* move ulysses grad sync to ddp (#9)
* remove zero_stage and unbind the grad sync for alltoall sp
* add 2d group creation test
* move ulysses grad sync to ddp
* add 2d group creation test
* remove useless code
* change shard config not to enable sp when enable_all_optimizations
* add sp warnings for several model
* remove useless code
---------
Co-authored-by: linsj20 <linsj20@mails.tsinghua.edu.cn>
* [hotfix] quick fixes to make legacy tutorials runnable (#5559)
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [fix] fix typo s/muiti-node /multi-node etc. (#5448)
* [hotfix] fix typo s/get_defualt_parser /get_default_parser (#5548)
* [devops] remove post commit ci (#5566)
* [devops] remove post commit ci
* [misc] run pre-commit on all files
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---------
Co-authored-by: binmakeswell <binmakeswell@gmail.com>
Co-authored-by: Yuanheng Zhao <54058983+yuanheng-zhao@users.noreply.github.com>
Co-authored-by: Wenhao Chen <cwher@outlook.com>
Co-authored-by: Hongxin Liu <lhx0217@gmail.com>
Co-authored-by: Rocky Duan <dementrock@users.noreply.github.com>
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: Edenzzzz <wenxuan.tan@wisc.edu>
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: github-actions <github-actions@github.com>
Co-authored-by: Insu Jang <insujang@umich.edu>
Co-authored-by: YeAnbang <44796419+YeAnbang@users.noreply.github.com>
Co-authored-by: Tong Li <tong.li352711588@gmail.com>
Co-authored-by: Zhongkai Zhao <kanezz620@gmail.com>
Co-authored-by: linsj20 <linsj20@mails.tsinghua.edu.cn>
Co-authored-by: digger yu <digger-yu@outlook.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [shardformer]enable padding vocabulary size. (#5489)
* padding vocab_size when using pipeline parallellism
padding vocab_size when using pipeline parallellism
fix
fix
* fix
* fix
fix
fix
* fix gather output
* fix
* fix
* fix
fix resize embedding
fix resize embedding
* fix resize embedding
fix
* revert
* revert
* revert
* padding vocab
* padding vocabe
* fix
* fix
* fxi
* test ci
* fix
fix
fix
fix
* fix
fix
* fix
* fix
* Update hybrid_parallel_plugin.py
fix
fix
fix
* fix
fix
* fix
fix
* fix
* resolve super init
resolve super init
resolve super init
resolve super init
* resolve comments
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* vocab checkpointio
* padding vocab_size when using pipeline parallellism
padding vocab_size when using pipeline parallellism
fix
fix
* fix
fix
fix
* fix
* fix
fix resize embedding
fix resize embedding
* fix resize embedding
fix
* revert
* revert
* padding vocab
* fix
* fix
fix
* fix
fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* cherry-pick
* revert moe modify
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
fix
fix
fix
fix
fix
fix
fix
* resolve comments
resolve comments
resolve comments
resolve comments
resolve comments
* ptensor
ptensor
resolve comments
fix
fix
fix
fix
fix
resolve comments
resolve comments
resolve comments
resolve comments
resolve comments
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Hongxin Liu <lhx0217@gmail.com>
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix rebase
* fix rebase
---------
Co-authored-by: Hongxin Liu <lhx0217@gmail.com>
Co-authored-by: digger yu <digger-yu@outlook.com>
Co-authored-by: binmakeswell <binmakeswell@gmail.com>
Co-authored-by: Yuanheng Zhao <54058983+yuanheng-zhao@users.noreply.github.com>
Co-authored-by: Wenhao Chen <cwher@outlook.com>
Co-authored-by: Rocky Duan <dementrock@users.noreply.github.com>
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: Edenzzzz <wenxuan.tan@wisc.edu>
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: github-actions <github-actions@github.com>
Co-authored-by: Insu Jang <insujang@umich.edu>
Co-authored-by: YeAnbang <44796419+YeAnbang@users.noreply.github.com>
Co-authored-by: Tong Li <tong.li352711588@gmail.com>
Co-authored-by: Zhongkai Zhao <kanezz620@gmail.com>
Co-authored-by: linsj20 <linsj20@mails.tsinghua.edu.cn>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [hotfix] Fix examples no pad token & auto parallel codegen bug; (#5606)
* fix no pad token bug
* fixed some auto parallel codegen bug, but might not run on torch 2.1
---------
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [shardformer] fix pipeline grad ckpt (#5620)
* [shardformer] fix pipeline grad ckpt
* [lora] add lora APIs for booster, support lora for TorchDDP (#4981)
* add apis and peft requirement
* add liscense and implement apis
* add checkpointio apis
* add torchddp fwd_bwd test
* add support_lora methods
* add checkpointio test and debug
* delete unneeded codes
* remove peft from LICENSE
* add concrete methods for enable_lora
* simplify enable_lora api
* fix requirements
* [LowLevelZero] low level zero support lora (#5153)
* low level zero support lora
low level zero support lora
* add checkpoint test
* add checkpoint test
* fix
* fix
* fix
* fix
fix
fix
fix
* fix
* fix
fix
fix
fix
fix
fix
fix
* fix
* fix
fix
fix
fix
fix
fix
fix
* fix
* test ci
* git # This is a combination of 3 commits.
Update low_level_zero_plugin.py
Update low_level_zero_plugin.py
fix
fix
fix
* fix naming
fix naming
fix naming
fix
* [feature] qlora support
* qlora follow commit
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* migrate qutization folder to colossalai/
* minor fixes
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* gptj sp fix
* remove redundancies from pre-commit
* minor fixes
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Signed-off-by: hugo-syn <hugo.vincent@synacktiv.com>
Co-authored-by: Jianghai <72591262+CjhHa1@users.noreply.github.com>
Co-authored-by: Bin Jia <45593998+FoolPlayer@users.noreply.github.com>
Co-authored-by: アマデウス <kurisusnowdeng@users.noreply.github.com>
Co-authored-by: yuehuayingxueluo <867460659@qq.com>
Co-authored-by: Cuiqing Li <lixx3527@gmail.com>
Co-authored-by: cuiqing.li <lixx336@gmail.com>
Co-authored-by: Yuanchen <70520919+chengeharrison@users.noreply.github.com>
Co-authored-by: Xu Yuanchen <yuanchen.xu00@gmail.com>
Co-authored-by: littsk <1214689160@qq.com>
Co-authored-by: Baizhou Zhang <eddiezhang@pku.edu.cn>
Co-authored-by: ppt0011 <143150326+ppt0011@users.noreply.github.com>
Co-authored-by: Hongxin Liu <lhx0217@gmail.com>
Co-authored-by: Xuanlei Zhao <43881818+oahzxl@users.noreply.github.com>
Co-authored-by: Zhongkai Zhao <kanezz620@gmail.com>
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: github-actions <github-actions@github.com>
Co-authored-by: Wenhao Chen <cwher@outlook.com>
Co-authored-by: Jun Gao <imgaojun@gmail.com>
Co-authored-by: flybird11111 <1829166702@qq.com>
Co-authored-by: Xu Kai <xukai16@foxmail.com>
Co-authored-by: Zian(Andy) Zheng <62330719+Orion-Zheng@users.noreply.github.com>
Co-authored-by: digger yu <digger-yu@outlook.com>
Co-authored-by: CjhHa1 <cjh18671720497@outlook.com>
Co-authored-by: Xu Kai <xukai16@foxamil.com>
Co-authored-by: Orion-Zheng <zheng_zian@u.nus.edu>
Co-authored-by: Elsa Granger <zeyugao@outlook.com>
Co-authored-by: YeAnbang <44796419+YeAnbang@users.noreply.github.com>
Co-authored-by: YeAnbang <anbangy2@outlook.com>
Co-authored-by: Orion-Zheng <zhengzian@u.nus.edu>
Co-authored-by: Pengtai Xu <henryxu880@gmail.com>
Co-authored-by: eric8607242 <e0928021388@gmail.com>
Co-authored-by: binmakeswell <binmakeswell@gmail.com>
Co-authored-by: Frank Lee <somerlee.9@gmail.com>
Co-authored-by: Michelle <97082656+MichelleMa8@users.noreply.github.com>
Co-authored-by: Camille Zhong <44392324+Camille7777@users.noreply.github.com>
Co-authored-by: BlueRum <70618399+ht-zhou@users.noreply.github.com>
Co-authored-by: Tong Li <tong.li352711588@gmail.com>
Co-authored-by: JIMMY ZHAO <knightyzhao@gmail.com>
Co-authored-by: Xuanlei Zhao <xuanlei.zhao@gmail.com>
Co-authored-by: Desperado-Jia <502205863@qq.com>
Co-authored-by: 李文军 <40464906+liwenjuna@users.noreply.github.com>
Co-authored-by: yixiaoer <miyaku@yixiaoer.sg>
Co-authored-by: CZYCW <czyczf@163.com>
Co-authored-by: Stephan Kölker <stephankoe@users.noreply.github.com>
Co-authored-by: QinLuo <eric.x.sun@gmail.com>
Co-authored-by: MickeyCHAN <76671016+danyow-cheung@users.noreply.github.com>
Co-authored-by: Luo Yihang <luo_yihang@outlook.com>
Co-authored-by: Dongruixuan Li <dongruixuan@hotmail.com>
Co-authored-by: hugo-syn <61210734+hugo-syn@users.noreply.github.com>
Co-authored-by: Youngon <Youngon_wyl@163.com>
Co-authored-by: Yuanheng Zhao <54058983+yuanheng-zhao@users.noreply.github.com>
Co-authored-by: Rocky Duan <dementrock@users.noreply.github.com>
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: Edenzzzz <wenxuan.tan@wisc.edu>
Co-authored-by: Insu Jang <insujang@umich.edu>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2024-04-23 09:57:44 +00:00
from torch . utils . _pytree import tree_map
2023-05-18 12:05:59 +00:00
2023-08-25 14:04:57 +00:00
from colossalai . tensor . d_tensor import (
is_customized_distributed_tensor ,
is_distributed_tensor ,
to_global ,
to_global_for_customized_distributed_tensor ,
)
2023-04-06 08:23:39 +00:00
SAFE_WEIGHTS_NAME = " model.safetensors "
2023-04-12 08:02:17 +00:00
WEIGHTS_NAME = " pytorch_model.bin "
2023-06-15 07:21:26 +00:00
STATES_NAME = " pytorch_optim.bin "
2023-04-06 08:23:39 +00:00
SAFE_WEIGHTS_INDEX_NAME = " model.safetensors.index.json "
2023-04-12 08:02:17 +00:00
WEIGHTS_INDEX_NAME = " pytorch_model.bin.index.json "
2023-06-15 07:21:26 +00:00
STATES_INDEX_NAME = " pytorch_optim.bin.index.json "
GROUP_FILE_NAME = " pytorch_optim_group.bin "
2023-04-04 07:23:01 +00:00
# ======================================
# General helper functions
# ======================================
2023-05-18 12:05:59 +00:00
2023-04-04 07:23:01 +00:00
def calculate_tensor_size ( tensor : torch . Tensor ) - > float :
"""
Calculate the size of a parameter in MB . Used to compute whether a group of params exceed the shard size .
If so , a new shard should be created .
Args :
2023-05-15 03:46:25 +00:00
tensor ( torch . Tensor ) : the tensor to calculate size for .
2023-04-04 07:23:01 +00:00
Returns :
float : size of the tensor in MB .
"""
return tensor . numel ( ) * tensor . element_size ( ) / 1024 / 1024
2023-05-18 12:05:59 +00:00
2023-04-04 07:23:01 +00:00
def is_safetensors_available ( ) - > bool :
"""
Check whether safetensors is available .
Returns :
bool : whether safetensors is available .
"""
try :
return True
except ImportError :
return False
def is_dtensor_checkpoint ( checkpoint_file_path : str ) - > bool :
"""
Check whether the checkpoint file is a dtensor checkpoint .
Args :
checkpoint_file_path ( str ) : path to the checkpoint file .
Returns :
bool : whether the checkpoint file is a dtensor checkpoint .
"""
2023-09-19 06:20:26 +00:00
if checkpoint_file_path . endswith ( " .*.safetensors " ) or checkpoint_file_path . endswith ( " .*.bin " ) :
2023-04-04 07:23:01 +00:00
return True
else :
return False
def is_safetensor_checkpoint ( checkpoint_file_path : str ) - > bool :
"""
Check whether the checkpoint file is a safetensor checkpoint .
Args :
checkpoint_file_path ( str ) : path to the checkpoint file .
Returns :
bool : whether the checkpoint file is a safetensor checkpoint .
"""
2023-09-19 06:20:26 +00:00
if checkpoint_file_path . endswith ( " .safetensors " ) :
2023-04-04 07:23:01 +00:00
return True
else :
return False
2023-08-31 06:50:47 +00:00
def search_tp_partition_dim ( current_shape : torch . Size , original_shape : torch . Size , tp_size : int ) - > Optional [ int ] :
2023-08-25 14:04:57 +00:00
"""
2023-08-31 06:50:47 +00:00
Given the current shape of parameter and the shape of parameter before sharding ,
return the dimension along which the parameter is sharded when using tensor parallel .
If tensor parallel is not used , return None .
2023-08-25 14:04:57 +00:00
Args :
2023-08-31 06:50:47 +00:00
current_shape ( torch . Size ) : The current shape of parameter after sharding .
original_shape ( torch . Size ) : The shape of parameter before sharding .
tp_size ( int ) : The size of tp group .
2023-08-25 14:04:57 +00:00
Returns :
2023-08-31 06:50:47 +00:00
Optional [ int ] : The dimension along which parameter is partitioned .
2023-08-25 14:04:57 +00:00
"""
2023-08-31 06:50:47 +00:00
partition_dim = None
for dim , length in enumerate ( original_shape ) :
if length > current_shape [ dim ] :
partition_dim = dim
break
if partition_dim is not None :
2023-09-19 06:20:26 +00:00
assert (
original_shape [ partition_dim ] == tp_size * current_shape [ partition_dim ]
) , f " The parameter isn ' t evenly distributed among tensor parallel group: \
2023-08-31 06:50:47 +00:00
shape before sharding { original_shape } , shape after sharding { current_shape } "
return partition_dim
2023-08-25 14:04:57 +00:00
[Feature] LoRA rebased to main branch (#5622)
* [Inference]ADD Bench Chatglm2 script (#4963)
* add bench chatglm
* fix bug and make utils
---------
Co-authored-by: CjhHa1 <cjh18671720497outlook.com>
* [Pipeline inference] Combine kvcache with pipeline inference (#4938)
* merge kvcache with pipeline inference and refactor the code structure
* support ppsize > 2
* refactor pipeline code
* do pre-commit
* modify benchmark
* fix bench mark
* polish code
* add docstring and update readme
* refactor the code
* fix some logic bug of ppinfer
* polish readme
* fix typo
* skip infer test
* updated c++17 compiler flags (#4983)
* [Inference] Dynamic Batching Inference, online and offline (#4953)
* [inference] Dynamic Batching for Single and Multiple GPUs (#4831)
* finish batch manager
* 1
* first
* fix
* fix dynamic batching
* llama infer
* finish test
* support different lengths generating
* del prints
* del prints
* fix
* fix bug
---------
Co-authored-by: CjhHa1 <cjh18671720497outlook.com>
* [inference] Async dynamic batching (#4894)
* finish input and output logic
* add generate
* test forward
* 1
* [inference]Re push async dynamic batching (#4901)
* adapt to ray server
* finish async
* finish test
* del test
---------
Co-authored-by: yuehuayingxueluo <867460659@qq.com>
* Revert "[inference]Re push async dynamic batching (#4901)" (#4905)
This reverts commit fbf3c09e673794ed18c91d4bab1a7dfea052e95a.
* Revert "[inference] Async dynamic batching (#4894)"
This reverts commit fced14025043e29ce816b315f440601188f7f79f.
* Revert "[inference] Async dynamic batching (#4894)" (#4909)
This reverts commit fced14025043e29ce816b315f440601188f7f79f.
* Add Ray Distributed Environment Init Scripts
* support DynamicBatchManager base function
* revert _set_tokenizer version
* add driver async generate
* add async test
* fix bugs in test_ray_dist.py
* add get_tokenizer.py
* fix code style
* fix bugs about No module named 'pydantic' in ci test
* fix bugs in ci test
* fix bugs in ci test
* fix bugs in ci test
* [infer]Add Ray Distributed Environment Init Scripts (#4911)
* Revert "[inference] Async dynamic batching (#4894)"
This reverts commit fced14025043e29ce816b315f440601188f7f79f.
* Add Ray Distributed Environment Init Scripts
* support DynamicBatchManager base function
* revert _set_tokenizer version
* add driver async generate
* add async test
* fix bugs in test_ray_dist.py
* add get_tokenizer.py
* fix code style
* fix bugs about No module named 'pydantic' in ci test
* fix bugs in ci test
* fix bugs in ci test
* fix bugs in ci test
* support dynamic batch for bloom model and is_running function
* [Inference]Test for new Async engine (#4935)
* infer engine
* infer engine
* test engine
* test engine
* new manager
* change step
* add
* test
* fix
* fix
* finish test
* finish test
* finish test
* finish test
* add license
---------
Co-authored-by: yuehuayingxueluo <867460659@qq.com>
* add assertion for config (#4947)
* [Inference] Finish dynamic batching offline test (#4948)
* test
* fix test
* fix quant
* add default
* fix
* fix some bugs
* fix some bugs
* fix
* fix bug
* fix bugs
* reset param
---------
Co-authored-by: yuehuayingxueluo <867460659@qq.com>
Co-authored-by: Cuiqing Li <lixx3527@gmail.com>
Co-authored-by: CjhHa1 <cjh18671720497outlook.com>
* [Kernels]Updated Triton kernels into 2.1.0 and adding flash-decoding for llama token attention (#4965)
* adding flash-decoding
* clean
* adding kernel
* adding flash-decoding
* add integration
* add
* adding kernel
* adding kernel
* adding triton 2.1.0 features for inference
* update bloom triton kernel
* remove useless vllm kernels
* clean codes
* fix
* adding files
* fix readme
* update llama flash-decoding
---------
Co-authored-by: cuiqing.li <lixx336@gmail.com>
* fix ColossalEval (#4992)
Co-authored-by: Xu Yuanchen <yuanchen.xu00@gmail.com>
* [doc]Update doc for colossal-inference (#4989)
* update doc
* Update README.md
---------
Co-authored-by: cuiqing.li <lixx336@gmail.com>
* [hotfix] Fix the bug where process groups were not being properly released. (#4940)
* Fix the bug where process groups were not being properly released.
* test
* Revert "test"
This reverts commit 479900c1398637310abf92eefa3cd168038ea02f.
* [hotfix] fix the bug of repeatedly storing param group (#4951)
* [doc] add supported feature diagram for hybrid parallel plugin (#4996)
* [Pipeline Inference] Merge pp with tp (#4993)
* refactor pipeline into new CaiInferEngine
* updata llama modeling forward
* merge tp with pp
* update docstring
* optimize test workflow and example
* fix typo
* add assert and todo
* [release] update version (#4995)
* [release] update version
* [hotfix] fix ci
* [moe] merge moe into main (#4978)
* update moe module
* support openmoe
* [hotfix] fix grad accumulation plus clipping for gemini (#5002)
* [hotfix] Add layer norm gradients all-reduce for sequence parallel (#4926)
* [hotfix] Add layer norm gradients all-reduce for sequence parallel. (#4915)
* Add layer norm gradients all-reduce for sequence parallel.
* skip pipeline inference test
* [hotfix] fixing polices of sequence parallel (#4922)
* Add layer norm gradients all-reduce for sequence parallel.
* fix parameter passing when calling get_autopolicy
---------
Co-authored-by: littsk <1214689160@qq.com>
* Hotfix/add grad all reduce for sequence parallel (#4927)
* Add layer norm gradients all-reduce for sequence parallel.
* fix parameter passing when calling get_autopolicy
* fix bug using wrong variables
---------
Co-authored-by: littsk <1214689160@qq.com>
* fix policy initialization
* fix bloom and chatglm policices
* polish code of handling layernorm
* fix moe module
* polish code of class initializing
---------
Co-authored-by: Zhongkai Zhao <kanezz620@gmail.com>
* [format] applied code formatting on changed files in pull request 4926 (#5007)
Co-authored-by: github-actions <github-actions@github.com>
* [Inference] Fix bug in ChatGLM2 Tensor Parallelism (#5014)
* fix bug
* fix
* fix multiquery
* fix multiquery
---------
Co-authored-by: CjhHa1 <cjh18671720497outlook.com>
* [misc] add code owners (#5024)
* [moe] support optimizer checkpoint (#5015)
* Refactor MoE Manager setup method
* unshard optim ckpt
* optim io
* update transformer version
* update requirements
* update ckpt
* update ckpt
* update ckpt
* fix engine
* fix engine
* Support mtbench (#5025)
Co-authored-by: Xu Yuanchen <yuanchen.xu00@gmail.com>
* [moe]: fix ep/tp tests, add hierarchical all2all (#4982)
* fix: add warning for EP different behavior
* fix: use shard_data in ep & tp model
* to: add used_capacity
* fix: fix router test
* feat: add create_ep_node_group
* feat: add create_ep_hierarchical_group fn
* feat: add HierarchicalAllToAll
* test: add hierarchical all2all test
* fix: fix test errors
* fix: simplify create_ep_hierarchical_group
* fix: add hierarchical_alltoall arg
* fix: fix environ typo
* revert: revert process mesh order
* to: add todo mark
* fix: skip hierarchical_comm if torch < 1.13.1
* [shardformer] Fix serialization error with Tensor Parallel state saving (#5018)
* Fix serialization error with Tensor Parallel state saving
* Refactor state_dict CPU transfer using tree_map
* [gemini] gemini support tensor parallelism. (#4942)
* [colossalai]fix typo
* [inference] Add smmoothquant for llama (#4904)
* [inference] add int8 rotary embedding kernel for smoothquant (#4843)
* [inference] add smoothquant llama attention (#4850)
* add smoothquant llama attention
* remove uselss code
* remove useless code
* fix import error
* rename file name
* [inference] add silu linear fusion for smoothquant llama mlp (#4853)
* add silu linear
* update skip condition
* catch smoothquant cuda lib exception
* prcocess exception for tests
* [inference] add llama mlp for smoothquant (#4854)
* add llama mlp for smoothquant
* fix down out scale
* remove duplicate lines
* add llama mlp check
* delete useless code
* [inference] add smoothquant llama (#4861)
* add smoothquant llama
* fix attention accuracy
* fix accuracy
* add kv cache and save pretrained
* refactor example
* delete smooth
* refactor code
* [inference] add smooth function and delete useless code for smoothquant (#4895)
* add smooth function and delete useless code
* update datasets
* remove duplicate import
* delete useless file
* refactor codes (#4902)
* rafactor code
* add license
* add torch-int and smoothquant license
* Update flash_attention_patch.py
To be compatible with the new change in the Transformers library, where a new argument 'padding_mask' was added to forward function of attention layer.
https://github.com/huggingface/transformers/pull/25598
* [kernel] support pure fp16 for cpu adam and update gemini optim tests (#4921)
* [kernel] support pure fp16 for cpu adam (#4896)
* [kernel] fix cpu adam kernel for pure fp16 and update tests (#4919)
* [kernel] fix cpu adam
* [test] update gemini optim test
* [format] applied code formatting on changed files in pull request 4908 (#4918)
Co-authored-by: github-actions <github-actions@github.com>
* [gemini] support gradient accumulation (#4869)
* add test
* fix no_sync bug in low level zero plugin
* fix test
* add argument for grad accum
* add grad accum in backward hook for gemini
* finish implementation, rewrite tests
* fix test
* skip stuck model in low level zero test
* update doc
* optimize communication & fix gradient checkpoint
* modify doc
* cleaning codes
* update cpu adam fp16 case
* [hotfix] fix torch 2.0 compatibility (#4936)
* [hotfix] fix launch
* [test] fix test gemini optim
* [shardformer] fix vit
* [test] add no master test for low level zero plugin (#4934)
* [format] applied code formatting on changed files in pull request 4820 (#4886)
Co-authored-by: github-actions <github-actions@github.com>
* [nfc] fix some typo with colossalai/ docs/ etc. (#4920)
* [Refactor] Integrated some lightllm kernels into token-attention (#4946)
* add some req for inference
* clean codes
* add codes
* add some lightllm deps
* clean codes
* hello
* delete rms files
* add some comments
* add comments
* add doc
* add lightllm deps
* add lightllm cahtglm2 kernels
* add lightllm cahtglm2 kernels
* replace rotary embedding with lightllm kernel
* add some commnets
* add some comments
* add some comments
* add
* replace fwd kernel att1
* fix a arg
* add
* add
* fix token attention
* add some comments
* clean codes
* modify comments
* fix readme
* fix bug
* fix bug
---------
Co-authored-by: cuiqing.li <lixx336@gmail.com>
Co-authored-by: CjhHa1 <cjh18671720497@outlook.com>
* [test] merge old components to test to model zoo (#4945)
* [test] add custom models in model zoo
* [test] update legacy test
* [test] update model zoo
* [test] update gemini test
* [test] remove components to test
* [inference] add reference and fix some bugs (#4937)
* add reference and fix some bugs
* update gptq init
---------
Co-authored-by: Xu Kai <xukai16@foxamil.com>
* [Inference]ADD Bench Chatglm2 script (#4963)
* add bench chatglm
* fix bug and make utils
---------
Co-authored-by: CjhHa1 <cjh18671720497outlook.com>
* [Pipeline inference] Combine kvcache with pipeline inference (#4938)
* merge kvcache with pipeline inference and refactor the code structure
* support ppsize > 2
* refactor pipeline code
* do pre-commit
* modify benchmark
* fix bench mark
* polish code
* add docstring and update readme
* refactor the code
* fix some logic bug of ppinfer
* polish readme
* fix typo
* skip infer test
* updated c++17 compiler flags (#4983)
* [Inference] Dynamic Batching Inference, online and offline (#4953)
* [inference] Dynamic Batching for Single and Multiple GPUs (#4831)
* finish batch manager
* 1
* first
* fix
* fix dynamic batching
* llama infer
* finish test
* support different lengths generating
* del prints
* del prints
* fix
* fix bug
---------
Co-authored-by: CjhHa1 <cjh18671720497outlook.com>
* [inference] Async dynamic batching (#4894)
* finish input and output logic
* add generate
* test forward
* 1
* [inference]Re push async dynamic batching (#4901)
* adapt to ray server
* finish async
* finish test
* del test
---------
Co-authored-by: yuehuayingxueluo <867460659@qq.com>
* Revert "[inference]Re push async dynamic batching (#4901)" (#4905)
This reverts commit fbf3c09e673794ed18c91d4bab1a7dfea052e95a.
* Revert "[inference] Async dynamic batching (#4894)"
This reverts commit fced14025043e29ce816b315f440601188f7f79f.
* Revert "[inference] Async dynamic batching (#4894)" (#4909)
This reverts commit fced14025043e29ce816b315f440601188f7f79f.
* Add Ray Distributed Environment Init Scripts
* support DynamicBatchManager base function
* revert _set_tokenizer version
* add driver async generate
* add async test
* fix bugs in test_ray_dist.py
* add get_tokenizer.py
* fix code style
* fix bugs about No module named 'pydantic' in ci test
* fix bugs in ci test
* fix bugs in ci test
* fix bugs in ci test
* [infer]Add Ray Distributed Environment Init Scripts (#4911)
* Revert "[inference] Async dynamic batching (#4894)"
This reverts commit fced14025043e29ce816b315f440601188f7f79f.
* Add Ray Distributed Environment Init Scripts
* support DynamicBatchManager base function
* revert _set_tokenizer version
* add driver async generate
* add async test
* fix bugs in test_ray_dist.py
* add get_tokenizer.py
* fix code style
* fix bugs about No module named 'pydantic' in ci test
* fix bugs in ci test
* fix bugs in ci test
* fix bugs in ci test
* support dynamic batch for bloom model and is_running function
* [Inference]Test for new Async engine (#4935)
* infer engine
* infer engine
* test engine
* test engine
* new manager
* change step
* add
* test
* fix
* fix
* finish test
* finish test
* finish test
* finish test
* add license
---------
Co-authored-by: yuehuayingxueluo <867460659@qq.com>
* add assertion for config (#4947)
* [Inference] Finish dynamic batching offline test (#4948)
* test
* fix test
* fix quant
* add default
* fix
* fix some bugs
* fix some bugs
* fix
* fix bug
* fix bugs
* reset param
---------
Co-authored-by: yuehuayingxueluo <867460659@qq.com>
Co-authored-by: Cuiqing Li <lixx3527@gmail.com>
Co-authored-by: CjhHa1 <cjh18671720497outlook.com>
* [Kernels]Updated Triton kernels into 2.1.0 and adding flash-decoding for llama token attention (#4965)
* adding flash-decoding
* clean
* adding kernel
* adding flash-decoding
* add integration
* add
* adding kernel
* adding kernel
* adding triton 2.1.0 features for inference
* update bloom triton kernel
* remove useless vllm kernels
* clean codes
* fix
* adding files
* fix readme
* update llama flash-decoding
---------
Co-authored-by: cuiqing.li <lixx336@gmail.com>
* fix ColossalEval (#4992)
Co-authored-by: Xu Yuanchen <yuanchen.xu00@gmail.com>
* [doc]Update doc for colossal-inference (#4989)
* update doc
* Update README.md
---------
Co-authored-by: cuiqing.li <lixx336@gmail.com>
* [hotfix] Fix the bug where process groups were not being properly released. (#4940)
* Fix the bug where process groups were not being properly released.
* test
* Revert "test"
This reverts commit 479900c1398637310abf92eefa3cd168038ea02f.
* [hotfix] fix the bug of repeatedly storing param group (#4951)
* [doc] add supported feature diagram for hybrid parallel plugin (#4996)
* [Pipeline Inference] Merge pp with tp (#4993)
* refactor pipeline into new CaiInferEngine
* updata llama modeling forward
* merge tp with pp
* update docstring
* optimize test workflow and example
* fix typo
* add assert and todo
* [release] update version (#4995)
* [release] update version
* [hotfix] fix ci
* [gemini] gemini support tp
[gemini] gemini support tp
[gemini] gemini support tp
[gemini] gemini support tp
[gemini] gemini support tp
* fix
fix
fix
* update checkpointIO
update checkpointIO
update checkpointIO
update checkpointIO
update checkpointIO
update checkpointIO
update checkpointIO
update checkpointIO
update checkpointIO
* support fused layernorm
support fused layernorm
support fused layernorm
* update fusedlayernorm
update fusedlayernorm
update fusedlayernorm
* add sequence parallel to gemini
add sequence parallel to gemini
* fix
* fix comments
fix comments
fix comments
* fix
* fix t5
* clear cache
* fix
* activate ci
* activate ci
* fix
* fix
* fix
* fix
* revert
* modify tp gather method
modify tp gather method
modify tp gather method
modify tp gather method
* fix test
---------
Co-authored-by: Xu Kai <xukai16@foxmail.com>
Co-authored-by: Zian(Andy) Zheng <62330719+Orion-Zheng@users.noreply.github.com>
Co-authored-by: Hongxin Liu <lhx0217@gmail.com>
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: github-actions <github-actions@github.com>
Co-authored-by: Baizhou Zhang <eddiezhang@pku.edu.cn>
Co-authored-by: Zhongkai Zhao <kanezz620@gmail.com>
Co-authored-by: digger yu <digger-yu@outlook.com>
Co-authored-by: Cuiqing Li <lixx3527@gmail.com>
Co-authored-by: cuiqing.li <lixx336@gmail.com>
Co-authored-by: CjhHa1 <cjh18671720497@outlook.com>
Co-authored-by: Xu Kai <xukai16@foxamil.com>
Co-authored-by: Jianghai <72591262+CjhHa1@users.noreply.github.com>
Co-authored-by: Bin Jia <45593998+FoolPlayer@users.noreply.github.com>
Co-authored-by: アマデウス <kurisusnowdeng@users.noreply.github.com>
Co-authored-by: yuehuayingxueluo <867460659@qq.com>
Co-authored-by: Yuanchen <70520919+chengeharrison@users.noreply.github.com>
Co-authored-by: Xu Yuanchen <yuanchen.xu00@gmail.com>
Co-authored-by: littsk <1214689160@qq.com>
Co-authored-by: ppt0011 <143150326+ppt0011@users.noreply.github.com>
* [hotfix] Suport extra_kwargs in ShardConfig (#5031)
* [refactor]: replace inference args with extra_kwargs in ShardConfig
* modify shardconfig
* polish code
* fix policy bug in llama
* fix bug in auto policy
* remove setattr in ShardConfig
* fix wrong EOS token in ColossalChat
* [Kernels]Update triton kernels into 2.1.0 (#5046)
* update flash-context-attention
* adding kernels
* fix
* reset
* add build script
* add building process
* add llama2 exmaple
* add colossal-llama2 test
* clean
* fall back test setting
* fix test file
* clean
* clean
* clean
---------
Co-authored-by: cuiqing.li <lixx336@gmail.com>
* [pipeline,shardformer] Fix p2p efficiency in pipeline, allow skipping loading weight not in weight_map when `strict=False`, fix llama flash attention forward, add flop estimation by megatron in llama benchmark (#5017)
* Use p2p
* Cannot bidirectonal send p2p
* Refactor tensor creation and serialization in P2P
communication
* Fix llama forward args in flash attention
* Add flop estimate from megatron
* Support loading weight not in weight_map when strict=False in hybrid_parallel
* Use send_forward_recv_backward, etc in 1f1b
* Use dataclass for metdata
Remove torch.cuda.synchronize() as suggested
* Add comment about the torch.cuda.synchronize for potential error
* Typo
* Update hybrid_parallel_checkpoint_io.py
* Update p2p.py
* Update one_f_one_b.py
* Update p2p.py
---------
Co-authored-by: flybird11111 <1829166702@qq.com>
* [gemini] gemini support extra-dp (#5043)
* support ddp
* fix
* fix
* fix
fix
* support ddp
* fix
* fix
* fix
fix
* simplify tests
* fix
* fix
* fix
fix
fix
* fix
* [shardformer] fix llama error when transformers upgraded. (#5055)
* fix-llama
* Update llama.py
* [hotfix]: modify create_ep_hierarchical_group and add test (#5032)
* feat: modify create_ep_hierarchical_group args
* test: add ep tests
* fix: remove get_process_group_ranks
* fix: fix src_rank
* [exampe] fix llama example' loss error when using gemini plugin (#5060)
fix llama example
* [inference] Refactor inference architecture (#5057)
* [inference] support only TP (#4998)
* support only tp
* enable tp
* add support for bloom (#5008)
* [refactor] refactor gptq and smoothquant llama (#5012)
* refactor gptq and smoothquant llama
* fix import error
* fix linear import torch-int
* fix smoothquant llama import error
* fix import accelerate error
* fix bug
* fix import smooth cuda
* fix smoothcuda
* [Inference Refactor] Merge chatglm2 with pp and tp (#5023)
merge chatglm with pp and tp
* [Refactor] remove useless inference code (#5022)
* remove useless code
* fix quant model
* fix test import bug
* mv original inference legacy
* fix chatglm2
* [Refactor] refactor policy search and quant type controlling in inference (#5035)
* [Refactor] refactor policy search and quant type controling in inference
* [inference] update readme (#5051)
* update readme
* update readme
* fix architecture
* fix table
* fix table
* [inference] udpate example (#5053)
* udpate example
* fix run.sh
* fix rebase bug
* fix some errors
* update readme
* add some features
* update interface
* update readme
* update benchmark
* add requirements-infer
---------
Co-authored-by: Bin Jia <45593998+FoolPlayer@users.noreply.github.com>
Co-authored-by: Zhongkai Zhao <kanezz620@gmail.com>
* [Kernels]added flash-decoidng of triton (#5063)
* added flash-decoidng of triton based on lightllm kernel
* add req
* clean
* clean
* delete build.sh
---------
Co-authored-by: cuiqing.li <lixx336@gmail.com>
* [misc] remove outdated submodule (#5070)
* [npu] add npu support for gemini and zero (#5067)
* [npu] setup device utils (#5047)
* [npu] add npu device support
* [npu] support low level zero
* [test] update npu zero plugin test
* [hotfix] fix import
* [test] recover tests
* [npu] gemini support npu (#5052)
* [npu] refactor device utils
* [gemini] support npu
* [example] llama2+gemini support npu
* [kernel] add arm cpu adam kernel (#5065)
* [kernel] add arm cpu adam
* [optim] update adam optimizer
* [kernel] arm cpu adam remove bf16 support
* [hotfix/hybridengine] fix bug when tp*pp size = 1 (#5069)
* [inference] update examples and engine (#5073)
* update examples and engine
* fix choices
* update example
* [format] applied code formatting on changed files in pull request 5067 (#5072)
Co-authored-by: github-actions <github-actions@github.com>
* [hotfix/hybridengine] Fix init model with random parameters in benchmark (#5074)
* fix init model with random parameters
* fix example
* [inference] refactor examples and fix schedule (#5077)
* [setup] refactor infer setup
* [hotfix] fix infenrece behavior on 1 1 gpu
* [exmaple] refactor inference examples
* fix thrust-transform-reduce error (#5078)
* [nfc] fix typo in docs/ (#4972)
* [nfc] fix typo and author name (#5089)
* [gemini]fix gemini optimzer, saving Shardformer in Gemini got list assignment index out of range (#5085)
* [Hotfix] Fix model policy matching strategy in ShardFormer (#5064)
* hotfix/Fix get model policy strategy in ShardFormer
* fix bug in auto policy
* [shardformer]fix flash attention, when mask is casual, just don't unpad it (#5084)
* fix flash attn
* fix
fix
* [npu] add npu support for hybrid plugin and llama (#5090)
* llama 3d
* update
* fix autocast
* [Feature] Add document retrieval QA (#5020)
* add langchain
* add langchain
* Add files via upload
* add langchain
* fix style
* fix style: remove extra space
* add pytest; modified retriever
* add pytest; modified retriever
* add tests to build_on_pr.yml
* fix build_on_pr.yml
* fix build on pr; fix environ vars
* seperate unit tests for colossalqa from build from pr
* fix container setting; fix environ vars
* commented dev code
* add incremental update
* remove stale code
* fix style
* change to sha3 224
* fix retriever; fix style; add unit test for document loader
* fix ci workflow config
* fix ci workflow config
* add set cuda visible device script in ci
* fix doc string
* fix style; update readme; refactored
* add force log info
* change build on pr, ignore colossalqa
* fix docstring, captitalize all initial letters
* fix indexing; fix text-splitter
* remove debug code, update reference
* reset previous commit
* update LICENSE update README add key-value mode, fix bugs
* add files back
* revert force push
* remove junk file
* add test files
* fix retriever bug, add intent classification
* change conversation chain design
* rewrite prompt and conversation chain
* add ui v1
* ui v1
* fix atavar
* add header
* Refactor the RAG Code and support Pangu
* Refactor the ColossalQA chain to Object-Oriented Programming and the UI demo.
* resolved conversation. tested scripts under examples. web demo still buggy
* fix ci tests
* Some modifications to add ChatGPT api
* modify llm.py and remove unnecessary files
* Delete applications/ColossalQA/examples/ui/test_frontend_input.json
* Remove OpenAI api key
* add colossalqa
* move files
* move files
* move files
* move files
* fix style
* Add Readme and fix some bugs.
* Add something to readme and modify some code
* modify a directory name for clarity
* remove redundant directory
* Correct a type in llm.py
* fix AI prefix
* fix test_memory.py
* fix conversation
* fix some erros and typos
* Fix a missing import in RAG_ChatBot.py
* add colossalcloud LLM wrapper, correct issues in code review
---------
Co-authored-by: YeAnbang <anbangy2@outlook.com>
Co-authored-by: Orion-Zheng <zheng_zian@u.nus.edu>
Co-authored-by: Zian(Andy) Zheng <62330719+Orion-Zheng@users.noreply.github.com>
Co-authored-by: Orion-Zheng <zhengzian@u.nus.edu>
* remove duplicate import (#5100)
* fix typo change lazy_iniy to lazy_init (#5099)
* [nfc] fix typo change directoty to directory (#5111)
* [FEATURE] Add Safety Eval Datasets to ColossalEval (#5095)
* add safetybench and cvalues(responsibility) eval dataset
* Modify code according to review suggestions
---------
Co-authored-by: Orion-Zheng <zhengzian@u.nus.edu>
* [hotfix] fixed memory usage of shardformer module replacement (#5122)
* [shardformer]: support gpt-j, falcon, Mistral and add interleaved pipeline for bert (#5088)
* [shardformer] implement policy for all GPT-J models and test
* [shardformer] support interleaved pipeline parallel for bert finetune
* [shardformer] shardformer support falcon (#4883)
* [shardformer]: fix interleaved pipeline for bert model (#5048)
* [hotfix]: disable seq parallel for gptj and falcon, and polish code (#5093)
* Add Mistral support for Shardformer (#5103)
* [shardformer] add tests to mistral (#5105)
---------
Co-authored-by: Pengtai Xu <henryxu880@gmail.com>
Co-authored-by: ppt0011 <143150326+ppt0011@users.noreply.github.com>
Co-authored-by: flybird11111 <1829166702@qq.com>
Co-authored-by: eric8607242 <e0928021388@gmail.com>
* [doc] add moe news (#5128)
* [doc] add moe news
* [doc] add moe news
* [doc] add moe news
* [doc] updated paper citation (#5131)
* fix typo change JOSNL TO JSONL etc. (#5116)
* [format] applied code formatting on changed files in pull request 5088 (#5127)
Co-authored-by: github-actions <github-actions@github.com>
* [format] applied code formatting on changed files in pull request 5124 (#5125)
Co-authored-by: github-actions <github-actions@github.com>
* [format] applied code formatting on changed files in pull request 5115 (#5118)
Co-authored-by: github-actions <github-actions@github.com>
* [accelerator] init the accelerator module (#5129)
* [accelerator] init the accelerator module
* polish code
* polish code
* polish code
* polish code
* [npu] support triangle attention for llama (#5130)
* update fused attn
* update spda
* tri attn
* update triangle
* import
* fix
* fix
* [plugin]fix 3d checkpoint load when booster boost without optimizer. (#5135)
* fix 3d checkpoint load when booster boost without optimizer
fix 3d checkpoint load when booster boost without optimizer
* test ci
* revert ci
* fix
fix
* [ColossalQA] refactor server and webui & add new feature (#5138)
* refactor server and webui & add new feature
* add requirements
* modify readme and ui
* [doc] fix colossalqa document (#5146)
* fix doc
* modify doc
* fix (#5158)
fix
* [Colossal-Llama-2] Add finetuning Colossal-Llama-2 example (#4878)
* Add finetuning Colossal-Llama-2 example
* Add finetuning Colossal-Llama-2 example 2
* Add finetuning Colossal-Llama-2 example and support NEFTuning
* Add inference example and refine neftune
* Modify readme file
* update the imports
---------
Co-authored-by: Xu Yuanchen <yuanchen.xu00@gmail.com>
Co-authored-by: Camille Zhong <44392324+Camille7777@users.noreply.github.com>
* [gemini] hotfix NaN loss while using Gemini + tensor_parallel (#5150)
* fix
aaa
fix
fix
fix
* fix
* fix
* test ci
* fix ci
fix
* [colossalqa] fix pangu api (#5170)
* fix pangu api
* add comment
* [ColossalEval] Support GSM, Data Leakage Evaluation and Tensor Parallel (#5169)
* Support GSM, Data Leakage Evaluation and Tensor Parallel
* remove redundant code and update inference.py in examples/gpt_evaluation
---------
Co-authored-by: Xu Yuanchen <yuanchen.xu00@gmail.com>
* [shardformer] llama support DistCrossEntropy (#5176)
* fix
aaa
fix
fix
fix
* fix
* fix
* test ci
* fix ci
fix
* llama support dist-cross
fix
fix
fix
fix
fix
fix
fix
fix
* fix
* fix
* fix
fix
* test ci
* test ci
* fix
* [Colossal-Llama-2] Add finetuning Colossal-Llama-2 example (#4878)
* Add finetuning Colossal-Llama-2 example
* Add finetuning Colossal-Llama-2 example 2
* Add finetuning Colossal-Llama-2 example and support NEFTuning
* Add inference example and refine neftune
* Modify readme file
* update the imports
---------
Co-authored-by: Xu Yuanchen <yuanchen.xu00@gmail.com>
Co-authored-by: Camille Zhong <44392324+Camille7777@users.noreply.github.com>
* llama support dist-cross
fix
fix
fix
fix
fix
fix
fix
fix
* fix
* fix
* fix
fix
* test ci
* test ci
* fix
* fix ci
* fix ci
---------
Co-authored-by: Yuanchen <70520919+chengeharrison@users.noreply.github.com>
Co-authored-by: Xu Yuanchen <yuanchen.xu00@gmail.com>
Co-authored-by: Camille Zhong <44392324+Camille7777@users.noreply.github.com>
* Fix ColossalEval (#5186)
Co-authored-by: Xu Yuanchen <yuanchen.xu00@gmail.com>
* [doc] update pytorch version in documents. (#5177)
* fix
aaa
fix
fix
fix
* fix
* fix
* test ci
* fix ci
fix
* update pytorch version in documents
* polish readme in application/chat (#5194)
* [pipeline]: fix p2p comm, add metadata cache and support llama interleaved pp (#5134)
* test: add more p2p tests
* fix: remove send_forward_recv_forward as p2p op list need to use the same group
* fix: make send and receive atomic
* feat: update P2PComm fn
* feat: add metadata cache in 1f1b
* feat: add metadata cache in interleaved pp
* feat: modify is_xx_stage fn
* revert: add _broadcast_object_list
* feat: add interleaved pp in llama policy
* feat: set NCCL_BUFFSIZE in HybridParallelPlugin
* Improve logic for selecting metrics (#5196)
Co-authored-by: Xu <yuanchen.xu00@gmail.com>
* [doc] Update required third-party library list for testing and torch comptibility checking (#5207)
* doc/update requirements-test.txt
* update torch-cuda compatibility check
* support linear accumulation fusion (#5199)
support linear accumulation fusion
support linear accumulation fusion
fix
* [pipeline]: support arbitrary batch size in forward_only mode (#5201)
* fix: remove drop last in val & test dataloader
* feat: add run_forward_only, support arbitrary bs
* chore: modify ci script
* [pipeline]: add p2p fallback order and fix interleaved pp deadlock (#5214)
* fix: add fallback order option and update 1f1b
* fix: fix deadlock comm in interleaved pp
* test: modify p2p test
* [devops] update torch versoin in ci (#5217)
* fix-test (#5210)
fix-test
fix-test
* fix flash attn (#5209)
* [nfc] fix typo colossalai/shardformer/ (#5133)
* [Colossal-LLaMA-2] Release Colossal-LLaMA-2-13b-base model (#5224)
* update readme
* update readme
* update link
* update
* update readme
* update
* update
* update
* update title
* update example
* update example
* fix content
* add conclusion
* add license
* update
* update
* update version
* fix minor
* [doc] Update README.md of Colossal-LLAMA2 (#5233)
* Update README.md
* Update README.md
* [doc] Make leaderboard format more uniform and good-looking (#5231)
* Make leaderboard format more unifeid and good-looking
* Update README.md
* Update README.md
* [doc] add Colossal-LLaMA-2-13B (#5234)
* [doc] add Colossal-LLaMA-2-13B
* [doc] add Colossal-LLaMA-2-13B
* [doc] add Colossal-LLaMA-2-13B
* [format] applied code formatting on changed files in pull request 5234 (#5235)
Co-authored-by: github-actions <github-actions@github.com>
* [doc] SwiftInfer release (#5236)
* [doc] SwiftInfer release
* [doc] SwiftInfer release
* [doc] SwiftInfer release
* [doc] SwiftInfer release
* [doc] SwiftInfer release
* [npu] use extension for op builder (#5172)
* update extension
* update cpu adam
* update is
* add doc for cpu adam
* update kernel
* update commit
* update flash
* update memory efficient
* update flash attn
* update flash attention loader
* update api
* fix
* update doc
* update example time limit
* reverse change
* fix doc
* remove useless kernel
* fix
* not use warning
* update
* update
* [pipeline] A more general _communicate in p2p (#5062)
* A more general _communicate
* feat: finish tree_flatten version p2p
* fix: update p2p api calls
---------
Co-authored-by: Wenhao Chen <cwher@outlook.com>
* [npu] change device to accelerator api (#5239)
* update accelerator
* fix timer
* fix amp
* update
* fix
* update bug
* add error raise
* fix autocast
* fix set device
* remove doc accelerator
* update doc
* update doc
* update doc
* use nullcontext
* update cpu
* update null context
* change time limit for example
* udpate
* update
* update
* update
* [npu] polish accelerator code
---------
Co-authored-by: Xuanlei Zhao <xuanlei.zhao@gmail.com>
Co-authored-by: zxl <43881818+oahzxl@users.noreply.github.com>
* [hotfix] removed unused flag (#5242)
* [doc] fix typo in Colossal-LLaMA-2/README.md (#5247)
* [workflow] fixed build CI (#5240)
* [workflow] fixed build CI
* polish
* polish
* polish
* polish
* polish
* [ci] fixed booster test (#5251)
* [ci] fixed booster test
* [ci] fixed booster test
* [ci] fixed booster test
* [ci] fixed ddp test (#5254)
* [ci] fixed ddp test
* polish
* fix typo in applications/ColossalEval/README.md (#5250)
* [ci] fix shardformer tests. (#5255)
* fix ci
fix
* revert: revert p2p
* feat: add enable_metadata_cache option
* revert: enable t5 tests
---------
Co-authored-by: Wenhao Chen <cwher@outlook.com>
* [doc] fix doc typo (#5256)
* [doc] fix annotation display
* [doc] fix llama2 doc
* [hotfix]: add pp sanity check and fix mbs arg (#5268)
* fix: fix misleading mbs arg
* feat: add pp sanity check
* fix: fix 1f1b sanity check
* [workflow] fixed incomplete bash command (#5272)
* [workflow] fixed oom tests (#5275)
* [workflow] fixed oom tests
* polish
* polish
* polish
* [ci] fix test_hybrid_parallel_plugin_checkpoint_io.py (#5276)
* fix ci
fix
* fix test
* revert: revert p2p
* feat: add enable_metadata_cache option
* revert: enable t5 tests
* fix
---------
Co-authored-by: Wenhao Chen <cwher@outlook.com>
* [shardformer] hybridparallelplugin support gradients accumulation. (#5246)
* support gradients acc
fix
fix
fix
fix
fix
fix
fix
fix
fix
fix
fix
fix
fix
* fix
fix
* fix
fix
fix
* [hotfix] Fix ShardFormer test execution path when using sequence parallelism (#5230)
* fix auto loading gpt2 tokenizer (#5279)
* [doc] add llama2-13B disyplay (#5285)
* Update README.md
* fix 13b typo
---------
Co-authored-by: binmakeswell <binmakeswell@gmail.com>
* fix llama pretrain (#5287)
* [hotfix] fix 3d plugin test (#5292)
* fix bug for mefture (#5299)
* [NFC] polish applications/Colossal-LLaMA-2/colossal_llama2/tokenizer/init_tokenizer.py code style (#5228)
* fix some typo (#5307)
* [feat] refactored extension module (#5298)
* [feat] refactored extension module
* polish
* polish
* polish
* polish
* polish
* polish
* polish
* polish
* polish
* polish
* [workflow] updated CI image (#5318)
* [accelerator] fixed npu api
* [tests] fix t5 test. (#5322)
* [ci] fix shardformer tests. (#5255)
* fix ci
fix
* revert: revert p2p
* feat: add enable_metadata_cache option
* revert: enable t5 tests
---------
Co-authored-by: Wenhao Chen <cwher@outlook.com>
* fix t5 test
---------
Co-authored-by: Wenhao Chen <cwher@outlook.com>
* [doc] added docs for extensions (#5324)
* [doc] added docs for extensions
* polish
* polish
* fix typo under extensions/ (#5330)
* fix typo change dosen't to doesn't (#5308)
* [extension] fixed exception catch (#5342)
* [Chat] fix sft loss nan (#5345)
* fix script
* fix script
* fix chat nan
* fix chat nan
* [checkpointio] fix gemini and hybrid parallel optim checkpoint (#5347)
* [checkpointio] fix hybrid parallel optim checkpoint
* [extension] fix cuda extension
* [checkpointio] fix gemini optimizer checkpoint
* polish code
* [fix] remove unnecessary dp_size assert (#5351)
* fix: remove unnecessary assert
* test: add more 3d plugin tests
* fix: add warning
* [gemini] fix param op hook when output is tuple (#5355)
* [gemini] fix param op hook when output is tuple
* [gemini] fix param op hook
* [llama] fix dataloader for hybrid parallel (#5358)
* [plugin] refactor prepare dataloader
* [plugin] update train script
* [llama] update training script (#5360)
* [llama] update training script
* [doc] polish docstr
* [llama] add flash attn patch for npu (#5362)
* [llama] fix neftune & pbar with start_step (#5364)
* [eval] update llama npu eval (#5366)
* [llama] polish training script and fix optim ckpt (#5368)
* [lr-scheduler] fix load state dict and add test (#5369)
* [llama] fix memory issue (#5371)
* [llama] fix memory issue
* [llama] add comment
* [moe] init mixtral impl
* [moe] update capacity computing (#5253)
* [moe] top2 allow uneven input
* [moe] update capacity computing
* [moe] remove debug info
* [moe] update capacity computing
* [moe] update capacity computing
* [moe] support mixtral (#5309)
* [moe] add mixtral block for single expert
* [moe] mixtral block fwd support uneven ep
* [moe] mixtral block bwd support uneven ep
* [moe] add mixtral moe layer
* [moe] simplify replace
* [meo] support save sharded mixtral
* [meo] support load sharded mixtral
* [meo] support save sharded optim
* [meo] integrate moe manager into plug
* [meo] fix optimizer load
* [meo] fix mixtral layer
* [moe] fix mixtral checkpoint io (#5314)
* [moe] fix mixtral forward default value (#5329)
* [moe] fix mixtral optim checkpoint (#5344)
* [moe] fix tests
* [release] update version (#5380)
* [llama] fix training and inference scripts (#5384)
* [llama] refactor inference example to fit sft
* [llama] fix training script to fit gemini
* [llama] fix inference script
* [doc] Fix typo (#5361)
* [doc] updated installation command (#5389)
* [hotfix] fix variable type for top_p (#5313)
Co-authored-by: binmakeswell <binmakeswell@gmail.com>
* [hotfix] Fix wrong import in meta_registry (#5392)
* [extension] hotfix jit extension setup (#5402)
* [example] reuse flash attn patch (#5400)
* [fsdp] impl save/load shard model/optimizer (#5357)
* [setup] fixed nightly release (#5388)
* [shardformer]gather llama logits (#5398)
* gather llama logits
* fix
* update requirements (#5407)
* [workflow] added pypi channel (#5412)
* [doc] fix blog link
* [doc] fix blog link
* fix sft single turn inference example (#5416)
* [example]add gpt2 benchmark example script. (#5295)
* benchmark gpt2
* fix
fix
fix
fix
* [doc] fix typo in Colossal-LLaMA-2/README.md (#5247)
* [workflow] fixed build CI (#5240)
* [workflow] fixed build CI
* polish
* polish
* polish
* polish
* polish
* [ci] fixed booster test (#5251)
* [ci] fixed booster test
* [ci] fixed booster test
* [ci] fixed booster test
* [ci] fixed ddp test (#5254)
* [ci] fixed ddp test
* polish
* fix typo in applications/ColossalEval/README.md (#5250)
* [ci] fix shardformer tests. (#5255)
* fix ci
fix
* revert: revert p2p
* feat: add enable_metadata_cache option
* revert: enable t5 tests
---------
Co-authored-by: Wenhao Chen <cwher@outlook.com>
* [doc] fix doc typo (#5256)
* [doc] fix annotation display
* [doc] fix llama2 doc
* [hotfix]: add pp sanity check and fix mbs arg (#5268)
* fix: fix misleading mbs arg
* feat: add pp sanity check
* fix: fix 1f1b sanity check
* [workflow] fixed incomplete bash command (#5272)
* [workflow] fixed oom tests (#5275)
* [workflow] fixed oom tests
* polish
* polish
* polish
* [ci] fix test_hybrid_parallel_plugin_checkpoint_io.py (#5276)
* fix ci
fix
* fix test
* revert: revert p2p
* feat: add enable_metadata_cache option
* revert: enable t5 tests
* fix
---------
Co-authored-by: Wenhao Chen <cwher@outlook.com>
* [shardformer] hybridparallelplugin support gradients accumulation. (#5246)
* support gradients acc
fix
fix
fix
fix
fix
fix
fix
fix
fix
fix
fix
fix
fix
* fix
fix
* fix
fix
fix
* [hotfix] Fix ShardFormer test execution path when using sequence parallelism (#5230)
* fix auto loading gpt2 tokenizer (#5279)
* [doc] add llama2-13B disyplay (#5285)
* Update README.md
* fix 13b typo
---------
Co-authored-by: binmakeswell <binmakeswell@gmail.com>
* fix llama pretrain (#5287)
* fix
* fix
* fix
fix
* fix
fix
fix
* fix
fix
* benchmark gpt2
* fix
fix
fix
fix
* [workflow] fixed build CI (#5240)
* [workflow] fixed build CI
* polish
* polish
* polish
* polish
* polish
* [ci] fixed booster test (#5251)
* [ci] fixed booster test
* [ci] fixed booster test
* [ci] fixed booster test
* fix
fix
* fix
fix
fix
* fix
* fix
fix
fix
fix
fix
* fix
* Update shardformer.py
---------
Co-authored-by: digger yu <digger-yu@outlook.com>
Co-authored-by: Frank Lee <somerlee.9@gmail.com>
Co-authored-by: Wenhao Chen <cwher@outlook.com>
Co-authored-by: binmakeswell <binmakeswell@gmail.com>
Co-authored-by: Zhongkai Zhao <kanezz620@gmail.com>
Co-authored-by: Michelle <97082656+MichelleMa8@users.noreply.github.com>
Co-authored-by: Desperado-Jia <502205863@qq.com>
* [doc] sora release (#5425)
* [doc] sora release
* [doc] sora release
* [doc] sora release
* [doc] sora release
* [devops] fix extention building (#5427)
* [hotfix] fix sd vit import error (#5420)
* fix import error
* Update dpt_depth.py
---------
Co-authored-by: binmakeswell <binmakeswell@gmail.com>
* [hotfix] fix typo of openmoe model source (#5403)
* [doc] update some translations with README-zh-Hans.md (#5382)
* [hotfix] fix typo change _descrption to _description (#5331)
* [hotfix] fix typo change enabel to enable under colossalai/shardformer/ (#5317)
* [eval-hotfix] set few_shot_data to None when few shot is disabled (#5422)
* [hotfix] fix typo change MoECheckpintIO to MoECheckpointIO (#5335)
Co-authored-by: binmakeswell <binmakeswell@gmail.com>
* [doc] Fix typo s/infered/inferred/ (#5288)
Signed-off-by: hugo-syn <hugo.vincent@synacktiv.com>
* [hotfix] fix stable diffusion inference bug. (#5289)
* Update train_ddp.yaml
delete "strategy" to fix DDP config loading bug in "main.py"
* Update train_ddp.yaml
fix inference with scripts/txt2img.py config file load bug.
* Update README.md
add pretrain model test code.
* [colossal-llama2] add stream chat examlple for chat version model (#5428)
* add stream chat for chat version
* remove os.system clear
* modify function name
* [release] update version (#5411)
* fix tensor data update for gemini loss caluculation (#5442)
* [hotfix] fix typo s/keywrods/keywords etc. (#5429)
* [devops] fix compatibility (#5444)
* [devops] fix compatibility
* [hotfix] update compatibility test on pr
* [devops] fix compatibility
* [devops] record duration during comp test
* [test] decrease test duration
* fix falcon
* [shardformer] fix gathering output when using tensor parallelism (#5431)
* fix
* padding vocab_size when using pipeline parallellism
padding vocab_size when using pipeline parallellism
fix
fix
* fix
* fix
fix
fix
* fix gather output
* fix
* fix
* fix
fix resize embedding
fix resize embedding
* fix resize embedding
fix
* revert
* revert
* revert
* [doc] release Open-Sora 1.0 with model weights (#5468)
* [doc] release Open-Sora 1.0 with model weights
* [doc] release Open-Sora 1.0 with model weights
* [doc] release Open-Sora 1.0 with model weights
* [doc] update open-sora demo (#5479)
* [doc] update open-sora demo
* [doc] update open-sora demo
* [doc] update open-sora demo
* [example] add grok-1 inference (#5485)
* [misc] add submodule
* remove submodule
* [example] support grok-1 tp inference
* [example] add grok-1 inference script
* [example] refactor code
* [example] add grok-1 readme
* [exmaple] add test ci
* [exmaple] update readme
* [release] grok-1 314b inference (#5490)
* [release] grok-1 inference
* [release] grok-1 inference
* [release] grok-1 inference
* [example] update Grok-1 inference (#5495)
* revise grok-1 example
* remove unused arg in scripts
* prevent re-installing torch
* update readme
* revert modifying colossalai requirements
* add perf
* trivial
* add tokenizer url
* [hotfix] set return_outputs=False in examples and polish code (#5404)
* fix: simplify merge_batch
* fix: use return_outputs=False to eliminate extra memory consumption
* feat: add return_outputs warning
* style: remove `return_outputs=False` as it is the default value
* [release] grok-1 inference benchmark (#5500)
* [release] grok-1 inference benchmark
* [release] grok-1 inference benchmark
* [release] grok-1 inference benchmark
* [release] grok-1 inference benchmark
* [release] grok-1 inference benchmark
* [shardformer]Fix lm parallel. (#5480)
* fix
* padding vocab_size when using pipeline parallellism
padding vocab_size when using pipeline parallellism
fix
fix
* fix
* fix
fix
fix
* fix gather output
* fix
* fix
* fix
fix resize embedding
fix resize embedding
* fix resize embedding
fix
* revert
* revert
* revert
* fix lm forward distribution
* fix
* test ci
* fix
* [fix] fix grok-1 example typo (#5506)
* [devops] fix example test ci (#5504)
* Fix ColoTensorSpec for py11 (#5440)
* fixed layout converter caching and updated tester
* Empty-Commit
* [shardformer] update colo attention to support custom mask (#5510)
* [feature] refactor colo attention (#5462)
* [extension] update api
* [feature] add colo attention
* [feature] update sdpa
* [feature] update npu attention
* [feature] update flash-attn
* [test] add flash attn test
* [test] update flash attn test
* [shardformer] update modeling to fit colo attention (#5465)
* [misc] refactor folder structure
* [shardformer] update llama flash-attn
* [shardformer] fix llama policy
* [devops] update tensornvme install
* [test] update llama test
* [shardformer] update colo attn kernel dispatch
* [shardformer] update blip2
* [shardformer] update chatglm
* [shardformer] update gpt2
* [shardformer] update gptj
* [shardformer] update opt
* [shardformer] update vit
* [shardformer] update colo attention mask prep
* [shardformer] update whisper
* [test] fix shardformer tests (#5514)
* [test] fix shardformer tests
* [test] fix shardformer tests
* [format] applied code formatting on changed files in pull request 5510 (#5517)
Co-authored-by: github-actions <github-actions@github.com>
* [shardformer] fix pipeline forward error if custom layer distribution is used (#5189)
* Use self.[distribute_layers|get_stage_index] to exploit custom layer distribution
* Change static methods for t5 layer distribution to member functions
* Change static methods for whisper layer distribution to member functions
* Replace whisper policy usage with self one
* Fix test case to use non-static layer distribution methods
* fix: fix typo
---------
Co-authored-by: Wenhao Chen <cwher@outlook.com>
* [Fix] Grok-1 use tokenizer from the same pretrained path (#5532)
* [fix] use tokenizer from the same pretrained path
* trust remote code
* [ColossalChat] Update RLHF V2 (#5286)
* Add dpo. Fix sft, ppo, lora. Refactor all
* fix and tested ppo
* 2 nd round refactor
* add ci tests
* fix ci
* fix ci
* fix readme, style
* fix readme style
* fix style, fix benchmark
* reproduce benchmark result, remove useless files
* rename to ColossalChat
* use new image
* fix ci workflow
* fix ci
* use local model/tokenizer for ci tests
* fix ci
* fix ci
* fix ci
* fix ci timeout
* fix rm progress bar. fix ci timeout
* fix ci
* fix ci typo
* remove 3d plugin from ci temporary
* test environment
* cannot save optimizer
* support chat template
* fix readme
* fix path
* test ci locally
* restore build_or_pr
* fix ci data path
* fix benchmark
* fix ci, move ci tests to 3080, disable fast tokenizer
* move ci to 85
* support flash attention 2
* add all-in-one data preparation script. Fix colossal-llama2-chat chat template
* add hardware requirements
* move ci test data
* fix save_model, add unwrap
* fix missing bos
* fix missing bos; support grad accumulation with gemini
* fix ci
* fix ci
* fix ci
* fix llama2 chat template config
* debug sft
* debug sft
* fix colossalai version requirement
* fix ci
* add sanity check to prevent NaN loss
* fix requirements
* add dummy data generation script
* add dummy data generation script
* add dummy data generation script
* add dummy data generation script
* update readme
* update readme
* update readme and ignore
* fix logger bug
* support parallel_output
* modify data preparation logic
* fix tokenization
* update lr
* fix inference
* run pre-commit
---------
Co-authored-by: Tong Li <tong.li352711588@gmail.com>
* [shardformer, pipeline] add `gradient_checkpointing_ratio` and heterogenous shard policy for llama (#5508)
* feat: add `GradientCheckpointConfig` and `PipelineGradientCheckpointConfig`
* feat: apply `GradientCheckpointConfig` to policy and llama_forward
* feat: move `distribute_layer` and `get_stage_index` to PipelineStageManager
* fix: add optional args for `distribute_layer` and `get_stage_index`
* fix: fix changed API calls
* test: update llama tests
* style: polish `GradientCheckpointConfig`
* fix: fix pipeline utils tests
* fix incorrect sharding without zero (#5545)
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [shardformer] Sequence Parallelism Optimization (#5533)
* sequence parallel optimization
* validate sequence parallel in llama (code to be polished)
* shardformer api writing
* integrate sequence parallel in ShardFormer
* fix pp bugs and sp bugs for LlaMa model
* integrating ring-based sequence parallelism into ShardFormer
* [sequence parallelism]: Add fused megatron function
* integrating ring-based sequence parallelism into ShardFormer
---------
Co-authored-by: linsj20 <linsj20@mails.tsinghua.edu.cn>
* fix bugs when useing sp and flashattention together
* fix operation function name
* support flash attention for ulysses-style sp
* clarify sp process group
* fix compatibility bugs in moe plugin
* fix fused linear bugs
* fix linear layer test
* support gpt model all-to-all sp
* modify shard data dimension (meant to be dim=-1)
* support megtron-style sp and distributed attn for llama model
* [shardformer] add megatron sp to llama
* support llama7B 128k with distributed attention
* [shardformer] robustness enhancement
* add block attn
* sp mode 1: keep input as a complete sequence
* fix sp compatability
* finish sp mode 3 support for gpt
* using all_to_all_single when batch size is 1
* support mode 2 sp in gpt2 (#5)
* [shardformer] add megatron sp to llama
* support llama7B 128k with distributed attention
* [shardformer] robustness enhancement
* add block attn
* sp mode 1: keep input as a complete sequence
* fix sp compatability
* refactor ring implementation
* support mode 2 sp in gpt2
* polish code
* enable distributed attn mask when using sp mode 2 and 3 in llama
* automatically enable flash attn when using sp mode 2 and 3 in llama
* inplace attn mask
* add zero2 support for sequence parallel
* polish code
* fix bugs
* fix gemini checkpoint io
* loose tensor checking atol and rtol
* add comment
* fix llama layernorm grad
* fix zero grad
* fix zero grad
* fix conflict
* update split and gather auto grad func
* sequence parallel: inside text split (#6)
* polish code (part 1)
* polish code (part 2)
* polish code (part 2.5)
* polish code (part 3)
* sequence parallel: inside text split
* miscellaneous minor fixes
* polish code
* fix ulysses style ZeRO
* sequence parallel: inside text split
* miscellaneous minor fixes
* disaggregate sp group and dp group for sp
* fix llama and gpt sp
* polish code
* move ulysses grad sync to ddp (#9)
* remove zero_stage and unbind the grad sync for alltoall sp
* add 2d group creation test
* move ulysses grad sync to ddp
* add 2d group creation test
* remove useless code
* change shard config not to enable sp when enable_all_optimizations
* add sp warnings for several model
* remove useless code
---------
Co-authored-by: linsj20 <linsj20@mails.tsinghua.edu.cn>
* [hotfix] quick fixes to make legacy tutorials runnable (#5559)
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [fix] fix typo s/muiti-node /multi-node etc. (#5448)
* [hotfix] fix typo s/get_defualt_parser /get_default_parser (#5548)
* [devops] remove post commit ci (#5566)
* [devops] remove post commit ci
* [misc] run pre-commit on all files
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [doc] fix ColossalMoE readme (#5599)
* fix readme
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [zero] support multiple (partial) backward passes (#5596)
* [zero] support multiple (partial) backward passes
* [misc] update requirements
* [shardformer] refactor embedding resize (#5603)
* [branch rebase] rebase main to Feature/resize_embedding (#5554)
* fix
* [release] update version (#5411)
* [hotfix] fix typo s/keywrods/keywords etc. (#5429)
* [devops] fix compatibility (#5444)
* [devops] fix compatibility
* [hotfix] update compatibility test on pr
* [devops] fix compatibility
* [devops] record duration during comp test
* [test] decrease test duration
* fix falcon
* [shardformer] fix gathering output when using tensor parallelism (#5431)
* fix
* padding vocab_size when using pipeline parallellism
padding vocab_size when using pipeline parallellism
fix
fix
* fix
* fix
fix
fix
* fix gather output
* fix
* fix
* fix
fix resize embedding
fix resize embedding
* fix resize embedding
fix
* revert
* revert
* revert
* [doc] release Open-Sora 1.0 with model weights (#5468)
* [doc] release Open-Sora 1.0 with model weights
* [doc] release Open-Sora 1.0 with model weights
* [doc] release Open-Sora 1.0 with model weights
* [doc] update open-sora demo (#5479)
* [doc] update open-sora demo
* [doc] update open-sora demo
* [doc] update open-sora demo
* [example] add grok-1 inference (#5485)
* [misc] add submodule
* remove submodule
* [example] support grok-1 tp inference
* [example] add grok-1 inference script
* [example] refactor code
* [example] add grok-1 readme
* [exmaple] add test ci
* [exmaple] update readme
---------
Co-authored-by: Hongxin Liu <lhx0217@gmail.com>
Co-authored-by: digger yu <digger-yu@outlook.com>
Co-authored-by: binmakeswell <binmakeswell@gmail.com>
* [CI] run pre-commit (#5577)
* fix
* [release] update version (#5411)
* [hotfix] fix typo s/keywrods/keywords etc. (#5429)
* [devops] fix compatibility (#5444)
* [devops] fix compatibility
* [hotfix] update compatibility test on pr
* [devops] fix compatibility
* [devops] record duration during comp test
* [test] decrease test duration
* fix falcon
* [shardformer] fix gathering output when using tensor parallelism (#5431)
* fix
* padding vocab_size when using pipeline parallellism
padding vocab_size when using pipeline parallellism
fix
fix
* fix
* fix
fix
fix
* fix gather output
* fix
* fix
* fix
fix resize embedding
fix resize embedding
* fix resize embedding
fix
* revert
* revert
* revert
* [doc] release Open-Sora 1.0 with model weights (#5468)
* [doc] release Open-Sora 1.0 with model weights
* [doc] release Open-Sora 1.0 with model weights
* [doc] release Open-Sora 1.0 with model weights
* [doc] update open-sora demo (#5479)
* [doc] update open-sora demo
* [doc] update open-sora demo
* [doc] update open-sora demo
* [example] add grok-1 inference (#5485)
* [misc] add submodule
* remove submodule
* [example] support grok-1 tp inference
* [example] add grok-1 inference script
* [example] refactor code
* [example] add grok-1 readme
* [exmaple] add test ci
* [exmaple] update readme
* run pre-commit
---------
Co-authored-by: Hongxin Liu <lhx0217@gmail.com>
Co-authored-by: digger yu <digger-yu@outlook.com>
Co-authored-by: binmakeswell <binmakeswell@gmail.com>
* [rebase] rebase main to resize-embedding (#5581)
* [release] grok-1 314b inference (#5490)
* [release] grok-1 inference
* [release] grok-1 inference
* [release] grok-1 inference
* [example] update Grok-1 inference (#5495)
* revise grok-1 example
* remove unused arg in scripts
* prevent re-installing torch
* update readme
* revert modifying colossalai requirements
* add perf
* trivial
* add tokenizer url
* [hotfix] set return_outputs=False in examples and polish code (#5404)
* fix: simplify merge_batch
* fix: use return_outputs=False to eliminate extra memory consumption
* feat: add return_outputs warning
* style: remove `return_outputs=False` as it is the default value
* [release] grok-1 inference benchmark (#5500)
* [release] grok-1 inference benchmark
* [release] grok-1 inference benchmark
* [release] grok-1 inference benchmark
* [release] grok-1 inference benchmark
* [release] grok-1 inference benchmark
* [shardformer]Fix lm parallel. (#5480)
* fix
* padding vocab_size when using pipeline parallellism
padding vocab_size when using pipeline parallellism
fix
fix
* fix
* fix
fix
fix
* fix gather output
* fix
* fix
* fix
fix resize embedding
fix resize embedding
* fix resize embedding
fix
* revert
* revert
* revert
* fix lm forward distribution
* fix
* test ci
* fix
* [fix] fix grok-1 example typo (#5506)
* [devops] fix example test ci (#5504)
* Fix ColoTensorSpec for py11 (#5440)
* fixed layout converter caching and updated tester
* Empty-Commit
* [shardformer] update colo attention to support custom mask (#5510)
* [feature] refactor colo attention (#5462)
* [extension] update api
* [feature] add colo attention
* [feature] update sdpa
* [feature] update npu attention
* [feature] update flash-attn
* [test] add flash attn test
* [test] update flash attn test
* [shardformer] update modeling to fit colo attention (#5465)
* [misc] refactor folder structure
* [shardformer] update llama flash-attn
* [shardformer] fix llama policy
* [devops] update tensornvme install
* [test] update llama test
* [shardformer] update colo attn kernel dispatch
* [shardformer] update blip2
* [shardformer] update chatglm
* [shardformer] update gpt2
* [shardformer] update gptj
* [shardformer] update opt
* [shardformer] update vit
* [shardformer] update colo attention mask prep
* [shardformer] update whisper
* [test] fix shardformer tests (#5514)
* [test] fix shardformer tests
* [test] fix shardformer tests
* [format] applied code formatting on changed files in pull request 5510 (#5517)
Co-authored-by: github-actions <github-actions@github.com>
* [shardformer] fix pipeline forward error if custom layer distribution is used (#5189)
* Use self.[distribute_layers|get_stage_index] to exploit custom layer distribution
* Change static methods for t5 layer distribution to member functions
* Change static methods for whisper layer distribution to member functions
* Replace whisper policy usage with self one
* Fix test case to use non-static layer distribution methods
* fix: fix typo
---------
Co-authored-by: Wenhao Chen <cwher@outlook.com>
* [Fix] Grok-1 use tokenizer from the same pretrained path (#5532)
* [fix] use tokenizer from the same pretrained path
* trust remote code
* [ColossalChat] Update RLHF V2 (#5286)
* Add dpo. Fix sft, ppo, lora. Refactor all
* fix and tested ppo
* 2 nd round refactor
* add ci tests
* fix ci
* fix ci
* fix readme, style
* fix readme style
* fix style, fix benchmark
* reproduce benchmark result, remove useless files
* rename to ColossalChat
* use new image
* fix ci workflow
* fix ci
* use local model/tokenizer for ci tests
* fix ci
* fix ci
* fix ci
* fix ci timeout
* fix rm progress bar. fix ci timeout
* fix ci
* fix ci typo
* remove 3d plugin from ci temporary
* test environment
* cannot save optimizer
* support chat template
* fix readme
* fix path
* test ci locally
* restore build_or_pr
* fix ci data path
* fix benchmark
* fix ci, move ci tests to 3080, disable fast tokenizer
* move ci to 85
* support flash attention 2
* add all-in-one data preparation script. Fix colossal-llama2-chat chat template
* add hardware requirements
* move ci test data
* fix save_model, add unwrap
* fix missing bos
* fix missing bos; support grad accumulation with gemini
* fix ci
* fix ci
* fix ci
* fix llama2 chat template config
* debug sft
* debug sft
* fix colossalai version requirement
* fix ci
* add sanity check to prevent NaN loss
* fix requirements
* add dummy data generation script
* add dummy data generation script
* add dummy data generation script
* add dummy data generation script
* update readme
* update readme
* update readme and ignore
* fix logger bug
* support parallel_output
* modify data preparation logic
* fix tokenization
* update lr
* fix inference
* run pre-commit
---------
Co-authored-by: Tong Li <tong.li352711588@gmail.com>
* [shardformer, pipeline] add `gradient_checkpointing_ratio` and heterogenous shard policy for llama (#5508)
* feat: add `GradientCheckpointConfig` and `PipelineGradientCheckpointConfig`
* feat: apply `GradientCheckpointConfig` to policy and llama_forward
* feat: move `distribute_layer` and `get_stage_index` to PipelineStageManager
* fix: add optional args for `distribute_layer` and `get_stage_index`
* fix: fix changed API calls
* test: update llama tests
* style: polish `GradientCheckpointConfig`
* fix: fix pipeline utils tests
* fix incorrect sharding without zero (#5545)
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [shardformer] Sequence Parallelism Optimization (#5533)
* sequence parallel optimization
* validate sequence parallel in llama (code to be polished)
* shardformer api writing
* integrate sequence parallel in ShardFormer
* fix pp bugs and sp bugs for LlaMa model
* integrating ring-based sequence parallelism into ShardFormer
* [sequence parallelism]: Add fused megatron function
* integrating ring-based sequence parallelism into ShardFormer
---------
Co-authored-by: linsj20 <linsj20@mails.tsinghua.edu.cn>
* fix bugs when useing sp and flashattention together
* fix operation function name
* support flash attention for ulysses-style sp
* clarify sp process group
* fix compatibility bugs in moe plugin
* fix fused linear bugs
* fix linear layer test
* support gpt model all-to-all sp
* modify shard data dimension (meant to be dim=-1)
* support megtron-style sp and distributed attn for llama model
* [shardformer] add megatron sp to llama
* support llama7B 128k with distributed attention
* [shardformer] robustness enhancement
* add block attn
* sp mode 1: keep input as a complete sequence
* fix sp compatability
* finish sp mode 3 support for gpt
* using all_to_all_single when batch size is 1
* support mode 2 sp in gpt2 (#5)
* [shardformer] add megatron sp to llama
* support llama7B 128k with distributed attention
* [shardformer] robustness enhancement
* add block attn
* sp mode 1: keep input as a complete sequence
* fix sp compatability
* refactor ring implementation
* support mode 2 sp in gpt2
* polish code
* enable distributed attn mask when using sp mode 2 and 3 in llama
* automatically enable flash attn when using sp mode 2 and 3 in llama
* inplace attn mask
* add zero2 support for sequence parallel
* polish code
* fix bugs
* fix gemini checkpoint io
* loose tensor checking atol and rtol
* add comment
* fix llama layernorm grad
* fix zero grad
* fix zero grad
* fix conflict
* update split and gather auto grad func
* sequence parallel: inside text split (#6)
* polish code (part 1)
* polish code (part 2)
* polish code (part 2.5)
* polish code (part 3)
* sequence parallel: inside text split
* miscellaneous minor fixes
* polish code
* fix ulysses style ZeRO
* sequence parallel: inside text split
* miscellaneous minor fixes
* disaggregate sp group and dp group for sp
* fix llama and gpt sp
* polish code
* move ulysses grad sync to ddp (#9)
* remove zero_stage and unbind the grad sync for alltoall sp
* add 2d group creation test
* move ulysses grad sync to ddp
* add 2d group creation test
* remove useless code
* change shard config not to enable sp when enable_all_optimizations
* add sp warnings for several model
* remove useless code
---------
Co-authored-by: linsj20 <linsj20@mails.tsinghua.edu.cn>
* [hotfix] quick fixes to make legacy tutorials runnable (#5559)
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [fix] fix typo s/muiti-node /multi-node etc. (#5448)
* [hotfix] fix typo s/get_defualt_parser /get_default_parser (#5548)
* [devops] remove post commit ci (#5566)
* [devops] remove post commit ci
* [misc] run pre-commit on all files
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---------
Co-authored-by: binmakeswell <binmakeswell@gmail.com>
Co-authored-by: Yuanheng Zhao <54058983+yuanheng-zhao@users.noreply.github.com>
Co-authored-by: Wenhao Chen <cwher@outlook.com>
Co-authored-by: Hongxin Liu <lhx0217@gmail.com>
Co-authored-by: Rocky Duan <dementrock@users.noreply.github.com>
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: Edenzzzz <wenxuan.tan@wisc.edu>
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: github-actions <github-actions@github.com>
Co-authored-by: Insu Jang <insujang@umich.edu>
Co-authored-by: YeAnbang <44796419+YeAnbang@users.noreply.github.com>
Co-authored-by: Tong Li <tong.li352711588@gmail.com>
Co-authored-by: Zhongkai Zhao <kanezz620@gmail.com>
Co-authored-by: linsj20 <linsj20@mails.tsinghua.edu.cn>
Co-authored-by: digger yu <digger-yu@outlook.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [shardformer]enable padding vocabulary size. (#5489)
* padding vocab_size when using pipeline parallellism
padding vocab_size when using pipeline parallellism
fix
fix
* fix
* fix
fix
fix
* fix gather output
* fix
* fix
* fix
fix resize embedding
fix resize embedding
* fix resize embedding
fix
* revert
* revert
* revert
* padding vocab
* padding vocabe
* fix
* fix
* fxi
* test ci
* fix
fix
fix
fix
* fix
fix
* fix
* fix
* Update hybrid_parallel_plugin.py
fix
fix
fix
* fix
fix
* fix
fix
* fix
* resolve super init
resolve super init
resolve super init
resolve super init
* resolve comments
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* vocab checkpointio
* padding vocab_size when using pipeline parallellism
padding vocab_size when using pipeline parallellism
fix
fix
* fix
fix
fix
* fix
* fix
fix resize embedding
fix resize embedding
* fix resize embedding
fix
* revert
* revert
* padding vocab
* fix
* fix
fix
* fix
fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* cherry-pick
* revert moe modify
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
fix
fix
fix
fix
fix
fix
fix
* resolve comments
resolve comments
resolve comments
resolve comments
resolve comments
* ptensor
ptensor
resolve comments
fix
fix
fix
fix
fix
resolve comments
resolve comments
resolve comments
resolve comments
resolve comments
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Hongxin Liu <lhx0217@gmail.com>
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix rebase
* fix rebase
---------
Co-authored-by: Hongxin Liu <lhx0217@gmail.com>
Co-authored-by: digger yu <digger-yu@outlook.com>
Co-authored-by: binmakeswell <binmakeswell@gmail.com>
Co-authored-by: Yuanheng Zhao <54058983+yuanheng-zhao@users.noreply.github.com>
Co-authored-by: Wenhao Chen <cwher@outlook.com>
Co-authored-by: Rocky Duan <dementrock@users.noreply.github.com>
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: Edenzzzz <wenxuan.tan@wisc.edu>
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: github-actions <github-actions@github.com>
Co-authored-by: Insu Jang <insujang@umich.edu>
Co-authored-by: YeAnbang <44796419+YeAnbang@users.noreply.github.com>
Co-authored-by: Tong Li <tong.li352711588@gmail.com>
Co-authored-by: Zhongkai Zhao <kanezz620@gmail.com>
Co-authored-by: linsj20 <linsj20@mails.tsinghua.edu.cn>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [hotfix] Fix examples no pad token & auto parallel codegen bug; (#5606)
* fix no pad token bug
* fixed some auto parallel codegen bug, but might not run on torch 2.1
---------
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [shardformer] fix pipeline grad ckpt (#5620)
* [shardformer] fix pipeline grad ckpt
* [lora] add lora APIs for booster, support lora for TorchDDP (#4981)
* add apis and peft requirement
* add liscense and implement apis
* add checkpointio apis
* add torchddp fwd_bwd test
* add support_lora methods
* add checkpointio test and debug
* delete unneeded codes
* remove peft from LICENSE
* add concrete methods for enable_lora
* simplify enable_lora api
* fix requirements
* [LowLevelZero] low level zero support lora (#5153)
* low level zero support lora
low level zero support lora
* add checkpoint test
* add checkpoint test
* fix
* fix
* fix
* fix
fix
fix
fix
* fix
* fix
fix
fix
fix
fix
fix
fix
* fix
* fix
fix
fix
fix
fix
fix
fix
* fix
* test ci
* git # This is a combination of 3 commits.
Update low_level_zero_plugin.py
Update low_level_zero_plugin.py
fix
fix
fix
* fix naming
fix naming
fix naming
fix
* [feature] qlora support
* qlora follow commit
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* migrate qutization folder to colossalai/
* minor fixes
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* gptj sp fix
* remove redundancies from pre-commit
* minor fixes
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Signed-off-by: hugo-syn <hugo.vincent@synacktiv.com>
Co-authored-by: Jianghai <72591262+CjhHa1@users.noreply.github.com>
Co-authored-by: Bin Jia <45593998+FoolPlayer@users.noreply.github.com>
Co-authored-by: アマデウス <kurisusnowdeng@users.noreply.github.com>
Co-authored-by: yuehuayingxueluo <867460659@qq.com>
Co-authored-by: Cuiqing Li <lixx3527@gmail.com>
Co-authored-by: cuiqing.li <lixx336@gmail.com>
Co-authored-by: Yuanchen <70520919+chengeharrison@users.noreply.github.com>
Co-authored-by: Xu Yuanchen <yuanchen.xu00@gmail.com>
Co-authored-by: littsk <1214689160@qq.com>
Co-authored-by: Baizhou Zhang <eddiezhang@pku.edu.cn>
Co-authored-by: ppt0011 <143150326+ppt0011@users.noreply.github.com>
Co-authored-by: Hongxin Liu <lhx0217@gmail.com>
Co-authored-by: Xuanlei Zhao <43881818+oahzxl@users.noreply.github.com>
Co-authored-by: Zhongkai Zhao <kanezz620@gmail.com>
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: github-actions <github-actions@github.com>
Co-authored-by: Wenhao Chen <cwher@outlook.com>
Co-authored-by: Jun Gao <imgaojun@gmail.com>
Co-authored-by: flybird11111 <1829166702@qq.com>
Co-authored-by: Xu Kai <xukai16@foxmail.com>
Co-authored-by: Zian(Andy) Zheng <62330719+Orion-Zheng@users.noreply.github.com>
Co-authored-by: digger yu <digger-yu@outlook.com>
Co-authored-by: CjhHa1 <cjh18671720497@outlook.com>
Co-authored-by: Xu Kai <xukai16@foxamil.com>
Co-authored-by: Orion-Zheng <zheng_zian@u.nus.edu>
Co-authored-by: Elsa Granger <zeyugao@outlook.com>
Co-authored-by: YeAnbang <44796419+YeAnbang@users.noreply.github.com>
Co-authored-by: YeAnbang <anbangy2@outlook.com>
Co-authored-by: Orion-Zheng <zhengzian@u.nus.edu>
Co-authored-by: Pengtai Xu <henryxu880@gmail.com>
Co-authored-by: eric8607242 <e0928021388@gmail.com>
Co-authored-by: binmakeswell <binmakeswell@gmail.com>
Co-authored-by: Frank Lee <somerlee.9@gmail.com>
Co-authored-by: Michelle <97082656+MichelleMa8@users.noreply.github.com>
Co-authored-by: Camille Zhong <44392324+Camille7777@users.noreply.github.com>
Co-authored-by: BlueRum <70618399+ht-zhou@users.noreply.github.com>
Co-authored-by: Tong Li <tong.li352711588@gmail.com>
Co-authored-by: JIMMY ZHAO <knightyzhao@gmail.com>
Co-authored-by: Xuanlei Zhao <xuanlei.zhao@gmail.com>
Co-authored-by: Desperado-Jia <502205863@qq.com>
Co-authored-by: 李文军 <40464906+liwenjuna@users.noreply.github.com>
Co-authored-by: yixiaoer <miyaku@yixiaoer.sg>
Co-authored-by: CZYCW <czyczf@163.com>
Co-authored-by: Stephan Kölker <stephankoe@users.noreply.github.com>
Co-authored-by: QinLuo <eric.x.sun@gmail.com>
Co-authored-by: MickeyCHAN <76671016+danyow-cheung@users.noreply.github.com>
Co-authored-by: Luo Yihang <luo_yihang@outlook.com>
Co-authored-by: Dongruixuan Li <dongruixuan@hotmail.com>
Co-authored-by: hugo-syn <61210734+hugo-syn@users.noreply.github.com>
Co-authored-by: Youngon <Youngon_wyl@163.com>
Co-authored-by: Yuanheng Zhao <54058983+yuanheng-zhao@users.noreply.github.com>
Co-authored-by: Rocky Duan <dementrock@users.noreply.github.com>
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: Edenzzzz <wenxuan.tan@wisc.edu>
Co-authored-by: Insu Jang <insujang@umich.edu>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2024-04-23 09:57:44 +00:00
def search_padding_dim ( global_shape : torch . Size , original_shape : torch . Size ) - > Optional [ int ] :
padding_dim = None
for dim , length in enumerate ( global_shape ) :
if length > original_shape [ dim ] :
padding_dim = dim
break
return padding_dim
2023-04-06 08:23:39 +00:00
# ======================================
2023-08-25 14:04:57 +00:00
# Helper classes and functions for saving shard file
2023-04-06 08:23:39 +00:00
# ======================================
2023-07-07 08:33:06 +00:00
2023-08-25 14:04:57 +00:00
class StateDictSharder :
def __init__ ( self , size_per_shard : int ) - > None :
self . max_shard_size = size_per_shard
self . current_block = OrderedDict ( )
self . current_block_size = 0
2023-08-31 06:50:47 +00:00
def append_param ( self , name : str , tensor : torch . Tensor ) - > Tuple [ Optional [ OrderedDict ] , int ] :
2023-08-25 14:04:57 +00:00
tensor_size = calculate_tensor_size ( tensor )
ret_block = None
ret_block_size = 0
# before we return the current block and create a new block,
# we need to ensure that the current block is not empty
if self . current_block_size + tensor_size > self . max_shard_size and self . current_block_size > 0 :
ret_block = self . current_block
ret_block_size = self . current_block_size
self . current_block = OrderedDict ( )
self . current_block_size = 0
self . current_block [ name ] = tensor
self . current_block_size + = tensor_size
return ret_block , ret_block_size
2023-08-31 06:50:47 +00:00
def append_optim_state ( self , param_id : int , state : OrderedDict ) - > Tuple [ Optional [ OrderedDict ] , int ] :
# A state might contain more than one tensors.
# e.g. each Adam state includes: 'step', 'exp_avg', 'exp_avg_sq'
state_size = 0
isDTensor = False
for state_tensor in state . values ( ) :
# When state_tensor is not of Tensor class,
# e.g., a SGD optimizer with momentum set to 0 can have None as state
# The calculation of tensor size should be skipped to avoid error.
if not isinstance ( state_tensor , torch . Tensor ) :
continue
# If the states are stored as DTensors, mark isDTensor as true.
if is_distributed_tensor ( state_tensor ) :
isDTensor = True
state_size + = calculate_tensor_size ( state_tensor )
ret_block = None
ret_block_size = 0
# directly return if state is stored as distributed tensor
if isDTensor :
return ret_block , ret_block_size
# before we return the current block and create a new block,
# we need to ensure that the current block is not empty
if self . current_block_size + state_size > self . max_shard_size and self . current_block_size > 0 :
ret_block = self . current_block
ret_block_size = self . current_block_size
self . current_block = OrderedDict ( )
self . current_block_size = 0
self . current_block [ param_id ] = state
self . current_block_size + = state_size
return ret_block , ret_block_size
def gather_distributed_param ( param : torch . Tensor , keep_vars : bool = False ) - > torch . Tensor :
"""
Gather the complete parameter for saving if passed in param is distributed under tp setting .
Args :
param ( torch . Tensor ) : A model parameter , might be d_tensor .
keep_vars ( bool , optional ) : Whether to return the parameter in calculation graph . Defaults to False .
Returns :
torch . Tensor : the complete parameter
"""
param_ = param if keep_vars else param . detach ( )
if is_distributed_tensor ( param_ ) :
return to_global ( param_ )
elif is_customized_distributed_tensor ( param_ ) :
return to_global_for_customized_distributed_tensor ( param_ )
else :
return param_
2023-08-25 14:04:57 +00:00
2023-09-19 06:20:26 +00:00
def save_state_dict_shards (
sharded_state_dict : Iterator [ Tuple [ OrderedDict , int ] ] ,
checkpoint : str ,
index_file : " CheckpointIndexFile " ,
base_filename : str ,
is_master : bool ,
use_safetensors : bool = False ,
use_pp_format : bool = False ,
) - > int :
"""
2023-07-21 06:39:01 +00:00
Save sharded state dict only on master rank , this method can be used by both model and optimizer states .
Args :
sharded_state_dict ( Iterator [ Tuple [ OrderedDict , int ] ] ) : a generator of shards , each shard contains state dict and shard size .
checkpoint ( str ) : The path of checkpoint directory as string .
index_file ( CheckpointIndexFile ) : The index file object to be updated .
base_filename ( str ) : Decides the prefix of filenames of shards .
2023-09-01 09:40:01 +00:00
is_master ( bool ) : Whether current rank is main process .
use_safetensors ( bool , optional ) : Whether to use safetensors to save checkpoint . Defaults to False .
use_pp_format : ( bool , optional ) : Whether to save the files in pipeline format including stage information . Defaults to False .
2023-07-21 06:39:01 +00:00
Returns :
int : the total size of shards
2023-09-19 06:20:26 +00:00
"""
2023-07-21 06:39:01 +00:00
total_size = 0
2023-09-01 09:40:01 +00:00
shard_filenames = [ ]
2023-07-21 06:39:01 +00:00
for idx , shard_pair in enumerate ( sharded_state_dict ) :
2023-08-25 14:04:57 +00:00
shard , current_size = shard_pair
2023-07-21 06:39:01 +00:00
if not is_master :
2023-08-25 14:04:57 +00:00
del shard
2023-07-21 06:39:01 +00:00
continue
shard_file = get_shard_filename ( base_filename , idx )
total_size = total_size + current_size
for key in shard . keys ( ) :
index_file . append_weight_map ( key , shard_file )
checkpoint_file_path = os . path . join ( checkpoint , shard_file )
# Only save on master rank.
save_state_dict ( shard , checkpoint_file_path , use_safetensors = use_safetensors )
2023-09-01 09:40:01 +00:00
shard_filenames . append ( shard_file )
2023-08-25 14:04:57 +00:00
del shard
2023-07-21 06:39:01 +00:00
2023-09-01 09:40:01 +00:00
# Clean folder, deleted unneeded files.
clean_folder ( checkpoint , base_filename , shard_filenames , is_master = is_master , use_pp_format = use_pp_format )
2023-07-21 06:39:01 +00:00
return total_size
2023-06-15 07:21:26 +00:00
def shard_model_checkpoint ( state_dict : torch . Tensor , max_shard_size : int = 1024 ) - > Iterator [ Tuple [ OrderedDict , int ] ] :
2023-04-06 08:23:39 +00:00
"""
Splits a model state dictionary in sub - checkpoints so that the final size of each sub - checkpoint does not exceed a
given size .
"""
2023-08-31 06:50:47 +00:00
state_dict_sharder = StateDictSharder ( max_shard_size )
2023-04-06 08:23:39 +00:00
for key , weight in state_dict . items ( ) :
2023-06-26 07:50:07 +00:00
if not is_distributed_tensor ( weight ) :
2023-08-31 06:50:47 +00:00
block , block_size = state_dict_sharder . append_param ( key , weight )
2023-05-18 12:05:59 +00:00
2023-08-31 06:50:47 +00:00
if block != None :
yield block , block_size
2023-04-06 08:23:39 +00:00
2023-08-31 06:50:47 +00:00
# Return the last block in sharder.
yield state_dict_sharder . current_block , state_dict_sharder . current_block_size
2023-04-06 08:23:39 +00:00
2023-06-15 07:21:26 +00:00
def shard_optimizer_checkpoint ( state_dict : dict , max_shard_size : int = 1024 ) - > Iterator [ Tuple [ OrderedDict , int ] ] :
"""
Splits an optimizer state dictionary in sub - checkpoints so that the final size of each sub - checkpoint does not exceed a
given size .
"""
# Only split state_dict['state']; state_dict['param_group'] is not considered in this function.
2023-09-19 06:20:26 +00:00
states = state_dict [ " state " ]
2023-08-31 06:50:47 +00:00
state_dict_sharder = StateDictSharder ( max_shard_size )
2023-06-15 07:21:26 +00:00
for param_id , state in states . items ( ) :
2023-08-31 06:50:47 +00:00
block , block_size = state_dict_sharder . append_optim_state ( param_id , state )
if block != None :
yield block , block_size
2023-06-15 07:21:26 +00:00
2023-08-31 06:50:47 +00:00
# Return the last block in sharder.
yield state_dict_sharder . current_block , state_dict_sharder . current_block_size
2023-06-15 07:21:26 +00:00
2023-06-16 06:14:05 +00:00
2023-08-31 06:50:47 +00:00
# ======================================
# Helper functions for saving state dict
# ======================================
2023-06-16 06:14:05 +00:00
2023-06-15 07:21:26 +00:00
2023-08-31 06:50:47 +00:00
def save_state_dict ( state_dict : dict , checkpoint_file_path : str , use_safetensors : bool ) - > None :
"""
Save state dict to checkpoint .
Args :
state_dict ( dict ) : state dict .
checkpoint_file_path ( str ) : path to the checkpoint file .
use_safetensors ( bool ) : whether to use safetensors to save the checkpoint .
"""
[Feature] LoRA rebased to main branch (#5622)
* [Inference]ADD Bench Chatglm2 script (#4963)
* add bench chatglm
* fix bug and make utils
---------
Co-authored-by: CjhHa1 <cjh18671720497outlook.com>
* [Pipeline inference] Combine kvcache with pipeline inference (#4938)
* merge kvcache with pipeline inference and refactor the code structure
* support ppsize > 2
* refactor pipeline code
* do pre-commit
* modify benchmark
* fix bench mark
* polish code
* add docstring and update readme
* refactor the code
* fix some logic bug of ppinfer
* polish readme
* fix typo
* skip infer test
* updated c++17 compiler flags (#4983)
* [Inference] Dynamic Batching Inference, online and offline (#4953)
* [inference] Dynamic Batching for Single and Multiple GPUs (#4831)
* finish batch manager
* 1
* first
* fix
* fix dynamic batching
* llama infer
* finish test
* support different lengths generating
* del prints
* del prints
* fix
* fix bug
---------
Co-authored-by: CjhHa1 <cjh18671720497outlook.com>
* [inference] Async dynamic batching (#4894)
* finish input and output logic
* add generate
* test forward
* 1
* [inference]Re push async dynamic batching (#4901)
* adapt to ray server
* finish async
* finish test
* del test
---------
Co-authored-by: yuehuayingxueluo <867460659@qq.com>
* Revert "[inference]Re push async dynamic batching (#4901)" (#4905)
This reverts commit fbf3c09e673794ed18c91d4bab1a7dfea052e95a.
* Revert "[inference] Async dynamic batching (#4894)"
This reverts commit fced14025043e29ce816b315f440601188f7f79f.
* Revert "[inference] Async dynamic batching (#4894)" (#4909)
This reverts commit fced14025043e29ce816b315f440601188f7f79f.
* Add Ray Distributed Environment Init Scripts
* support DynamicBatchManager base function
* revert _set_tokenizer version
* add driver async generate
* add async test
* fix bugs in test_ray_dist.py
* add get_tokenizer.py
* fix code style
* fix bugs about No module named 'pydantic' in ci test
* fix bugs in ci test
* fix bugs in ci test
* fix bugs in ci test
* [infer]Add Ray Distributed Environment Init Scripts (#4911)
* Revert "[inference] Async dynamic batching (#4894)"
This reverts commit fced14025043e29ce816b315f440601188f7f79f.
* Add Ray Distributed Environment Init Scripts
* support DynamicBatchManager base function
* revert _set_tokenizer version
* add driver async generate
* add async test
* fix bugs in test_ray_dist.py
* add get_tokenizer.py
* fix code style
* fix bugs about No module named 'pydantic' in ci test
* fix bugs in ci test
* fix bugs in ci test
* fix bugs in ci test
* support dynamic batch for bloom model and is_running function
* [Inference]Test for new Async engine (#4935)
* infer engine
* infer engine
* test engine
* test engine
* new manager
* change step
* add
* test
* fix
* fix
* finish test
* finish test
* finish test
* finish test
* add license
---------
Co-authored-by: yuehuayingxueluo <867460659@qq.com>
* add assertion for config (#4947)
* [Inference] Finish dynamic batching offline test (#4948)
* test
* fix test
* fix quant
* add default
* fix
* fix some bugs
* fix some bugs
* fix
* fix bug
* fix bugs
* reset param
---------
Co-authored-by: yuehuayingxueluo <867460659@qq.com>
Co-authored-by: Cuiqing Li <lixx3527@gmail.com>
Co-authored-by: CjhHa1 <cjh18671720497outlook.com>
* [Kernels]Updated Triton kernels into 2.1.0 and adding flash-decoding for llama token attention (#4965)
* adding flash-decoding
* clean
* adding kernel
* adding flash-decoding
* add integration
* add
* adding kernel
* adding kernel
* adding triton 2.1.0 features for inference
* update bloom triton kernel
* remove useless vllm kernels
* clean codes
* fix
* adding files
* fix readme
* update llama flash-decoding
---------
Co-authored-by: cuiqing.li <lixx336@gmail.com>
* fix ColossalEval (#4992)
Co-authored-by: Xu Yuanchen <yuanchen.xu00@gmail.com>
* [doc]Update doc for colossal-inference (#4989)
* update doc
* Update README.md
---------
Co-authored-by: cuiqing.li <lixx336@gmail.com>
* [hotfix] Fix the bug where process groups were not being properly released. (#4940)
* Fix the bug where process groups were not being properly released.
* test
* Revert "test"
This reverts commit 479900c1398637310abf92eefa3cd168038ea02f.
* [hotfix] fix the bug of repeatedly storing param group (#4951)
* [doc] add supported feature diagram for hybrid parallel plugin (#4996)
* [Pipeline Inference] Merge pp with tp (#4993)
* refactor pipeline into new CaiInferEngine
* updata llama modeling forward
* merge tp with pp
* update docstring
* optimize test workflow and example
* fix typo
* add assert and todo
* [release] update version (#4995)
* [release] update version
* [hotfix] fix ci
* [moe] merge moe into main (#4978)
* update moe module
* support openmoe
* [hotfix] fix grad accumulation plus clipping for gemini (#5002)
* [hotfix] Add layer norm gradients all-reduce for sequence parallel (#4926)
* [hotfix] Add layer norm gradients all-reduce for sequence parallel. (#4915)
* Add layer norm gradients all-reduce for sequence parallel.
* skip pipeline inference test
* [hotfix] fixing polices of sequence parallel (#4922)
* Add layer norm gradients all-reduce for sequence parallel.
* fix parameter passing when calling get_autopolicy
---------
Co-authored-by: littsk <1214689160@qq.com>
* Hotfix/add grad all reduce for sequence parallel (#4927)
* Add layer norm gradients all-reduce for sequence parallel.
* fix parameter passing when calling get_autopolicy
* fix bug using wrong variables
---------
Co-authored-by: littsk <1214689160@qq.com>
* fix policy initialization
* fix bloom and chatglm policices
* polish code of handling layernorm
* fix moe module
* polish code of class initializing
---------
Co-authored-by: Zhongkai Zhao <kanezz620@gmail.com>
* [format] applied code formatting on changed files in pull request 4926 (#5007)
Co-authored-by: github-actions <github-actions@github.com>
* [Inference] Fix bug in ChatGLM2 Tensor Parallelism (#5014)
* fix bug
* fix
* fix multiquery
* fix multiquery
---------
Co-authored-by: CjhHa1 <cjh18671720497outlook.com>
* [misc] add code owners (#5024)
* [moe] support optimizer checkpoint (#5015)
* Refactor MoE Manager setup method
* unshard optim ckpt
* optim io
* update transformer version
* update requirements
* update ckpt
* update ckpt
* update ckpt
* fix engine
* fix engine
* Support mtbench (#5025)
Co-authored-by: Xu Yuanchen <yuanchen.xu00@gmail.com>
* [moe]: fix ep/tp tests, add hierarchical all2all (#4982)
* fix: add warning for EP different behavior
* fix: use shard_data in ep & tp model
* to: add used_capacity
* fix: fix router test
* feat: add create_ep_node_group
* feat: add create_ep_hierarchical_group fn
* feat: add HierarchicalAllToAll
* test: add hierarchical all2all test
* fix: fix test errors
* fix: simplify create_ep_hierarchical_group
* fix: add hierarchical_alltoall arg
* fix: fix environ typo
* revert: revert process mesh order
* to: add todo mark
* fix: skip hierarchical_comm if torch < 1.13.1
* [shardformer] Fix serialization error with Tensor Parallel state saving (#5018)
* Fix serialization error with Tensor Parallel state saving
* Refactor state_dict CPU transfer using tree_map
* [gemini] gemini support tensor parallelism. (#4942)
* [colossalai]fix typo
* [inference] Add smmoothquant for llama (#4904)
* [inference] add int8 rotary embedding kernel for smoothquant (#4843)
* [inference] add smoothquant llama attention (#4850)
* add smoothquant llama attention
* remove uselss code
* remove useless code
* fix import error
* rename file name
* [inference] add silu linear fusion for smoothquant llama mlp (#4853)
* add silu linear
* update skip condition
* catch smoothquant cuda lib exception
* prcocess exception for tests
* [inference] add llama mlp for smoothquant (#4854)
* add llama mlp for smoothquant
* fix down out scale
* remove duplicate lines
* add llama mlp check
* delete useless code
* [inference] add smoothquant llama (#4861)
* add smoothquant llama
* fix attention accuracy
* fix accuracy
* add kv cache and save pretrained
* refactor example
* delete smooth
* refactor code
* [inference] add smooth function and delete useless code for smoothquant (#4895)
* add smooth function and delete useless code
* update datasets
* remove duplicate import
* delete useless file
* refactor codes (#4902)
* rafactor code
* add license
* add torch-int and smoothquant license
* Update flash_attention_patch.py
To be compatible with the new change in the Transformers library, where a new argument 'padding_mask' was added to forward function of attention layer.
https://github.com/huggingface/transformers/pull/25598
* [kernel] support pure fp16 for cpu adam and update gemini optim tests (#4921)
* [kernel] support pure fp16 for cpu adam (#4896)
* [kernel] fix cpu adam kernel for pure fp16 and update tests (#4919)
* [kernel] fix cpu adam
* [test] update gemini optim test
* [format] applied code formatting on changed files in pull request 4908 (#4918)
Co-authored-by: github-actions <github-actions@github.com>
* [gemini] support gradient accumulation (#4869)
* add test
* fix no_sync bug in low level zero plugin
* fix test
* add argument for grad accum
* add grad accum in backward hook for gemini
* finish implementation, rewrite tests
* fix test
* skip stuck model in low level zero test
* update doc
* optimize communication & fix gradient checkpoint
* modify doc
* cleaning codes
* update cpu adam fp16 case
* [hotfix] fix torch 2.0 compatibility (#4936)
* [hotfix] fix launch
* [test] fix test gemini optim
* [shardformer] fix vit
* [test] add no master test for low level zero plugin (#4934)
* [format] applied code formatting on changed files in pull request 4820 (#4886)
Co-authored-by: github-actions <github-actions@github.com>
* [nfc] fix some typo with colossalai/ docs/ etc. (#4920)
* [Refactor] Integrated some lightllm kernels into token-attention (#4946)
* add some req for inference
* clean codes
* add codes
* add some lightllm deps
* clean codes
* hello
* delete rms files
* add some comments
* add comments
* add doc
* add lightllm deps
* add lightllm cahtglm2 kernels
* add lightllm cahtglm2 kernels
* replace rotary embedding with lightllm kernel
* add some commnets
* add some comments
* add some comments
* add
* replace fwd kernel att1
* fix a arg
* add
* add
* fix token attention
* add some comments
* clean codes
* modify comments
* fix readme
* fix bug
* fix bug
---------
Co-authored-by: cuiqing.li <lixx336@gmail.com>
Co-authored-by: CjhHa1 <cjh18671720497@outlook.com>
* [test] merge old components to test to model zoo (#4945)
* [test] add custom models in model zoo
* [test] update legacy test
* [test] update model zoo
* [test] update gemini test
* [test] remove components to test
* [inference] add reference and fix some bugs (#4937)
* add reference and fix some bugs
* update gptq init
---------
Co-authored-by: Xu Kai <xukai16@foxamil.com>
* [Inference]ADD Bench Chatglm2 script (#4963)
* add bench chatglm
* fix bug and make utils
---------
Co-authored-by: CjhHa1 <cjh18671720497outlook.com>
* [Pipeline inference] Combine kvcache with pipeline inference (#4938)
* merge kvcache with pipeline inference and refactor the code structure
* support ppsize > 2
* refactor pipeline code
* do pre-commit
* modify benchmark
* fix bench mark
* polish code
* add docstring and update readme
* refactor the code
* fix some logic bug of ppinfer
* polish readme
* fix typo
* skip infer test
* updated c++17 compiler flags (#4983)
* [Inference] Dynamic Batching Inference, online and offline (#4953)
* [inference] Dynamic Batching for Single and Multiple GPUs (#4831)
* finish batch manager
* 1
* first
* fix
* fix dynamic batching
* llama infer
* finish test
* support different lengths generating
* del prints
* del prints
* fix
* fix bug
---------
Co-authored-by: CjhHa1 <cjh18671720497outlook.com>
* [inference] Async dynamic batching (#4894)
* finish input and output logic
* add generate
* test forward
* 1
* [inference]Re push async dynamic batching (#4901)
* adapt to ray server
* finish async
* finish test
* del test
---------
Co-authored-by: yuehuayingxueluo <867460659@qq.com>
* Revert "[inference]Re push async dynamic batching (#4901)" (#4905)
This reverts commit fbf3c09e673794ed18c91d4bab1a7dfea052e95a.
* Revert "[inference] Async dynamic batching (#4894)"
This reverts commit fced14025043e29ce816b315f440601188f7f79f.
* Revert "[inference] Async dynamic batching (#4894)" (#4909)
This reverts commit fced14025043e29ce816b315f440601188f7f79f.
* Add Ray Distributed Environment Init Scripts
* support DynamicBatchManager base function
* revert _set_tokenizer version
* add driver async generate
* add async test
* fix bugs in test_ray_dist.py
* add get_tokenizer.py
* fix code style
* fix bugs about No module named 'pydantic' in ci test
* fix bugs in ci test
* fix bugs in ci test
* fix bugs in ci test
* [infer]Add Ray Distributed Environment Init Scripts (#4911)
* Revert "[inference] Async dynamic batching (#4894)"
This reverts commit fced14025043e29ce816b315f440601188f7f79f.
* Add Ray Distributed Environment Init Scripts
* support DynamicBatchManager base function
* revert _set_tokenizer version
* add driver async generate
* add async test
* fix bugs in test_ray_dist.py
* add get_tokenizer.py
* fix code style
* fix bugs about No module named 'pydantic' in ci test
* fix bugs in ci test
* fix bugs in ci test
* fix bugs in ci test
* support dynamic batch for bloom model and is_running function
* [Inference]Test for new Async engine (#4935)
* infer engine
* infer engine
* test engine
* test engine
* new manager
* change step
* add
* test
* fix
* fix
* finish test
* finish test
* finish test
* finish test
* add license
---------
Co-authored-by: yuehuayingxueluo <867460659@qq.com>
* add assertion for config (#4947)
* [Inference] Finish dynamic batching offline test (#4948)
* test
* fix test
* fix quant
* add default
* fix
* fix some bugs
* fix some bugs
* fix
* fix bug
* fix bugs
* reset param
---------
Co-authored-by: yuehuayingxueluo <867460659@qq.com>
Co-authored-by: Cuiqing Li <lixx3527@gmail.com>
Co-authored-by: CjhHa1 <cjh18671720497outlook.com>
* [Kernels]Updated Triton kernels into 2.1.0 and adding flash-decoding for llama token attention (#4965)
* adding flash-decoding
* clean
* adding kernel
* adding flash-decoding
* add integration
* add
* adding kernel
* adding kernel
* adding triton 2.1.0 features for inference
* update bloom triton kernel
* remove useless vllm kernels
* clean codes
* fix
* adding files
* fix readme
* update llama flash-decoding
---------
Co-authored-by: cuiqing.li <lixx336@gmail.com>
* fix ColossalEval (#4992)
Co-authored-by: Xu Yuanchen <yuanchen.xu00@gmail.com>
* [doc]Update doc for colossal-inference (#4989)
* update doc
* Update README.md
---------
Co-authored-by: cuiqing.li <lixx336@gmail.com>
* [hotfix] Fix the bug where process groups were not being properly released. (#4940)
* Fix the bug where process groups were not being properly released.
* test
* Revert "test"
This reverts commit 479900c1398637310abf92eefa3cd168038ea02f.
* [hotfix] fix the bug of repeatedly storing param group (#4951)
* [doc] add supported feature diagram for hybrid parallel plugin (#4996)
* [Pipeline Inference] Merge pp with tp (#4993)
* refactor pipeline into new CaiInferEngine
* updata llama modeling forward
* merge tp with pp
* update docstring
* optimize test workflow and example
* fix typo
* add assert and todo
* [release] update version (#4995)
* [release] update version
* [hotfix] fix ci
* [gemini] gemini support tp
[gemini] gemini support tp
[gemini] gemini support tp
[gemini] gemini support tp
[gemini] gemini support tp
* fix
fix
fix
* update checkpointIO
update checkpointIO
update checkpointIO
update checkpointIO
update checkpointIO
update checkpointIO
update checkpointIO
update checkpointIO
update checkpointIO
* support fused layernorm
support fused layernorm
support fused layernorm
* update fusedlayernorm
update fusedlayernorm
update fusedlayernorm
* add sequence parallel to gemini
add sequence parallel to gemini
* fix
* fix comments
fix comments
fix comments
* fix
* fix t5
* clear cache
* fix
* activate ci
* activate ci
* fix
* fix
* fix
* fix
* revert
* modify tp gather method
modify tp gather method
modify tp gather method
modify tp gather method
* fix test
---------
Co-authored-by: Xu Kai <xukai16@foxmail.com>
Co-authored-by: Zian(Andy) Zheng <62330719+Orion-Zheng@users.noreply.github.com>
Co-authored-by: Hongxin Liu <lhx0217@gmail.com>
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: github-actions <github-actions@github.com>
Co-authored-by: Baizhou Zhang <eddiezhang@pku.edu.cn>
Co-authored-by: Zhongkai Zhao <kanezz620@gmail.com>
Co-authored-by: digger yu <digger-yu@outlook.com>
Co-authored-by: Cuiqing Li <lixx3527@gmail.com>
Co-authored-by: cuiqing.li <lixx336@gmail.com>
Co-authored-by: CjhHa1 <cjh18671720497@outlook.com>
Co-authored-by: Xu Kai <xukai16@foxamil.com>
Co-authored-by: Jianghai <72591262+CjhHa1@users.noreply.github.com>
Co-authored-by: Bin Jia <45593998+FoolPlayer@users.noreply.github.com>
Co-authored-by: アマデウス <kurisusnowdeng@users.noreply.github.com>
Co-authored-by: yuehuayingxueluo <867460659@qq.com>
Co-authored-by: Yuanchen <70520919+chengeharrison@users.noreply.github.com>
Co-authored-by: Xu Yuanchen <yuanchen.xu00@gmail.com>
Co-authored-by: littsk <1214689160@qq.com>
Co-authored-by: ppt0011 <143150326+ppt0011@users.noreply.github.com>
* [hotfix] Suport extra_kwargs in ShardConfig (#5031)
* [refactor]: replace inference args with extra_kwargs in ShardConfig
* modify shardconfig
* polish code
* fix policy bug in llama
* fix bug in auto policy
* remove setattr in ShardConfig
* fix wrong EOS token in ColossalChat
* [Kernels]Update triton kernels into 2.1.0 (#5046)
* update flash-context-attention
* adding kernels
* fix
* reset
* add build script
* add building process
* add llama2 exmaple
* add colossal-llama2 test
* clean
* fall back test setting
* fix test file
* clean
* clean
* clean
---------
Co-authored-by: cuiqing.li <lixx336@gmail.com>
* [pipeline,shardformer] Fix p2p efficiency in pipeline, allow skipping loading weight not in weight_map when `strict=False`, fix llama flash attention forward, add flop estimation by megatron in llama benchmark (#5017)
* Use p2p
* Cannot bidirectonal send p2p
* Refactor tensor creation and serialization in P2P
communication
* Fix llama forward args in flash attention
* Add flop estimate from megatron
* Support loading weight not in weight_map when strict=False in hybrid_parallel
* Use send_forward_recv_backward, etc in 1f1b
* Use dataclass for metdata
Remove torch.cuda.synchronize() as suggested
* Add comment about the torch.cuda.synchronize for potential error
* Typo
* Update hybrid_parallel_checkpoint_io.py
* Update p2p.py
* Update one_f_one_b.py
* Update p2p.py
---------
Co-authored-by: flybird11111 <1829166702@qq.com>
* [gemini] gemini support extra-dp (#5043)
* support ddp
* fix
* fix
* fix
fix
* support ddp
* fix
* fix
* fix
fix
* simplify tests
* fix
* fix
* fix
fix
fix
* fix
* [shardformer] fix llama error when transformers upgraded. (#5055)
* fix-llama
* Update llama.py
* [hotfix]: modify create_ep_hierarchical_group and add test (#5032)
* feat: modify create_ep_hierarchical_group args
* test: add ep tests
* fix: remove get_process_group_ranks
* fix: fix src_rank
* [exampe] fix llama example' loss error when using gemini plugin (#5060)
fix llama example
* [inference] Refactor inference architecture (#5057)
* [inference] support only TP (#4998)
* support only tp
* enable tp
* add support for bloom (#5008)
* [refactor] refactor gptq and smoothquant llama (#5012)
* refactor gptq and smoothquant llama
* fix import error
* fix linear import torch-int
* fix smoothquant llama import error
* fix import accelerate error
* fix bug
* fix import smooth cuda
* fix smoothcuda
* [Inference Refactor] Merge chatglm2 with pp and tp (#5023)
merge chatglm with pp and tp
* [Refactor] remove useless inference code (#5022)
* remove useless code
* fix quant model
* fix test import bug
* mv original inference legacy
* fix chatglm2
* [Refactor] refactor policy search and quant type controlling in inference (#5035)
* [Refactor] refactor policy search and quant type controling in inference
* [inference] update readme (#5051)
* update readme
* update readme
* fix architecture
* fix table
* fix table
* [inference] udpate example (#5053)
* udpate example
* fix run.sh
* fix rebase bug
* fix some errors
* update readme
* add some features
* update interface
* update readme
* update benchmark
* add requirements-infer
---------
Co-authored-by: Bin Jia <45593998+FoolPlayer@users.noreply.github.com>
Co-authored-by: Zhongkai Zhao <kanezz620@gmail.com>
* [Kernels]added flash-decoidng of triton (#5063)
* added flash-decoidng of triton based on lightllm kernel
* add req
* clean
* clean
* delete build.sh
---------
Co-authored-by: cuiqing.li <lixx336@gmail.com>
* [misc] remove outdated submodule (#5070)
* [npu] add npu support for gemini and zero (#5067)
* [npu] setup device utils (#5047)
* [npu] add npu device support
* [npu] support low level zero
* [test] update npu zero plugin test
* [hotfix] fix import
* [test] recover tests
* [npu] gemini support npu (#5052)
* [npu] refactor device utils
* [gemini] support npu
* [example] llama2+gemini support npu
* [kernel] add arm cpu adam kernel (#5065)
* [kernel] add arm cpu adam
* [optim] update adam optimizer
* [kernel] arm cpu adam remove bf16 support
* [hotfix/hybridengine] fix bug when tp*pp size = 1 (#5069)
* [inference] update examples and engine (#5073)
* update examples and engine
* fix choices
* update example
* [format] applied code formatting on changed files in pull request 5067 (#5072)
Co-authored-by: github-actions <github-actions@github.com>
* [hotfix/hybridengine] Fix init model with random parameters in benchmark (#5074)
* fix init model with random parameters
* fix example
* [inference] refactor examples and fix schedule (#5077)
* [setup] refactor infer setup
* [hotfix] fix infenrece behavior on 1 1 gpu
* [exmaple] refactor inference examples
* fix thrust-transform-reduce error (#5078)
* [nfc] fix typo in docs/ (#4972)
* [nfc] fix typo and author name (#5089)
* [gemini]fix gemini optimzer, saving Shardformer in Gemini got list assignment index out of range (#5085)
* [Hotfix] Fix model policy matching strategy in ShardFormer (#5064)
* hotfix/Fix get model policy strategy in ShardFormer
* fix bug in auto policy
* [shardformer]fix flash attention, when mask is casual, just don't unpad it (#5084)
* fix flash attn
* fix
fix
* [npu] add npu support for hybrid plugin and llama (#5090)
* llama 3d
* update
* fix autocast
* [Feature] Add document retrieval QA (#5020)
* add langchain
* add langchain
* Add files via upload
* add langchain
* fix style
* fix style: remove extra space
* add pytest; modified retriever
* add pytest; modified retriever
* add tests to build_on_pr.yml
* fix build_on_pr.yml
* fix build on pr; fix environ vars
* seperate unit tests for colossalqa from build from pr
* fix container setting; fix environ vars
* commented dev code
* add incremental update
* remove stale code
* fix style
* change to sha3 224
* fix retriever; fix style; add unit test for document loader
* fix ci workflow config
* fix ci workflow config
* add set cuda visible device script in ci
* fix doc string
* fix style; update readme; refactored
* add force log info
* change build on pr, ignore colossalqa
* fix docstring, captitalize all initial letters
* fix indexing; fix text-splitter
* remove debug code, update reference
* reset previous commit
* update LICENSE update README add key-value mode, fix bugs
* add files back
* revert force push
* remove junk file
* add test files
* fix retriever bug, add intent classification
* change conversation chain design
* rewrite prompt and conversation chain
* add ui v1
* ui v1
* fix atavar
* add header
* Refactor the RAG Code and support Pangu
* Refactor the ColossalQA chain to Object-Oriented Programming and the UI demo.
* resolved conversation. tested scripts under examples. web demo still buggy
* fix ci tests
* Some modifications to add ChatGPT api
* modify llm.py and remove unnecessary files
* Delete applications/ColossalQA/examples/ui/test_frontend_input.json
* Remove OpenAI api key
* add colossalqa
* move files
* move files
* move files
* move files
* fix style
* Add Readme and fix some bugs.
* Add something to readme and modify some code
* modify a directory name for clarity
* remove redundant directory
* Correct a type in llm.py
* fix AI prefix
* fix test_memory.py
* fix conversation
* fix some erros and typos
* Fix a missing import in RAG_ChatBot.py
* add colossalcloud LLM wrapper, correct issues in code review
---------
Co-authored-by: YeAnbang <anbangy2@outlook.com>
Co-authored-by: Orion-Zheng <zheng_zian@u.nus.edu>
Co-authored-by: Zian(Andy) Zheng <62330719+Orion-Zheng@users.noreply.github.com>
Co-authored-by: Orion-Zheng <zhengzian@u.nus.edu>
* remove duplicate import (#5100)
* fix typo change lazy_iniy to lazy_init (#5099)
* [nfc] fix typo change directoty to directory (#5111)
* [FEATURE] Add Safety Eval Datasets to ColossalEval (#5095)
* add safetybench and cvalues(responsibility) eval dataset
* Modify code according to review suggestions
---------
Co-authored-by: Orion-Zheng <zhengzian@u.nus.edu>
* [hotfix] fixed memory usage of shardformer module replacement (#5122)
* [shardformer]: support gpt-j, falcon, Mistral and add interleaved pipeline for bert (#5088)
* [shardformer] implement policy for all GPT-J models and test
* [shardformer] support interleaved pipeline parallel for bert finetune
* [shardformer] shardformer support falcon (#4883)
* [shardformer]: fix interleaved pipeline for bert model (#5048)
* [hotfix]: disable seq parallel for gptj and falcon, and polish code (#5093)
* Add Mistral support for Shardformer (#5103)
* [shardformer] add tests to mistral (#5105)
---------
Co-authored-by: Pengtai Xu <henryxu880@gmail.com>
Co-authored-by: ppt0011 <143150326+ppt0011@users.noreply.github.com>
Co-authored-by: flybird11111 <1829166702@qq.com>
Co-authored-by: eric8607242 <e0928021388@gmail.com>
* [doc] add moe news (#5128)
* [doc] add moe news
* [doc] add moe news
* [doc] add moe news
* [doc] updated paper citation (#5131)
* fix typo change JOSNL TO JSONL etc. (#5116)
* [format] applied code formatting on changed files in pull request 5088 (#5127)
Co-authored-by: github-actions <github-actions@github.com>
* [format] applied code formatting on changed files in pull request 5124 (#5125)
Co-authored-by: github-actions <github-actions@github.com>
* [format] applied code formatting on changed files in pull request 5115 (#5118)
Co-authored-by: github-actions <github-actions@github.com>
* [accelerator] init the accelerator module (#5129)
* [accelerator] init the accelerator module
* polish code
* polish code
* polish code
* polish code
* [npu] support triangle attention for llama (#5130)
* update fused attn
* update spda
* tri attn
* update triangle
* import
* fix
* fix
* [plugin]fix 3d checkpoint load when booster boost without optimizer. (#5135)
* fix 3d checkpoint load when booster boost without optimizer
fix 3d checkpoint load when booster boost without optimizer
* test ci
* revert ci
* fix
fix
* [ColossalQA] refactor server and webui & add new feature (#5138)
* refactor server and webui & add new feature
* add requirements
* modify readme and ui
* [doc] fix colossalqa document (#5146)
* fix doc
* modify doc
* fix (#5158)
fix
* [Colossal-Llama-2] Add finetuning Colossal-Llama-2 example (#4878)
* Add finetuning Colossal-Llama-2 example
* Add finetuning Colossal-Llama-2 example 2
* Add finetuning Colossal-Llama-2 example and support NEFTuning
* Add inference example and refine neftune
* Modify readme file
* update the imports
---------
Co-authored-by: Xu Yuanchen <yuanchen.xu00@gmail.com>
Co-authored-by: Camille Zhong <44392324+Camille7777@users.noreply.github.com>
* [gemini] hotfix NaN loss while using Gemini + tensor_parallel (#5150)
* fix
aaa
fix
fix
fix
* fix
* fix
* test ci
* fix ci
fix
* [colossalqa] fix pangu api (#5170)
* fix pangu api
* add comment
* [ColossalEval] Support GSM, Data Leakage Evaluation and Tensor Parallel (#5169)
* Support GSM, Data Leakage Evaluation and Tensor Parallel
* remove redundant code and update inference.py in examples/gpt_evaluation
---------
Co-authored-by: Xu Yuanchen <yuanchen.xu00@gmail.com>
* [shardformer] llama support DistCrossEntropy (#5176)
* fix
aaa
fix
fix
fix
* fix
* fix
* test ci
* fix ci
fix
* llama support dist-cross
fix
fix
fix
fix
fix
fix
fix
fix
* fix
* fix
* fix
fix
* test ci
* test ci
* fix
* [Colossal-Llama-2] Add finetuning Colossal-Llama-2 example (#4878)
* Add finetuning Colossal-Llama-2 example
* Add finetuning Colossal-Llama-2 example 2
* Add finetuning Colossal-Llama-2 example and support NEFTuning
* Add inference example and refine neftune
* Modify readme file
* update the imports
---------
Co-authored-by: Xu Yuanchen <yuanchen.xu00@gmail.com>
Co-authored-by: Camille Zhong <44392324+Camille7777@users.noreply.github.com>
* llama support dist-cross
fix
fix
fix
fix
fix
fix
fix
fix
* fix
* fix
* fix
fix
* test ci
* test ci
* fix
* fix ci
* fix ci
---------
Co-authored-by: Yuanchen <70520919+chengeharrison@users.noreply.github.com>
Co-authored-by: Xu Yuanchen <yuanchen.xu00@gmail.com>
Co-authored-by: Camille Zhong <44392324+Camille7777@users.noreply.github.com>
* Fix ColossalEval (#5186)
Co-authored-by: Xu Yuanchen <yuanchen.xu00@gmail.com>
* [doc] update pytorch version in documents. (#5177)
* fix
aaa
fix
fix
fix
* fix
* fix
* test ci
* fix ci
fix
* update pytorch version in documents
* polish readme in application/chat (#5194)
* [pipeline]: fix p2p comm, add metadata cache and support llama interleaved pp (#5134)
* test: add more p2p tests
* fix: remove send_forward_recv_forward as p2p op list need to use the same group
* fix: make send and receive atomic
* feat: update P2PComm fn
* feat: add metadata cache in 1f1b
* feat: add metadata cache in interleaved pp
* feat: modify is_xx_stage fn
* revert: add _broadcast_object_list
* feat: add interleaved pp in llama policy
* feat: set NCCL_BUFFSIZE in HybridParallelPlugin
* Improve logic for selecting metrics (#5196)
Co-authored-by: Xu <yuanchen.xu00@gmail.com>
* [doc] Update required third-party library list for testing and torch comptibility checking (#5207)
* doc/update requirements-test.txt
* update torch-cuda compatibility check
* support linear accumulation fusion (#5199)
support linear accumulation fusion
support linear accumulation fusion
fix
* [pipeline]: support arbitrary batch size in forward_only mode (#5201)
* fix: remove drop last in val & test dataloader
* feat: add run_forward_only, support arbitrary bs
* chore: modify ci script
* [pipeline]: add p2p fallback order and fix interleaved pp deadlock (#5214)
* fix: add fallback order option and update 1f1b
* fix: fix deadlock comm in interleaved pp
* test: modify p2p test
* [devops] update torch versoin in ci (#5217)
* fix-test (#5210)
fix-test
fix-test
* fix flash attn (#5209)
* [nfc] fix typo colossalai/shardformer/ (#5133)
* [Colossal-LLaMA-2] Release Colossal-LLaMA-2-13b-base model (#5224)
* update readme
* update readme
* update link
* update
* update readme
* update
* update
* update
* update title
* update example
* update example
* fix content
* add conclusion
* add license
* update
* update
* update version
* fix minor
* [doc] Update README.md of Colossal-LLAMA2 (#5233)
* Update README.md
* Update README.md
* [doc] Make leaderboard format more uniform and good-looking (#5231)
* Make leaderboard format more unifeid and good-looking
* Update README.md
* Update README.md
* [doc] add Colossal-LLaMA-2-13B (#5234)
* [doc] add Colossal-LLaMA-2-13B
* [doc] add Colossal-LLaMA-2-13B
* [doc] add Colossal-LLaMA-2-13B
* [format] applied code formatting on changed files in pull request 5234 (#5235)
Co-authored-by: github-actions <github-actions@github.com>
* [doc] SwiftInfer release (#5236)
* [doc] SwiftInfer release
* [doc] SwiftInfer release
* [doc] SwiftInfer release
* [doc] SwiftInfer release
* [doc] SwiftInfer release
* [npu] use extension for op builder (#5172)
* update extension
* update cpu adam
* update is
* add doc for cpu adam
* update kernel
* update commit
* update flash
* update memory efficient
* update flash attn
* update flash attention loader
* update api
* fix
* update doc
* update example time limit
* reverse change
* fix doc
* remove useless kernel
* fix
* not use warning
* update
* update
* [pipeline] A more general _communicate in p2p (#5062)
* A more general _communicate
* feat: finish tree_flatten version p2p
* fix: update p2p api calls
---------
Co-authored-by: Wenhao Chen <cwher@outlook.com>
* [npu] change device to accelerator api (#5239)
* update accelerator
* fix timer
* fix amp
* update
* fix
* update bug
* add error raise
* fix autocast
* fix set device
* remove doc accelerator
* update doc
* update doc
* update doc
* use nullcontext
* update cpu
* update null context
* change time limit for example
* udpate
* update
* update
* update
* [npu] polish accelerator code
---------
Co-authored-by: Xuanlei Zhao <xuanlei.zhao@gmail.com>
Co-authored-by: zxl <43881818+oahzxl@users.noreply.github.com>
* [hotfix] removed unused flag (#5242)
* [doc] fix typo in Colossal-LLaMA-2/README.md (#5247)
* [workflow] fixed build CI (#5240)
* [workflow] fixed build CI
* polish
* polish
* polish
* polish
* polish
* [ci] fixed booster test (#5251)
* [ci] fixed booster test
* [ci] fixed booster test
* [ci] fixed booster test
* [ci] fixed ddp test (#5254)
* [ci] fixed ddp test
* polish
* fix typo in applications/ColossalEval/README.md (#5250)
* [ci] fix shardformer tests. (#5255)
* fix ci
fix
* revert: revert p2p
* feat: add enable_metadata_cache option
* revert: enable t5 tests
---------
Co-authored-by: Wenhao Chen <cwher@outlook.com>
* [doc] fix doc typo (#5256)
* [doc] fix annotation display
* [doc] fix llama2 doc
* [hotfix]: add pp sanity check and fix mbs arg (#5268)
* fix: fix misleading mbs arg
* feat: add pp sanity check
* fix: fix 1f1b sanity check
* [workflow] fixed incomplete bash command (#5272)
* [workflow] fixed oom tests (#5275)
* [workflow] fixed oom tests
* polish
* polish
* polish
* [ci] fix test_hybrid_parallel_plugin_checkpoint_io.py (#5276)
* fix ci
fix
* fix test
* revert: revert p2p
* feat: add enable_metadata_cache option
* revert: enable t5 tests
* fix
---------
Co-authored-by: Wenhao Chen <cwher@outlook.com>
* [shardformer] hybridparallelplugin support gradients accumulation. (#5246)
* support gradients acc
fix
fix
fix
fix
fix
fix
fix
fix
fix
fix
fix
fix
fix
* fix
fix
* fix
fix
fix
* [hotfix] Fix ShardFormer test execution path when using sequence parallelism (#5230)
* fix auto loading gpt2 tokenizer (#5279)
* [doc] add llama2-13B disyplay (#5285)
* Update README.md
* fix 13b typo
---------
Co-authored-by: binmakeswell <binmakeswell@gmail.com>
* fix llama pretrain (#5287)
* [hotfix] fix 3d plugin test (#5292)
* fix bug for mefture (#5299)
* [NFC] polish applications/Colossal-LLaMA-2/colossal_llama2/tokenizer/init_tokenizer.py code style (#5228)
* fix some typo (#5307)
* [feat] refactored extension module (#5298)
* [feat] refactored extension module
* polish
* polish
* polish
* polish
* polish
* polish
* polish
* polish
* polish
* polish
* [workflow] updated CI image (#5318)
* [accelerator] fixed npu api
* [tests] fix t5 test. (#5322)
* [ci] fix shardformer tests. (#5255)
* fix ci
fix
* revert: revert p2p
* feat: add enable_metadata_cache option
* revert: enable t5 tests
---------
Co-authored-by: Wenhao Chen <cwher@outlook.com>
* fix t5 test
---------
Co-authored-by: Wenhao Chen <cwher@outlook.com>
* [doc] added docs for extensions (#5324)
* [doc] added docs for extensions
* polish
* polish
* fix typo under extensions/ (#5330)
* fix typo change dosen't to doesn't (#5308)
* [extension] fixed exception catch (#5342)
* [Chat] fix sft loss nan (#5345)
* fix script
* fix script
* fix chat nan
* fix chat nan
* [checkpointio] fix gemini and hybrid parallel optim checkpoint (#5347)
* [checkpointio] fix hybrid parallel optim checkpoint
* [extension] fix cuda extension
* [checkpointio] fix gemini optimizer checkpoint
* polish code
* [fix] remove unnecessary dp_size assert (#5351)
* fix: remove unnecessary assert
* test: add more 3d plugin tests
* fix: add warning
* [gemini] fix param op hook when output is tuple (#5355)
* [gemini] fix param op hook when output is tuple
* [gemini] fix param op hook
* [llama] fix dataloader for hybrid parallel (#5358)
* [plugin] refactor prepare dataloader
* [plugin] update train script
* [llama] update training script (#5360)
* [llama] update training script
* [doc] polish docstr
* [llama] add flash attn patch for npu (#5362)
* [llama] fix neftune & pbar with start_step (#5364)
* [eval] update llama npu eval (#5366)
* [llama] polish training script and fix optim ckpt (#5368)
* [lr-scheduler] fix load state dict and add test (#5369)
* [llama] fix memory issue (#5371)
* [llama] fix memory issue
* [llama] add comment
* [moe] init mixtral impl
* [moe] update capacity computing (#5253)
* [moe] top2 allow uneven input
* [moe] update capacity computing
* [moe] remove debug info
* [moe] update capacity computing
* [moe] update capacity computing
* [moe] support mixtral (#5309)
* [moe] add mixtral block for single expert
* [moe] mixtral block fwd support uneven ep
* [moe] mixtral block bwd support uneven ep
* [moe] add mixtral moe layer
* [moe] simplify replace
* [meo] support save sharded mixtral
* [meo] support load sharded mixtral
* [meo] support save sharded optim
* [meo] integrate moe manager into plug
* [meo] fix optimizer load
* [meo] fix mixtral layer
* [moe] fix mixtral checkpoint io (#5314)
* [moe] fix mixtral forward default value (#5329)
* [moe] fix mixtral optim checkpoint (#5344)
* [moe] fix tests
* [release] update version (#5380)
* [llama] fix training and inference scripts (#5384)
* [llama] refactor inference example to fit sft
* [llama] fix training script to fit gemini
* [llama] fix inference script
* [doc] Fix typo (#5361)
* [doc] updated installation command (#5389)
* [hotfix] fix variable type for top_p (#5313)
Co-authored-by: binmakeswell <binmakeswell@gmail.com>
* [hotfix] Fix wrong import in meta_registry (#5392)
* [extension] hotfix jit extension setup (#5402)
* [example] reuse flash attn patch (#5400)
* [fsdp] impl save/load shard model/optimizer (#5357)
* [setup] fixed nightly release (#5388)
* [shardformer]gather llama logits (#5398)
* gather llama logits
* fix
* update requirements (#5407)
* [workflow] added pypi channel (#5412)
* [doc] fix blog link
* [doc] fix blog link
* fix sft single turn inference example (#5416)
* [example]add gpt2 benchmark example script. (#5295)
* benchmark gpt2
* fix
fix
fix
fix
* [doc] fix typo in Colossal-LLaMA-2/README.md (#5247)
* [workflow] fixed build CI (#5240)
* [workflow] fixed build CI
* polish
* polish
* polish
* polish
* polish
* [ci] fixed booster test (#5251)
* [ci] fixed booster test
* [ci] fixed booster test
* [ci] fixed booster test
* [ci] fixed ddp test (#5254)
* [ci] fixed ddp test
* polish
* fix typo in applications/ColossalEval/README.md (#5250)
* [ci] fix shardformer tests. (#5255)
* fix ci
fix
* revert: revert p2p
* feat: add enable_metadata_cache option
* revert: enable t5 tests
---------
Co-authored-by: Wenhao Chen <cwher@outlook.com>
* [doc] fix doc typo (#5256)
* [doc] fix annotation display
* [doc] fix llama2 doc
* [hotfix]: add pp sanity check and fix mbs arg (#5268)
* fix: fix misleading mbs arg
* feat: add pp sanity check
* fix: fix 1f1b sanity check
* [workflow] fixed incomplete bash command (#5272)
* [workflow] fixed oom tests (#5275)
* [workflow] fixed oom tests
* polish
* polish
* polish
* [ci] fix test_hybrid_parallel_plugin_checkpoint_io.py (#5276)
* fix ci
fix
* fix test
* revert: revert p2p
* feat: add enable_metadata_cache option
* revert: enable t5 tests
* fix
---------
Co-authored-by: Wenhao Chen <cwher@outlook.com>
* [shardformer] hybridparallelplugin support gradients accumulation. (#5246)
* support gradients acc
fix
fix
fix
fix
fix
fix
fix
fix
fix
fix
fix
fix
fix
* fix
fix
* fix
fix
fix
* [hotfix] Fix ShardFormer test execution path when using sequence parallelism (#5230)
* fix auto loading gpt2 tokenizer (#5279)
* [doc] add llama2-13B disyplay (#5285)
* Update README.md
* fix 13b typo
---------
Co-authored-by: binmakeswell <binmakeswell@gmail.com>
* fix llama pretrain (#5287)
* fix
* fix
* fix
fix
* fix
fix
fix
* fix
fix
* benchmark gpt2
* fix
fix
fix
fix
* [workflow] fixed build CI (#5240)
* [workflow] fixed build CI
* polish
* polish
* polish
* polish
* polish
* [ci] fixed booster test (#5251)
* [ci] fixed booster test
* [ci] fixed booster test
* [ci] fixed booster test
* fix
fix
* fix
fix
fix
* fix
* fix
fix
fix
fix
fix
* fix
* Update shardformer.py
---------
Co-authored-by: digger yu <digger-yu@outlook.com>
Co-authored-by: Frank Lee <somerlee.9@gmail.com>
Co-authored-by: Wenhao Chen <cwher@outlook.com>
Co-authored-by: binmakeswell <binmakeswell@gmail.com>
Co-authored-by: Zhongkai Zhao <kanezz620@gmail.com>
Co-authored-by: Michelle <97082656+MichelleMa8@users.noreply.github.com>
Co-authored-by: Desperado-Jia <502205863@qq.com>
* [doc] sora release (#5425)
* [doc] sora release
* [doc] sora release
* [doc] sora release
* [doc] sora release
* [devops] fix extention building (#5427)
* [hotfix] fix sd vit import error (#5420)
* fix import error
* Update dpt_depth.py
---------
Co-authored-by: binmakeswell <binmakeswell@gmail.com>
* [hotfix] fix typo of openmoe model source (#5403)
* [doc] update some translations with README-zh-Hans.md (#5382)
* [hotfix] fix typo change _descrption to _description (#5331)
* [hotfix] fix typo change enabel to enable under colossalai/shardformer/ (#5317)
* [eval-hotfix] set few_shot_data to None when few shot is disabled (#5422)
* [hotfix] fix typo change MoECheckpintIO to MoECheckpointIO (#5335)
Co-authored-by: binmakeswell <binmakeswell@gmail.com>
* [doc] Fix typo s/infered/inferred/ (#5288)
Signed-off-by: hugo-syn <hugo.vincent@synacktiv.com>
* [hotfix] fix stable diffusion inference bug. (#5289)
* Update train_ddp.yaml
delete "strategy" to fix DDP config loading bug in "main.py"
* Update train_ddp.yaml
fix inference with scripts/txt2img.py config file load bug.
* Update README.md
add pretrain model test code.
* [colossal-llama2] add stream chat examlple for chat version model (#5428)
* add stream chat for chat version
* remove os.system clear
* modify function name
* [release] update version (#5411)
* fix tensor data update for gemini loss caluculation (#5442)
* [hotfix] fix typo s/keywrods/keywords etc. (#5429)
* [devops] fix compatibility (#5444)
* [devops] fix compatibility
* [hotfix] update compatibility test on pr
* [devops] fix compatibility
* [devops] record duration during comp test
* [test] decrease test duration
* fix falcon
* [shardformer] fix gathering output when using tensor parallelism (#5431)
* fix
* padding vocab_size when using pipeline parallellism
padding vocab_size when using pipeline parallellism
fix
fix
* fix
* fix
fix
fix
* fix gather output
* fix
* fix
* fix
fix resize embedding
fix resize embedding
* fix resize embedding
fix
* revert
* revert
* revert
* [doc] release Open-Sora 1.0 with model weights (#5468)
* [doc] release Open-Sora 1.0 with model weights
* [doc] release Open-Sora 1.0 with model weights
* [doc] release Open-Sora 1.0 with model weights
* [doc] update open-sora demo (#5479)
* [doc] update open-sora demo
* [doc] update open-sora demo
* [doc] update open-sora demo
* [example] add grok-1 inference (#5485)
* [misc] add submodule
* remove submodule
* [example] support grok-1 tp inference
* [example] add grok-1 inference script
* [example] refactor code
* [example] add grok-1 readme
* [exmaple] add test ci
* [exmaple] update readme
* [release] grok-1 314b inference (#5490)
* [release] grok-1 inference
* [release] grok-1 inference
* [release] grok-1 inference
* [example] update Grok-1 inference (#5495)
* revise grok-1 example
* remove unused arg in scripts
* prevent re-installing torch
* update readme
* revert modifying colossalai requirements
* add perf
* trivial
* add tokenizer url
* [hotfix] set return_outputs=False in examples and polish code (#5404)
* fix: simplify merge_batch
* fix: use return_outputs=False to eliminate extra memory consumption
* feat: add return_outputs warning
* style: remove `return_outputs=False` as it is the default value
* [release] grok-1 inference benchmark (#5500)
* [release] grok-1 inference benchmark
* [release] grok-1 inference benchmark
* [release] grok-1 inference benchmark
* [release] grok-1 inference benchmark
* [release] grok-1 inference benchmark
* [shardformer]Fix lm parallel. (#5480)
* fix
* padding vocab_size when using pipeline parallellism
padding vocab_size when using pipeline parallellism
fix
fix
* fix
* fix
fix
fix
* fix gather output
* fix
* fix
* fix
fix resize embedding
fix resize embedding
* fix resize embedding
fix
* revert
* revert
* revert
* fix lm forward distribution
* fix
* test ci
* fix
* [fix] fix grok-1 example typo (#5506)
* [devops] fix example test ci (#5504)
* Fix ColoTensorSpec for py11 (#5440)
* fixed layout converter caching and updated tester
* Empty-Commit
* [shardformer] update colo attention to support custom mask (#5510)
* [feature] refactor colo attention (#5462)
* [extension] update api
* [feature] add colo attention
* [feature] update sdpa
* [feature] update npu attention
* [feature] update flash-attn
* [test] add flash attn test
* [test] update flash attn test
* [shardformer] update modeling to fit colo attention (#5465)
* [misc] refactor folder structure
* [shardformer] update llama flash-attn
* [shardformer] fix llama policy
* [devops] update tensornvme install
* [test] update llama test
* [shardformer] update colo attn kernel dispatch
* [shardformer] update blip2
* [shardformer] update chatglm
* [shardformer] update gpt2
* [shardformer] update gptj
* [shardformer] update opt
* [shardformer] update vit
* [shardformer] update colo attention mask prep
* [shardformer] update whisper
* [test] fix shardformer tests (#5514)
* [test] fix shardformer tests
* [test] fix shardformer tests
* [format] applied code formatting on changed files in pull request 5510 (#5517)
Co-authored-by: github-actions <github-actions@github.com>
* [shardformer] fix pipeline forward error if custom layer distribution is used (#5189)
* Use self.[distribute_layers|get_stage_index] to exploit custom layer distribution
* Change static methods for t5 layer distribution to member functions
* Change static methods for whisper layer distribution to member functions
* Replace whisper policy usage with self one
* Fix test case to use non-static layer distribution methods
* fix: fix typo
---------
Co-authored-by: Wenhao Chen <cwher@outlook.com>
* [Fix] Grok-1 use tokenizer from the same pretrained path (#5532)
* [fix] use tokenizer from the same pretrained path
* trust remote code
* [ColossalChat] Update RLHF V2 (#5286)
* Add dpo. Fix sft, ppo, lora. Refactor all
* fix and tested ppo
* 2 nd round refactor
* add ci tests
* fix ci
* fix ci
* fix readme, style
* fix readme style
* fix style, fix benchmark
* reproduce benchmark result, remove useless files
* rename to ColossalChat
* use new image
* fix ci workflow
* fix ci
* use local model/tokenizer for ci tests
* fix ci
* fix ci
* fix ci
* fix ci timeout
* fix rm progress bar. fix ci timeout
* fix ci
* fix ci typo
* remove 3d plugin from ci temporary
* test environment
* cannot save optimizer
* support chat template
* fix readme
* fix path
* test ci locally
* restore build_or_pr
* fix ci data path
* fix benchmark
* fix ci, move ci tests to 3080, disable fast tokenizer
* move ci to 85
* support flash attention 2
* add all-in-one data preparation script. Fix colossal-llama2-chat chat template
* add hardware requirements
* move ci test data
* fix save_model, add unwrap
* fix missing bos
* fix missing bos; support grad accumulation with gemini
* fix ci
* fix ci
* fix ci
* fix llama2 chat template config
* debug sft
* debug sft
* fix colossalai version requirement
* fix ci
* add sanity check to prevent NaN loss
* fix requirements
* add dummy data generation script
* add dummy data generation script
* add dummy data generation script
* add dummy data generation script
* update readme
* update readme
* update readme and ignore
* fix logger bug
* support parallel_output
* modify data preparation logic
* fix tokenization
* update lr
* fix inference
* run pre-commit
---------
Co-authored-by: Tong Li <tong.li352711588@gmail.com>
* [shardformer, pipeline] add `gradient_checkpointing_ratio` and heterogenous shard policy for llama (#5508)
* feat: add `GradientCheckpointConfig` and `PipelineGradientCheckpointConfig`
* feat: apply `GradientCheckpointConfig` to policy and llama_forward
* feat: move `distribute_layer` and `get_stage_index` to PipelineStageManager
* fix: add optional args for `distribute_layer` and `get_stage_index`
* fix: fix changed API calls
* test: update llama tests
* style: polish `GradientCheckpointConfig`
* fix: fix pipeline utils tests
* fix incorrect sharding without zero (#5545)
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [shardformer] Sequence Parallelism Optimization (#5533)
* sequence parallel optimization
* validate sequence parallel in llama (code to be polished)
* shardformer api writing
* integrate sequence parallel in ShardFormer
* fix pp bugs and sp bugs for LlaMa model
* integrating ring-based sequence parallelism into ShardFormer
* [sequence parallelism]: Add fused megatron function
* integrating ring-based sequence parallelism into ShardFormer
---------
Co-authored-by: linsj20 <linsj20@mails.tsinghua.edu.cn>
* fix bugs when useing sp and flashattention together
* fix operation function name
* support flash attention for ulysses-style sp
* clarify sp process group
* fix compatibility bugs in moe plugin
* fix fused linear bugs
* fix linear layer test
* support gpt model all-to-all sp
* modify shard data dimension (meant to be dim=-1)
* support megtron-style sp and distributed attn for llama model
* [shardformer] add megatron sp to llama
* support llama7B 128k with distributed attention
* [shardformer] robustness enhancement
* add block attn
* sp mode 1: keep input as a complete sequence
* fix sp compatability
* finish sp mode 3 support for gpt
* using all_to_all_single when batch size is 1
* support mode 2 sp in gpt2 (#5)
* [shardformer] add megatron sp to llama
* support llama7B 128k with distributed attention
* [shardformer] robustness enhancement
* add block attn
* sp mode 1: keep input as a complete sequence
* fix sp compatability
* refactor ring implementation
* support mode 2 sp in gpt2
* polish code
* enable distributed attn mask when using sp mode 2 and 3 in llama
* automatically enable flash attn when using sp mode 2 and 3 in llama
* inplace attn mask
* add zero2 support for sequence parallel
* polish code
* fix bugs
* fix gemini checkpoint io
* loose tensor checking atol and rtol
* add comment
* fix llama layernorm grad
* fix zero grad
* fix zero grad
* fix conflict
* update split and gather auto grad func
* sequence parallel: inside text split (#6)
* polish code (part 1)
* polish code (part 2)
* polish code (part 2.5)
* polish code (part 3)
* sequence parallel: inside text split
* miscellaneous minor fixes
* polish code
* fix ulysses style ZeRO
* sequence parallel: inside text split
* miscellaneous minor fixes
* disaggregate sp group and dp group for sp
* fix llama and gpt sp
* polish code
* move ulysses grad sync to ddp (#9)
* remove zero_stage and unbind the grad sync for alltoall sp
* add 2d group creation test
* move ulysses grad sync to ddp
* add 2d group creation test
* remove useless code
* change shard config not to enable sp when enable_all_optimizations
* add sp warnings for several model
* remove useless code
---------
Co-authored-by: linsj20 <linsj20@mails.tsinghua.edu.cn>
* [hotfix] quick fixes to make legacy tutorials runnable (#5559)
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [fix] fix typo s/muiti-node /multi-node etc. (#5448)
* [hotfix] fix typo s/get_defualt_parser /get_default_parser (#5548)
* [devops] remove post commit ci (#5566)
* [devops] remove post commit ci
* [misc] run pre-commit on all files
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [doc] fix ColossalMoE readme (#5599)
* fix readme
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [zero] support multiple (partial) backward passes (#5596)
* [zero] support multiple (partial) backward passes
* [misc] update requirements
* [shardformer] refactor embedding resize (#5603)
* [branch rebase] rebase main to Feature/resize_embedding (#5554)
* fix
* [release] update version (#5411)
* [hotfix] fix typo s/keywrods/keywords etc. (#5429)
* [devops] fix compatibility (#5444)
* [devops] fix compatibility
* [hotfix] update compatibility test on pr
* [devops] fix compatibility
* [devops] record duration during comp test
* [test] decrease test duration
* fix falcon
* [shardformer] fix gathering output when using tensor parallelism (#5431)
* fix
* padding vocab_size when using pipeline parallellism
padding vocab_size when using pipeline parallellism
fix
fix
* fix
* fix
fix
fix
* fix gather output
* fix
* fix
* fix
fix resize embedding
fix resize embedding
* fix resize embedding
fix
* revert
* revert
* revert
* [doc] release Open-Sora 1.0 with model weights (#5468)
* [doc] release Open-Sora 1.0 with model weights
* [doc] release Open-Sora 1.0 with model weights
* [doc] release Open-Sora 1.0 with model weights
* [doc] update open-sora demo (#5479)
* [doc] update open-sora demo
* [doc] update open-sora demo
* [doc] update open-sora demo
* [example] add grok-1 inference (#5485)
* [misc] add submodule
* remove submodule
* [example] support grok-1 tp inference
* [example] add grok-1 inference script
* [example] refactor code
* [example] add grok-1 readme
* [exmaple] add test ci
* [exmaple] update readme
---------
Co-authored-by: Hongxin Liu <lhx0217@gmail.com>
Co-authored-by: digger yu <digger-yu@outlook.com>
Co-authored-by: binmakeswell <binmakeswell@gmail.com>
* [CI] run pre-commit (#5577)
* fix
* [release] update version (#5411)
* [hotfix] fix typo s/keywrods/keywords etc. (#5429)
* [devops] fix compatibility (#5444)
* [devops] fix compatibility
* [hotfix] update compatibility test on pr
* [devops] fix compatibility
* [devops] record duration during comp test
* [test] decrease test duration
* fix falcon
* [shardformer] fix gathering output when using tensor parallelism (#5431)
* fix
* padding vocab_size when using pipeline parallellism
padding vocab_size when using pipeline parallellism
fix
fix
* fix
* fix
fix
fix
* fix gather output
* fix
* fix
* fix
fix resize embedding
fix resize embedding
* fix resize embedding
fix
* revert
* revert
* revert
* [doc] release Open-Sora 1.0 with model weights (#5468)
* [doc] release Open-Sora 1.0 with model weights
* [doc] release Open-Sora 1.0 with model weights
* [doc] release Open-Sora 1.0 with model weights
* [doc] update open-sora demo (#5479)
* [doc] update open-sora demo
* [doc] update open-sora demo
* [doc] update open-sora demo
* [example] add grok-1 inference (#5485)
* [misc] add submodule
* remove submodule
* [example] support grok-1 tp inference
* [example] add grok-1 inference script
* [example] refactor code
* [example] add grok-1 readme
* [exmaple] add test ci
* [exmaple] update readme
* run pre-commit
---------
Co-authored-by: Hongxin Liu <lhx0217@gmail.com>
Co-authored-by: digger yu <digger-yu@outlook.com>
Co-authored-by: binmakeswell <binmakeswell@gmail.com>
* [rebase] rebase main to resize-embedding (#5581)
* [release] grok-1 314b inference (#5490)
* [release] grok-1 inference
* [release] grok-1 inference
* [release] grok-1 inference
* [example] update Grok-1 inference (#5495)
* revise grok-1 example
* remove unused arg in scripts
* prevent re-installing torch
* update readme
* revert modifying colossalai requirements
* add perf
* trivial
* add tokenizer url
* [hotfix] set return_outputs=False in examples and polish code (#5404)
* fix: simplify merge_batch
* fix: use return_outputs=False to eliminate extra memory consumption
* feat: add return_outputs warning
* style: remove `return_outputs=False` as it is the default value
* [release] grok-1 inference benchmark (#5500)
* [release] grok-1 inference benchmark
* [release] grok-1 inference benchmark
* [release] grok-1 inference benchmark
* [release] grok-1 inference benchmark
* [release] grok-1 inference benchmark
* [shardformer]Fix lm parallel. (#5480)
* fix
* padding vocab_size when using pipeline parallellism
padding vocab_size when using pipeline parallellism
fix
fix
* fix
* fix
fix
fix
* fix gather output
* fix
* fix
* fix
fix resize embedding
fix resize embedding
* fix resize embedding
fix
* revert
* revert
* revert
* fix lm forward distribution
* fix
* test ci
* fix
* [fix] fix grok-1 example typo (#5506)
* [devops] fix example test ci (#5504)
* Fix ColoTensorSpec for py11 (#5440)
* fixed layout converter caching and updated tester
* Empty-Commit
* [shardformer] update colo attention to support custom mask (#5510)
* [feature] refactor colo attention (#5462)
* [extension] update api
* [feature] add colo attention
* [feature] update sdpa
* [feature] update npu attention
* [feature] update flash-attn
* [test] add flash attn test
* [test] update flash attn test
* [shardformer] update modeling to fit colo attention (#5465)
* [misc] refactor folder structure
* [shardformer] update llama flash-attn
* [shardformer] fix llama policy
* [devops] update tensornvme install
* [test] update llama test
* [shardformer] update colo attn kernel dispatch
* [shardformer] update blip2
* [shardformer] update chatglm
* [shardformer] update gpt2
* [shardformer] update gptj
* [shardformer] update opt
* [shardformer] update vit
* [shardformer] update colo attention mask prep
* [shardformer] update whisper
* [test] fix shardformer tests (#5514)
* [test] fix shardformer tests
* [test] fix shardformer tests
* [format] applied code formatting on changed files in pull request 5510 (#5517)
Co-authored-by: github-actions <github-actions@github.com>
* [shardformer] fix pipeline forward error if custom layer distribution is used (#5189)
* Use self.[distribute_layers|get_stage_index] to exploit custom layer distribution
* Change static methods for t5 layer distribution to member functions
* Change static methods for whisper layer distribution to member functions
* Replace whisper policy usage with self one
* Fix test case to use non-static layer distribution methods
* fix: fix typo
---------
Co-authored-by: Wenhao Chen <cwher@outlook.com>
* [Fix] Grok-1 use tokenizer from the same pretrained path (#5532)
* [fix] use tokenizer from the same pretrained path
* trust remote code
* [ColossalChat] Update RLHF V2 (#5286)
* Add dpo. Fix sft, ppo, lora. Refactor all
* fix and tested ppo
* 2 nd round refactor
* add ci tests
* fix ci
* fix ci
* fix readme, style
* fix readme style
* fix style, fix benchmark
* reproduce benchmark result, remove useless files
* rename to ColossalChat
* use new image
* fix ci workflow
* fix ci
* use local model/tokenizer for ci tests
* fix ci
* fix ci
* fix ci
* fix ci timeout
* fix rm progress bar. fix ci timeout
* fix ci
* fix ci typo
* remove 3d plugin from ci temporary
* test environment
* cannot save optimizer
* support chat template
* fix readme
* fix path
* test ci locally
* restore build_or_pr
* fix ci data path
* fix benchmark
* fix ci, move ci tests to 3080, disable fast tokenizer
* move ci to 85
* support flash attention 2
* add all-in-one data preparation script. Fix colossal-llama2-chat chat template
* add hardware requirements
* move ci test data
* fix save_model, add unwrap
* fix missing bos
* fix missing bos; support grad accumulation with gemini
* fix ci
* fix ci
* fix ci
* fix llama2 chat template config
* debug sft
* debug sft
* fix colossalai version requirement
* fix ci
* add sanity check to prevent NaN loss
* fix requirements
* add dummy data generation script
* add dummy data generation script
* add dummy data generation script
* add dummy data generation script
* update readme
* update readme
* update readme and ignore
* fix logger bug
* support parallel_output
* modify data preparation logic
* fix tokenization
* update lr
* fix inference
* run pre-commit
---------
Co-authored-by: Tong Li <tong.li352711588@gmail.com>
* [shardformer, pipeline] add `gradient_checkpointing_ratio` and heterogenous shard policy for llama (#5508)
* feat: add `GradientCheckpointConfig` and `PipelineGradientCheckpointConfig`
* feat: apply `GradientCheckpointConfig` to policy and llama_forward
* feat: move `distribute_layer` and `get_stage_index` to PipelineStageManager
* fix: add optional args for `distribute_layer` and `get_stage_index`
* fix: fix changed API calls
* test: update llama tests
* style: polish `GradientCheckpointConfig`
* fix: fix pipeline utils tests
* fix incorrect sharding without zero (#5545)
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [shardformer] Sequence Parallelism Optimization (#5533)
* sequence parallel optimization
* validate sequence parallel in llama (code to be polished)
* shardformer api writing
* integrate sequence parallel in ShardFormer
* fix pp bugs and sp bugs for LlaMa model
* integrating ring-based sequence parallelism into ShardFormer
* [sequence parallelism]: Add fused megatron function
* integrating ring-based sequence parallelism into ShardFormer
---------
Co-authored-by: linsj20 <linsj20@mails.tsinghua.edu.cn>
* fix bugs when useing sp and flashattention together
* fix operation function name
* support flash attention for ulysses-style sp
* clarify sp process group
* fix compatibility bugs in moe plugin
* fix fused linear bugs
* fix linear layer test
* support gpt model all-to-all sp
* modify shard data dimension (meant to be dim=-1)
* support megtron-style sp and distributed attn for llama model
* [shardformer] add megatron sp to llama
* support llama7B 128k with distributed attention
* [shardformer] robustness enhancement
* add block attn
* sp mode 1: keep input as a complete sequence
* fix sp compatability
* finish sp mode 3 support for gpt
* using all_to_all_single when batch size is 1
* support mode 2 sp in gpt2 (#5)
* [shardformer] add megatron sp to llama
* support llama7B 128k with distributed attention
* [shardformer] robustness enhancement
* add block attn
* sp mode 1: keep input as a complete sequence
* fix sp compatability
* refactor ring implementation
* support mode 2 sp in gpt2
* polish code
* enable distributed attn mask when using sp mode 2 and 3 in llama
* automatically enable flash attn when using sp mode 2 and 3 in llama
* inplace attn mask
* add zero2 support for sequence parallel
* polish code
* fix bugs
* fix gemini checkpoint io
* loose tensor checking atol and rtol
* add comment
* fix llama layernorm grad
* fix zero grad
* fix zero grad
* fix conflict
* update split and gather auto grad func
* sequence parallel: inside text split (#6)
* polish code (part 1)
* polish code (part 2)
* polish code (part 2.5)
* polish code (part 3)
* sequence parallel: inside text split
* miscellaneous minor fixes
* polish code
* fix ulysses style ZeRO
* sequence parallel: inside text split
* miscellaneous minor fixes
* disaggregate sp group and dp group for sp
* fix llama and gpt sp
* polish code
* move ulysses grad sync to ddp (#9)
* remove zero_stage and unbind the grad sync for alltoall sp
* add 2d group creation test
* move ulysses grad sync to ddp
* add 2d group creation test
* remove useless code
* change shard config not to enable sp when enable_all_optimizations
* add sp warnings for several model
* remove useless code
---------
Co-authored-by: linsj20 <linsj20@mails.tsinghua.edu.cn>
* [hotfix] quick fixes to make legacy tutorials runnable (#5559)
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [fix] fix typo s/muiti-node /multi-node etc. (#5448)
* [hotfix] fix typo s/get_defualt_parser /get_default_parser (#5548)
* [devops] remove post commit ci (#5566)
* [devops] remove post commit ci
* [misc] run pre-commit on all files
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---------
Co-authored-by: binmakeswell <binmakeswell@gmail.com>
Co-authored-by: Yuanheng Zhao <54058983+yuanheng-zhao@users.noreply.github.com>
Co-authored-by: Wenhao Chen <cwher@outlook.com>
Co-authored-by: Hongxin Liu <lhx0217@gmail.com>
Co-authored-by: Rocky Duan <dementrock@users.noreply.github.com>
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: Edenzzzz <wenxuan.tan@wisc.edu>
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: github-actions <github-actions@github.com>
Co-authored-by: Insu Jang <insujang@umich.edu>
Co-authored-by: YeAnbang <44796419+YeAnbang@users.noreply.github.com>
Co-authored-by: Tong Li <tong.li352711588@gmail.com>
Co-authored-by: Zhongkai Zhao <kanezz620@gmail.com>
Co-authored-by: linsj20 <linsj20@mails.tsinghua.edu.cn>
Co-authored-by: digger yu <digger-yu@outlook.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [shardformer]enable padding vocabulary size. (#5489)
* padding vocab_size when using pipeline parallellism
padding vocab_size when using pipeline parallellism
fix
fix
* fix
* fix
fix
fix
* fix gather output
* fix
* fix
* fix
fix resize embedding
fix resize embedding
* fix resize embedding
fix
* revert
* revert
* revert
* padding vocab
* padding vocabe
* fix
* fix
* fxi
* test ci
* fix
fix
fix
fix
* fix
fix
* fix
* fix
* Update hybrid_parallel_plugin.py
fix
fix
fix
* fix
fix
* fix
fix
* fix
* resolve super init
resolve super init
resolve super init
resolve super init
* resolve comments
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* vocab checkpointio
* padding vocab_size when using pipeline parallellism
padding vocab_size when using pipeline parallellism
fix
fix
* fix
fix
fix
* fix
* fix
fix resize embedding
fix resize embedding
* fix resize embedding
fix
* revert
* revert
* padding vocab
* fix
* fix
fix
* fix
fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* cherry-pick
* revert moe modify
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
fix
fix
fix
fix
fix
fix
fix
* resolve comments
resolve comments
resolve comments
resolve comments
resolve comments
* ptensor
ptensor
resolve comments
fix
fix
fix
fix
fix
resolve comments
resolve comments
resolve comments
resolve comments
resolve comments
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Hongxin Liu <lhx0217@gmail.com>
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix rebase
* fix rebase
---------
Co-authored-by: Hongxin Liu <lhx0217@gmail.com>
Co-authored-by: digger yu <digger-yu@outlook.com>
Co-authored-by: binmakeswell <binmakeswell@gmail.com>
Co-authored-by: Yuanheng Zhao <54058983+yuanheng-zhao@users.noreply.github.com>
Co-authored-by: Wenhao Chen <cwher@outlook.com>
Co-authored-by: Rocky Duan <dementrock@users.noreply.github.com>
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: Edenzzzz <wenxuan.tan@wisc.edu>
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: github-actions <github-actions@github.com>
Co-authored-by: Insu Jang <insujang@umich.edu>
Co-authored-by: YeAnbang <44796419+YeAnbang@users.noreply.github.com>
Co-authored-by: Tong Li <tong.li352711588@gmail.com>
Co-authored-by: Zhongkai Zhao <kanezz620@gmail.com>
Co-authored-by: linsj20 <linsj20@mails.tsinghua.edu.cn>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [hotfix] Fix examples no pad token & auto parallel codegen bug; (#5606)
* fix no pad token bug
* fixed some auto parallel codegen bug, but might not run on torch 2.1
---------
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [shardformer] fix pipeline grad ckpt (#5620)
* [shardformer] fix pipeline grad ckpt
* [lora] add lora APIs for booster, support lora for TorchDDP (#4981)
* add apis and peft requirement
* add liscense and implement apis
* add checkpointio apis
* add torchddp fwd_bwd test
* add support_lora methods
* add checkpointio test and debug
* delete unneeded codes
* remove peft from LICENSE
* add concrete methods for enable_lora
* simplify enable_lora api
* fix requirements
* [LowLevelZero] low level zero support lora (#5153)
* low level zero support lora
low level zero support lora
* add checkpoint test
* add checkpoint test
* fix
* fix
* fix
* fix
fix
fix
fix
* fix
* fix
fix
fix
fix
fix
fix
fix
* fix
* fix
fix
fix
fix
fix
fix
fix
* fix
* test ci
* git # This is a combination of 3 commits.
Update low_level_zero_plugin.py
Update low_level_zero_plugin.py
fix
fix
fix
* fix naming
fix naming
fix naming
fix
* [feature] qlora support
* qlora follow commit
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* migrate qutization folder to colossalai/
* minor fixes
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* gptj sp fix
* remove redundancies from pre-commit
* minor fixes
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Signed-off-by: hugo-syn <hugo.vincent@synacktiv.com>
Co-authored-by: Jianghai <72591262+CjhHa1@users.noreply.github.com>
Co-authored-by: Bin Jia <45593998+FoolPlayer@users.noreply.github.com>
Co-authored-by: アマデウス <kurisusnowdeng@users.noreply.github.com>
Co-authored-by: yuehuayingxueluo <867460659@qq.com>
Co-authored-by: Cuiqing Li <lixx3527@gmail.com>
Co-authored-by: cuiqing.li <lixx336@gmail.com>
Co-authored-by: Yuanchen <70520919+chengeharrison@users.noreply.github.com>
Co-authored-by: Xu Yuanchen <yuanchen.xu00@gmail.com>
Co-authored-by: littsk <1214689160@qq.com>
Co-authored-by: Baizhou Zhang <eddiezhang@pku.edu.cn>
Co-authored-by: ppt0011 <143150326+ppt0011@users.noreply.github.com>
Co-authored-by: Hongxin Liu <lhx0217@gmail.com>
Co-authored-by: Xuanlei Zhao <43881818+oahzxl@users.noreply.github.com>
Co-authored-by: Zhongkai Zhao <kanezz620@gmail.com>
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: github-actions <github-actions@github.com>
Co-authored-by: Wenhao Chen <cwher@outlook.com>
Co-authored-by: Jun Gao <imgaojun@gmail.com>
Co-authored-by: flybird11111 <1829166702@qq.com>
Co-authored-by: Xu Kai <xukai16@foxmail.com>
Co-authored-by: Zian(Andy) Zheng <62330719+Orion-Zheng@users.noreply.github.com>
Co-authored-by: digger yu <digger-yu@outlook.com>
Co-authored-by: CjhHa1 <cjh18671720497@outlook.com>
Co-authored-by: Xu Kai <xukai16@foxamil.com>
Co-authored-by: Orion-Zheng <zheng_zian@u.nus.edu>
Co-authored-by: Elsa Granger <zeyugao@outlook.com>
Co-authored-by: YeAnbang <44796419+YeAnbang@users.noreply.github.com>
Co-authored-by: YeAnbang <anbangy2@outlook.com>
Co-authored-by: Orion-Zheng <zhengzian@u.nus.edu>
Co-authored-by: Pengtai Xu <henryxu880@gmail.com>
Co-authored-by: eric8607242 <e0928021388@gmail.com>
Co-authored-by: binmakeswell <binmakeswell@gmail.com>
Co-authored-by: Frank Lee <somerlee.9@gmail.com>
Co-authored-by: Michelle <97082656+MichelleMa8@users.noreply.github.com>
Co-authored-by: Camille Zhong <44392324+Camille7777@users.noreply.github.com>
Co-authored-by: BlueRum <70618399+ht-zhou@users.noreply.github.com>
Co-authored-by: Tong Li <tong.li352711588@gmail.com>
Co-authored-by: JIMMY ZHAO <knightyzhao@gmail.com>
Co-authored-by: Xuanlei Zhao <xuanlei.zhao@gmail.com>
Co-authored-by: Desperado-Jia <502205863@qq.com>
Co-authored-by: 李文军 <40464906+liwenjuna@users.noreply.github.com>
Co-authored-by: yixiaoer <miyaku@yixiaoer.sg>
Co-authored-by: CZYCW <czyczf@163.com>
Co-authored-by: Stephan Kölker <stephankoe@users.noreply.github.com>
Co-authored-by: QinLuo <eric.x.sun@gmail.com>
Co-authored-by: MickeyCHAN <76671016+danyow-cheung@users.noreply.github.com>
Co-authored-by: Luo Yihang <luo_yihang@outlook.com>
Co-authored-by: Dongruixuan Li <dongruixuan@hotmail.com>
Co-authored-by: hugo-syn <61210734+hugo-syn@users.noreply.github.com>
Co-authored-by: Youngon <Youngon_wyl@163.com>
Co-authored-by: Yuanheng Zhao <54058983+yuanheng-zhao@users.noreply.github.com>
Co-authored-by: Rocky Duan <dementrock@users.noreply.github.com>
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: Edenzzzz <wenxuan.tan@wisc.edu>
Co-authored-by: Insu Jang <insujang@umich.edu>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2024-04-23 09:57:44 +00:00
# Move all tensors in the state_dict to CPU before saving to avoid serialization issues
state_dict_cpu = tree_map ( lambda x : x . cpu ( ) if torch . is_tensor ( x ) else x , state_dict )
2023-08-31 06:50:47 +00:00
if use_safetensors :
assert is_safetensors_available ( ) , " safetensors is not available. "
2023-09-19 06:20:26 +00:00
assert checkpoint_file_path . endswith (
" .safetensors "
) , " safetensors only supports .safetensors suffix for checkpoint file. "
2023-08-31 06:50:47 +00:00
from safetensors . torch import save_file as safe_save_file
2023-09-19 06:20:26 +00:00
[Feature] LoRA rebased to main branch (#5622)
* [Inference]ADD Bench Chatglm2 script (#4963)
* add bench chatglm
* fix bug and make utils
---------
Co-authored-by: CjhHa1 <cjh18671720497outlook.com>
* [Pipeline inference] Combine kvcache with pipeline inference (#4938)
* merge kvcache with pipeline inference and refactor the code structure
* support ppsize > 2
* refactor pipeline code
* do pre-commit
* modify benchmark
* fix bench mark
* polish code
* add docstring and update readme
* refactor the code
* fix some logic bug of ppinfer
* polish readme
* fix typo
* skip infer test
* updated c++17 compiler flags (#4983)
* [Inference] Dynamic Batching Inference, online and offline (#4953)
* [inference] Dynamic Batching for Single and Multiple GPUs (#4831)
* finish batch manager
* 1
* first
* fix
* fix dynamic batching
* llama infer
* finish test
* support different lengths generating
* del prints
* del prints
* fix
* fix bug
---------
Co-authored-by: CjhHa1 <cjh18671720497outlook.com>
* [inference] Async dynamic batching (#4894)
* finish input and output logic
* add generate
* test forward
* 1
* [inference]Re push async dynamic batching (#4901)
* adapt to ray server
* finish async
* finish test
* del test
---------
Co-authored-by: yuehuayingxueluo <867460659@qq.com>
* Revert "[inference]Re push async dynamic batching (#4901)" (#4905)
This reverts commit fbf3c09e673794ed18c91d4bab1a7dfea052e95a.
* Revert "[inference] Async dynamic batching (#4894)"
This reverts commit fced14025043e29ce816b315f440601188f7f79f.
* Revert "[inference] Async dynamic batching (#4894)" (#4909)
This reverts commit fced14025043e29ce816b315f440601188f7f79f.
* Add Ray Distributed Environment Init Scripts
* support DynamicBatchManager base function
* revert _set_tokenizer version
* add driver async generate
* add async test
* fix bugs in test_ray_dist.py
* add get_tokenizer.py
* fix code style
* fix bugs about No module named 'pydantic' in ci test
* fix bugs in ci test
* fix bugs in ci test
* fix bugs in ci test
* [infer]Add Ray Distributed Environment Init Scripts (#4911)
* Revert "[inference] Async dynamic batching (#4894)"
This reverts commit fced14025043e29ce816b315f440601188f7f79f.
* Add Ray Distributed Environment Init Scripts
* support DynamicBatchManager base function
* revert _set_tokenizer version
* add driver async generate
* add async test
* fix bugs in test_ray_dist.py
* add get_tokenizer.py
* fix code style
* fix bugs about No module named 'pydantic' in ci test
* fix bugs in ci test
* fix bugs in ci test
* fix bugs in ci test
* support dynamic batch for bloom model and is_running function
* [Inference]Test for new Async engine (#4935)
* infer engine
* infer engine
* test engine
* test engine
* new manager
* change step
* add
* test
* fix
* fix
* finish test
* finish test
* finish test
* finish test
* add license
---------
Co-authored-by: yuehuayingxueluo <867460659@qq.com>
* add assertion for config (#4947)
* [Inference] Finish dynamic batching offline test (#4948)
* test
* fix test
* fix quant
* add default
* fix
* fix some bugs
* fix some bugs
* fix
* fix bug
* fix bugs
* reset param
---------
Co-authored-by: yuehuayingxueluo <867460659@qq.com>
Co-authored-by: Cuiqing Li <lixx3527@gmail.com>
Co-authored-by: CjhHa1 <cjh18671720497outlook.com>
* [Kernels]Updated Triton kernels into 2.1.0 and adding flash-decoding for llama token attention (#4965)
* adding flash-decoding
* clean
* adding kernel
* adding flash-decoding
* add integration
* add
* adding kernel
* adding kernel
* adding triton 2.1.0 features for inference
* update bloom triton kernel
* remove useless vllm kernels
* clean codes
* fix
* adding files
* fix readme
* update llama flash-decoding
---------
Co-authored-by: cuiqing.li <lixx336@gmail.com>
* fix ColossalEval (#4992)
Co-authored-by: Xu Yuanchen <yuanchen.xu00@gmail.com>
* [doc]Update doc for colossal-inference (#4989)
* update doc
* Update README.md
---------
Co-authored-by: cuiqing.li <lixx336@gmail.com>
* [hotfix] Fix the bug where process groups were not being properly released. (#4940)
* Fix the bug where process groups were not being properly released.
* test
* Revert "test"
This reverts commit 479900c1398637310abf92eefa3cd168038ea02f.
* [hotfix] fix the bug of repeatedly storing param group (#4951)
* [doc] add supported feature diagram for hybrid parallel plugin (#4996)
* [Pipeline Inference] Merge pp with tp (#4993)
* refactor pipeline into new CaiInferEngine
* updata llama modeling forward
* merge tp with pp
* update docstring
* optimize test workflow and example
* fix typo
* add assert and todo
* [release] update version (#4995)
* [release] update version
* [hotfix] fix ci
* [moe] merge moe into main (#4978)
* update moe module
* support openmoe
* [hotfix] fix grad accumulation plus clipping for gemini (#5002)
* [hotfix] Add layer norm gradients all-reduce for sequence parallel (#4926)
* [hotfix] Add layer norm gradients all-reduce for sequence parallel. (#4915)
* Add layer norm gradients all-reduce for sequence parallel.
* skip pipeline inference test
* [hotfix] fixing polices of sequence parallel (#4922)
* Add layer norm gradients all-reduce for sequence parallel.
* fix parameter passing when calling get_autopolicy
---------
Co-authored-by: littsk <1214689160@qq.com>
* Hotfix/add grad all reduce for sequence parallel (#4927)
* Add layer norm gradients all-reduce for sequence parallel.
* fix parameter passing when calling get_autopolicy
* fix bug using wrong variables
---------
Co-authored-by: littsk <1214689160@qq.com>
* fix policy initialization
* fix bloom and chatglm policices
* polish code of handling layernorm
* fix moe module
* polish code of class initializing
---------
Co-authored-by: Zhongkai Zhao <kanezz620@gmail.com>
* [format] applied code formatting on changed files in pull request 4926 (#5007)
Co-authored-by: github-actions <github-actions@github.com>
* [Inference] Fix bug in ChatGLM2 Tensor Parallelism (#5014)
* fix bug
* fix
* fix multiquery
* fix multiquery
---------
Co-authored-by: CjhHa1 <cjh18671720497outlook.com>
* [misc] add code owners (#5024)
* [moe] support optimizer checkpoint (#5015)
* Refactor MoE Manager setup method
* unshard optim ckpt
* optim io
* update transformer version
* update requirements
* update ckpt
* update ckpt
* update ckpt
* fix engine
* fix engine
* Support mtbench (#5025)
Co-authored-by: Xu Yuanchen <yuanchen.xu00@gmail.com>
* [moe]: fix ep/tp tests, add hierarchical all2all (#4982)
* fix: add warning for EP different behavior
* fix: use shard_data in ep & tp model
* to: add used_capacity
* fix: fix router test
* feat: add create_ep_node_group
* feat: add create_ep_hierarchical_group fn
* feat: add HierarchicalAllToAll
* test: add hierarchical all2all test
* fix: fix test errors
* fix: simplify create_ep_hierarchical_group
* fix: add hierarchical_alltoall arg
* fix: fix environ typo
* revert: revert process mesh order
* to: add todo mark
* fix: skip hierarchical_comm if torch < 1.13.1
* [shardformer] Fix serialization error with Tensor Parallel state saving (#5018)
* Fix serialization error with Tensor Parallel state saving
* Refactor state_dict CPU transfer using tree_map
* [gemini] gemini support tensor parallelism. (#4942)
* [colossalai]fix typo
* [inference] Add smmoothquant for llama (#4904)
* [inference] add int8 rotary embedding kernel for smoothquant (#4843)
* [inference] add smoothquant llama attention (#4850)
* add smoothquant llama attention
* remove uselss code
* remove useless code
* fix import error
* rename file name
* [inference] add silu linear fusion for smoothquant llama mlp (#4853)
* add silu linear
* update skip condition
* catch smoothquant cuda lib exception
* prcocess exception for tests
* [inference] add llama mlp for smoothquant (#4854)
* add llama mlp for smoothquant
* fix down out scale
* remove duplicate lines
* add llama mlp check
* delete useless code
* [inference] add smoothquant llama (#4861)
* add smoothquant llama
* fix attention accuracy
* fix accuracy
* add kv cache and save pretrained
* refactor example
* delete smooth
* refactor code
* [inference] add smooth function and delete useless code for smoothquant (#4895)
* add smooth function and delete useless code
* update datasets
* remove duplicate import
* delete useless file
* refactor codes (#4902)
* rafactor code
* add license
* add torch-int and smoothquant license
* Update flash_attention_patch.py
To be compatible with the new change in the Transformers library, where a new argument 'padding_mask' was added to forward function of attention layer.
https://github.com/huggingface/transformers/pull/25598
* [kernel] support pure fp16 for cpu adam and update gemini optim tests (#4921)
* [kernel] support pure fp16 for cpu adam (#4896)
* [kernel] fix cpu adam kernel for pure fp16 and update tests (#4919)
* [kernel] fix cpu adam
* [test] update gemini optim test
* [format] applied code formatting on changed files in pull request 4908 (#4918)
Co-authored-by: github-actions <github-actions@github.com>
* [gemini] support gradient accumulation (#4869)
* add test
* fix no_sync bug in low level zero plugin
* fix test
* add argument for grad accum
* add grad accum in backward hook for gemini
* finish implementation, rewrite tests
* fix test
* skip stuck model in low level zero test
* update doc
* optimize communication & fix gradient checkpoint
* modify doc
* cleaning codes
* update cpu adam fp16 case
* [hotfix] fix torch 2.0 compatibility (#4936)
* [hotfix] fix launch
* [test] fix test gemini optim
* [shardformer] fix vit
* [test] add no master test for low level zero plugin (#4934)
* [format] applied code formatting on changed files in pull request 4820 (#4886)
Co-authored-by: github-actions <github-actions@github.com>
* [nfc] fix some typo with colossalai/ docs/ etc. (#4920)
* [Refactor] Integrated some lightllm kernels into token-attention (#4946)
* add some req for inference
* clean codes
* add codes
* add some lightllm deps
* clean codes
* hello
* delete rms files
* add some comments
* add comments
* add doc
* add lightllm deps
* add lightllm cahtglm2 kernels
* add lightllm cahtglm2 kernels
* replace rotary embedding with lightllm kernel
* add some commnets
* add some comments
* add some comments
* add
* replace fwd kernel att1
* fix a arg
* add
* add
* fix token attention
* add some comments
* clean codes
* modify comments
* fix readme
* fix bug
* fix bug
---------
Co-authored-by: cuiqing.li <lixx336@gmail.com>
Co-authored-by: CjhHa1 <cjh18671720497@outlook.com>
* [test] merge old components to test to model zoo (#4945)
* [test] add custom models in model zoo
* [test] update legacy test
* [test] update model zoo
* [test] update gemini test
* [test] remove components to test
* [inference] add reference and fix some bugs (#4937)
* add reference and fix some bugs
* update gptq init
---------
Co-authored-by: Xu Kai <xukai16@foxamil.com>
* [Inference]ADD Bench Chatglm2 script (#4963)
* add bench chatglm
* fix bug and make utils
---------
Co-authored-by: CjhHa1 <cjh18671720497outlook.com>
* [Pipeline inference] Combine kvcache with pipeline inference (#4938)
* merge kvcache with pipeline inference and refactor the code structure
* support ppsize > 2
* refactor pipeline code
* do pre-commit
* modify benchmark
* fix bench mark
* polish code
* add docstring and update readme
* refactor the code
* fix some logic bug of ppinfer
* polish readme
* fix typo
* skip infer test
* updated c++17 compiler flags (#4983)
* [Inference] Dynamic Batching Inference, online and offline (#4953)
* [inference] Dynamic Batching for Single and Multiple GPUs (#4831)
* finish batch manager
* 1
* first
* fix
* fix dynamic batching
* llama infer
* finish test
* support different lengths generating
* del prints
* del prints
* fix
* fix bug
---------
Co-authored-by: CjhHa1 <cjh18671720497outlook.com>
* [inference] Async dynamic batching (#4894)
* finish input and output logic
* add generate
* test forward
* 1
* [inference]Re push async dynamic batching (#4901)
* adapt to ray server
* finish async
* finish test
* del test
---------
Co-authored-by: yuehuayingxueluo <867460659@qq.com>
* Revert "[inference]Re push async dynamic batching (#4901)" (#4905)
This reverts commit fbf3c09e673794ed18c91d4bab1a7dfea052e95a.
* Revert "[inference] Async dynamic batching (#4894)"
This reverts commit fced14025043e29ce816b315f440601188f7f79f.
* Revert "[inference] Async dynamic batching (#4894)" (#4909)
This reverts commit fced14025043e29ce816b315f440601188f7f79f.
* Add Ray Distributed Environment Init Scripts
* support DynamicBatchManager base function
* revert _set_tokenizer version
* add driver async generate
* add async test
* fix bugs in test_ray_dist.py
* add get_tokenizer.py
* fix code style
* fix bugs about No module named 'pydantic' in ci test
* fix bugs in ci test
* fix bugs in ci test
* fix bugs in ci test
* [infer]Add Ray Distributed Environment Init Scripts (#4911)
* Revert "[inference] Async dynamic batching (#4894)"
This reverts commit fced14025043e29ce816b315f440601188f7f79f.
* Add Ray Distributed Environment Init Scripts
* support DynamicBatchManager base function
* revert _set_tokenizer version
* add driver async generate
* add async test
* fix bugs in test_ray_dist.py
* add get_tokenizer.py
* fix code style
* fix bugs about No module named 'pydantic' in ci test
* fix bugs in ci test
* fix bugs in ci test
* fix bugs in ci test
* support dynamic batch for bloom model and is_running function
* [Inference]Test for new Async engine (#4935)
* infer engine
* infer engine
* test engine
* test engine
* new manager
* change step
* add
* test
* fix
* fix
* finish test
* finish test
* finish test
* finish test
* add license
---------
Co-authored-by: yuehuayingxueluo <867460659@qq.com>
* add assertion for config (#4947)
* [Inference] Finish dynamic batching offline test (#4948)
* test
* fix test
* fix quant
* add default
* fix
* fix some bugs
* fix some bugs
* fix
* fix bug
* fix bugs
* reset param
---------
Co-authored-by: yuehuayingxueluo <867460659@qq.com>
Co-authored-by: Cuiqing Li <lixx3527@gmail.com>
Co-authored-by: CjhHa1 <cjh18671720497outlook.com>
* [Kernels]Updated Triton kernels into 2.1.0 and adding flash-decoding for llama token attention (#4965)
* adding flash-decoding
* clean
* adding kernel
* adding flash-decoding
* add integration
* add
* adding kernel
* adding kernel
* adding triton 2.1.0 features for inference
* update bloom triton kernel
* remove useless vllm kernels
* clean codes
* fix
* adding files
* fix readme
* update llama flash-decoding
---------
Co-authored-by: cuiqing.li <lixx336@gmail.com>
* fix ColossalEval (#4992)
Co-authored-by: Xu Yuanchen <yuanchen.xu00@gmail.com>
* [doc]Update doc for colossal-inference (#4989)
* update doc
* Update README.md
---------
Co-authored-by: cuiqing.li <lixx336@gmail.com>
* [hotfix] Fix the bug where process groups were not being properly released. (#4940)
* Fix the bug where process groups were not being properly released.
* test
* Revert "test"
This reverts commit 479900c1398637310abf92eefa3cd168038ea02f.
* [hotfix] fix the bug of repeatedly storing param group (#4951)
* [doc] add supported feature diagram for hybrid parallel plugin (#4996)
* [Pipeline Inference] Merge pp with tp (#4993)
* refactor pipeline into new CaiInferEngine
* updata llama modeling forward
* merge tp with pp
* update docstring
* optimize test workflow and example
* fix typo
* add assert and todo
* [release] update version (#4995)
* [release] update version
* [hotfix] fix ci
* [gemini] gemini support tp
[gemini] gemini support tp
[gemini] gemini support tp
[gemini] gemini support tp
[gemini] gemini support tp
* fix
fix
fix
* update checkpointIO
update checkpointIO
update checkpointIO
update checkpointIO
update checkpointIO
update checkpointIO
update checkpointIO
update checkpointIO
update checkpointIO
* support fused layernorm
support fused layernorm
support fused layernorm
* update fusedlayernorm
update fusedlayernorm
update fusedlayernorm
* add sequence parallel to gemini
add sequence parallel to gemini
* fix
* fix comments
fix comments
fix comments
* fix
* fix t5
* clear cache
* fix
* activate ci
* activate ci
* fix
* fix
* fix
* fix
* revert
* modify tp gather method
modify tp gather method
modify tp gather method
modify tp gather method
* fix test
---------
Co-authored-by: Xu Kai <xukai16@foxmail.com>
Co-authored-by: Zian(Andy) Zheng <62330719+Orion-Zheng@users.noreply.github.com>
Co-authored-by: Hongxin Liu <lhx0217@gmail.com>
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: github-actions <github-actions@github.com>
Co-authored-by: Baizhou Zhang <eddiezhang@pku.edu.cn>
Co-authored-by: Zhongkai Zhao <kanezz620@gmail.com>
Co-authored-by: digger yu <digger-yu@outlook.com>
Co-authored-by: Cuiqing Li <lixx3527@gmail.com>
Co-authored-by: cuiqing.li <lixx336@gmail.com>
Co-authored-by: CjhHa1 <cjh18671720497@outlook.com>
Co-authored-by: Xu Kai <xukai16@foxamil.com>
Co-authored-by: Jianghai <72591262+CjhHa1@users.noreply.github.com>
Co-authored-by: Bin Jia <45593998+FoolPlayer@users.noreply.github.com>
Co-authored-by: アマデウス <kurisusnowdeng@users.noreply.github.com>
Co-authored-by: yuehuayingxueluo <867460659@qq.com>
Co-authored-by: Yuanchen <70520919+chengeharrison@users.noreply.github.com>
Co-authored-by: Xu Yuanchen <yuanchen.xu00@gmail.com>
Co-authored-by: littsk <1214689160@qq.com>
Co-authored-by: ppt0011 <143150326+ppt0011@users.noreply.github.com>
* [hotfix] Suport extra_kwargs in ShardConfig (#5031)
* [refactor]: replace inference args with extra_kwargs in ShardConfig
* modify shardconfig
* polish code
* fix policy bug in llama
* fix bug in auto policy
* remove setattr in ShardConfig
* fix wrong EOS token in ColossalChat
* [Kernels]Update triton kernels into 2.1.0 (#5046)
* update flash-context-attention
* adding kernels
* fix
* reset
* add build script
* add building process
* add llama2 exmaple
* add colossal-llama2 test
* clean
* fall back test setting
* fix test file
* clean
* clean
* clean
---------
Co-authored-by: cuiqing.li <lixx336@gmail.com>
* [pipeline,shardformer] Fix p2p efficiency in pipeline, allow skipping loading weight not in weight_map when `strict=False`, fix llama flash attention forward, add flop estimation by megatron in llama benchmark (#5017)
* Use p2p
* Cannot bidirectonal send p2p
* Refactor tensor creation and serialization in P2P
communication
* Fix llama forward args in flash attention
* Add flop estimate from megatron
* Support loading weight not in weight_map when strict=False in hybrid_parallel
* Use send_forward_recv_backward, etc in 1f1b
* Use dataclass for metdata
Remove torch.cuda.synchronize() as suggested
* Add comment about the torch.cuda.synchronize for potential error
* Typo
* Update hybrid_parallel_checkpoint_io.py
* Update p2p.py
* Update one_f_one_b.py
* Update p2p.py
---------
Co-authored-by: flybird11111 <1829166702@qq.com>
* [gemini] gemini support extra-dp (#5043)
* support ddp
* fix
* fix
* fix
fix
* support ddp
* fix
* fix
* fix
fix
* simplify tests
* fix
* fix
* fix
fix
fix
* fix
* [shardformer] fix llama error when transformers upgraded. (#5055)
* fix-llama
* Update llama.py
* [hotfix]: modify create_ep_hierarchical_group and add test (#5032)
* feat: modify create_ep_hierarchical_group args
* test: add ep tests
* fix: remove get_process_group_ranks
* fix: fix src_rank
* [exampe] fix llama example' loss error when using gemini plugin (#5060)
fix llama example
* [inference] Refactor inference architecture (#5057)
* [inference] support only TP (#4998)
* support only tp
* enable tp
* add support for bloom (#5008)
* [refactor] refactor gptq and smoothquant llama (#5012)
* refactor gptq and smoothquant llama
* fix import error
* fix linear import torch-int
* fix smoothquant llama import error
* fix import accelerate error
* fix bug
* fix import smooth cuda
* fix smoothcuda
* [Inference Refactor] Merge chatglm2 with pp and tp (#5023)
merge chatglm with pp and tp
* [Refactor] remove useless inference code (#5022)
* remove useless code
* fix quant model
* fix test import bug
* mv original inference legacy
* fix chatglm2
* [Refactor] refactor policy search and quant type controlling in inference (#5035)
* [Refactor] refactor policy search and quant type controling in inference
* [inference] update readme (#5051)
* update readme
* update readme
* fix architecture
* fix table
* fix table
* [inference] udpate example (#5053)
* udpate example
* fix run.sh
* fix rebase bug
* fix some errors
* update readme
* add some features
* update interface
* update readme
* update benchmark
* add requirements-infer
---------
Co-authored-by: Bin Jia <45593998+FoolPlayer@users.noreply.github.com>
Co-authored-by: Zhongkai Zhao <kanezz620@gmail.com>
* [Kernels]added flash-decoidng of triton (#5063)
* added flash-decoidng of triton based on lightllm kernel
* add req
* clean
* clean
* delete build.sh
---------
Co-authored-by: cuiqing.li <lixx336@gmail.com>
* [misc] remove outdated submodule (#5070)
* [npu] add npu support for gemini and zero (#5067)
* [npu] setup device utils (#5047)
* [npu] add npu device support
* [npu] support low level zero
* [test] update npu zero plugin test
* [hotfix] fix import
* [test] recover tests
* [npu] gemini support npu (#5052)
* [npu] refactor device utils
* [gemini] support npu
* [example] llama2+gemini support npu
* [kernel] add arm cpu adam kernel (#5065)
* [kernel] add arm cpu adam
* [optim] update adam optimizer
* [kernel] arm cpu adam remove bf16 support
* [hotfix/hybridengine] fix bug when tp*pp size = 1 (#5069)
* [inference] update examples and engine (#5073)
* update examples and engine
* fix choices
* update example
* [format] applied code formatting on changed files in pull request 5067 (#5072)
Co-authored-by: github-actions <github-actions@github.com>
* [hotfix/hybridengine] Fix init model with random parameters in benchmark (#5074)
* fix init model with random parameters
* fix example
* [inference] refactor examples and fix schedule (#5077)
* [setup] refactor infer setup
* [hotfix] fix infenrece behavior on 1 1 gpu
* [exmaple] refactor inference examples
* fix thrust-transform-reduce error (#5078)
* [nfc] fix typo in docs/ (#4972)
* [nfc] fix typo and author name (#5089)
* [gemini]fix gemini optimzer, saving Shardformer in Gemini got list assignment index out of range (#5085)
* [Hotfix] Fix model policy matching strategy in ShardFormer (#5064)
* hotfix/Fix get model policy strategy in ShardFormer
* fix bug in auto policy
* [shardformer]fix flash attention, when mask is casual, just don't unpad it (#5084)
* fix flash attn
* fix
fix
* [npu] add npu support for hybrid plugin and llama (#5090)
* llama 3d
* update
* fix autocast
* [Feature] Add document retrieval QA (#5020)
* add langchain
* add langchain
* Add files via upload
* add langchain
* fix style
* fix style: remove extra space
* add pytest; modified retriever
* add pytest; modified retriever
* add tests to build_on_pr.yml
* fix build_on_pr.yml
* fix build on pr; fix environ vars
* seperate unit tests for colossalqa from build from pr
* fix container setting; fix environ vars
* commented dev code
* add incremental update
* remove stale code
* fix style
* change to sha3 224
* fix retriever; fix style; add unit test for document loader
* fix ci workflow config
* fix ci workflow config
* add set cuda visible device script in ci
* fix doc string
* fix style; update readme; refactored
* add force log info
* change build on pr, ignore colossalqa
* fix docstring, captitalize all initial letters
* fix indexing; fix text-splitter
* remove debug code, update reference
* reset previous commit
* update LICENSE update README add key-value mode, fix bugs
* add files back
* revert force push
* remove junk file
* add test files
* fix retriever bug, add intent classification
* change conversation chain design
* rewrite prompt and conversation chain
* add ui v1
* ui v1
* fix atavar
* add header
* Refactor the RAG Code and support Pangu
* Refactor the ColossalQA chain to Object-Oriented Programming and the UI demo.
* resolved conversation. tested scripts under examples. web demo still buggy
* fix ci tests
* Some modifications to add ChatGPT api
* modify llm.py and remove unnecessary files
* Delete applications/ColossalQA/examples/ui/test_frontend_input.json
* Remove OpenAI api key
* add colossalqa
* move files
* move files
* move files
* move files
* fix style
* Add Readme and fix some bugs.
* Add something to readme and modify some code
* modify a directory name for clarity
* remove redundant directory
* Correct a type in llm.py
* fix AI prefix
* fix test_memory.py
* fix conversation
* fix some erros and typos
* Fix a missing import in RAG_ChatBot.py
* add colossalcloud LLM wrapper, correct issues in code review
---------
Co-authored-by: YeAnbang <anbangy2@outlook.com>
Co-authored-by: Orion-Zheng <zheng_zian@u.nus.edu>
Co-authored-by: Zian(Andy) Zheng <62330719+Orion-Zheng@users.noreply.github.com>
Co-authored-by: Orion-Zheng <zhengzian@u.nus.edu>
* remove duplicate import (#5100)
* fix typo change lazy_iniy to lazy_init (#5099)
* [nfc] fix typo change directoty to directory (#5111)
* [FEATURE] Add Safety Eval Datasets to ColossalEval (#5095)
* add safetybench and cvalues(responsibility) eval dataset
* Modify code according to review suggestions
---------
Co-authored-by: Orion-Zheng <zhengzian@u.nus.edu>
* [hotfix] fixed memory usage of shardformer module replacement (#5122)
* [shardformer]: support gpt-j, falcon, Mistral and add interleaved pipeline for bert (#5088)
* [shardformer] implement policy for all GPT-J models and test
* [shardformer] support interleaved pipeline parallel for bert finetune
* [shardformer] shardformer support falcon (#4883)
* [shardformer]: fix interleaved pipeline for bert model (#5048)
* [hotfix]: disable seq parallel for gptj and falcon, and polish code (#5093)
* Add Mistral support for Shardformer (#5103)
* [shardformer] add tests to mistral (#5105)
---------
Co-authored-by: Pengtai Xu <henryxu880@gmail.com>
Co-authored-by: ppt0011 <143150326+ppt0011@users.noreply.github.com>
Co-authored-by: flybird11111 <1829166702@qq.com>
Co-authored-by: eric8607242 <e0928021388@gmail.com>
* [doc] add moe news (#5128)
* [doc] add moe news
* [doc] add moe news
* [doc] add moe news
* [doc] updated paper citation (#5131)
* fix typo change JOSNL TO JSONL etc. (#5116)
* [format] applied code formatting on changed files in pull request 5088 (#5127)
Co-authored-by: github-actions <github-actions@github.com>
* [format] applied code formatting on changed files in pull request 5124 (#5125)
Co-authored-by: github-actions <github-actions@github.com>
* [format] applied code formatting on changed files in pull request 5115 (#5118)
Co-authored-by: github-actions <github-actions@github.com>
* [accelerator] init the accelerator module (#5129)
* [accelerator] init the accelerator module
* polish code
* polish code
* polish code
* polish code
* [npu] support triangle attention for llama (#5130)
* update fused attn
* update spda
* tri attn
* update triangle
* import
* fix
* fix
* [plugin]fix 3d checkpoint load when booster boost without optimizer. (#5135)
* fix 3d checkpoint load when booster boost without optimizer
fix 3d checkpoint load when booster boost without optimizer
* test ci
* revert ci
* fix
fix
* [ColossalQA] refactor server and webui & add new feature (#5138)
* refactor server and webui & add new feature
* add requirements
* modify readme and ui
* [doc] fix colossalqa document (#5146)
* fix doc
* modify doc
* fix (#5158)
fix
* [Colossal-Llama-2] Add finetuning Colossal-Llama-2 example (#4878)
* Add finetuning Colossal-Llama-2 example
* Add finetuning Colossal-Llama-2 example 2
* Add finetuning Colossal-Llama-2 example and support NEFTuning
* Add inference example and refine neftune
* Modify readme file
* update the imports
---------
Co-authored-by: Xu Yuanchen <yuanchen.xu00@gmail.com>
Co-authored-by: Camille Zhong <44392324+Camille7777@users.noreply.github.com>
* [gemini] hotfix NaN loss while using Gemini + tensor_parallel (#5150)
* fix
aaa
fix
fix
fix
* fix
* fix
* test ci
* fix ci
fix
* [colossalqa] fix pangu api (#5170)
* fix pangu api
* add comment
* [ColossalEval] Support GSM, Data Leakage Evaluation and Tensor Parallel (#5169)
* Support GSM, Data Leakage Evaluation and Tensor Parallel
* remove redundant code and update inference.py in examples/gpt_evaluation
---------
Co-authored-by: Xu Yuanchen <yuanchen.xu00@gmail.com>
* [shardformer] llama support DistCrossEntropy (#5176)
* fix
aaa
fix
fix
fix
* fix
* fix
* test ci
* fix ci
fix
* llama support dist-cross
fix
fix
fix
fix
fix
fix
fix
fix
* fix
* fix
* fix
fix
* test ci
* test ci
* fix
* [Colossal-Llama-2] Add finetuning Colossal-Llama-2 example (#4878)
* Add finetuning Colossal-Llama-2 example
* Add finetuning Colossal-Llama-2 example 2
* Add finetuning Colossal-Llama-2 example and support NEFTuning
* Add inference example and refine neftune
* Modify readme file
* update the imports
---------
Co-authored-by: Xu Yuanchen <yuanchen.xu00@gmail.com>
Co-authored-by: Camille Zhong <44392324+Camille7777@users.noreply.github.com>
* llama support dist-cross
fix
fix
fix
fix
fix
fix
fix
fix
* fix
* fix
* fix
fix
* test ci
* test ci
* fix
* fix ci
* fix ci
---------
Co-authored-by: Yuanchen <70520919+chengeharrison@users.noreply.github.com>
Co-authored-by: Xu Yuanchen <yuanchen.xu00@gmail.com>
Co-authored-by: Camille Zhong <44392324+Camille7777@users.noreply.github.com>
* Fix ColossalEval (#5186)
Co-authored-by: Xu Yuanchen <yuanchen.xu00@gmail.com>
* [doc] update pytorch version in documents. (#5177)
* fix
aaa
fix
fix
fix
* fix
* fix
* test ci
* fix ci
fix
* update pytorch version in documents
* polish readme in application/chat (#5194)
* [pipeline]: fix p2p comm, add metadata cache and support llama interleaved pp (#5134)
* test: add more p2p tests
* fix: remove send_forward_recv_forward as p2p op list need to use the same group
* fix: make send and receive atomic
* feat: update P2PComm fn
* feat: add metadata cache in 1f1b
* feat: add metadata cache in interleaved pp
* feat: modify is_xx_stage fn
* revert: add _broadcast_object_list
* feat: add interleaved pp in llama policy
* feat: set NCCL_BUFFSIZE in HybridParallelPlugin
* Improve logic for selecting metrics (#5196)
Co-authored-by: Xu <yuanchen.xu00@gmail.com>
* [doc] Update required third-party library list for testing and torch comptibility checking (#5207)
* doc/update requirements-test.txt
* update torch-cuda compatibility check
* support linear accumulation fusion (#5199)
support linear accumulation fusion
support linear accumulation fusion
fix
* [pipeline]: support arbitrary batch size in forward_only mode (#5201)
* fix: remove drop last in val & test dataloader
* feat: add run_forward_only, support arbitrary bs
* chore: modify ci script
* [pipeline]: add p2p fallback order and fix interleaved pp deadlock (#5214)
* fix: add fallback order option and update 1f1b
* fix: fix deadlock comm in interleaved pp
* test: modify p2p test
* [devops] update torch versoin in ci (#5217)
* fix-test (#5210)
fix-test
fix-test
* fix flash attn (#5209)
* [nfc] fix typo colossalai/shardformer/ (#5133)
* [Colossal-LLaMA-2] Release Colossal-LLaMA-2-13b-base model (#5224)
* update readme
* update readme
* update link
* update
* update readme
* update
* update
* update
* update title
* update example
* update example
* fix content
* add conclusion
* add license
* update
* update
* update version
* fix minor
* [doc] Update README.md of Colossal-LLAMA2 (#5233)
* Update README.md
* Update README.md
* [doc] Make leaderboard format more uniform and good-looking (#5231)
* Make leaderboard format more unifeid and good-looking
* Update README.md
* Update README.md
* [doc] add Colossal-LLaMA-2-13B (#5234)
* [doc] add Colossal-LLaMA-2-13B
* [doc] add Colossal-LLaMA-2-13B
* [doc] add Colossal-LLaMA-2-13B
* [format] applied code formatting on changed files in pull request 5234 (#5235)
Co-authored-by: github-actions <github-actions@github.com>
* [doc] SwiftInfer release (#5236)
* [doc] SwiftInfer release
* [doc] SwiftInfer release
* [doc] SwiftInfer release
* [doc] SwiftInfer release
* [doc] SwiftInfer release
* [npu] use extension for op builder (#5172)
* update extension
* update cpu adam
* update is
* add doc for cpu adam
* update kernel
* update commit
* update flash
* update memory efficient
* update flash attn
* update flash attention loader
* update api
* fix
* update doc
* update example time limit
* reverse change
* fix doc
* remove useless kernel
* fix
* not use warning
* update
* update
* [pipeline] A more general _communicate in p2p (#5062)
* A more general _communicate
* feat: finish tree_flatten version p2p
* fix: update p2p api calls
---------
Co-authored-by: Wenhao Chen <cwher@outlook.com>
* [npu] change device to accelerator api (#5239)
* update accelerator
* fix timer
* fix amp
* update
* fix
* update bug
* add error raise
* fix autocast
* fix set device
* remove doc accelerator
* update doc
* update doc
* update doc
* use nullcontext
* update cpu
* update null context
* change time limit for example
* udpate
* update
* update
* update
* [npu] polish accelerator code
---------
Co-authored-by: Xuanlei Zhao <xuanlei.zhao@gmail.com>
Co-authored-by: zxl <43881818+oahzxl@users.noreply.github.com>
* [hotfix] removed unused flag (#5242)
* [doc] fix typo in Colossal-LLaMA-2/README.md (#5247)
* [workflow] fixed build CI (#5240)
* [workflow] fixed build CI
* polish
* polish
* polish
* polish
* polish
* [ci] fixed booster test (#5251)
* [ci] fixed booster test
* [ci] fixed booster test
* [ci] fixed booster test
* [ci] fixed ddp test (#5254)
* [ci] fixed ddp test
* polish
* fix typo in applications/ColossalEval/README.md (#5250)
* [ci] fix shardformer tests. (#5255)
* fix ci
fix
* revert: revert p2p
* feat: add enable_metadata_cache option
* revert: enable t5 tests
---------
Co-authored-by: Wenhao Chen <cwher@outlook.com>
* [doc] fix doc typo (#5256)
* [doc] fix annotation display
* [doc] fix llama2 doc
* [hotfix]: add pp sanity check and fix mbs arg (#5268)
* fix: fix misleading mbs arg
* feat: add pp sanity check
* fix: fix 1f1b sanity check
* [workflow] fixed incomplete bash command (#5272)
* [workflow] fixed oom tests (#5275)
* [workflow] fixed oom tests
* polish
* polish
* polish
* [ci] fix test_hybrid_parallel_plugin_checkpoint_io.py (#5276)
* fix ci
fix
* fix test
* revert: revert p2p
* feat: add enable_metadata_cache option
* revert: enable t5 tests
* fix
---------
Co-authored-by: Wenhao Chen <cwher@outlook.com>
* [shardformer] hybridparallelplugin support gradients accumulation. (#5246)
* support gradients acc
fix
fix
fix
fix
fix
fix
fix
fix
fix
fix
fix
fix
fix
* fix
fix
* fix
fix
fix
* [hotfix] Fix ShardFormer test execution path when using sequence parallelism (#5230)
* fix auto loading gpt2 tokenizer (#5279)
* [doc] add llama2-13B disyplay (#5285)
* Update README.md
* fix 13b typo
---------
Co-authored-by: binmakeswell <binmakeswell@gmail.com>
* fix llama pretrain (#5287)
* [hotfix] fix 3d plugin test (#5292)
* fix bug for mefture (#5299)
* [NFC] polish applications/Colossal-LLaMA-2/colossal_llama2/tokenizer/init_tokenizer.py code style (#5228)
* fix some typo (#5307)
* [feat] refactored extension module (#5298)
* [feat] refactored extension module
* polish
* polish
* polish
* polish
* polish
* polish
* polish
* polish
* polish
* polish
* [workflow] updated CI image (#5318)
* [accelerator] fixed npu api
* [tests] fix t5 test. (#5322)
* [ci] fix shardformer tests. (#5255)
* fix ci
fix
* revert: revert p2p
* feat: add enable_metadata_cache option
* revert: enable t5 tests
---------
Co-authored-by: Wenhao Chen <cwher@outlook.com>
* fix t5 test
---------
Co-authored-by: Wenhao Chen <cwher@outlook.com>
* [doc] added docs for extensions (#5324)
* [doc] added docs for extensions
* polish
* polish
* fix typo under extensions/ (#5330)
* fix typo change dosen't to doesn't (#5308)
* [extension] fixed exception catch (#5342)
* [Chat] fix sft loss nan (#5345)
* fix script
* fix script
* fix chat nan
* fix chat nan
* [checkpointio] fix gemini and hybrid parallel optim checkpoint (#5347)
* [checkpointio] fix hybrid parallel optim checkpoint
* [extension] fix cuda extension
* [checkpointio] fix gemini optimizer checkpoint
* polish code
* [fix] remove unnecessary dp_size assert (#5351)
* fix: remove unnecessary assert
* test: add more 3d plugin tests
* fix: add warning
* [gemini] fix param op hook when output is tuple (#5355)
* [gemini] fix param op hook when output is tuple
* [gemini] fix param op hook
* [llama] fix dataloader for hybrid parallel (#5358)
* [plugin] refactor prepare dataloader
* [plugin] update train script
* [llama] update training script (#5360)
* [llama] update training script
* [doc] polish docstr
* [llama] add flash attn patch for npu (#5362)
* [llama] fix neftune & pbar with start_step (#5364)
* [eval] update llama npu eval (#5366)
* [llama] polish training script and fix optim ckpt (#5368)
* [lr-scheduler] fix load state dict and add test (#5369)
* [llama] fix memory issue (#5371)
* [llama] fix memory issue
* [llama] add comment
* [moe] init mixtral impl
* [moe] update capacity computing (#5253)
* [moe] top2 allow uneven input
* [moe] update capacity computing
* [moe] remove debug info
* [moe] update capacity computing
* [moe] update capacity computing
* [moe] support mixtral (#5309)
* [moe] add mixtral block for single expert
* [moe] mixtral block fwd support uneven ep
* [moe] mixtral block bwd support uneven ep
* [moe] add mixtral moe layer
* [moe] simplify replace
* [meo] support save sharded mixtral
* [meo] support load sharded mixtral
* [meo] support save sharded optim
* [meo] integrate moe manager into plug
* [meo] fix optimizer load
* [meo] fix mixtral layer
* [moe] fix mixtral checkpoint io (#5314)
* [moe] fix mixtral forward default value (#5329)
* [moe] fix mixtral optim checkpoint (#5344)
* [moe] fix tests
* [release] update version (#5380)
* [llama] fix training and inference scripts (#5384)
* [llama] refactor inference example to fit sft
* [llama] fix training script to fit gemini
* [llama] fix inference script
* [doc] Fix typo (#5361)
* [doc] updated installation command (#5389)
* [hotfix] fix variable type for top_p (#5313)
Co-authored-by: binmakeswell <binmakeswell@gmail.com>
* [hotfix] Fix wrong import in meta_registry (#5392)
* [extension] hotfix jit extension setup (#5402)
* [example] reuse flash attn patch (#5400)
* [fsdp] impl save/load shard model/optimizer (#5357)
* [setup] fixed nightly release (#5388)
* [shardformer]gather llama logits (#5398)
* gather llama logits
* fix
* update requirements (#5407)
* [workflow] added pypi channel (#5412)
* [doc] fix blog link
* [doc] fix blog link
* fix sft single turn inference example (#5416)
* [example]add gpt2 benchmark example script. (#5295)
* benchmark gpt2
* fix
fix
fix
fix
* [doc] fix typo in Colossal-LLaMA-2/README.md (#5247)
* [workflow] fixed build CI (#5240)
* [workflow] fixed build CI
* polish
* polish
* polish
* polish
* polish
* [ci] fixed booster test (#5251)
* [ci] fixed booster test
* [ci] fixed booster test
* [ci] fixed booster test
* [ci] fixed ddp test (#5254)
* [ci] fixed ddp test
* polish
* fix typo in applications/ColossalEval/README.md (#5250)
* [ci] fix shardformer tests. (#5255)
* fix ci
fix
* revert: revert p2p
* feat: add enable_metadata_cache option
* revert: enable t5 tests
---------
Co-authored-by: Wenhao Chen <cwher@outlook.com>
* [doc] fix doc typo (#5256)
* [doc] fix annotation display
* [doc] fix llama2 doc
* [hotfix]: add pp sanity check and fix mbs arg (#5268)
* fix: fix misleading mbs arg
* feat: add pp sanity check
* fix: fix 1f1b sanity check
* [workflow] fixed incomplete bash command (#5272)
* [workflow] fixed oom tests (#5275)
* [workflow] fixed oom tests
* polish
* polish
* polish
* [ci] fix test_hybrid_parallel_plugin_checkpoint_io.py (#5276)
* fix ci
fix
* fix test
* revert: revert p2p
* feat: add enable_metadata_cache option
* revert: enable t5 tests
* fix
---------
Co-authored-by: Wenhao Chen <cwher@outlook.com>
* [shardformer] hybridparallelplugin support gradients accumulation. (#5246)
* support gradients acc
fix
fix
fix
fix
fix
fix
fix
fix
fix
fix
fix
fix
fix
* fix
fix
* fix
fix
fix
* [hotfix] Fix ShardFormer test execution path when using sequence parallelism (#5230)
* fix auto loading gpt2 tokenizer (#5279)
* [doc] add llama2-13B disyplay (#5285)
* Update README.md
* fix 13b typo
---------
Co-authored-by: binmakeswell <binmakeswell@gmail.com>
* fix llama pretrain (#5287)
* fix
* fix
* fix
fix
* fix
fix
fix
* fix
fix
* benchmark gpt2
* fix
fix
fix
fix
* [workflow] fixed build CI (#5240)
* [workflow] fixed build CI
* polish
* polish
* polish
* polish
* polish
* [ci] fixed booster test (#5251)
* [ci] fixed booster test
* [ci] fixed booster test
* [ci] fixed booster test
* fix
fix
* fix
fix
fix
* fix
* fix
fix
fix
fix
fix
* fix
* Update shardformer.py
---------
Co-authored-by: digger yu <digger-yu@outlook.com>
Co-authored-by: Frank Lee <somerlee.9@gmail.com>
Co-authored-by: Wenhao Chen <cwher@outlook.com>
Co-authored-by: binmakeswell <binmakeswell@gmail.com>
Co-authored-by: Zhongkai Zhao <kanezz620@gmail.com>
Co-authored-by: Michelle <97082656+MichelleMa8@users.noreply.github.com>
Co-authored-by: Desperado-Jia <502205863@qq.com>
* [doc] sora release (#5425)
* [doc] sora release
* [doc] sora release
* [doc] sora release
* [doc] sora release
* [devops] fix extention building (#5427)
* [hotfix] fix sd vit import error (#5420)
* fix import error
* Update dpt_depth.py
---------
Co-authored-by: binmakeswell <binmakeswell@gmail.com>
* [hotfix] fix typo of openmoe model source (#5403)
* [doc] update some translations with README-zh-Hans.md (#5382)
* [hotfix] fix typo change _descrption to _description (#5331)
* [hotfix] fix typo change enabel to enable under colossalai/shardformer/ (#5317)
* [eval-hotfix] set few_shot_data to None when few shot is disabled (#5422)
* [hotfix] fix typo change MoECheckpintIO to MoECheckpointIO (#5335)
Co-authored-by: binmakeswell <binmakeswell@gmail.com>
* [doc] Fix typo s/infered/inferred/ (#5288)
Signed-off-by: hugo-syn <hugo.vincent@synacktiv.com>
* [hotfix] fix stable diffusion inference bug. (#5289)
* Update train_ddp.yaml
delete "strategy" to fix DDP config loading bug in "main.py"
* Update train_ddp.yaml
fix inference with scripts/txt2img.py config file load bug.
* Update README.md
add pretrain model test code.
* [colossal-llama2] add stream chat examlple for chat version model (#5428)
* add stream chat for chat version
* remove os.system clear
* modify function name
* [release] update version (#5411)
* fix tensor data update for gemini loss caluculation (#5442)
* [hotfix] fix typo s/keywrods/keywords etc. (#5429)
* [devops] fix compatibility (#5444)
* [devops] fix compatibility
* [hotfix] update compatibility test on pr
* [devops] fix compatibility
* [devops] record duration during comp test
* [test] decrease test duration
* fix falcon
* [shardformer] fix gathering output when using tensor parallelism (#5431)
* fix
* padding vocab_size when using pipeline parallellism
padding vocab_size when using pipeline parallellism
fix
fix
* fix
* fix
fix
fix
* fix gather output
* fix
* fix
* fix
fix resize embedding
fix resize embedding
* fix resize embedding
fix
* revert
* revert
* revert
* [doc] release Open-Sora 1.0 with model weights (#5468)
* [doc] release Open-Sora 1.0 with model weights
* [doc] release Open-Sora 1.0 with model weights
* [doc] release Open-Sora 1.0 with model weights
* [doc] update open-sora demo (#5479)
* [doc] update open-sora demo
* [doc] update open-sora demo
* [doc] update open-sora demo
* [example] add grok-1 inference (#5485)
* [misc] add submodule
* remove submodule
* [example] support grok-1 tp inference
* [example] add grok-1 inference script
* [example] refactor code
* [example] add grok-1 readme
* [exmaple] add test ci
* [exmaple] update readme
* [release] grok-1 314b inference (#5490)
* [release] grok-1 inference
* [release] grok-1 inference
* [release] grok-1 inference
* [example] update Grok-1 inference (#5495)
* revise grok-1 example
* remove unused arg in scripts
* prevent re-installing torch
* update readme
* revert modifying colossalai requirements
* add perf
* trivial
* add tokenizer url
* [hotfix] set return_outputs=False in examples and polish code (#5404)
* fix: simplify merge_batch
* fix: use return_outputs=False to eliminate extra memory consumption
* feat: add return_outputs warning
* style: remove `return_outputs=False` as it is the default value
* [release] grok-1 inference benchmark (#5500)
* [release] grok-1 inference benchmark
* [release] grok-1 inference benchmark
* [release] grok-1 inference benchmark
* [release] grok-1 inference benchmark
* [release] grok-1 inference benchmark
* [shardformer]Fix lm parallel. (#5480)
* fix
* padding vocab_size when using pipeline parallellism
padding vocab_size when using pipeline parallellism
fix
fix
* fix
* fix
fix
fix
* fix gather output
* fix
* fix
* fix
fix resize embedding
fix resize embedding
* fix resize embedding
fix
* revert
* revert
* revert
* fix lm forward distribution
* fix
* test ci
* fix
* [fix] fix grok-1 example typo (#5506)
* [devops] fix example test ci (#5504)
* Fix ColoTensorSpec for py11 (#5440)
* fixed layout converter caching and updated tester
* Empty-Commit
* [shardformer] update colo attention to support custom mask (#5510)
* [feature] refactor colo attention (#5462)
* [extension] update api
* [feature] add colo attention
* [feature] update sdpa
* [feature] update npu attention
* [feature] update flash-attn
* [test] add flash attn test
* [test] update flash attn test
* [shardformer] update modeling to fit colo attention (#5465)
* [misc] refactor folder structure
* [shardformer] update llama flash-attn
* [shardformer] fix llama policy
* [devops] update tensornvme install
* [test] update llama test
* [shardformer] update colo attn kernel dispatch
* [shardformer] update blip2
* [shardformer] update chatglm
* [shardformer] update gpt2
* [shardformer] update gptj
* [shardformer] update opt
* [shardformer] update vit
* [shardformer] update colo attention mask prep
* [shardformer] update whisper
* [test] fix shardformer tests (#5514)
* [test] fix shardformer tests
* [test] fix shardformer tests
* [format] applied code formatting on changed files in pull request 5510 (#5517)
Co-authored-by: github-actions <github-actions@github.com>
* [shardformer] fix pipeline forward error if custom layer distribution is used (#5189)
* Use self.[distribute_layers|get_stage_index] to exploit custom layer distribution
* Change static methods for t5 layer distribution to member functions
* Change static methods for whisper layer distribution to member functions
* Replace whisper policy usage with self one
* Fix test case to use non-static layer distribution methods
* fix: fix typo
---------
Co-authored-by: Wenhao Chen <cwher@outlook.com>
* [Fix] Grok-1 use tokenizer from the same pretrained path (#5532)
* [fix] use tokenizer from the same pretrained path
* trust remote code
* [ColossalChat] Update RLHF V2 (#5286)
* Add dpo. Fix sft, ppo, lora. Refactor all
* fix and tested ppo
* 2 nd round refactor
* add ci tests
* fix ci
* fix ci
* fix readme, style
* fix readme style
* fix style, fix benchmark
* reproduce benchmark result, remove useless files
* rename to ColossalChat
* use new image
* fix ci workflow
* fix ci
* use local model/tokenizer for ci tests
* fix ci
* fix ci
* fix ci
* fix ci timeout
* fix rm progress bar. fix ci timeout
* fix ci
* fix ci typo
* remove 3d plugin from ci temporary
* test environment
* cannot save optimizer
* support chat template
* fix readme
* fix path
* test ci locally
* restore build_or_pr
* fix ci data path
* fix benchmark
* fix ci, move ci tests to 3080, disable fast tokenizer
* move ci to 85
* support flash attention 2
* add all-in-one data preparation script. Fix colossal-llama2-chat chat template
* add hardware requirements
* move ci test data
* fix save_model, add unwrap
* fix missing bos
* fix missing bos; support grad accumulation with gemini
* fix ci
* fix ci
* fix ci
* fix llama2 chat template config
* debug sft
* debug sft
* fix colossalai version requirement
* fix ci
* add sanity check to prevent NaN loss
* fix requirements
* add dummy data generation script
* add dummy data generation script
* add dummy data generation script
* add dummy data generation script
* update readme
* update readme
* update readme and ignore
* fix logger bug
* support parallel_output
* modify data preparation logic
* fix tokenization
* update lr
* fix inference
* run pre-commit
---------
Co-authored-by: Tong Li <tong.li352711588@gmail.com>
* [shardformer, pipeline] add `gradient_checkpointing_ratio` and heterogenous shard policy for llama (#5508)
* feat: add `GradientCheckpointConfig` and `PipelineGradientCheckpointConfig`
* feat: apply `GradientCheckpointConfig` to policy and llama_forward
* feat: move `distribute_layer` and `get_stage_index` to PipelineStageManager
* fix: add optional args for `distribute_layer` and `get_stage_index`
* fix: fix changed API calls
* test: update llama tests
* style: polish `GradientCheckpointConfig`
* fix: fix pipeline utils tests
* fix incorrect sharding without zero (#5545)
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [shardformer] Sequence Parallelism Optimization (#5533)
* sequence parallel optimization
* validate sequence parallel in llama (code to be polished)
* shardformer api writing
* integrate sequence parallel in ShardFormer
* fix pp bugs and sp bugs for LlaMa model
* integrating ring-based sequence parallelism into ShardFormer
* [sequence parallelism]: Add fused megatron function
* integrating ring-based sequence parallelism into ShardFormer
---------
Co-authored-by: linsj20 <linsj20@mails.tsinghua.edu.cn>
* fix bugs when useing sp and flashattention together
* fix operation function name
* support flash attention for ulysses-style sp
* clarify sp process group
* fix compatibility bugs in moe plugin
* fix fused linear bugs
* fix linear layer test
* support gpt model all-to-all sp
* modify shard data dimension (meant to be dim=-1)
* support megtron-style sp and distributed attn for llama model
* [shardformer] add megatron sp to llama
* support llama7B 128k with distributed attention
* [shardformer] robustness enhancement
* add block attn
* sp mode 1: keep input as a complete sequence
* fix sp compatability
* finish sp mode 3 support for gpt
* using all_to_all_single when batch size is 1
* support mode 2 sp in gpt2 (#5)
* [shardformer] add megatron sp to llama
* support llama7B 128k with distributed attention
* [shardformer] robustness enhancement
* add block attn
* sp mode 1: keep input as a complete sequence
* fix sp compatability
* refactor ring implementation
* support mode 2 sp in gpt2
* polish code
* enable distributed attn mask when using sp mode 2 and 3 in llama
* automatically enable flash attn when using sp mode 2 and 3 in llama
* inplace attn mask
* add zero2 support for sequence parallel
* polish code
* fix bugs
* fix gemini checkpoint io
* loose tensor checking atol and rtol
* add comment
* fix llama layernorm grad
* fix zero grad
* fix zero grad
* fix conflict
* update split and gather auto grad func
* sequence parallel: inside text split (#6)
* polish code (part 1)
* polish code (part 2)
* polish code (part 2.5)
* polish code (part 3)
* sequence parallel: inside text split
* miscellaneous minor fixes
* polish code
* fix ulysses style ZeRO
* sequence parallel: inside text split
* miscellaneous minor fixes
* disaggregate sp group and dp group for sp
* fix llama and gpt sp
* polish code
* move ulysses grad sync to ddp (#9)
* remove zero_stage and unbind the grad sync for alltoall sp
* add 2d group creation test
* move ulysses grad sync to ddp
* add 2d group creation test
* remove useless code
* change shard config not to enable sp when enable_all_optimizations
* add sp warnings for several model
* remove useless code
---------
Co-authored-by: linsj20 <linsj20@mails.tsinghua.edu.cn>
* [hotfix] quick fixes to make legacy tutorials runnable (#5559)
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [fix] fix typo s/muiti-node /multi-node etc. (#5448)
* [hotfix] fix typo s/get_defualt_parser /get_default_parser (#5548)
* [devops] remove post commit ci (#5566)
* [devops] remove post commit ci
* [misc] run pre-commit on all files
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [doc] fix ColossalMoE readme (#5599)
* fix readme
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [zero] support multiple (partial) backward passes (#5596)
* [zero] support multiple (partial) backward passes
* [misc] update requirements
* [shardformer] refactor embedding resize (#5603)
* [branch rebase] rebase main to Feature/resize_embedding (#5554)
* fix
* [release] update version (#5411)
* [hotfix] fix typo s/keywrods/keywords etc. (#5429)
* [devops] fix compatibility (#5444)
* [devops] fix compatibility
* [hotfix] update compatibility test on pr
* [devops] fix compatibility
* [devops] record duration during comp test
* [test] decrease test duration
* fix falcon
* [shardformer] fix gathering output when using tensor parallelism (#5431)
* fix
* padding vocab_size when using pipeline parallellism
padding vocab_size when using pipeline parallellism
fix
fix
* fix
* fix
fix
fix
* fix gather output
* fix
* fix
* fix
fix resize embedding
fix resize embedding
* fix resize embedding
fix
* revert
* revert
* revert
* [doc] release Open-Sora 1.0 with model weights (#5468)
* [doc] release Open-Sora 1.0 with model weights
* [doc] release Open-Sora 1.0 with model weights
* [doc] release Open-Sora 1.0 with model weights
* [doc] update open-sora demo (#5479)
* [doc] update open-sora demo
* [doc] update open-sora demo
* [doc] update open-sora demo
* [example] add grok-1 inference (#5485)
* [misc] add submodule
* remove submodule
* [example] support grok-1 tp inference
* [example] add grok-1 inference script
* [example] refactor code
* [example] add grok-1 readme
* [exmaple] add test ci
* [exmaple] update readme
---------
Co-authored-by: Hongxin Liu <lhx0217@gmail.com>
Co-authored-by: digger yu <digger-yu@outlook.com>
Co-authored-by: binmakeswell <binmakeswell@gmail.com>
* [CI] run pre-commit (#5577)
* fix
* [release] update version (#5411)
* [hotfix] fix typo s/keywrods/keywords etc. (#5429)
* [devops] fix compatibility (#5444)
* [devops] fix compatibility
* [hotfix] update compatibility test on pr
* [devops] fix compatibility
* [devops] record duration during comp test
* [test] decrease test duration
* fix falcon
* [shardformer] fix gathering output when using tensor parallelism (#5431)
* fix
* padding vocab_size when using pipeline parallellism
padding vocab_size when using pipeline parallellism
fix
fix
* fix
* fix
fix
fix
* fix gather output
* fix
* fix
* fix
fix resize embedding
fix resize embedding
* fix resize embedding
fix
* revert
* revert
* revert
* [doc] release Open-Sora 1.0 with model weights (#5468)
* [doc] release Open-Sora 1.0 with model weights
* [doc] release Open-Sora 1.0 with model weights
* [doc] release Open-Sora 1.0 with model weights
* [doc] update open-sora demo (#5479)
* [doc] update open-sora demo
* [doc] update open-sora demo
* [doc] update open-sora demo
* [example] add grok-1 inference (#5485)
* [misc] add submodule
* remove submodule
* [example] support grok-1 tp inference
* [example] add grok-1 inference script
* [example] refactor code
* [example] add grok-1 readme
* [exmaple] add test ci
* [exmaple] update readme
* run pre-commit
---------
Co-authored-by: Hongxin Liu <lhx0217@gmail.com>
Co-authored-by: digger yu <digger-yu@outlook.com>
Co-authored-by: binmakeswell <binmakeswell@gmail.com>
* [rebase] rebase main to resize-embedding (#5581)
* [release] grok-1 314b inference (#5490)
* [release] grok-1 inference
* [release] grok-1 inference
* [release] grok-1 inference
* [example] update Grok-1 inference (#5495)
* revise grok-1 example
* remove unused arg in scripts
* prevent re-installing torch
* update readme
* revert modifying colossalai requirements
* add perf
* trivial
* add tokenizer url
* [hotfix] set return_outputs=False in examples and polish code (#5404)
* fix: simplify merge_batch
* fix: use return_outputs=False to eliminate extra memory consumption
* feat: add return_outputs warning
* style: remove `return_outputs=False` as it is the default value
* [release] grok-1 inference benchmark (#5500)
* [release] grok-1 inference benchmark
* [release] grok-1 inference benchmark
* [release] grok-1 inference benchmark
* [release] grok-1 inference benchmark
* [release] grok-1 inference benchmark
* [shardformer]Fix lm parallel. (#5480)
* fix
* padding vocab_size when using pipeline parallellism
padding vocab_size when using pipeline parallellism
fix
fix
* fix
* fix
fix
fix
* fix gather output
* fix
* fix
* fix
fix resize embedding
fix resize embedding
* fix resize embedding
fix
* revert
* revert
* revert
* fix lm forward distribution
* fix
* test ci
* fix
* [fix] fix grok-1 example typo (#5506)
* [devops] fix example test ci (#5504)
* Fix ColoTensorSpec for py11 (#5440)
* fixed layout converter caching and updated tester
* Empty-Commit
* [shardformer] update colo attention to support custom mask (#5510)
* [feature] refactor colo attention (#5462)
* [extension] update api
* [feature] add colo attention
* [feature] update sdpa
* [feature] update npu attention
* [feature] update flash-attn
* [test] add flash attn test
* [test] update flash attn test
* [shardformer] update modeling to fit colo attention (#5465)
* [misc] refactor folder structure
* [shardformer] update llama flash-attn
* [shardformer] fix llama policy
* [devops] update tensornvme install
* [test] update llama test
* [shardformer] update colo attn kernel dispatch
* [shardformer] update blip2
* [shardformer] update chatglm
* [shardformer] update gpt2
* [shardformer] update gptj
* [shardformer] update opt
* [shardformer] update vit
* [shardformer] update colo attention mask prep
* [shardformer] update whisper
* [test] fix shardformer tests (#5514)
* [test] fix shardformer tests
* [test] fix shardformer tests
* [format] applied code formatting on changed files in pull request 5510 (#5517)
Co-authored-by: github-actions <github-actions@github.com>
* [shardformer] fix pipeline forward error if custom layer distribution is used (#5189)
* Use self.[distribute_layers|get_stage_index] to exploit custom layer distribution
* Change static methods for t5 layer distribution to member functions
* Change static methods for whisper layer distribution to member functions
* Replace whisper policy usage with self one
* Fix test case to use non-static layer distribution methods
* fix: fix typo
---------
Co-authored-by: Wenhao Chen <cwher@outlook.com>
* [Fix] Grok-1 use tokenizer from the same pretrained path (#5532)
* [fix] use tokenizer from the same pretrained path
* trust remote code
* [ColossalChat] Update RLHF V2 (#5286)
* Add dpo. Fix sft, ppo, lora. Refactor all
* fix and tested ppo
* 2 nd round refactor
* add ci tests
* fix ci
* fix ci
* fix readme, style
* fix readme style
* fix style, fix benchmark
* reproduce benchmark result, remove useless files
* rename to ColossalChat
* use new image
* fix ci workflow
* fix ci
* use local model/tokenizer for ci tests
* fix ci
* fix ci
* fix ci
* fix ci timeout
* fix rm progress bar. fix ci timeout
* fix ci
* fix ci typo
* remove 3d plugin from ci temporary
* test environment
* cannot save optimizer
* support chat template
* fix readme
* fix path
* test ci locally
* restore build_or_pr
* fix ci data path
* fix benchmark
* fix ci, move ci tests to 3080, disable fast tokenizer
* move ci to 85
* support flash attention 2
* add all-in-one data preparation script. Fix colossal-llama2-chat chat template
* add hardware requirements
* move ci test data
* fix save_model, add unwrap
* fix missing bos
* fix missing bos; support grad accumulation with gemini
* fix ci
* fix ci
* fix ci
* fix llama2 chat template config
* debug sft
* debug sft
* fix colossalai version requirement
* fix ci
* add sanity check to prevent NaN loss
* fix requirements
* add dummy data generation script
* add dummy data generation script
* add dummy data generation script
* add dummy data generation script
* update readme
* update readme
* update readme and ignore
* fix logger bug
* support parallel_output
* modify data preparation logic
* fix tokenization
* update lr
* fix inference
* run pre-commit
---------
Co-authored-by: Tong Li <tong.li352711588@gmail.com>
* [shardformer, pipeline] add `gradient_checkpointing_ratio` and heterogenous shard policy for llama (#5508)
* feat: add `GradientCheckpointConfig` and `PipelineGradientCheckpointConfig`
* feat: apply `GradientCheckpointConfig` to policy and llama_forward
* feat: move `distribute_layer` and `get_stage_index` to PipelineStageManager
* fix: add optional args for `distribute_layer` and `get_stage_index`
* fix: fix changed API calls
* test: update llama tests
* style: polish `GradientCheckpointConfig`
* fix: fix pipeline utils tests
* fix incorrect sharding without zero (#5545)
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [shardformer] Sequence Parallelism Optimization (#5533)
* sequence parallel optimization
* validate sequence parallel in llama (code to be polished)
* shardformer api writing
* integrate sequence parallel in ShardFormer
* fix pp bugs and sp bugs for LlaMa model
* integrating ring-based sequence parallelism into ShardFormer
* [sequence parallelism]: Add fused megatron function
* integrating ring-based sequence parallelism into ShardFormer
---------
Co-authored-by: linsj20 <linsj20@mails.tsinghua.edu.cn>
* fix bugs when useing sp and flashattention together
* fix operation function name
* support flash attention for ulysses-style sp
* clarify sp process group
* fix compatibility bugs in moe plugin
* fix fused linear bugs
* fix linear layer test
* support gpt model all-to-all sp
* modify shard data dimension (meant to be dim=-1)
* support megtron-style sp and distributed attn for llama model
* [shardformer] add megatron sp to llama
* support llama7B 128k with distributed attention
* [shardformer] robustness enhancement
* add block attn
* sp mode 1: keep input as a complete sequence
* fix sp compatability
* finish sp mode 3 support for gpt
* using all_to_all_single when batch size is 1
* support mode 2 sp in gpt2 (#5)
* [shardformer] add megatron sp to llama
* support llama7B 128k with distributed attention
* [shardformer] robustness enhancement
* add block attn
* sp mode 1: keep input as a complete sequence
* fix sp compatability
* refactor ring implementation
* support mode 2 sp in gpt2
* polish code
* enable distributed attn mask when using sp mode 2 and 3 in llama
* automatically enable flash attn when using sp mode 2 and 3 in llama
* inplace attn mask
* add zero2 support for sequence parallel
* polish code
* fix bugs
* fix gemini checkpoint io
* loose tensor checking atol and rtol
* add comment
* fix llama layernorm grad
* fix zero grad
* fix zero grad
* fix conflict
* update split and gather auto grad func
* sequence parallel: inside text split (#6)
* polish code (part 1)
* polish code (part 2)
* polish code (part 2.5)
* polish code (part 3)
* sequence parallel: inside text split
* miscellaneous minor fixes
* polish code
* fix ulysses style ZeRO
* sequence parallel: inside text split
* miscellaneous minor fixes
* disaggregate sp group and dp group for sp
* fix llama and gpt sp
* polish code
* move ulysses grad sync to ddp (#9)
* remove zero_stage and unbind the grad sync for alltoall sp
* add 2d group creation test
* move ulysses grad sync to ddp
* add 2d group creation test
* remove useless code
* change shard config not to enable sp when enable_all_optimizations
* add sp warnings for several model
* remove useless code
---------
Co-authored-by: linsj20 <linsj20@mails.tsinghua.edu.cn>
* [hotfix] quick fixes to make legacy tutorials runnable (#5559)
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [fix] fix typo s/muiti-node /multi-node etc. (#5448)
* [hotfix] fix typo s/get_defualt_parser /get_default_parser (#5548)
* [devops] remove post commit ci (#5566)
* [devops] remove post commit ci
* [misc] run pre-commit on all files
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---------
Co-authored-by: binmakeswell <binmakeswell@gmail.com>
Co-authored-by: Yuanheng Zhao <54058983+yuanheng-zhao@users.noreply.github.com>
Co-authored-by: Wenhao Chen <cwher@outlook.com>
Co-authored-by: Hongxin Liu <lhx0217@gmail.com>
Co-authored-by: Rocky Duan <dementrock@users.noreply.github.com>
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: Edenzzzz <wenxuan.tan@wisc.edu>
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: github-actions <github-actions@github.com>
Co-authored-by: Insu Jang <insujang@umich.edu>
Co-authored-by: YeAnbang <44796419+YeAnbang@users.noreply.github.com>
Co-authored-by: Tong Li <tong.li352711588@gmail.com>
Co-authored-by: Zhongkai Zhao <kanezz620@gmail.com>
Co-authored-by: linsj20 <linsj20@mails.tsinghua.edu.cn>
Co-authored-by: digger yu <digger-yu@outlook.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [shardformer]enable padding vocabulary size. (#5489)
* padding vocab_size when using pipeline parallellism
padding vocab_size when using pipeline parallellism
fix
fix
* fix
* fix
fix
fix
* fix gather output
* fix
* fix
* fix
fix resize embedding
fix resize embedding
* fix resize embedding
fix
* revert
* revert
* revert
* padding vocab
* padding vocabe
* fix
* fix
* fxi
* test ci
* fix
fix
fix
fix
* fix
fix
* fix
* fix
* Update hybrid_parallel_plugin.py
fix
fix
fix
* fix
fix
* fix
fix
* fix
* resolve super init
resolve super init
resolve super init
resolve super init
* resolve comments
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* vocab checkpointio
* padding vocab_size when using pipeline parallellism
padding vocab_size when using pipeline parallellism
fix
fix
* fix
fix
fix
* fix
* fix
fix resize embedding
fix resize embedding
* fix resize embedding
fix
* revert
* revert
* padding vocab
* fix
* fix
fix
* fix
fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* cherry-pick
* revert moe modify
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
fix
fix
fix
fix
fix
fix
fix
* resolve comments
resolve comments
resolve comments
resolve comments
resolve comments
* ptensor
ptensor
resolve comments
fix
fix
fix
fix
fix
resolve comments
resolve comments
resolve comments
resolve comments
resolve comments
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Hongxin Liu <lhx0217@gmail.com>
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix rebase
* fix rebase
---------
Co-authored-by: Hongxin Liu <lhx0217@gmail.com>
Co-authored-by: digger yu <digger-yu@outlook.com>
Co-authored-by: binmakeswell <binmakeswell@gmail.com>
Co-authored-by: Yuanheng Zhao <54058983+yuanheng-zhao@users.noreply.github.com>
Co-authored-by: Wenhao Chen <cwher@outlook.com>
Co-authored-by: Rocky Duan <dementrock@users.noreply.github.com>
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: Edenzzzz <wenxuan.tan@wisc.edu>
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: github-actions <github-actions@github.com>
Co-authored-by: Insu Jang <insujang@umich.edu>
Co-authored-by: YeAnbang <44796419+YeAnbang@users.noreply.github.com>
Co-authored-by: Tong Li <tong.li352711588@gmail.com>
Co-authored-by: Zhongkai Zhao <kanezz620@gmail.com>
Co-authored-by: linsj20 <linsj20@mails.tsinghua.edu.cn>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [hotfix] Fix examples no pad token & auto parallel codegen bug; (#5606)
* fix no pad token bug
* fixed some auto parallel codegen bug, but might not run on torch 2.1
---------
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [shardformer] fix pipeline grad ckpt (#5620)
* [shardformer] fix pipeline grad ckpt
* [lora] add lora APIs for booster, support lora for TorchDDP (#4981)
* add apis and peft requirement
* add liscense and implement apis
* add checkpointio apis
* add torchddp fwd_bwd test
* add support_lora methods
* add checkpointio test and debug
* delete unneeded codes
* remove peft from LICENSE
* add concrete methods for enable_lora
* simplify enable_lora api
* fix requirements
* [LowLevelZero] low level zero support lora (#5153)
* low level zero support lora
low level zero support lora
* add checkpoint test
* add checkpoint test
* fix
* fix
* fix
* fix
fix
fix
fix
* fix
* fix
fix
fix
fix
fix
fix
fix
* fix
* fix
fix
fix
fix
fix
fix
fix
* fix
* test ci
* git # This is a combination of 3 commits.
Update low_level_zero_plugin.py
Update low_level_zero_plugin.py
fix
fix
fix
* fix naming
fix naming
fix naming
fix
* [feature] qlora support
* qlora follow commit
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* migrate qutization folder to colossalai/
* minor fixes
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* gptj sp fix
* remove redundancies from pre-commit
* minor fixes
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Signed-off-by: hugo-syn <hugo.vincent@synacktiv.com>
Co-authored-by: Jianghai <72591262+CjhHa1@users.noreply.github.com>
Co-authored-by: Bin Jia <45593998+FoolPlayer@users.noreply.github.com>
Co-authored-by: アマデウス <kurisusnowdeng@users.noreply.github.com>
Co-authored-by: yuehuayingxueluo <867460659@qq.com>
Co-authored-by: Cuiqing Li <lixx3527@gmail.com>
Co-authored-by: cuiqing.li <lixx336@gmail.com>
Co-authored-by: Yuanchen <70520919+chengeharrison@users.noreply.github.com>
Co-authored-by: Xu Yuanchen <yuanchen.xu00@gmail.com>
Co-authored-by: littsk <1214689160@qq.com>
Co-authored-by: Baizhou Zhang <eddiezhang@pku.edu.cn>
Co-authored-by: ppt0011 <143150326+ppt0011@users.noreply.github.com>
Co-authored-by: Hongxin Liu <lhx0217@gmail.com>
Co-authored-by: Xuanlei Zhao <43881818+oahzxl@users.noreply.github.com>
Co-authored-by: Zhongkai Zhao <kanezz620@gmail.com>
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: github-actions <github-actions@github.com>
Co-authored-by: Wenhao Chen <cwher@outlook.com>
Co-authored-by: Jun Gao <imgaojun@gmail.com>
Co-authored-by: flybird11111 <1829166702@qq.com>
Co-authored-by: Xu Kai <xukai16@foxmail.com>
Co-authored-by: Zian(Andy) Zheng <62330719+Orion-Zheng@users.noreply.github.com>
Co-authored-by: digger yu <digger-yu@outlook.com>
Co-authored-by: CjhHa1 <cjh18671720497@outlook.com>
Co-authored-by: Xu Kai <xukai16@foxamil.com>
Co-authored-by: Orion-Zheng <zheng_zian@u.nus.edu>
Co-authored-by: Elsa Granger <zeyugao@outlook.com>
Co-authored-by: YeAnbang <44796419+YeAnbang@users.noreply.github.com>
Co-authored-by: YeAnbang <anbangy2@outlook.com>
Co-authored-by: Orion-Zheng <zhengzian@u.nus.edu>
Co-authored-by: Pengtai Xu <henryxu880@gmail.com>
Co-authored-by: eric8607242 <e0928021388@gmail.com>
Co-authored-by: binmakeswell <binmakeswell@gmail.com>
Co-authored-by: Frank Lee <somerlee.9@gmail.com>
Co-authored-by: Michelle <97082656+MichelleMa8@users.noreply.github.com>
Co-authored-by: Camille Zhong <44392324+Camille7777@users.noreply.github.com>
Co-authored-by: BlueRum <70618399+ht-zhou@users.noreply.github.com>
Co-authored-by: Tong Li <tong.li352711588@gmail.com>
Co-authored-by: JIMMY ZHAO <knightyzhao@gmail.com>
Co-authored-by: Xuanlei Zhao <xuanlei.zhao@gmail.com>
Co-authored-by: Desperado-Jia <502205863@qq.com>
Co-authored-by: 李文军 <40464906+liwenjuna@users.noreply.github.com>
Co-authored-by: yixiaoer <miyaku@yixiaoer.sg>
Co-authored-by: CZYCW <czyczf@163.com>
Co-authored-by: Stephan Kölker <stephankoe@users.noreply.github.com>
Co-authored-by: QinLuo <eric.x.sun@gmail.com>
Co-authored-by: MickeyCHAN <76671016+danyow-cheung@users.noreply.github.com>
Co-authored-by: Luo Yihang <luo_yihang@outlook.com>
Co-authored-by: Dongruixuan Li <dongruixuan@hotmail.com>
Co-authored-by: hugo-syn <61210734+hugo-syn@users.noreply.github.com>
Co-authored-by: Youngon <Youngon_wyl@163.com>
Co-authored-by: Yuanheng Zhao <54058983+yuanheng-zhao@users.noreply.github.com>
Co-authored-by: Rocky Duan <dementrock@users.noreply.github.com>
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: Edenzzzz <wenxuan.tan@wisc.edu>
Co-authored-by: Insu Jang <insujang@umich.edu>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2024-04-23 09:57:44 +00:00
safe_save_file ( state_dict_cpu , checkpoint_file_path , metadata = { " format " : " pt " } )
2023-08-31 06:50:47 +00:00
else :
[Feature] LoRA rebased to main branch (#5622)
* [Inference]ADD Bench Chatglm2 script (#4963)
* add bench chatglm
* fix bug and make utils
---------
Co-authored-by: CjhHa1 <cjh18671720497outlook.com>
* [Pipeline inference] Combine kvcache with pipeline inference (#4938)
* merge kvcache with pipeline inference and refactor the code structure
* support ppsize > 2
* refactor pipeline code
* do pre-commit
* modify benchmark
* fix bench mark
* polish code
* add docstring and update readme
* refactor the code
* fix some logic bug of ppinfer
* polish readme
* fix typo
* skip infer test
* updated c++17 compiler flags (#4983)
* [Inference] Dynamic Batching Inference, online and offline (#4953)
* [inference] Dynamic Batching for Single and Multiple GPUs (#4831)
* finish batch manager
* 1
* first
* fix
* fix dynamic batching
* llama infer
* finish test
* support different lengths generating
* del prints
* del prints
* fix
* fix bug
---------
Co-authored-by: CjhHa1 <cjh18671720497outlook.com>
* [inference] Async dynamic batching (#4894)
* finish input and output logic
* add generate
* test forward
* 1
* [inference]Re push async dynamic batching (#4901)
* adapt to ray server
* finish async
* finish test
* del test
---------
Co-authored-by: yuehuayingxueluo <867460659@qq.com>
* Revert "[inference]Re push async dynamic batching (#4901)" (#4905)
This reverts commit fbf3c09e673794ed18c91d4bab1a7dfea052e95a.
* Revert "[inference] Async dynamic batching (#4894)"
This reverts commit fced14025043e29ce816b315f440601188f7f79f.
* Revert "[inference] Async dynamic batching (#4894)" (#4909)
This reverts commit fced14025043e29ce816b315f440601188f7f79f.
* Add Ray Distributed Environment Init Scripts
* support DynamicBatchManager base function
* revert _set_tokenizer version
* add driver async generate
* add async test
* fix bugs in test_ray_dist.py
* add get_tokenizer.py
* fix code style
* fix bugs about No module named 'pydantic' in ci test
* fix bugs in ci test
* fix bugs in ci test
* fix bugs in ci test
* [infer]Add Ray Distributed Environment Init Scripts (#4911)
* Revert "[inference] Async dynamic batching (#4894)"
This reverts commit fced14025043e29ce816b315f440601188f7f79f.
* Add Ray Distributed Environment Init Scripts
* support DynamicBatchManager base function
* revert _set_tokenizer version
* add driver async generate
* add async test
* fix bugs in test_ray_dist.py
* add get_tokenizer.py
* fix code style
* fix bugs about No module named 'pydantic' in ci test
* fix bugs in ci test
* fix bugs in ci test
* fix bugs in ci test
* support dynamic batch for bloom model and is_running function
* [Inference]Test for new Async engine (#4935)
* infer engine
* infer engine
* test engine
* test engine
* new manager
* change step
* add
* test
* fix
* fix
* finish test
* finish test
* finish test
* finish test
* add license
---------
Co-authored-by: yuehuayingxueluo <867460659@qq.com>
* add assertion for config (#4947)
* [Inference] Finish dynamic batching offline test (#4948)
* test
* fix test
* fix quant
* add default
* fix
* fix some bugs
* fix some bugs
* fix
* fix bug
* fix bugs
* reset param
---------
Co-authored-by: yuehuayingxueluo <867460659@qq.com>
Co-authored-by: Cuiqing Li <lixx3527@gmail.com>
Co-authored-by: CjhHa1 <cjh18671720497outlook.com>
* [Kernels]Updated Triton kernels into 2.1.0 and adding flash-decoding for llama token attention (#4965)
* adding flash-decoding
* clean
* adding kernel
* adding flash-decoding
* add integration
* add
* adding kernel
* adding kernel
* adding triton 2.1.0 features for inference
* update bloom triton kernel
* remove useless vllm kernels
* clean codes
* fix
* adding files
* fix readme
* update llama flash-decoding
---------
Co-authored-by: cuiqing.li <lixx336@gmail.com>
* fix ColossalEval (#4992)
Co-authored-by: Xu Yuanchen <yuanchen.xu00@gmail.com>
* [doc]Update doc for colossal-inference (#4989)
* update doc
* Update README.md
---------
Co-authored-by: cuiqing.li <lixx336@gmail.com>
* [hotfix] Fix the bug where process groups were not being properly released. (#4940)
* Fix the bug where process groups were not being properly released.
* test
* Revert "test"
This reverts commit 479900c1398637310abf92eefa3cd168038ea02f.
* [hotfix] fix the bug of repeatedly storing param group (#4951)
* [doc] add supported feature diagram for hybrid parallel plugin (#4996)
* [Pipeline Inference] Merge pp with tp (#4993)
* refactor pipeline into new CaiInferEngine
* updata llama modeling forward
* merge tp with pp
* update docstring
* optimize test workflow and example
* fix typo
* add assert and todo
* [release] update version (#4995)
* [release] update version
* [hotfix] fix ci
* [moe] merge moe into main (#4978)
* update moe module
* support openmoe
* [hotfix] fix grad accumulation plus clipping for gemini (#5002)
* [hotfix] Add layer norm gradients all-reduce for sequence parallel (#4926)
* [hotfix] Add layer norm gradients all-reduce for sequence parallel. (#4915)
* Add layer norm gradients all-reduce for sequence parallel.
* skip pipeline inference test
* [hotfix] fixing polices of sequence parallel (#4922)
* Add layer norm gradients all-reduce for sequence parallel.
* fix parameter passing when calling get_autopolicy
---------
Co-authored-by: littsk <1214689160@qq.com>
* Hotfix/add grad all reduce for sequence parallel (#4927)
* Add layer norm gradients all-reduce for sequence parallel.
* fix parameter passing when calling get_autopolicy
* fix bug using wrong variables
---------
Co-authored-by: littsk <1214689160@qq.com>
* fix policy initialization
* fix bloom and chatglm policices
* polish code of handling layernorm
* fix moe module
* polish code of class initializing
---------
Co-authored-by: Zhongkai Zhao <kanezz620@gmail.com>
* [format] applied code formatting on changed files in pull request 4926 (#5007)
Co-authored-by: github-actions <github-actions@github.com>
* [Inference] Fix bug in ChatGLM2 Tensor Parallelism (#5014)
* fix bug
* fix
* fix multiquery
* fix multiquery
---------
Co-authored-by: CjhHa1 <cjh18671720497outlook.com>
* [misc] add code owners (#5024)
* [moe] support optimizer checkpoint (#5015)
* Refactor MoE Manager setup method
* unshard optim ckpt
* optim io
* update transformer version
* update requirements
* update ckpt
* update ckpt
* update ckpt
* fix engine
* fix engine
* Support mtbench (#5025)
Co-authored-by: Xu Yuanchen <yuanchen.xu00@gmail.com>
* [moe]: fix ep/tp tests, add hierarchical all2all (#4982)
* fix: add warning for EP different behavior
* fix: use shard_data in ep & tp model
* to: add used_capacity
* fix: fix router test
* feat: add create_ep_node_group
* feat: add create_ep_hierarchical_group fn
* feat: add HierarchicalAllToAll
* test: add hierarchical all2all test
* fix: fix test errors
* fix: simplify create_ep_hierarchical_group
* fix: add hierarchical_alltoall arg
* fix: fix environ typo
* revert: revert process mesh order
* to: add todo mark
* fix: skip hierarchical_comm if torch < 1.13.1
* [shardformer] Fix serialization error with Tensor Parallel state saving (#5018)
* Fix serialization error with Tensor Parallel state saving
* Refactor state_dict CPU transfer using tree_map
* [gemini] gemini support tensor parallelism. (#4942)
* [colossalai]fix typo
* [inference] Add smmoothquant for llama (#4904)
* [inference] add int8 rotary embedding kernel for smoothquant (#4843)
* [inference] add smoothquant llama attention (#4850)
* add smoothquant llama attention
* remove uselss code
* remove useless code
* fix import error
* rename file name
* [inference] add silu linear fusion for smoothquant llama mlp (#4853)
* add silu linear
* update skip condition
* catch smoothquant cuda lib exception
* prcocess exception for tests
* [inference] add llama mlp for smoothquant (#4854)
* add llama mlp for smoothquant
* fix down out scale
* remove duplicate lines
* add llama mlp check
* delete useless code
* [inference] add smoothquant llama (#4861)
* add smoothquant llama
* fix attention accuracy
* fix accuracy
* add kv cache and save pretrained
* refactor example
* delete smooth
* refactor code
* [inference] add smooth function and delete useless code for smoothquant (#4895)
* add smooth function and delete useless code
* update datasets
* remove duplicate import
* delete useless file
* refactor codes (#4902)
* rafactor code
* add license
* add torch-int and smoothquant license
* Update flash_attention_patch.py
To be compatible with the new change in the Transformers library, where a new argument 'padding_mask' was added to forward function of attention layer.
https://github.com/huggingface/transformers/pull/25598
* [kernel] support pure fp16 for cpu adam and update gemini optim tests (#4921)
* [kernel] support pure fp16 for cpu adam (#4896)
* [kernel] fix cpu adam kernel for pure fp16 and update tests (#4919)
* [kernel] fix cpu adam
* [test] update gemini optim test
* [format] applied code formatting on changed files in pull request 4908 (#4918)
Co-authored-by: github-actions <github-actions@github.com>
* [gemini] support gradient accumulation (#4869)
* add test
* fix no_sync bug in low level zero plugin
* fix test
* add argument for grad accum
* add grad accum in backward hook for gemini
* finish implementation, rewrite tests
* fix test
* skip stuck model in low level zero test
* update doc
* optimize communication & fix gradient checkpoint
* modify doc
* cleaning codes
* update cpu adam fp16 case
* [hotfix] fix torch 2.0 compatibility (#4936)
* [hotfix] fix launch
* [test] fix test gemini optim
* [shardformer] fix vit
* [test] add no master test for low level zero plugin (#4934)
* [format] applied code formatting on changed files in pull request 4820 (#4886)
Co-authored-by: github-actions <github-actions@github.com>
* [nfc] fix some typo with colossalai/ docs/ etc. (#4920)
* [Refactor] Integrated some lightllm kernels into token-attention (#4946)
* add some req for inference
* clean codes
* add codes
* add some lightllm deps
* clean codes
* hello
* delete rms files
* add some comments
* add comments
* add doc
* add lightllm deps
* add lightllm cahtglm2 kernels
* add lightllm cahtglm2 kernels
* replace rotary embedding with lightllm kernel
* add some commnets
* add some comments
* add some comments
* add
* replace fwd kernel att1
* fix a arg
* add
* add
* fix token attention
* add some comments
* clean codes
* modify comments
* fix readme
* fix bug
* fix bug
---------
Co-authored-by: cuiqing.li <lixx336@gmail.com>
Co-authored-by: CjhHa1 <cjh18671720497@outlook.com>
* [test] merge old components to test to model zoo (#4945)
* [test] add custom models in model zoo
* [test] update legacy test
* [test] update model zoo
* [test] update gemini test
* [test] remove components to test
* [inference] add reference and fix some bugs (#4937)
* add reference and fix some bugs
* update gptq init
---------
Co-authored-by: Xu Kai <xukai16@foxamil.com>
* [Inference]ADD Bench Chatglm2 script (#4963)
* add bench chatglm
* fix bug and make utils
---------
Co-authored-by: CjhHa1 <cjh18671720497outlook.com>
* [Pipeline inference] Combine kvcache with pipeline inference (#4938)
* merge kvcache with pipeline inference and refactor the code structure
* support ppsize > 2
* refactor pipeline code
* do pre-commit
* modify benchmark
* fix bench mark
* polish code
* add docstring and update readme
* refactor the code
* fix some logic bug of ppinfer
* polish readme
* fix typo
* skip infer test
* updated c++17 compiler flags (#4983)
* [Inference] Dynamic Batching Inference, online and offline (#4953)
* [inference] Dynamic Batching for Single and Multiple GPUs (#4831)
* finish batch manager
* 1
* first
* fix
* fix dynamic batching
* llama infer
* finish test
* support different lengths generating
* del prints
* del prints
* fix
* fix bug
---------
Co-authored-by: CjhHa1 <cjh18671720497outlook.com>
* [inference] Async dynamic batching (#4894)
* finish input and output logic
* add generate
* test forward
* 1
* [inference]Re push async dynamic batching (#4901)
* adapt to ray server
* finish async
* finish test
* del test
---------
Co-authored-by: yuehuayingxueluo <867460659@qq.com>
* Revert "[inference]Re push async dynamic batching (#4901)" (#4905)
This reverts commit fbf3c09e673794ed18c91d4bab1a7dfea052e95a.
* Revert "[inference] Async dynamic batching (#4894)"
This reverts commit fced14025043e29ce816b315f440601188f7f79f.
* Revert "[inference] Async dynamic batching (#4894)" (#4909)
This reverts commit fced14025043e29ce816b315f440601188f7f79f.
* Add Ray Distributed Environment Init Scripts
* support DynamicBatchManager base function
* revert _set_tokenizer version
* add driver async generate
* add async test
* fix bugs in test_ray_dist.py
* add get_tokenizer.py
* fix code style
* fix bugs about No module named 'pydantic' in ci test
* fix bugs in ci test
* fix bugs in ci test
* fix bugs in ci test
* [infer]Add Ray Distributed Environment Init Scripts (#4911)
* Revert "[inference] Async dynamic batching (#4894)"
This reverts commit fced14025043e29ce816b315f440601188f7f79f.
* Add Ray Distributed Environment Init Scripts
* support DynamicBatchManager base function
* revert _set_tokenizer version
* add driver async generate
* add async test
* fix bugs in test_ray_dist.py
* add get_tokenizer.py
* fix code style
* fix bugs about No module named 'pydantic' in ci test
* fix bugs in ci test
* fix bugs in ci test
* fix bugs in ci test
* support dynamic batch for bloom model and is_running function
* [Inference]Test for new Async engine (#4935)
* infer engine
* infer engine
* test engine
* test engine
* new manager
* change step
* add
* test
* fix
* fix
* finish test
* finish test
* finish test
* finish test
* add license
---------
Co-authored-by: yuehuayingxueluo <867460659@qq.com>
* add assertion for config (#4947)
* [Inference] Finish dynamic batching offline test (#4948)
* test
* fix test
* fix quant
* add default
* fix
* fix some bugs
* fix some bugs
* fix
* fix bug
* fix bugs
* reset param
---------
Co-authored-by: yuehuayingxueluo <867460659@qq.com>
Co-authored-by: Cuiqing Li <lixx3527@gmail.com>
Co-authored-by: CjhHa1 <cjh18671720497outlook.com>
* [Kernels]Updated Triton kernels into 2.1.0 and adding flash-decoding for llama token attention (#4965)
* adding flash-decoding
* clean
* adding kernel
* adding flash-decoding
* add integration
* add
* adding kernel
* adding kernel
* adding triton 2.1.0 features for inference
* update bloom triton kernel
* remove useless vllm kernels
* clean codes
* fix
* adding files
* fix readme
* update llama flash-decoding
---------
Co-authored-by: cuiqing.li <lixx336@gmail.com>
* fix ColossalEval (#4992)
Co-authored-by: Xu Yuanchen <yuanchen.xu00@gmail.com>
* [doc]Update doc for colossal-inference (#4989)
* update doc
* Update README.md
---------
Co-authored-by: cuiqing.li <lixx336@gmail.com>
* [hotfix] Fix the bug where process groups were not being properly released. (#4940)
* Fix the bug where process groups were not being properly released.
* test
* Revert "test"
This reverts commit 479900c1398637310abf92eefa3cd168038ea02f.
* [hotfix] fix the bug of repeatedly storing param group (#4951)
* [doc] add supported feature diagram for hybrid parallel plugin (#4996)
* [Pipeline Inference] Merge pp with tp (#4993)
* refactor pipeline into new CaiInferEngine
* updata llama modeling forward
* merge tp with pp
* update docstring
* optimize test workflow and example
* fix typo
* add assert and todo
* [release] update version (#4995)
* [release] update version
* [hotfix] fix ci
* [gemini] gemini support tp
[gemini] gemini support tp
[gemini] gemini support tp
[gemini] gemini support tp
[gemini] gemini support tp
* fix
fix
fix
* update checkpointIO
update checkpointIO
update checkpointIO
update checkpointIO
update checkpointIO
update checkpointIO
update checkpointIO
update checkpointIO
update checkpointIO
* support fused layernorm
support fused layernorm
support fused layernorm
* update fusedlayernorm
update fusedlayernorm
update fusedlayernorm
* add sequence parallel to gemini
add sequence parallel to gemini
* fix
* fix comments
fix comments
fix comments
* fix
* fix t5
* clear cache
* fix
* activate ci
* activate ci
* fix
* fix
* fix
* fix
* revert
* modify tp gather method
modify tp gather method
modify tp gather method
modify tp gather method
* fix test
---------
Co-authored-by: Xu Kai <xukai16@foxmail.com>
Co-authored-by: Zian(Andy) Zheng <62330719+Orion-Zheng@users.noreply.github.com>
Co-authored-by: Hongxin Liu <lhx0217@gmail.com>
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: github-actions <github-actions@github.com>
Co-authored-by: Baizhou Zhang <eddiezhang@pku.edu.cn>
Co-authored-by: Zhongkai Zhao <kanezz620@gmail.com>
Co-authored-by: digger yu <digger-yu@outlook.com>
Co-authored-by: Cuiqing Li <lixx3527@gmail.com>
Co-authored-by: cuiqing.li <lixx336@gmail.com>
Co-authored-by: CjhHa1 <cjh18671720497@outlook.com>
Co-authored-by: Xu Kai <xukai16@foxamil.com>
Co-authored-by: Jianghai <72591262+CjhHa1@users.noreply.github.com>
Co-authored-by: Bin Jia <45593998+FoolPlayer@users.noreply.github.com>
Co-authored-by: アマデウス <kurisusnowdeng@users.noreply.github.com>
Co-authored-by: yuehuayingxueluo <867460659@qq.com>
Co-authored-by: Yuanchen <70520919+chengeharrison@users.noreply.github.com>
Co-authored-by: Xu Yuanchen <yuanchen.xu00@gmail.com>
Co-authored-by: littsk <1214689160@qq.com>
Co-authored-by: ppt0011 <143150326+ppt0011@users.noreply.github.com>
* [hotfix] Suport extra_kwargs in ShardConfig (#5031)
* [refactor]: replace inference args with extra_kwargs in ShardConfig
* modify shardconfig
* polish code
* fix policy bug in llama
* fix bug in auto policy
* remove setattr in ShardConfig
* fix wrong EOS token in ColossalChat
* [Kernels]Update triton kernels into 2.1.0 (#5046)
* update flash-context-attention
* adding kernels
* fix
* reset
* add build script
* add building process
* add llama2 exmaple
* add colossal-llama2 test
* clean
* fall back test setting
* fix test file
* clean
* clean
* clean
---------
Co-authored-by: cuiqing.li <lixx336@gmail.com>
* [pipeline,shardformer] Fix p2p efficiency in pipeline, allow skipping loading weight not in weight_map when `strict=False`, fix llama flash attention forward, add flop estimation by megatron in llama benchmark (#5017)
* Use p2p
* Cannot bidirectonal send p2p
* Refactor tensor creation and serialization in P2P
communication
* Fix llama forward args in flash attention
* Add flop estimate from megatron
* Support loading weight not in weight_map when strict=False in hybrid_parallel
* Use send_forward_recv_backward, etc in 1f1b
* Use dataclass for metdata
Remove torch.cuda.synchronize() as suggested
* Add comment about the torch.cuda.synchronize for potential error
* Typo
* Update hybrid_parallel_checkpoint_io.py
* Update p2p.py
* Update one_f_one_b.py
* Update p2p.py
---------
Co-authored-by: flybird11111 <1829166702@qq.com>
* [gemini] gemini support extra-dp (#5043)
* support ddp
* fix
* fix
* fix
fix
* support ddp
* fix
* fix
* fix
fix
* simplify tests
* fix
* fix
* fix
fix
fix
* fix
* [shardformer] fix llama error when transformers upgraded. (#5055)
* fix-llama
* Update llama.py
* [hotfix]: modify create_ep_hierarchical_group and add test (#5032)
* feat: modify create_ep_hierarchical_group args
* test: add ep tests
* fix: remove get_process_group_ranks
* fix: fix src_rank
* [exampe] fix llama example' loss error when using gemini plugin (#5060)
fix llama example
* [inference] Refactor inference architecture (#5057)
* [inference] support only TP (#4998)
* support only tp
* enable tp
* add support for bloom (#5008)
* [refactor] refactor gptq and smoothquant llama (#5012)
* refactor gptq and smoothquant llama
* fix import error
* fix linear import torch-int
* fix smoothquant llama import error
* fix import accelerate error
* fix bug
* fix import smooth cuda
* fix smoothcuda
* [Inference Refactor] Merge chatglm2 with pp and tp (#5023)
merge chatglm with pp and tp
* [Refactor] remove useless inference code (#5022)
* remove useless code
* fix quant model
* fix test import bug
* mv original inference legacy
* fix chatglm2
* [Refactor] refactor policy search and quant type controlling in inference (#5035)
* [Refactor] refactor policy search and quant type controling in inference
* [inference] update readme (#5051)
* update readme
* update readme
* fix architecture
* fix table
* fix table
* [inference] udpate example (#5053)
* udpate example
* fix run.sh
* fix rebase bug
* fix some errors
* update readme
* add some features
* update interface
* update readme
* update benchmark
* add requirements-infer
---------
Co-authored-by: Bin Jia <45593998+FoolPlayer@users.noreply.github.com>
Co-authored-by: Zhongkai Zhao <kanezz620@gmail.com>
* [Kernels]added flash-decoidng of triton (#5063)
* added flash-decoidng of triton based on lightllm kernel
* add req
* clean
* clean
* delete build.sh
---------
Co-authored-by: cuiqing.li <lixx336@gmail.com>
* [misc] remove outdated submodule (#5070)
* [npu] add npu support for gemini and zero (#5067)
* [npu] setup device utils (#5047)
* [npu] add npu device support
* [npu] support low level zero
* [test] update npu zero plugin test
* [hotfix] fix import
* [test] recover tests
* [npu] gemini support npu (#5052)
* [npu] refactor device utils
* [gemini] support npu
* [example] llama2+gemini support npu
* [kernel] add arm cpu adam kernel (#5065)
* [kernel] add arm cpu adam
* [optim] update adam optimizer
* [kernel] arm cpu adam remove bf16 support
* [hotfix/hybridengine] fix bug when tp*pp size = 1 (#5069)
* [inference] update examples and engine (#5073)
* update examples and engine
* fix choices
* update example
* [format] applied code formatting on changed files in pull request 5067 (#5072)
Co-authored-by: github-actions <github-actions@github.com>
* [hotfix/hybridengine] Fix init model with random parameters in benchmark (#5074)
* fix init model with random parameters
* fix example
* [inference] refactor examples and fix schedule (#5077)
* [setup] refactor infer setup
* [hotfix] fix infenrece behavior on 1 1 gpu
* [exmaple] refactor inference examples
* fix thrust-transform-reduce error (#5078)
* [nfc] fix typo in docs/ (#4972)
* [nfc] fix typo and author name (#5089)
* [gemini]fix gemini optimzer, saving Shardformer in Gemini got list assignment index out of range (#5085)
* [Hotfix] Fix model policy matching strategy in ShardFormer (#5064)
* hotfix/Fix get model policy strategy in ShardFormer
* fix bug in auto policy
* [shardformer]fix flash attention, when mask is casual, just don't unpad it (#5084)
* fix flash attn
* fix
fix
* [npu] add npu support for hybrid plugin and llama (#5090)
* llama 3d
* update
* fix autocast
* [Feature] Add document retrieval QA (#5020)
* add langchain
* add langchain
* Add files via upload
* add langchain
* fix style
* fix style: remove extra space
* add pytest; modified retriever
* add pytest; modified retriever
* add tests to build_on_pr.yml
* fix build_on_pr.yml
* fix build on pr; fix environ vars
* seperate unit tests for colossalqa from build from pr
* fix container setting; fix environ vars
* commented dev code
* add incremental update
* remove stale code
* fix style
* change to sha3 224
* fix retriever; fix style; add unit test for document loader
* fix ci workflow config
* fix ci workflow config
* add set cuda visible device script in ci
* fix doc string
* fix style; update readme; refactored
* add force log info
* change build on pr, ignore colossalqa
* fix docstring, captitalize all initial letters
* fix indexing; fix text-splitter
* remove debug code, update reference
* reset previous commit
* update LICENSE update README add key-value mode, fix bugs
* add files back
* revert force push
* remove junk file
* add test files
* fix retriever bug, add intent classification
* change conversation chain design
* rewrite prompt and conversation chain
* add ui v1
* ui v1
* fix atavar
* add header
* Refactor the RAG Code and support Pangu
* Refactor the ColossalQA chain to Object-Oriented Programming and the UI demo.
* resolved conversation. tested scripts under examples. web demo still buggy
* fix ci tests
* Some modifications to add ChatGPT api
* modify llm.py and remove unnecessary files
* Delete applications/ColossalQA/examples/ui/test_frontend_input.json
* Remove OpenAI api key
* add colossalqa
* move files
* move files
* move files
* move files
* fix style
* Add Readme and fix some bugs.
* Add something to readme and modify some code
* modify a directory name for clarity
* remove redundant directory
* Correct a type in llm.py
* fix AI prefix
* fix test_memory.py
* fix conversation
* fix some erros and typos
* Fix a missing import in RAG_ChatBot.py
* add colossalcloud LLM wrapper, correct issues in code review
---------
Co-authored-by: YeAnbang <anbangy2@outlook.com>
Co-authored-by: Orion-Zheng <zheng_zian@u.nus.edu>
Co-authored-by: Zian(Andy) Zheng <62330719+Orion-Zheng@users.noreply.github.com>
Co-authored-by: Orion-Zheng <zhengzian@u.nus.edu>
* remove duplicate import (#5100)
* fix typo change lazy_iniy to lazy_init (#5099)
* [nfc] fix typo change directoty to directory (#5111)
* [FEATURE] Add Safety Eval Datasets to ColossalEval (#5095)
* add safetybench and cvalues(responsibility) eval dataset
* Modify code according to review suggestions
---------
Co-authored-by: Orion-Zheng <zhengzian@u.nus.edu>
* [hotfix] fixed memory usage of shardformer module replacement (#5122)
* [shardformer]: support gpt-j, falcon, Mistral and add interleaved pipeline for bert (#5088)
* [shardformer] implement policy for all GPT-J models and test
* [shardformer] support interleaved pipeline parallel for bert finetune
* [shardformer] shardformer support falcon (#4883)
* [shardformer]: fix interleaved pipeline for bert model (#5048)
* [hotfix]: disable seq parallel for gptj and falcon, and polish code (#5093)
* Add Mistral support for Shardformer (#5103)
* [shardformer] add tests to mistral (#5105)
---------
Co-authored-by: Pengtai Xu <henryxu880@gmail.com>
Co-authored-by: ppt0011 <143150326+ppt0011@users.noreply.github.com>
Co-authored-by: flybird11111 <1829166702@qq.com>
Co-authored-by: eric8607242 <e0928021388@gmail.com>
* [doc] add moe news (#5128)
* [doc] add moe news
* [doc] add moe news
* [doc] add moe news
* [doc] updated paper citation (#5131)
* fix typo change JOSNL TO JSONL etc. (#5116)
* [format] applied code formatting on changed files in pull request 5088 (#5127)
Co-authored-by: github-actions <github-actions@github.com>
* [format] applied code formatting on changed files in pull request 5124 (#5125)
Co-authored-by: github-actions <github-actions@github.com>
* [format] applied code formatting on changed files in pull request 5115 (#5118)
Co-authored-by: github-actions <github-actions@github.com>
* [accelerator] init the accelerator module (#5129)
* [accelerator] init the accelerator module
* polish code
* polish code
* polish code
* polish code
* [npu] support triangle attention for llama (#5130)
* update fused attn
* update spda
* tri attn
* update triangle
* import
* fix
* fix
* [plugin]fix 3d checkpoint load when booster boost without optimizer. (#5135)
* fix 3d checkpoint load when booster boost without optimizer
fix 3d checkpoint load when booster boost without optimizer
* test ci
* revert ci
* fix
fix
* [ColossalQA] refactor server and webui & add new feature (#5138)
* refactor server and webui & add new feature
* add requirements
* modify readme and ui
* [doc] fix colossalqa document (#5146)
* fix doc
* modify doc
* fix (#5158)
fix
* [Colossal-Llama-2] Add finetuning Colossal-Llama-2 example (#4878)
* Add finetuning Colossal-Llama-2 example
* Add finetuning Colossal-Llama-2 example 2
* Add finetuning Colossal-Llama-2 example and support NEFTuning
* Add inference example and refine neftune
* Modify readme file
* update the imports
---------
Co-authored-by: Xu Yuanchen <yuanchen.xu00@gmail.com>
Co-authored-by: Camille Zhong <44392324+Camille7777@users.noreply.github.com>
* [gemini] hotfix NaN loss while using Gemini + tensor_parallel (#5150)
* fix
aaa
fix
fix
fix
* fix
* fix
* test ci
* fix ci
fix
* [colossalqa] fix pangu api (#5170)
* fix pangu api
* add comment
* [ColossalEval] Support GSM, Data Leakage Evaluation and Tensor Parallel (#5169)
* Support GSM, Data Leakage Evaluation and Tensor Parallel
* remove redundant code and update inference.py in examples/gpt_evaluation
---------
Co-authored-by: Xu Yuanchen <yuanchen.xu00@gmail.com>
* [shardformer] llama support DistCrossEntropy (#5176)
* fix
aaa
fix
fix
fix
* fix
* fix
* test ci
* fix ci
fix
* llama support dist-cross
fix
fix
fix
fix
fix
fix
fix
fix
* fix
* fix
* fix
fix
* test ci
* test ci
* fix
* [Colossal-Llama-2] Add finetuning Colossal-Llama-2 example (#4878)
* Add finetuning Colossal-Llama-2 example
* Add finetuning Colossal-Llama-2 example 2
* Add finetuning Colossal-Llama-2 example and support NEFTuning
* Add inference example and refine neftune
* Modify readme file
* update the imports
---------
Co-authored-by: Xu Yuanchen <yuanchen.xu00@gmail.com>
Co-authored-by: Camille Zhong <44392324+Camille7777@users.noreply.github.com>
* llama support dist-cross
fix
fix
fix
fix
fix
fix
fix
fix
* fix
* fix
* fix
fix
* test ci
* test ci
* fix
* fix ci
* fix ci
---------
Co-authored-by: Yuanchen <70520919+chengeharrison@users.noreply.github.com>
Co-authored-by: Xu Yuanchen <yuanchen.xu00@gmail.com>
Co-authored-by: Camille Zhong <44392324+Camille7777@users.noreply.github.com>
* Fix ColossalEval (#5186)
Co-authored-by: Xu Yuanchen <yuanchen.xu00@gmail.com>
* [doc] update pytorch version in documents. (#5177)
* fix
aaa
fix
fix
fix
* fix
* fix
* test ci
* fix ci
fix
* update pytorch version in documents
* polish readme in application/chat (#5194)
* [pipeline]: fix p2p comm, add metadata cache and support llama interleaved pp (#5134)
* test: add more p2p tests
* fix: remove send_forward_recv_forward as p2p op list need to use the same group
* fix: make send and receive atomic
* feat: update P2PComm fn
* feat: add metadata cache in 1f1b
* feat: add metadata cache in interleaved pp
* feat: modify is_xx_stage fn
* revert: add _broadcast_object_list
* feat: add interleaved pp in llama policy
* feat: set NCCL_BUFFSIZE in HybridParallelPlugin
* Improve logic for selecting metrics (#5196)
Co-authored-by: Xu <yuanchen.xu00@gmail.com>
* [doc] Update required third-party library list for testing and torch comptibility checking (#5207)
* doc/update requirements-test.txt
* update torch-cuda compatibility check
* support linear accumulation fusion (#5199)
support linear accumulation fusion
support linear accumulation fusion
fix
* [pipeline]: support arbitrary batch size in forward_only mode (#5201)
* fix: remove drop last in val & test dataloader
* feat: add run_forward_only, support arbitrary bs
* chore: modify ci script
* [pipeline]: add p2p fallback order and fix interleaved pp deadlock (#5214)
* fix: add fallback order option and update 1f1b
* fix: fix deadlock comm in interleaved pp
* test: modify p2p test
* [devops] update torch versoin in ci (#5217)
* fix-test (#5210)
fix-test
fix-test
* fix flash attn (#5209)
* [nfc] fix typo colossalai/shardformer/ (#5133)
* [Colossal-LLaMA-2] Release Colossal-LLaMA-2-13b-base model (#5224)
* update readme
* update readme
* update link
* update
* update readme
* update
* update
* update
* update title
* update example
* update example
* fix content
* add conclusion
* add license
* update
* update
* update version
* fix minor
* [doc] Update README.md of Colossal-LLAMA2 (#5233)
* Update README.md
* Update README.md
* [doc] Make leaderboard format more uniform and good-looking (#5231)
* Make leaderboard format more unifeid and good-looking
* Update README.md
* Update README.md
* [doc] add Colossal-LLaMA-2-13B (#5234)
* [doc] add Colossal-LLaMA-2-13B
* [doc] add Colossal-LLaMA-2-13B
* [doc] add Colossal-LLaMA-2-13B
* [format] applied code formatting on changed files in pull request 5234 (#5235)
Co-authored-by: github-actions <github-actions@github.com>
* [doc] SwiftInfer release (#5236)
* [doc] SwiftInfer release
* [doc] SwiftInfer release
* [doc] SwiftInfer release
* [doc] SwiftInfer release
* [doc] SwiftInfer release
* [npu] use extension for op builder (#5172)
* update extension
* update cpu adam
* update is
* add doc for cpu adam
* update kernel
* update commit
* update flash
* update memory efficient
* update flash attn
* update flash attention loader
* update api
* fix
* update doc
* update example time limit
* reverse change
* fix doc
* remove useless kernel
* fix
* not use warning
* update
* update
* [pipeline] A more general _communicate in p2p (#5062)
* A more general _communicate
* feat: finish tree_flatten version p2p
* fix: update p2p api calls
---------
Co-authored-by: Wenhao Chen <cwher@outlook.com>
* [npu] change device to accelerator api (#5239)
* update accelerator
* fix timer
* fix amp
* update
* fix
* update bug
* add error raise
* fix autocast
* fix set device
* remove doc accelerator
* update doc
* update doc
* update doc
* use nullcontext
* update cpu
* update null context
* change time limit for example
* udpate
* update
* update
* update
* [npu] polish accelerator code
---------
Co-authored-by: Xuanlei Zhao <xuanlei.zhao@gmail.com>
Co-authored-by: zxl <43881818+oahzxl@users.noreply.github.com>
* [hotfix] removed unused flag (#5242)
* [doc] fix typo in Colossal-LLaMA-2/README.md (#5247)
* [workflow] fixed build CI (#5240)
* [workflow] fixed build CI
* polish
* polish
* polish
* polish
* polish
* [ci] fixed booster test (#5251)
* [ci] fixed booster test
* [ci] fixed booster test
* [ci] fixed booster test
* [ci] fixed ddp test (#5254)
* [ci] fixed ddp test
* polish
* fix typo in applications/ColossalEval/README.md (#5250)
* [ci] fix shardformer tests. (#5255)
* fix ci
fix
* revert: revert p2p
* feat: add enable_metadata_cache option
* revert: enable t5 tests
---------
Co-authored-by: Wenhao Chen <cwher@outlook.com>
* [doc] fix doc typo (#5256)
* [doc] fix annotation display
* [doc] fix llama2 doc
* [hotfix]: add pp sanity check and fix mbs arg (#5268)
* fix: fix misleading mbs arg
* feat: add pp sanity check
* fix: fix 1f1b sanity check
* [workflow] fixed incomplete bash command (#5272)
* [workflow] fixed oom tests (#5275)
* [workflow] fixed oom tests
* polish
* polish
* polish
* [ci] fix test_hybrid_parallel_plugin_checkpoint_io.py (#5276)
* fix ci
fix
* fix test
* revert: revert p2p
* feat: add enable_metadata_cache option
* revert: enable t5 tests
* fix
---------
Co-authored-by: Wenhao Chen <cwher@outlook.com>
* [shardformer] hybridparallelplugin support gradients accumulation. (#5246)
* support gradients acc
fix
fix
fix
fix
fix
fix
fix
fix
fix
fix
fix
fix
fix
* fix
fix
* fix
fix
fix
* [hotfix] Fix ShardFormer test execution path when using sequence parallelism (#5230)
* fix auto loading gpt2 tokenizer (#5279)
* [doc] add llama2-13B disyplay (#5285)
* Update README.md
* fix 13b typo
---------
Co-authored-by: binmakeswell <binmakeswell@gmail.com>
* fix llama pretrain (#5287)
* [hotfix] fix 3d plugin test (#5292)
* fix bug for mefture (#5299)
* [NFC] polish applications/Colossal-LLaMA-2/colossal_llama2/tokenizer/init_tokenizer.py code style (#5228)
* fix some typo (#5307)
* [feat] refactored extension module (#5298)
* [feat] refactored extension module
* polish
* polish
* polish
* polish
* polish
* polish
* polish
* polish
* polish
* polish
* [workflow] updated CI image (#5318)
* [accelerator] fixed npu api
* [tests] fix t5 test. (#5322)
* [ci] fix shardformer tests. (#5255)
* fix ci
fix
* revert: revert p2p
* feat: add enable_metadata_cache option
* revert: enable t5 tests
---------
Co-authored-by: Wenhao Chen <cwher@outlook.com>
* fix t5 test
---------
Co-authored-by: Wenhao Chen <cwher@outlook.com>
* [doc] added docs for extensions (#5324)
* [doc] added docs for extensions
* polish
* polish
* fix typo under extensions/ (#5330)
* fix typo change dosen't to doesn't (#5308)
* [extension] fixed exception catch (#5342)
* [Chat] fix sft loss nan (#5345)
* fix script
* fix script
* fix chat nan
* fix chat nan
* [checkpointio] fix gemini and hybrid parallel optim checkpoint (#5347)
* [checkpointio] fix hybrid parallel optim checkpoint
* [extension] fix cuda extension
* [checkpointio] fix gemini optimizer checkpoint
* polish code
* [fix] remove unnecessary dp_size assert (#5351)
* fix: remove unnecessary assert
* test: add more 3d plugin tests
* fix: add warning
* [gemini] fix param op hook when output is tuple (#5355)
* [gemini] fix param op hook when output is tuple
* [gemini] fix param op hook
* [llama] fix dataloader for hybrid parallel (#5358)
* [plugin] refactor prepare dataloader
* [plugin] update train script
* [llama] update training script (#5360)
* [llama] update training script
* [doc] polish docstr
* [llama] add flash attn patch for npu (#5362)
* [llama] fix neftune & pbar with start_step (#5364)
* [eval] update llama npu eval (#5366)
* [llama] polish training script and fix optim ckpt (#5368)
* [lr-scheduler] fix load state dict and add test (#5369)
* [llama] fix memory issue (#5371)
* [llama] fix memory issue
* [llama] add comment
* [moe] init mixtral impl
* [moe] update capacity computing (#5253)
* [moe] top2 allow uneven input
* [moe] update capacity computing
* [moe] remove debug info
* [moe] update capacity computing
* [moe] update capacity computing
* [moe] support mixtral (#5309)
* [moe] add mixtral block for single expert
* [moe] mixtral block fwd support uneven ep
* [moe] mixtral block bwd support uneven ep
* [moe] add mixtral moe layer
* [moe] simplify replace
* [meo] support save sharded mixtral
* [meo] support load sharded mixtral
* [meo] support save sharded optim
* [meo] integrate moe manager into plug
* [meo] fix optimizer load
* [meo] fix mixtral layer
* [moe] fix mixtral checkpoint io (#5314)
* [moe] fix mixtral forward default value (#5329)
* [moe] fix mixtral optim checkpoint (#5344)
* [moe] fix tests
* [release] update version (#5380)
* [llama] fix training and inference scripts (#5384)
* [llama] refactor inference example to fit sft
* [llama] fix training script to fit gemini
* [llama] fix inference script
* [doc] Fix typo (#5361)
* [doc] updated installation command (#5389)
* [hotfix] fix variable type for top_p (#5313)
Co-authored-by: binmakeswell <binmakeswell@gmail.com>
* [hotfix] Fix wrong import in meta_registry (#5392)
* [extension] hotfix jit extension setup (#5402)
* [example] reuse flash attn patch (#5400)
* [fsdp] impl save/load shard model/optimizer (#5357)
* [setup] fixed nightly release (#5388)
* [shardformer]gather llama logits (#5398)
* gather llama logits
* fix
* update requirements (#5407)
* [workflow] added pypi channel (#5412)
* [doc] fix blog link
* [doc] fix blog link
* fix sft single turn inference example (#5416)
* [example]add gpt2 benchmark example script. (#5295)
* benchmark gpt2
* fix
fix
fix
fix
* [doc] fix typo in Colossal-LLaMA-2/README.md (#5247)
* [workflow] fixed build CI (#5240)
* [workflow] fixed build CI
* polish
* polish
* polish
* polish
* polish
* [ci] fixed booster test (#5251)
* [ci] fixed booster test
* [ci] fixed booster test
* [ci] fixed booster test
* [ci] fixed ddp test (#5254)
* [ci] fixed ddp test
* polish
* fix typo in applications/ColossalEval/README.md (#5250)
* [ci] fix shardformer tests. (#5255)
* fix ci
fix
* revert: revert p2p
* feat: add enable_metadata_cache option
* revert: enable t5 tests
---------
Co-authored-by: Wenhao Chen <cwher@outlook.com>
* [doc] fix doc typo (#5256)
* [doc] fix annotation display
* [doc] fix llama2 doc
* [hotfix]: add pp sanity check and fix mbs arg (#5268)
* fix: fix misleading mbs arg
* feat: add pp sanity check
* fix: fix 1f1b sanity check
* [workflow] fixed incomplete bash command (#5272)
* [workflow] fixed oom tests (#5275)
* [workflow] fixed oom tests
* polish
* polish
* polish
* [ci] fix test_hybrid_parallel_plugin_checkpoint_io.py (#5276)
* fix ci
fix
* fix test
* revert: revert p2p
* feat: add enable_metadata_cache option
* revert: enable t5 tests
* fix
---------
Co-authored-by: Wenhao Chen <cwher@outlook.com>
* [shardformer] hybridparallelplugin support gradients accumulation. (#5246)
* support gradients acc
fix
fix
fix
fix
fix
fix
fix
fix
fix
fix
fix
fix
fix
* fix
fix
* fix
fix
fix
* [hotfix] Fix ShardFormer test execution path when using sequence parallelism (#5230)
* fix auto loading gpt2 tokenizer (#5279)
* [doc] add llama2-13B disyplay (#5285)
* Update README.md
* fix 13b typo
---------
Co-authored-by: binmakeswell <binmakeswell@gmail.com>
* fix llama pretrain (#5287)
* fix
* fix
* fix
fix
* fix
fix
fix
* fix
fix
* benchmark gpt2
* fix
fix
fix
fix
* [workflow] fixed build CI (#5240)
* [workflow] fixed build CI
* polish
* polish
* polish
* polish
* polish
* [ci] fixed booster test (#5251)
* [ci] fixed booster test
* [ci] fixed booster test
* [ci] fixed booster test
* fix
fix
* fix
fix
fix
* fix
* fix
fix
fix
fix
fix
* fix
* Update shardformer.py
---------
Co-authored-by: digger yu <digger-yu@outlook.com>
Co-authored-by: Frank Lee <somerlee.9@gmail.com>
Co-authored-by: Wenhao Chen <cwher@outlook.com>
Co-authored-by: binmakeswell <binmakeswell@gmail.com>
Co-authored-by: Zhongkai Zhao <kanezz620@gmail.com>
Co-authored-by: Michelle <97082656+MichelleMa8@users.noreply.github.com>
Co-authored-by: Desperado-Jia <502205863@qq.com>
* [doc] sora release (#5425)
* [doc] sora release
* [doc] sora release
* [doc] sora release
* [doc] sora release
* [devops] fix extention building (#5427)
* [hotfix] fix sd vit import error (#5420)
* fix import error
* Update dpt_depth.py
---------
Co-authored-by: binmakeswell <binmakeswell@gmail.com>
* [hotfix] fix typo of openmoe model source (#5403)
* [doc] update some translations with README-zh-Hans.md (#5382)
* [hotfix] fix typo change _descrption to _description (#5331)
* [hotfix] fix typo change enabel to enable under colossalai/shardformer/ (#5317)
* [eval-hotfix] set few_shot_data to None when few shot is disabled (#5422)
* [hotfix] fix typo change MoECheckpintIO to MoECheckpointIO (#5335)
Co-authored-by: binmakeswell <binmakeswell@gmail.com>
* [doc] Fix typo s/infered/inferred/ (#5288)
Signed-off-by: hugo-syn <hugo.vincent@synacktiv.com>
* [hotfix] fix stable diffusion inference bug. (#5289)
* Update train_ddp.yaml
delete "strategy" to fix DDP config loading bug in "main.py"
* Update train_ddp.yaml
fix inference with scripts/txt2img.py config file load bug.
* Update README.md
add pretrain model test code.
* [colossal-llama2] add stream chat examlple for chat version model (#5428)
* add stream chat for chat version
* remove os.system clear
* modify function name
* [release] update version (#5411)
* fix tensor data update for gemini loss caluculation (#5442)
* [hotfix] fix typo s/keywrods/keywords etc. (#5429)
* [devops] fix compatibility (#5444)
* [devops] fix compatibility
* [hotfix] update compatibility test on pr
* [devops] fix compatibility
* [devops] record duration during comp test
* [test] decrease test duration
* fix falcon
* [shardformer] fix gathering output when using tensor parallelism (#5431)
* fix
* padding vocab_size when using pipeline parallellism
padding vocab_size when using pipeline parallellism
fix
fix
* fix
* fix
fix
fix
* fix gather output
* fix
* fix
* fix
fix resize embedding
fix resize embedding
* fix resize embedding
fix
* revert
* revert
* revert
* [doc] release Open-Sora 1.0 with model weights (#5468)
* [doc] release Open-Sora 1.0 with model weights
* [doc] release Open-Sora 1.0 with model weights
* [doc] release Open-Sora 1.0 with model weights
* [doc] update open-sora demo (#5479)
* [doc] update open-sora demo
* [doc] update open-sora demo
* [doc] update open-sora demo
* [example] add grok-1 inference (#5485)
* [misc] add submodule
* remove submodule
* [example] support grok-1 tp inference
* [example] add grok-1 inference script
* [example] refactor code
* [example] add grok-1 readme
* [exmaple] add test ci
* [exmaple] update readme
* [release] grok-1 314b inference (#5490)
* [release] grok-1 inference
* [release] grok-1 inference
* [release] grok-1 inference
* [example] update Grok-1 inference (#5495)
* revise grok-1 example
* remove unused arg in scripts
* prevent re-installing torch
* update readme
* revert modifying colossalai requirements
* add perf
* trivial
* add tokenizer url
* [hotfix] set return_outputs=False in examples and polish code (#5404)
* fix: simplify merge_batch
* fix: use return_outputs=False to eliminate extra memory consumption
* feat: add return_outputs warning
* style: remove `return_outputs=False` as it is the default value
* [release] grok-1 inference benchmark (#5500)
* [release] grok-1 inference benchmark
* [release] grok-1 inference benchmark
* [release] grok-1 inference benchmark
* [release] grok-1 inference benchmark
* [release] grok-1 inference benchmark
* [shardformer]Fix lm parallel. (#5480)
* fix
* padding vocab_size when using pipeline parallellism
padding vocab_size when using pipeline parallellism
fix
fix
* fix
* fix
fix
fix
* fix gather output
* fix
* fix
* fix
fix resize embedding
fix resize embedding
* fix resize embedding
fix
* revert
* revert
* revert
* fix lm forward distribution
* fix
* test ci
* fix
* [fix] fix grok-1 example typo (#5506)
* [devops] fix example test ci (#5504)
* Fix ColoTensorSpec for py11 (#5440)
* fixed layout converter caching and updated tester
* Empty-Commit
* [shardformer] update colo attention to support custom mask (#5510)
* [feature] refactor colo attention (#5462)
* [extension] update api
* [feature] add colo attention
* [feature] update sdpa
* [feature] update npu attention
* [feature] update flash-attn
* [test] add flash attn test
* [test] update flash attn test
* [shardformer] update modeling to fit colo attention (#5465)
* [misc] refactor folder structure
* [shardformer] update llama flash-attn
* [shardformer] fix llama policy
* [devops] update tensornvme install
* [test] update llama test
* [shardformer] update colo attn kernel dispatch
* [shardformer] update blip2
* [shardformer] update chatglm
* [shardformer] update gpt2
* [shardformer] update gptj
* [shardformer] update opt
* [shardformer] update vit
* [shardformer] update colo attention mask prep
* [shardformer] update whisper
* [test] fix shardformer tests (#5514)
* [test] fix shardformer tests
* [test] fix shardformer tests
* [format] applied code formatting on changed files in pull request 5510 (#5517)
Co-authored-by: github-actions <github-actions@github.com>
* [shardformer] fix pipeline forward error if custom layer distribution is used (#5189)
* Use self.[distribute_layers|get_stage_index] to exploit custom layer distribution
* Change static methods for t5 layer distribution to member functions
* Change static methods for whisper layer distribution to member functions
* Replace whisper policy usage with self one
* Fix test case to use non-static layer distribution methods
* fix: fix typo
---------
Co-authored-by: Wenhao Chen <cwher@outlook.com>
* [Fix] Grok-1 use tokenizer from the same pretrained path (#5532)
* [fix] use tokenizer from the same pretrained path
* trust remote code
* [ColossalChat] Update RLHF V2 (#5286)
* Add dpo. Fix sft, ppo, lora. Refactor all
* fix and tested ppo
* 2 nd round refactor
* add ci tests
* fix ci
* fix ci
* fix readme, style
* fix readme style
* fix style, fix benchmark
* reproduce benchmark result, remove useless files
* rename to ColossalChat
* use new image
* fix ci workflow
* fix ci
* use local model/tokenizer for ci tests
* fix ci
* fix ci
* fix ci
* fix ci timeout
* fix rm progress bar. fix ci timeout
* fix ci
* fix ci typo
* remove 3d plugin from ci temporary
* test environment
* cannot save optimizer
* support chat template
* fix readme
* fix path
* test ci locally
* restore build_or_pr
* fix ci data path
* fix benchmark
* fix ci, move ci tests to 3080, disable fast tokenizer
* move ci to 85
* support flash attention 2
* add all-in-one data preparation script. Fix colossal-llama2-chat chat template
* add hardware requirements
* move ci test data
* fix save_model, add unwrap
* fix missing bos
* fix missing bos; support grad accumulation with gemini
* fix ci
* fix ci
* fix ci
* fix llama2 chat template config
* debug sft
* debug sft
* fix colossalai version requirement
* fix ci
* add sanity check to prevent NaN loss
* fix requirements
* add dummy data generation script
* add dummy data generation script
* add dummy data generation script
* add dummy data generation script
* update readme
* update readme
* update readme and ignore
* fix logger bug
* support parallel_output
* modify data preparation logic
* fix tokenization
* update lr
* fix inference
* run pre-commit
---------
Co-authored-by: Tong Li <tong.li352711588@gmail.com>
* [shardformer, pipeline] add `gradient_checkpointing_ratio` and heterogenous shard policy for llama (#5508)
* feat: add `GradientCheckpointConfig` and `PipelineGradientCheckpointConfig`
* feat: apply `GradientCheckpointConfig` to policy and llama_forward
* feat: move `distribute_layer` and `get_stage_index` to PipelineStageManager
* fix: add optional args for `distribute_layer` and `get_stage_index`
* fix: fix changed API calls
* test: update llama tests
* style: polish `GradientCheckpointConfig`
* fix: fix pipeline utils tests
* fix incorrect sharding without zero (#5545)
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [shardformer] Sequence Parallelism Optimization (#5533)
* sequence parallel optimization
* validate sequence parallel in llama (code to be polished)
* shardformer api writing
* integrate sequence parallel in ShardFormer
* fix pp bugs and sp bugs for LlaMa model
* integrating ring-based sequence parallelism into ShardFormer
* [sequence parallelism]: Add fused megatron function
* integrating ring-based sequence parallelism into ShardFormer
---------
Co-authored-by: linsj20 <linsj20@mails.tsinghua.edu.cn>
* fix bugs when useing sp and flashattention together
* fix operation function name
* support flash attention for ulysses-style sp
* clarify sp process group
* fix compatibility bugs in moe plugin
* fix fused linear bugs
* fix linear layer test
* support gpt model all-to-all sp
* modify shard data dimension (meant to be dim=-1)
* support megtron-style sp and distributed attn for llama model
* [shardformer] add megatron sp to llama
* support llama7B 128k with distributed attention
* [shardformer] robustness enhancement
* add block attn
* sp mode 1: keep input as a complete sequence
* fix sp compatability
* finish sp mode 3 support for gpt
* using all_to_all_single when batch size is 1
* support mode 2 sp in gpt2 (#5)
* [shardformer] add megatron sp to llama
* support llama7B 128k with distributed attention
* [shardformer] robustness enhancement
* add block attn
* sp mode 1: keep input as a complete sequence
* fix sp compatability
* refactor ring implementation
* support mode 2 sp in gpt2
* polish code
* enable distributed attn mask when using sp mode 2 and 3 in llama
* automatically enable flash attn when using sp mode 2 and 3 in llama
* inplace attn mask
* add zero2 support for sequence parallel
* polish code
* fix bugs
* fix gemini checkpoint io
* loose tensor checking atol and rtol
* add comment
* fix llama layernorm grad
* fix zero grad
* fix zero grad
* fix conflict
* update split and gather auto grad func
* sequence parallel: inside text split (#6)
* polish code (part 1)
* polish code (part 2)
* polish code (part 2.5)
* polish code (part 3)
* sequence parallel: inside text split
* miscellaneous minor fixes
* polish code
* fix ulysses style ZeRO
* sequence parallel: inside text split
* miscellaneous minor fixes
* disaggregate sp group and dp group for sp
* fix llama and gpt sp
* polish code
* move ulysses grad sync to ddp (#9)
* remove zero_stage and unbind the grad sync for alltoall sp
* add 2d group creation test
* move ulysses grad sync to ddp
* add 2d group creation test
* remove useless code
* change shard config not to enable sp when enable_all_optimizations
* add sp warnings for several model
* remove useless code
---------
Co-authored-by: linsj20 <linsj20@mails.tsinghua.edu.cn>
* [hotfix] quick fixes to make legacy tutorials runnable (#5559)
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [fix] fix typo s/muiti-node /multi-node etc. (#5448)
* [hotfix] fix typo s/get_defualt_parser /get_default_parser (#5548)
* [devops] remove post commit ci (#5566)
* [devops] remove post commit ci
* [misc] run pre-commit on all files
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [doc] fix ColossalMoE readme (#5599)
* fix readme
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [zero] support multiple (partial) backward passes (#5596)
* [zero] support multiple (partial) backward passes
* [misc] update requirements
* [shardformer] refactor embedding resize (#5603)
* [branch rebase] rebase main to Feature/resize_embedding (#5554)
* fix
* [release] update version (#5411)
* [hotfix] fix typo s/keywrods/keywords etc. (#5429)
* [devops] fix compatibility (#5444)
* [devops] fix compatibility
* [hotfix] update compatibility test on pr
* [devops] fix compatibility
* [devops] record duration during comp test
* [test] decrease test duration
* fix falcon
* [shardformer] fix gathering output when using tensor parallelism (#5431)
* fix
* padding vocab_size when using pipeline parallellism
padding vocab_size when using pipeline parallellism
fix
fix
* fix
* fix
fix
fix
* fix gather output
* fix
* fix
* fix
fix resize embedding
fix resize embedding
* fix resize embedding
fix
* revert
* revert
* revert
* [doc] release Open-Sora 1.0 with model weights (#5468)
* [doc] release Open-Sora 1.0 with model weights
* [doc] release Open-Sora 1.0 with model weights
* [doc] release Open-Sora 1.0 with model weights
* [doc] update open-sora demo (#5479)
* [doc] update open-sora demo
* [doc] update open-sora demo
* [doc] update open-sora demo
* [example] add grok-1 inference (#5485)
* [misc] add submodule
* remove submodule
* [example] support grok-1 tp inference
* [example] add grok-1 inference script
* [example] refactor code
* [example] add grok-1 readme
* [exmaple] add test ci
* [exmaple] update readme
---------
Co-authored-by: Hongxin Liu <lhx0217@gmail.com>
Co-authored-by: digger yu <digger-yu@outlook.com>
Co-authored-by: binmakeswell <binmakeswell@gmail.com>
* [CI] run pre-commit (#5577)
* fix
* [release] update version (#5411)
* [hotfix] fix typo s/keywrods/keywords etc. (#5429)
* [devops] fix compatibility (#5444)
* [devops] fix compatibility
* [hotfix] update compatibility test on pr
* [devops] fix compatibility
* [devops] record duration during comp test
* [test] decrease test duration
* fix falcon
* [shardformer] fix gathering output when using tensor parallelism (#5431)
* fix
* padding vocab_size when using pipeline parallellism
padding vocab_size when using pipeline parallellism
fix
fix
* fix
* fix
fix
fix
* fix gather output
* fix
* fix
* fix
fix resize embedding
fix resize embedding
* fix resize embedding
fix
* revert
* revert
* revert
* [doc] release Open-Sora 1.0 with model weights (#5468)
* [doc] release Open-Sora 1.0 with model weights
* [doc] release Open-Sora 1.0 with model weights
* [doc] release Open-Sora 1.0 with model weights
* [doc] update open-sora demo (#5479)
* [doc] update open-sora demo
* [doc] update open-sora demo
* [doc] update open-sora demo
* [example] add grok-1 inference (#5485)
* [misc] add submodule
* remove submodule
* [example] support grok-1 tp inference
* [example] add grok-1 inference script
* [example] refactor code
* [example] add grok-1 readme
* [exmaple] add test ci
* [exmaple] update readme
* run pre-commit
---------
Co-authored-by: Hongxin Liu <lhx0217@gmail.com>
Co-authored-by: digger yu <digger-yu@outlook.com>
Co-authored-by: binmakeswell <binmakeswell@gmail.com>
* [rebase] rebase main to resize-embedding (#5581)
* [release] grok-1 314b inference (#5490)
* [release] grok-1 inference
* [release] grok-1 inference
* [release] grok-1 inference
* [example] update Grok-1 inference (#5495)
* revise grok-1 example
* remove unused arg in scripts
* prevent re-installing torch
* update readme
* revert modifying colossalai requirements
* add perf
* trivial
* add tokenizer url
* [hotfix] set return_outputs=False in examples and polish code (#5404)
* fix: simplify merge_batch
* fix: use return_outputs=False to eliminate extra memory consumption
* feat: add return_outputs warning
* style: remove `return_outputs=False` as it is the default value
* [release] grok-1 inference benchmark (#5500)
* [release] grok-1 inference benchmark
* [release] grok-1 inference benchmark
* [release] grok-1 inference benchmark
* [release] grok-1 inference benchmark
* [release] grok-1 inference benchmark
* [shardformer]Fix lm parallel. (#5480)
* fix
* padding vocab_size when using pipeline parallellism
padding vocab_size when using pipeline parallellism
fix
fix
* fix
* fix
fix
fix
* fix gather output
* fix
* fix
* fix
fix resize embedding
fix resize embedding
* fix resize embedding
fix
* revert
* revert
* revert
* fix lm forward distribution
* fix
* test ci
* fix
* [fix] fix grok-1 example typo (#5506)
* [devops] fix example test ci (#5504)
* Fix ColoTensorSpec for py11 (#5440)
* fixed layout converter caching and updated tester
* Empty-Commit
* [shardformer] update colo attention to support custom mask (#5510)
* [feature] refactor colo attention (#5462)
* [extension] update api
* [feature] add colo attention
* [feature] update sdpa
* [feature] update npu attention
* [feature] update flash-attn
* [test] add flash attn test
* [test] update flash attn test
* [shardformer] update modeling to fit colo attention (#5465)
* [misc] refactor folder structure
* [shardformer] update llama flash-attn
* [shardformer] fix llama policy
* [devops] update tensornvme install
* [test] update llama test
* [shardformer] update colo attn kernel dispatch
* [shardformer] update blip2
* [shardformer] update chatglm
* [shardformer] update gpt2
* [shardformer] update gptj
* [shardformer] update opt
* [shardformer] update vit
* [shardformer] update colo attention mask prep
* [shardformer] update whisper
* [test] fix shardformer tests (#5514)
* [test] fix shardformer tests
* [test] fix shardformer tests
* [format] applied code formatting on changed files in pull request 5510 (#5517)
Co-authored-by: github-actions <github-actions@github.com>
* [shardformer] fix pipeline forward error if custom layer distribution is used (#5189)
* Use self.[distribute_layers|get_stage_index] to exploit custom layer distribution
* Change static methods for t5 layer distribution to member functions
* Change static methods for whisper layer distribution to member functions
* Replace whisper policy usage with self one
* Fix test case to use non-static layer distribution methods
* fix: fix typo
---------
Co-authored-by: Wenhao Chen <cwher@outlook.com>
* [Fix] Grok-1 use tokenizer from the same pretrained path (#5532)
* [fix] use tokenizer from the same pretrained path
* trust remote code
* [ColossalChat] Update RLHF V2 (#5286)
* Add dpo. Fix sft, ppo, lora. Refactor all
* fix and tested ppo
* 2 nd round refactor
* add ci tests
* fix ci
* fix ci
* fix readme, style
* fix readme style
* fix style, fix benchmark
* reproduce benchmark result, remove useless files
* rename to ColossalChat
* use new image
* fix ci workflow
* fix ci
* use local model/tokenizer for ci tests
* fix ci
* fix ci
* fix ci
* fix ci timeout
* fix rm progress bar. fix ci timeout
* fix ci
* fix ci typo
* remove 3d plugin from ci temporary
* test environment
* cannot save optimizer
* support chat template
* fix readme
* fix path
* test ci locally
* restore build_or_pr
* fix ci data path
* fix benchmark
* fix ci, move ci tests to 3080, disable fast tokenizer
* move ci to 85
* support flash attention 2
* add all-in-one data preparation script. Fix colossal-llama2-chat chat template
* add hardware requirements
* move ci test data
* fix save_model, add unwrap
* fix missing bos
* fix missing bos; support grad accumulation with gemini
* fix ci
* fix ci
* fix ci
* fix llama2 chat template config
* debug sft
* debug sft
* fix colossalai version requirement
* fix ci
* add sanity check to prevent NaN loss
* fix requirements
* add dummy data generation script
* add dummy data generation script
* add dummy data generation script
* add dummy data generation script
* update readme
* update readme
* update readme and ignore
* fix logger bug
* support parallel_output
* modify data preparation logic
* fix tokenization
* update lr
* fix inference
* run pre-commit
---------
Co-authored-by: Tong Li <tong.li352711588@gmail.com>
* [shardformer, pipeline] add `gradient_checkpointing_ratio` and heterogenous shard policy for llama (#5508)
* feat: add `GradientCheckpointConfig` and `PipelineGradientCheckpointConfig`
* feat: apply `GradientCheckpointConfig` to policy and llama_forward
* feat: move `distribute_layer` and `get_stage_index` to PipelineStageManager
* fix: add optional args for `distribute_layer` and `get_stage_index`
* fix: fix changed API calls
* test: update llama tests
* style: polish `GradientCheckpointConfig`
* fix: fix pipeline utils tests
* fix incorrect sharding without zero (#5545)
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [shardformer] Sequence Parallelism Optimization (#5533)
* sequence parallel optimization
* validate sequence parallel in llama (code to be polished)
* shardformer api writing
* integrate sequence parallel in ShardFormer
* fix pp bugs and sp bugs for LlaMa model
* integrating ring-based sequence parallelism into ShardFormer
* [sequence parallelism]: Add fused megatron function
* integrating ring-based sequence parallelism into ShardFormer
---------
Co-authored-by: linsj20 <linsj20@mails.tsinghua.edu.cn>
* fix bugs when useing sp and flashattention together
* fix operation function name
* support flash attention for ulysses-style sp
* clarify sp process group
* fix compatibility bugs in moe plugin
* fix fused linear bugs
* fix linear layer test
* support gpt model all-to-all sp
* modify shard data dimension (meant to be dim=-1)
* support megtron-style sp and distributed attn for llama model
* [shardformer] add megatron sp to llama
* support llama7B 128k with distributed attention
* [shardformer] robustness enhancement
* add block attn
* sp mode 1: keep input as a complete sequence
* fix sp compatability
* finish sp mode 3 support for gpt
* using all_to_all_single when batch size is 1
* support mode 2 sp in gpt2 (#5)
* [shardformer] add megatron sp to llama
* support llama7B 128k with distributed attention
* [shardformer] robustness enhancement
* add block attn
* sp mode 1: keep input as a complete sequence
* fix sp compatability
* refactor ring implementation
* support mode 2 sp in gpt2
* polish code
* enable distributed attn mask when using sp mode 2 and 3 in llama
* automatically enable flash attn when using sp mode 2 and 3 in llama
* inplace attn mask
* add zero2 support for sequence parallel
* polish code
* fix bugs
* fix gemini checkpoint io
* loose tensor checking atol and rtol
* add comment
* fix llama layernorm grad
* fix zero grad
* fix zero grad
* fix conflict
* update split and gather auto grad func
* sequence parallel: inside text split (#6)
* polish code (part 1)
* polish code (part 2)
* polish code (part 2.5)
* polish code (part 3)
* sequence parallel: inside text split
* miscellaneous minor fixes
* polish code
* fix ulysses style ZeRO
* sequence parallel: inside text split
* miscellaneous minor fixes
* disaggregate sp group and dp group for sp
* fix llama and gpt sp
* polish code
* move ulysses grad sync to ddp (#9)
* remove zero_stage and unbind the grad sync for alltoall sp
* add 2d group creation test
* move ulysses grad sync to ddp
* add 2d group creation test
* remove useless code
* change shard config not to enable sp when enable_all_optimizations
* add sp warnings for several model
* remove useless code
---------
Co-authored-by: linsj20 <linsj20@mails.tsinghua.edu.cn>
* [hotfix] quick fixes to make legacy tutorials runnable (#5559)
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [fix] fix typo s/muiti-node /multi-node etc. (#5448)
* [hotfix] fix typo s/get_defualt_parser /get_default_parser (#5548)
* [devops] remove post commit ci (#5566)
* [devops] remove post commit ci
* [misc] run pre-commit on all files
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---------
Co-authored-by: binmakeswell <binmakeswell@gmail.com>
Co-authored-by: Yuanheng Zhao <54058983+yuanheng-zhao@users.noreply.github.com>
Co-authored-by: Wenhao Chen <cwher@outlook.com>
Co-authored-by: Hongxin Liu <lhx0217@gmail.com>
Co-authored-by: Rocky Duan <dementrock@users.noreply.github.com>
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: Edenzzzz <wenxuan.tan@wisc.edu>
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: github-actions <github-actions@github.com>
Co-authored-by: Insu Jang <insujang@umich.edu>
Co-authored-by: YeAnbang <44796419+YeAnbang@users.noreply.github.com>
Co-authored-by: Tong Li <tong.li352711588@gmail.com>
Co-authored-by: Zhongkai Zhao <kanezz620@gmail.com>
Co-authored-by: linsj20 <linsj20@mails.tsinghua.edu.cn>
Co-authored-by: digger yu <digger-yu@outlook.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [shardformer]enable padding vocabulary size. (#5489)
* padding vocab_size when using pipeline parallellism
padding vocab_size when using pipeline parallellism
fix
fix
* fix
* fix
fix
fix
* fix gather output
* fix
* fix
* fix
fix resize embedding
fix resize embedding
* fix resize embedding
fix
* revert
* revert
* revert
* padding vocab
* padding vocabe
* fix
* fix
* fxi
* test ci
* fix
fix
fix
fix
* fix
fix
* fix
* fix
* Update hybrid_parallel_plugin.py
fix
fix
fix
* fix
fix
* fix
fix
* fix
* resolve super init
resolve super init
resolve super init
resolve super init
* resolve comments
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* vocab checkpointio
* padding vocab_size when using pipeline parallellism
padding vocab_size when using pipeline parallellism
fix
fix
* fix
fix
fix
* fix
* fix
fix resize embedding
fix resize embedding
* fix resize embedding
fix
* revert
* revert
* padding vocab
* fix
* fix
fix
* fix
fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* cherry-pick
* revert moe modify
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
fix
fix
fix
fix
fix
fix
fix
* resolve comments
resolve comments
resolve comments
resolve comments
resolve comments
* ptensor
ptensor
resolve comments
fix
fix
fix
fix
fix
resolve comments
resolve comments
resolve comments
resolve comments
resolve comments
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Hongxin Liu <lhx0217@gmail.com>
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix rebase
* fix rebase
---------
Co-authored-by: Hongxin Liu <lhx0217@gmail.com>
Co-authored-by: digger yu <digger-yu@outlook.com>
Co-authored-by: binmakeswell <binmakeswell@gmail.com>
Co-authored-by: Yuanheng Zhao <54058983+yuanheng-zhao@users.noreply.github.com>
Co-authored-by: Wenhao Chen <cwher@outlook.com>
Co-authored-by: Rocky Duan <dementrock@users.noreply.github.com>
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: Edenzzzz <wenxuan.tan@wisc.edu>
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: github-actions <github-actions@github.com>
Co-authored-by: Insu Jang <insujang@umich.edu>
Co-authored-by: YeAnbang <44796419+YeAnbang@users.noreply.github.com>
Co-authored-by: Tong Li <tong.li352711588@gmail.com>
Co-authored-by: Zhongkai Zhao <kanezz620@gmail.com>
Co-authored-by: linsj20 <linsj20@mails.tsinghua.edu.cn>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [hotfix] Fix examples no pad token & auto parallel codegen bug; (#5606)
* fix no pad token bug
* fixed some auto parallel codegen bug, but might not run on torch 2.1
---------
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [shardformer] fix pipeline grad ckpt (#5620)
* [shardformer] fix pipeline grad ckpt
* [lora] add lora APIs for booster, support lora for TorchDDP (#4981)
* add apis and peft requirement
* add liscense and implement apis
* add checkpointio apis
* add torchddp fwd_bwd test
* add support_lora methods
* add checkpointio test and debug
* delete unneeded codes
* remove peft from LICENSE
* add concrete methods for enable_lora
* simplify enable_lora api
* fix requirements
* [LowLevelZero] low level zero support lora (#5153)
* low level zero support lora
low level zero support lora
* add checkpoint test
* add checkpoint test
* fix
* fix
* fix
* fix
fix
fix
fix
* fix
* fix
fix
fix
fix
fix
fix
fix
* fix
* fix
fix
fix
fix
fix
fix
fix
* fix
* test ci
* git # This is a combination of 3 commits.
Update low_level_zero_plugin.py
Update low_level_zero_plugin.py
fix
fix
fix
* fix naming
fix naming
fix naming
fix
* [feature] qlora support
* qlora follow commit
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* migrate qutization folder to colossalai/
* minor fixes
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* gptj sp fix
* remove redundancies from pre-commit
* minor fixes
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Signed-off-by: hugo-syn <hugo.vincent@synacktiv.com>
Co-authored-by: Jianghai <72591262+CjhHa1@users.noreply.github.com>
Co-authored-by: Bin Jia <45593998+FoolPlayer@users.noreply.github.com>
Co-authored-by: アマデウス <kurisusnowdeng@users.noreply.github.com>
Co-authored-by: yuehuayingxueluo <867460659@qq.com>
Co-authored-by: Cuiqing Li <lixx3527@gmail.com>
Co-authored-by: cuiqing.li <lixx336@gmail.com>
Co-authored-by: Yuanchen <70520919+chengeharrison@users.noreply.github.com>
Co-authored-by: Xu Yuanchen <yuanchen.xu00@gmail.com>
Co-authored-by: littsk <1214689160@qq.com>
Co-authored-by: Baizhou Zhang <eddiezhang@pku.edu.cn>
Co-authored-by: ppt0011 <143150326+ppt0011@users.noreply.github.com>
Co-authored-by: Hongxin Liu <lhx0217@gmail.com>
Co-authored-by: Xuanlei Zhao <43881818+oahzxl@users.noreply.github.com>
Co-authored-by: Zhongkai Zhao <kanezz620@gmail.com>
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: github-actions <github-actions@github.com>
Co-authored-by: Wenhao Chen <cwher@outlook.com>
Co-authored-by: Jun Gao <imgaojun@gmail.com>
Co-authored-by: flybird11111 <1829166702@qq.com>
Co-authored-by: Xu Kai <xukai16@foxmail.com>
Co-authored-by: Zian(Andy) Zheng <62330719+Orion-Zheng@users.noreply.github.com>
Co-authored-by: digger yu <digger-yu@outlook.com>
Co-authored-by: CjhHa1 <cjh18671720497@outlook.com>
Co-authored-by: Xu Kai <xukai16@foxamil.com>
Co-authored-by: Orion-Zheng <zheng_zian@u.nus.edu>
Co-authored-by: Elsa Granger <zeyugao@outlook.com>
Co-authored-by: YeAnbang <44796419+YeAnbang@users.noreply.github.com>
Co-authored-by: YeAnbang <anbangy2@outlook.com>
Co-authored-by: Orion-Zheng <zhengzian@u.nus.edu>
Co-authored-by: Pengtai Xu <henryxu880@gmail.com>
Co-authored-by: eric8607242 <e0928021388@gmail.com>
Co-authored-by: binmakeswell <binmakeswell@gmail.com>
Co-authored-by: Frank Lee <somerlee.9@gmail.com>
Co-authored-by: Michelle <97082656+MichelleMa8@users.noreply.github.com>
Co-authored-by: Camille Zhong <44392324+Camille7777@users.noreply.github.com>
Co-authored-by: BlueRum <70618399+ht-zhou@users.noreply.github.com>
Co-authored-by: Tong Li <tong.li352711588@gmail.com>
Co-authored-by: JIMMY ZHAO <knightyzhao@gmail.com>
Co-authored-by: Xuanlei Zhao <xuanlei.zhao@gmail.com>
Co-authored-by: Desperado-Jia <502205863@qq.com>
Co-authored-by: 李文军 <40464906+liwenjuna@users.noreply.github.com>
Co-authored-by: yixiaoer <miyaku@yixiaoer.sg>
Co-authored-by: CZYCW <czyczf@163.com>
Co-authored-by: Stephan Kölker <stephankoe@users.noreply.github.com>
Co-authored-by: QinLuo <eric.x.sun@gmail.com>
Co-authored-by: MickeyCHAN <76671016+danyow-cheung@users.noreply.github.com>
Co-authored-by: Luo Yihang <luo_yihang@outlook.com>
Co-authored-by: Dongruixuan Li <dongruixuan@hotmail.com>
Co-authored-by: hugo-syn <61210734+hugo-syn@users.noreply.github.com>
Co-authored-by: Youngon <Youngon_wyl@163.com>
Co-authored-by: Yuanheng Zhao <54058983+yuanheng-zhao@users.noreply.github.com>
Co-authored-by: Rocky Duan <dementrock@users.noreply.github.com>
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: Edenzzzz <wenxuan.tan@wisc.edu>
Co-authored-by: Insu Jang <insujang@umich.edu>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2024-04-23 09:57:44 +00:00
torch . save ( state_dict_cpu , checkpoint_file_path )
2023-08-31 06:50:47 +00:00
def save_param_groups ( state_dict : dict , group_file_path : str ) - > None :
"""
Save information of param_groups to given file path .
Args :
state_dict ( dict ) : state dict .
group_file_path ( str ) : path to the group file .
"""
param_groups = state_dict [ " param_groups " ]
torch . save ( param_groups , group_file_path )
2023-09-19 06:20:26 +00:00
def clean_folder (
checkpoint_path : str ,
weights_name : str ,
shard_filenames : List [ str ] ,
is_master : bool = True ,
use_pp_format : bool = False ,
) :
2023-09-01 09:40:01 +00:00
"""
Clean the unneeded files in checkpoint directory after shards of state_dict have been saved .
Args :
checkpoint_path ( str ) : Path to the checkpoint directory .
weights_name ( str ) : Decides the prefix of filenames of weight shards .
shard_filenames ( List [ str ] ) : The list of saved shard filenames which should not be removed .
is_master ( bool , optional ) : Whether current rank is main process . Defaults to True .
use_pp_format : ( bool , optional ) : Whether to save the files in pipeline format including stage information . Defaults to False .
"""
if is_master :
for filename in os . listdir ( checkpoint_path ) :
full_filename = os . path . join ( checkpoint_path , filename )
weights_no_suffix = weights_name . replace ( " .bin " , " " ) . replace ( " .safetensors " , " " )
filename_no_suffix = filename . replace ( " .bin " , " " ) . replace ( " .safetensors " , " " )
if not use_pp_format :
reg = re . compile ( r " (.*?)- \ d {5} " )
else :
# When this checkpoint is created by pipeline parallel process, the pattern is a little different.
reg = re . compile ( r " (.*?)-stage- \ d {5} -shard- \ d {5} " )
2023-09-19 06:20:26 +00:00
if (
filename . startswith ( weights_no_suffix )
and os . path . isfile ( full_filename )
and filename not in shard_filenames
and reg . fullmatch ( filename_no_suffix ) is not None
) :
2023-09-01 09:40:01 +00:00
os . remove ( full_filename )
def save_config_file ( model : nn . Module , checkpoint_path : str , is_master : bool = True ) :
"""
Save config . json / generation_config . json if model is a Huggingface pretrained model .
This method can only be called when a model is saved in a sharded way .
Args :
model ( nn . Module ) : The model whose config should be saved if it ' s a huggingface model.
checkpoint_path ( str ) : Path to the checkpoint directory .
is_master ( bool ) : Whether current rank is main process .
"""
2023-09-11 08:24:28 +00:00
try :
from transformers . modeling_utils import PreTrainedModel , get_parameter_dtype
from transformers . modeling_utils import unwrap_model as unwrap_huggingface_model
except ImportError :
return
2023-09-01 09:40:01 +00:00
if not isinstance ( model , PreTrainedModel ) :
return
model = unwrap_huggingface_model ( model )
# save the string version of dtype to the config, e.g. convert torch.float32 => "float32"
dtype = get_parameter_dtype ( model )
model . config . torch_dtype = str ( dtype ) . split ( " . " ) [ 1 ]
# Attach architecture to the config
model . config . architectures = [ model . __class__ . __name__ ]
# Save the config
if is_master :
model . config . save_pretrained ( checkpoint_path )
if model . can_generate ( ) :
model . generation_config . save_pretrained ( checkpoint_path )
2023-08-31 06:50:47 +00:00
def save_dtensor ( name : str , tensor : torch . Tensor , index_file : " CheckpointIndexFile " , use_safetensors : bool ) - > None :
"""
Save distributed tensor to checkpoint . This checkpoint will be a dictionary which contains
only one tensor .
Args :
tensor ( Tensor ) : tensor to be saved .
index_file ( CheckpointIndexFile ) : path to the checkpoint file .
size_per_shard ( int ) : size per shard in MB .
"""
root_path = index_file . root_path
2023-09-19 06:20:26 +00:00
output_root_path = root_path . joinpath ( " dtensor " )
2023-08-31 06:50:47 +00:00
# create directory
output_root_path . mkdir ( exist_ok = True )
# save tensor to this directory
# TODO(YuliangLiu): get index of the tensor shard
# e.g. index =
index = 0
# save tensor to file
ckpt_file_name = generate_dtensor_file_name ( name , index , use_safetensors )
ckpt_file_path = output_root_path . joinpath ( ckpt_file_name )
# dtensor ckpt file always contains only one tensor
state_dict = { name : tensor }
save_state_dict ( state_dict , str ( ckpt_file_path ) , use_safetensors )
# update the weight map
# * means all shards
2023-09-19 06:20:26 +00:00
ckpt_file_name_in_weight_map = " dtensor/ " + generate_dtensor_file_name ( name , " * " , use_safetensors )
2023-08-31 06:50:47 +00:00
index_file . append_weight_map ( name , ckpt_file_name_in_weight_map )
def get_checkpoint_file_suffix ( use_safetensors : bool ) - > str :
"""
Get checkpoint file suffix .
Args :
use_safetensors ( bool ) : whether to use safetensors to save the checkpoint .
Returns :
str : checkpoint file suffix .
"""
if use_safetensors :
2023-09-19 06:20:26 +00:00
return " .safetensors "
2023-08-31 06:50:47 +00:00
else :
2023-09-19 06:20:26 +00:00
return " .bin "
2023-08-31 06:50:47 +00:00
2023-09-19 06:20:26 +00:00
def generate_checkpoint_shard_file_name (
index : int , total_number : int , use_safetensors : bool , prefix : str = None
) - > str :
2023-08-31 06:50:47 +00:00
"""
Generate checkpoint shard file name .
Args :
index ( int ) : index of the shard .
total_number ( int ) : total number of shards .
use_safetensors ( bool ) : whether to use safetensors to save the checkpoint .
prefix ( str ) : prefix of the shard file name . Default : None .
Returns :
str : checkpoint shard file name .
"""
suffix = get_checkpoint_file_suffix ( use_safetensors )
if prefix is None :
return f " { index : 05d } -of- { total_number : 05d } . { suffix } "
else :
return f " { prefix } - { index : 05d } -of- { total_number : 05d } . { suffix } "
2023-06-15 07:21:26 +00:00
2023-08-31 06:50:47 +00:00
def generate_dtensor_file_name ( param_name : str , index : int , use_safetensors : bool ) - > str :
"""
Generate dtensor file name .
Args :
param_name ( str ) : name of the distributed parameter .
index ( int ) : index of the shard .
use_safetensors ( bool ) : whether to use safetensors to save the checkpoint .
2023-06-15 07:21:26 +00:00
2023-08-31 06:50:47 +00:00
Returns :
str : dtensor file name .
"""
suffix = get_checkpoint_file_suffix ( use_safetensors )
2023-09-19 06:20:26 +00:00
return f " { param_name } . { index } . { suffix } "
2023-06-15 07:21:26 +00:00
2023-08-31 06:50:47 +00:00
# ========================================
# Helper functions for loading state dict
# ========================================
2023-06-15 07:21:26 +00:00
2023-05-18 12:05:59 +00:00
def load_shard_state_dict ( checkpoint_file : Path , use_safetensors : bool = False ) :
2023-04-06 08:23:39 +00:00
"""
load shard state dict into model
"""
if use_safetensors and not checkpoint_file . suffix == " .safetensors " :
raise Exception ( " load the model using `safetensors`, but no file endwith .safetensors " )
if use_safetensors :
from safetensors . torch import load_file as safe_load_file
2023-05-18 12:05:59 +00:00
from safetensors . torch import safe_open
2023-09-19 06:20:26 +00:00
2023-04-06 08:23:39 +00:00
with safe_open ( checkpoint_file , framework = " pt " ) as f :
metadata = f . metadata ( )
if metadata [ " format " ] != " pt " :
raise NotImplementedError (
2023-09-19 06:20:26 +00:00
f " Conversion from a { metadata [ ' format ' ] } safetensors archive to PyTorch is not implemented yet. "
)
2023-04-06 08:23:39 +00:00
return safe_load_file ( checkpoint_file )
else :
2023-09-19 06:20:26 +00:00
return torch . load ( checkpoint_file , map_location = torch . device ( " cpu " ) )
2023-05-18 12:05:59 +00:00
2023-09-19 06:20:26 +00:00
def load_state_dict_into_model (
model : nn . Module , state_dict : torch . Tensor , missing_keys : List , strict : bool = False , load_sub_module : bool = True
) :
2023-04-06 08:23:39 +00:00
r """ Copies parameters and buffers from :attr:`state_dict` into
2023-05-18 12:05:59 +00:00
this module and its descendants .
2023-04-06 08:23:39 +00:00
Args :
state_dict ( dict ) : a dict containing parameters and
persistent buffers .
"""
if not isinstance ( state_dict , Mapping ) :
raise TypeError ( " Expected state_dict to be dict-like, got {} . " . format ( type ( state_dict ) ) )
unexpected_keys : List [ str ] = [ ]
sub_missing_keys : List [ str ] = [ ]
error_msgs : List [ str ] = [ ]
# copy state_dict so _load_from_state_dict can modify it
2023-09-19 06:20:26 +00:00
metadata = getattr ( state_dict , " _metadata " , None )
2023-04-06 08:23:39 +00:00
state_dict = OrderedDict ( state_dict )
if metadata is not None :
state_dict . _metadata = metadata
2023-05-05 06:37:21 +00:00
def load ( module : nn . Module , state_dict , prefix = " " , load_sub_module : bool = True ) :
2023-04-06 08:23:39 +00:00
local_metadata = { } if metadata is None else metadata . get ( prefix [ : - 1 ] , { } )
2023-05-05 06:37:21 +00:00
args = ( state_dict , prefix , local_metadata , True , sub_missing_keys , [ ] , error_msgs )
2023-04-06 08:23:39 +00:00
# Parameters of module and children will start with prefix. We can exit early if there are none in this
# state_dict
if len ( [ key for key in state_dict if key . startswith ( prefix ) ] ) > 0 :
module . _load_from_state_dict ( * args )
2023-05-05 06:37:21 +00:00
if load_sub_module :
for name , child in module . _modules . items ( ) :
if child is not None :
load ( child , state_dict , prefix + name + " . " )
2023-04-06 08:23:39 +00:00
2023-05-05 06:37:21 +00:00
load ( model , state_dict , " " , load_sub_module )
2023-04-06 08:23:39 +00:00
del load
2023-05-05 06:37:21 +00:00
missing_keys = missing_keys . append ( sub_missing_keys )
2023-04-06 08:23:39 +00:00
if strict :
if len ( unexpected_keys ) > 0 :
2023-09-19 06:20:26 +00:00
error_msgs = " Unexpected key(s) in state_dict: {} . " . format (
" , " . join ( ' " {} " ' . format ( k ) for k in unexpected_keys )
)
raise RuntimeError (
" Error(s) in loading state_dict for {} : \n \t {} " . format ( model . __class__ . __name__ , " \n \t " . join ( error_msgs ) )
)
2023-05-18 12:05:59 +00:00
2023-06-15 07:21:26 +00:00
def load_param_groups_into_optimizer ( optimizer : Optimizer , param_group_path : str ) - > dict :
"""
Load information of param_groups into an initialized optimizer .
"""
# Load list of param_groups from given file path.
# The params in saved_groups are in the form of integer indices.
2023-09-19 06:20:26 +00:00
saved_groups = torch . load ( param_group_path , map_location = torch . device ( " cpu " ) )
2023-06-15 07:21:26 +00:00
if not isinstance ( saved_groups , List ) :
2023-09-19 06:20:26 +00:00
raise ValueError ( f " The param_groups saved at { param_group_path } is not of List type " )
2023-06-15 07:21:26 +00:00
# The params in param_groups are in the form of pytorch tensors.
# For more details, please view source code of Optimizer class in pytorch.
param_groups = optimizer . param_groups
# Check the compatibility of saved_groups and param_groups.
if len ( param_groups ) != len ( saved_groups ) :
raise ValueError ( " loaded state dict has a different number of original parameter groups " )
2023-09-19 06:20:26 +00:00
param_lens = ( len ( g [ " params " ] ) for g in param_groups )
saved_lens = ( len ( g [ " params " ] ) for g in saved_groups )
2023-06-15 07:21:26 +00:00
if any ( p_len != s_len for p_len , s_len in zip ( param_lens , saved_lens ) ) :
2023-09-19 06:20:26 +00:00
raise ValueError (
" loaded state dict contains a parameter group " " that doesn ' t match the size of optimizer ' s group "
)
2023-06-15 07:21:26 +00:00
# Creating mapping from id to parameters.
id_map = {
2023-09-19 06:20:26 +00:00
old_id : p
for old_id , p in zip (
chain . from_iterable ( ( g [ " params " ] for g in saved_groups ) ) ,
chain . from_iterable ( ( g [ " params " ] for g in param_groups ) ) ,
)
2023-06-15 07:21:26 +00:00
}
# Update parameter groups, setting their 'params' value.
def update_group ( group , new_group ) :
2023-09-19 06:20:26 +00:00
new_group [ " params " ] = group [ " params " ]
2023-06-15 07:21:26 +00:00
return new_group
updated_groups = [ update_group ( g , ng ) for g , ng in zip ( param_groups , saved_groups ) ]
2023-09-19 06:20:26 +00:00
optimizer . __dict__ . update ( { " param_groups " : updated_groups } )
2023-06-15 07:21:26 +00:00
return id_map
2023-08-31 06:50:47 +00:00
def load_states_into_optimizer ( optimizer : Optimizer , state_dict : dict , id_map : dict , strict : bool = False ) :
2023-06-15 07:21:26 +00:00
r """ Copies states from `state_dict` into an Optimizer object.
Args :
optimizer ( Optimizer ) : An initialized Optimizer object to be loaded
2023-08-31 06:50:47 +00:00
state_dict ( dict ) : A mapping from tensor index ( an integer )
2023-06-15 07:21:26 +00:00
to its states to be loaded ( a mapping from state name to a tensor ) .
2023-08-31 06:50:47 +00:00
id_map ( dict ) : A mapping from tensor index ( an integer )
2023-06-15 07:21:26 +00:00
to its corresponding parameter ( a tensor ) whose states will be updated .
2023-08-31 06:50:47 +00:00
strict ( bool , optional ) : If set to True , only load the parameters with its id in id_map . Defaults to False .
2023-06-15 07:21:26 +00:00
"""
2023-08-31 06:50:47 +00:00
# Ensure that the keys of state_dict are integers.
state_dict = { int ( k ) : v for k , v in state_dict . items ( ) }
2023-06-15 07:21:26 +00:00
def cast ( param , value , key = None ) :
r """ Make a deep copy of value, casting all tensors to device of param. """
if isinstance ( value , torch . Tensor ) :
# Floating-point types are a bit special here. They are the only ones
# that are assumed to always match the type of params.
# Make sure state['step'] is not casted https://github.com/pytorch/pytorch/issues/74424
2023-09-19 06:20:26 +00:00
if key != " step " :
2023-06-15 07:21:26 +00:00
if param . is_floating_point ( ) :
value = value . to ( param . dtype )
value = value . to ( param . device )
return value
elif isinstance ( value , dict ) :
return { k : cast ( param , v , key = k ) for k , v in value . items ( ) }
elif isinstance ( value , container_abcs . Iterable ) :
return type ( value ) ( cast ( param , v ) for v in value )
else :
return value
# Copy state assigned to params (and cast tensors to appropriate types).
# State that is not assigned to params is copied as is (needed for
# backward compatibility).
new_states = defaultdict ( dict )
for k , v in state_dict . items ( ) :
if k in id_map :
param = id_map [ k ]
new_states [ param ] = cast ( param , v )
2023-08-31 06:50:47 +00:00
elif not strict :
2023-06-15 07:21:26 +00:00
new_states [ k ] = v
2023-06-16 06:14:05 +00:00
optimizer . state . update ( new_states )
2023-06-15 07:21:26 +00:00
def sharded_optimizer_loading_epilogue ( optimizer : Optimizer ) :
2023-06-16 06:14:05 +00:00
r """ Do the cleaning up work after state_dict has been loaded into optimizer
Args :
optimizer ( Optimizer ) : An optimizer object whose state has just been loaded .
"""
2023-06-15 07:21:26 +00:00
# Do the cleaning up as in src code of Pytorch.
2023-10-07 02:45:52 +00:00
if Version ( torch . __version__ ) > = Version ( " 2.0.0 " ) :
optimizer . _patch_step_function ( ) # To support multiprocessing pickle/unpickle
else :
optimizer . _hook_for_profile ( ) # To support multiprocessing pickle/unpickle.
2023-09-19 06:20:26 +00:00
optimizer . defaults . setdefault ( " differentiable " , False )
2023-06-15 07:21:26 +00:00
2023-04-04 07:23:01 +00:00
def has_index_file ( checkpoint_path : str ) - > Tuple [ bool , Optional [ Path ] ] :
"""
Check whether the checkpoint has an index file .
Args :
checkpoint_path ( str ) : path to the checkpoint .
Returns :
Tuple [ bool , Optional [ Path ] ] : a tuple of ( has_index_file , index_file_path )
"""
checkpoint_path = Path ( checkpoint_path )
if checkpoint_path . is_file ( ) :
# check if it is .index.json
2023-04-12 08:02:17 +00:00
reg = re . compile ( " (.*?).index(( \ ..*)?).json " )
if reg . fullmatch ( checkpoint_path . name ) is not None :
2023-04-04 07:23:01 +00:00
return True , checkpoint_path
else :
return False , None
elif checkpoint_path . is_dir ( ) :
# check if there is only one a file ending with .index.json in this directory
2023-09-19 06:20:26 +00:00
index_files = list ( checkpoint_path . glob ( " *.index.*json " ) )
2023-04-04 07:23:01 +00:00
# if we found a .index.json file, make sure there is only one
if len ( index_files ) > 0 :
2023-09-19 06:20:26 +00:00
assert (
len ( index_files ) == 1
) , f " Expected to find one .index.json file in { checkpoint_path } , but found { len ( index_files ) } "
2023-04-04 07:23:01 +00:00
if len ( index_files ) == 1 :
return True , index_files [ 0 ]
else :
return False , None
2023-05-18 12:05:59 +00:00
else :
2023-09-19 06:20:26 +00:00
raise RuntimeError ( f " Invalid checkpoint path { checkpoint_path } . Expected a file or a directory. " )
2023-04-04 07:23:01 +00:00
def load_state_dict ( checkpoint_file_path : Path ) :
"""
Load state dict from checkpoint .
Args :
checkpoint_file_path ( Path ) : path to the checkpoint file .
Returns :
dict : state dict .
"""
2023-09-19 06:20:26 +00:00
assert not is_dtensor_checkpoint (
checkpoint_file_path
) , f " Cannot load state dict from dtensor checkpoint { checkpoint_file_path } , you should convert the distributed tensors to gathered tensors with our CLI offline. "
2023-04-04 07:23:01 +00:00
if is_safetensor_checkpoint ( checkpoint_file_path ) :
2023-09-19 06:20:26 +00:00
assert (
is_safetensors_available ( )
) , f " Cannot load state dict from safetensor checkpoint { checkpoint_file_path } , because safetensors is not available. Please install safetensors first with pip install safetensors. "
2023-04-04 07:23:01 +00:00
# load with safetensors
from safetensors import safe_open
2023-09-19 06:20:26 +00:00
2023-04-04 07:23:01 +00:00
state_dict = { }
with safe_open ( checkpoint_file_path , framework = " pt " , device = " cpu " ) as f :
for k in f . keys ( ) :
state_dict [ k ] = f . get_tensor ( k )
return state_dict
else :
# load with torch
2023-09-19 06:20:26 +00:00
return torch . load ( checkpoint_file_path , map_location = torch . device ( " cpu " ) )
2023-04-12 08:02:17 +00:00
2023-06-15 07:21:26 +00:00
def add_prefix ( weights_name : str , prefix : Optional [ str ] = None ) - > str :
if prefix is not None and len ( prefix ) > 0 :
2023-04-12 08:02:17 +00:00
splits = weights_name . split ( " . " )
2023-06-15 07:21:26 +00:00
splits = splits [ : - 1 ] + [ prefix ] + splits [ - 1 : ]
2023-04-12 08:02:17 +00:00
weights_name = " . " . join ( splits )
return weights_name
2023-05-05 06:37:21 +00:00
2023-06-15 07:21:26 +00:00
def get_model_base_filenames ( prefix : str = None , use_safetensors : bool = False ) :
2023-05-18 12:05:59 +00:00
"""
2023-06-15 07:21:26 +00:00
generate base model weight filenames
2023-05-18 12:05:59 +00:00
"""
weights_name = SAFE_WEIGHTS_NAME if use_safetensors else WEIGHTS_NAME
2023-06-15 07:21:26 +00:00
weights_name = add_prefix ( weights_name , prefix )
2023-05-18 12:05:59 +00:00
save_index_file = SAFE_WEIGHTS_INDEX_NAME if use_safetensors else WEIGHTS_INDEX_NAME
2023-06-15 07:21:26 +00:00
save_index_file = add_prefix ( save_index_file , prefix )
2023-05-05 06:37:21 +00:00
2023-05-18 12:05:59 +00:00
return weights_name , save_index_file
2023-05-05 06:37:21 +00:00
2023-06-15 07:21:26 +00:00
def get_optimizer_base_filenames ( prefix : str = None ) :
"""
generate base optimizer state filenames
"""
states_name = STATES_NAME
states_name = add_prefix ( states_name , prefix )
save_index_file = STATES_INDEX_NAME
save_index_file = add_prefix ( save_index_file , prefix )
param_group_file = GROUP_FILE_NAME
param_group_file = add_prefix ( param_group_file , prefix )
return states_name , save_index_file , param_group_file
2023-05-05 06:37:21 +00:00
def get_shard_filename ( weights_name : str , idx : int ) :
"""
get shard file name
"""
shard_file = weights_name . replace ( " .bin " , f " - { idx + 1 : 05d } .bin " )
2023-09-01 09:40:01 +00:00
shard_file = shard_file . replace ( " .safetensors " , f " - { idx + 1 : 05d } .safetensors " )
2023-05-18 12:05:59 +00:00
return shard_file