2023-08-04 06:55:31 +00:00
|
|
|
""" PyTorch ChatGLM model. """
|
2024-03-27 03:19:32 +00:00
|
|
|
|
2023-09-19 06:20:26 +00:00
|
|
|
from typing import List, Optional, Tuple
|
2023-08-04 06:55:31 +00:00
|
|
|
|
|
|
|
import torch
|
|
|
|
import torch.utils.checkpoint
|
|
|
|
from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
|
|
|
|
from transformers.utils import logging
|
|
|
|
|
|
|
|
from colossalai.pipeline.stage_manager import PipelineStageManager
|
2023-08-22 15:59:31 +00:00
|
|
|
from colossalai.shardformer import ShardConfig
|
2024-03-27 03:19:32 +00:00
|
|
|
from colossalai.shardformer.layer import AttnMaskType, ColoAttention
|
2024-07-10 03:34:25 +00:00
|
|
|
from colossalai.shardformer.layer._operation import (
|
|
|
|
all_to_all_comm,
|
[zerobubble] rebase main (#6075)
* fp8 operators for compressed communication
cast_to_fp8, cast_from_fp8, all_reduce_fp8
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix typo
* fix scaling algorithm in FP8 casting
* support fp8 communication in pipeline parallelism
* add fp8_communication flag in the script
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* shardformer fp8
* fix rebase
* remove all to all
* fix shardformer fp8 communication training degradation
* [fp8] support all-gather flat tensor (#5932)
* [fp8] add fp8 comm for low level zero
* [test] add zero fp8 test case
* [Feature] llama shardformer fp8 support (#5938)
* add llama shardformer fp8
* Llama Shardformer Parity
* fix typo
* fix all reduce
* fix pytest failure
* fix reduce op and move function to fp8.py
* fix typo
* [FP8] rebase main (#5963)
* add SimPO
* fix dataloader
* remove debug code
* add orpo
* fix style
* fix colossalai, transformers version
* fix colossalai, transformers version
* fix colossalai, transformers version
* fix torch colossalai version
* update transformers version
* [shardformer] DeepseekMoE support (#5871)
* [Feature] deepseek moe expert parallel implement
* [misc] fix typo, remove redundant file (#5867)
* [misc] fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [Feature] deepseek support & unit test
* [misc] remove debug code & useless print
* [misc] fix typos (#5872)
* [Feature] remove modeling file, use auto config. (#5884)
* [misc] fix typos
* [Feature] deepseek support via auto model, remove modeling file
* [misc] delete useless file
* [misc] fix typos
* [Deepseek] remove redundant code (#5888)
* [misc] fix typos
* [Feature] deepseek support via auto model, remove modeling file
* [misc] delete useless file
* [misc] fix typos
* [misc] remove redundant code
* [Feature/deepseek] resolve comment. (#5889)
* [misc] fix typos
* [Feature] deepseek support via auto model, remove modeling file
* [misc] delete useless file
* [misc] fix typos
* [misc] remove redundant code
* [misc] mv module replacement into if branch
* [misc] add some warning message and modify some code in unit test
* [misc] fix typos
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [Hoxfix] Fix CUDA_DEVICE_MAX_CONNECTIONS for comm overlap
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [Feat] Diffusion Model(PixArtAlpha/StableDiffusion3) Support (#5838)
* Diffusion Model Inference support
* Stable Diffusion 3 Support
* pixartalpha support
* [HotFix] CI,import,requirements-test for #5838 (#5892)
* [Hot Fix] CI,import,requirements-test
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [Feature] Enable PP + SP for llama (#5868)
* fix cross-PP-stage position id length diff bug
* fix typo
* fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* use a one cross entropy func for all shardformer models
---------
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [ShardFormer] Add Ulysses Sequence Parallelism support for Command-R, Qwen2 and ChatGLM (#5897)
* add benchmark for sft, dpo, simpo, orpo. Add benchmarking result. Support lora with gradient checkpoint
* fix style
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix eval
* hotfix citation
* [zero] support all-gather overlap (#5898)
* [zero] support all-gather overlap
* [zero] add overlap all-gather flag
* [misc] fix typo
* [zero] update api
* fix orpo cross entropy loss
* [Auto Parallel]: Speed up intra-op plan generation by 44% (#5446)
* Remove unnecessary calls to deepcopy
* Build DimSpec's difference dict only once
This change considerably speeds up construction speed of DimSpec objects. The difference_dict is the same for each DimSpec object, so a single copy of it is enough.
* Fix documentation of DimSpec's difference method
* [ShardFormer] fix qwen2 sp (#5903)
* [compatibility] support torch 2.2 (#5875)
* Support Pytorch 2.2.2
* keep build_on_pr file and update .compatibility
* fix object_to_tensor usage when torch>=2.3.0 (#5820)
* [misc] support torch2.3 (#5893)
* [misc] support torch2.3
* [devops] update compatibility ci
* [devops] update compatibility ci
* [devops] add debug
* [devops] add debug
* [devops] add debug
* [devops] add debug
* [devops] remove debug
* [devops] remove debug
* [release] update version (#5912)
* [plugin] support all-gather overlap for hybrid parallel (#5919)
* [plugin] fixed all-gather overlap support for hybrid parallel
* add kto
* fix style, add kto data sample
* [Examples] Add lazy init to OPT and GPT examples (#5924)
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [ColossalChat] Hotfix for ColossalChat (#5910)
* add ignore and tiny llama
* fix path issue
* run style
* fix issue
* update bash
* add ignore and tiny llama
* fix path issue
* run style
* fix issue
* update bash
* fix ddp issue
* add Qwen 1.5 32B
* refactor tokenization
* [FIX BUG] UnboundLocalError: cannot access local variable 'default_conversation' where it is not associated with a value (#5931)
* cannot access local variable 'default_conversation' where it is not associated with a value
set default value for 'default_conversation'
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* fix test data
* refactor evaluation
* remove real data path
* remove real data path
* Add n_fused as an input from native_module (#5894)
* [FIX BUG] convert env param to int in (#5934)
* [Hotfix] Fix ZeRO typo #5936
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [Feature] Add a switch to control whether the model checkpoint needs to be saved after each epoch ends (#5941)
* Add a switch to control whether the model checkpoint needs to be saved after each epoch ends
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* fix style
* fix style
* fix style
* [shardformer] hotfix attn mask (#5945)
* [shardformer] hotfix attn mask (#5947)
* [Feat] Distrifusion Acceleration Support for Diffusion Inference (#5895)
* Distrifusion Support source
* comp comm overlap optimization
* sd3 benchmark
* pixart distrifusion bug fix
* sd3 bug fix and benchmark
* generation bug fix
* naming fix
* add docstring, fix counter and shape error
* add reference
* readme and requirement
* [zero] hotfix update master params (#5951)
* [release] update version (#5952)
* [Chat] Fix lora (#5946)
* fix merging
* remove filepath
* fix style
* Update README.md (#5958)
* [hotfix] Remove unused plan section (#5957)
* remove readme
* fix readme
* update
* [test] add mixtral for sequence classification
* [test] add mixtral transformer test
* [moe] fix plugin
* [test] mixtra pp shard test
* [chore] handle non member group
* [zero] solve hang
* [test] pass mixtral shardformer test
* [moe] implement transit between non moe tp and ep
* [zero] solve hang
* [misc] solve booster hang by rename the variable
* solve hang when parallel mode = pp + dp
* [moe] implement submesh initialization
* [moe] add mixtral dp grad scaling when not all experts are activated
* [chore] manually revert unintended commit
* [chore] trivial fix
* [chore] arg pass & remove drop token
* [test] add mixtral modelling test
* [moe] implement tp
* [moe] test deepseek
* [moe] clean legacy code
* [Feature] MoE Ulysses Support (#5918)
* moe sp support
* moe sp bug solve
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [chore] minor fix
* [moe] init moe plugin comm setting with sp
* moe sp + ep bug fix
* [moe] finalize test (no pp)
* [moe] full test for deepseek and mixtral (pp + sp to fix)
* [chore] minor fix after rebase
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* [chore] solve moe ckpt test failure and some other arg pass failure
* [moe] remove ops
* [test] fix test: test_zero1_2
* [bug] fix: somehow logger hangs the program
* [moe] deepseek moe sp support
* [test] add check
* [deepseek] replace attn (a workaround for bug in transformers)
* [misc] skip redunant test
* [misc] remove debug/print code
* [moe] refactor mesh assignment
* Revert "[moe] implement submesh initialization"
This reverts commit 2f9bce6686d1415a83d5726dc5ff02222c742582.
* [chore] change moe_pg_mesh to private
* [misc] remove incompatible test config
* [misc] fix ci failure: change default value to false in moe plugin
* [misc] remove useless condition
* [chore] docstring
* [moe] remove force_overlap_comm flag and add warning instead
* [doc] add MoeHybridParallelPlugin docstring
* [moe] solve dp axis issue
* [chore] remove redundant test case, print string & reduce test tokens
* [feat] Dist Loader for Eval (#5950)
* support auto distributed data loader
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* support auto distributed data loader
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix tp error
* remove unused parameters
* remove unused
* update inference
* update docs
* update inference
---------
Co-authored-by: Michelle <qianranma8@gmail.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [lora] lora support hybrid parallel plugin (#5956)
* lora support hybrid plugin
* fix
* fix
* fix
* fix
* fp8 operators for compressed communication
cast_to_fp8, cast_from_fp8, all_reduce_fp8
* fix scaling algorithm in FP8 casting
* support fp8 communication in pipeline parallelism
* add fp8_communication flag in the script
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* shardformer fp8
* fix rebase
* remove all to all
* fix shardformer fp8 communication training degradation
* [fp8] support all-gather flat tensor (#5932)
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* Update low_level_optim.py
---------
Co-authored-by: YeAnbang <anbangy2@outlook.com>
Co-authored-by: Haze188 <haze188@qq.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Edenzzzz <wenxuan.tan@wisc.edu>
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: Runyu Lu <77330637+LRY89757@users.noreply.github.com>
Co-authored-by: Guangyao Zhang <xjtu521@qq.com>
Co-authored-by: YeAnbang <44796419+YeAnbang@users.noreply.github.com>
Co-authored-by: Hongxin Liu <lhx0217@gmail.com>
Co-authored-by: Stephan Kö <stephankoe@users.noreply.github.com>
Co-authored-by: アマデウス <kurisusnowdeng@users.noreply.github.com>
Co-authored-by: Tong Li <tong.li352711588@gmail.com>
Co-authored-by: zhurunhua <1281592874@qq.com>
Co-authored-by: Insu Jang <insujang@umich.edu>
Co-authored-by: Gao, Ruiyuan <905370712@qq.com>
Co-authored-by: hxwang <wang1570@e.ntu.edu.sg>
Co-authored-by: Michelle <qianranma8@gmail.com>
Co-authored-by: Wang Binluo <32676639+wangbluo@users.noreply.github.com>
Co-authored-by: HangXu <hangxu0304@gmail.com>
* [fp8]support all2all fp8 (#5953)
* support all2all fp8
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [fp8] add fp8 linear (#5967)
* [fp8] add fp8 linear
* [test] fix fp8 linear test condition
* [test] fix fp8 linear test condition
* [test] fix fp8 linear test condition
* [fp8] support fp8 amp for hybrid parallel plugin (#5975)
* [fp8] support fp8 amp for hybrid parallel plugin
* [test] add fp8 hook test
* [fp8] fix fp8 linear compatibility
* fix (#5976)
* [Feature]: support FP8 communication in DDP, FSDP, Gemini (#5928)
* support fp8_communication in the Torch DDP grad comm, FSDP grad comm, and FSDP params comm
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* implement communication hook for FSDP params all-gather
* added unit test for fp8 operators
* support fp8 communication in GeminiPlugin
* update training scripts to support fsdp and fp8 communication
* fixed some minor bugs observed in unit test
* add all_gather_into_tensor_flat_fp8
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* add skip the test if torch < 2.2.0
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* add skip the test if torch < 2.2.0
* add skip the test if torch < 2.2.0
* add fp8_comm flag
* rebase latest fp8 operators
* rebase latest fp8 operators
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [test ci]Feature/fp8 comm (#5981)
* fix
* fix
* fix
* [fp8] support gemini plugin (#5978)
* [fp8] refactor hook
* [fp8] support gemini plugin
* [example] add fp8 option for llama benchmark
* [fp8] use torch compile (torch >= 2.3.0) (#5979)
* [fp8] use torch compile (torch >= 2.4.0)
* [fp8] set use_fast_accum in linear
* [chore] formal version check
* [chore] fix sig
* [fp8]Moe support fp8 communication (#5977)
* fix
* support moe fp8
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
fix
fi
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [fp8] support hybrid parallel plugin (#5982)
* support fp8 comm for qwen2 model
* support fp8 comm for qwen2 model
* support fp8 comm for qwen2 model
* fp8
* fix
* bert and bloom
* chatglm and command
* gpt2,gptj,bert, falcon,blip2
* mistral,opy,sam,t5,vit,whisper
* fix
* fix
* fix
* [fp8] refactor fp8 linear with compile (#5993)
* [fp8] refactor fp8 linear with compile
* [fp8] fix linear test
* [fp8] fix linear test
* [fp8] support asynchronous FP8 communication (#5997)
* fix
* fix
* fix
* support async all2all
* support async op for all gather
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [fp8] update torch.compile for linear_fp8 to >= 2.4.0 (#6004)
* [fp8] linear perf enhancement
* [fp8]update reduce-scatter test (#6002)
* fix
* fix
* fix
* fix
* [fp8] add use_fp8 option for MoeHybridParallelPlugin (#6009)
* [fp8] zero support fp8 linear. (#6006)
* fix
* fix
* fix
* zero fp8
* zero fp8
* Update requirements.txt
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix the merge
* fix the merge
* fix the merge
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix the merge
* fix
* fix
* fix the merge
* fix
* fix
* fix
* fix
* fix
* fix the merge
* fix
* fix
* fix
* fix
* [fp8] Merge feature/fp8_comm to main branch of Colossalai (#6016)
* add SimPO
* fix dataloader
* remove debug code
* add orpo
* fix style
* fix colossalai, transformers version
* fix colossalai, transformers version
* fix colossalai, transformers version
* fix torch colossalai version
* update transformers version
* [shardformer] DeepseekMoE support (#5871)
* [Feature] deepseek moe expert parallel implement
* [misc] fix typo, remove redundant file (#5867)
* [misc] fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [Feature] deepseek support & unit test
* [misc] remove debug code & useless print
* [misc] fix typos (#5872)
* [Feature] remove modeling file, use auto config. (#5884)
* [misc] fix typos
* [Feature] deepseek support via auto model, remove modeling file
* [misc] delete useless file
* [misc] fix typos
* [Deepseek] remove redundant code (#5888)
* [misc] fix typos
* [Feature] deepseek support via auto model, remove modeling file
* [misc] delete useless file
* [misc] fix typos
* [misc] remove redundant code
* [Feature/deepseek] resolve comment. (#5889)
* [misc] fix typos
* [Feature] deepseek support via auto model, remove modeling file
* [misc] delete useless file
* [misc] fix typos
* [misc] remove redundant code
* [misc] mv module replacement into if branch
* [misc] add some warning message and modify some code in unit test
* [misc] fix typos
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [Hoxfix] Fix CUDA_DEVICE_MAX_CONNECTIONS for comm overlap
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [Feat] Diffusion Model(PixArtAlpha/StableDiffusion3) Support (#5838)
* Diffusion Model Inference support
* Stable Diffusion 3 Support
* pixartalpha support
* [HotFix] CI,import,requirements-test for #5838 (#5892)
* [Hot Fix] CI,import,requirements-test
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [Feature] Enable PP + SP for llama (#5868)
* fix cross-PP-stage position id length diff bug
* fix typo
* fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* use a one cross entropy func for all shardformer models
---------
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [ShardFormer] Add Ulysses Sequence Parallelism support for Command-R, Qwen2 and ChatGLM (#5897)
* add benchmark for sft, dpo, simpo, orpo. Add benchmarking result. Support lora with gradient checkpoint
* fix style
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix eval
* hotfix citation
* [zero] support all-gather overlap (#5898)
* [zero] support all-gather overlap
* [zero] add overlap all-gather flag
* [misc] fix typo
* [zero] update api
* fix orpo cross entropy loss
* [Auto Parallel]: Speed up intra-op plan generation by 44% (#5446)
* Remove unnecessary calls to deepcopy
* Build DimSpec's difference dict only once
This change considerably speeds up construction speed of DimSpec objects. The difference_dict is the same for each DimSpec object, so a single copy of it is enough.
* Fix documentation of DimSpec's difference method
* [ShardFormer] fix qwen2 sp (#5903)
* [compatibility] support torch 2.2 (#5875)
* Support Pytorch 2.2.2
* keep build_on_pr file and update .compatibility
* fix object_to_tensor usage when torch>=2.3.0 (#5820)
* [misc] support torch2.3 (#5893)
* [misc] support torch2.3
* [devops] update compatibility ci
* [devops] update compatibility ci
* [devops] add debug
* [devops] add debug
* [devops] add debug
* [devops] add debug
* [devops] remove debug
* [devops] remove debug
* [release] update version (#5912)
* [plugin] support all-gather overlap for hybrid parallel (#5919)
* [plugin] fixed all-gather overlap support for hybrid parallel
* add kto
* fix style, add kto data sample
* [Examples] Add lazy init to OPT and GPT examples (#5924)
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [ColossalChat] Hotfix for ColossalChat (#5910)
* add ignore and tiny llama
* fix path issue
* run style
* fix issue
* update bash
* add ignore and tiny llama
* fix path issue
* run style
* fix issue
* update bash
* fix ddp issue
* add Qwen 1.5 32B
* refactor tokenization
* [FIX BUG] UnboundLocalError: cannot access local variable 'default_conversation' where it is not associated with a value (#5931)
* cannot access local variable 'default_conversation' where it is not associated with a value
set default value for 'default_conversation'
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* fix test data
* refactor evaluation
* remove real data path
* remove real data path
* Add n_fused as an input from native_module (#5894)
* [FIX BUG] convert env param to int in (#5934)
* [Hotfix] Fix ZeRO typo #5936
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [Feature] Add a switch to control whether the model checkpoint needs to be saved after each epoch ends (#5941)
* Add a switch to control whether the model checkpoint needs to be saved after each epoch ends
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* fix style
* fix style
* fix style
* [shardformer] hotfix attn mask (#5945)
* [shardformer] hotfix attn mask (#5947)
* [Feat] Distrifusion Acceleration Support for Diffusion Inference (#5895)
* Distrifusion Support source
* comp comm overlap optimization
* sd3 benchmark
* pixart distrifusion bug fix
* sd3 bug fix and benchmark
* generation bug fix
* naming fix
* add docstring, fix counter and shape error
* add reference
* readme and requirement
* [zero] hotfix update master params (#5951)
* [release] update version (#5952)
* [Chat] Fix lora (#5946)
* fix merging
* remove filepath
* fix style
* Update README.md (#5958)
* [hotfix] Remove unused plan section (#5957)
* remove readme
* fix readme
* update
* [test] add mixtral for sequence classification
* [test] add mixtral transformer test
* [moe] fix plugin
* [test] mixtra pp shard test
* [chore] handle non member group
* [zero] solve hang
* [test] pass mixtral shardformer test
* [moe] implement transit between non moe tp and ep
* [zero] solve hang
* [misc] solve booster hang by rename the variable
* solve hang when parallel mode = pp + dp
* [moe] implement submesh initialization
* [moe] add mixtral dp grad scaling when not all experts are activated
* [chore] manually revert unintended commit
* [chore] trivial fix
* [chore] arg pass & remove drop token
* [test] add mixtral modelling test
* [moe] implement tp
* [moe] test deepseek
* [moe] clean legacy code
* [Feature] MoE Ulysses Support (#5918)
* moe sp support
* moe sp bug solve
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [chore] minor fix
* [moe] init moe plugin comm setting with sp
* moe sp + ep bug fix
* [moe] finalize test (no pp)
* [moe] full test for deepseek and mixtral (pp + sp to fix)
* [chore] minor fix after rebase
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* [chore] solve moe ckpt test failure and some other arg pass failure
* [moe] remove ops
* [test] fix test: test_zero1_2
* [bug] fix: somehow logger hangs the program
* [moe] deepseek moe sp support
* [test] add check
* [deepseek] replace attn (a workaround for bug in transformers)
* [misc] skip redunant test
* [misc] remove debug/print code
* [moe] refactor mesh assignment
* Revert "[moe] implement submesh initialization"
This reverts commit 2f9bce6686d1415a83d5726dc5ff02222c742582.
* [chore] change moe_pg_mesh to private
* [misc] remove incompatible test config
* [misc] fix ci failure: change default value to false in moe plugin
* [misc] remove useless condition
* [chore] docstring
* [moe] remove force_overlap_comm flag and add warning instead
* [doc] add MoeHybridParallelPlugin docstring
* [moe] solve dp axis issue
* [chore] remove redundant test case, print string & reduce test tokens
* [feat] Dist Loader for Eval (#5950)
* support auto distributed data loader
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* support auto distributed data loader
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix tp error
* remove unused parameters
* remove unused
* update inference
* update docs
* update inference
---------
Co-authored-by: Michelle <qianranma8@gmail.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [lora] lora support hybrid parallel plugin (#5956)
* lora support hybrid plugin
* fix
* fix
* fix
* fix
* Support overall loss, update KTO logging
* [Docs] clarify launch port
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [Hotfix] README link (#5966)
* update ignore
* update readme
* run style
* update readme
* [Hotfix] Avoid fused RMSnorm import error without apex (#5985)
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [Chat] fix readme (#5989)
* fix readme
* fix readme, tokenization fully tested
* fix readme, tokenization fully tested
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: root <root@notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9-0.notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9.colossal-ai.svc.cluster.local>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* fix sync condition (#6000)
* [plugin] add cast inputs option for zero (#6003)
* [pre-commit.ci] pre-commit autoupdate (#5995)
updates:
- [github.com/psf/black-pre-commit-mirror: 24.4.2 → 24.8.0](https://github.com/psf/black-pre-commit-mirror/compare/24.4.2...24.8.0)
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [misc] Bypass the huggingface bug to solve the mask mismatch problem (#5991)
* [Feature] Zigzag Ring attention (#5905)
* halfway
* fix cross-PP-stage position id length diff bug
* fix typo
* fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* unified cross entropy func for all shardformer models
* remove redundant lines
* add basic ring attn; debug cross entropy
* fwd bwd logic complete
* fwd bwd logic complete; add experimental triton rescale
* precision tests passed
* precision tests passed
* fix typos and remove misc files
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* add sp_mode to benchmark; fix varlen interface
* update softmax_lse shape by new interface
* change tester name
* remove buffer clone; support packed seq layout
* add varlen tests
* fix typo
* all tests passed
* add dkv_group; fix mask
* remove debug statements
---------
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [misc] update compatibility (#6008)
* [misc] update compatibility
* [misc] update requirements
* [devops] disable requirements cache
* [test] fix torch ddp test
* [test] fix rerun on address in use
* [test] fix lazy init
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix the merge
* fix the merge
* overlap kv comm with output rescale (#6017)
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* fix the merge
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix the merge
* fix
* fix
* fix the merge
* fix
* [misc] Use dist logger in plugins (#6011)
* use dist logger in plugins
* remove trash
* print on rank 0
---------
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* fix
* fix
* fix
* fix
* fix the merge
* fix
* fix
* fix
* fix
---------
Co-authored-by: YeAnbang <anbangy2@outlook.com>
Co-authored-by: Haze188 <haze188@qq.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Edenzzzz <wenxuan.tan@wisc.edu>
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: Runyu Lu <77330637+LRY89757@users.noreply.github.com>
Co-authored-by: Guangyao Zhang <xjtu521@qq.com>
Co-authored-by: YeAnbang <44796419+YeAnbang@users.noreply.github.com>
Co-authored-by: Hongxin Liu <lhx0217@gmail.com>
Co-authored-by: Stephan Kö <stephankoe@users.noreply.github.com>
Co-authored-by: アマデウス <kurisusnowdeng@users.noreply.github.com>
Co-authored-by: Tong Li <tong.li352711588@gmail.com>
Co-authored-by: zhurunhua <1281592874@qq.com>
Co-authored-by: Insu Jang <insujang@umich.edu>
Co-authored-by: Gao, Ruiyuan <905370712@qq.com>
Co-authored-by: hxwang <wang1570@e.ntu.edu.sg>
Co-authored-by: Michelle <qianranma8@gmail.com>
Co-authored-by: root <root@notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9-0.notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9.colossal-ai.svc.cluster.local>
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* Update train_dpo.py
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* Update low_level_zero_plugin.py
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* [CI] Remove triton version for compatibility bug; update req torch >=2.2 (#6018)
* remove triton version
* remove torch 2.2
* remove torch 2.1
* debug
* remove 2.1 build tests
* require torch >=2.2
---------
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [plugin] hotfix zero plugin (#6036)
* [plugin] hotfix zero plugin
* [plugin] hotfix zero plugin
* [Colossal-LLaMA] Refactor latest APIs (#6030)
* refactor latest code
* update api
* add dummy dataset
* update Readme
* add setup
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* update files
* add PP support
* update arguments
* update argument
* reorg folder
* update version
* remove IB infor
* update utils
* update readme
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* update save for zero
* update save
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* add apex
* update
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* add fused norm (#6038)
* [FP8] unsqueeze scale to make it compatible with torch.compile (#6040)
* [colossalai/checkpoint_io/...] fix bug in load_state_dict_into_model; format error msg (#6020)
* fix bug in load_state_dict_into_model; format error msg
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* Update utils.py
to support checking missing_keys
* Update general_checkpoint_io.py
fix bug in missing_keys error message
* retrigger tests
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [Hotfix] Remove deprecated install (#6042)
* remove deprecated install
* remove unused folder
* [fp8] optimize all-gather (#6043)
* [fp8] optimize all-gather
* [fp8] fix all gather fp8 ring
* [fp8] enable compile
* [fp8] fix all gather fp8 ring
* [fp8] fix linear hook (#6046)
* [fp8] disable all_to_all_fp8 in intranode (#6045)
* enhance all_to_all_fp8 with internode comm control
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* disable some fp8 ops due to performance issue
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [release] update version (#6041)
* [release] update version
* [devops] update comp test
* [devops] update comp test debug
* [devops] debug comp test
* [devops] debug comp test
* [devops] debug comp test
* [devops] debug comp test
* [devops] debug comp test
* [Feature] Split cross-entropy computation in SP (#5959)
* halfway
* fix cross-PP-stage position id length diff bug
* fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* unified cross entropy func for all shardformer models
* remove redundant lines
* add basic ring attn; debug cross entropy
* fwd bwd logic complete
* fwd bwd logic complete; add experimental triton rescale
* precision tests passed
* precision tests passed
* fix typos and remove misc files
* update softmax_lse shape by new interface
* change tester name
* remove buffer clone; support packed seq layout
* add varlen tests
* fix typo
* all tests passed
* add dkv_group; fix mask
* remove debug statements
* adapt chatglm, command-R, qwen
* debug
* halfway
* fix cross-PP-stage position id length diff bug
* fix typo
* fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* unified cross entropy func for all shardformer models
* remove redundant lines
* add basic ring attn; debug cross entropy
* fwd bwd logic complete
* fwd bwd logic complete; add experimental triton rescale
* precision tests passed
* precision tests passed
* fix typos and remove misc files
* add sp_mode to benchmark; fix varlen interface
* update softmax_lse shape by new interface
* add varlen tests
* fix typo
* all tests passed
* add dkv_group; fix mask
* remove debug statements
* add comments
* q1 index only once
* remove events to simplify stream sync
* simplify forward/backward logic
* 2d ring forward passed
* 2d ring backward passed
* fixes
* fix ring attn loss
* 2D ring backward + llama passed
* merge
* update logger
* fix typo
* rebase
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix typo
* remove typos
* fixes
* support GPT
---------
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [hotfix] moe hybrid parallelism benchmark & follow-up fix (#6048)
* [example] pass use_fp8_comm flag to all plugins
* [example] add mixtral benchmark
* [moe] refine assertion and check
* [moe] fix mixtral & add more tests
* [moe] consider checking dp * sp group and moe_dp_group
* [mixtral] remove gate tp & add more tests
* [deepseek] fix tp & sp for deepseek
* [mixtral] minor fix
* [deepseek] add deepseek benchmark
* [fp8] hotfix backward hook (#6053)
* [fp8] hotfix backward hook
* [fp8] hotfix pipeline loss accumulation
* [doc] update sp doc (#6055)
* update sp doc
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* fix the sp
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix the attn
* fix
* fix
* fix
* fix
* [zerobubble]Support ZeroBubble Pipeline (#6034)
* [feat] add zerobubble pp (just a frame now); add POC test for dx_dw; add test for zerobubble;
* [feat] add dw test;
* [fix] fix weight not close;
* [update] update text;
* [feat] add test run_fwd_bwd automatic scheduling;
* [feat] split communication and calculation; fix pop empty send_bwd_buffer error;
* [feat] add test for p & p grad;
* [feat] add comments for ZBV func;
* [fix] rm useless assign and comments;
* [fix] fix ci test; add pytest;
* [feat] add run_fwd_bwd_with_microbatch (replace input) & test; add p&p.grad assert close test & all pass;
* [feat] add apply v_schedule graph; p & p.grad assert err exist;
* [fix] update
* [feat] fix ci; add assert;
* [feat] fix poc format
* [feat] fix func name & ci; add comments;
* [fix] fix poc test; add comments in poc;
* [feat] add optim backward_b_by_grad
* [feat] fix optimizer bwd b & w; support return accum loss & output
* [feat] add fwd_bwd_step, run_fwd_only;
* [fix] fix optim bwd; add license for v_schedule; remove redundant attributes; fix schedule loop "while"--> "for"; add communication dict;
* [fix] fix communication_map;
* [feat] update test; rm comments;
* [fix] rm zbv in hybridplugin
* [fix] fix optim bwd;
* [fix] fix optim bwd;
* [fix] rm output.data after send fwd;
* [fix] fix bwd step if condition; remove useless comments and format info;
* [fix] fix detach output & release output;
* [fix] rm requir_grad for output;
* [fix] fix requir grad position and detach position and input&output local buffer append position;
* [feat] add memory assertation;
* [fix] fix mem check;
* [fix] mem assertation'
* [fix] fix mem assertation
* [fix] fix mem; use a new model shape; only assert mem less and equal than theo;
* [fix] fix model zoo import;
* [fix] fix redundant detach & clone; add buffer assertation in the end;
* [fix] add output_obj_grad assert None at bwd b step; replace input_obj.require_grad_ with treemap;
* [fix] update optim state dict assert (include param group & state); fix mem assert after add optim;
* [fix] add testcase with microbatch 4;
* [fp8] fix missing fp8_comm flag in mixtral (#6057)
* fix
* fix
* fix
* [fp8] Disable all_gather intranode. Disable Redundant all_gather fp8 (#6059)
* all_gather only internode, fix pytest
* fix cuda arch <89 compile pytest error
* fix pytest failure
* disable all_gather_into_tensor_flat_fp8
* fix fp8 format
* fix pytest
* fix conversations
* fix chunk tuple to list
* [doc] FP8 training and communication document (#6050)
* Add FP8 training and communication document
* add fp8 docstring for plugins
* fix typo
* fix typo
* fix
* fix
* [moe] add parallel strategy for shared_expert && fix test for deepseek (#6063)
* [ColossalEval] support for vllm (#6056)
* support vllm
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* modify vllm and update readme
* run pre-commit
* remove dupilicated lines and refine code
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* update param name
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* refine code
* update readme
* refine code
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [release] update version (#6062)
* [feat] add zerobubble pp (just a frame now); add POC test for dx_dw; add test for zerobubble;
* [update] update text;
* [feat] add test run_fwd_bwd automatic scheduling;
* [feat] fix poc format
* [fix] fix poc test; add comments in poc;
* [feat] add optim backward_b_by_grad
* [feat] fix optimizer bwd b & w; support return accum loss & output
* [fix] fix optim bwd; add license for v_schedule; remove redundant attributes; fix schedule loop "while"--> "for"; add communication dict;
* [feat] update test; rm comments;
* [fix] rm zbv in hybridplugin
* [fix] fix optim bwd;
* [fix] fix optim bwd;
* [fix] rm output.data after send fwd;
* [fix] fix bwd step if condition; remove useless comments and format info;
* [fix] fix mem check;
* [fix] fix mem assertation
* [fix] fix mem; use a new model shape; only assert mem less and equal than theo;
* [fix] fix model zoo import;
* [feat] moehybrid support zerobubble;
* [fix] fix zerobubble pp for shardformer type input;
* [fix] fix require_grad & deallocate call;
* [fix] fix mem assert;
* [fix] fix fwd branch, fwd pass both micro_batch & internal_inputs'
* [fix] fix pipeline util func deallocate --> release_tensor_data; fix bwd_b loss bwd branch;
* [fix] fix zerobubble; support shardformer model type;
* [fix] fix test_pipeline_utils ci;
* [plugin] hybrid support zero bubble pipeline (#6060)
* hybrid support zbv
* fix
fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* Update zero_bubble_pp.py
* fix
* fix-ci
* fix
[pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* fix
* [zerobubble]Support ZeroBubble Pipeline (#6034)
* [feat] add zerobubble pp (just a frame now); add POC test for dx_dw; add test for zerobubble;
* [feat] add dw test;
* [fix] fix weight not close;
* [update] update text;
* [feat] add test run_fwd_bwd automatic scheduling;
* [feat] split communication and calculation; fix pop empty send_bwd_buffer error;
* [feat] add test for p & p grad;
* [feat] add comments for ZBV func;
* [fix] rm useless assign and comments;
* [fix] fix ci test; add pytest;
* [feat] add run_fwd_bwd_with_microbatch (replace input) & test; add p&p.grad assert close test & all pass;
* [feat] add apply v_schedule graph; p & p.grad assert err exist;
* [fix] update
* [feat] fix ci; add assert;
* [feat] fix poc format
* [feat] fix func name & ci; add comments;
* [fix] fix poc test; add comments in poc;
* [feat] add optim backward_b_by_grad
* [feat] fix optimizer bwd b & w; support return accum loss & output
* [feat] add fwd_bwd_step, run_fwd_only;
* [fix] fix optim bwd; add license for v_schedule; remove redundant attributes; fix schedule loop "while"--> "for"; add communication dict;
* [fix] fix communication_map;
* [feat] update test; rm comments;
* [fix] rm zbv in hybridplugin
* [fix] fix optim bwd;
* [fix] fix optim bwd;
* [fix] rm output.data after send fwd;
* [fix] fix bwd step if condition; remove useless comments and format info;
* [fix] fix detach output & release output;
* [fix] rm requir_grad for output;
* [fix] fix requir grad position and detach position and input&output local buffer append position;
* [feat] add memory assertation;
* [fix] fix mem check;
* [fix] mem assertation'
* [fix] fix mem assertation
* [fix] fix mem; use a new model shape; only assert mem less and equal than theo;
* [fix] fix model zoo import;
* [fix] fix redundant detach & clone; add buffer assertation in the end;
* [fix] add output_obj_grad assert None at bwd b step; replace input_obj.require_grad_ with treemap;
* [fix] update optim state dict assert (include param group & state); fix mem assert after add optim;
* [fix] add testcase with microbatch 4;
* hybrid support zbv
* fix
fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* Update zero_bubble_pp.py
* fix
* fix-ci
* fix
[pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* fix
* fix
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* fix
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: duanjunwen <935724073@qq.com>
* [feat] add zerobubble pp (just a frame now); add POC test for dx_dw; add test for zerobubble;
* [update] update text;
* [feat] add test run_fwd_bwd automatic scheduling;
* [feat] fix poc format
* [fix] fix poc test; add comments in poc;
* [feat] add optim backward_b_by_grad
* [feat] fix optimizer bwd b & w; support return accum loss & output
* [fix] fix optim bwd; add license for v_schedule; remove redundant attributes; fix schedule loop "while"--> "for"; add communication dict;
* [feat] update test; rm comments;
* [fix] fix optim bwd;
* [fix] fix optim bwd;
* [fix] rm output.data after send fwd;
* [fix] fix bwd step if condition; remove useless comments and format info;
* [fix] fix mem check;
* [fix] fix mem assertation
* [fix] fix mem; use a new model shape; only assert mem less and equal than theo;
* [fix] fix model zoo import;
* [fix] fix mem assert;
* [fix] fix fwd branch, fwd pass both micro_batch & internal_inputs'
* [plugin] hybrid support zero bubble pipeline (#6060)
* hybrid support zbv
* fix
fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* Update zero_bubble_pp.py
* fix
* fix-ci
* fix
[pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* fix
* [zerobubble]Support ZeroBubble Pipeline (#6034)
* [feat] add zerobubble pp (just a frame now); add POC test for dx_dw; add test for zerobubble;
* [feat] add dw test;
* [fix] fix weight not close;
* [update] update text;
* [feat] add test run_fwd_bwd automatic scheduling;
* [feat] split communication and calculation; fix pop empty send_bwd_buffer error;
* [feat] add test for p & p grad;
* [feat] add comments for ZBV func;
* [fix] rm useless assign and comments;
* [fix] fix ci test; add pytest;
* [feat] add run_fwd_bwd_with_microbatch (replace input) & test; add p&p.grad assert close test & all pass;
* [feat] add apply v_schedule graph; p & p.grad assert err exist;
* [fix] update
* [feat] fix ci; add assert;
* [feat] fix poc format
* [feat] fix func name & ci; add comments;
* [fix] fix poc test; add comments in poc;
* [feat] add optim backward_b_by_grad
* [feat] fix optimizer bwd b & w; support return accum loss & output
* [feat] add fwd_bwd_step, run_fwd_only;
* [fix] fix optim bwd; add license for v_schedule; remove redundant attributes; fix schedule loop "while"--> "for"; add communication dict;
* [fix] fix communication_map;
* [feat] update test; rm comments;
* [fix] rm zbv in hybridplugin
* [fix] fix optim bwd;
* [fix] fix optim bwd;
* [fix] rm output.data after send fwd;
* [fix] fix bwd step if condition; remove useless comments and format info;
* [fix] fix detach output & release output;
* [fix] rm requir_grad for output;
* [fix] fix requir grad position and detach position and input&output local buffer append position;
* [feat] add memory assertation;
* [fix] fix mem check;
* [fix] mem assertation'
* [fix] fix mem assertation
* [fix] fix mem; use a new model shape; only assert mem less and equal than theo;
* [fix] fix model zoo import;
* [fix] fix redundant detach & clone; add buffer assertation in the end;
* [fix] add output_obj_grad assert None at bwd b step; replace input_obj.require_grad_ with treemap;
* [fix] update optim state dict assert (include param group & state); fix mem assert after add optim;
* [fix] add testcase with microbatch 4;
* hybrid support zbv
* fix
fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* Update zero_bubble_pp.py
* fix
* fix-ci
* fix
[pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* fix
* fix
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* fix
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: duanjunwen <935724073@qq.com>
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: HangXu <hangxu0304@gmail.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: GuangyaoZhang <xjtu521@qq.com>
Co-authored-by: Hongxin Liu <lhx0217@gmail.com>
Co-authored-by: YeAnbang <anbangy2@outlook.com>
Co-authored-by: Haze188 <haze188@qq.com>
Co-authored-by: Edenzzzz <wenxuan.tan@wisc.edu>
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: Runyu Lu <77330637+LRY89757@users.noreply.github.com>
Co-authored-by: YeAnbang <44796419+YeAnbang@users.noreply.github.com>
Co-authored-by: Stephan Kö <stephankoe@users.noreply.github.com>
Co-authored-by: アマデウス <kurisusnowdeng@users.noreply.github.com>
Co-authored-by: Tong Li <tong.li352711588@gmail.com>
Co-authored-by: zhurunhua <1281592874@qq.com>
Co-authored-by: Insu Jang <insujang@umich.edu>
Co-authored-by: Gao, Ruiyuan <905370712@qq.com>
Co-authored-by: hxwang <wang1570@e.ntu.edu.sg>
Co-authored-by: Michelle <qianranma8@gmail.com>
Co-authored-by: Wang Binluo <32676639+wangbluo@users.noreply.github.com>
Co-authored-by: wangbluo <2538539015@qq.com>
Co-authored-by: root <root@notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9-0.notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9.colossal-ai.svc.cluster.local>
Co-authored-by: duanjunwen <935724073@qq.com>
Co-authored-by: Camille Zhong <44392324+Camille7777@users.noreply.github.com>
2024-10-08 07:58:00 +00:00
|
|
|
gather_sp_output,
|
|
|
|
is_share_sp_tp,
|
2024-07-10 03:34:25 +00:00
|
|
|
split_forward_gather_backward,
|
|
|
|
)
|
2023-08-04 06:55:31 +00:00
|
|
|
|
[zerobubble] rebase main (#6075)
* fp8 operators for compressed communication
cast_to_fp8, cast_from_fp8, all_reduce_fp8
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix typo
* fix scaling algorithm in FP8 casting
* support fp8 communication in pipeline parallelism
* add fp8_communication flag in the script
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* shardformer fp8
* fix rebase
* remove all to all
* fix shardformer fp8 communication training degradation
* [fp8] support all-gather flat tensor (#5932)
* [fp8] add fp8 comm for low level zero
* [test] add zero fp8 test case
* [Feature] llama shardformer fp8 support (#5938)
* add llama shardformer fp8
* Llama Shardformer Parity
* fix typo
* fix all reduce
* fix pytest failure
* fix reduce op and move function to fp8.py
* fix typo
* [FP8] rebase main (#5963)
* add SimPO
* fix dataloader
* remove debug code
* add orpo
* fix style
* fix colossalai, transformers version
* fix colossalai, transformers version
* fix colossalai, transformers version
* fix torch colossalai version
* update transformers version
* [shardformer] DeepseekMoE support (#5871)
* [Feature] deepseek moe expert parallel implement
* [misc] fix typo, remove redundant file (#5867)
* [misc] fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [Feature] deepseek support & unit test
* [misc] remove debug code & useless print
* [misc] fix typos (#5872)
* [Feature] remove modeling file, use auto config. (#5884)
* [misc] fix typos
* [Feature] deepseek support via auto model, remove modeling file
* [misc] delete useless file
* [misc] fix typos
* [Deepseek] remove redundant code (#5888)
* [misc] fix typos
* [Feature] deepseek support via auto model, remove modeling file
* [misc] delete useless file
* [misc] fix typos
* [misc] remove redundant code
* [Feature/deepseek] resolve comment. (#5889)
* [misc] fix typos
* [Feature] deepseek support via auto model, remove modeling file
* [misc] delete useless file
* [misc] fix typos
* [misc] remove redundant code
* [misc] mv module replacement into if branch
* [misc] add some warning message and modify some code in unit test
* [misc] fix typos
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [Hoxfix] Fix CUDA_DEVICE_MAX_CONNECTIONS for comm overlap
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [Feat] Diffusion Model(PixArtAlpha/StableDiffusion3) Support (#5838)
* Diffusion Model Inference support
* Stable Diffusion 3 Support
* pixartalpha support
* [HotFix] CI,import,requirements-test for #5838 (#5892)
* [Hot Fix] CI,import,requirements-test
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [Feature] Enable PP + SP for llama (#5868)
* fix cross-PP-stage position id length diff bug
* fix typo
* fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* use a one cross entropy func for all shardformer models
---------
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [ShardFormer] Add Ulysses Sequence Parallelism support for Command-R, Qwen2 and ChatGLM (#5897)
* add benchmark for sft, dpo, simpo, orpo. Add benchmarking result. Support lora with gradient checkpoint
* fix style
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix eval
* hotfix citation
* [zero] support all-gather overlap (#5898)
* [zero] support all-gather overlap
* [zero] add overlap all-gather flag
* [misc] fix typo
* [zero] update api
* fix orpo cross entropy loss
* [Auto Parallel]: Speed up intra-op plan generation by 44% (#5446)
* Remove unnecessary calls to deepcopy
* Build DimSpec's difference dict only once
This change considerably speeds up construction speed of DimSpec objects. The difference_dict is the same for each DimSpec object, so a single copy of it is enough.
* Fix documentation of DimSpec's difference method
* [ShardFormer] fix qwen2 sp (#5903)
* [compatibility] support torch 2.2 (#5875)
* Support Pytorch 2.2.2
* keep build_on_pr file and update .compatibility
* fix object_to_tensor usage when torch>=2.3.0 (#5820)
* [misc] support torch2.3 (#5893)
* [misc] support torch2.3
* [devops] update compatibility ci
* [devops] update compatibility ci
* [devops] add debug
* [devops] add debug
* [devops] add debug
* [devops] add debug
* [devops] remove debug
* [devops] remove debug
* [release] update version (#5912)
* [plugin] support all-gather overlap for hybrid parallel (#5919)
* [plugin] fixed all-gather overlap support for hybrid parallel
* add kto
* fix style, add kto data sample
* [Examples] Add lazy init to OPT and GPT examples (#5924)
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [ColossalChat] Hotfix for ColossalChat (#5910)
* add ignore and tiny llama
* fix path issue
* run style
* fix issue
* update bash
* add ignore and tiny llama
* fix path issue
* run style
* fix issue
* update bash
* fix ddp issue
* add Qwen 1.5 32B
* refactor tokenization
* [FIX BUG] UnboundLocalError: cannot access local variable 'default_conversation' where it is not associated with a value (#5931)
* cannot access local variable 'default_conversation' where it is not associated with a value
set default value for 'default_conversation'
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* fix test data
* refactor evaluation
* remove real data path
* remove real data path
* Add n_fused as an input from native_module (#5894)
* [FIX BUG] convert env param to int in (#5934)
* [Hotfix] Fix ZeRO typo #5936
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [Feature] Add a switch to control whether the model checkpoint needs to be saved after each epoch ends (#5941)
* Add a switch to control whether the model checkpoint needs to be saved after each epoch ends
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* fix style
* fix style
* fix style
* [shardformer] hotfix attn mask (#5945)
* [shardformer] hotfix attn mask (#5947)
* [Feat] Distrifusion Acceleration Support for Diffusion Inference (#5895)
* Distrifusion Support source
* comp comm overlap optimization
* sd3 benchmark
* pixart distrifusion bug fix
* sd3 bug fix and benchmark
* generation bug fix
* naming fix
* add docstring, fix counter and shape error
* add reference
* readme and requirement
* [zero] hotfix update master params (#5951)
* [release] update version (#5952)
* [Chat] Fix lora (#5946)
* fix merging
* remove filepath
* fix style
* Update README.md (#5958)
* [hotfix] Remove unused plan section (#5957)
* remove readme
* fix readme
* update
* [test] add mixtral for sequence classification
* [test] add mixtral transformer test
* [moe] fix plugin
* [test] mixtra pp shard test
* [chore] handle non member group
* [zero] solve hang
* [test] pass mixtral shardformer test
* [moe] implement transit between non moe tp and ep
* [zero] solve hang
* [misc] solve booster hang by rename the variable
* solve hang when parallel mode = pp + dp
* [moe] implement submesh initialization
* [moe] add mixtral dp grad scaling when not all experts are activated
* [chore] manually revert unintended commit
* [chore] trivial fix
* [chore] arg pass & remove drop token
* [test] add mixtral modelling test
* [moe] implement tp
* [moe] test deepseek
* [moe] clean legacy code
* [Feature] MoE Ulysses Support (#5918)
* moe sp support
* moe sp bug solve
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [chore] minor fix
* [moe] init moe plugin comm setting with sp
* moe sp + ep bug fix
* [moe] finalize test (no pp)
* [moe] full test for deepseek and mixtral (pp + sp to fix)
* [chore] minor fix after rebase
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* [chore] solve moe ckpt test failure and some other arg pass failure
* [moe] remove ops
* [test] fix test: test_zero1_2
* [bug] fix: somehow logger hangs the program
* [moe] deepseek moe sp support
* [test] add check
* [deepseek] replace attn (a workaround for bug in transformers)
* [misc] skip redunant test
* [misc] remove debug/print code
* [moe] refactor mesh assignment
* Revert "[moe] implement submesh initialization"
This reverts commit 2f9bce6686d1415a83d5726dc5ff02222c742582.
* [chore] change moe_pg_mesh to private
* [misc] remove incompatible test config
* [misc] fix ci failure: change default value to false in moe plugin
* [misc] remove useless condition
* [chore] docstring
* [moe] remove force_overlap_comm flag and add warning instead
* [doc] add MoeHybridParallelPlugin docstring
* [moe] solve dp axis issue
* [chore] remove redundant test case, print string & reduce test tokens
* [feat] Dist Loader for Eval (#5950)
* support auto distributed data loader
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* support auto distributed data loader
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix tp error
* remove unused parameters
* remove unused
* update inference
* update docs
* update inference
---------
Co-authored-by: Michelle <qianranma8@gmail.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [lora] lora support hybrid parallel plugin (#5956)
* lora support hybrid plugin
* fix
* fix
* fix
* fix
* fp8 operators for compressed communication
cast_to_fp8, cast_from_fp8, all_reduce_fp8
* fix scaling algorithm in FP8 casting
* support fp8 communication in pipeline parallelism
* add fp8_communication flag in the script
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* shardformer fp8
* fix rebase
* remove all to all
* fix shardformer fp8 communication training degradation
* [fp8] support all-gather flat tensor (#5932)
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* Update low_level_optim.py
---------
Co-authored-by: YeAnbang <anbangy2@outlook.com>
Co-authored-by: Haze188 <haze188@qq.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Edenzzzz <wenxuan.tan@wisc.edu>
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: Runyu Lu <77330637+LRY89757@users.noreply.github.com>
Co-authored-by: Guangyao Zhang <xjtu521@qq.com>
Co-authored-by: YeAnbang <44796419+YeAnbang@users.noreply.github.com>
Co-authored-by: Hongxin Liu <lhx0217@gmail.com>
Co-authored-by: Stephan Kö <stephankoe@users.noreply.github.com>
Co-authored-by: アマデウス <kurisusnowdeng@users.noreply.github.com>
Co-authored-by: Tong Li <tong.li352711588@gmail.com>
Co-authored-by: zhurunhua <1281592874@qq.com>
Co-authored-by: Insu Jang <insujang@umich.edu>
Co-authored-by: Gao, Ruiyuan <905370712@qq.com>
Co-authored-by: hxwang <wang1570@e.ntu.edu.sg>
Co-authored-by: Michelle <qianranma8@gmail.com>
Co-authored-by: Wang Binluo <32676639+wangbluo@users.noreply.github.com>
Co-authored-by: HangXu <hangxu0304@gmail.com>
* [fp8]support all2all fp8 (#5953)
* support all2all fp8
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [fp8] add fp8 linear (#5967)
* [fp8] add fp8 linear
* [test] fix fp8 linear test condition
* [test] fix fp8 linear test condition
* [test] fix fp8 linear test condition
* [fp8] support fp8 amp for hybrid parallel plugin (#5975)
* [fp8] support fp8 amp for hybrid parallel plugin
* [test] add fp8 hook test
* [fp8] fix fp8 linear compatibility
* fix (#5976)
* [Feature]: support FP8 communication in DDP, FSDP, Gemini (#5928)
* support fp8_communication in the Torch DDP grad comm, FSDP grad comm, and FSDP params comm
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* implement communication hook for FSDP params all-gather
* added unit test for fp8 operators
* support fp8 communication in GeminiPlugin
* update training scripts to support fsdp and fp8 communication
* fixed some minor bugs observed in unit test
* add all_gather_into_tensor_flat_fp8
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* add skip the test if torch < 2.2.0
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* add skip the test if torch < 2.2.0
* add skip the test if torch < 2.2.0
* add fp8_comm flag
* rebase latest fp8 operators
* rebase latest fp8 operators
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [test ci]Feature/fp8 comm (#5981)
* fix
* fix
* fix
* [fp8] support gemini plugin (#5978)
* [fp8] refactor hook
* [fp8] support gemini plugin
* [example] add fp8 option for llama benchmark
* [fp8] use torch compile (torch >= 2.3.0) (#5979)
* [fp8] use torch compile (torch >= 2.4.0)
* [fp8] set use_fast_accum in linear
* [chore] formal version check
* [chore] fix sig
* [fp8]Moe support fp8 communication (#5977)
* fix
* support moe fp8
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
fix
fi
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [fp8] support hybrid parallel plugin (#5982)
* support fp8 comm for qwen2 model
* support fp8 comm for qwen2 model
* support fp8 comm for qwen2 model
* fp8
* fix
* bert and bloom
* chatglm and command
* gpt2,gptj,bert, falcon,blip2
* mistral,opy,sam,t5,vit,whisper
* fix
* fix
* fix
* [fp8] refactor fp8 linear with compile (#5993)
* [fp8] refactor fp8 linear with compile
* [fp8] fix linear test
* [fp8] fix linear test
* [fp8] support asynchronous FP8 communication (#5997)
* fix
* fix
* fix
* support async all2all
* support async op for all gather
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [fp8] update torch.compile for linear_fp8 to >= 2.4.0 (#6004)
* [fp8] linear perf enhancement
* [fp8]update reduce-scatter test (#6002)
* fix
* fix
* fix
* fix
* [fp8] add use_fp8 option for MoeHybridParallelPlugin (#6009)
* [fp8] zero support fp8 linear. (#6006)
* fix
* fix
* fix
* zero fp8
* zero fp8
* Update requirements.txt
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix the merge
* fix the merge
* fix the merge
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix the merge
* fix
* fix
* fix the merge
* fix
* fix
* fix
* fix
* fix
* fix the merge
* fix
* fix
* fix
* fix
* [fp8] Merge feature/fp8_comm to main branch of Colossalai (#6016)
* add SimPO
* fix dataloader
* remove debug code
* add orpo
* fix style
* fix colossalai, transformers version
* fix colossalai, transformers version
* fix colossalai, transformers version
* fix torch colossalai version
* update transformers version
* [shardformer] DeepseekMoE support (#5871)
* [Feature] deepseek moe expert parallel implement
* [misc] fix typo, remove redundant file (#5867)
* [misc] fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [Feature] deepseek support & unit test
* [misc] remove debug code & useless print
* [misc] fix typos (#5872)
* [Feature] remove modeling file, use auto config. (#5884)
* [misc] fix typos
* [Feature] deepseek support via auto model, remove modeling file
* [misc] delete useless file
* [misc] fix typos
* [Deepseek] remove redundant code (#5888)
* [misc] fix typos
* [Feature] deepseek support via auto model, remove modeling file
* [misc] delete useless file
* [misc] fix typos
* [misc] remove redundant code
* [Feature/deepseek] resolve comment. (#5889)
* [misc] fix typos
* [Feature] deepseek support via auto model, remove modeling file
* [misc] delete useless file
* [misc] fix typos
* [misc] remove redundant code
* [misc] mv module replacement into if branch
* [misc] add some warning message and modify some code in unit test
* [misc] fix typos
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [Hoxfix] Fix CUDA_DEVICE_MAX_CONNECTIONS for comm overlap
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [Feat] Diffusion Model(PixArtAlpha/StableDiffusion3) Support (#5838)
* Diffusion Model Inference support
* Stable Diffusion 3 Support
* pixartalpha support
* [HotFix] CI,import,requirements-test for #5838 (#5892)
* [Hot Fix] CI,import,requirements-test
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [Feature] Enable PP + SP for llama (#5868)
* fix cross-PP-stage position id length diff bug
* fix typo
* fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* use a one cross entropy func for all shardformer models
---------
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [ShardFormer] Add Ulysses Sequence Parallelism support for Command-R, Qwen2 and ChatGLM (#5897)
* add benchmark for sft, dpo, simpo, orpo. Add benchmarking result. Support lora with gradient checkpoint
* fix style
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix eval
* hotfix citation
* [zero] support all-gather overlap (#5898)
* [zero] support all-gather overlap
* [zero] add overlap all-gather flag
* [misc] fix typo
* [zero] update api
* fix orpo cross entropy loss
* [Auto Parallel]: Speed up intra-op plan generation by 44% (#5446)
* Remove unnecessary calls to deepcopy
* Build DimSpec's difference dict only once
This change considerably speeds up construction speed of DimSpec objects. The difference_dict is the same for each DimSpec object, so a single copy of it is enough.
* Fix documentation of DimSpec's difference method
* [ShardFormer] fix qwen2 sp (#5903)
* [compatibility] support torch 2.2 (#5875)
* Support Pytorch 2.2.2
* keep build_on_pr file and update .compatibility
* fix object_to_tensor usage when torch>=2.3.0 (#5820)
* [misc] support torch2.3 (#5893)
* [misc] support torch2.3
* [devops] update compatibility ci
* [devops] update compatibility ci
* [devops] add debug
* [devops] add debug
* [devops] add debug
* [devops] add debug
* [devops] remove debug
* [devops] remove debug
* [release] update version (#5912)
* [plugin] support all-gather overlap for hybrid parallel (#5919)
* [plugin] fixed all-gather overlap support for hybrid parallel
* add kto
* fix style, add kto data sample
* [Examples] Add lazy init to OPT and GPT examples (#5924)
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [ColossalChat] Hotfix for ColossalChat (#5910)
* add ignore and tiny llama
* fix path issue
* run style
* fix issue
* update bash
* add ignore and tiny llama
* fix path issue
* run style
* fix issue
* update bash
* fix ddp issue
* add Qwen 1.5 32B
* refactor tokenization
* [FIX BUG] UnboundLocalError: cannot access local variable 'default_conversation' where it is not associated with a value (#5931)
* cannot access local variable 'default_conversation' where it is not associated with a value
set default value for 'default_conversation'
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* fix test data
* refactor evaluation
* remove real data path
* remove real data path
* Add n_fused as an input from native_module (#5894)
* [FIX BUG] convert env param to int in (#5934)
* [Hotfix] Fix ZeRO typo #5936
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [Feature] Add a switch to control whether the model checkpoint needs to be saved after each epoch ends (#5941)
* Add a switch to control whether the model checkpoint needs to be saved after each epoch ends
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* fix style
* fix style
* fix style
* [shardformer] hotfix attn mask (#5945)
* [shardformer] hotfix attn mask (#5947)
* [Feat] Distrifusion Acceleration Support for Diffusion Inference (#5895)
* Distrifusion Support source
* comp comm overlap optimization
* sd3 benchmark
* pixart distrifusion bug fix
* sd3 bug fix and benchmark
* generation bug fix
* naming fix
* add docstring, fix counter and shape error
* add reference
* readme and requirement
* [zero] hotfix update master params (#5951)
* [release] update version (#5952)
* [Chat] Fix lora (#5946)
* fix merging
* remove filepath
* fix style
* Update README.md (#5958)
* [hotfix] Remove unused plan section (#5957)
* remove readme
* fix readme
* update
* [test] add mixtral for sequence classification
* [test] add mixtral transformer test
* [moe] fix plugin
* [test] mixtra pp shard test
* [chore] handle non member group
* [zero] solve hang
* [test] pass mixtral shardformer test
* [moe] implement transit between non moe tp and ep
* [zero] solve hang
* [misc] solve booster hang by rename the variable
* solve hang when parallel mode = pp + dp
* [moe] implement submesh initialization
* [moe] add mixtral dp grad scaling when not all experts are activated
* [chore] manually revert unintended commit
* [chore] trivial fix
* [chore] arg pass & remove drop token
* [test] add mixtral modelling test
* [moe] implement tp
* [moe] test deepseek
* [moe] clean legacy code
* [Feature] MoE Ulysses Support (#5918)
* moe sp support
* moe sp bug solve
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [chore] minor fix
* [moe] init moe plugin comm setting with sp
* moe sp + ep bug fix
* [moe] finalize test (no pp)
* [moe] full test for deepseek and mixtral (pp + sp to fix)
* [chore] minor fix after rebase
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* [chore] solve moe ckpt test failure and some other arg pass failure
* [moe] remove ops
* [test] fix test: test_zero1_2
* [bug] fix: somehow logger hangs the program
* [moe] deepseek moe sp support
* [test] add check
* [deepseek] replace attn (a workaround for bug in transformers)
* [misc] skip redunant test
* [misc] remove debug/print code
* [moe] refactor mesh assignment
* Revert "[moe] implement submesh initialization"
This reverts commit 2f9bce6686d1415a83d5726dc5ff02222c742582.
* [chore] change moe_pg_mesh to private
* [misc] remove incompatible test config
* [misc] fix ci failure: change default value to false in moe plugin
* [misc] remove useless condition
* [chore] docstring
* [moe] remove force_overlap_comm flag and add warning instead
* [doc] add MoeHybridParallelPlugin docstring
* [moe] solve dp axis issue
* [chore] remove redundant test case, print string & reduce test tokens
* [feat] Dist Loader for Eval (#5950)
* support auto distributed data loader
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* support auto distributed data loader
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix tp error
* remove unused parameters
* remove unused
* update inference
* update docs
* update inference
---------
Co-authored-by: Michelle <qianranma8@gmail.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [lora] lora support hybrid parallel plugin (#5956)
* lora support hybrid plugin
* fix
* fix
* fix
* fix
* Support overall loss, update KTO logging
* [Docs] clarify launch port
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [Hotfix] README link (#5966)
* update ignore
* update readme
* run style
* update readme
* [Hotfix] Avoid fused RMSnorm import error without apex (#5985)
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [Chat] fix readme (#5989)
* fix readme
* fix readme, tokenization fully tested
* fix readme, tokenization fully tested
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: root <root@notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9-0.notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9.colossal-ai.svc.cluster.local>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* fix sync condition (#6000)
* [plugin] add cast inputs option for zero (#6003)
* [pre-commit.ci] pre-commit autoupdate (#5995)
updates:
- [github.com/psf/black-pre-commit-mirror: 24.4.2 → 24.8.0](https://github.com/psf/black-pre-commit-mirror/compare/24.4.2...24.8.0)
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [misc] Bypass the huggingface bug to solve the mask mismatch problem (#5991)
* [Feature] Zigzag Ring attention (#5905)
* halfway
* fix cross-PP-stage position id length diff bug
* fix typo
* fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* unified cross entropy func for all shardformer models
* remove redundant lines
* add basic ring attn; debug cross entropy
* fwd bwd logic complete
* fwd bwd logic complete; add experimental triton rescale
* precision tests passed
* precision tests passed
* fix typos and remove misc files
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* add sp_mode to benchmark; fix varlen interface
* update softmax_lse shape by new interface
* change tester name
* remove buffer clone; support packed seq layout
* add varlen tests
* fix typo
* all tests passed
* add dkv_group; fix mask
* remove debug statements
---------
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [misc] update compatibility (#6008)
* [misc] update compatibility
* [misc] update requirements
* [devops] disable requirements cache
* [test] fix torch ddp test
* [test] fix rerun on address in use
* [test] fix lazy init
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix the merge
* fix the merge
* overlap kv comm with output rescale (#6017)
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* fix the merge
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix the merge
* fix
* fix
* fix the merge
* fix
* [misc] Use dist logger in plugins (#6011)
* use dist logger in plugins
* remove trash
* print on rank 0
---------
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* fix
* fix
* fix
* fix
* fix the merge
* fix
* fix
* fix
* fix
---------
Co-authored-by: YeAnbang <anbangy2@outlook.com>
Co-authored-by: Haze188 <haze188@qq.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Edenzzzz <wenxuan.tan@wisc.edu>
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: Runyu Lu <77330637+LRY89757@users.noreply.github.com>
Co-authored-by: Guangyao Zhang <xjtu521@qq.com>
Co-authored-by: YeAnbang <44796419+YeAnbang@users.noreply.github.com>
Co-authored-by: Hongxin Liu <lhx0217@gmail.com>
Co-authored-by: Stephan Kö <stephankoe@users.noreply.github.com>
Co-authored-by: アマデウス <kurisusnowdeng@users.noreply.github.com>
Co-authored-by: Tong Li <tong.li352711588@gmail.com>
Co-authored-by: zhurunhua <1281592874@qq.com>
Co-authored-by: Insu Jang <insujang@umich.edu>
Co-authored-by: Gao, Ruiyuan <905370712@qq.com>
Co-authored-by: hxwang <wang1570@e.ntu.edu.sg>
Co-authored-by: Michelle <qianranma8@gmail.com>
Co-authored-by: root <root@notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9-0.notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9.colossal-ai.svc.cluster.local>
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* Update train_dpo.py
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* Update low_level_zero_plugin.py
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* [CI] Remove triton version for compatibility bug; update req torch >=2.2 (#6018)
* remove triton version
* remove torch 2.2
* remove torch 2.1
* debug
* remove 2.1 build tests
* require torch >=2.2
---------
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [plugin] hotfix zero plugin (#6036)
* [plugin] hotfix zero plugin
* [plugin] hotfix zero plugin
* [Colossal-LLaMA] Refactor latest APIs (#6030)
* refactor latest code
* update api
* add dummy dataset
* update Readme
* add setup
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* update files
* add PP support
* update arguments
* update argument
* reorg folder
* update version
* remove IB infor
* update utils
* update readme
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* update save for zero
* update save
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* add apex
* update
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* add fused norm (#6038)
* [FP8] unsqueeze scale to make it compatible with torch.compile (#6040)
* [colossalai/checkpoint_io/...] fix bug in load_state_dict_into_model; format error msg (#6020)
* fix bug in load_state_dict_into_model; format error msg
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* Update utils.py
to support checking missing_keys
* Update general_checkpoint_io.py
fix bug in missing_keys error message
* retrigger tests
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [Hotfix] Remove deprecated install (#6042)
* remove deprecated install
* remove unused folder
* [fp8] optimize all-gather (#6043)
* [fp8] optimize all-gather
* [fp8] fix all gather fp8 ring
* [fp8] enable compile
* [fp8] fix all gather fp8 ring
* [fp8] fix linear hook (#6046)
* [fp8] disable all_to_all_fp8 in intranode (#6045)
* enhance all_to_all_fp8 with internode comm control
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* disable some fp8 ops due to performance issue
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [release] update version (#6041)
* [release] update version
* [devops] update comp test
* [devops] update comp test debug
* [devops] debug comp test
* [devops] debug comp test
* [devops] debug comp test
* [devops] debug comp test
* [devops] debug comp test
* [Feature] Split cross-entropy computation in SP (#5959)
* halfway
* fix cross-PP-stage position id length diff bug
* fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* unified cross entropy func for all shardformer models
* remove redundant lines
* add basic ring attn; debug cross entropy
* fwd bwd logic complete
* fwd bwd logic complete; add experimental triton rescale
* precision tests passed
* precision tests passed
* fix typos and remove misc files
* update softmax_lse shape by new interface
* change tester name
* remove buffer clone; support packed seq layout
* add varlen tests
* fix typo
* all tests passed
* add dkv_group; fix mask
* remove debug statements
* adapt chatglm, command-R, qwen
* debug
* halfway
* fix cross-PP-stage position id length diff bug
* fix typo
* fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* unified cross entropy func for all shardformer models
* remove redundant lines
* add basic ring attn; debug cross entropy
* fwd bwd logic complete
* fwd bwd logic complete; add experimental triton rescale
* precision tests passed
* precision tests passed
* fix typos and remove misc files
* add sp_mode to benchmark; fix varlen interface
* update softmax_lse shape by new interface
* add varlen tests
* fix typo
* all tests passed
* add dkv_group; fix mask
* remove debug statements
* add comments
* q1 index only once
* remove events to simplify stream sync
* simplify forward/backward logic
* 2d ring forward passed
* 2d ring backward passed
* fixes
* fix ring attn loss
* 2D ring backward + llama passed
* merge
* update logger
* fix typo
* rebase
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix typo
* remove typos
* fixes
* support GPT
---------
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [hotfix] moe hybrid parallelism benchmark & follow-up fix (#6048)
* [example] pass use_fp8_comm flag to all plugins
* [example] add mixtral benchmark
* [moe] refine assertion and check
* [moe] fix mixtral & add more tests
* [moe] consider checking dp * sp group and moe_dp_group
* [mixtral] remove gate tp & add more tests
* [deepseek] fix tp & sp for deepseek
* [mixtral] minor fix
* [deepseek] add deepseek benchmark
* [fp8] hotfix backward hook (#6053)
* [fp8] hotfix backward hook
* [fp8] hotfix pipeline loss accumulation
* [doc] update sp doc (#6055)
* update sp doc
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* fix the sp
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix the attn
* fix
* fix
* fix
* fix
* [zerobubble]Support ZeroBubble Pipeline (#6034)
* [feat] add zerobubble pp (just a frame now); add POC test for dx_dw; add test for zerobubble;
* [feat] add dw test;
* [fix] fix weight not close;
* [update] update text;
* [feat] add test run_fwd_bwd automatic scheduling;
* [feat] split communication and calculation; fix pop empty send_bwd_buffer error;
* [feat] add test for p & p grad;
* [feat] add comments for ZBV func;
* [fix] rm useless assign and comments;
* [fix] fix ci test; add pytest;
* [feat] add run_fwd_bwd_with_microbatch (replace input) & test; add p&p.grad assert close test & all pass;
* [feat] add apply v_schedule graph; p & p.grad assert err exist;
* [fix] update
* [feat] fix ci; add assert;
* [feat] fix poc format
* [feat] fix func name & ci; add comments;
* [fix] fix poc test; add comments in poc;
* [feat] add optim backward_b_by_grad
* [feat] fix optimizer bwd b & w; support return accum loss & output
* [feat] add fwd_bwd_step, run_fwd_only;
* [fix] fix optim bwd; add license for v_schedule; remove redundant attributes; fix schedule loop "while"--> "for"; add communication dict;
* [fix] fix communication_map;
* [feat] update test; rm comments;
* [fix] rm zbv in hybridplugin
* [fix] fix optim bwd;
* [fix] fix optim bwd;
* [fix] rm output.data after send fwd;
* [fix] fix bwd step if condition; remove useless comments and format info;
* [fix] fix detach output & release output;
* [fix] rm requir_grad for output;
* [fix] fix requir grad position and detach position and input&output local buffer append position;
* [feat] add memory assertation;
* [fix] fix mem check;
* [fix] mem assertation'
* [fix] fix mem assertation
* [fix] fix mem; use a new model shape; only assert mem less and equal than theo;
* [fix] fix model zoo import;
* [fix] fix redundant detach & clone; add buffer assertation in the end;
* [fix] add output_obj_grad assert None at bwd b step; replace input_obj.require_grad_ with treemap;
* [fix] update optim state dict assert (include param group & state); fix mem assert after add optim;
* [fix] add testcase with microbatch 4;
* [fp8] fix missing fp8_comm flag in mixtral (#6057)
* fix
* fix
* fix
* [fp8] Disable all_gather intranode. Disable Redundant all_gather fp8 (#6059)
* all_gather only internode, fix pytest
* fix cuda arch <89 compile pytest error
* fix pytest failure
* disable all_gather_into_tensor_flat_fp8
* fix fp8 format
* fix pytest
* fix conversations
* fix chunk tuple to list
* [doc] FP8 training and communication document (#6050)
* Add FP8 training and communication document
* add fp8 docstring for plugins
* fix typo
* fix typo
* fix
* fix
* [moe] add parallel strategy for shared_expert && fix test for deepseek (#6063)
* [ColossalEval] support for vllm (#6056)
* support vllm
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* modify vllm and update readme
* run pre-commit
* remove dupilicated lines and refine code
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* update param name
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* refine code
* update readme
* refine code
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [release] update version (#6062)
* [feat] add zerobubble pp (just a frame now); add POC test for dx_dw; add test for zerobubble;
* [update] update text;
* [feat] add test run_fwd_bwd automatic scheduling;
* [feat] fix poc format
* [fix] fix poc test; add comments in poc;
* [feat] add optim backward_b_by_grad
* [feat] fix optimizer bwd b & w; support return accum loss & output
* [fix] fix optim bwd; add license for v_schedule; remove redundant attributes; fix schedule loop "while"--> "for"; add communication dict;
* [feat] update test; rm comments;
* [fix] rm zbv in hybridplugin
* [fix] fix optim bwd;
* [fix] fix optim bwd;
* [fix] rm output.data after send fwd;
* [fix] fix bwd step if condition; remove useless comments and format info;
* [fix] fix mem check;
* [fix] fix mem assertation
* [fix] fix mem; use a new model shape; only assert mem less and equal than theo;
* [fix] fix model zoo import;
* [feat] moehybrid support zerobubble;
* [fix] fix zerobubble pp for shardformer type input;
* [fix] fix require_grad & deallocate call;
* [fix] fix mem assert;
* [fix] fix fwd branch, fwd pass both micro_batch & internal_inputs'
* [fix] fix pipeline util func deallocate --> release_tensor_data; fix bwd_b loss bwd branch;
* [fix] fix zerobubble; support shardformer model type;
* [fix] fix test_pipeline_utils ci;
* [plugin] hybrid support zero bubble pipeline (#6060)
* hybrid support zbv
* fix
fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* Update zero_bubble_pp.py
* fix
* fix-ci
* fix
[pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* fix
* [zerobubble]Support ZeroBubble Pipeline (#6034)
* [feat] add zerobubble pp (just a frame now); add POC test for dx_dw; add test for zerobubble;
* [feat] add dw test;
* [fix] fix weight not close;
* [update] update text;
* [feat] add test run_fwd_bwd automatic scheduling;
* [feat] split communication and calculation; fix pop empty send_bwd_buffer error;
* [feat] add test for p & p grad;
* [feat] add comments for ZBV func;
* [fix] rm useless assign and comments;
* [fix] fix ci test; add pytest;
* [feat] add run_fwd_bwd_with_microbatch (replace input) & test; add p&p.grad assert close test & all pass;
* [feat] add apply v_schedule graph; p & p.grad assert err exist;
* [fix] update
* [feat] fix ci; add assert;
* [feat] fix poc format
* [feat] fix func name & ci; add comments;
* [fix] fix poc test; add comments in poc;
* [feat] add optim backward_b_by_grad
* [feat] fix optimizer bwd b & w; support return accum loss & output
* [feat] add fwd_bwd_step, run_fwd_only;
* [fix] fix optim bwd; add license for v_schedule; remove redundant attributes; fix schedule loop "while"--> "for"; add communication dict;
* [fix] fix communication_map;
* [feat] update test; rm comments;
* [fix] rm zbv in hybridplugin
* [fix] fix optim bwd;
* [fix] fix optim bwd;
* [fix] rm output.data after send fwd;
* [fix] fix bwd step if condition; remove useless comments and format info;
* [fix] fix detach output & release output;
* [fix] rm requir_grad for output;
* [fix] fix requir grad position and detach position and input&output local buffer append position;
* [feat] add memory assertation;
* [fix] fix mem check;
* [fix] mem assertation'
* [fix] fix mem assertation
* [fix] fix mem; use a new model shape; only assert mem less and equal than theo;
* [fix] fix model zoo import;
* [fix] fix redundant detach & clone; add buffer assertation in the end;
* [fix] add output_obj_grad assert None at bwd b step; replace input_obj.require_grad_ with treemap;
* [fix] update optim state dict assert (include param group & state); fix mem assert after add optim;
* [fix] add testcase with microbatch 4;
* hybrid support zbv
* fix
fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* Update zero_bubble_pp.py
* fix
* fix-ci
* fix
[pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* fix
* fix
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* fix
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: duanjunwen <935724073@qq.com>
* [feat] add zerobubble pp (just a frame now); add POC test for dx_dw; add test for zerobubble;
* [update] update text;
* [feat] add test run_fwd_bwd automatic scheduling;
* [feat] fix poc format
* [fix] fix poc test; add comments in poc;
* [feat] add optim backward_b_by_grad
* [feat] fix optimizer bwd b & w; support return accum loss & output
* [fix] fix optim bwd; add license for v_schedule; remove redundant attributes; fix schedule loop "while"--> "for"; add communication dict;
* [feat] update test; rm comments;
* [fix] fix optim bwd;
* [fix] fix optim bwd;
* [fix] rm output.data after send fwd;
* [fix] fix bwd step if condition; remove useless comments and format info;
* [fix] fix mem check;
* [fix] fix mem assertation
* [fix] fix mem; use a new model shape; only assert mem less and equal than theo;
* [fix] fix model zoo import;
* [fix] fix mem assert;
* [fix] fix fwd branch, fwd pass both micro_batch & internal_inputs'
* [plugin] hybrid support zero bubble pipeline (#6060)
* hybrid support zbv
* fix
fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* Update zero_bubble_pp.py
* fix
* fix-ci
* fix
[pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* fix
* [zerobubble]Support ZeroBubble Pipeline (#6034)
* [feat] add zerobubble pp (just a frame now); add POC test for dx_dw; add test for zerobubble;
* [feat] add dw test;
* [fix] fix weight not close;
* [update] update text;
* [feat] add test run_fwd_bwd automatic scheduling;
* [feat] split communication and calculation; fix pop empty send_bwd_buffer error;
* [feat] add test for p & p grad;
* [feat] add comments for ZBV func;
* [fix] rm useless assign and comments;
* [fix] fix ci test; add pytest;
* [feat] add run_fwd_bwd_with_microbatch (replace input) & test; add p&p.grad assert close test & all pass;
* [feat] add apply v_schedule graph; p & p.grad assert err exist;
* [fix] update
* [feat] fix ci; add assert;
* [feat] fix poc format
* [feat] fix func name & ci; add comments;
* [fix] fix poc test; add comments in poc;
* [feat] add optim backward_b_by_grad
* [feat] fix optimizer bwd b & w; support return accum loss & output
* [feat] add fwd_bwd_step, run_fwd_only;
* [fix] fix optim bwd; add license for v_schedule; remove redundant attributes; fix schedule loop "while"--> "for"; add communication dict;
* [fix] fix communication_map;
* [feat] update test; rm comments;
* [fix] rm zbv in hybridplugin
* [fix] fix optim bwd;
* [fix] fix optim bwd;
* [fix] rm output.data after send fwd;
* [fix] fix bwd step if condition; remove useless comments and format info;
* [fix] fix detach output & release output;
* [fix] rm requir_grad for output;
* [fix] fix requir grad position and detach position and input&output local buffer append position;
* [feat] add memory assertation;
* [fix] fix mem check;
* [fix] mem assertation'
* [fix] fix mem assertation
* [fix] fix mem; use a new model shape; only assert mem less and equal than theo;
* [fix] fix model zoo import;
* [fix] fix redundant detach & clone; add buffer assertation in the end;
* [fix] add output_obj_grad assert None at bwd b step; replace input_obj.require_grad_ with treemap;
* [fix] update optim state dict assert (include param group & state); fix mem assert after add optim;
* [fix] add testcase with microbatch 4;
* hybrid support zbv
* fix
fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* Update zero_bubble_pp.py
* fix
* fix-ci
* fix
[pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* fix
* fix
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* fix
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: duanjunwen <935724073@qq.com>
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: HangXu <hangxu0304@gmail.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: GuangyaoZhang <xjtu521@qq.com>
Co-authored-by: Hongxin Liu <lhx0217@gmail.com>
Co-authored-by: YeAnbang <anbangy2@outlook.com>
Co-authored-by: Haze188 <haze188@qq.com>
Co-authored-by: Edenzzzz <wenxuan.tan@wisc.edu>
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: Runyu Lu <77330637+LRY89757@users.noreply.github.com>
Co-authored-by: YeAnbang <44796419+YeAnbang@users.noreply.github.com>
Co-authored-by: Stephan Kö <stephankoe@users.noreply.github.com>
Co-authored-by: アマデウス <kurisusnowdeng@users.noreply.github.com>
Co-authored-by: Tong Li <tong.li352711588@gmail.com>
Co-authored-by: zhurunhua <1281592874@qq.com>
Co-authored-by: Insu Jang <insujang@umich.edu>
Co-authored-by: Gao, Ruiyuan <905370712@qq.com>
Co-authored-by: hxwang <wang1570@e.ntu.edu.sg>
Co-authored-by: Michelle <qianranma8@gmail.com>
Co-authored-by: Wang Binluo <32676639+wangbluo@users.noreply.github.com>
Co-authored-by: wangbluo <2538539015@qq.com>
Co-authored-by: root <root@notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9-0.notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9.colossal-ai.svc.cluster.local>
Co-authored-by: duanjunwen <935724073@qq.com>
Co-authored-by: Camille Zhong <44392324+Camille7777@users.noreply.github.com>
2024-10-08 07:58:00 +00:00
|
|
|
from ..layer import dist_cross_entropy
|
|
|
|
|
2023-08-04 06:55:31 +00:00
|
|
|
|
2023-08-07 08:41:07 +00:00
|
|
|
def get_flash_core_attention_forward():
|
|
|
|
from .chatglm2_6b.modeling_chatglm import CoreAttention
|
|
|
|
|
|
|
|
def forward(self: CoreAttention, query_layer, key_layer, value_layer, attention_mask):
|
2024-03-27 03:19:32 +00:00
|
|
|
query_layer, key_layer, value_layer = [k.permute(1, 2, 0, 3) for k in [query_layer, key_layer, value_layer]]
|
|
|
|
if attention_mask is None and query_layer.shape[2] == key_layer.shape[2]:
|
|
|
|
attention_mask_type = AttnMaskType.CAUSAL
|
|
|
|
attn_bias = torch.zeros(
|
|
|
|
query_layer.shape[0],
|
|
|
|
1,
|
|
|
|
query_layer.shape[2],
|
|
|
|
key_layer.shape[2],
|
|
|
|
dtype=query_layer.dtype,
|
|
|
|
device=query_layer.device,
|
2023-09-19 06:20:26 +00:00
|
|
|
)
|
2024-03-27 03:19:32 +00:00
|
|
|
temp_mask = (
|
2024-04-25 06:41:17 +00:00
|
|
|
torch.ones(
|
|
|
|
query_layer.shape[2],
|
|
|
|
key_layer.shape[2],
|
|
|
|
dtype=torch.bool,
|
|
|
|
device=query_layer.device,
|
|
|
|
)
|
2024-03-27 03:19:32 +00:00
|
|
|
.tril(diagonal=0)
|
|
|
|
.expand(query_layer.shape[0], 1, -1, -1)
|
2023-09-19 06:20:26 +00:00
|
|
|
)
|
2024-03-27 03:19:32 +00:00
|
|
|
attn_bias.masked_fill_(temp_mask.logical_not(), torch.finfo(query_layer.dtype).min)
|
|
|
|
else:
|
|
|
|
attention_mask_type = AttnMaskType.CUSTOM
|
|
|
|
if attention_mask is not None:
|
|
|
|
attn_bias = torch.zeros_like(attention_mask, dtype=query_layer.dtype)
|
|
|
|
attn_bias.masked_fill_(attention_mask, torch.finfo(query_layer.dtype).min)
|
|
|
|
dropout_p = self.attention_dropout.p if self.training else 0.0
|
|
|
|
context_layer = ColoAttention.attention(
|
|
|
|
query_layer,
|
|
|
|
key_layer,
|
|
|
|
value_layer,
|
|
|
|
attention_mask=attn_bias,
|
|
|
|
attention_mask_type=attention_mask_type,
|
|
|
|
dropout_p=dropout_p,
|
2024-04-25 06:41:17 +00:00
|
|
|
scale=1.0 / self.norm_factor,
|
2024-03-27 03:19:32 +00:00
|
|
|
)
|
|
|
|
context_layer = context_layer.permute(2, 0, 1, 3)
|
|
|
|
new_context_layer_shape = context_layer.size()[:-2] + (self.hidden_size_per_partition,)
|
|
|
|
context_layer = context_layer.reshape(*new_context_layer_shape)
|
2023-08-07 08:41:07 +00:00
|
|
|
return context_layer
|
|
|
|
|
|
|
|
return forward
|
|
|
|
|
|
|
|
|
|
|
|
def get_jit_fused_glm_block_forward():
|
|
|
|
from .chatglm2_6b.modeling_chatglm import GLMBlock
|
|
|
|
|
|
|
|
def forward(
|
|
|
|
self: GLMBlock,
|
|
|
|
hidden_states,
|
|
|
|
attention_mask,
|
|
|
|
rotary_pos_emb,
|
|
|
|
kv_cache=None,
|
|
|
|
use_cache=True,
|
|
|
|
):
|
|
|
|
# hidden_states: [s, b, h]
|
|
|
|
# Layer norm at the beginning of the transformer layer.
|
|
|
|
layernorm_output = self.input_layernorm(hidden_states)
|
|
|
|
# Self attention.
|
|
|
|
attention_output, kv_cache = self.self_attention(
|
|
|
|
layernorm_output,
|
|
|
|
attention_mask,
|
|
|
|
rotary_pos_emb,
|
|
|
|
kv_cache=kv_cache,
|
|
|
|
use_cache=use_cache,
|
|
|
|
)
|
|
|
|
|
|
|
|
# Residual connection.
|
|
|
|
if self.apply_residual_connection_post_layernorm:
|
|
|
|
residual = layernorm_output
|
|
|
|
else:
|
|
|
|
residual = hidden_states
|
|
|
|
|
|
|
|
layernorm_input = self.dropout_add(attention_output, residual, self.hidden_dropout, self.training)
|
|
|
|
|
|
|
|
# Layer norm post the self attention.
|
|
|
|
layernorm_output = self.post_attention_layernorm(layernorm_input)
|
|
|
|
|
|
|
|
# MLP.
|
|
|
|
mlp_output = self.mlp(layernorm_output)
|
|
|
|
|
|
|
|
# Second residual connection.
|
|
|
|
if self.apply_residual_connection_post_layernorm:
|
|
|
|
residual = layernorm_output
|
|
|
|
else:
|
|
|
|
residual = layernorm_input
|
|
|
|
|
|
|
|
output = self.dropout_add(mlp_output, residual, self.hidden_dropout, self.training)
|
|
|
|
|
|
|
|
return output, kv_cache
|
|
|
|
|
|
|
|
return forward
|
|
|
|
|
|
|
|
|
2023-08-04 06:55:31 +00:00
|
|
|
class ChatGLMPipelineForwards:
|
2023-09-19 06:20:26 +00:00
|
|
|
"""
|
2023-08-04 06:55:31 +00:00
|
|
|
This class serves as a micro library for ChatGLM model forwards under pipeline parallelism.
|
2023-09-19 06:20:26 +00:00
|
|
|
"""
|
2023-08-04 06:55:31 +00:00
|
|
|
|
|
|
|
@staticmethod
|
|
|
|
def chatglm_model_forward(
|
2024-04-25 06:41:17 +00:00
|
|
|
self: "ChatGLMModel",
|
2023-08-04 06:55:31 +00:00
|
|
|
input_ids,
|
|
|
|
position_ids: Optional[torch.Tensor] = None,
|
|
|
|
attention_mask: Optional[torch.BoolTensor] = None,
|
|
|
|
full_attention_mask: Optional[torch.BoolTensor] = None,
|
|
|
|
past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None,
|
|
|
|
inputs_embeds: Optional[torch.Tensor] = None,
|
|
|
|
use_cache: Optional[bool] = None,
|
|
|
|
output_hidden_states: Optional[bool] = None,
|
|
|
|
return_dict: Optional[bool] = None,
|
|
|
|
stage_manager: Optional[PipelineStageManager] = None,
|
|
|
|
hidden_states: Optional[torch.FloatTensor] = None,
|
|
|
|
stage_index: Optional[List[int]] = None,
|
2023-08-22 15:59:31 +00:00
|
|
|
shard_config: ShardConfig = None,
|
[zerobubble] rebase main (#6075)
* fp8 operators for compressed communication
cast_to_fp8, cast_from_fp8, all_reduce_fp8
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix typo
* fix scaling algorithm in FP8 casting
* support fp8 communication in pipeline parallelism
* add fp8_communication flag in the script
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* shardformer fp8
* fix rebase
* remove all to all
* fix shardformer fp8 communication training degradation
* [fp8] support all-gather flat tensor (#5932)
* [fp8] add fp8 comm for low level zero
* [test] add zero fp8 test case
* [Feature] llama shardformer fp8 support (#5938)
* add llama shardformer fp8
* Llama Shardformer Parity
* fix typo
* fix all reduce
* fix pytest failure
* fix reduce op and move function to fp8.py
* fix typo
* [FP8] rebase main (#5963)
* add SimPO
* fix dataloader
* remove debug code
* add orpo
* fix style
* fix colossalai, transformers version
* fix colossalai, transformers version
* fix colossalai, transformers version
* fix torch colossalai version
* update transformers version
* [shardformer] DeepseekMoE support (#5871)
* [Feature] deepseek moe expert parallel implement
* [misc] fix typo, remove redundant file (#5867)
* [misc] fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [Feature] deepseek support & unit test
* [misc] remove debug code & useless print
* [misc] fix typos (#5872)
* [Feature] remove modeling file, use auto config. (#5884)
* [misc] fix typos
* [Feature] deepseek support via auto model, remove modeling file
* [misc] delete useless file
* [misc] fix typos
* [Deepseek] remove redundant code (#5888)
* [misc] fix typos
* [Feature] deepseek support via auto model, remove modeling file
* [misc] delete useless file
* [misc] fix typos
* [misc] remove redundant code
* [Feature/deepseek] resolve comment. (#5889)
* [misc] fix typos
* [Feature] deepseek support via auto model, remove modeling file
* [misc] delete useless file
* [misc] fix typos
* [misc] remove redundant code
* [misc] mv module replacement into if branch
* [misc] add some warning message and modify some code in unit test
* [misc] fix typos
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [Hoxfix] Fix CUDA_DEVICE_MAX_CONNECTIONS for comm overlap
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [Feat] Diffusion Model(PixArtAlpha/StableDiffusion3) Support (#5838)
* Diffusion Model Inference support
* Stable Diffusion 3 Support
* pixartalpha support
* [HotFix] CI,import,requirements-test for #5838 (#5892)
* [Hot Fix] CI,import,requirements-test
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [Feature] Enable PP + SP for llama (#5868)
* fix cross-PP-stage position id length diff bug
* fix typo
* fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* use a one cross entropy func for all shardformer models
---------
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [ShardFormer] Add Ulysses Sequence Parallelism support for Command-R, Qwen2 and ChatGLM (#5897)
* add benchmark for sft, dpo, simpo, orpo. Add benchmarking result. Support lora with gradient checkpoint
* fix style
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix eval
* hotfix citation
* [zero] support all-gather overlap (#5898)
* [zero] support all-gather overlap
* [zero] add overlap all-gather flag
* [misc] fix typo
* [zero] update api
* fix orpo cross entropy loss
* [Auto Parallel]: Speed up intra-op plan generation by 44% (#5446)
* Remove unnecessary calls to deepcopy
* Build DimSpec's difference dict only once
This change considerably speeds up construction speed of DimSpec objects. The difference_dict is the same for each DimSpec object, so a single copy of it is enough.
* Fix documentation of DimSpec's difference method
* [ShardFormer] fix qwen2 sp (#5903)
* [compatibility] support torch 2.2 (#5875)
* Support Pytorch 2.2.2
* keep build_on_pr file and update .compatibility
* fix object_to_tensor usage when torch>=2.3.0 (#5820)
* [misc] support torch2.3 (#5893)
* [misc] support torch2.3
* [devops] update compatibility ci
* [devops] update compatibility ci
* [devops] add debug
* [devops] add debug
* [devops] add debug
* [devops] add debug
* [devops] remove debug
* [devops] remove debug
* [release] update version (#5912)
* [plugin] support all-gather overlap for hybrid parallel (#5919)
* [plugin] fixed all-gather overlap support for hybrid parallel
* add kto
* fix style, add kto data sample
* [Examples] Add lazy init to OPT and GPT examples (#5924)
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [ColossalChat] Hotfix for ColossalChat (#5910)
* add ignore and tiny llama
* fix path issue
* run style
* fix issue
* update bash
* add ignore and tiny llama
* fix path issue
* run style
* fix issue
* update bash
* fix ddp issue
* add Qwen 1.5 32B
* refactor tokenization
* [FIX BUG] UnboundLocalError: cannot access local variable 'default_conversation' where it is not associated with a value (#5931)
* cannot access local variable 'default_conversation' where it is not associated with a value
set default value for 'default_conversation'
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* fix test data
* refactor evaluation
* remove real data path
* remove real data path
* Add n_fused as an input from native_module (#5894)
* [FIX BUG] convert env param to int in (#5934)
* [Hotfix] Fix ZeRO typo #5936
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [Feature] Add a switch to control whether the model checkpoint needs to be saved after each epoch ends (#5941)
* Add a switch to control whether the model checkpoint needs to be saved after each epoch ends
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* fix style
* fix style
* fix style
* [shardformer] hotfix attn mask (#5945)
* [shardformer] hotfix attn mask (#5947)
* [Feat] Distrifusion Acceleration Support for Diffusion Inference (#5895)
* Distrifusion Support source
* comp comm overlap optimization
* sd3 benchmark
* pixart distrifusion bug fix
* sd3 bug fix and benchmark
* generation bug fix
* naming fix
* add docstring, fix counter and shape error
* add reference
* readme and requirement
* [zero] hotfix update master params (#5951)
* [release] update version (#5952)
* [Chat] Fix lora (#5946)
* fix merging
* remove filepath
* fix style
* Update README.md (#5958)
* [hotfix] Remove unused plan section (#5957)
* remove readme
* fix readme
* update
* [test] add mixtral for sequence classification
* [test] add mixtral transformer test
* [moe] fix plugin
* [test] mixtra pp shard test
* [chore] handle non member group
* [zero] solve hang
* [test] pass mixtral shardformer test
* [moe] implement transit between non moe tp and ep
* [zero] solve hang
* [misc] solve booster hang by rename the variable
* solve hang when parallel mode = pp + dp
* [moe] implement submesh initialization
* [moe] add mixtral dp grad scaling when not all experts are activated
* [chore] manually revert unintended commit
* [chore] trivial fix
* [chore] arg pass & remove drop token
* [test] add mixtral modelling test
* [moe] implement tp
* [moe] test deepseek
* [moe] clean legacy code
* [Feature] MoE Ulysses Support (#5918)
* moe sp support
* moe sp bug solve
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [chore] minor fix
* [moe] init moe plugin comm setting with sp
* moe sp + ep bug fix
* [moe] finalize test (no pp)
* [moe] full test for deepseek and mixtral (pp + sp to fix)
* [chore] minor fix after rebase
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* [chore] solve moe ckpt test failure and some other arg pass failure
* [moe] remove ops
* [test] fix test: test_zero1_2
* [bug] fix: somehow logger hangs the program
* [moe] deepseek moe sp support
* [test] add check
* [deepseek] replace attn (a workaround for bug in transformers)
* [misc] skip redunant test
* [misc] remove debug/print code
* [moe] refactor mesh assignment
* Revert "[moe] implement submesh initialization"
This reverts commit 2f9bce6686d1415a83d5726dc5ff02222c742582.
* [chore] change moe_pg_mesh to private
* [misc] remove incompatible test config
* [misc] fix ci failure: change default value to false in moe plugin
* [misc] remove useless condition
* [chore] docstring
* [moe] remove force_overlap_comm flag and add warning instead
* [doc] add MoeHybridParallelPlugin docstring
* [moe] solve dp axis issue
* [chore] remove redundant test case, print string & reduce test tokens
* [feat] Dist Loader for Eval (#5950)
* support auto distributed data loader
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* support auto distributed data loader
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix tp error
* remove unused parameters
* remove unused
* update inference
* update docs
* update inference
---------
Co-authored-by: Michelle <qianranma8@gmail.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [lora] lora support hybrid parallel plugin (#5956)
* lora support hybrid plugin
* fix
* fix
* fix
* fix
* fp8 operators for compressed communication
cast_to_fp8, cast_from_fp8, all_reduce_fp8
* fix scaling algorithm in FP8 casting
* support fp8 communication in pipeline parallelism
* add fp8_communication flag in the script
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* shardformer fp8
* fix rebase
* remove all to all
* fix shardformer fp8 communication training degradation
* [fp8] support all-gather flat tensor (#5932)
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* Update low_level_optim.py
---------
Co-authored-by: YeAnbang <anbangy2@outlook.com>
Co-authored-by: Haze188 <haze188@qq.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Edenzzzz <wenxuan.tan@wisc.edu>
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: Runyu Lu <77330637+LRY89757@users.noreply.github.com>
Co-authored-by: Guangyao Zhang <xjtu521@qq.com>
Co-authored-by: YeAnbang <44796419+YeAnbang@users.noreply.github.com>
Co-authored-by: Hongxin Liu <lhx0217@gmail.com>
Co-authored-by: Stephan Kö <stephankoe@users.noreply.github.com>
Co-authored-by: アマデウス <kurisusnowdeng@users.noreply.github.com>
Co-authored-by: Tong Li <tong.li352711588@gmail.com>
Co-authored-by: zhurunhua <1281592874@qq.com>
Co-authored-by: Insu Jang <insujang@umich.edu>
Co-authored-by: Gao, Ruiyuan <905370712@qq.com>
Co-authored-by: hxwang <wang1570@e.ntu.edu.sg>
Co-authored-by: Michelle <qianranma8@gmail.com>
Co-authored-by: Wang Binluo <32676639+wangbluo@users.noreply.github.com>
Co-authored-by: HangXu <hangxu0304@gmail.com>
* [fp8]support all2all fp8 (#5953)
* support all2all fp8
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [fp8] add fp8 linear (#5967)
* [fp8] add fp8 linear
* [test] fix fp8 linear test condition
* [test] fix fp8 linear test condition
* [test] fix fp8 linear test condition
* [fp8] support fp8 amp for hybrid parallel plugin (#5975)
* [fp8] support fp8 amp for hybrid parallel plugin
* [test] add fp8 hook test
* [fp8] fix fp8 linear compatibility
* fix (#5976)
* [Feature]: support FP8 communication in DDP, FSDP, Gemini (#5928)
* support fp8_communication in the Torch DDP grad comm, FSDP grad comm, and FSDP params comm
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* implement communication hook for FSDP params all-gather
* added unit test for fp8 operators
* support fp8 communication in GeminiPlugin
* update training scripts to support fsdp and fp8 communication
* fixed some minor bugs observed in unit test
* add all_gather_into_tensor_flat_fp8
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* add skip the test if torch < 2.2.0
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* add skip the test if torch < 2.2.0
* add skip the test if torch < 2.2.0
* add fp8_comm flag
* rebase latest fp8 operators
* rebase latest fp8 operators
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [test ci]Feature/fp8 comm (#5981)
* fix
* fix
* fix
* [fp8] support gemini plugin (#5978)
* [fp8] refactor hook
* [fp8] support gemini plugin
* [example] add fp8 option for llama benchmark
* [fp8] use torch compile (torch >= 2.3.0) (#5979)
* [fp8] use torch compile (torch >= 2.4.0)
* [fp8] set use_fast_accum in linear
* [chore] formal version check
* [chore] fix sig
* [fp8]Moe support fp8 communication (#5977)
* fix
* support moe fp8
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
fix
fi
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [fp8] support hybrid parallel plugin (#5982)
* support fp8 comm for qwen2 model
* support fp8 comm for qwen2 model
* support fp8 comm for qwen2 model
* fp8
* fix
* bert and bloom
* chatglm and command
* gpt2,gptj,bert, falcon,blip2
* mistral,opy,sam,t5,vit,whisper
* fix
* fix
* fix
* [fp8] refactor fp8 linear with compile (#5993)
* [fp8] refactor fp8 linear with compile
* [fp8] fix linear test
* [fp8] fix linear test
* [fp8] support asynchronous FP8 communication (#5997)
* fix
* fix
* fix
* support async all2all
* support async op for all gather
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [fp8] update torch.compile for linear_fp8 to >= 2.4.0 (#6004)
* [fp8] linear perf enhancement
* [fp8]update reduce-scatter test (#6002)
* fix
* fix
* fix
* fix
* [fp8] add use_fp8 option for MoeHybridParallelPlugin (#6009)
* [fp8] zero support fp8 linear. (#6006)
* fix
* fix
* fix
* zero fp8
* zero fp8
* Update requirements.txt
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix the merge
* fix the merge
* fix the merge
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix the merge
* fix
* fix
* fix the merge
* fix
* fix
* fix
* fix
* fix
* fix the merge
* fix
* fix
* fix
* fix
* [fp8] Merge feature/fp8_comm to main branch of Colossalai (#6016)
* add SimPO
* fix dataloader
* remove debug code
* add orpo
* fix style
* fix colossalai, transformers version
* fix colossalai, transformers version
* fix colossalai, transformers version
* fix torch colossalai version
* update transformers version
* [shardformer] DeepseekMoE support (#5871)
* [Feature] deepseek moe expert parallel implement
* [misc] fix typo, remove redundant file (#5867)
* [misc] fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [Feature] deepseek support & unit test
* [misc] remove debug code & useless print
* [misc] fix typos (#5872)
* [Feature] remove modeling file, use auto config. (#5884)
* [misc] fix typos
* [Feature] deepseek support via auto model, remove modeling file
* [misc] delete useless file
* [misc] fix typos
* [Deepseek] remove redundant code (#5888)
* [misc] fix typos
* [Feature] deepseek support via auto model, remove modeling file
* [misc] delete useless file
* [misc] fix typos
* [misc] remove redundant code
* [Feature/deepseek] resolve comment. (#5889)
* [misc] fix typos
* [Feature] deepseek support via auto model, remove modeling file
* [misc] delete useless file
* [misc] fix typos
* [misc] remove redundant code
* [misc] mv module replacement into if branch
* [misc] add some warning message and modify some code in unit test
* [misc] fix typos
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [Hoxfix] Fix CUDA_DEVICE_MAX_CONNECTIONS for comm overlap
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [Feat] Diffusion Model(PixArtAlpha/StableDiffusion3) Support (#5838)
* Diffusion Model Inference support
* Stable Diffusion 3 Support
* pixartalpha support
* [HotFix] CI,import,requirements-test for #5838 (#5892)
* [Hot Fix] CI,import,requirements-test
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [Feature] Enable PP + SP for llama (#5868)
* fix cross-PP-stage position id length diff bug
* fix typo
* fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* use a one cross entropy func for all shardformer models
---------
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [ShardFormer] Add Ulysses Sequence Parallelism support for Command-R, Qwen2 and ChatGLM (#5897)
* add benchmark for sft, dpo, simpo, orpo. Add benchmarking result. Support lora with gradient checkpoint
* fix style
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix eval
* hotfix citation
* [zero] support all-gather overlap (#5898)
* [zero] support all-gather overlap
* [zero] add overlap all-gather flag
* [misc] fix typo
* [zero] update api
* fix orpo cross entropy loss
* [Auto Parallel]: Speed up intra-op plan generation by 44% (#5446)
* Remove unnecessary calls to deepcopy
* Build DimSpec's difference dict only once
This change considerably speeds up construction speed of DimSpec objects. The difference_dict is the same for each DimSpec object, so a single copy of it is enough.
* Fix documentation of DimSpec's difference method
* [ShardFormer] fix qwen2 sp (#5903)
* [compatibility] support torch 2.2 (#5875)
* Support Pytorch 2.2.2
* keep build_on_pr file and update .compatibility
* fix object_to_tensor usage when torch>=2.3.0 (#5820)
* [misc] support torch2.3 (#5893)
* [misc] support torch2.3
* [devops] update compatibility ci
* [devops] update compatibility ci
* [devops] add debug
* [devops] add debug
* [devops] add debug
* [devops] add debug
* [devops] remove debug
* [devops] remove debug
* [release] update version (#5912)
* [plugin] support all-gather overlap for hybrid parallel (#5919)
* [plugin] fixed all-gather overlap support for hybrid parallel
* add kto
* fix style, add kto data sample
* [Examples] Add lazy init to OPT and GPT examples (#5924)
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [ColossalChat] Hotfix for ColossalChat (#5910)
* add ignore and tiny llama
* fix path issue
* run style
* fix issue
* update bash
* add ignore and tiny llama
* fix path issue
* run style
* fix issue
* update bash
* fix ddp issue
* add Qwen 1.5 32B
* refactor tokenization
* [FIX BUG] UnboundLocalError: cannot access local variable 'default_conversation' where it is not associated with a value (#5931)
* cannot access local variable 'default_conversation' where it is not associated with a value
set default value for 'default_conversation'
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* fix test data
* refactor evaluation
* remove real data path
* remove real data path
* Add n_fused as an input from native_module (#5894)
* [FIX BUG] convert env param to int in (#5934)
* [Hotfix] Fix ZeRO typo #5936
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [Feature] Add a switch to control whether the model checkpoint needs to be saved after each epoch ends (#5941)
* Add a switch to control whether the model checkpoint needs to be saved after each epoch ends
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* fix style
* fix style
* fix style
* [shardformer] hotfix attn mask (#5945)
* [shardformer] hotfix attn mask (#5947)
* [Feat] Distrifusion Acceleration Support for Diffusion Inference (#5895)
* Distrifusion Support source
* comp comm overlap optimization
* sd3 benchmark
* pixart distrifusion bug fix
* sd3 bug fix and benchmark
* generation bug fix
* naming fix
* add docstring, fix counter and shape error
* add reference
* readme and requirement
* [zero] hotfix update master params (#5951)
* [release] update version (#5952)
* [Chat] Fix lora (#5946)
* fix merging
* remove filepath
* fix style
* Update README.md (#5958)
* [hotfix] Remove unused plan section (#5957)
* remove readme
* fix readme
* update
* [test] add mixtral for sequence classification
* [test] add mixtral transformer test
* [moe] fix plugin
* [test] mixtra pp shard test
* [chore] handle non member group
* [zero] solve hang
* [test] pass mixtral shardformer test
* [moe] implement transit between non moe tp and ep
* [zero] solve hang
* [misc] solve booster hang by rename the variable
* solve hang when parallel mode = pp + dp
* [moe] implement submesh initialization
* [moe] add mixtral dp grad scaling when not all experts are activated
* [chore] manually revert unintended commit
* [chore] trivial fix
* [chore] arg pass & remove drop token
* [test] add mixtral modelling test
* [moe] implement tp
* [moe] test deepseek
* [moe] clean legacy code
* [Feature] MoE Ulysses Support (#5918)
* moe sp support
* moe sp bug solve
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [chore] minor fix
* [moe] init moe plugin comm setting with sp
* moe sp + ep bug fix
* [moe] finalize test (no pp)
* [moe] full test for deepseek and mixtral (pp + sp to fix)
* [chore] minor fix after rebase
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* [chore] solve moe ckpt test failure and some other arg pass failure
* [moe] remove ops
* [test] fix test: test_zero1_2
* [bug] fix: somehow logger hangs the program
* [moe] deepseek moe sp support
* [test] add check
* [deepseek] replace attn (a workaround for bug in transformers)
* [misc] skip redunant test
* [misc] remove debug/print code
* [moe] refactor mesh assignment
* Revert "[moe] implement submesh initialization"
This reverts commit 2f9bce6686d1415a83d5726dc5ff02222c742582.
* [chore] change moe_pg_mesh to private
* [misc] remove incompatible test config
* [misc] fix ci failure: change default value to false in moe plugin
* [misc] remove useless condition
* [chore] docstring
* [moe] remove force_overlap_comm flag and add warning instead
* [doc] add MoeHybridParallelPlugin docstring
* [moe] solve dp axis issue
* [chore] remove redundant test case, print string & reduce test tokens
* [feat] Dist Loader for Eval (#5950)
* support auto distributed data loader
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* support auto distributed data loader
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix tp error
* remove unused parameters
* remove unused
* update inference
* update docs
* update inference
---------
Co-authored-by: Michelle <qianranma8@gmail.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [lora] lora support hybrid parallel plugin (#5956)
* lora support hybrid plugin
* fix
* fix
* fix
* fix
* Support overall loss, update KTO logging
* [Docs] clarify launch port
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [Hotfix] README link (#5966)
* update ignore
* update readme
* run style
* update readme
* [Hotfix] Avoid fused RMSnorm import error without apex (#5985)
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [Chat] fix readme (#5989)
* fix readme
* fix readme, tokenization fully tested
* fix readme, tokenization fully tested
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: root <root@notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9-0.notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9.colossal-ai.svc.cluster.local>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* fix sync condition (#6000)
* [plugin] add cast inputs option for zero (#6003)
* [pre-commit.ci] pre-commit autoupdate (#5995)
updates:
- [github.com/psf/black-pre-commit-mirror: 24.4.2 → 24.8.0](https://github.com/psf/black-pre-commit-mirror/compare/24.4.2...24.8.0)
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [misc] Bypass the huggingface bug to solve the mask mismatch problem (#5991)
* [Feature] Zigzag Ring attention (#5905)
* halfway
* fix cross-PP-stage position id length diff bug
* fix typo
* fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* unified cross entropy func for all shardformer models
* remove redundant lines
* add basic ring attn; debug cross entropy
* fwd bwd logic complete
* fwd bwd logic complete; add experimental triton rescale
* precision tests passed
* precision tests passed
* fix typos and remove misc files
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* add sp_mode to benchmark; fix varlen interface
* update softmax_lse shape by new interface
* change tester name
* remove buffer clone; support packed seq layout
* add varlen tests
* fix typo
* all tests passed
* add dkv_group; fix mask
* remove debug statements
---------
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [misc] update compatibility (#6008)
* [misc] update compatibility
* [misc] update requirements
* [devops] disable requirements cache
* [test] fix torch ddp test
* [test] fix rerun on address in use
* [test] fix lazy init
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix the merge
* fix the merge
* overlap kv comm with output rescale (#6017)
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* fix the merge
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix the merge
* fix
* fix
* fix the merge
* fix
* [misc] Use dist logger in plugins (#6011)
* use dist logger in plugins
* remove trash
* print on rank 0
---------
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* fix
* fix
* fix
* fix
* fix the merge
* fix
* fix
* fix
* fix
---------
Co-authored-by: YeAnbang <anbangy2@outlook.com>
Co-authored-by: Haze188 <haze188@qq.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Edenzzzz <wenxuan.tan@wisc.edu>
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: Runyu Lu <77330637+LRY89757@users.noreply.github.com>
Co-authored-by: Guangyao Zhang <xjtu521@qq.com>
Co-authored-by: YeAnbang <44796419+YeAnbang@users.noreply.github.com>
Co-authored-by: Hongxin Liu <lhx0217@gmail.com>
Co-authored-by: Stephan Kö <stephankoe@users.noreply.github.com>
Co-authored-by: アマデウス <kurisusnowdeng@users.noreply.github.com>
Co-authored-by: Tong Li <tong.li352711588@gmail.com>
Co-authored-by: zhurunhua <1281592874@qq.com>
Co-authored-by: Insu Jang <insujang@umich.edu>
Co-authored-by: Gao, Ruiyuan <905370712@qq.com>
Co-authored-by: hxwang <wang1570@e.ntu.edu.sg>
Co-authored-by: Michelle <qianranma8@gmail.com>
Co-authored-by: root <root@notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9-0.notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9.colossal-ai.svc.cluster.local>
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* Update train_dpo.py
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* Update low_level_zero_plugin.py
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* [CI] Remove triton version for compatibility bug; update req torch >=2.2 (#6018)
* remove triton version
* remove torch 2.2
* remove torch 2.1
* debug
* remove 2.1 build tests
* require torch >=2.2
---------
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [plugin] hotfix zero plugin (#6036)
* [plugin] hotfix zero plugin
* [plugin] hotfix zero plugin
* [Colossal-LLaMA] Refactor latest APIs (#6030)
* refactor latest code
* update api
* add dummy dataset
* update Readme
* add setup
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* update files
* add PP support
* update arguments
* update argument
* reorg folder
* update version
* remove IB infor
* update utils
* update readme
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* update save for zero
* update save
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* add apex
* update
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* add fused norm (#6038)
* [FP8] unsqueeze scale to make it compatible with torch.compile (#6040)
* [colossalai/checkpoint_io/...] fix bug in load_state_dict_into_model; format error msg (#6020)
* fix bug in load_state_dict_into_model; format error msg
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* Update utils.py
to support checking missing_keys
* Update general_checkpoint_io.py
fix bug in missing_keys error message
* retrigger tests
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [Hotfix] Remove deprecated install (#6042)
* remove deprecated install
* remove unused folder
* [fp8] optimize all-gather (#6043)
* [fp8] optimize all-gather
* [fp8] fix all gather fp8 ring
* [fp8] enable compile
* [fp8] fix all gather fp8 ring
* [fp8] fix linear hook (#6046)
* [fp8] disable all_to_all_fp8 in intranode (#6045)
* enhance all_to_all_fp8 with internode comm control
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* disable some fp8 ops due to performance issue
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [release] update version (#6041)
* [release] update version
* [devops] update comp test
* [devops] update comp test debug
* [devops] debug comp test
* [devops] debug comp test
* [devops] debug comp test
* [devops] debug comp test
* [devops] debug comp test
* [Feature] Split cross-entropy computation in SP (#5959)
* halfway
* fix cross-PP-stage position id length diff bug
* fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* unified cross entropy func for all shardformer models
* remove redundant lines
* add basic ring attn; debug cross entropy
* fwd bwd logic complete
* fwd bwd logic complete; add experimental triton rescale
* precision tests passed
* precision tests passed
* fix typos and remove misc files
* update softmax_lse shape by new interface
* change tester name
* remove buffer clone; support packed seq layout
* add varlen tests
* fix typo
* all tests passed
* add dkv_group; fix mask
* remove debug statements
* adapt chatglm, command-R, qwen
* debug
* halfway
* fix cross-PP-stage position id length diff bug
* fix typo
* fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* unified cross entropy func for all shardformer models
* remove redundant lines
* add basic ring attn; debug cross entropy
* fwd bwd logic complete
* fwd bwd logic complete; add experimental triton rescale
* precision tests passed
* precision tests passed
* fix typos and remove misc files
* add sp_mode to benchmark; fix varlen interface
* update softmax_lse shape by new interface
* add varlen tests
* fix typo
* all tests passed
* add dkv_group; fix mask
* remove debug statements
* add comments
* q1 index only once
* remove events to simplify stream sync
* simplify forward/backward logic
* 2d ring forward passed
* 2d ring backward passed
* fixes
* fix ring attn loss
* 2D ring backward + llama passed
* merge
* update logger
* fix typo
* rebase
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix typo
* remove typos
* fixes
* support GPT
---------
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [hotfix] moe hybrid parallelism benchmark & follow-up fix (#6048)
* [example] pass use_fp8_comm flag to all plugins
* [example] add mixtral benchmark
* [moe] refine assertion and check
* [moe] fix mixtral & add more tests
* [moe] consider checking dp * sp group and moe_dp_group
* [mixtral] remove gate tp & add more tests
* [deepseek] fix tp & sp for deepseek
* [mixtral] minor fix
* [deepseek] add deepseek benchmark
* [fp8] hotfix backward hook (#6053)
* [fp8] hotfix backward hook
* [fp8] hotfix pipeline loss accumulation
* [doc] update sp doc (#6055)
* update sp doc
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* fix the sp
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix the attn
* fix
* fix
* fix
* fix
* [zerobubble]Support ZeroBubble Pipeline (#6034)
* [feat] add zerobubble pp (just a frame now); add POC test for dx_dw; add test for zerobubble;
* [feat] add dw test;
* [fix] fix weight not close;
* [update] update text;
* [feat] add test run_fwd_bwd automatic scheduling;
* [feat] split communication and calculation; fix pop empty send_bwd_buffer error;
* [feat] add test for p & p grad;
* [feat] add comments for ZBV func;
* [fix] rm useless assign and comments;
* [fix] fix ci test; add pytest;
* [feat] add run_fwd_bwd_with_microbatch (replace input) & test; add p&p.grad assert close test & all pass;
* [feat] add apply v_schedule graph; p & p.grad assert err exist;
* [fix] update
* [feat] fix ci; add assert;
* [feat] fix poc format
* [feat] fix func name & ci; add comments;
* [fix] fix poc test; add comments in poc;
* [feat] add optim backward_b_by_grad
* [feat] fix optimizer bwd b & w; support return accum loss & output
* [feat] add fwd_bwd_step, run_fwd_only;
* [fix] fix optim bwd; add license for v_schedule; remove redundant attributes; fix schedule loop "while"--> "for"; add communication dict;
* [fix] fix communication_map;
* [feat] update test; rm comments;
* [fix] rm zbv in hybridplugin
* [fix] fix optim bwd;
* [fix] fix optim bwd;
* [fix] rm output.data after send fwd;
* [fix] fix bwd step if condition; remove useless comments and format info;
* [fix] fix detach output & release output;
* [fix] rm requir_grad for output;
* [fix] fix requir grad position and detach position and input&output local buffer append position;
* [feat] add memory assertation;
* [fix] fix mem check;
* [fix] mem assertation'
* [fix] fix mem assertation
* [fix] fix mem; use a new model shape; only assert mem less and equal than theo;
* [fix] fix model zoo import;
* [fix] fix redundant detach & clone; add buffer assertation in the end;
* [fix] add output_obj_grad assert None at bwd b step; replace input_obj.require_grad_ with treemap;
* [fix] update optim state dict assert (include param group & state); fix mem assert after add optim;
* [fix] add testcase with microbatch 4;
* [fp8] fix missing fp8_comm flag in mixtral (#6057)
* fix
* fix
* fix
* [fp8] Disable all_gather intranode. Disable Redundant all_gather fp8 (#6059)
* all_gather only internode, fix pytest
* fix cuda arch <89 compile pytest error
* fix pytest failure
* disable all_gather_into_tensor_flat_fp8
* fix fp8 format
* fix pytest
* fix conversations
* fix chunk tuple to list
* [doc] FP8 training and communication document (#6050)
* Add FP8 training and communication document
* add fp8 docstring for plugins
* fix typo
* fix typo
* fix
* fix
* [moe] add parallel strategy for shared_expert && fix test for deepseek (#6063)
* [ColossalEval] support for vllm (#6056)
* support vllm
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* modify vllm and update readme
* run pre-commit
* remove dupilicated lines and refine code
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* update param name
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* refine code
* update readme
* refine code
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [release] update version (#6062)
* [feat] add zerobubble pp (just a frame now); add POC test for dx_dw; add test for zerobubble;
* [update] update text;
* [feat] add test run_fwd_bwd automatic scheduling;
* [feat] fix poc format
* [fix] fix poc test; add comments in poc;
* [feat] add optim backward_b_by_grad
* [feat] fix optimizer bwd b & w; support return accum loss & output
* [fix] fix optim bwd; add license for v_schedule; remove redundant attributes; fix schedule loop "while"--> "for"; add communication dict;
* [feat] update test; rm comments;
* [fix] rm zbv in hybridplugin
* [fix] fix optim bwd;
* [fix] fix optim bwd;
* [fix] rm output.data after send fwd;
* [fix] fix bwd step if condition; remove useless comments and format info;
* [fix] fix mem check;
* [fix] fix mem assertation
* [fix] fix mem; use a new model shape; only assert mem less and equal than theo;
* [fix] fix model zoo import;
* [feat] moehybrid support zerobubble;
* [fix] fix zerobubble pp for shardformer type input;
* [fix] fix require_grad & deallocate call;
* [fix] fix mem assert;
* [fix] fix fwd branch, fwd pass both micro_batch & internal_inputs'
* [fix] fix pipeline util func deallocate --> release_tensor_data; fix bwd_b loss bwd branch;
* [fix] fix zerobubble; support shardformer model type;
* [fix] fix test_pipeline_utils ci;
* [plugin] hybrid support zero bubble pipeline (#6060)
* hybrid support zbv
* fix
fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* Update zero_bubble_pp.py
* fix
* fix-ci
* fix
[pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* fix
* [zerobubble]Support ZeroBubble Pipeline (#6034)
* [feat] add zerobubble pp (just a frame now); add POC test for dx_dw; add test for zerobubble;
* [feat] add dw test;
* [fix] fix weight not close;
* [update] update text;
* [feat] add test run_fwd_bwd automatic scheduling;
* [feat] split communication and calculation; fix pop empty send_bwd_buffer error;
* [feat] add test for p & p grad;
* [feat] add comments for ZBV func;
* [fix] rm useless assign and comments;
* [fix] fix ci test; add pytest;
* [feat] add run_fwd_bwd_with_microbatch (replace input) & test; add p&p.grad assert close test & all pass;
* [feat] add apply v_schedule graph; p & p.grad assert err exist;
* [fix] update
* [feat] fix ci; add assert;
* [feat] fix poc format
* [feat] fix func name & ci; add comments;
* [fix] fix poc test; add comments in poc;
* [feat] add optim backward_b_by_grad
* [feat] fix optimizer bwd b & w; support return accum loss & output
* [feat] add fwd_bwd_step, run_fwd_only;
* [fix] fix optim bwd; add license for v_schedule; remove redundant attributes; fix schedule loop "while"--> "for"; add communication dict;
* [fix] fix communication_map;
* [feat] update test; rm comments;
* [fix] rm zbv in hybridplugin
* [fix] fix optim bwd;
* [fix] fix optim bwd;
* [fix] rm output.data after send fwd;
* [fix] fix bwd step if condition; remove useless comments and format info;
* [fix] fix detach output & release output;
* [fix] rm requir_grad for output;
* [fix] fix requir grad position and detach position and input&output local buffer append position;
* [feat] add memory assertation;
* [fix] fix mem check;
* [fix] mem assertation'
* [fix] fix mem assertation
* [fix] fix mem; use a new model shape; only assert mem less and equal than theo;
* [fix] fix model zoo import;
* [fix] fix redundant detach & clone; add buffer assertation in the end;
* [fix] add output_obj_grad assert None at bwd b step; replace input_obj.require_grad_ with treemap;
* [fix] update optim state dict assert (include param group & state); fix mem assert after add optim;
* [fix] add testcase with microbatch 4;
* hybrid support zbv
* fix
fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* Update zero_bubble_pp.py
* fix
* fix-ci
* fix
[pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* fix
* fix
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* fix
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: duanjunwen <935724073@qq.com>
* [feat] add zerobubble pp (just a frame now); add POC test for dx_dw; add test for zerobubble;
* [update] update text;
* [feat] add test run_fwd_bwd automatic scheduling;
* [feat] fix poc format
* [fix] fix poc test; add comments in poc;
* [feat] add optim backward_b_by_grad
* [feat] fix optimizer bwd b & w; support return accum loss & output
* [fix] fix optim bwd; add license for v_schedule; remove redundant attributes; fix schedule loop "while"--> "for"; add communication dict;
* [feat] update test; rm comments;
* [fix] fix optim bwd;
* [fix] fix optim bwd;
* [fix] rm output.data after send fwd;
* [fix] fix bwd step if condition; remove useless comments and format info;
* [fix] fix mem check;
* [fix] fix mem assertation
* [fix] fix mem; use a new model shape; only assert mem less and equal than theo;
* [fix] fix model zoo import;
* [fix] fix mem assert;
* [fix] fix fwd branch, fwd pass both micro_batch & internal_inputs'
* [plugin] hybrid support zero bubble pipeline (#6060)
* hybrid support zbv
* fix
fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* Update zero_bubble_pp.py
* fix
* fix-ci
* fix
[pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* fix
* [zerobubble]Support ZeroBubble Pipeline (#6034)
* [feat] add zerobubble pp (just a frame now); add POC test for dx_dw; add test for zerobubble;
* [feat] add dw test;
* [fix] fix weight not close;
* [update] update text;
* [feat] add test run_fwd_bwd automatic scheduling;
* [feat] split communication and calculation; fix pop empty send_bwd_buffer error;
* [feat] add test for p & p grad;
* [feat] add comments for ZBV func;
* [fix] rm useless assign and comments;
* [fix] fix ci test; add pytest;
* [feat] add run_fwd_bwd_with_microbatch (replace input) & test; add p&p.grad assert close test & all pass;
* [feat] add apply v_schedule graph; p & p.grad assert err exist;
* [fix] update
* [feat] fix ci; add assert;
* [feat] fix poc format
* [feat] fix func name & ci; add comments;
* [fix] fix poc test; add comments in poc;
* [feat] add optim backward_b_by_grad
* [feat] fix optimizer bwd b & w; support return accum loss & output
* [feat] add fwd_bwd_step, run_fwd_only;
* [fix] fix optim bwd; add license for v_schedule; remove redundant attributes; fix schedule loop "while"--> "for"; add communication dict;
* [fix] fix communication_map;
* [feat] update test; rm comments;
* [fix] rm zbv in hybridplugin
* [fix] fix optim bwd;
* [fix] fix optim bwd;
* [fix] rm output.data after send fwd;
* [fix] fix bwd step if condition; remove useless comments and format info;
* [fix] fix detach output & release output;
* [fix] rm requir_grad for output;
* [fix] fix requir grad position and detach position and input&output local buffer append position;
* [feat] add memory assertation;
* [fix] fix mem check;
* [fix] mem assertation'
* [fix] fix mem assertation
* [fix] fix mem; use a new model shape; only assert mem less and equal than theo;
* [fix] fix model zoo import;
* [fix] fix redundant detach & clone; add buffer assertation in the end;
* [fix] add output_obj_grad assert None at bwd b step; replace input_obj.require_grad_ with treemap;
* [fix] update optim state dict assert (include param group & state); fix mem assert after add optim;
* [fix] add testcase with microbatch 4;
* hybrid support zbv
* fix
fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* Update zero_bubble_pp.py
* fix
* fix-ci
* fix
[pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* fix
* fix
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* fix
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: duanjunwen <935724073@qq.com>
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: HangXu <hangxu0304@gmail.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: GuangyaoZhang <xjtu521@qq.com>
Co-authored-by: Hongxin Liu <lhx0217@gmail.com>
Co-authored-by: YeAnbang <anbangy2@outlook.com>
Co-authored-by: Haze188 <haze188@qq.com>
Co-authored-by: Edenzzzz <wenxuan.tan@wisc.edu>
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: Runyu Lu <77330637+LRY89757@users.noreply.github.com>
Co-authored-by: YeAnbang <44796419+YeAnbang@users.noreply.github.com>
Co-authored-by: Stephan Kö <stephankoe@users.noreply.github.com>
Co-authored-by: アマデウス <kurisusnowdeng@users.noreply.github.com>
Co-authored-by: Tong Li <tong.li352711588@gmail.com>
Co-authored-by: zhurunhua <1281592874@qq.com>
Co-authored-by: Insu Jang <insujang@umich.edu>
Co-authored-by: Gao, Ruiyuan <905370712@qq.com>
Co-authored-by: hxwang <wang1570@e.ntu.edu.sg>
Co-authored-by: Michelle <qianranma8@gmail.com>
Co-authored-by: Wang Binluo <32676639+wangbluo@users.noreply.github.com>
Co-authored-by: wangbluo <2538539015@qq.com>
Co-authored-by: root <root@notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9-0.notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9.colossal-ai.svc.cluster.local>
Co-authored-by: duanjunwen <935724073@qq.com>
Co-authored-by: Camille Zhong <44392324+Camille7777@users.noreply.github.com>
2024-10-08 07:58:00 +00:00
|
|
|
force_sp_output_gather: Optional[bool] = True,
|
2023-08-04 06:55:31 +00:00
|
|
|
):
|
|
|
|
logger = logging.get_logger(__name__)
|
2023-09-19 06:20:26 +00:00
|
|
|
output_hidden_states = (
|
|
|
|
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
|
|
|
|
)
|
2023-08-04 06:55:31 +00:00
|
|
|
use_cache = use_cache if use_cache is not None else self.config.use_cache
|
|
|
|
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
2023-08-14 09:43:33 +00:00
|
|
|
# TODO(jianghai): left the recording kv-value tensors as () or None type, this feature may be added in the future.
|
2023-08-04 06:55:31 +00:00
|
|
|
if past_key_values:
|
2023-09-19 06:20:26 +00:00
|
|
|
logger.warning_once("Non-empty past_key_values is not supported for pipeline models at the moment.")
|
2023-08-04 06:55:31 +00:00
|
|
|
past_key_values = None
|
|
|
|
if output_hidden_states:
|
2023-09-19 06:20:26 +00:00
|
|
|
logger.warning_once("output_hidden_states=True is not supported for pipeline models at the moment.")
|
2023-08-04 06:55:31 +00:00
|
|
|
output_hidden_states = False
|
|
|
|
if use_cache:
|
2023-09-19 06:20:26 +00:00
|
|
|
logger.warning_once("use_cache=True is not supported for pipeline models at the moment.")
|
2023-08-04 06:55:31 +00:00
|
|
|
use_cache = False
|
|
|
|
if stage_manager.is_first_stage():
|
|
|
|
batch_size, seq_length = input_ids.shape
|
|
|
|
if inputs_embeds is None:
|
|
|
|
inputs_embeds = self.embedding(input_ids)
|
|
|
|
hidden_states = inputs_embeds
|
|
|
|
else:
|
|
|
|
seq_length, batch_size = hidden_states.shape[:2]
|
|
|
|
if self.pre_seq_len is not None:
|
|
|
|
if past_key_values is None:
|
2023-09-19 06:20:26 +00:00
|
|
|
past_key_values = self.get_prompt(
|
2024-03-27 03:19:32 +00:00
|
|
|
batch_size=batch_size,
|
|
|
|
device=input_ids.device,
|
|
|
|
dtype=inputs_embeds.dtype,
|
2023-09-19 06:20:26 +00:00
|
|
|
)
|
2023-08-04 06:55:31 +00:00
|
|
|
if attention_mask is not None:
|
2023-09-19 06:20:26 +00:00
|
|
|
attention_mask = torch.cat(
|
2024-03-27 03:19:32 +00:00
|
|
|
[
|
|
|
|
attention_mask.new_ones((batch_size, self.pre_seq_len)),
|
|
|
|
attention_mask,
|
|
|
|
],
|
|
|
|
dim=-1,
|
2023-09-19 06:20:26 +00:00
|
|
|
)
|
2023-08-04 06:55:31 +00:00
|
|
|
if full_attention_mask is None:
|
|
|
|
if (attention_mask is not None and not attention_mask.all()) or (past_key_values and seq_length != 1):
|
|
|
|
full_attention_mask = self.get_masks(input_ids, past_key_values, padding_mask=attention_mask)
|
[zerobubble] rebase main (#6075)
* fp8 operators for compressed communication
cast_to_fp8, cast_from_fp8, all_reduce_fp8
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix typo
* fix scaling algorithm in FP8 casting
* support fp8 communication in pipeline parallelism
* add fp8_communication flag in the script
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* shardformer fp8
* fix rebase
* remove all to all
* fix shardformer fp8 communication training degradation
* [fp8] support all-gather flat tensor (#5932)
* [fp8] add fp8 comm for low level zero
* [test] add zero fp8 test case
* [Feature] llama shardformer fp8 support (#5938)
* add llama shardformer fp8
* Llama Shardformer Parity
* fix typo
* fix all reduce
* fix pytest failure
* fix reduce op and move function to fp8.py
* fix typo
* [FP8] rebase main (#5963)
* add SimPO
* fix dataloader
* remove debug code
* add orpo
* fix style
* fix colossalai, transformers version
* fix colossalai, transformers version
* fix colossalai, transformers version
* fix torch colossalai version
* update transformers version
* [shardformer] DeepseekMoE support (#5871)
* [Feature] deepseek moe expert parallel implement
* [misc] fix typo, remove redundant file (#5867)
* [misc] fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [Feature] deepseek support & unit test
* [misc] remove debug code & useless print
* [misc] fix typos (#5872)
* [Feature] remove modeling file, use auto config. (#5884)
* [misc] fix typos
* [Feature] deepseek support via auto model, remove modeling file
* [misc] delete useless file
* [misc] fix typos
* [Deepseek] remove redundant code (#5888)
* [misc] fix typos
* [Feature] deepseek support via auto model, remove modeling file
* [misc] delete useless file
* [misc] fix typos
* [misc] remove redundant code
* [Feature/deepseek] resolve comment. (#5889)
* [misc] fix typos
* [Feature] deepseek support via auto model, remove modeling file
* [misc] delete useless file
* [misc] fix typos
* [misc] remove redundant code
* [misc] mv module replacement into if branch
* [misc] add some warning message and modify some code in unit test
* [misc] fix typos
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [Hoxfix] Fix CUDA_DEVICE_MAX_CONNECTIONS for comm overlap
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [Feat] Diffusion Model(PixArtAlpha/StableDiffusion3) Support (#5838)
* Diffusion Model Inference support
* Stable Diffusion 3 Support
* pixartalpha support
* [HotFix] CI,import,requirements-test for #5838 (#5892)
* [Hot Fix] CI,import,requirements-test
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [Feature] Enable PP + SP for llama (#5868)
* fix cross-PP-stage position id length diff bug
* fix typo
* fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* use a one cross entropy func for all shardformer models
---------
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [ShardFormer] Add Ulysses Sequence Parallelism support for Command-R, Qwen2 and ChatGLM (#5897)
* add benchmark for sft, dpo, simpo, orpo. Add benchmarking result. Support lora with gradient checkpoint
* fix style
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix eval
* hotfix citation
* [zero] support all-gather overlap (#5898)
* [zero] support all-gather overlap
* [zero] add overlap all-gather flag
* [misc] fix typo
* [zero] update api
* fix orpo cross entropy loss
* [Auto Parallel]: Speed up intra-op plan generation by 44% (#5446)
* Remove unnecessary calls to deepcopy
* Build DimSpec's difference dict only once
This change considerably speeds up construction speed of DimSpec objects. The difference_dict is the same for each DimSpec object, so a single copy of it is enough.
* Fix documentation of DimSpec's difference method
* [ShardFormer] fix qwen2 sp (#5903)
* [compatibility] support torch 2.2 (#5875)
* Support Pytorch 2.2.2
* keep build_on_pr file and update .compatibility
* fix object_to_tensor usage when torch>=2.3.0 (#5820)
* [misc] support torch2.3 (#5893)
* [misc] support torch2.3
* [devops] update compatibility ci
* [devops] update compatibility ci
* [devops] add debug
* [devops] add debug
* [devops] add debug
* [devops] add debug
* [devops] remove debug
* [devops] remove debug
* [release] update version (#5912)
* [plugin] support all-gather overlap for hybrid parallel (#5919)
* [plugin] fixed all-gather overlap support for hybrid parallel
* add kto
* fix style, add kto data sample
* [Examples] Add lazy init to OPT and GPT examples (#5924)
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [ColossalChat] Hotfix for ColossalChat (#5910)
* add ignore and tiny llama
* fix path issue
* run style
* fix issue
* update bash
* add ignore and tiny llama
* fix path issue
* run style
* fix issue
* update bash
* fix ddp issue
* add Qwen 1.5 32B
* refactor tokenization
* [FIX BUG] UnboundLocalError: cannot access local variable 'default_conversation' where it is not associated with a value (#5931)
* cannot access local variable 'default_conversation' where it is not associated with a value
set default value for 'default_conversation'
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* fix test data
* refactor evaluation
* remove real data path
* remove real data path
* Add n_fused as an input from native_module (#5894)
* [FIX BUG] convert env param to int in (#5934)
* [Hotfix] Fix ZeRO typo #5936
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [Feature] Add a switch to control whether the model checkpoint needs to be saved after each epoch ends (#5941)
* Add a switch to control whether the model checkpoint needs to be saved after each epoch ends
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* fix style
* fix style
* fix style
* [shardformer] hotfix attn mask (#5945)
* [shardformer] hotfix attn mask (#5947)
* [Feat] Distrifusion Acceleration Support for Diffusion Inference (#5895)
* Distrifusion Support source
* comp comm overlap optimization
* sd3 benchmark
* pixart distrifusion bug fix
* sd3 bug fix and benchmark
* generation bug fix
* naming fix
* add docstring, fix counter and shape error
* add reference
* readme and requirement
* [zero] hotfix update master params (#5951)
* [release] update version (#5952)
* [Chat] Fix lora (#5946)
* fix merging
* remove filepath
* fix style
* Update README.md (#5958)
* [hotfix] Remove unused plan section (#5957)
* remove readme
* fix readme
* update
* [test] add mixtral for sequence classification
* [test] add mixtral transformer test
* [moe] fix plugin
* [test] mixtra pp shard test
* [chore] handle non member group
* [zero] solve hang
* [test] pass mixtral shardformer test
* [moe] implement transit between non moe tp and ep
* [zero] solve hang
* [misc] solve booster hang by rename the variable
* solve hang when parallel mode = pp + dp
* [moe] implement submesh initialization
* [moe] add mixtral dp grad scaling when not all experts are activated
* [chore] manually revert unintended commit
* [chore] trivial fix
* [chore] arg pass & remove drop token
* [test] add mixtral modelling test
* [moe] implement tp
* [moe] test deepseek
* [moe] clean legacy code
* [Feature] MoE Ulysses Support (#5918)
* moe sp support
* moe sp bug solve
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [chore] minor fix
* [moe] init moe plugin comm setting with sp
* moe sp + ep bug fix
* [moe] finalize test (no pp)
* [moe] full test for deepseek and mixtral (pp + sp to fix)
* [chore] minor fix after rebase
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* [chore] solve moe ckpt test failure and some other arg pass failure
* [moe] remove ops
* [test] fix test: test_zero1_2
* [bug] fix: somehow logger hangs the program
* [moe] deepseek moe sp support
* [test] add check
* [deepseek] replace attn (a workaround for bug in transformers)
* [misc] skip redunant test
* [misc] remove debug/print code
* [moe] refactor mesh assignment
* Revert "[moe] implement submesh initialization"
This reverts commit 2f9bce6686d1415a83d5726dc5ff02222c742582.
* [chore] change moe_pg_mesh to private
* [misc] remove incompatible test config
* [misc] fix ci failure: change default value to false in moe plugin
* [misc] remove useless condition
* [chore] docstring
* [moe] remove force_overlap_comm flag and add warning instead
* [doc] add MoeHybridParallelPlugin docstring
* [moe] solve dp axis issue
* [chore] remove redundant test case, print string & reduce test tokens
* [feat] Dist Loader for Eval (#5950)
* support auto distributed data loader
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* support auto distributed data loader
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix tp error
* remove unused parameters
* remove unused
* update inference
* update docs
* update inference
---------
Co-authored-by: Michelle <qianranma8@gmail.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [lora] lora support hybrid parallel plugin (#5956)
* lora support hybrid plugin
* fix
* fix
* fix
* fix
* fp8 operators for compressed communication
cast_to_fp8, cast_from_fp8, all_reduce_fp8
* fix scaling algorithm in FP8 casting
* support fp8 communication in pipeline parallelism
* add fp8_communication flag in the script
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* shardformer fp8
* fix rebase
* remove all to all
* fix shardformer fp8 communication training degradation
* [fp8] support all-gather flat tensor (#5932)
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* Update low_level_optim.py
---------
Co-authored-by: YeAnbang <anbangy2@outlook.com>
Co-authored-by: Haze188 <haze188@qq.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Edenzzzz <wenxuan.tan@wisc.edu>
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: Runyu Lu <77330637+LRY89757@users.noreply.github.com>
Co-authored-by: Guangyao Zhang <xjtu521@qq.com>
Co-authored-by: YeAnbang <44796419+YeAnbang@users.noreply.github.com>
Co-authored-by: Hongxin Liu <lhx0217@gmail.com>
Co-authored-by: Stephan Kö <stephankoe@users.noreply.github.com>
Co-authored-by: アマデウス <kurisusnowdeng@users.noreply.github.com>
Co-authored-by: Tong Li <tong.li352711588@gmail.com>
Co-authored-by: zhurunhua <1281592874@qq.com>
Co-authored-by: Insu Jang <insujang@umich.edu>
Co-authored-by: Gao, Ruiyuan <905370712@qq.com>
Co-authored-by: hxwang <wang1570@e.ntu.edu.sg>
Co-authored-by: Michelle <qianranma8@gmail.com>
Co-authored-by: Wang Binluo <32676639+wangbluo@users.noreply.github.com>
Co-authored-by: HangXu <hangxu0304@gmail.com>
* [fp8]support all2all fp8 (#5953)
* support all2all fp8
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [fp8] add fp8 linear (#5967)
* [fp8] add fp8 linear
* [test] fix fp8 linear test condition
* [test] fix fp8 linear test condition
* [test] fix fp8 linear test condition
* [fp8] support fp8 amp for hybrid parallel plugin (#5975)
* [fp8] support fp8 amp for hybrid parallel plugin
* [test] add fp8 hook test
* [fp8] fix fp8 linear compatibility
* fix (#5976)
* [Feature]: support FP8 communication in DDP, FSDP, Gemini (#5928)
* support fp8_communication in the Torch DDP grad comm, FSDP grad comm, and FSDP params comm
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* implement communication hook for FSDP params all-gather
* added unit test for fp8 operators
* support fp8 communication in GeminiPlugin
* update training scripts to support fsdp and fp8 communication
* fixed some minor bugs observed in unit test
* add all_gather_into_tensor_flat_fp8
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* add skip the test if torch < 2.2.0
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* add skip the test if torch < 2.2.0
* add skip the test if torch < 2.2.0
* add fp8_comm flag
* rebase latest fp8 operators
* rebase latest fp8 operators
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [test ci]Feature/fp8 comm (#5981)
* fix
* fix
* fix
* [fp8] support gemini plugin (#5978)
* [fp8] refactor hook
* [fp8] support gemini plugin
* [example] add fp8 option for llama benchmark
* [fp8] use torch compile (torch >= 2.3.0) (#5979)
* [fp8] use torch compile (torch >= 2.4.0)
* [fp8] set use_fast_accum in linear
* [chore] formal version check
* [chore] fix sig
* [fp8]Moe support fp8 communication (#5977)
* fix
* support moe fp8
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
fix
fi
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [fp8] support hybrid parallel plugin (#5982)
* support fp8 comm for qwen2 model
* support fp8 comm for qwen2 model
* support fp8 comm for qwen2 model
* fp8
* fix
* bert and bloom
* chatglm and command
* gpt2,gptj,bert, falcon,blip2
* mistral,opy,sam,t5,vit,whisper
* fix
* fix
* fix
* [fp8] refactor fp8 linear with compile (#5993)
* [fp8] refactor fp8 linear with compile
* [fp8] fix linear test
* [fp8] fix linear test
* [fp8] support asynchronous FP8 communication (#5997)
* fix
* fix
* fix
* support async all2all
* support async op for all gather
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [fp8] update torch.compile for linear_fp8 to >= 2.4.0 (#6004)
* [fp8] linear perf enhancement
* [fp8]update reduce-scatter test (#6002)
* fix
* fix
* fix
* fix
* [fp8] add use_fp8 option for MoeHybridParallelPlugin (#6009)
* [fp8] zero support fp8 linear. (#6006)
* fix
* fix
* fix
* zero fp8
* zero fp8
* Update requirements.txt
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix the merge
* fix the merge
* fix the merge
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix the merge
* fix
* fix
* fix the merge
* fix
* fix
* fix
* fix
* fix
* fix the merge
* fix
* fix
* fix
* fix
* [fp8] Merge feature/fp8_comm to main branch of Colossalai (#6016)
* add SimPO
* fix dataloader
* remove debug code
* add orpo
* fix style
* fix colossalai, transformers version
* fix colossalai, transformers version
* fix colossalai, transformers version
* fix torch colossalai version
* update transformers version
* [shardformer] DeepseekMoE support (#5871)
* [Feature] deepseek moe expert parallel implement
* [misc] fix typo, remove redundant file (#5867)
* [misc] fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [Feature] deepseek support & unit test
* [misc] remove debug code & useless print
* [misc] fix typos (#5872)
* [Feature] remove modeling file, use auto config. (#5884)
* [misc] fix typos
* [Feature] deepseek support via auto model, remove modeling file
* [misc] delete useless file
* [misc] fix typos
* [Deepseek] remove redundant code (#5888)
* [misc] fix typos
* [Feature] deepseek support via auto model, remove modeling file
* [misc] delete useless file
* [misc] fix typos
* [misc] remove redundant code
* [Feature/deepseek] resolve comment. (#5889)
* [misc] fix typos
* [Feature] deepseek support via auto model, remove modeling file
* [misc] delete useless file
* [misc] fix typos
* [misc] remove redundant code
* [misc] mv module replacement into if branch
* [misc] add some warning message and modify some code in unit test
* [misc] fix typos
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [Hoxfix] Fix CUDA_DEVICE_MAX_CONNECTIONS for comm overlap
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [Feat] Diffusion Model(PixArtAlpha/StableDiffusion3) Support (#5838)
* Diffusion Model Inference support
* Stable Diffusion 3 Support
* pixartalpha support
* [HotFix] CI,import,requirements-test for #5838 (#5892)
* [Hot Fix] CI,import,requirements-test
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [Feature] Enable PP + SP for llama (#5868)
* fix cross-PP-stage position id length diff bug
* fix typo
* fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* use a one cross entropy func for all shardformer models
---------
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [ShardFormer] Add Ulysses Sequence Parallelism support for Command-R, Qwen2 and ChatGLM (#5897)
* add benchmark for sft, dpo, simpo, orpo. Add benchmarking result. Support lora with gradient checkpoint
* fix style
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix eval
* hotfix citation
* [zero] support all-gather overlap (#5898)
* [zero] support all-gather overlap
* [zero] add overlap all-gather flag
* [misc] fix typo
* [zero] update api
* fix orpo cross entropy loss
* [Auto Parallel]: Speed up intra-op plan generation by 44% (#5446)
* Remove unnecessary calls to deepcopy
* Build DimSpec's difference dict only once
This change considerably speeds up construction speed of DimSpec objects. The difference_dict is the same for each DimSpec object, so a single copy of it is enough.
* Fix documentation of DimSpec's difference method
* [ShardFormer] fix qwen2 sp (#5903)
* [compatibility] support torch 2.2 (#5875)
* Support Pytorch 2.2.2
* keep build_on_pr file and update .compatibility
* fix object_to_tensor usage when torch>=2.3.0 (#5820)
* [misc] support torch2.3 (#5893)
* [misc] support torch2.3
* [devops] update compatibility ci
* [devops] update compatibility ci
* [devops] add debug
* [devops] add debug
* [devops] add debug
* [devops] add debug
* [devops] remove debug
* [devops] remove debug
* [release] update version (#5912)
* [plugin] support all-gather overlap for hybrid parallel (#5919)
* [plugin] fixed all-gather overlap support for hybrid parallel
* add kto
* fix style, add kto data sample
* [Examples] Add lazy init to OPT and GPT examples (#5924)
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [ColossalChat] Hotfix for ColossalChat (#5910)
* add ignore and tiny llama
* fix path issue
* run style
* fix issue
* update bash
* add ignore and tiny llama
* fix path issue
* run style
* fix issue
* update bash
* fix ddp issue
* add Qwen 1.5 32B
* refactor tokenization
* [FIX BUG] UnboundLocalError: cannot access local variable 'default_conversation' where it is not associated with a value (#5931)
* cannot access local variable 'default_conversation' where it is not associated with a value
set default value for 'default_conversation'
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* fix test data
* refactor evaluation
* remove real data path
* remove real data path
* Add n_fused as an input from native_module (#5894)
* [FIX BUG] convert env param to int in (#5934)
* [Hotfix] Fix ZeRO typo #5936
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [Feature] Add a switch to control whether the model checkpoint needs to be saved after each epoch ends (#5941)
* Add a switch to control whether the model checkpoint needs to be saved after each epoch ends
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* fix style
* fix style
* fix style
* [shardformer] hotfix attn mask (#5945)
* [shardformer] hotfix attn mask (#5947)
* [Feat] Distrifusion Acceleration Support for Diffusion Inference (#5895)
* Distrifusion Support source
* comp comm overlap optimization
* sd3 benchmark
* pixart distrifusion bug fix
* sd3 bug fix and benchmark
* generation bug fix
* naming fix
* add docstring, fix counter and shape error
* add reference
* readme and requirement
* [zero] hotfix update master params (#5951)
* [release] update version (#5952)
* [Chat] Fix lora (#5946)
* fix merging
* remove filepath
* fix style
* Update README.md (#5958)
* [hotfix] Remove unused plan section (#5957)
* remove readme
* fix readme
* update
* [test] add mixtral for sequence classification
* [test] add mixtral transformer test
* [moe] fix plugin
* [test] mixtra pp shard test
* [chore] handle non member group
* [zero] solve hang
* [test] pass mixtral shardformer test
* [moe] implement transit between non moe tp and ep
* [zero] solve hang
* [misc] solve booster hang by rename the variable
* solve hang when parallel mode = pp + dp
* [moe] implement submesh initialization
* [moe] add mixtral dp grad scaling when not all experts are activated
* [chore] manually revert unintended commit
* [chore] trivial fix
* [chore] arg pass & remove drop token
* [test] add mixtral modelling test
* [moe] implement tp
* [moe] test deepseek
* [moe] clean legacy code
* [Feature] MoE Ulysses Support (#5918)
* moe sp support
* moe sp bug solve
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [chore] minor fix
* [moe] init moe plugin comm setting with sp
* moe sp + ep bug fix
* [moe] finalize test (no pp)
* [moe] full test for deepseek and mixtral (pp + sp to fix)
* [chore] minor fix after rebase
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* [chore] solve moe ckpt test failure and some other arg pass failure
* [moe] remove ops
* [test] fix test: test_zero1_2
* [bug] fix: somehow logger hangs the program
* [moe] deepseek moe sp support
* [test] add check
* [deepseek] replace attn (a workaround for bug in transformers)
* [misc] skip redunant test
* [misc] remove debug/print code
* [moe] refactor mesh assignment
* Revert "[moe] implement submesh initialization"
This reverts commit 2f9bce6686d1415a83d5726dc5ff02222c742582.
* [chore] change moe_pg_mesh to private
* [misc] remove incompatible test config
* [misc] fix ci failure: change default value to false in moe plugin
* [misc] remove useless condition
* [chore] docstring
* [moe] remove force_overlap_comm flag and add warning instead
* [doc] add MoeHybridParallelPlugin docstring
* [moe] solve dp axis issue
* [chore] remove redundant test case, print string & reduce test tokens
* [feat] Dist Loader for Eval (#5950)
* support auto distributed data loader
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* support auto distributed data loader
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix tp error
* remove unused parameters
* remove unused
* update inference
* update docs
* update inference
---------
Co-authored-by: Michelle <qianranma8@gmail.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [lora] lora support hybrid parallel plugin (#5956)
* lora support hybrid plugin
* fix
* fix
* fix
* fix
* Support overall loss, update KTO logging
* [Docs] clarify launch port
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [Hotfix] README link (#5966)
* update ignore
* update readme
* run style
* update readme
* [Hotfix] Avoid fused RMSnorm import error without apex (#5985)
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [Chat] fix readme (#5989)
* fix readme
* fix readme, tokenization fully tested
* fix readme, tokenization fully tested
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: root <root@notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9-0.notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9.colossal-ai.svc.cluster.local>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* fix sync condition (#6000)
* [plugin] add cast inputs option for zero (#6003)
* [pre-commit.ci] pre-commit autoupdate (#5995)
updates:
- [github.com/psf/black-pre-commit-mirror: 24.4.2 → 24.8.0](https://github.com/psf/black-pre-commit-mirror/compare/24.4.2...24.8.0)
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [misc] Bypass the huggingface bug to solve the mask mismatch problem (#5991)
* [Feature] Zigzag Ring attention (#5905)
* halfway
* fix cross-PP-stage position id length diff bug
* fix typo
* fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* unified cross entropy func for all shardformer models
* remove redundant lines
* add basic ring attn; debug cross entropy
* fwd bwd logic complete
* fwd bwd logic complete; add experimental triton rescale
* precision tests passed
* precision tests passed
* fix typos and remove misc files
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* add sp_mode to benchmark; fix varlen interface
* update softmax_lse shape by new interface
* change tester name
* remove buffer clone; support packed seq layout
* add varlen tests
* fix typo
* all tests passed
* add dkv_group; fix mask
* remove debug statements
---------
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [misc] update compatibility (#6008)
* [misc] update compatibility
* [misc] update requirements
* [devops] disable requirements cache
* [test] fix torch ddp test
* [test] fix rerun on address in use
* [test] fix lazy init
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix the merge
* fix the merge
* overlap kv comm with output rescale (#6017)
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* fix the merge
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix the merge
* fix
* fix
* fix the merge
* fix
* [misc] Use dist logger in plugins (#6011)
* use dist logger in plugins
* remove trash
* print on rank 0
---------
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* fix
* fix
* fix
* fix
* fix the merge
* fix
* fix
* fix
* fix
---------
Co-authored-by: YeAnbang <anbangy2@outlook.com>
Co-authored-by: Haze188 <haze188@qq.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Edenzzzz <wenxuan.tan@wisc.edu>
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: Runyu Lu <77330637+LRY89757@users.noreply.github.com>
Co-authored-by: Guangyao Zhang <xjtu521@qq.com>
Co-authored-by: YeAnbang <44796419+YeAnbang@users.noreply.github.com>
Co-authored-by: Hongxin Liu <lhx0217@gmail.com>
Co-authored-by: Stephan Kö <stephankoe@users.noreply.github.com>
Co-authored-by: アマデウス <kurisusnowdeng@users.noreply.github.com>
Co-authored-by: Tong Li <tong.li352711588@gmail.com>
Co-authored-by: zhurunhua <1281592874@qq.com>
Co-authored-by: Insu Jang <insujang@umich.edu>
Co-authored-by: Gao, Ruiyuan <905370712@qq.com>
Co-authored-by: hxwang <wang1570@e.ntu.edu.sg>
Co-authored-by: Michelle <qianranma8@gmail.com>
Co-authored-by: root <root@notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9-0.notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9.colossal-ai.svc.cluster.local>
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* Update train_dpo.py
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* Update low_level_zero_plugin.py
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* [CI] Remove triton version for compatibility bug; update req torch >=2.2 (#6018)
* remove triton version
* remove torch 2.2
* remove torch 2.1
* debug
* remove 2.1 build tests
* require torch >=2.2
---------
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [plugin] hotfix zero plugin (#6036)
* [plugin] hotfix zero plugin
* [plugin] hotfix zero plugin
* [Colossal-LLaMA] Refactor latest APIs (#6030)
* refactor latest code
* update api
* add dummy dataset
* update Readme
* add setup
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* update files
* add PP support
* update arguments
* update argument
* reorg folder
* update version
* remove IB infor
* update utils
* update readme
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* update save for zero
* update save
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* add apex
* update
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* add fused norm (#6038)
* [FP8] unsqueeze scale to make it compatible with torch.compile (#6040)
* [colossalai/checkpoint_io/...] fix bug in load_state_dict_into_model; format error msg (#6020)
* fix bug in load_state_dict_into_model; format error msg
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* Update utils.py
to support checking missing_keys
* Update general_checkpoint_io.py
fix bug in missing_keys error message
* retrigger tests
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [Hotfix] Remove deprecated install (#6042)
* remove deprecated install
* remove unused folder
* [fp8] optimize all-gather (#6043)
* [fp8] optimize all-gather
* [fp8] fix all gather fp8 ring
* [fp8] enable compile
* [fp8] fix all gather fp8 ring
* [fp8] fix linear hook (#6046)
* [fp8] disable all_to_all_fp8 in intranode (#6045)
* enhance all_to_all_fp8 with internode comm control
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* disable some fp8 ops due to performance issue
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [release] update version (#6041)
* [release] update version
* [devops] update comp test
* [devops] update comp test debug
* [devops] debug comp test
* [devops] debug comp test
* [devops] debug comp test
* [devops] debug comp test
* [devops] debug comp test
* [Feature] Split cross-entropy computation in SP (#5959)
* halfway
* fix cross-PP-stage position id length diff bug
* fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* unified cross entropy func for all shardformer models
* remove redundant lines
* add basic ring attn; debug cross entropy
* fwd bwd logic complete
* fwd bwd logic complete; add experimental triton rescale
* precision tests passed
* precision tests passed
* fix typos and remove misc files
* update softmax_lse shape by new interface
* change tester name
* remove buffer clone; support packed seq layout
* add varlen tests
* fix typo
* all tests passed
* add dkv_group; fix mask
* remove debug statements
* adapt chatglm, command-R, qwen
* debug
* halfway
* fix cross-PP-stage position id length diff bug
* fix typo
* fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* unified cross entropy func for all shardformer models
* remove redundant lines
* add basic ring attn; debug cross entropy
* fwd bwd logic complete
* fwd bwd logic complete; add experimental triton rescale
* precision tests passed
* precision tests passed
* fix typos and remove misc files
* add sp_mode to benchmark; fix varlen interface
* update softmax_lse shape by new interface
* add varlen tests
* fix typo
* all tests passed
* add dkv_group; fix mask
* remove debug statements
* add comments
* q1 index only once
* remove events to simplify stream sync
* simplify forward/backward logic
* 2d ring forward passed
* 2d ring backward passed
* fixes
* fix ring attn loss
* 2D ring backward + llama passed
* merge
* update logger
* fix typo
* rebase
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix typo
* remove typos
* fixes
* support GPT
---------
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [hotfix] moe hybrid parallelism benchmark & follow-up fix (#6048)
* [example] pass use_fp8_comm flag to all plugins
* [example] add mixtral benchmark
* [moe] refine assertion and check
* [moe] fix mixtral & add more tests
* [moe] consider checking dp * sp group and moe_dp_group
* [mixtral] remove gate tp & add more tests
* [deepseek] fix tp & sp for deepseek
* [mixtral] minor fix
* [deepseek] add deepseek benchmark
* [fp8] hotfix backward hook (#6053)
* [fp8] hotfix backward hook
* [fp8] hotfix pipeline loss accumulation
* [doc] update sp doc (#6055)
* update sp doc
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* fix the sp
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix the attn
* fix
* fix
* fix
* fix
* [zerobubble]Support ZeroBubble Pipeline (#6034)
* [feat] add zerobubble pp (just a frame now); add POC test for dx_dw; add test for zerobubble;
* [feat] add dw test;
* [fix] fix weight not close;
* [update] update text;
* [feat] add test run_fwd_bwd automatic scheduling;
* [feat] split communication and calculation; fix pop empty send_bwd_buffer error;
* [feat] add test for p & p grad;
* [feat] add comments for ZBV func;
* [fix] rm useless assign and comments;
* [fix] fix ci test; add pytest;
* [feat] add run_fwd_bwd_with_microbatch (replace input) & test; add p&p.grad assert close test & all pass;
* [feat] add apply v_schedule graph; p & p.grad assert err exist;
* [fix] update
* [feat] fix ci; add assert;
* [feat] fix poc format
* [feat] fix func name & ci; add comments;
* [fix] fix poc test; add comments in poc;
* [feat] add optim backward_b_by_grad
* [feat] fix optimizer bwd b & w; support return accum loss & output
* [feat] add fwd_bwd_step, run_fwd_only;
* [fix] fix optim bwd; add license for v_schedule; remove redundant attributes; fix schedule loop "while"--> "for"; add communication dict;
* [fix] fix communication_map;
* [feat] update test; rm comments;
* [fix] rm zbv in hybridplugin
* [fix] fix optim bwd;
* [fix] fix optim bwd;
* [fix] rm output.data after send fwd;
* [fix] fix bwd step if condition; remove useless comments and format info;
* [fix] fix detach output & release output;
* [fix] rm requir_grad for output;
* [fix] fix requir grad position and detach position and input&output local buffer append position;
* [feat] add memory assertation;
* [fix] fix mem check;
* [fix] mem assertation'
* [fix] fix mem assertation
* [fix] fix mem; use a new model shape; only assert mem less and equal than theo;
* [fix] fix model zoo import;
* [fix] fix redundant detach & clone; add buffer assertation in the end;
* [fix] add output_obj_grad assert None at bwd b step; replace input_obj.require_grad_ with treemap;
* [fix] update optim state dict assert (include param group & state); fix mem assert after add optim;
* [fix] add testcase with microbatch 4;
* [fp8] fix missing fp8_comm flag in mixtral (#6057)
* fix
* fix
* fix
* [fp8] Disable all_gather intranode. Disable Redundant all_gather fp8 (#6059)
* all_gather only internode, fix pytest
* fix cuda arch <89 compile pytest error
* fix pytest failure
* disable all_gather_into_tensor_flat_fp8
* fix fp8 format
* fix pytest
* fix conversations
* fix chunk tuple to list
* [doc] FP8 training and communication document (#6050)
* Add FP8 training and communication document
* add fp8 docstring for plugins
* fix typo
* fix typo
* fix
* fix
* [moe] add parallel strategy for shared_expert && fix test for deepseek (#6063)
* [ColossalEval] support for vllm (#6056)
* support vllm
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* modify vllm and update readme
* run pre-commit
* remove dupilicated lines and refine code
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* update param name
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* refine code
* update readme
* refine code
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [release] update version (#6062)
* [feat] add zerobubble pp (just a frame now); add POC test for dx_dw; add test for zerobubble;
* [update] update text;
* [feat] add test run_fwd_bwd automatic scheduling;
* [feat] fix poc format
* [fix] fix poc test; add comments in poc;
* [feat] add optim backward_b_by_grad
* [feat] fix optimizer bwd b & w; support return accum loss & output
* [fix] fix optim bwd; add license for v_schedule; remove redundant attributes; fix schedule loop "while"--> "for"; add communication dict;
* [feat] update test; rm comments;
* [fix] rm zbv in hybridplugin
* [fix] fix optim bwd;
* [fix] fix optim bwd;
* [fix] rm output.data after send fwd;
* [fix] fix bwd step if condition; remove useless comments and format info;
* [fix] fix mem check;
* [fix] fix mem assertation
* [fix] fix mem; use a new model shape; only assert mem less and equal than theo;
* [fix] fix model zoo import;
* [feat] moehybrid support zerobubble;
* [fix] fix zerobubble pp for shardformer type input;
* [fix] fix require_grad & deallocate call;
* [fix] fix mem assert;
* [fix] fix fwd branch, fwd pass both micro_batch & internal_inputs'
* [fix] fix pipeline util func deallocate --> release_tensor_data; fix bwd_b loss bwd branch;
* [fix] fix zerobubble; support shardformer model type;
* [fix] fix test_pipeline_utils ci;
* [plugin] hybrid support zero bubble pipeline (#6060)
* hybrid support zbv
* fix
fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* Update zero_bubble_pp.py
* fix
* fix-ci
* fix
[pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* fix
* [zerobubble]Support ZeroBubble Pipeline (#6034)
* [feat] add zerobubble pp (just a frame now); add POC test for dx_dw; add test for zerobubble;
* [feat] add dw test;
* [fix] fix weight not close;
* [update] update text;
* [feat] add test run_fwd_bwd automatic scheduling;
* [feat] split communication and calculation; fix pop empty send_bwd_buffer error;
* [feat] add test for p & p grad;
* [feat] add comments for ZBV func;
* [fix] rm useless assign and comments;
* [fix] fix ci test; add pytest;
* [feat] add run_fwd_bwd_with_microbatch (replace input) & test; add p&p.grad assert close test & all pass;
* [feat] add apply v_schedule graph; p & p.grad assert err exist;
* [fix] update
* [feat] fix ci; add assert;
* [feat] fix poc format
* [feat] fix func name & ci; add comments;
* [fix] fix poc test; add comments in poc;
* [feat] add optim backward_b_by_grad
* [feat] fix optimizer bwd b & w; support return accum loss & output
* [feat] add fwd_bwd_step, run_fwd_only;
* [fix] fix optim bwd; add license for v_schedule; remove redundant attributes; fix schedule loop "while"--> "for"; add communication dict;
* [fix] fix communication_map;
* [feat] update test; rm comments;
* [fix] rm zbv in hybridplugin
* [fix] fix optim bwd;
* [fix] fix optim bwd;
* [fix] rm output.data after send fwd;
* [fix] fix bwd step if condition; remove useless comments and format info;
* [fix] fix detach output & release output;
* [fix] rm requir_grad for output;
* [fix] fix requir grad position and detach position and input&output local buffer append position;
* [feat] add memory assertation;
* [fix] fix mem check;
* [fix] mem assertation'
* [fix] fix mem assertation
* [fix] fix mem; use a new model shape; only assert mem less and equal than theo;
* [fix] fix model zoo import;
* [fix] fix redundant detach & clone; add buffer assertation in the end;
* [fix] add output_obj_grad assert None at bwd b step; replace input_obj.require_grad_ with treemap;
* [fix] update optim state dict assert (include param group & state); fix mem assert after add optim;
* [fix] add testcase with microbatch 4;
* hybrid support zbv
* fix
fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* Update zero_bubble_pp.py
* fix
* fix-ci
* fix
[pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* fix
* fix
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* fix
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: duanjunwen <935724073@qq.com>
* [feat] add zerobubble pp (just a frame now); add POC test for dx_dw; add test for zerobubble;
* [update] update text;
* [feat] add test run_fwd_bwd automatic scheduling;
* [feat] fix poc format
* [fix] fix poc test; add comments in poc;
* [feat] add optim backward_b_by_grad
* [feat] fix optimizer bwd b & w; support return accum loss & output
* [fix] fix optim bwd; add license for v_schedule; remove redundant attributes; fix schedule loop "while"--> "for"; add communication dict;
* [feat] update test; rm comments;
* [fix] fix optim bwd;
* [fix] fix optim bwd;
* [fix] rm output.data after send fwd;
* [fix] fix bwd step if condition; remove useless comments and format info;
* [fix] fix mem check;
* [fix] fix mem assertation
* [fix] fix mem; use a new model shape; only assert mem less and equal than theo;
* [fix] fix model zoo import;
* [fix] fix mem assert;
* [fix] fix fwd branch, fwd pass both micro_batch & internal_inputs'
* [plugin] hybrid support zero bubble pipeline (#6060)
* hybrid support zbv
* fix
fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* Update zero_bubble_pp.py
* fix
* fix-ci
* fix
[pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* fix
* [zerobubble]Support ZeroBubble Pipeline (#6034)
* [feat] add zerobubble pp (just a frame now); add POC test for dx_dw; add test for zerobubble;
* [feat] add dw test;
* [fix] fix weight not close;
* [update] update text;
* [feat] add test run_fwd_bwd automatic scheduling;
* [feat] split communication and calculation; fix pop empty send_bwd_buffer error;
* [feat] add test for p & p grad;
* [feat] add comments for ZBV func;
* [fix] rm useless assign and comments;
* [fix] fix ci test; add pytest;
* [feat] add run_fwd_bwd_with_microbatch (replace input) & test; add p&p.grad assert close test & all pass;
* [feat] add apply v_schedule graph; p & p.grad assert err exist;
* [fix] update
* [feat] fix ci; add assert;
* [feat] fix poc format
* [feat] fix func name & ci; add comments;
* [fix] fix poc test; add comments in poc;
* [feat] add optim backward_b_by_grad
* [feat] fix optimizer bwd b & w; support return accum loss & output
* [feat] add fwd_bwd_step, run_fwd_only;
* [fix] fix optim bwd; add license for v_schedule; remove redundant attributes; fix schedule loop "while"--> "for"; add communication dict;
* [fix] fix communication_map;
* [feat] update test; rm comments;
* [fix] rm zbv in hybridplugin
* [fix] fix optim bwd;
* [fix] fix optim bwd;
* [fix] rm output.data after send fwd;
* [fix] fix bwd step if condition; remove useless comments and format info;
* [fix] fix detach output & release output;
* [fix] rm requir_grad for output;
* [fix] fix requir grad position and detach position and input&output local buffer append position;
* [feat] add memory assertation;
* [fix] fix mem check;
* [fix] mem assertation'
* [fix] fix mem assertation
* [fix] fix mem; use a new model shape; only assert mem less and equal than theo;
* [fix] fix model zoo import;
* [fix] fix redundant detach & clone; add buffer assertation in the end;
* [fix] add output_obj_grad assert None at bwd b step; replace input_obj.require_grad_ with treemap;
* [fix] update optim state dict assert (include param group & state); fix mem assert after add optim;
* [fix] add testcase with microbatch 4;
* hybrid support zbv
* fix
fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* Update zero_bubble_pp.py
* fix
* fix-ci
* fix
[pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* fix
* fix
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* fix
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: duanjunwen <935724073@qq.com>
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: HangXu <hangxu0304@gmail.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: GuangyaoZhang <xjtu521@qq.com>
Co-authored-by: Hongxin Liu <lhx0217@gmail.com>
Co-authored-by: YeAnbang <anbangy2@outlook.com>
Co-authored-by: Haze188 <haze188@qq.com>
Co-authored-by: Edenzzzz <wenxuan.tan@wisc.edu>
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: Runyu Lu <77330637+LRY89757@users.noreply.github.com>
Co-authored-by: YeAnbang <44796419+YeAnbang@users.noreply.github.com>
Co-authored-by: Stephan Kö <stephankoe@users.noreply.github.com>
Co-authored-by: アマデウス <kurisusnowdeng@users.noreply.github.com>
Co-authored-by: Tong Li <tong.li352711588@gmail.com>
Co-authored-by: zhurunhua <1281592874@qq.com>
Co-authored-by: Insu Jang <insujang@umich.edu>
Co-authored-by: Gao, Ruiyuan <905370712@qq.com>
Co-authored-by: hxwang <wang1570@e.ntu.edu.sg>
Co-authored-by: Michelle <qianranma8@gmail.com>
Co-authored-by: Wang Binluo <32676639+wangbluo@users.noreply.github.com>
Co-authored-by: wangbluo <2538539015@qq.com>
Co-authored-by: root <root@notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9-0.notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9.colossal-ai.svc.cluster.local>
Co-authored-by: duanjunwen <935724073@qq.com>
Co-authored-by: Camille Zhong <44392324+Camille7777@users.noreply.github.com>
2024-10-08 07:58:00 +00:00
|
|
|
|
|
|
|
# Support SP + PP
|
|
|
|
sp_size = shard_config.sequence_parallel_size
|
|
|
|
sp_mode = shard_config.sequence_parallelism_mode
|
|
|
|
sp_group = shard_config.sequence_parallel_process_group
|
|
|
|
# For generating full positions ids (the states will be gathered along the seq dim before attention fwd).
|
|
|
|
if sp_mode != "ring_attn" and not stage_manager.is_first_stage():
|
|
|
|
seq_length *= sp_size
|
|
|
|
|
2023-08-04 06:55:31 +00:00
|
|
|
# Rotary positional embeddings
|
|
|
|
rotary_pos_emb = self.rotary_pos_emb(self.seq_length)
|
|
|
|
if position_ids is not None:
|
|
|
|
rotary_pos_emb = rotary_pos_emb[position_ids]
|
|
|
|
else:
|
|
|
|
rotary_pos_emb = rotary_pos_emb[None, :seq_length]
|
|
|
|
rotary_pos_emb = rotary_pos_emb.transpose(0, 1).contiguous()
|
|
|
|
if not past_key_values:
|
|
|
|
past_key_values = [None for _ in range(self.num_layers)]
|
|
|
|
presents = () if use_cache else None
|
|
|
|
if self.encoder.gradient_checkpointing and self.encoder.training:
|
|
|
|
if use_cache:
|
|
|
|
logger.warning_once(
|
2023-09-19 06:20:26 +00:00
|
|
|
"`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
|
|
|
|
)
|
2023-08-04 06:55:31 +00:00
|
|
|
use_cache = False
|
|
|
|
all_self_attentions = None
|
|
|
|
all_hidden_states = () if output_hidden_states else None
|
|
|
|
start_idx, end_idx = stage_index[0], stage_index[1]
|
2023-08-22 15:59:31 +00:00
|
|
|
|
[zerobubble] rebase main (#6075)
* fp8 operators for compressed communication
cast_to_fp8, cast_from_fp8, all_reduce_fp8
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix typo
* fix scaling algorithm in FP8 casting
* support fp8 communication in pipeline parallelism
* add fp8_communication flag in the script
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* shardformer fp8
* fix rebase
* remove all to all
* fix shardformer fp8 communication training degradation
* [fp8] support all-gather flat tensor (#5932)
* [fp8] add fp8 comm for low level zero
* [test] add zero fp8 test case
* [Feature] llama shardformer fp8 support (#5938)
* add llama shardformer fp8
* Llama Shardformer Parity
* fix typo
* fix all reduce
* fix pytest failure
* fix reduce op and move function to fp8.py
* fix typo
* [FP8] rebase main (#5963)
* add SimPO
* fix dataloader
* remove debug code
* add orpo
* fix style
* fix colossalai, transformers version
* fix colossalai, transformers version
* fix colossalai, transformers version
* fix torch colossalai version
* update transformers version
* [shardformer] DeepseekMoE support (#5871)
* [Feature] deepseek moe expert parallel implement
* [misc] fix typo, remove redundant file (#5867)
* [misc] fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [Feature] deepseek support & unit test
* [misc] remove debug code & useless print
* [misc] fix typos (#5872)
* [Feature] remove modeling file, use auto config. (#5884)
* [misc] fix typos
* [Feature] deepseek support via auto model, remove modeling file
* [misc] delete useless file
* [misc] fix typos
* [Deepseek] remove redundant code (#5888)
* [misc] fix typos
* [Feature] deepseek support via auto model, remove modeling file
* [misc] delete useless file
* [misc] fix typos
* [misc] remove redundant code
* [Feature/deepseek] resolve comment. (#5889)
* [misc] fix typos
* [Feature] deepseek support via auto model, remove modeling file
* [misc] delete useless file
* [misc] fix typos
* [misc] remove redundant code
* [misc] mv module replacement into if branch
* [misc] add some warning message and modify some code in unit test
* [misc] fix typos
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [Hoxfix] Fix CUDA_DEVICE_MAX_CONNECTIONS for comm overlap
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [Feat] Diffusion Model(PixArtAlpha/StableDiffusion3) Support (#5838)
* Diffusion Model Inference support
* Stable Diffusion 3 Support
* pixartalpha support
* [HotFix] CI,import,requirements-test for #5838 (#5892)
* [Hot Fix] CI,import,requirements-test
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [Feature] Enable PP + SP for llama (#5868)
* fix cross-PP-stage position id length diff bug
* fix typo
* fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* use a one cross entropy func for all shardformer models
---------
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [ShardFormer] Add Ulysses Sequence Parallelism support for Command-R, Qwen2 and ChatGLM (#5897)
* add benchmark for sft, dpo, simpo, orpo. Add benchmarking result. Support lora with gradient checkpoint
* fix style
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix eval
* hotfix citation
* [zero] support all-gather overlap (#5898)
* [zero] support all-gather overlap
* [zero] add overlap all-gather flag
* [misc] fix typo
* [zero] update api
* fix orpo cross entropy loss
* [Auto Parallel]: Speed up intra-op plan generation by 44% (#5446)
* Remove unnecessary calls to deepcopy
* Build DimSpec's difference dict only once
This change considerably speeds up construction speed of DimSpec objects. The difference_dict is the same for each DimSpec object, so a single copy of it is enough.
* Fix documentation of DimSpec's difference method
* [ShardFormer] fix qwen2 sp (#5903)
* [compatibility] support torch 2.2 (#5875)
* Support Pytorch 2.2.2
* keep build_on_pr file and update .compatibility
* fix object_to_tensor usage when torch>=2.3.0 (#5820)
* [misc] support torch2.3 (#5893)
* [misc] support torch2.3
* [devops] update compatibility ci
* [devops] update compatibility ci
* [devops] add debug
* [devops] add debug
* [devops] add debug
* [devops] add debug
* [devops] remove debug
* [devops] remove debug
* [release] update version (#5912)
* [plugin] support all-gather overlap for hybrid parallel (#5919)
* [plugin] fixed all-gather overlap support for hybrid parallel
* add kto
* fix style, add kto data sample
* [Examples] Add lazy init to OPT and GPT examples (#5924)
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [ColossalChat] Hotfix for ColossalChat (#5910)
* add ignore and tiny llama
* fix path issue
* run style
* fix issue
* update bash
* add ignore and tiny llama
* fix path issue
* run style
* fix issue
* update bash
* fix ddp issue
* add Qwen 1.5 32B
* refactor tokenization
* [FIX BUG] UnboundLocalError: cannot access local variable 'default_conversation' where it is not associated with a value (#5931)
* cannot access local variable 'default_conversation' where it is not associated with a value
set default value for 'default_conversation'
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* fix test data
* refactor evaluation
* remove real data path
* remove real data path
* Add n_fused as an input from native_module (#5894)
* [FIX BUG] convert env param to int in (#5934)
* [Hotfix] Fix ZeRO typo #5936
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [Feature] Add a switch to control whether the model checkpoint needs to be saved after each epoch ends (#5941)
* Add a switch to control whether the model checkpoint needs to be saved after each epoch ends
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* fix style
* fix style
* fix style
* [shardformer] hotfix attn mask (#5945)
* [shardformer] hotfix attn mask (#5947)
* [Feat] Distrifusion Acceleration Support for Diffusion Inference (#5895)
* Distrifusion Support source
* comp comm overlap optimization
* sd3 benchmark
* pixart distrifusion bug fix
* sd3 bug fix and benchmark
* generation bug fix
* naming fix
* add docstring, fix counter and shape error
* add reference
* readme and requirement
* [zero] hotfix update master params (#5951)
* [release] update version (#5952)
* [Chat] Fix lora (#5946)
* fix merging
* remove filepath
* fix style
* Update README.md (#5958)
* [hotfix] Remove unused plan section (#5957)
* remove readme
* fix readme
* update
* [test] add mixtral for sequence classification
* [test] add mixtral transformer test
* [moe] fix plugin
* [test] mixtra pp shard test
* [chore] handle non member group
* [zero] solve hang
* [test] pass mixtral shardformer test
* [moe] implement transit between non moe tp and ep
* [zero] solve hang
* [misc] solve booster hang by rename the variable
* solve hang when parallel mode = pp + dp
* [moe] implement submesh initialization
* [moe] add mixtral dp grad scaling when not all experts are activated
* [chore] manually revert unintended commit
* [chore] trivial fix
* [chore] arg pass & remove drop token
* [test] add mixtral modelling test
* [moe] implement tp
* [moe] test deepseek
* [moe] clean legacy code
* [Feature] MoE Ulysses Support (#5918)
* moe sp support
* moe sp bug solve
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [chore] minor fix
* [moe] init moe plugin comm setting with sp
* moe sp + ep bug fix
* [moe] finalize test (no pp)
* [moe] full test for deepseek and mixtral (pp + sp to fix)
* [chore] minor fix after rebase
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* [chore] solve moe ckpt test failure and some other arg pass failure
* [moe] remove ops
* [test] fix test: test_zero1_2
* [bug] fix: somehow logger hangs the program
* [moe] deepseek moe sp support
* [test] add check
* [deepseek] replace attn (a workaround for bug in transformers)
* [misc] skip redunant test
* [misc] remove debug/print code
* [moe] refactor mesh assignment
* Revert "[moe] implement submesh initialization"
This reverts commit 2f9bce6686d1415a83d5726dc5ff02222c742582.
* [chore] change moe_pg_mesh to private
* [misc] remove incompatible test config
* [misc] fix ci failure: change default value to false in moe plugin
* [misc] remove useless condition
* [chore] docstring
* [moe] remove force_overlap_comm flag and add warning instead
* [doc] add MoeHybridParallelPlugin docstring
* [moe] solve dp axis issue
* [chore] remove redundant test case, print string & reduce test tokens
* [feat] Dist Loader for Eval (#5950)
* support auto distributed data loader
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* support auto distributed data loader
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix tp error
* remove unused parameters
* remove unused
* update inference
* update docs
* update inference
---------
Co-authored-by: Michelle <qianranma8@gmail.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [lora] lora support hybrid parallel plugin (#5956)
* lora support hybrid plugin
* fix
* fix
* fix
* fix
* fp8 operators for compressed communication
cast_to_fp8, cast_from_fp8, all_reduce_fp8
* fix scaling algorithm in FP8 casting
* support fp8 communication in pipeline parallelism
* add fp8_communication flag in the script
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* shardformer fp8
* fix rebase
* remove all to all
* fix shardformer fp8 communication training degradation
* [fp8] support all-gather flat tensor (#5932)
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* Update low_level_optim.py
---------
Co-authored-by: YeAnbang <anbangy2@outlook.com>
Co-authored-by: Haze188 <haze188@qq.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Edenzzzz <wenxuan.tan@wisc.edu>
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: Runyu Lu <77330637+LRY89757@users.noreply.github.com>
Co-authored-by: Guangyao Zhang <xjtu521@qq.com>
Co-authored-by: YeAnbang <44796419+YeAnbang@users.noreply.github.com>
Co-authored-by: Hongxin Liu <lhx0217@gmail.com>
Co-authored-by: Stephan Kö <stephankoe@users.noreply.github.com>
Co-authored-by: アマデウス <kurisusnowdeng@users.noreply.github.com>
Co-authored-by: Tong Li <tong.li352711588@gmail.com>
Co-authored-by: zhurunhua <1281592874@qq.com>
Co-authored-by: Insu Jang <insujang@umich.edu>
Co-authored-by: Gao, Ruiyuan <905370712@qq.com>
Co-authored-by: hxwang <wang1570@e.ntu.edu.sg>
Co-authored-by: Michelle <qianranma8@gmail.com>
Co-authored-by: Wang Binluo <32676639+wangbluo@users.noreply.github.com>
Co-authored-by: HangXu <hangxu0304@gmail.com>
* [fp8]support all2all fp8 (#5953)
* support all2all fp8
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [fp8] add fp8 linear (#5967)
* [fp8] add fp8 linear
* [test] fix fp8 linear test condition
* [test] fix fp8 linear test condition
* [test] fix fp8 linear test condition
* [fp8] support fp8 amp for hybrid parallel plugin (#5975)
* [fp8] support fp8 amp for hybrid parallel plugin
* [test] add fp8 hook test
* [fp8] fix fp8 linear compatibility
* fix (#5976)
* [Feature]: support FP8 communication in DDP, FSDP, Gemini (#5928)
* support fp8_communication in the Torch DDP grad comm, FSDP grad comm, and FSDP params comm
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* implement communication hook for FSDP params all-gather
* added unit test for fp8 operators
* support fp8 communication in GeminiPlugin
* update training scripts to support fsdp and fp8 communication
* fixed some minor bugs observed in unit test
* add all_gather_into_tensor_flat_fp8
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* add skip the test if torch < 2.2.0
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* add skip the test if torch < 2.2.0
* add skip the test if torch < 2.2.0
* add fp8_comm flag
* rebase latest fp8 operators
* rebase latest fp8 operators
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [test ci]Feature/fp8 comm (#5981)
* fix
* fix
* fix
* [fp8] support gemini plugin (#5978)
* [fp8] refactor hook
* [fp8] support gemini plugin
* [example] add fp8 option for llama benchmark
* [fp8] use torch compile (torch >= 2.3.0) (#5979)
* [fp8] use torch compile (torch >= 2.4.0)
* [fp8] set use_fast_accum in linear
* [chore] formal version check
* [chore] fix sig
* [fp8]Moe support fp8 communication (#5977)
* fix
* support moe fp8
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
fix
fi
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [fp8] support hybrid parallel plugin (#5982)
* support fp8 comm for qwen2 model
* support fp8 comm for qwen2 model
* support fp8 comm for qwen2 model
* fp8
* fix
* bert and bloom
* chatglm and command
* gpt2,gptj,bert, falcon,blip2
* mistral,opy,sam,t5,vit,whisper
* fix
* fix
* fix
* [fp8] refactor fp8 linear with compile (#5993)
* [fp8] refactor fp8 linear with compile
* [fp8] fix linear test
* [fp8] fix linear test
* [fp8] support asynchronous FP8 communication (#5997)
* fix
* fix
* fix
* support async all2all
* support async op for all gather
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [fp8] update torch.compile for linear_fp8 to >= 2.4.0 (#6004)
* [fp8] linear perf enhancement
* [fp8]update reduce-scatter test (#6002)
* fix
* fix
* fix
* fix
* [fp8] add use_fp8 option for MoeHybridParallelPlugin (#6009)
* [fp8] zero support fp8 linear. (#6006)
* fix
* fix
* fix
* zero fp8
* zero fp8
* Update requirements.txt
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix the merge
* fix the merge
* fix the merge
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix the merge
* fix
* fix
* fix the merge
* fix
* fix
* fix
* fix
* fix
* fix the merge
* fix
* fix
* fix
* fix
* [fp8] Merge feature/fp8_comm to main branch of Colossalai (#6016)
* add SimPO
* fix dataloader
* remove debug code
* add orpo
* fix style
* fix colossalai, transformers version
* fix colossalai, transformers version
* fix colossalai, transformers version
* fix torch colossalai version
* update transformers version
* [shardformer] DeepseekMoE support (#5871)
* [Feature] deepseek moe expert parallel implement
* [misc] fix typo, remove redundant file (#5867)
* [misc] fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [Feature] deepseek support & unit test
* [misc] remove debug code & useless print
* [misc] fix typos (#5872)
* [Feature] remove modeling file, use auto config. (#5884)
* [misc] fix typos
* [Feature] deepseek support via auto model, remove modeling file
* [misc] delete useless file
* [misc] fix typos
* [Deepseek] remove redundant code (#5888)
* [misc] fix typos
* [Feature] deepseek support via auto model, remove modeling file
* [misc] delete useless file
* [misc] fix typos
* [misc] remove redundant code
* [Feature/deepseek] resolve comment. (#5889)
* [misc] fix typos
* [Feature] deepseek support via auto model, remove modeling file
* [misc] delete useless file
* [misc] fix typos
* [misc] remove redundant code
* [misc] mv module replacement into if branch
* [misc] add some warning message and modify some code in unit test
* [misc] fix typos
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [Hoxfix] Fix CUDA_DEVICE_MAX_CONNECTIONS for comm overlap
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [Feat] Diffusion Model(PixArtAlpha/StableDiffusion3) Support (#5838)
* Diffusion Model Inference support
* Stable Diffusion 3 Support
* pixartalpha support
* [HotFix] CI,import,requirements-test for #5838 (#5892)
* [Hot Fix] CI,import,requirements-test
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [Feature] Enable PP + SP for llama (#5868)
* fix cross-PP-stage position id length diff bug
* fix typo
* fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* use a one cross entropy func for all shardformer models
---------
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [ShardFormer] Add Ulysses Sequence Parallelism support for Command-R, Qwen2 and ChatGLM (#5897)
* add benchmark for sft, dpo, simpo, orpo. Add benchmarking result. Support lora with gradient checkpoint
* fix style
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix eval
* hotfix citation
* [zero] support all-gather overlap (#5898)
* [zero] support all-gather overlap
* [zero] add overlap all-gather flag
* [misc] fix typo
* [zero] update api
* fix orpo cross entropy loss
* [Auto Parallel]: Speed up intra-op plan generation by 44% (#5446)
* Remove unnecessary calls to deepcopy
* Build DimSpec's difference dict only once
This change considerably speeds up construction speed of DimSpec objects. The difference_dict is the same for each DimSpec object, so a single copy of it is enough.
* Fix documentation of DimSpec's difference method
* [ShardFormer] fix qwen2 sp (#5903)
* [compatibility] support torch 2.2 (#5875)
* Support Pytorch 2.2.2
* keep build_on_pr file and update .compatibility
* fix object_to_tensor usage when torch>=2.3.0 (#5820)
* [misc] support torch2.3 (#5893)
* [misc] support torch2.3
* [devops] update compatibility ci
* [devops] update compatibility ci
* [devops] add debug
* [devops] add debug
* [devops] add debug
* [devops] add debug
* [devops] remove debug
* [devops] remove debug
* [release] update version (#5912)
* [plugin] support all-gather overlap for hybrid parallel (#5919)
* [plugin] fixed all-gather overlap support for hybrid parallel
* add kto
* fix style, add kto data sample
* [Examples] Add lazy init to OPT and GPT examples (#5924)
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [ColossalChat] Hotfix for ColossalChat (#5910)
* add ignore and tiny llama
* fix path issue
* run style
* fix issue
* update bash
* add ignore and tiny llama
* fix path issue
* run style
* fix issue
* update bash
* fix ddp issue
* add Qwen 1.5 32B
* refactor tokenization
* [FIX BUG] UnboundLocalError: cannot access local variable 'default_conversation' where it is not associated with a value (#5931)
* cannot access local variable 'default_conversation' where it is not associated with a value
set default value for 'default_conversation'
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* fix test data
* refactor evaluation
* remove real data path
* remove real data path
* Add n_fused as an input from native_module (#5894)
* [FIX BUG] convert env param to int in (#5934)
* [Hotfix] Fix ZeRO typo #5936
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [Feature] Add a switch to control whether the model checkpoint needs to be saved after each epoch ends (#5941)
* Add a switch to control whether the model checkpoint needs to be saved after each epoch ends
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* fix style
* fix style
* fix style
* [shardformer] hotfix attn mask (#5945)
* [shardformer] hotfix attn mask (#5947)
* [Feat] Distrifusion Acceleration Support for Diffusion Inference (#5895)
* Distrifusion Support source
* comp comm overlap optimization
* sd3 benchmark
* pixart distrifusion bug fix
* sd3 bug fix and benchmark
* generation bug fix
* naming fix
* add docstring, fix counter and shape error
* add reference
* readme and requirement
* [zero] hotfix update master params (#5951)
* [release] update version (#5952)
* [Chat] Fix lora (#5946)
* fix merging
* remove filepath
* fix style
* Update README.md (#5958)
* [hotfix] Remove unused plan section (#5957)
* remove readme
* fix readme
* update
* [test] add mixtral for sequence classification
* [test] add mixtral transformer test
* [moe] fix plugin
* [test] mixtra pp shard test
* [chore] handle non member group
* [zero] solve hang
* [test] pass mixtral shardformer test
* [moe] implement transit between non moe tp and ep
* [zero] solve hang
* [misc] solve booster hang by rename the variable
* solve hang when parallel mode = pp + dp
* [moe] implement submesh initialization
* [moe] add mixtral dp grad scaling when not all experts are activated
* [chore] manually revert unintended commit
* [chore] trivial fix
* [chore] arg pass & remove drop token
* [test] add mixtral modelling test
* [moe] implement tp
* [moe] test deepseek
* [moe] clean legacy code
* [Feature] MoE Ulysses Support (#5918)
* moe sp support
* moe sp bug solve
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [chore] minor fix
* [moe] init moe plugin comm setting with sp
* moe sp + ep bug fix
* [moe] finalize test (no pp)
* [moe] full test for deepseek and mixtral (pp + sp to fix)
* [chore] minor fix after rebase
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* [chore] solve moe ckpt test failure and some other arg pass failure
* [moe] remove ops
* [test] fix test: test_zero1_2
* [bug] fix: somehow logger hangs the program
* [moe] deepseek moe sp support
* [test] add check
* [deepseek] replace attn (a workaround for bug in transformers)
* [misc] skip redunant test
* [misc] remove debug/print code
* [moe] refactor mesh assignment
* Revert "[moe] implement submesh initialization"
This reverts commit 2f9bce6686d1415a83d5726dc5ff02222c742582.
* [chore] change moe_pg_mesh to private
* [misc] remove incompatible test config
* [misc] fix ci failure: change default value to false in moe plugin
* [misc] remove useless condition
* [chore] docstring
* [moe] remove force_overlap_comm flag and add warning instead
* [doc] add MoeHybridParallelPlugin docstring
* [moe] solve dp axis issue
* [chore] remove redundant test case, print string & reduce test tokens
* [feat] Dist Loader for Eval (#5950)
* support auto distributed data loader
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* support auto distributed data loader
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix tp error
* remove unused parameters
* remove unused
* update inference
* update docs
* update inference
---------
Co-authored-by: Michelle <qianranma8@gmail.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [lora] lora support hybrid parallel plugin (#5956)
* lora support hybrid plugin
* fix
* fix
* fix
* fix
* Support overall loss, update KTO logging
* [Docs] clarify launch port
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [Hotfix] README link (#5966)
* update ignore
* update readme
* run style
* update readme
* [Hotfix] Avoid fused RMSnorm import error without apex (#5985)
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [Chat] fix readme (#5989)
* fix readme
* fix readme, tokenization fully tested
* fix readme, tokenization fully tested
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: root <root@notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9-0.notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9.colossal-ai.svc.cluster.local>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* fix sync condition (#6000)
* [plugin] add cast inputs option for zero (#6003)
* [pre-commit.ci] pre-commit autoupdate (#5995)
updates:
- [github.com/psf/black-pre-commit-mirror: 24.4.2 → 24.8.0](https://github.com/psf/black-pre-commit-mirror/compare/24.4.2...24.8.0)
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [misc] Bypass the huggingface bug to solve the mask mismatch problem (#5991)
* [Feature] Zigzag Ring attention (#5905)
* halfway
* fix cross-PP-stage position id length diff bug
* fix typo
* fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* unified cross entropy func for all shardformer models
* remove redundant lines
* add basic ring attn; debug cross entropy
* fwd bwd logic complete
* fwd bwd logic complete; add experimental triton rescale
* precision tests passed
* precision tests passed
* fix typos and remove misc files
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* add sp_mode to benchmark; fix varlen interface
* update softmax_lse shape by new interface
* change tester name
* remove buffer clone; support packed seq layout
* add varlen tests
* fix typo
* all tests passed
* add dkv_group; fix mask
* remove debug statements
---------
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [misc] update compatibility (#6008)
* [misc] update compatibility
* [misc] update requirements
* [devops] disable requirements cache
* [test] fix torch ddp test
* [test] fix rerun on address in use
* [test] fix lazy init
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix the merge
* fix the merge
* overlap kv comm with output rescale (#6017)
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* fix the merge
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix the merge
* fix
* fix
* fix the merge
* fix
* [misc] Use dist logger in plugins (#6011)
* use dist logger in plugins
* remove trash
* print on rank 0
---------
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* fix
* fix
* fix
* fix
* fix the merge
* fix
* fix
* fix
* fix
---------
Co-authored-by: YeAnbang <anbangy2@outlook.com>
Co-authored-by: Haze188 <haze188@qq.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Edenzzzz <wenxuan.tan@wisc.edu>
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: Runyu Lu <77330637+LRY89757@users.noreply.github.com>
Co-authored-by: Guangyao Zhang <xjtu521@qq.com>
Co-authored-by: YeAnbang <44796419+YeAnbang@users.noreply.github.com>
Co-authored-by: Hongxin Liu <lhx0217@gmail.com>
Co-authored-by: Stephan Kö <stephankoe@users.noreply.github.com>
Co-authored-by: アマデウス <kurisusnowdeng@users.noreply.github.com>
Co-authored-by: Tong Li <tong.li352711588@gmail.com>
Co-authored-by: zhurunhua <1281592874@qq.com>
Co-authored-by: Insu Jang <insujang@umich.edu>
Co-authored-by: Gao, Ruiyuan <905370712@qq.com>
Co-authored-by: hxwang <wang1570@e.ntu.edu.sg>
Co-authored-by: Michelle <qianranma8@gmail.com>
Co-authored-by: root <root@notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9-0.notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9.colossal-ai.svc.cluster.local>
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* Update train_dpo.py
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* Update low_level_zero_plugin.py
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* [CI] Remove triton version for compatibility bug; update req torch >=2.2 (#6018)
* remove triton version
* remove torch 2.2
* remove torch 2.1
* debug
* remove 2.1 build tests
* require torch >=2.2
---------
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [plugin] hotfix zero plugin (#6036)
* [plugin] hotfix zero plugin
* [plugin] hotfix zero plugin
* [Colossal-LLaMA] Refactor latest APIs (#6030)
* refactor latest code
* update api
* add dummy dataset
* update Readme
* add setup
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* update files
* add PP support
* update arguments
* update argument
* reorg folder
* update version
* remove IB infor
* update utils
* update readme
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* update save for zero
* update save
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* add apex
* update
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* add fused norm (#6038)
* [FP8] unsqueeze scale to make it compatible with torch.compile (#6040)
* [colossalai/checkpoint_io/...] fix bug in load_state_dict_into_model; format error msg (#6020)
* fix bug in load_state_dict_into_model; format error msg
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* Update utils.py
to support checking missing_keys
* Update general_checkpoint_io.py
fix bug in missing_keys error message
* retrigger tests
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [Hotfix] Remove deprecated install (#6042)
* remove deprecated install
* remove unused folder
* [fp8] optimize all-gather (#6043)
* [fp8] optimize all-gather
* [fp8] fix all gather fp8 ring
* [fp8] enable compile
* [fp8] fix all gather fp8 ring
* [fp8] fix linear hook (#6046)
* [fp8] disable all_to_all_fp8 in intranode (#6045)
* enhance all_to_all_fp8 with internode comm control
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* disable some fp8 ops due to performance issue
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [release] update version (#6041)
* [release] update version
* [devops] update comp test
* [devops] update comp test debug
* [devops] debug comp test
* [devops] debug comp test
* [devops] debug comp test
* [devops] debug comp test
* [devops] debug comp test
* [Feature] Split cross-entropy computation in SP (#5959)
* halfway
* fix cross-PP-stage position id length diff bug
* fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* unified cross entropy func for all shardformer models
* remove redundant lines
* add basic ring attn; debug cross entropy
* fwd bwd logic complete
* fwd bwd logic complete; add experimental triton rescale
* precision tests passed
* precision tests passed
* fix typos and remove misc files
* update softmax_lse shape by new interface
* change tester name
* remove buffer clone; support packed seq layout
* add varlen tests
* fix typo
* all tests passed
* add dkv_group; fix mask
* remove debug statements
* adapt chatglm, command-R, qwen
* debug
* halfway
* fix cross-PP-stage position id length diff bug
* fix typo
* fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* unified cross entropy func for all shardformer models
* remove redundant lines
* add basic ring attn; debug cross entropy
* fwd bwd logic complete
* fwd bwd logic complete; add experimental triton rescale
* precision tests passed
* precision tests passed
* fix typos and remove misc files
* add sp_mode to benchmark; fix varlen interface
* update softmax_lse shape by new interface
* add varlen tests
* fix typo
* all tests passed
* add dkv_group; fix mask
* remove debug statements
* add comments
* q1 index only once
* remove events to simplify stream sync
* simplify forward/backward logic
* 2d ring forward passed
* 2d ring backward passed
* fixes
* fix ring attn loss
* 2D ring backward + llama passed
* merge
* update logger
* fix typo
* rebase
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix typo
* remove typos
* fixes
* support GPT
---------
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [hotfix] moe hybrid parallelism benchmark & follow-up fix (#6048)
* [example] pass use_fp8_comm flag to all plugins
* [example] add mixtral benchmark
* [moe] refine assertion and check
* [moe] fix mixtral & add more tests
* [moe] consider checking dp * sp group and moe_dp_group
* [mixtral] remove gate tp & add more tests
* [deepseek] fix tp & sp for deepseek
* [mixtral] minor fix
* [deepseek] add deepseek benchmark
* [fp8] hotfix backward hook (#6053)
* [fp8] hotfix backward hook
* [fp8] hotfix pipeline loss accumulation
* [doc] update sp doc (#6055)
* update sp doc
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* fix the sp
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix the attn
* fix
* fix
* fix
* fix
* [zerobubble]Support ZeroBubble Pipeline (#6034)
* [feat] add zerobubble pp (just a frame now); add POC test for dx_dw; add test for zerobubble;
* [feat] add dw test;
* [fix] fix weight not close;
* [update] update text;
* [feat] add test run_fwd_bwd automatic scheduling;
* [feat] split communication and calculation; fix pop empty send_bwd_buffer error;
* [feat] add test for p & p grad;
* [feat] add comments for ZBV func;
* [fix] rm useless assign and comments;
* [fix] fix ci test; add pytest;
* [feat] add run_fwd_bwd_with_microbatch (replace input) & test; add p&p.grad assert close test & all pass;
* [feat] add apply v_schedule graph; p & p.grad assert err exist;
* [fix] update
* [feat] fix ci; add assert;
* [feat] fix poc format
* [feat] fix func name & ci; add comments;
* [fix] fix poc test; add comments in poc;
* [feat] add optim backward_b_by_grad
* [feat] fix optimizer bwd b & w; support return accum loss & output
* [feat] add fwd_bwd_step, run_fwd_only;
* [fix] fix optim bwd; add license for v_schedule; remove redundant attributes; fix schedule loop "while"--> "for"; add communication dict;
* [fix] fix communication_map;
* [feat] update test; rm comments;
* [fix] rm zbv in hybridplugin
* [fix] fix optim bwd;
* [fix] fix optim bwd;
* [fix] rm output.data after send fwd;
* [fix] fix bwd step if condition; remove useless comments and format info;
* [fix] fix detach output & release output;
* [fix] rm requir_grad for output;
* [fix] fix requir grad position and detach position and input&output local buffer append position;
* [feat] add memory assertation;
* [fix] fix mem check;
* [fix] mem assertation'
* [fix] fix mem assertation
* [fix] fix mem; use a new model shape; only assert mem less and equal than theo;
* [fix] fix model zoo import;
* [fix] fix redundant detach & clone; add buffer assertation in the end;
* [fix] add output_obj_grad assert None at bwd b step; replace input_obj.require_grad_ with treemap;
* [fix] update optim state dict assert (include param group & state); fix mem assert after add optim;
* [fix] add testcase with microbatch 4;
* [fp8] fix missing fp8_comm flag in mixtral (#6057)
* fix
* fix
* fix
* [fp8] Disable all_gather intranode. Disable Redundant all_gather fp8 (#6059)
* all_gather only internode, fix pytest
* fix cuda arch <89 compile pytest error
* fix pytest failure
* disable all_gather_into_tensor_flat_fp8
* fix fp8 format
* fix pytest
* fix conversations
* fix chunk tuple to list
* [doc] FP8 training and communication document (#6050)
* Add FP8 training and communication document
* add fp8 docstring for plugins
* fix typo
* fix typo
* fix
* fix
* [moe] add parallel strategy for shared_expert && fix test for deepseek (#6063)
* [ColossalEval] support for vllm (#6056)
* support vllm
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* modify vllm and update readme
* run pre-commit
* remove dupilicated lines and refine code
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* update param name
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* refine code
* update readme
* refine code
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [release] update version (#6062)
* [feat] add zerobubble pp (just a frame now); add POC test for dx_dw; add test for zerobubble;
* [update] update text;
* [feat] add test run_fwd_bwd automatic scheduling;
* [feat] fix poc format
* [fix] fix poc test; add comments in poc;
* [feat] add optim backward_b_by_grad
* [feat] fix optimizer bwd b & w; support return accum loss & output
* [fix] fix optim bwd; add license for v_schedule; remove redundant attributes; fix schedule loop "while"--> "for"; add communication dict;
* [feat] update test; rm comments;
* [fix] rm zbv in hybridplugin
* [fix] fix optim bwd;
* [fix] fix optim bwd;
* [fix] rm output.data after send fwd;
* [fix] fix bwd step if condition; remove useless comments and format info;
* [fix] fix mem check;
* [fix] fix mem assertation
* [fix] fix mem; use a new model shape; only assert mem less and equal than theo;
* [fix] fix model zoo import;
* [feat] moehybrid support zerobubble;
* [fix] fix zerobubble pp for shardformer type input;
* [fix] fix require_grad & deallocate call;
* [fix] fix mem assert;
* [fix] fix fwd branch, fwd pass both micro_batch & internal_inputs'
* [fix] fix pipeline util func deallocate --> release_tensor_data; fix bwd_b loss bwd branch;
* [fix] fix zerobubble; support shardformer model type;
* [fix] fix test_pipeline_utils ci;
* [plugin] hybrid support zero bubble pipeline (#6060)
* hybrid support zbv
* fix
fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* Update zero_bubble_pp.py
* fix
* fix-ci
* fix
[pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* fix
* [zerobubble]Support ZeroBubble Pipeline (#6034)
* [feat] add zerobubble pp (just a frame now); add POC test for dx_dw; add test for zerobubble;
* [feat] add dw test;
* [fix] fix weight not close;
* [update] update text;
* [feat] add test run_fwd_bwd automatic scheduling;
* [feat] split communication and calculation; fix pop empty send_bwd_buffer error;
* [feat] add test for p & p grad;
* [feat] add comments for ZBV func;
* [fix] rm useless assign and comments;
* [fix] fix ci test; add pytest;
* [feat] add run_fwd_bwd_with_microbatch (replace input) & test; add p&p.grad assert close test & all pass;
* [feat] add apply v_schedule graph; p & p.grad assert err exist;
* [fix] update
* [feat] fix ci; add assert;
* [feat] fix poc format
* [feat] fix func name & ci; add comments;
* [fix] fix poc test; add comments in poc;
* [feat] add optim backward_b_by_grad
* [feat] fix optimizer bwd b & w; support return accum loss & output
* [feat] add fwd_bwd_step, run_fwd_only;
* [fix] fix optim bwd; add license for v_schedule; remove redundant attributes; fix schedule loop "while"--> "for"; add communication dict;
* [fix] fix communication_map;
* [feat] update test; rm comments;
* [fix] rm zbv in hybridplugin
* [fix] fix optim bwd;
* [fix] fix optim bwd;
* [fix] rm output.data after send fwd;
* [fix] fix bwd step if condition; remove useless comments and format info;
* [fix] fix detach output & release output;
* [fix] rm requir_grad for output;
* [fix] fix requir grad position and detach position and input&output local buffer append position;
* [feat] add memory assertation;
* [fix] fix mem check;
* [fix] mem assertation'
* [fix] fix mem assertation
* [fix] fix mem; use a new model shape; only assert mem less and equal than theo;
* [fix] fix model zoo import;
* [fix] fix redundant detach & clone; add buffer assertation in the end;
* [fix] add output_obj_grad assert None at bwd b step; replace input_obj.require_grad_ with treemap;
* [fix] update optim state dict assert (include param group & state); fix mem assert after add optim;
* [fix] add testcase with microbatch 4;
* hybrid support zbv
* fix
fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* Update zero_bubble_pp.py
* fix
* fix-ci
* fix
[pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* fix
* fix
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* fix
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: duanjunwen <935724073@qq.com>
* [feat] add zerobubble pp (just a frame now); add POC test for dx_dw; add test for zerobubble;
* [update] update text;
* [feat] add test run_fwd_bwd automatic scheduling;
* [feat] fix poc format
* [fix] fix poc test; add comments in poc;
* [feat] add optim backward_b_by_grad
* [feat] fix optimizer bwd b & w; support return accum loss & output
* [fix] fix optim bwd; add license for v_schedule; remove redundant attributes; fix schedule loop "while"--> "for"; add communication dict;
* [feat] update test; rm comments;
* [fix] fix optim bwd;
* [fix] fix optim bwd;
* [fix] rm output.data after send fwd;
* [fix] fix bwd step if condition; remove useless comments and format info;
* [fix] fix mem check;
* [fix] fix mem assertation
* [fix] fix mem; use a new model shape; only assert mem less and equal than theo;
* [fix] fix model zoo import;
* [fix] fix mem assert;
* [fix] fix fwd branch, fwd pass both micro_batch & internal_inputs'
* [plugin] hybrid support zero bubble pipeline (#6060)
* hybrid support zbv
* fix
fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* Update zero_bubble_pp.py
* fix
* fix-ci
* fix
[pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* fix
* [zerobubble]Support ZeroBubble Pipeline (#6034)
* [feat] add zerobubble pp (just a frame now); add POC test for dx_dw; add test for zerobubble;
* [feat] add dw test;
* [fix] fix weight not close;
* [update] update text;
* [feat] add test run_fwd_bwd automatic scheduling;
* [feat] split communication and calculation; fix pop empty send_bwd_buffer error;
* [feat] add test for p & p grad;
* [feat] add comments for ZBV func;
* [fix] rm useless assign and comments;
* [fix] fix ci test; add pytest;
* [feat] add run_fwd_bwd_with_microbatch (replace input) & test; add p&p.grad assert close test & all pass;
* [feat] add apply v_schedule graph; p & p.grad assert err exist;
* [fix] update
* [feat] fix ci; add assert;
* [feat] fix poc format
* [feat] fix func name & ci; add comments;
* [fix] fix poc test; add comments in poc;
* [feat] add optim backward_b_by_grad
* [feat] fix optimizer bwd b & w; support return accum loss & output
* [feat] add fwd_bwd_step, run_fwd_only;
* [fix] fix optim bwd; add license for v_schedule; remove redundant attributes; fix schedule loop "while"--> "for"; add communication dict;
* [fix] fix communication_map;
* [feat] update test; rm comments;
* [fix] rm zbv in hybridplugin
* [fix] fix optim bwd;
* [fix] fix optim bwd;
* [fix] rm output.data after send fwd;
* [fix] fix bwd step if condition; remove useless comments and format info;
* [fix] fix detach output & release output;
* [fix] rm requir_grad for output;
* [fix] fix requir grad position and detach position and input&output local buffer append position;
* [feat] add memory assertation;
* [fix] fix mem check;
* [fix] mem assertation'
* [fix] fix mem assertation
* [fix] fix mem; use a new model shape; only assert mem less and equal than theo;
* [fix] fix model zoo import;
* [fix] fix redundant detach & clone; add buffer assertation in the end;
* [fix] add output_obj_grad assert None at bwd b step; replace input_obj.require_grad_ with treemap;
* [fix] update optim state dict assert (include param group & state); fix mem assert after add optim;
* [fix] add testcase with microbatch 4;
* hybrid support zbv
* fix
fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* Update zero_bubble_pp.py
* fix
* fix-ci
* fix
[pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* fix
* fix
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* fix
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: duanjunwen <935724073@qq.com>
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: HangXu <hangxu0304@gmail.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: GuangyaoZhang <xjtu521@qq.com>
Co-authored-by: Hongxin Liu <lhx0217@gmail.com>
Co-authored-by: YeAnbang <anbangy2@outlook.com>
Co-authored-by: Haze188 <haze188@qq.com>
Co-authored-by: Edenzzzz <wenxuan.tan@wisc.edu>
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: Runyu Lu <77330637+LRY89757@users.noreply.github.com>
Co-authored-by: YeAnbang <44796419+YeAnbang@users.noreply.github.com>
Co-authored-by: Stephan Kö <stephankoe@users.noreply.github.com>
Co-authored-by: アマデウス <kurisusnowdeng@users.noreply.github.com>
Co-authored-by: Tong Li <tong.li352711588@gmail.com>
Co-authored-by: zhurunhua <1281592874@qq.com>
Co-authored-by: Insu Jang <insujang@umich.edu>
Co-authored-by: Gao, Ruiyuan <905370712@qq.com>
Co-authored-by: hxwang <wang1570@e.ntu.edu.sg>
Co-authored-by: Michelle <qianranma8@gmail.com>
Co-authored-by: Wang Binluo <32676639+wangbluo@users.noreply.github.com>
Co-authored-by: wangbluo <2538539015@qq.com>
Co-authored-by: root <root@notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9-0.notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9.colossal-ai.svc.cluster.local>
Co-authored-by: duanjunwen <935724073@qq.com>
Co-authored-by: Camille Zhong <44392324+Camille7777@users.noreply.github.com>
2024-10-08 07:58:00 +00:00
|
|
|
# Keep the input split across all PP stages
|
|
|
|
if stage_manager.is_first_stage():
|
|
|
|
if shard_config.enable_sequence_parallelism:
|
|
|
|
if sp_mode == "split_gather":
|
|
|
|
hidden_states = split_forward_gather_backward(
|
|
|
|
hidden_states,
|
|
|
|
dim=0,
|
|
|
|
process_group=sp_group,
|
|
|
|
)
|
|
|
|
elif shard_config.sequence_parallelism_mode == "all_to_all":
|
|
|
|
hidden_states = split_forward_gather_backward(
|
|
|
|
hidden_states,
|
|
|
|
dim=0,
|
|
|
|
process_group=shard_config.sequence_parallel_process_group,
|
|
|
|
grad_scale=1 / shard_config.sequence_parallel_size,
|
|
|
|
)
|
|
|
|
|
2023-08-04 06:55:31 +00:00
|
|
|
for idx in range(start_idx, end_idx):
|
|
|
|
layer = self.encoder._get_layer(idx)
|
|
|
|
if output_hidden_states:
|
|
|
|
all_hidden_states = all_hidden_states + (hidden_states,)
|
|
|
|
if self.encoder.gradient_checkpointing and self.encoder.training:
|
2023-09-19 06:20:26 +00:00
|
|
|
layer_ret = torch.utils.checkpoint.checkpoint(
|
2024-03-27 03:19:32 +00:00
|
|
|
layer,
|
|
|
|
hidden_states,
|
|
|
|
attention_mask,
|
|
|
|
rotary_pos_emb,
|
|
|
|
past_key_values[idx],
|
|
|
|
use_cache,
|
2023-09-19 06:20:26 +00:00
|
|
|
)
|
2023-08-04 06:55:31 +00:00
|
|
|
else:
|
2023-09-19 06:20:26 +00:00
|
|
|
layer_ret = layer(
|
|
|
|
hidden_states,
|
|
|
|
full_attention_mask,
|
|
|
|
rotary_pos_emb,
|
|
|
|
kv_cache=past_key_values[idx],
|
|
|
|
use_cache=use_cache,
|
|
|
|
)
|
2023-08-04 06:55:31 +00:00
|
|
|
hidden_states, kv_cache = layer_ret
|
|
|
|
if use_cache:
|
|
|
|
presents = presents + (kv_cache,)
|
2023-08-22 15:59:31 +00:00
|
|
|
|
2023-08-04 06:55:31 +00:00
|
|
|
if output_hidden_states:
|
|
|
|
all_hidden_states = all_hidden_states + (hidden_states,)
|
|
|
|
if stage_manager.is_last_stage():
|
|
|
|
# final layer_norm
|
|
|
|
if self.encoder.post_layer_norm:
|
|
|
|
hidden_states = self.encoder.final_layernorm(hidden_states)
|
[zerobubble] rebase main (#6075)
* fp8 operators for compressed communication
cast_to_fp8, cast_from_fp8, all_reduce_fp8
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix typo
* fix scaling algorithm in FP8 casting
* support fp8 communication in pipeline parallelism
* add fp8_communication flag in the script
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* shardformer fp8
* fix rebase
* remove all to all
* fix shardformer fp8 communication training degradation
* [fp8] support all-gather flat tensor (#5932)
* [fp8] add fp8 comm for low level zero
* [test] add zero fp8 test case
* [Feature] llama shardformer fp8 support (#5938)
* add llama shardformer fp8
* Llama Shardformer Parity
* fix typo
* fix all reduce
* fix pytest failure
* fix reduce op and move function to fp8.py
* fix typo
* [FP8] rebase main (#5963)
* add SimPO
* fix dataloader
* remove debug code
* add orpo
* fix style
* fix colossalai, transformers version
* fix colossalai, transformers version
* fix colossalai, transformers version
* fix torch colossalai version
* update transformers version
* [shardformer] DeepseekMoE support (#5871)
* [Feature] deepseek moe expert parallel implement
* [misc] fix typo, remove redundant file (#5867)
* [misc] fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [Feature] deepseek support & unit test
* [misc] remove debug code & useless print
* [misc] fix typos (#5872)
* [Feature] remove modeling file, use auto config. (#5884)
* [misc] fix typos
* [Feature] deepseek support via auto model, remove modeling file
* [misc] delete useless file
* [misc] fix typos
* [Deepseek] remove redundant code (#5888)
* [misc] fix typos
* [Feature] deepseek support via auto model, remove modeling file
* [misc] delete useless file
* [misc] fix typos
* [misc] remove redundant code
* [Feature/deepseek] resolve comment. (#5889)
* [misc] fix typos
* [Feature] deepseek support via auto model, remove modeling file
* [misc] delete useless file
* [misc] fix typos
* [misc] remove redundant code
* [misc] mv module replacement into if branch
* [misc] add some warning message and modify some code in unit test
* [misc] fix typos
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [Hoxfix] Fix CUDA_DEVICE_MAX_CONNECTIONS for comm overlap
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [Feat] Diffusion Model(PixArtAlpha/StableDiffusion3) Support (#5838)
* Diffusion Model Inference support
* Stable Diffusion 3 Support
* pixartalpha support
* [HotFix] CI,import,requirements-test for #5838 (#5892)
* [Hot Fix] CI,import,requirements-test
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [Feature] Enable PP + SP for llama (#5868)
* fix cross-PP-stage position id length diff bug
* fix typo
* fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* use a one cross entropy func for all shardformer models
---------
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [ShardFormer] Add Ulysses Sequence Parallelism support for Command-R, Qwen2 and ChatGLM (#5897)
* add benchmark for sft, dpo, simpo, orpo. Add benchmarking result. Support lora with gradient checkpoint
* fix style
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix eval
* hotfix citation
* [zero] support all-gather overlap (#5898)
* [zero] support all-gather overlap
* [zero] add overlap all-gather flag
* [misc] fix typo
* [zero] update api
* fix orpo cross entropy loss
* [Auto Parallel]: Speed up intra-op plan generation by 44% (#5446)
* Remove unnecessary calls to deepcopy
* Build DimSpec's difference dict only once
This change considerably speeds up construction speed of DimSpec objects. The difference_dict is the same for each DimSpec object, so a single copy of it is enough.
* Fix documentation of DimSpec's difference method
* [ShardFormer] fix qwen2 sp (#5903)
* [compatibility] support torch 2.2 (#5875)
* Support Pytorch 2.2.2
* keep build_on_pr file and update .compatibility
* fix object_to_tensor usage when torch>=2.3.0 (#5820)
* [misc] support torch2.3 (#5893)
* [misc] support torch2.3
* [devops] update compatibility ci
* [devops] update compatibility ci
* [devops] add debug
* [devops] add debug
* [devops] add debug
* [devops] add debug
* [devops] remove debug
* [devops] remove debug
* [release] update version (#5912)
* [plugin] support all-gather overlap for hybrid parallel (#5919)
* [plugin] fixed all-gather overlap support for hybrid parallel
* add kto
* fix style, add kto data sample
* [Examples] Add lazy init to OPT and GPT examples (#5924)
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [ColossalChat] Hotfix for ColossalChat (#5910)
* add ignore and tiny llama
* fix path issue
* run style
* fix issue
* update bash
* add ignore and tiny llama
* fix path issue
* run style
* fix issue
* update bash
* fix ddp issue
* add Qwen 1.5 32B
* refactor tokenization
* [FIX BUG] UnboundLocalError: cannot access local variable 'default_conversation' where it is not associated with a value (#5931)
* cannot access local variable 'default_conversation' where it is not associated with a value
set default value for 'default_conversation'
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* fix test data
* refactor evaluation
* remove real data path
* remove real data path
* Add n_fused as an input from native_module (#5894)
* [FIX BUG] convert env param to int in (#5934)
* [Hotfix] Fix ZeRO typo #5936
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [Feature] Add a switch to control whether the model checkpoint needs to be saved after each epoch ends (#5941)
* Add a switch to control whether the model checkpoint needs to be saved after each epoch ends
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* fix style
* fix style
* fix style
* [shardformer] hotfix attn mask (#5945)
* [shardformer] hotfix attn mask (#5947)
* [Feat] Distrifusion Acceleration Support for Diffusion Inference (#5895)
* Distrifusion Support source
* comp comm overlap optimization
* sd3 benchmark
* pixart distrifusion bug fix
* sd3 bug fix and benchmark
* generation bug fix
* naming fix
* add docstring, fix counter and shape error
* add reference
* readme and requirement
* [zero] hotfix update master params (#5951)
* [release] update version (#5952)
* [Chat] Fix lora (#5946)
* fix merging
* remove filepath
* fix style
* Update README.md (#5958)
* [hotfix] Remove unused plan section (#5957)
* remove readme
* fix readme
* update
* [test] add mixtral for sequence classification
* [test] add mixtral transformer test
* [moe] fix plugin
* [test] mixtra pp shard test
* [chore] handle non member group
* [zero] solve hang
* [test] pass mixtral shardformer test
* [moe] implement transit between non moe tp and ep
* [zero] solve hang
* [misc] solve booster hang by rename the variable
* solve hang when parallel mode = pp + dp
* [moe] implement submesh initialization
* [moe] add mixtral dp grad scaling when not all experts are activated
* [chore] manually revert unintended commit
* [chore] trivial fix
* [chore] arg pass & remove drop token
* [test] add mixtral modelling test
* [moe] implement tp
* [moe] test deepseek
* [moe] clean legacy code
* [Feature] MoE Ulysses Support (#5918)
* moe sp support
* moe sp bug solve
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [chore] minor fix
* [moe] init moe plugin comm setting with sp
* moe sp + ep bug fix
* [moe] finalize test (no pp)
* [moe] full test for deepseek and mixtral (pp + sp to fix)
* [chore] minor fix after rebase
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* [chore] solve moe ckpt test failure and some other arg pass failure
* [moe] remove ops
* [test] fix test: test_zero1_2
* [bug] fix: somehow logger hangs the program
* [moe] deepseek moe sp support
* [test] add check
* [deepseek] replace attn (a workaround for bug in transformers)
* [misc] skip redunant test
* [misc] remove debug/print code
* [moe] refactor mesh assignment
* Revert "[moe] implement submesh initialization"
This reverts commit 2f9bce6686d1415a83d5726dc5ff02222c742582.
* [chore] change moe_pg_mesh to private
* [misc] remove incompatible test config
* [misc] fix ci failure: change default value to false in moe plugin
* [misc] remove useless condition
* [chore] docstring
* [moe] remove force_overlap_comm flag and add warning instead
* [doc] add MoeHybridParallelPlugin docstring
* [moe] solve dp axis issue
* [chore] remove redundant test case, print string & reduce test tokens
* [feat] Dist Loader for Eval (#5950)
* support auto distributed data loader
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* support auto distributed data loader
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix tp error
* remove unused parameters
* remove unused
* update inference
* update docs
* update inference
---------
Co-authored-by: Michelle <qianranma8@gmail.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [lora] lora support hybrid parallel plugin (#5956)
* lora support hybrid plugin
* fix
* fix
* fix
* fix
* fp8 operators for compressed communication
cast_to_fp8, cast_from_fp8, all_reduce_fp8
* fix scaling algorithm in FP8 casting
* support fp8 communication in pipeline parallelism
* add fp8_communication flag in the script
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* shardformer fp8
* fix rebase
* remove all to all
* fix shardformer fp8 communication training degradation
* [fp8] support all-gather flat tensor (#5932)
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* Update low_level_optim.py
---------
Co-authored-by: YeAnbang <anbangy2@outlook.com>
Co-authored-by: Haze188 <haze188@qq.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Edenzzzz <wenxuan.tan@wisc.edu>
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: Runyu Lu <77330637+LRY89757@users.noreply.github.com>
Co-authored-by: Guangyao Zhang <xjtu521@qq.com>
Co-authored-by: YeAnbang <44796419+YeAnbang@users.noreply.github.com>
Co-authored-by: Hongxin Liu <lhx0217@gmail.com>
Co-authored-by: Stephan Kö <stephankoe@users.noreply.github.com>
Co-authored-by: アマデウス <kurisusnowdeng@users.noreply.github.com>
Co-authored-by: Tong Li <tong.li352711588@gmail.com>
Co-authored-by: zhurunhua <1281592874@qq.com>
Co-authored-by: Insu Jang <insujang@umich.edu>
Co-authored-by: Gao, Ruiyuan <905370712@qq.com>
Co-authored-by: hxwang <wang1570@e.ntu.edu.sg>
Co-authored-by: Michelle <qianranma8@gmail.com>
Co-authored-by: Wang Binluo <32676639+wangbluo@users.noreply.github.com>
Co-authored-by: HangXu <hangxu0304@gmail.com>
* [fp8]support all2all fp8 (#5953)
* support all2all fp8
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [fp8] add fp8 linear (#5967)
* [fp8] add fp8 linear
* [test] fix fp8 linear test condition
* [test] fix fp8 linear test condition
* [test] fix fp8 linear test condition
* [fp8] support fp8 amp for hybrid parallel plugin (#5975)
* [fp8] support fp8 amp for hybrid parallel plugin
* [test] add fp8 hook test
* [fp8] fix fp8 linear compatibility
* fix (#5976)
* [Feature]: support FP8 communication in DDP, FSDP, Gemini (#5928)
* support fp8_communication in the Torch DDP grad comm, FSDP grad comm, and FSDP params comm
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* implement communication hook for FSDP params all-gather
* added unit test for fp8 operators
* support fp8 communication in GeminiPlugin
* update training scripts to support fsdp and fp8 communication
* fixed some minor bugs observed in unit test
* add all_gather_into_tensor_flat_fp8
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* add skip the test if torch < 2.2.0
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* add skip the test if torch < 2.2.0
* add skip the test if torch < 2.2.0
* add fp8_comm flag
* rebase latest fp8 operators
* rebase latest fp8 operators
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [test ci]Feature/fp8 comm (#5981)
* fix
* fix
* fix
* [fp8] support gemini plugin (#5978)
* [fp8] refactor hook
* [fp8] support gemini plugin
* [example] add fp8 option for llama benchmark
* [fp8] use torch compile (torch >= 2.3.0) (#5979)
* [fp8] use torch compile (torch >= 2.4.0)
* [fp8] set use_fast_accum in linear
* [chore] formal version check
* [chore] fix sig
* [fp8]Moe support fp8 communication (#5977)
* fix
* support moe fp8
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
fix
fi
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [fp8] support hybrid parallel plugin (#5982)
* support fp8 comm for qwen2 model
* support fp8 comm for qwen2 model
* support fp8 comm for qwen2 model
* fp8
* fix
* bert and bloom
* chatglm and command
* gpt2,gptj,bert, falcon,blip2
* mistral,opy,sam,t5,vit,whisper
* fix
* fix
* fix
* [fp8] refactor fp8 linear with compile (#5993)
* [fp8] refactor fp8 linear with compile
* [fp8] fix linear test
* [fp8] fix linear test
* [fp8] support asynchronous FP8 communication (#5997)
* fix
* fix
* fix
* support async all2all
* support async op for all gather
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [fp8] update torch.compile for linear_fp8 to >= 2.4.0 (#6004)
* [fp8] linear perf enhancement
* [fp8]update reduce-scatter test (#6002)
* fix
* fix
* fix
* fix
* [fp8] add use_fp8 option for MoeHybridParallelPlugin (#6009)
* [fp8] zero support fp8 linear. (#6006)
* fix
* fix
* fix
* zero fp8
* zero fp8
* Update requirements.txt
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix the merge
* fix the merge
* fix the merge
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix the merge
* fix
* fix
* fix the merge
* fix
* fix
* fix
* fix
* fix
* fix the merge
* fix
* fix
* fix
* fix
* [fp8] Merge feature/fp8_comm to main branch of Colossalai (#6016)
* add SimPO
* fix dataloader
* remove debug code
* add orpo
* fix style
* fix colossalai, transformers version
* fix colossalai, transformers version
* fix colossalai, transformers version
* fix torch colossalai version
* update transformers version
* [shardformer] DeepseekMoE support (#5871)
* [Feature] deepseek moe expert parallel implement
* [misc] fix typo, remove redundant file (#5867)
* [misc] fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [Feature] deepseek support & unit test
* [misc] remove debug code & useless print
* [misc] fix typos (#5872)
* [Feature] remove modeling file, use auto config. (#5884)
* [misc] fix typos
* [Feature] deepseek support via auto model, remove modeling file
* [misc] delete useless file
* [misc] fix typos
* [Deepseek] remove redundant code (#5888)
* [misc] fix typos
* [Feature] deepseek support via auto model, remove modeling file
* [misc] delete useless file
* [misc] fix typos
* [misc] remove redundant code
* [Feature/deepseek] resolve comment. (#5889)
* [misc] fix typos
* [Feature] deepseek support via auto model, remove modeling file
* [misc] delete useless file
* [misc] fix typos
* [misc] remove redundant code
* [misc] mv module replacement into if branch
* [misc] add some warning message and modify some code in unit test
* [misc] fix typos
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [Hoxfix] Fix CUDA_DEVICE_MAX_CONNECTIONS for comm overlap
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [Feat] Diffusion Model(PixArtAlpha/StableDiffusion3) Support (#5838)
* Diffusion Model Inference support
* Stable Diffusion 3 Support
* pixartalpha support
* [HotFix] CI,import,requirements-test for #5838 (#5892)
* [Hot Fix] CI,import,requirements-test
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [Feature] Enable PP + SP for llama (#5868)
* fix cross-PP-stage position id length diff bug
* fix typo
* fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* use a one cross entropy func for all shardformer models
---------
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [ShardFormer] Add Ulysses Sequence Parallelism support for Command-R, Qwen2 and ChatGLM (#5897)
* add benchmark for sft, dpo, simpo, orpo. Add benchmarking result. Support lora with gradient checkpoint
* fix style
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix eval
* hotfix citation
* [zero] support all-gather overlap (#5898)
* [zero] support all-gather overlap
* [zero] add overlap all-gather flag
* [misc] fix typo
* [zero] update api
* fix orpo cross entropy loss
* [Auto Parallel]: Speed up intra-op plan generation by 44% (#5446)
* Remove unnecessary calls to deepcopy
* Build DimSpec's difference dict only once
This change considerably speeds up construction speed of DimSpec objects. The difference_dict is the same for each DimSpec object, so a single copy of it is enough.
* Fix documentation of DimSpec's difference method
* [ShardFormer] fix qwen2 sp (#5903)
* [compatibility] support torch 2.2 (#5875)
* Support Pytorch 2.2.2
* keep build_on_pr file and update .compatibility
* fix object_to_tensor usage when torch>=2.3.0 (#5820)
* [misc] support torch2.3 (#5893)
* [misc] support torch2.3
* [devops] update compatibility ci
* [devops] update compatibility ci
* [devops] add debug
* [devops] add debug
* [devops] add debug
* [devops] add debug
* [devops] remove debug
* [devops] remove debug
* [release] update version (#5912)
* [plugin] support all-gather overlap for hybrid parallel (#5919)
* [plugin] fixed all-gather overlap support for hybrid parallel
* add kto
* fix style, add kto data sample
* [Examples] Add lazy init to OPT and GPT examples (#5924)
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [ColossalChat] Hotfix for ColossalChat (#5910)
* add ignore and tiny llama
* fix path issue
* run style
* fix issue
* update bash
* add ignore and tiny llama
* fix path issue
* run style
* fix issue
* update bash
* fix ddp issue
* add Qwen 1.5 32B
* refactor tokenization
* [FIX BUG] UnboundLocalError: cannot access local variable 'default_conversation' where it is not associated with a value (#5931)
* cannot access local variable 'default_conversation' where it is not associated with a value
set default value for 'default_conversation'
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* fix test data
* refactor evaluation
* remove real data path
* remove real data path
* Add n_fused as an input from native_module (#5894)
* [FIX BUG] convert env param to int in (#5934)
* [Hotfix] Fix ZeRO typo #5936
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [Feature] Add a switch to control whether the model checkpoint needs to be saved after each epoch ends (#5941)
* Add a switch to control whether the model checkpoint needs to be saved after each epoch ends
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* fix style
* fix style
* fix style
* [shardformer] hotfix attn mask (#5945)
* [shardformer] hotfix attn mask (#5947)
* [Feat] Distrifusion Acceleration Support for Diffusion Inference (#5895)
* Distrifusion Support source
* comp comm overlap optimization
* sd3 benchmark
* pixart distrifusion bug fix
* sd3 bug fix and benchmark
* generation bug fix
* naming fix
* add docstring, fix counter and shape error
* add reference
* readme and requirement
* [zero] hotfix update master params (#5951)
* [release] update version (#5952)
* [Chat] Fix lora (#5946)
* fix merging
* remove filepath
* fix style
* Update README.md (#5958)
* [hotfix] Remove unused plan section (#5957)
* remove readme
* fix readme
* update
* [test] add mixtral for sequence classification
* [test] add mixtral transformer test
* [moe] fix plugin
* [test] mixtra pp shard test
* [chore] handle non member group
* [zero] solve hang
* [test] pass mixtral shardformer test
* [moe] implement transit between non moe tp and ep
* [zero] solve hang
* [misc] solve booster hang by rename the variable
* solve hang when parallel mode = pp + dp
* [moe] implement submesh initialization
* [moe] add mixtral dp grad scaling when not all experts are activated
* [chore] manually revert unintended commit
* [chore] trivial fix
* [chore] arg pass & remove drop token
* [test] add mixtral modelling test
* [moe] implement tp
* [moe] test deepseek
* [moe] clean legacy code
* [Feature] MoE Ulysses Support (#5918)
* moe sp support
* moe sp bug solve
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [chore] minor fix
* [moe] init moe plugin comm setting with sp
* moe sp + ep bug fix
* [moe] finalize test (no pp)
* [moe] full test for deepseek and mixtral (pp + sp to fix)
* [chore] minor fix after rebase
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* [chore] solve moe ckpt test failure and some other arg pass failure
* [moe] remove ops
* [test] fix test: test_zero1_2
* [bug] fix: somehow logger hangs the program
* [moe] deepseek moe sp support
* [test] add check
* [deepseek] replace attn (a workaround for bug in transformers)
* [misc] skip redunant test
* [misc] remove debug/print code
* [moe] refactor mesh assignment
* Revert "[moe] implement submesh initialization"
This reverts commit 2f9bce6686d1415a83d5726dc5ff02222c742582.
* [chore] change moe_pg_mesh to private
* [misc] remove incompatible test config
* [misc] fix ci failure: change default value to false in moe plugin
* [misc] remove useless condition
* [chore] docstring
* [moe] remove force_overlap_comm flag and add warning instead
* [doc] add MoeHybridParallelPlugin docstring
* [moe] solve dp axis issue
* [chore] remove redundant test case, print string & reduce test tokens
* [feat] Dist Loader for Eval (#5950)
* support auto distributed data loader
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* support auto distributed data loader
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix tp error
* remove unused parameters
* remove unused
* update inference
* update docs
* update inference
---------
Co-authored-by: Michelle <qianranma8@gmail.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [lora] lora support hybrid parallel plugin (#5956)
* lora support hybrid plugin
* fix
* fix
* fix
* fix
* Support overall loss, update KTO logging
* [Docs] clarify launch port
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [Hotfix] README link (#5966)
* update ignore
* update readme
* run style
* update readme
* [Hotfix] Avoid fused RMSnorm import error without apex (#5985)
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [Chat] fix readme (#5989)
* fix readme
* fix readme, tokenization fully tested
* fix readme, tokenization fully tested
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: root <root@notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9-0.notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9.colossal-ai.svc.cluster.local>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* fix sync condition (#6000)
* [plugin] add cast inputs option for zero (#6003)
* [pre-commit.ci] pre-commit autoupdate (#5995)
updates:
- [github.com/psf/black-pre-commit-mirror: 24.4.2 → 24.8.0](https://github.com/psf/black-pre-commit-mirror/compare/24.4.2...24.8.0)
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [misc] Bypass the huggingface bug to solve the mask mismatch problem (#5991)
* [Feature] Zigzag Ring attention (#5905)
* halfway
* fix cross-PP-stage position id length diff bug
* fix typo
* fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* unified cross entropy func for all shardformer models
* remove redundant lines
* add basic ring attn; debug cross entropy
* fwd bwd logic complete
* fwd bwd logic complete; add experimental triton rescale
* precision tests passed
* precision tests passed
* fix typos and remove misc files
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* add sp_mode to benchmark; fix varlen interface
* update softmax_lse shape by new interface
* change tester name
* remove buffer clone; support packed seq layout
* add varlen tests
* fix typo
* all tests passed
* add dkv_group; fix mask
* remove debug statements
---------
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [misc] update compatibility (#6008)
* [misc] update compatibility
* [misc] update requirements
* [devops] disable requirements cache
* [test] fix torch ddp test
* [test] fix rerun on address in use
* [test] fix lazy init
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix the merge
* fix the merge
* overlap kv comm with output rescale (#6017)
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* fix the merge
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix the merge
* fix
* fix
* fix the merge
* fix
* [misc] Use dist logger in plugins (#6011)
* use dist logger in plugins
* remove trash
* print on rank 0
---------
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* fix
* fix
* fix
* fix
* fix the merge
* fix
* fix
* fix
* fix
---------
Co-authored-by: YeAnbang <anbangy2@outlook.com>
Co-authored-by: Haze188 <haze188@qq.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Edenzzzz <wenxuan.tan@wisc.edu>
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: Runyu Lu <77330637+LRY89757@users.noreply.github.com>
Co-authored-by: Guangyao Zhang <xjtu521@qq.com>
Co-authored-by: YeAnbang <44796419+YeAnbang@users.noreply.github.com>
Co-authored-by: Hongxin Liu <lhx0217@gmail.com>
Co-authored-by: Stephan Kö <stephankoe@users.noreply.github.com>
Co-authored-by: アマデウス <kurisusnowdeng@users.noreply.github.com>
Co-authored-by: Tong Li <tong.li352711588@gmail.com>
Co-authored-by: zhurunhua <1281592874@qq.com>
Co-authored-by: Insu Jang <insujang@umich.edu>
Co-authored-by: Gao, Ruiyuan <905370712@qq.com>
Co-authored-by: hxwang <wang1570@e.ntu.edu.sg>
Co-authored-by: Michelle <qianranma8@gmail.com>
Co-authored-by: root <root@notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9-0.notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9.colossal-ai.svc.cluster.local>
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* Update train_dpo.py
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* Update low_level_zero_plugin.py
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* [CI] Remove triton version for compatibility bug; update req torch >=2.2 (#6018)
* remove triton version
* remove torch 2.2
* remove torch 2.1
* debug
* remove 2.1 build tests
* require torch >=2.2
---------
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [plugin] hotfix zero plugin (#6036)
* [plugin] hotfix zero plugin
* [plugin] hotfix zero plugin
* [Colossal-LLaMA] Refactor latest APIs (#6030)
* refactor latest code
* update api
* add dummy dataset
* update Readme
* add setup
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* update files
* add PP support
* update arguments
* update argument
* reorg folder
* update version
* remove IB infor
* update utils
* update readme
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* update save for zero
* update save
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* add apex
* update
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* add fused norm (#6038)
* [FP8] unsqueeze scale to make it compatible with torch.compile (#6040)
* [colossalai/checkpoint_io/...] fix bug in load_state_dict_into_model; format error msg (#6020)
* fix bug in load_state_dict_into_model; format error msg
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* Update utils.py
to support checking missing_keys
* Update general_checkpoint_io.py
fix bug in missing_keys error message
* retrigger tests
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [Hotfix] Remove deprecated install (#6042)
* remove deprecated install
* remove unused folder
* [fp8] optimize all-gather (#6043)
* [fp8] optimize all-gather
* [fp8] fix all gather fp8 ring
* [fp8] enable compile
* [fp8] fix all gather fp8 ring
* [fp8] fix linear hook (#6046)
* [fp8] disable all_to_all_fp8 in intranode (#6045)
* enhance all_to_all_fp8 with internode comm control
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* disable some fp8 ops due to performance issue
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [release] update version (#6041)
* [release] update version
* [devops] update comp test
* [devops] update comp test debug
* [devops] debug comp test
* [devops] debug comp test
* [devops] debug comp test
* [devops] debug comp test
* [devops] debug comp test
* [Feature] Split cross-entropy computation in SP (#5959)
* halfway
* fix cross-PP-stage position id length diff bug
* fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* unified cross entropy func for all shardformer models
* remove redundant lines
* add basic ring attn; debug cross entropy
* fwd bwd logic complete
* fwd bwd logic complete; add experimental triton rescale
* precision tests passed
* precision tests passed
* fix typos and remove misc files
* update softmax_lse shape by new interface
* change tester name
* remove buffer clone; support packed seq layout
* add varlen tests
* fix typo
* all tests passed
* add dkv_group; fix mask
* remove debug statements
* adapt chatglm, command-R, qwen
* debug
* halfway
* fix cross-PP-stage position id length diff bug
* fix typo
* fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* unified cross entropy func for all shardformer models
* remove redundant lines
* add basic ring attn; debug cross entropy
* fwd bwd logic complete
* fwd bwd logic complete; add experimental triton rescale
* precision tests passed
* precision tests passed
* fix typos and remove misc files
* add sp_mode to benchmark; fix varlen interface
* update softmax_lse shape by new interface
* add varlen tests
* fix typo
* all tests passed
* add dkv_group; fix mask
* remove debug statements
* add comments
* q1 index only once
* remove events to simplify stream sync
* simplify forward/backward logic
* 2d ring forward passed
* 2d ring backward passed
* fixes
* fix ring attn loss
* 2D ring backward + llama passed
* merge
* update logger
* fix typo
* rebase
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix typo
* remove typos
* fixes
* support GPT
---------
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [hotfix] moe hybrid parallelism benchmark & follow-up fix (#6048)
* [example] pass use_fp8_comm flag to all plugins
* [example] add mixtral benchmark
* [moe] refine assertion and check
* [moe] fix mixtral & add more tests
* [moe] consider checking dp * sp group and moe_dp_group
* [mixtral] remove gate tp & add more tests
* [deepseek] fix tp & sp for deepseek
* [mixtral] minor fix
* [deepseek] add deepseek benchmark
* [fp8] hotfix backward hook (#6053)
* [fp8] hotfix backward hook
* [fp8] hotfix pipeline loss accumulation
* [doc] update sp doc (#6055)
* update sp doc
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* fix the sp
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix the attn
* fix
* fix
* fix
* fix
* [zerobubble]Support ZeroBubble Pipeline (#6034)
* [feat] add zerobubble pp (just a frame now); add POC test for dx_dw; add test for zerobubble;
* [feat] add dw test;
* [fix] fix weight not close;
* [update] update text;
* [feat] add test run_fwd_bwd automatic scheduling;
* [feat] split communication and calculation; fix pop empty send_bwd_buffer error;
* [feat] add test for p & p grad;
* [feat] add comments for ZBV func;
* [fix] rm useless assign and comments;
* [fix] fix ci test; add pytest;
* [feat] add run_fwd_bwd_with_microbatch (replace input) & test; add p&p.grad assert close test & all pass;
* [feat] add apply v_schedule graph; p & p.grad assert err exist;
* [fix] update
* [feat] fix ci; add assert;
* [feat] fix poc format
* [feat] fix func name & ci; add comments;
* [fix] fix poc test; add comments in poc;
* [feat] add optim backward_b_by_grad
* [feat] fix optimizer bwd b & w; support return accum loss & output
* [feat] add fwd_bwd_step, run_fwd_only;
* [fix] fix optim bwd; add license for v_schedule; remove redundant attributes; fix schedule loop "while"--> "for"; add communication dict;
* [fix] fix communication_map;
* [feat] update test; rm comments;
* [fix] rm zbv in hybridplugin
* [fix] fix optim bwd;
* [fix] fix optim bwd;
* [fix] rm output.data after send fwd;
* [fix] fix bwd step if condition; remove useless comments and format info;
* [fix] fix detach output & release output;
* [fix] rm requir_grad for output;
* [fix] fix requir grad position and detach position and input&output local buffer append position;
* [feat] add memory assertation;
* [fix] fix mem check;
* [fix] mem assertation'
* [fix] fix mem assertation
* [fix] fix mem; use a new model shape; only assert mem less and equal than theo;
* [fix] fix model zoo import;
* [fix] fix redundant detach & clone; add buffer assertation in the end;
* [fix] add output_obj_grad assert None at bwd b step; replace input_obj.require_grad_ with treemap;
* [fix] update optim state dict assert (include param group & state); fix mem assert after add optim;
* [fix] add testcase with microbatch 4;
* [fp8] fix missing fp8_comm flag in mixtral (#6057)
* fix
* fix
* fix
* [fp8] Disable all_gather intranode. Disable Redundant all_gather fp8 (#6059)
* all_gather only internode, fix pytest
* fix cuda arch <89 compile pytest error
* fix pytest failure
* disable all_gather_into_tensor_flat_fp8
* fix fp8 format
* fix pytest
* fix conversations
* fix chunk tuple to list
* [doc] FP8 training and communication document (#6050)
* Add FP8 training and communication document
* add fp8 docstring for plugins
* fix typo
* fix typo
* fix
* fix
* [moe] add parallel strategy for shared_expert && fix test for deepseek (#6063)
* [ColossalEval] support for vllm (#6056)
* support vllm
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* modify vllm and update readme
* run pre-commit
* remove dupilicated lines and refine code
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* update param name
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* refine code
* update readme
* refine code
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [release] update version (#6062)
* [feat] add zerobubble pp (just a frame now); add POC test for dx_dw; add test for zerobubble;
* [update] update text;
* [feat] add test run_fwd_bwd automatic scheduling;
* [feat] fix poc format
* [fix] fix poc test; add comments in poc;
* [feat] add optim backward_b_by_grad
* [feat] fix optimizer bwd b & w; support return accum loss & output
* [fix] fix optim bwd; add license for v_schedule; remove redundant attributes; fix schedule loop "while"--> "for"; add communication dict;
* [feat] update test; rm comments;
* [fix] rm zbv in hybridplugin
* [fix] fix optim bwd;
* [fix] fix optim bwd;
* [fix] rm output.data after send fwd;
* [fix] fix bwd step if condition; remove useless comments and format info;
* [fix] fix mem check;
* [fix] fix mem assertation
* [fix] fix mem; use a new model shape; only assert mem less and equal than theo;
* [fix] fix model zoo import;
* [feat] moehybrid support zerobubble;
* [fix] fix zerobubble pp for shardformer type input;
* [fix] fix require_grad & deallocate call;
* [fix] fix mem assert;
* [fix] fix fwd branch, fwd pass both micro_batch & internal_inputs'
* [fix] fix pipeline util func deallocate --> release_tensor_data; fix bwd_b loss bwd branch;
* [fix] fix zerobubble; support shardformer model type;
* [fix] fix test_pipeline_utils ci;
* [plugin] hybrid support zero bubble pipeline (#6060)
* hybrid support zbv
* fix
fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* Update zero_bubble_pp.py
* fix
* fix-ci
* fix
[pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* fix
* [zerobubble]Support ZeroBubble Pipeline (#6034)
* [feat] add zerobubble pp (just a frame now); add POC test for dx_dw; add test for zerobubble;
* [feat] add dw test;
* [fix] fix weight not close;
* [update] update text;
* [feat] add test run_fwd_bwd automatic scheduling;
* [feat] split communication and calculation; fix pop empty send_bwd_buffer error;
* [feat] add test for p & p grad;
* [feat] add comments for ZBV func;
* [fix] rm useless assign and comments;
* [fix] fix ci test; add pytest;
* [feat] add run_fwd_bwd_with_microbatch (replace input) & test; add p&p.grad assert close test & all pass;
* [feat] add apply v_schedule graph; p & p.grad assert err exist;
* [fix] update
* [feat] fix ci; add assert;
* [feat] fix poc format
* [feat] fix func name & ci; add comments;
* [fix] fix poc test; add comments in poc;
* [feat] add optim backward_b_by_grad
* [feat] fix optimizer bwd b & w; support return accum loss & output
* [feat] add fwd_bwd_step, run_fwd_only;
* [fix] fix optim bwd; add license for v_schedule; remove redundant attributes; fix schedule loop "while"--> "for"; add communication dict;
* [fix] fix communication_map;
* [feat] update test; rm comments;
* [fix] rm zbv in hybridplugin
* [fix] fix optim bwd;
* [fix] fix optim bwd;
* [fix] rm output.data after send fwd;
* [fix] fix bwd step if condition; remove useless comments and format info;
* [fix] fix detach output & release output;
* [fix] rm requir_grad for output;
* [fix] fix requir grad position and detach position and input&output local buffer append position;
* [feat] add memory assertation;
* [fix] fix mem check;
* [fix] mem assertation'
* [fix] fix mem assertation
* [fix] fix mem; use a new model shape; only assert mem less and equal than theo;
* [fix] fix model zoo import;
* [fix] fix redundant detach & clone; add buffer assertation in the end;
* [fix] add output_obj_grad assert None at bwd b step; replace input_obj.require_grad_ with treemap;
* [fix] update optim state dict assert (include param group & state); fix mem assert after add optim;
* [fix] add testcase with microbatch 4;
* hybrid support zbv
* fix
fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* Update zero_bubble_pp.py
* fix
* fix-ci
* fix
[pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* fix
* fix
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* fix
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: duanjunwen <935724073@qq.com>
* [feat] add zerobubble pp (just a frame now); add POC test for dx_dw; add test for zerobubble;
* [update] update text;
* [feat] add test run_fwd_bwd automatic scheduling;
* [feat] fix poc format
* [fix] fix poc test; add comments in poc;
* [feat] add optim backward_b_by_grad
* [feat] fix optimizer bwd b & w; support return accum loss & output
* [fix] fix optim bwd; add license for v_schedule; remove redundant attributes; fix schedule loop "while"--> "for"; add communication dict;
* [feat] update test; rm comments;
* [fix] fix optim bwd;
* [fix] fix optim bwd;
* [fix] rm output.data after send fwd;
* [fix] fix bwd step if condition; remove useless comments and format info;
* [fix] fix mem check;
* [fix] fix mem assertation
* [fix] fix mem; use a new model shape; only assert mem less and equal than theo;
* [fix] fix model zoo import;
* [fix] fix mem assert;
* [fix] fix fwd branch, fwd pass both micro_batch & internal_inputs'
* [plugin] hybrid support zero bubble pipeline (#6060)
* hybrid support zbv
* fix
fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* Update zero_bubble_pp.py
* fix
* fix-ci
* fix
[pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* fix
* [zerobubble]Support ZeroBubble Pipeline (#6034)
* [feat] add zerobubble pp (just a frame now); add POC test for dx_dw; add test for zerobubble;
* [feat] add dw test;
* [fix] fix weight not close;
* [update] update text;
* [feat] add test run_fwd_bwd automatic scheduling;
* [feat] split communication and calculation; fix pop empty send_bwd_buffer error;
* [feat] add test for p & p grad;
* [feat] add comments for ZBV func;
* [fix] rm useless assign and comments;
* [fix] fix ci test; add pytest;
* [feat] add run_fwd_bwd_with_microbatch (replace input) & test; add p&p.grad assert close test & all pass;
* [feat] add apply v_schedule graph; p & p.grad assert err exist;
* [fix] update
* [feat] fix ci; add assert;
* [feat] fix poc format
* [feat] fix func name & ci; add comments;
* [fix] fix poc test; add comments in poc;
* [feat] add optim backward_b_by_grad
* [feat] fix optimizer bwd b & w; support return accum loss & output
* [feat] add fwd_bwd_step, run_fwd_only;
* [fix] fix optim bwd; add license for v_schedule; remove redundant attributes; fix schedule loop "while"--> "for"; add communication dict;
* [fix] fix communication_map;
* [feat] update test; rm comments;
* [fix] rm zbv in hybridplugin
* [fix] fix optim bwd;
* [fix] fix optim bwd;
* [fix] rm output.data after send fwd;
* [fix] fix bwd step if condition; remove useless comments and format info;
* [fix] fix detach output & release output;
* [fix] rm requir_grad for output;
* [fix] fix requir grad position and detach position and input&output local buffer append position;
* [feat] add memory assertation;
* [fix] fix mem check;
* [fix] mem assertation'
* [fix] fix mem assertation
* [fix] fix mem; use a new model shape; only assert mem less and equal than theo;
* [fix] fix model zoo import;
* [fix] fix redundant detach & clone; add buffer assertation in the end;
* [fix] add output_obj_grad assert None at bwd b step; replace input_obj.require_grad_ with treemap;
* [fix] update optim state dict assert (include param group & state); fix mem assert after add optim;
* [fix] add testcase with microbatch 4;
* hybrid support zbv
* fix
fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* Update zero_bubble_pp.py
* fix
* fix-ci
* fix
[pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* fix
* fix
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* fix
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: duanjunwen <935724073@qq.com>
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: HangXu <hangxu0304@gmail.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: GuangyaoZhang <xjtu521@qq.com>
Co-authored-by: Hongxin Liu <lhx0217@gmail.com>
Co-authored-by: YeAnbang <anbangy2@outlook.com>
Co-authored-by: Haze188 <haze188@qq.com>
Co-authored-by: Edenzzzz <wenxuan.tan@wisc.edu>
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: Runyu Lu <77330637+LRY89757@users.noreply.github.com>
Co-authored-by: YeAnbang <44796419+YeAnbang@users.noreply.github.com>
Co-authored-by: Stephan Kö <stephankoe@users.noreply.github.com>
Co-authored-by: アマデウス <kurisusnowdeng@users.noreply.github.com>
Co-authored-by: Tong Li <tong.li352711588@gmail.com>
Co-authored-by: zhurunhua <1281592874@qq.com>
Co-authored-by: Insu Jang <insujang@umich.edu>
Co-authored-by: Gao, Ruiyuan <905370712@qq.com>
Co-authored-by: hxwang <wang1570@e.ntu.edu.sg>
Co-authored-by: Michelle <qianranma8@gmail.com>
Co-authored-by: Wang Binluo <32676639+wangbluo@users.noreply.github.com>
Co-authored-by: wangbluo <2538539015@qq.com>
Co-authored-by: root <root@notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9-0.notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9.colossal-ai.svc.cluster.local>
Co-authored-by: duanjunwen <935724073@qq.com>
Co-authored-by: Camille Zhong <44392324+Camille7777@users.noreply.github.com>
2024-10-08 07:58:00 +00:00
|
|
|
|
|
|
|
# Gather seq-wise in the final output stage
|
|
|
|
if shard_config.enable_sequence_parallelism:
|
|
|
|
sp_mode = shard_config.sequence_parallelism_mode
|
|
|
|
if (not shard_config.parallel_output) or force_sp_output_gather or is_share_sp_tp(sp_mode):
|
|
|
|
hidden_states = gather_sp_output(hidden_states, shard_config, sp_dim=0)
|
|
|
|
|
2023-08-04 06:55:31 +00:00
|
|
|
if not return_dict:
|
|
|
|
return tuple(
|
2024-03-27 03:19:32 +00:00
|
|
|
v
|
|
|
|
for v in [
|
|
|
|
hidden_states,
|
|
|
|
presents,
|
|
|
|
all_hidden_states,
|
|
|
|
all_self_attentions,
|
|
|
|
]
|
|
|
|
if v is not None
|
2023-09-19 06:20:26 +00:00
|
|
|
)
|
2023-08-04 06:55:31 +00:00
|
|
|
return BaseModelOutputWithPast(
|
|
|
|
last_hidden_state=hidden_states,
|
|
|
|
past_key_values=presents,
|
|
|
|
hidden_states=all_hidden_states,
|
|
|
|
attentions=all_self_attentions,
|
|
|
|
)
|
|
|
|
else:
|
2023-09-19 06:20:26 +00:00
|
|
|
return {"hidden_states": hidden_states}
|
2023-08-04 06:55:31 +00:00
|
|
|
|
|
|
|
@staticmethod
|
2023-09-19 06:20:26 +00:00
|
|
|
def chatglm_for_conditional_generation_forward(
|
2024-04-25 06:41:17 +00:00
|
|
|
self: "ChatGLMForConditionalGeneration",
|
2023-09-19 06:20:26 +00:00
|
|
|
input_ids: Optional[torch.Tensor] = None,
|
|
|
|
position_ids: Optional[torch.Tensor] = None,
|
|
|
|
attention_mask: Optional[torch.Tensor] = None,
|
|
|
|
past_key_values: Optional[Tuple[torch.FloatTensor]] = None,
|
|
|
|
inputs_embeds: Optional[torch.Tensor] = None,
|
|
|
|
labels: Optional[torch.Tensor] = None,
|
|
|
|
use_cache: Optional[bool] = None,
|
|
|
|
output_attentions: Optional[bool] = None,
|
|
|
|
output_hidden_states: Optional[bool] = None,
|
|
|
|
return_dict: Optional[bool] = None,
|
|
|
|
return_last_logit: Optional[bool] = False,
|
|
|
|
stage_manager: Optional[PipelineStageManager] = None,
|
|
|
|
hidden_states: Optional[torch.FloatTensor] = None,
|
|
|
|
stage_index: Optional[List[int]] = None,
|
|
|
|
shard_config: ShardConfig = None,
|
|
|
|
):
|
|
|
|
logging.get_logger(__name__)
|
2023-08-04 06:55:31 +00:00
|
|
|
use_cache = use_cache if use_cache is not None else self.config.use_cache
|
2023-09-19 06:20:26 +00:00
|
|
|
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
2023-08-04 06:55:31 +00:00
|
|
|
transformer_outputs = ChatGLMPipelineForwards.chatglm_model_forward(
|
|
|
|
self.transformer,
|
|
|
|
input_ids=input_ids,
|
|
|
|
position_ids=position_ids,
|
|
|
|
attention_mask=attention_mask,
|
|
|
|
past_key_values=past_key_values,
|
|
|
|
inputs_embeds=inputs_embeds,
|
|
|
|
use_cache=use_cache,
|
|
|
|
output_hidden_states=output_hidden_states,
|
|
|
|
return_dict=return_dict,
|
|
|
|
stage_manager=stage_manager,
|
|
|
|
hidden_states=hidden_states,
|
|
|
|
stage_index=stage_index,
|
2023-08-22 15:59:31 +00:00
|
|
|
shard_config=shard_config,
|
[zerobubble] rebase main (#6075)
* fp8 operators for compressed communication
cast_to_fp8, cast_from_fp8, all_reduce_fp8
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix typo
* fix scaling algorithm in FP8 casting
* support fp8 communication in pipeline parallelism
* add fp8_communication flag in the script
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* shardformer fp8
* fix rebase
* remove all to all
* fix shardformer fp8 communication training degradation
* [fp8] support all-gather flat tensor (#5932)
* [fp8] add fp8 comm for low level zero
* [test] add zero fp8 test case
* [Feature] llama shardformer fp8 support (#5938)
* add llama shardformer fp8
* Llama Shardformer Parity
* fix typo
* fix all reduce
* fix pytest failure
* fix reduce op and move function to fp8.py
* fix typo
* [FP8] rebase main (#5963)
* add SimPO
* fix dataloader
* remove debug code
* add orpo
* fix style
* fix colossalai, transformers version
* fix colossalai, transformers version
* fix colossalai, transformers version
* fix torch colossalai version
* update transformers version
* [shardformer] DeepseekMoE support (#5871)
* [Feature] deepseek moe expert parallel implement
* [misc] fix typo, remove redundant file (#5867)
* [misc] fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [Feature] deepseek support & unit test
* [misc] remove debug code & useless print
* [misc] fix typos (#5872)
* [Feature] remove modeling file, use auto config. (#5884)
* [misc] fix typos
* [Feature] deepseek support via auto model, remove modeling file
* [misc] delete useless file
* [misc] fix typos
* [Deepseek] remove redundant code (#5888)
* [misc] fix typos
* [Feature] deepseek support via auto model, remove modeling file
* [misc] delete useless file
* [misc] fix typos
* [misc] remove redundant code
* [Feature/deepseek] resolve comment. (#5889)
* [misc] fix typos
* [Feature] deepseek support via auto model, remove modeling file
* [misc] delete useless file
* [misc] fix typos
* [misc] remove redundant code
* [misc] mv module replacement into if branch
* [misc] add some warning message and modify some code in unit test
* [misc] fix typos
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [Hoxfix] Fix CUDA_DEVICE_MAX_CONNECTIONS for comm overlap
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [Feat] Diffusion Model(PixArtAlpha/StableDiffusion3) Support (#5838)
* Diffusion Model Inference support
* Stable Diffusion 3 Support
* pixartalpha support
* [HotFix] CI,import,requirements-test for #5838 (#5892)
* [Hot Fix] CI,import,requirements-test
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [Feature] Enable PP + SP for llama (#5868)
* fix cross-PP-stage position id length diff bug
* fix typo
* fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* use a one cross entropy func for all shardformer models
---------
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [ShardFormer] Add Ulysses Sequence Parallelism support for Command-R, Qwen2 and ChatGLM (#5897)
* add benchmark for sft, dpo, simpo, orpo. Add benchmarking result. Support lora with gradient checkpoint
* fix style
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix eval
* hotfix citation
* [zero] support all-gather overlap (#5898)
* [zero] support all-gather overlap
* [zero] add overlap all-gather flag
* [misc] fix typo
* [zero] update api
* fix orpo cross entropy loss
* [Auto Parallel]: Speed up intra-op plan generation by 44% (#5446)
* Remove unnecessary calls to deepcopy
* Build DimSpec's difference dict only once
This change considerably speeds up construction speed of DimSpec objects. The difference_dict is the same for each DimSpec object, so a single copy of it is enough.
* Fix documentation of DimSpec's difference method
* [ShardFormer] fix qwen2 sp (#5903)
* [compatibility] support torch 2.2 (#5875)
* Support Pytorch 2.2.2
* keep build_on_pr file and update .compatibility
* fix object_to_tensor usage when torch>=2.3.0 (#5820)
* [misc] support torch2.3 (#5893)
* [misc] support torch2.3
* [devops] update compatibility ci
* [devops] update compatibility ci
* [devops] add debug
* [devops] add debug
* [devops] add debug
* [devops] add debug
* [devops] remove debug
* [devops] remove debug
* [release] update version (#5912)
* [plugin] support all-gather overlap for hybrid parallel (#5919)
* [plugin] fixed all-gather overlap support for hybrid parallel
* add kto
* fix style, add kto data sample
* [Examples] Add lazy init to OPT and GPT examples (#5924)
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [ColossalChat] Hotfix for ColossalChat (#5910)
* add ignore and tiny llama
* fix path issue
* run style
* fix issue
* update bash
* add ignore and tiny llama
* fix path issue
* run style
* fix issue
* update bash
* fix ddp issue
* add Qwen 1.5 32B
* refactor tokenization
* [FIX BUG] UnboundLocalError: cannot access local variable 'default_conversation' where it is not associated with a value (#5931)
* cannot access local variable 'default_conversation' where it is not associated with a value
set default value for 'default_conversation'
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* fix test data
* refactor evaluation
* remove real data path
* remove real data path
* Add n_fused as an input from native_module (#5894)
* [FIX BUG] convert env param to int in (#5934)
* [Hotfix] Fix ZeRO typo #5936
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [Feature] Add a switch to control whether the model checkpoint needs to be saved after each epoch ends (#5941)
* Add a switch to control whether the model checkpoint needs to be saved after each epoch ends
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* fix style
* fix style
* fix style
* [shardformer] hotfix attn mask (#5945)
* [shardformer] hotfix attn mask (#5947)
* [Feat] Distrifusion Acceleration Support for Diffusion Inference (#5895)
* Distrifusion Support source
* comp comm overlap optimization
* sd3 benchmark
* pixart distrifusion bug fix
* sd3 bug fix and benchmark
* generation bug fix
* naming fix
* add docstring, fix counter and shape error
* add reference
* readme and requirement
* [zero] hotfix update master params (#5951)
* [release] update version (#5952)
* [Chat] Fix lora (#5946)
* fix merging
* remove filepath
* fix style
* Update README.md (#5958)
* [hotfix] Remove unused plan section (#5957)
* remove readme
* fix readme
* update
* [test] add mixtral for sequence classification
* [test] add mixtral transformer test
* [moe] fix plugin
* [test] mixtra pp shard test
* [chore] handle non member group
* [zero] solve hang
* [test] pass mixtral shardformer test
* [moe] implement transit between non moe tp and ep
* [zero] solve hang
* [misc] solve booster hang by rename the variable
* solve hang when parallel mode = pp + dp
* [moe] implement submesh initialization
* [moe] add mixtral dp grad scaling when not all experts are activated
* [chore] manually revert unintended commit
* [chore] trivial fix
* [chore] arg pass & remove drop token
* [test] add mixtral modelling test
* [moe] implement tp
* [moe] test deepseek
* [moe] clean legacy code
* [Feature] MoE Ulysses Support (#5918)
* moe sp support
* moe sp bug solve
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [chore] minor fix
* [moe] init moe plugin comm setting with sp
* moe sp + ep bug fix
* [moe] finalize test (no pp)
* [moe] full test for deepseek and mixtral (pp + sp to fix)
* [chore] minor fix after rebase
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* [chore] solve moe ckpt test failure and some other arg pass failure
* [moe] remove ops
* [test] fix test: test_zero1_2
* [bug] fix: somehow logger hangs the program
* [moe] deepseek moe sp support
* [test] add check
* [deepseek] replace attn (a workaround for bug in transformers)
* [misc] skip redunant test
* [misc] remove debug/print code
* [moe] refactor mesh assignment
* Revert "[moe] implement submesh initialization"
This reverts commit 2f9bce6686d1415a83d5726dc5ff02222c742582.
* [chore] change moe_pg_mesh to private
* [misc] remove incompatible test config
* [misc] fix ci failure: change default value to false in moe plugin
* [misc] remove useless condition
* [chore] docstring
* [moe] remove force_overlap_comm flag and add warning instead
* [doc] add MoeHybridParallelPlugin docstring
* [moe] solve dp axis issue
* [chore] remove redundant test case, print string & reduce test tokens
* [feat] Dist Loader for Eval (#5950)
* support auto distributed data loader
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* support auto distributed data loader
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix tp error
* remove unused parameters
* remove unused
* update inference
* update docs
* update inference
---------
Co-authored-by: Michelle <qianranma8@gmail.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [lora] lora support hybrid parallel plugin (#5956)
* lora support hybrid plugin
* fix
* fix
* fix
* fix
* fp8 operators for compressed communication
cast_to_fp8, cast_from_fp8, all_reduce_fp8
* fix scaling algorithm in FP8 casting
* support fp8 communication in pipeline parallelism
* add fp8_communication flag in the script
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* shardformer fp8
* fix rebase
* remove all to all
* fix shardformer fp8 communication training degradation
* [fp8] support all-gather flat tensor (#5932)
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* Update low_level_optim.py
---------
Co-authored-by: YeAnbang <anbangy2@outlook.com>
Co-authored-by: Haze188 <haze188@qq.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Edenzzzz <wenxuan.tan@wisc.edu>
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: Runyu Lu <77330637+LRY89757@users.noreply.github.com>
Co-authored-by: Guangyao Zhang <xjtu521@qq.com>
Co-authored-by: YeAnbang <44796419+YeAnbang@users.noreply.github.com>
Co-authored-by: Hongxin Liu <lhx0217@gmail.com>
Co-authored-by: Stephan Kö <stephankoe@users.noreply.github.com>
Co-authored-by: アマデウス <kurisusnowdeng@users.noreply.github.com>
Co-authored-by: Tong Li <tong.li352711588@gmail.com>
Co-authored-by: zhurunhua <1281592874@qq.com>
Co-authored-by: Insu Jang <insujang@umich.edu>
Co-authored-by: Gao, Ruiyuan <905370712@qq.com>
Co-authored-by: hxwang <wang1570@e.ntu.edu.sg>
Co-authored-by: Michelle <qianranma8@gmail.com>
Co-authored-by: Wang Binluo <32676639+wangbluo@users.noreply.github.com>
Co-authored-by: HangXu <hangxu0304@gmail.com>
* [fp8]support all2all fp8 (#5953)
* support all2all fp8
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [fp8] add fp8 linear (#5967)
* [fp8] add fp8 linear
* [test] fix fp8 linear test condition
* [test] fix fp8 linear test condition
* [test] fix fp8 linear test condition
* [fp8] support fp8 amp for hybrid parallel plugin (#5975)
* [fp8] support fp8 amp for hybrid parallel plugin
* [test] add fp8 hook test
* [fp8] fix fp8 linear compatibility
* fix (#5976)
* [Feature]: support FP8 communication in DDP, FSDP, Gemini (#5928)
* support fp8_communication in the Torch DDP grad comm, FSDP grad comm, and FSDP params comm
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* implement communication hook for FSDP params all-gather
* added unit test for fp8 operators
* support fp8 communication in GeminiPlugin
* update training scripts to support fsdp and fp8 communication
* fixed some minor bugs observed in unit test
* add all_gather_into_tensor_flat_fp8
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* add skip the test if torch < 2.2.0
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* add skip the test if torch < 2.2.0
* add skip the test if torch < 2.2.0
* add fp8_comm flag
* rebase latest fp8 operators
* rebase latest fp8 operators
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [test ci]Feature/fp8 comm (#5981)
* fix
* fix
* fix
* [fp8] support gemini plugin (#5978)
* [fp8] refactor hook
* [fp8] support gemini plugin
* [example] add fp8 option for llama benchmark
* [fp8] use torch compile (torch >= 2.3.0) (#5979)
* [fp8] use torch compile (torch >= 2.4.0)
* [fp8] set use_fast_accum in linear
* [chore] formal version check
* [chore] fix sig
* [fp8]Moe support fp8 communication (#5977)
* fix
* support moe fp8
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
fix
fi
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [fp8] support hybrid parallel plugin (#5982)
* support fp8 comm for qwen2 model
* support fp8 comm for qwen2 model
* support fp8 comm for qwen2 model
* fp8
* fix
* bert and bloom
* chatglm and command
* gpt2,gptj,bert, falcon,blip2
* mistral,opy,sam,t5,vit,whisper
* fix
* fix
* fix
* [fp8] refactor fp8 linear with compile (#5993)
* [fp8] refactor fp8 linear with compile
* [fp8] fix linear test
* [fp8] fix linear test
* [fp8] support asynchronous FP8 communication (#5997)
* fix
* fix
* fix
* support async all2all
* support async op for all gather
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [fp8] update torch.compile for linear_fp8 to >= 2.4.0 (#6004)
* [fp8] linear perf enhancement
* [fp8]update reduce-scatter test (#6002)
* fix
* fix
* fix
* fix
* [fp8] add use_fp8 option for MoeHybridParallelPlugin (#6009)
* [fp8] zero support fp8 linear. (#6006)
* fix
* fix
* fix
* zero fp8
* zero fp8
* Update requirements.txt
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix the merge
* fix the merge
* fix the merge
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix the merge
* fix
* fix
* fix the merge
* fix
* fix
* fix
* fix
* fix
* fix the merge
* fix
* fix
* fix
* fix
* [fp8] Merge feature/fp8_comm to main branch of Colossalai (#6016)
* add SimPO
* fix dataloader
* remove debug code
* add orpo
* fix style
* fix colossalai, transformers version
* fix colossalai, transformers version
* fix colossalai, transformers version
* fix torch colossalai version
* update transformers version
* [shardformer] DeepseekMoE support (#5871)
* [Feature] deepseek moe expert parallel implement
* [misc] fix typo, remove redundant file (#5867)
* [misc] fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [Feature] deepseek support & unit test
* [misc] remove debug code & useless print
* [misc] fix typos (#5872)
* [Feature] remove modeling file, use auto config. (#5884)
* [misc] fix typos
* [Feature] deepseek support via auto model, remove modeling file
* [misc] delete useless file
* [misc] fix typos
* [Deepseek] remove redundant code (#5888)
* [misc] fix typos
* [Feature] deepseek support via auto model, remove modeling file
* [misc] delete useless file
* [misc] fix typos
* [misc] remove redundant code
* [Feature/deepseek] resolve comment. (#5889)
* [misc] fix typos
* [Feature] deepseek support via auto model, remove modeling file
* [misc] delete useless file
* [misc] fix typos
* [misc] remove redundant code
* [misc] mv module replacement into if branch
* [misc] add some warning message and modify some code in unit test
* [misc] fix typos
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [Hoxfix] Fix CUDA_DEVICE_MAX_CONNECTIONS for comm overlap
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [Feat] Diffusion Model(PixArtAlpha/StableDiffusion3) Support (#5838)
* Diffusion Model Inference support
* Stable Diffusion 3 Support
* pixartalpha support
* [HotFix] CI,import,requirements-test for #5838 (#5892)
* [Hot Fix] CI,import,requirements-test
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [Feature] Enable PP + SP for llama (#5868)
* fix cross-PP-stage position id length diff bug
* fix typo
* fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* use a one cross entropy func for all shardformer models
---------
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [ShardFormer] Add Ulysses Sequence Parallelism support for Command-R, Qwen2 and ChatGLM (#5897)
* add benchmark for sft, dpo, simpo, orpo. Add benchmarking result. Support lora with gradient checkpoint
* fix style
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix eval
* hotfix citation
* [zero] support all-gather overlap (#5898)
* [zero] support all-gather overlap
* [zero] add overlap all-gather flag
* [misc] fix typo
* [zero] update api
* fix orpo cross entropy loss
* [Auto Parallel]: Speed up intra-op plan generation by 44% (#5446)
* Remove unnecessary calls to deepcopy
* Build DimSpec's difference dict only once
This change considerably speeds up construction speed of DimSpec objects. The difference_dict is the same for each DimSpec object, so a single copy of it is enough.
* Fix documentation of DimSpec's difference method
* [ShardFormer] fix qwen2 sp (#5903)
* [compatibility] support torch 2.2 (#5875)
* Support Pytorch 2.2.2
* keep build_on_pr file and update .compatibility
* fix object_to_tensor usage when torch>=2.3.0 (#5820)
* [misc] support torch2.3 (#5893)
* [misc] support torch2.3
* [devops] update compatibility ci
* [devops] update compatibility ci
* [devops] add debug
* [devops] add debug
* [devops] add debug
* [devops] add debug
* [devops] remove debug
* [devops] remove debug
* [release] update version (#5912)
* [plugin] support all-gather overlap for hybrid parallel (#5919)
* [plugin] fixed all-gather overlap support for hybrid parallel
* add kto
* fix style, add kto data sample
* [Examples] Add lazy init to OPT and GPT examples (#5924)
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [ColossalChat] Hotfix for ColossalChat (#5910)
* add ignore and tiny llama
* fix path issue
* run style
* fix issue
* update bash
* add ignore and tiny llama
* fix path issue
* run style
* fix issue
* update bash
* fix ddp issue
* add Qwen 1.5 32B
* refactor tokenization
* [FIX BUG] UnboundLocalError: cannot access local variable 'default_conversation' where it is not associated with a value (#5931)
* cannot access local variable 'default_conversation' where it is not associated with a value
set default value for 'default_conversation'
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* fix test data
* refactor evaluation
* remove real data path
* remove real data path
* Add n_fused as an input from native_module (#5894)
* [FIX BUG] convert env param to int in (#5934)
* [Hotfix] Fix ZeRO typo #5936
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [Feature] Add a switch to control whether the model checkpoint needs to be saved after each epoch ends (#5941)
* Add a switch to control whether the model checkpoint needs to be saved after each epoch ends
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* fix style
* fix style
* fix style
* [shardformer] hotfix attn mask (#5945)
* [shardformer] hotfix attn mask (#5947)
* [Feat] Distrifusion Acceleration Support for Diffusion Inference (#5895)
* Distrifusion Support source
* comp comm overlap optimization
* sd3 benchmark
* pixart distrifusion bug fix
* sd3 bug fix and benchmark
* generation bug fix
* naming fix
* add docstring, fix counter and shape error
* add reference
* readme and requirement
* [zero] hotfix update master params (#5951)
* [release] update version (#5952)
* [Chat] Fix lora (#5946)
* fix merging
* remove filepath
* fix style
* Update README.md (#5958)
* [hotfix] Remove unused plan section (#5957)
* remove readme
* fix readme
* update
* [test] add mixtral for sequence classification
* [test] add mixtral transformer test
* [moe] fix plugin
* [test] mixtra pp shard test
* [chore] handle non member group
* [zero] solve hang
* [test] pass mixtral shardformer test
* [moe] implement transit between non moe tp and ep
* [zero] solve hang
* [misc] solve booster hang by rename the variable
* solve hang when parallel mode = pp + dp
* [moe] implement submesh initialization
* [moe] add mixtral dp grad scaling when not all experts are activated
* [chore] manually revert unintended commit
* [chore] trivial fix
* [chore] arg pass & remove drop token
* [test] add mixtral modelling test
* [moe] implement tp
* [moe] test deepseek
* [moe] clean legacy code
* [Feature] MoE Ulysses Support (#5918)
* moe sp support
* moe sp bug solve
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [chore] minor fix
* [moe] init moe plugin comm setting with sp
* moe sp + ep bug fix
* [moe] finalize test (no pp)
* [moe] full test for deepseek and mixtral (pp + sp to fix)
* [chore] minor fix after rebase
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* [chore] solve moe ckpt test failure and some other arg pass failure
* [moe] remove ops
* [test] fix test: test_zero1_2
* [bug] fix: somehow logger hangs the program
* [moe] deepseek moe sp support
* [test] add check
* [deepseek] replace attn (a workaround for bug in transformers)
* [misc] skip redunant test
* [misc] remove debug/print code
* [moe] refactor mesh assignment
* Revert "[moe] implement submesh initialization"
This reverts commit 2f9bce6686d1415a83d5726dc5ff02222c742582.
* [chore] change moe_pg_mesh to private
* [misc] remove incompatible test config
* [misc] fix ci failure: change default value to false in moe plugin
* [misc] remove useless condition
* [chore] docstring
* [moe] remove force_overlap_comm flag and add warning instead
* [doc] add MoeHybridParallelPlugin docstring
* [moe] solve dp axis issue
* [chore] remove redundant test case, print string & reduce test tokens
* [feat] Dist Loader for Eval (#5950)
* support auto distributed data loader
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* support auto distributed data loader
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix tp error
* remove unused parameters
* remove unused
* update inference
* update docs
* update inference
---------
Co-authored-by: Michelle <qianranma8@gmail.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [lora] lora support hybrid parallel plugin (#5956)
* lora support hybrid plugin
* fix
* fix
* fix
* fix
* Support overall loss, update KTO logging
* [Docs] clarify launch port
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [Hotfix] README link (#5966)
* update ignore
* update readme
* run style
* update readme
* [Hotfix] Avoid fused RMSnorm import error without apex (#5985)
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [Chat] fix readme (#5989)
* fix readme
* fix readme, tokenization fully tested
* fix readme, tokenization fully tested
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: root <root@notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9-0.notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9.colossal-ai.svc.cluster.local>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* fix sync condition (#6000)
* [plugin] add cast inputs option for zero (#6003)
* [pre-commit.ci] pre-commit autoupdate (#5995)
updates:
- [github.com/psf/black-pre-commit-mirror: 24.4.2 → 24.8.0](https://github.com/psf/black-pre-commit-mirror/compare/24.4.2...24.8.0)
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [misc] Bypass the huggingface bug to solve the mask mismatch problem (#5991)
* [Feature] Zigzag Ring attention (#5905)
* halfway
* fix cross-PP-stage position id length diff bug
* fix typo
* fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* unified cross entropy func for all shardformer models
* remove redundant lines
* add basic ring attn; debug cross entropy
* fwd bwd logic complete
* fwd bwd logic complete; add experimental triton rescale
* precision tests passed
* precision tests passed
* fix typos and remove misc files
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* add sp_mode to benchmark; fix varlen interface
* update softmax_lse shape by new interface
* change tester name
* remove buffer clone; support packed seq layout
* add varlen tests
* fix typo
* all tests passed
* add dkv_group; fix mask
* remove debug statements
---------
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [misc] update compatibility (#6008)
* [misc] update compatibility
* [misc] update requirements
* [devops] disable requirements cache
* [test] fix torch ddp test
* [test] fix rerun on address in use
* [test] fix lazy init
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix the merge
* fix the merge
* overlap kv comm with output rescale (#6017)
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* fix the merge
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix the merge
* fix
* fix
* fix the merge
* fix
* [misc] Use dist logger in plugins (#6011)
* use dist logger in plugins
* remove trash
* print on rank 0
---------
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* fix
* fix
* fix
* fix
* fix the merge
* fix
* fix
* fix
* fix
---------
Co-authored-by: YeAnbang <anbangy2@outlook.com>
Co-authored-by: Haze188 <haze188@qq.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Edenzzzz <wenxuan.tan@wisc.edu>
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: Runyu Lu <77330637+LRY89757@users.noreply.github.com>
Co-authored-by: Guangyao Zhang <xjtu521@qq.com>
Co-authored-by: YeAnbang <44796419+YeAnbang@users.noreply.github.com>
Co-authored-by: Hongxin Liu <lhx0217@gmail.com>
Co-authored-by: Stephan Kö <stephankoe@users.noreply.github.com>
Co-authored-by: アマデウス <kurisusnowdeng@users.noreply.github.com>
Co-authored-by: Tong Li <tong.li352711588@gmail.com>
Co-authored-by: zhurunhua <1281592874@qq.com>
Co-authored-by: Insu Jang <insujang@umich.edu>
Co-authored-by: Gao, Ruiyuan <905370712@qq.com>
Co-authored-by: hxwang <wang1570@e.ntu.edu.sg>
Co-authored-by: Michelle <qianranma8@gmail.com>
Co-authored-by: root <root@notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9-0.notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9.colossal-ai.svc.cluster.local>
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* Update train_dpo.py
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* Update low_level_zero_plugin.py
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* [CI] Remove triton version for compatibility bug; update req torch >=2.2 (#6018)
* remove triton version
* remove torch 2.2
* remove torch 2.1
* debug
* remove 2.1 build tests
* require torch >=2.2
---------
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [plugin] hotfix zero plugin (#6036)
* [plugin] hotfix zero plugin
* [plugin] hotfix zero plugin
* [Colossal-LLaMA] Refactor latest APIs (#6030)
* refactor latest code
* update api
* add dummy dataset
* update Readme
* add setup
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* update files
* add PP support
* update arguments
* update argument
* reorg folder
* update version
* remove IB infor
* update utils
* update readme
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* update save for zero
* update save
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* add apex
* update
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* add fused norm (#6038)
* [FP8] unsqueeze scale to make it compatible with torch.compile (#6040)
* [colossalai/checkpoint_io/...] fix bug in load_state_dict_into_model; format error msg (#6020)
* fix bug in load_state_dict_into_model; format error msg
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* Update utils.py
to support checking missing_keys
* Update general_checkpoint_io.py
fix bug in missing_keys error message
* retrigger tests
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [Hotfix] Remove deprecated install (#6042)
* remove deprecated install
* remove unused folder
* [fp8] optimize all-gather (#6043)
* [fp8] optimize all-gather
* [fp8] fix all gather fp8 ring
* [fp8] enable compile
* [fp8] fix all gather fp8 ring
* [fp8] fix linear hook (#6046)
* [fp8] disable all_to_all_fp8 in intranode (#6045)
* enhance all_to_all_fp8 with internode comm control
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* disable some fp8 ops due to performance issue
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [release] update version (#6041)
* [release] update version
* [devops] update comp test
* [devops] update comp test debug
* [devops] debug comp test
* [devops] debug comp test
* [devops] debug comp test
* [devops] debug comp test
* [devops] debug comp test
* [Feature] Split cross-entropy computation in SP (#5959)
* halfway
* fix cross-PP-stage position id length diff bug
* fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* unified cross entropy func for all shardformer models
* remove redundant lines
* add basic ring attn; debug cross entropy
* fwd bwd logic complete
* fwd bwd logic complete; add experimental triton rescale
* precision tests passed
* precision tests passed
* fix typos and remove misc files
* update softmax_lse shape by new interface
* change tester name
* remove buffer clone; support packed seq layout
* add varlen tests
* fix typo
* all tests passed
* add dkv_group; fix mask
* remove debug statements
* adapt chatglm, command-R, qwen
* debug
* halfway
* fix cross-PP-stage position id length diff bug
* fix typo
* fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* unified cross entropy func for all shardformer models
* remove redundant lines
* add basic ring attn; debug cross entropy
* fwd bwd logic complete
* fwd bwd logic complete; add experimental triton rescale
* precision tests passed
* precision tests passed
* fix typos and remove misc files
* add sp_mode to benchmark; fix varlen interface
* update softmax_lse shape by new interface
* add varlen tests
* fix typo
* all tests passed
* add dkv_group; fix mask
* remove debug statements
* add comments
* q1 index only once
* remove events to simplify stream sync
* simplify forward/backward logic
* 2d ring forward passed
* 2d ring backward passed
* fixes
* fix ring attn loss
* 2D ring backward + llama passed
* merge
* update logger
* fix typo
* rebase
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix typo
* remove typos
* fixes
* support GPT
---------
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [hotfix] moe hybrid parallelism benchmark & follow-up fix (#6048)
* [example] pass use_fp8_comm flag to all plugins
* [example] add mixtral benchmark
* [moe] refine assertion and check
* [moe] fix mixtral & add more tests
* [moe] consider checking dp * sp group and moe_dp_group
* [mixtral] remove gate tp & add more tests
* [deepseek] fix tp & sp for deepseek
* [mixtral] minor fix
* [deepseek] add deepseek benchmark
* [fp8] hotfix backward hook (#6053)
* [fp8] hotfix backward hook
* [fp8] hotfix pipeline loss accumulation
* [doc] update sp doc (#6055)
* update sp doc
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* fix the sp
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix the attn
* fix
* fix
* fix
* fix
* [zerobubble]Support ZeroBubble Pipeline (#6034)
* [feat] add zerobubble pp (just a frame now); add POC test for dx_dw; add test for zerobubble;
* [feat] add dw test;
* [fix] fix weight not close;
* [update] update text;
* [feat] add test run_fwd_bwd automatic scheduling;
* [feat] split communication and calculation; fix pop empty send_bwd_buffer error;
* [feat] add test for p & p grad;
* [feat] add comments for ZBV func;
* [fix] rm useless assign and comments;
* [fix] fix ci test; add pytest;
* [feat] add run_fwd_bwd_with_microbatch (replace input) & test; add p&p.grad assert close test & all pass;
* [feat] add apply v_schedule graph; p & p.grad assert err exist;
* [fix] update
* [feat] fix ci; add assert;
* [feat] fix poc format
* [feat] fix func name & ci; add comments;
* [fix] fix poc test; add comments in poc;
* [feat] add optim backward_b_by_grad
* [feat] fix optimizer bwd b & w; support return accum loss & output
* [feat] add fwd_bwd_step, run_fwd_only;
* [fix] fix optim bwd; add license for v_schedule; remove redundant attributes; fix schedule loop "while"--> "for"; add communication dict;
* [fix] fix communication_map;
* [feat] update test; rm comments;
* [fix] rm zbv in hybridplugin
* [fix] fix optim bwd;
* [fix] fix optim bwd;
* [fix] rm output.data after send fwd;
* [fix] fix bwd step if condition; remove useless comments and format info;
* [fix] fix detach output & release output;
* [fix] rm requir_grad for output;
* [fix] fix requir grad position and detach position and input&output local buffer append position;
* [feat] add memory assertation;
* [fix] fix mem check;
* [fix] mem assertation'
* [fix] fix mem assertation
* [fix] fix mem; use a new model shape; only assert mem less and equal than theo;
* [fix] fix model zoo import;
* [fix] fix redundant detach & clone; add buffer assertation in the end;
* [fix] add output_obj_grad assert None at bwd b step; replace input_obj.require_grad_ with treemap;
* [fix] update optim state dict assert (include param group & state); fix mem assert after add optim;
* [fix] add testcase with microbatch 4;
* [fp8] fix missing fp8_comm flag in mixtral (#6057)
* fix
* fix
* fix
* [fp8] Disable all_gather intranode. Disable Redundant all_gather fp8 (#6059)
* all_gather only internode, fix pytest
* fix cuda arch <89 compile pytest error
* fix pytest failure
* disable all_gather_into_tensor_flat_fp8
* fix fp8 format
* fix pytest
* fix conversations
* fix chunk tuple to list
* [doc] FP8 training and communication document (#6050)
* Add FP8 training and communication document
* add fp8 docstring for plugins
* fix typo
* fix typo
* fix
* fix
* [moe] add parallel strategy for shared_expert && fix test for deepseek (#6063)
* [ColossalEval] support for vllm (#6056)
* support vllm
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* modify vllm and update readme
* run pre-commit
* remove dupilicated lines and refine code
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* update param name
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* refine code
* update readme
* refine code
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [release] update version (#6062)
* [feat] add zerobubble pp (just a frame now); add POC test for dx_dw; add test for zerobubble;
* [update] update text;
* [feat] add test run_fwd_bwd automatic scheduling;
* [feat] fix poc format
* [fix] fix poc test; add comments in poc;
* [feat] add optim backward_b_by_grad
* [feat] fix optimizer bwd b & w; support return accum loss & output
* [fix] fix optim bwd; add license for v_schedule; remove redundant attributes; fix schedule loop "while"--> "for"; add communication dict;
* [feat] update test; rm comments;
* [fix] rm zbv in hybridplugin
* [fix] fix optim bwd;
* [fix] fix optim bwd;
* [fix] rm output.data after send fwd;
* [fix] fix bwd step if condition; remove useless comments and format info;
* [fix] fix mem check;
* [fix] fix mem assertation
* [fix] fix mem; use a new model shape; only assert mem less and equal than theo;
* [fix] fix model zoo import;
* [feat] moehybrid support zerobubble;
* [fix] fix zerobubble pp for shardformer type input;
* [fix] fix require_grad & deallocate call;
* [fix] fix mem assert;
* [fix] fix fwd branch, fwd pass both micro_batch & internal_inputs'
* [fix] fix pipeline util func deallocate --> release_tensor_data; fix bwd_b loss bwd branch;
* [fix] fix zerobubble; support shardformer model type;
* [fix] fix test_pipeline_utils ci;
* [plugin] hybrid support zero bubble pipeline (#6060)
* hybrid support zbv
* fix
fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* Update zero_bubble_pp.py
* fix
* fix-ci
* fix
[pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* fix
* [zerobubble]Support ZeroBubble Pipeline (#6034)
* [feat] add zerobubble pp (just a frame now); add POC test for dx_dw; add test for zerobubble;
* [feat] add dw test;
* [fix] fix weight not close;
* [update] update text;
* [feat] add test run_fwd_bwd automatic scheduling;
* [feat] split communication and calculation; fix pop empty send_bwd_buffer error;
* [feat] add test for p & p grad;
* [feat] add comments for ZBV func;
* [fix] rm useless assign and comments;
* [fix] fix ci test; add pytest;
* [feat] add run_fwd_bwd_with_microbatch (replace input) & test; add p&p.grad assert close test & all pass;
* [feat] add apply v_schedule graph; p & p.grad assert err exist;
* [fix] update
* [feat] fix ci; add assert;
* [feat] fix poc format
* [feat] fix func name & ci; add comments;
* [fix] fix poc test; add comments in poc;
* [feat] add optim backward_b_by_grad
* [feat] fix optimizer bwd b & w; support return accum loss & output
* [feat] add fwd_bwd_step, run_fwd_only;
* [fix] fix optim bwd; add license for v_schedule; remove redundant attributes; fix schedule loop "while"--> "for"; add communication dict;
* [fix] fix communication_map;
* [feat] update test; rm comments;
* [fix] rm zbv in hybridplugin
* [fix] fix optim bwd;
* [fix] fix optim bwd;
* [fix] rm output.data after send fwd;
* [fix] fix bwd step if condition; remove useless comments and format info;
* [fix] fix detach output & release output;
* [fix] rm requir_grad for output;
* [fix] fix requir grad position and detach position and input&output local buffer append position;
* [feat] add memory assertation;
* [fix] fix mem check;
* [fix] mem assertation'
* [fix] fix mem assertation
* [fix] fix mem; use a new model shape; only assert mem less and equal than theo;
* [fix] fix model zoo import;
* [fix] fix redundant detach & clone; add buffer assertation in the end;
* [fix] add output_obj_grad assert None at bwd b step; replace input_obj.require_grad_ with treemap;
* [fix] update optim state dict assert (include param group & state); fix mem assert after add optim;
* [fix] add testcase with microbatch 4;
* hybrid support zbv
* fix
fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* Update zero_bubble_pp.py
* fix
* fix-ci
* fix
[pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* fix
* fix
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* fix
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: duanjunwen <935724073@qq.com>
* [feat] add zerobubble pp (just a frame now); add POC test for dx_dw; add test for zerobubble;
* [update] update text;
* [feat] add test run_fwd_bwd automatic scheduling;
* [feat] fix poc format
* [fix] fix poc test; add comments in poc;
* [feat] add optim backward_b_by_grad
* [feat] fix optimizer bwd b & w; support return accum loss & output
* [fix] fix optim bwd; add license for v_schedule; remove redundant attributes; fix schedule loop "while"--> "for"; add communication dict;
* [feat] update test; rm comments;
* [fix] fix optim bwd;
* [fix] fix optim bwd;
* [fix] rm output.data after send fwd;
* [fix] fix bwd step if condition; remove useless comments and format info;
* [fix] fix mem check;
* [fix] fix mem assertation
* [fix] fix mem; use a new model shape; only assert mem less and equal than theo;
* [fix] fix model zoo import;
* [fix] fix mem assert;
* [fix] fix fwd branch, fwd pass both micro_batch & internal_inputs'
* [plugin] hybrid support zero bubble pipeline (#6060)
* hybrid support zbv
* fix
fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* Update zero_bubble_pp.py
* fix
* fix-ci
* fix
[pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* fix
* [zerobubble]Support ZeroBubble Pipeline (#6034)
* [feat] add zerobubble pp (just a frame now); add POC test for dx_dw; add test for zerobubble;
* [feat] add dw test;
* [fix] fix weight not close;
* [update] update text;
* [feat] add test run_fwd_bwd automatic scheduling;
* [feat] split communication and calculation; fix pop empty send_bwd_buffer error;
* [feat] add test for p & p grad;
* [feat] add comments for ZBV func;
* [fix] rm useless assign and comments;
* [fix] fix ci test; add pytest;
* [feat] add run_fwd_bwd_with_microbatch (replace input) & test; add p&p.grad assert close test & all pass;
* [feat] add apply v_schedule graph; p & p.grad assert err exist;
* [fix] update
* [feat] fix ci; add assert;
* [feat] fix poc format
* [feat] fix func name & ci; add comments;
* [fix] fix poc test; add comments in poc;
* [feat] add optim backward_b_by_grad
* [feat] fix optimizer bwd b & w; support return accum loss & output
* [feat] add fwd_bwd_step, run_fwd_only;
* [fix] fix optim bwd; add license for v_schedule; remove redundant attributes; fix schedule loop "while"--> "for"; add communication dict;
* [fix] fix communication_map;
* [feat] update test; rm comments;
* [fix] rm zbv in hybridplugin
* [fix] fix optim bwd;
* [fix] fix optim bwd;
* [fix] rm output.data after send fwd;
* [fix] fix bwd step if condition; remove useless comments and format info;
* [fix] fix detach output & release output;
* [fix] rm requir_grad for output;
* [fix] fix requir grad position and detach position and input&output local buffer append position;
* [feat] add memory assertation;
* [fix] fix mem check;
* [fix] mem assertation'
* [fix] fix mem assertation
* [fix] fix mem; use a new model shape; only assert mem less and equal than theo;
* [fix] fix model zoo import;
* [fix] fix redundant detach & clone; add buffer assertation in the end;
* [fix] add output_obj_grad assert None at bwd b step; replace input_obj.require_grad_ with treemap;
* [fix] update optim state dict assert (include param group & state); fix mem assert after add optim;
* [fix] add testcase with microbatch 4;
* hybrid support zbv
* fix
fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* Update zero_bubble_pp.py
* fix
* fix-ci
* fix
[pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* fix
* fix
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* fix
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: duanjunwen <935724073@qq.com>
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: HangXu <hangxu0304@gmail.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: GuangyaoZhang <xjtu521@qq.com>
Co-authored-by: Hongxin Liu <lhx0217@gmail.com>
Co-authored-by: YeAnbang <anbangy2@outlook.com>
Co-authored-by: Haze188 <haze188@qq.com>
Co-authored-by: Edenzzzz <wenxuan.tan@wisc.edu>
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: Runyu Lu <77330637+LRY89757@users.noreply.github.com>
Co-authored-by: YeAnbang <44796419+YeAnbang@users.noreply.github.com>
Co-authored-by: Stephan Kö <stephankoe@users.noreply.github.com>
Co-authored-by: アマデウス <kurisusnowdeng@users.noreply.github.com>
Co-authored-by: Tong Li <tong.li352711588@gmail.com>
Co-authored-by: zhurunhua <1281592874@qq.com>
Co-authored-by: Insu Jang <insujang@umich.edu>
Co-authored-by: Gao, Ruiyuan <905370712@qq.com>
Co-authored-by: hxwang <wang1570@e.ntu.edu.sg>
Co-authored-by: Michelle <qianranma8@gmail.com>
Co-authored-by: Wang Binluo <32676639+wangbluo@users.noreply.github.com>
Co-authored-by: wangbluo <2538539015@qq.com>
Co-authored-by: root <root@notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9-0.notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9.colossal-ai.svc.cluster.local>
Co-authored-by: duanjunwen <935724073@qq.com>
Co-authored-by: Camille Zhong <44392324+Camille7777@users.noreply.github.com>
2024-10-08 07:58:00 +00:00
|
|
|
force_sp_output_gather=False,
|
2023-08-04 06:55:31 +00:00
|
|
|
)
|
|
|
|
if stage_manager.is_last_stage():
|
|
|
|
hidden_states = transformer_outputs[0]
|
|
|
|
if return_last_logit:
|
|
|
|
hidden_states = hidden_states[-1:]
|
|
|
|
lm_logits = self.transformer.output_layer(hidden_states)
|
|
|
|
lm_logits = lm_logits.transpose(0, 1).contiguous()
|
[zerobubble] rebase main (#6075)
* fp8 operators for compressed communication
cast_to_fp8, cast_from_fp8, all_reduce_fp8
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix typo
* fix scaling algorithm in FP8 casting
* support fp8 communication in pipeline parallelism
* add fp8_communication flag in the script
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* shardformer fp8
* fix rebase
* remove all to all
* fix shardformer fp8 communication training degradation
* [fp8] support all-gather flat tensor (#5932)
* [fp8] add fp8 comm for low level zero
* [test] add zero fp8 test case
* [Feature] llama shardformer fp8 support (#5938)
* add llama shardformer fp8
* Llama Shardformer Parity
* fix typo
* fix all reduce
* fix pytest failure
* fix reduce op and move function to fp8.py
* fix typo
* [FP8] rebase main (#5963)
* add SimPO
* fix dataloader
* remove debug code
* add orpo
* fix style
* fix colossalai, transformers version
* fix colossalai, transformers version
* fix colossalai, transformers version
* fix torch colossalai version
* update transformers version
* [shardformer] DeepseekMoE support (#5871)
* [Feature] deepseek moe expert parallel implement
* [misc] fix typo, remove redundant file (#5867)
* [misc] fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [Feature] deepseek support & unit test
* [misc] remove debug code & useless print
* [misc] fix typos (#5872)
* [Feature] remove modeling file, use auto config. (#5884)
* [misc] fix typos
* [Feature] deepseek support via auto model, remove modeling file
* [misc] delete useless file
* [misc] fix typos
* [Deepseek] remove redundant code (#5888)
* [misc] fix typos
* [Feature] deepseek support via auto model, remove modeling file
* [misc] delete useless file
* [misc] fix typos
* [misc] remove redundant code
* [Feature/deepseek] resolve comment. (#5889)
* [misc] fix typos
* [Feature] deepseek support via auto model, remove modeling file
* [misc] delete useless file
* [misc] fix typos
* [misc] remove redundant code
* [misc] mv module replacement into if branch
* [misc] add some warning message and modify some code in unit test
* [misc] fix typos
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [Hoxfix] Fix CUDA_DEVICE_MAX_CONNECTIONS for comm overlap
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [Feat] Diffusion Model(PixArtAlpha/StableDiffusion3) Support (#5838)
* Diffusion Model Inference support
* Stable Diffusion 3 Support
* pixartalpha support
* [HotFix] CI,import,requirements-test for #5838 (#5892)
* [Hot Fix] CI,import,requirements-test
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [Feature] Enable PP + SP for llama (#5868)
* fix cross-PP-stage position id length diff bug
* fix typo
* fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* use a one cross entropy func for all shardformer models
---------
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [ShardFormer] Add Ulysses Sequence Parallelism support for Command-R, Qwen2 and ChatGLM (#5897)
* add benchmark for sft, dpo, simpo, orpo. Add benchmarking result. Support lora with gradient checkpoint
* fix style
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix eval
* hotfix citation
* [zero] support all-gather overlap (#5898)
* [zero] support all-gather overlap
* [zero] add overlap all-gather flag
* [misc] fix typo
* [zero] update api
* fix orpo cross entropy loss
* [Auto Parallel]: Speed up intra-op plan generation by 44% (#5446)
* Remove unnecessary calls to deepcopy
* Build DimSpec's difference dict only once
This change considerably speeds up construction speed of DimSpec objects. The difference_dict is the same for each DimSpec object, so a single copy of it is enough.
* Fix documentation of DimSpec's difference method
* [ShardFormer] fix qwen2 sp (#5903)
* [compatibility] support torch 2.2 (#5875)
* Support Pytorch 2.2.2
* keep build_on_pr file and update .compatibility
* fix object_to_tensor usage when torch>=2.3.0 (#5820)
* [misc] support torch2.3 (#5893)
* [misc] support torch2.3
* [devops] update compatibility ci
* [devops] update compatibility ci
* [devops] add debug
* [devops] add debug
* [devops] add debug
* [devops] add debug
* [devops] remove debug
* [devops] remove debug
* [release] update version (#5912)
* [plugin] support all-gather overlap for hybrid parallel (#5919)
* [plugin] fixed all-gather overlap support for hybrid parallel
* add kto
* fix style, add kto data sample
* [Examples] Add lazy init to OPT and GPT examples (#5924)
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [ColossalChat] Hotfix for ColossalChat (#5910)
* add ignore and tiny llama
* fix path issue
* run style
* fix issue
* update bash
* add ignore and tiny llama
* fix path issue
* run style
* fix issue
* update bash
* fix ddp issue
* add Qwen 1.5 32B
* refactor tokenization
* [FIX BUG] UnboundLocalError: cannot access local variable 'default_conversation' where it is not associated with a value (#5931)
* cannot access local variable 'default_conversation' where it is not associated with a value
set default value for 'default_conversation'
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* fix test data
* refactor evaluation
* remove real data path
* remove real data path
* Add n_fused as an input from native_module (#5894)
* [FIX BUG] convert env param to int in (#5934)
* [Hotfix] Fix ZeRO typo #5936
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [Feature] Add a switch to control whether the model checkpoint needs to be saved after each epoch ends (#5941)
* Add a switch to control whether the model checkpoint needs to be saved after each epoch ends
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* fix style
* fix style
* fix style
* [shardformer] hotfix attn mask (#5945)
* [shardformer] hotfix attn mask (#5947)
* [Feat] Distrifusion Acceleration Support for Diffusion Inference (#5895)
* Distrifusion Support source
* comp comm overlap optimization
* sd3 benchmark
* pixart distrifusion bug fix
* sd3 bug fix and benchmark
* generation bug fix
* naming fix
* add docstring, fix counter and shape error
* add reference
* readme and requirement
* [zero] hotfix update master params (#5951)
* [release] update version (#5952)
* [Chat] Fix lora (#5946)
* fix merging
* remove filepath
* fix style
* Update README.md (#5958)
* [hotfix] Remove unused plan section (#5957)
* remove readme
* fix readme
* update
* [test] add mixtral for sequence classification
* [test] add mixtral transformer test
* [moe] fix plugin
* [test] mixtra pp shard test
* [chore] handle non member group
* [zero] solve hang
* [test] pass mixtral shardformer test
* [moe] implement transit between non moe tp and ep
* [zero] solve hang
* [misc] solve booster hang by rename the variable
* solve hang when parallel mode = pp + dp
* [moe] implement submesh initialization
* [moe] add mixtral dp grad scaling when not all experts are activated
* [chore] manually revert unintended commit
* [chore] trivial fix
* [chore] arg pass & remove drop token
* [test] add mixtral modelling test
* [moe] implement tp
* [moe] test deepseek
* [moe] clean legacy code
* [Feature] MoE Ulysses Support (#5918)
* moe sp support
* moe sp bug solve
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [chore] minor fix
* [moe] init moe plugin comm setting with sp
* moe sp + ep bug fix
* [moe] finalize test (no pp)
* [moe] full test for deepseek and mixtral (pp + sp to fix)
* [chore] minor fix after rebase
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* [chore] solve moe ckpt test failure and some other arg pass failure
* [moe] remove ops
* [test] fix test: test_zero1_2
* [bug] fix: somehow logger hangs the program
* [moe] deepseek moe sp support
* [test] add check
* [deepseek] replace attn (a workaround for bug in transformers)
* [misc] skip redunant test
* [misc] remove debug/print code
* [moe] refactor mesh assignment
* Revert "[moe] implement submesh initialization"
This reverts commit 2f9bce6686d1415a83d5726dc5ff02222c742582.
* [chore] change moe_pg_mesh to private
* [misc] remove incompatible test config
* [misc] fix ci failure: change default value to false in moe plugin
* [misc] remove useless condition
* [chore] docstring
* [moe] remove force_overlap_comm flag and add warning instead
* [doc] add MoeHybridParallelPlugin docstring
* [moe] solve dp axis issue
* [chore] remove redundant test case, print string & reduce test tokens
* [feat] Dist Loader for Eval (#5950)
* support auto distributed data loader
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* support auto distributed data loader
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix tp error
* remove unused parameters
* remove unused
* update inference
* update docs
* update inference
---------
Co-authored-by: Michelle <qianranma8@gmail.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [lora] lora support hybrid parallel plugin (#5956)
* lora support hybrid plugin
* fix
* fix
* fix
* fix
* fp8 operators for compressed communication
cast_to_fp8, cast_from_fp8, all_reduce_fp8
* fix scaling algorithm in FP8 casting
* support fp8 communication in pipeline parallelism
* add fp8_communication flag in the script
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* shardformer fp8
* fix rebase
* remove all to all
* fix shardformer fp8 communication training degradation
* [fp8] support all-gather flat tensor (#5932)
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* Update low_level_optim.py
---------
Co-authored-by: YeAnbang <anbangy2@outlook.com>
Co-authored-by: Haze188 <haze188@qq.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Edenzzzz <wenxuan.tan@wisc.edu>
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: Runyu Lu <77330637+LRY89757@users.noreply.github.com>
Co-authored-by: Guangyao Zhang <xjtu521@qq.com>
Co-authored-by: YeAnbang <44796419+YeAnbang@users.noreply.github.com>
Co-authored-by: Hongxin Liu <lhx0217@gmail.com>
Co-authored-by: Stephan Kö <stephankoe@users.noreply.github.com>
Co-authored-by: アマデウス <kurisusnowdeng@users.noreply.github.com>
Co-authored-by: Tong Li <tong.li352711588@gmail.com>
Co-authored-by: zhurunhua <1281592874@qq.com>
Co-authored-by: Insu Jang <insujang@umich.edu>
Co-authored-by: Gao, Ruiyuan <905370712@qq.com>
Co-authored-by: hxwang <wang1570@e.ntu.edu.sg>
Co-authored-by: Michelle <qianranma8@gmail.com>
Co-authored-by: Wang Binluo <32676639+wangbluo@users.noreply.github.com>
Co-authored-by: HangXu <hangxu0304@gmail.com>
* [fp8]support all2all fp8 (#5953)
* support all2all fp8
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [fp8] add fp8 linear (#5967)
* [fp8] add fp8 linear
* [test] fix fp8 linear test condition
* [test] fix fp8 linear test condition
* [test] fix fp8 linear test condition
* [fp8] support fp8 amp for hybrid parallel plugin (#5975)
* [fp8] support fp8 amp for hybrid parallel plugin
* [test] add fp8 hook test
* [fp8] fix fp8 linear compatibility
* fix (#5976)
* [Feature]: support FP8 communication in DDP, FSDP, Gemini (#5928)
* support fp8_communication in the Torch DDP grad comm, FSDP grad comm, and FSDP params comm
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* implement communication hook for FSDP params all-gather
* added unit test for fp8 operators
* support fp8 communication in GeminiPlugin
* update training scripts to support fsdp and fp8 communication
* fixed some minor bugs observed in unit test
* add all_gather_into_tensor_flat_fp8
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* add skip the test if torch < 2.2.0
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* add skip the test if torch < 2.2.0
* add skip the test if torch < 2.2.0
* add fp8_comm flag
* rebase latest fp8 operators
* rebase latest fp8 operators
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [test ci]Feature/fp8 comm (#5981)
* fix
* fix
* fix
* [fp8] support gemini plugin (#5978)
* [fp8] refactor hook
* [fp8] support gemini plugin
* [example] add fp8 option for llama benchmark
* [fp8] use torch compile (torch >= 2.3.0) (#5979)
* [fp8] use torch compile (torch >= 2.4.0)
* [fp8] set use_fast_accum in linear
* [chore] formal version check
* [chore] fix sig
* [fp8]Moe support fp8 communication (#5977)
* fix
* support moe fp8
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
fix
fi
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [fp8] support hybrid parallel plugin (#5982)
* support fp8 comm for qwen2 model
* support fp8 comm for qwen2 model
* support fp8 comm for qwen2 model
* fp8
* fix
* bert and bloom
* chatglm and command
* gpt2,gptj,bert, falcon,blip2
* mistral,opy,sam,t5,vit,whisper
* fix
* fix
* fix
* [fp8] refactor fp8 linear with compile (#5993)
* [fp8] refactor fp8 linear with compile
* [fp8] fix linear test
* [fp8] fix linear test
* [fp8] support asynchronous FP8 communication (#5997)
* fix
* fix
* fix
* support async all2all
* support async op for all gather
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [fp8] update torch.compile for linear_fp8 to >= 2.4.0 (#6004)
* [fp8] linear perf enhancement
* [fp8]update reduce-scatter test (#6002)
* fix
* fix
* fix
* fix
* [fp8] add use_fp8 option for MoeHybridParallelPlugin (#6009)
* [fp8] zero support fp8 linear. (#6006)
* fix
* fix
* fix
* zero fp8
* zero fp8
* Update requirements.txt
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix the merge
* fix the merge
* fix the merge
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix the merge
* fix
* fix
* fix the merge
* fix
* fix
* fix
* fix
* fix
* fix the merge
* fix
* fix
* fix
* fix
* [fp8] Merge feature/fp8_comm to main branch of Colossalai (#6016)
* add SimPO
* fix dataloader
* remove debug code
* add orpo
* fix style
* fix colossalai, transformers version
* fix colossalai, transformers version
* fix colossalai, transformers version
* fix torch colossalai version
* update transformers version
* [shardformer] DeepseekMoE support (#5871)
* [Feature] deepseek moe expert parallel implement
* [misc] fix typo, remove redundant file (#5867)
* [misc] fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [Feature] deepseek support & unit test
* [misc] remove debug code & useless print
* [misc] fix typos (#5872)
* [Feature] remove modeling file, use auto config. (#5884)
* [misc] fix typos
* [Feature] deepseek support via auto model, remove modeling file
* [misc] delete useless file
* [misc] fix typos
* [Deepseek] remove redundant code (#5888)
* [misc] fix typos
* [Feature] deepseek support via auto model, remove modeling file
* [misc] delete useless file
* [misc] fix typos
* [misc] remove redundant code
* [Feature/deepseek] resolve comment. (#5889)
* [misc] fix typos
* [Feature] deepseek support via auto model, remove modeling file
* [misc] delete useless file
* [misc] fix typos
* [misc] remove redundant code
* [misc] mv module replacement into if branch
* [misc] add some warning message and modify some code in unit test
* [misc] fix typos
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [Hoxfix] Fix CUDA_DEVICE_MAX_CONNECTIONS for comm overlap
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [Feat] Diffusion Model(PixArtAlpha/StableDiffusion3) Support (#5838)
* Diffusion Model Inference support
* Stable Diffusion 3 Support
* pixartalpha support
* [HotFix] CI,import,requirements-test for #5838 (#5892)
* [Hot Fix] CI,import,requirements-test
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [Feature] Enable PP + SP for llama (#5868)
* fix cross-PP-stage position id length diff bug
* fix typo
* fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* use a one cross entropy func for all shardformer models
---------
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [ShardFormer] Add Ulysses Sequence Parallelism support for Command-R, Qwen2 and ChatGLM (#5897)
* add benchmark for sft, dpo, simpo, orpo. Add benchmarking result. Support lora with gradient checkpoint
* fix style
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix eval
* hotfix citation
* [zero] support all-gather overlap (#5898)
* [zero] support all-gather overlap
* [zero] add overlap all-gather flag
* [misc] fix typo
* [zero] update api
* fix orpo cross entropy loss
* [Auto Parallel]: Speed up intra-op plan generation by 44% (#5446)
* Remove unnecessary calls to deepcopy
* Build DimSpec's difference dict only once
This change considerably speeds up construction speed of DimSpec objects. The difference_dict is the same for each DimSpec object, so a single copy of it is enough.
* Fix documentation of DimSpec's difference method
* [ShardFormer] fix qwen2 sp (#5903)
* [compatibility] support torch 2.2 (#5875)
* Support Pytorch 2.2.2
* keep build_on_pr file and update .compatibility
* fix object_to_tensor usage when torch>=2.3.0 (#5820)
* [misc] support torch2.3 (#5893)
* [misc] support torch2.3
* [devops] update compatibility ci
* [devops] update compatibility ci
* [devops] add debug
* [devops] add debug
* [devops] add debug
* [devops] add debug
* [devops] remove debug
* [devops] remove debug
* [release] update version (#5912)
* [plugin] support all-gather overlap for hybrid parallel (#5919)
* [plugin] fixed all-gather overlap support for hybrid parallel
* add kto
* fix style, add kto data sample
* [Examples] Add lazy init to OPT and GPT examples (#5924)
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [ColossalChat] Hotfix for ColossalChat (#5910)
* add ignore and tiny llama
* fix path issue
* run style
* fix issue
* update bash
* add ignore and tiny llama
* fix path issue
* run style
* fix issue
* update bash
* fix ddp issue
* add Qwen 1.5 32B
* refactor tokenization
* [FIX BUG] UnboundLocalError: cannot access local variable 'default_conversation' where it is not associated with a value (#5931)
* cannot access local variable 'default_conversation' where it is not associated with a value
set default value for 'default_conversation'
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* fix test data
* refactor evaluation
* remove real data path
* remove real data path
* Add n_fused as an input from native_module (#5894)
* [FIX BUG] convert env param to int in (#5934)
* [Hotfix] Fix ZeRO typo #5936
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [Feature] Add a switch to control whether the model checkpoint needs to be saved after each epoch ends (#5941)
* Add a switch to control whether the model checkpoint needs to be saved after each epoch ends
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* fix style
* fix style
* fix style
* [shardformer] hotfix attn mask (#5945)
* [shardformer] hotfix attn mask (#5947)
* [Feat] Distrifusion Acceleration Support for Diffusion Inference (#5895)
* Distrifusion Support source
* comp comm overlap optimization
* sd3 benchmark
* pixart distrifusion bug fix
* sd3 bug fix and benchmark
* generation bug fix
* naming fix
* add docstring, fix counter and shape error
* add reference
* readme and requirement
* [zero] hotfix update master params (#5951)
* [release] update version (#5952)
* [Chat] Fix lora (#5946)
* fix merging
* remove filepath
* fix style
* Update README.md (#5958)
* [hotfix] Remove unused plan section (#5957)
* remove readme
* fix readme
* update
* [test] add mixtral for sequence classification
* [test] add mixtral transformer test
* [moe] fix plugin
* [test] mixtra pp shard test
* [chore] handle non member group
* [zero] solve hang
* [test] pass mixtral shardformer test
* [moe] implement transit between non moe tp and ep
* [zero] solve hang
* [misc] solve booster hang by rename the variable
* solve hang when parallel mode = pp + dp
* [moe] implement submesh initialization
* [moe] add mixtral dp grad scaling when not all experts are activated
* [chore] manually revert unintended commit
* [chore] trivial fix
* [chore] arg pass & remove drop token
* [test] add mixtral modelling test
* [moe] implement tp
* [moe] test deepseek
* [moe] clean legacy code
* [Feature] MoE Ulysses Support (#5918)
* moe sp support
* moe sp bug solve
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [chore] minor fix
* [moe] init moe plugin comm setting with sp
* moe sp + ep bug fix
* [moe] finalize test (no pp)
* [moe] full test for deepseek and mixtral (pp + sp to fix)
* [chore] minor fix after rebase
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* [chore] solve moe ckpt test failure and some other arg pass failure
* [moe] remove ops
* [test] fix test: test_zero1_2
* [bug] fix: somehow logger hangs the program
* [moe] deepseek moe sp support
* [test] add check
* [deepseek] replace attn (a workaround for bug in transformers)
* [misc] skip redunant test
* [misc] remove debug/print code
* [moe] refactor mesh assignment
* Revert "[moe] implement submesh initialization"
This reverts commit 2f9bce6686d1415a83d5726dc5ff02222c742582.
* [chore] change moe_pg_mesh to private
* [misc] remove incompatible test config
* [misc] fix ci failure: change default value to false in moe plugin
* [misc] remove useless condition
* [chore] docstring
* [moe] remove force_overlap_comm flag and add warning instead
* [doc] add MoeHybridParallelPlugin docstring
* [moe] solve dp axis issue
* [chore] remove redundant test case, print string & reduce test tokens
* [feat] Dist Loader for Eval (#5950)
* support auto distributed data loader
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* support auto distributed data loader
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix tp error
* remove unused parameters
* remove unused
* update inference
* update docs
* update inference
---------
Co-authored-by: Michelle <qianranma8@gmail.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [lora] lora support hybrid parallel plugin (#5956)
* lora support hybrid plugin
* fix
* fix
* fix
* fix
* Support overall loss, update KTO logging
* [Docs] clarify launch port
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [Hotfix] README link (#5966)
* update ignore
* update readme
* run style
* update readme
* [Hotfix] Avoid fused RMSnorm import error without apex (#5985)
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [Chat] fix readme (#5989)
* fix readme
* fix readme, tokenization fully tested
* fix readme, tokenization fully tested
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: root <root@notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9-0.notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9.colossal-ai.svc.cluster.local>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* fix sync condition (#6000)
* [plugin] add cast inputs option for zero (#6003)
* [pre-commit.ci] pre-commit autoupdate (#5995)
updates:
- [github.com/psf/black-pre-commit-mirror: 24.4.2 → 24.8.0](https://github.com/psf/black-pre-commit-mirror/compare/24.4.2...24.8.0)
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [misc] Bypass the huggingface bug to solve the mask mismatch problem (#5991)
* [Feature] Zigzag Ring attention (#5905)
* halfway
* fix cross-PP-stage position id length diff bug
* fix typo
* fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* unified cross entropy func for all shardformer models
* remove redundant lines
* add basic ring attn; debug cross entropy
* fwd bwd logic complete
* fwd bwd logic complete; add experimental triton rescale
* precision tests passed
* precision tests passed
* fix typos and remove misc files
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* add sp_mode to benchmark; fix varlen interface
* update softmax_lse shape by new interface
* change tester name
* remove buffer clone; support packed seq layout
* add varlen tests
* fix typo
* all tests passed
* add dkv_group; fix mask
* remove debug statements
---------
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [misc] update compatibility (#6008)
* [misc] update compatibility
* [misc] update requirements
* [devops] disable requirements cache
* [test] fix torch ddp test
* [test] fix rerun on address in use
* [test] fix lazy init
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix the merge
* fix the merge
* overlap kv comm with output rescale (#6017)
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* fix the merge
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix the merge
* fix
* fix
* fix the merge
* fix
* [misc] Use dist logger in plugins (#6011)
* use dist logger in plugins
* remove trash
* print on rank 0
---------
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* fix
* fix
* fix
* fix
* fix the merge
* fix
* fix
* fix
* fix
---------
Co-authored-by: YeAnbang <anbangy2@outlook.com>
Co-authored-by: Haze188 <haze188@qq.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Edenzzzz <wenxuan.tan@wisc.edu>
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: Runyu Lu <77330637+LRY89757@users.noreply.github.com>
Co-authored-by: Guangyao Zhang <xjtu521@qq.com>
Co-authored-by: YeAnbang <44796419+YeAnbang@users.noreply.github.com>
Co-authored-by: Hongxin Liu <lhx0217@gmail.com>
Co-authored-by: Stephan Kö <stephankoe@users.noreply.github.com>
Co-authored-by: アマデウス <kurisusnowdeng@users.noreply.github.com>
Co-authored-by: Tong Li <tong.li352711588@gmail.com>
Co-authored-by: zhurunhua <1281592874@qq.com>
Co-authored-by: Insu Jang <insujang@umich.edu>
Co-authored-by: Gao, Ruiyuan <905370712@qq.com>
Co-authored-by: hxwang <wang1570@e.ntu.edu.sg>
Co-authored-by: Michelle <qianranma8@gmail.com>
Co-authored-by: root <root@notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9-0.notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9.colossal-ai.svc.cluster.local>
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* Update train_dpo.py
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* Update low_level_zero_plugin.py
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* [CI] Remove triton version for compatibility bug; update req torch >=2.2 (#6018)
* remove triton version
* remove torch 2.2
* remove torch 2.1
* debug
* remove 2.1 build tests
* require torch >=2.2
---------
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [plugin] hotfix zero plugin (#6036)
* [plugin] hotfix zero plugin
* [plugin] hotfix zero plugin
* [Colossal-LLaMA] Refactor latest APIs (#6030)
* refactor latest code
* update api
* add dummy dataset
* update Readme
* add setup
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* update files
* add PP support
* update arguments
* update argument
* reorg folder
* update version
* remove IB infor
* update utils
* update readme
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* update save for zero
* update save
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* add apex
* update
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* add fused norm (#6038)
* [FP8] unsqueeze scale to make it compatible with torch.compile (#6040)
* [colossalai/checkpoint_io/...] fix bug in load_state_dict_into_model; format error msg (#6020)
* fix bug in load_state_dict_into_model; format error msg
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* Update utils.py
to support checking missing_keys
* Update general_checkpoint_io.py
fix bug in missing_keys error message
* retrigger tests
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [Hotfix] Remove deprecated install (#6042)
* remove deprecated install
* remove unused folder
* [fp8] optimize all-gather (#6043)
* [fp8] optimize all-gather
* [fp8] fix all gather fp8 ring
* [fp8] enable compile
* [fp8] fix all gather fp8 ring
* [fp8] fix linear hook (#6046)
* [fp8] disable all_to_all_fp8 in intranode (#6045)
* enhance all_to_all_fp8 with internode comm control
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* disable some fp8 ops due to performance issue
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [release] update version (#6041)
* [release] update version
* [devops] update comp test
* [devops] update comp test debug
* [devops] debug comp test
* [devops] debug comp test
* [devops] debug comp test
* [devops] debug comp test
* [devops] debug comp test
* [Feature] Split cross-entropy computation in SP (#5959)
* halfway
* fix cross-PP-stage position id length diff bug
* fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* unified cross entropy func for all shardformer models
* remove redundant lines
* add basic ring attn; debug cross entropy
* fwd bwd logic complete
* fwd bwd logic complete; add experimental triton rescale
* precision tests passed
* precision tests passed
* fix typos and remove misc files
* update softmax_lse shape by new interface
* change tester name
* remove buffer clone; support packed seq layout
* add varlen tests
* fix typo
* all tests passed
* add dkv_group; fix mask
* remove debug statements
* adapt chatglm, command-R, qwen
* debug
* halfway
* fix cross-PP-stage position id length diff bug
* fix typo
* fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* unified cross entropy func for all shardformer models
* remove redundant lines
* add basic ring attn; debug cross entropy
* fwd bwd logic complete
* fwd bwd logic complete; add experimental triton rescale
* precision tests passed
* precision tests passed
* fix typos and remove misc files
* add sp_mode to benchmark; fix varlen interface
* update softmax_lse shape by new interface
* add varlen tests
* fix typo
* all tests passed
* add dkv_group; fix mask
* remove debug statements
* add comments
* q1 index only once
* remove events to simplify stream sync
* simplify forward/backward logic
* 2d ring forward passed
* 2d ring backward passed
* fixes
* fix ring attn loss
* 2D ring backward + llama passed
* merge
* update logger
* fix typo
* rebase
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix typo
* remove typos
* fixes
* support GPT
---------
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [hotfix] moe hybrid parallelism benchmark & follow-up fix (#6048)
* [example] pass use_fp8_comm flag to all plugins
* [example] add mixtral benchmark
* [moe] refine assertion and check
* [moe] fix mixtral & add more tests
* [moe] consider checking dp * sp group and moe_dp_group
* [mixtral] remove gate tp & add more tests
* [deepseek] fix tp & sp for deepseek
* [mixtral] minor fix
* [deepseek] add deepseek benchmark
* [fp8] hotfix backward hook (#6053)
* [fp8] hotfix backward hook
* [fp8] hotfix pipeline loss accumulation
* [doc] update sp doc (#6055)
* update sp doc
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* fix the sp
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix the attn
* fix
* fix
* fix
* fix
* [zerobubble]Support ZeroBubble Pipeline (#6034)
* [feat] add zerobubble pp (just a frame now); add POC test for dx_dw; add test for zerobubble;
* [feat] add dw test;
* [fix] fix weight not close;
* [update] update text;
* [feat] add test run_fwd_bwd automatic scheduling;
* [feat] split communication and calculation; fix pop empty send_bwd_buffer error;
* [feat] add test for p & p grad;
* [feat] add comments for ZBV func;
* [fix] rm useless assign and comments;
* [fix] fix ci test; add pytest;
* [feat] add run_fwd_bwd_with_microbatch (replace input) & test; add p&p.grad assert close test & all pass;
* [feat] add apply v_schedule graph; p & p.grad assert err exist;
* [fix] update
* [feat] fix ci; add assert;
* [feat] fix poc format
* [feat] fix func name & ci; add comments;
* [fix] fix poc test; add comments in poc;
* [feat] add optim backward_b_by_grad
* [feat] fix optimizer bwd b & w; support return accum loss & output
* [feat] add fwd_bwd_step, run_fwd_only;
* [fix] fix optim bwd; add license for v_schedule; remove redundant attributes; fix schedule loop "while"--> "for"; add communication dict;
* [fix] fix communication_map;
* [feat] update test; rm comments;
* [fix] rm zbv in hybridplugin
* [fix] fix optim bwd;
* [fix] fix optim bwd;
* [fix] rm output.data after send fwd;
* [fix] fix bwd step if condition; remove useless comments and format info;
* [fix] fix detach output & release output;
* [fix] rm requir_grad for output;
* [fix] fix requir grad position and detach position and input&output local buffer append position;
* [feat] add memory assertation;
* [fix] fix mem check;
* [fix] mem assertation'
* [fix] fix mem assertation
* [fix] fix mem; use a new model shape; only assert mem less and equal than theo;
* [fix] fix model zoo import;
* [fix] fix redundant detach & clone; add buffer assertation in the end;
* [fix] add output_obj_grad assert None at bwd b step; replace input_obj.require_grad_ with treemap;
* [fix] update optim state dict assert (include param group & state); fix mem assert after add optim;
* [fix] add testcase with microbatch 4;
* [fp8] fix missing fp8_comm flag in mixtral (#6057)
* fix
* fix
* fix
* [fp8] Disable all_gather intranode. Disable Redundant all_gather fp8 (#6059)
* all_gather only internode, fix pytest
* fix cuda arch <89 compile pytest error
* fix pytest failure
* disable all_gather_into_tensor_flat_fp8
* fix fp8 format
* fix pytest
* fix conversations
* fix chunk tuple to list
* [doc] FP8 training and communication document (#6050)
* Add FP8 training and communication document
* add fp8 docstring for plugins
* fix typo
* fix typo
* fix
* fix
* [moe] add parallel strategy for shared_expert && fix test for deepseek (#6063)
* [ColossalEval] support for vllm (#6056)
* support vllm
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* modify vllm and update readme
* run pre-commit
* remove dupilicated lines and refine code
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* update param name
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* refine code
* update readme
* refine code
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [release] update version (#6062)
* [feat] add zerobubble pp (just a frame now); add POC test for dx_dw; add test for zerobubble;
* [update] update text;
* [feat] add test run_fwd_bwd automatic scheduling;
* [feat] fix poc format
* [fix] fix poc test; add comments in poc;
* [feat] add optim backward_b_by_grad
* [feat] fix optimizer bwd b & w; support return accum loss & output
* [fix] fix optim bwd; add license for v_schedule; remove redundant attributes; fix schedule loop "while"--> "for"; add communication dict;
* [feat] update test; rm comments;
* [fix] rm zbv in hybridplugin
* [fix] fix optim bwd;
* [fix] fix optim bwd;
* [fix] rm output.data after send fwd;
* [fix] fix bwd step if condition; remove useless comments and format info;
* [fix] fix mem check;
* [fix] fix mem assertation
* [fix] fix mem; use a new model shape; only assert mem less and equal than theo;
* [fix] fix model zoo import;
* [feat] moehybrid support zerobubble;
* [fix] fix zerobubble pp for shardformer type input;
* [fix] fix require_grad & deallocate call;
* [fix] fix mem assert;
* [fix] fix fwd branch, fwd pass both micro_batch & internal_inputs'
* [fix] fix pipeline util func deallocate --> release_tensor_data; fix bwd_b loss bwd branch;
* [fix] fix zerobubble; support shardformer model type;
* [fix] fix test_pipeline_utils ci;
* [plugin] hybrid support zero bubble pipeline (#6060)
* hybrid support zbv
* fix
fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* Update zero_bubble_pp.py
* fix
* fix-ci
* fix
[pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* fix
* [zerobubble]Support ZeroBubble Pipeline (#6034)
* [feat] add zerobubble pp (just a frame now); add POC test for dx_dw; add test for zerobubble;
* [feat] add dw test;
* [fix] fix weight not close;
* [update] update text;
* [feat] add test run_fwd_bwd automatic scheduling;
* [feat] split communication and calculation; fix pop empty send_bwd_buffer error;
* [feat] add test for p & p grad;
* [feat] add comments for ZBV func;
* [fix] rm useless assign and comments;
* [fix] fix ci test; add pytest;
* [feat] add run_fwd_bwd_with_microbatch (replace input) & test; add p&p.grad assert close test & all pass;
* [feat] add apply v_schedule graph; p & p.grad assert err exist;
* [fix] update
* [feat] fix ci; add assert;
* [feat] fix poc format
* [feat] fix func name & ci; add comments;
* [fix] fix poc test; add comments in poc;
* [feat] add optim backward_b_by_grad
* [feat] fix optimizer bwd b & w; support return accum loss & output
* [feat] add fwd_bwd_step, run_fwd_only;
* [fix] fix optim bwd; add license for v_schedule; remove redundant attributes; fix schedule loop "while"--> "for"; add communication dict;
* [fix] fix communication_map;
* [feat] update test; rm comments;
* [fix] rm zbv in hybridplugin
* [fix] fix optim bwd;
* [fix] fix optim bwd;
* [fix] rm output.data after send fwd;
* [fix] fix bwd step if condition; remove useless comments and format info;
* [fix] fix detach output & release output;
* [fix] rm requir_grad for output;
* [fix] fix requir grad position and detach position and input&output local buffer append position;
* [feat] add memory assertation;
* [fix] fix mem check;
* [fix] mem assertation'
* [fix] fix mem assertation
* [fix] fix mem; use a new model shape; only assert mem less and equal than theo;
* [fix] fix model zoo import;
* [fix] fix redundant detach & clone; add buffer assertation in the end;
* [fix] add output_obj_grad assert None at bwd b step; replace input_obj.require_grad_ with treemap;
* [fix] update optim state dict assert (include param group & state); fix mem assert after add optim;
* [fix] add testcase with microbatch 4;
* hybrid support zbv
* fix
fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* Update zero_bubble_pp.py
* fix
* fix-ci
* fix
[pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* fix
* fix
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* fix
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: duanjunwen <935724073@qq.com>
* [feat] add zerobubble pp (just a frame now); add POC test for dx_dw; add test for zerobubble;
* [update] update text;
* [feat] add test run_fwd_bwd automatic scheduling;
* [feat] fix poc format
* [fix] fix poc test; add comments in poc;
* [feat] add optim backward_b_by_grad
* [feat] fix optimizer bwd b & w; support return accum loss & output
* [fix] fix optim bwd; add license for v_schedule; remove redundant attributes; fix schedule loop "while"--> "for"; add communication dict;
* [feat] update test; rm comments;
* [fix] fix optim bwd;
* [fix] fix optim bwd;
* [fix] rm output.data after send fwd;
* [fix] fix bwd step if condition; remove useless comments and format info;
* [fix] fix mem check;
* [fix] fix mem assertation
* [fix] fix mem; use a new model shape; only assert mem less and equal than theo;
* [fix] fix model zoo import;
* [fix] fix mem assert;
* [fix] fix fwd branch, fwd pass both micro_batch & internal_inputs'
* [plugin] hybrid support zero bubble pipeline (#6060)
* hybrid support zbv
* fix
fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* Update zero_bubble_pp.py
* fix
* fix-ci
* fix
[pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* fix
* [zerobubble]Support ZeroBubble Pipeline (#6034)
* [feat] add zerobubble pp (just a frame now); add POC test for dx_dw; add test for zerobubble;
* [feat] add dw test;
* [fix] fix weight not close;
* [update] update text;
* [feat] add test run_fwd_bwd automatic scheduling;
* [feat] split communication and calculation; fix pop empty send_bwd_buffer error;
* [feat] add test for p & p grad;
* [feat] add comments for ZBV func;
* [fix] rm useless assign and comments;
* [fix] fix ci test; add pytest;
* [feat] add run_fwd_bwd_with_microbatch (replace input) & test; add p&p.grad assert close test & all pass;
* [feat] add apply v_schedule graph; p & p.grad assert err exist;
* [fix] update
* [feat] fix ci; add assert;
* [feat] fix poc format
* [feat] fix func name & ci; add comments;
* [fix] fix poc test; add comments in poc;
* [feat] add optim backward_b_by_grad
* [feat] fix optimizer bwd b & w; support return accum loss & output
* [feat] add fwd_bwd_step, run_fwd_only;
* [fix] fix optim bwd; add license for v_schedule; remove redundant attributes; fix schedule loop "while"--> "for"; add communication dict;
* [fix] fix communication_map;
* [feat] update test; rm comments;
* [fix] rm zbv in hybridplugin
* [fix] fix optim bwd;
* [fix] fix optim bwd;
* [fix] rm output.data after send fwd;
* [fix] fix bwd step if condition; remove useless comments and format info;
* [fix] fix detach output & release output;
* [fix] rm requir_grad for output;
* [fix] fix requir grad position and detach position and input&output local buffer append position;
* [feat] add memory assertation;
* [fix] fix mem check;
* [fix] mem assertation'
* [fix] fix mem assertation
* [fix] fix mem; use a new model shape; only assert mem less and equal than theo;
* [fix] fix model zoo import;
* [fix] fix redundant detach & clone; add buffer assertation in the end;
* [fix] add output_obj_grad assert None at bwd b step; replace input_obj.require_grad_ with treemap;
* [fix] update optim state dict assert (include param group & state); fix mem assert after add optim;
* [fix] add testcase with microbatch 4;
* hybrid support zbv
* fix
fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* Update zero_bubble_pp.py
* fix
* fix-ci
* fix
[pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* fix
* fix
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* fix
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: duanjunwen <935724073@qq.com>
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: HangXu <hangxu0304@gmail.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: GuangyaoZhang <xjtu521@qq.com>
Co-authored-by: Hongxin Liu <lhx0217@gmail.com>
Co-authored-by: YeAnbang <anbangy2@outlook.com>
Co-authored-by: Haze188 <haze188@qq.com>
Co-authored-by: Edenzzzz <wenxuan.tan@wisc.edu>
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: Runyu Lu <77330637+LRY89757@users.noreply.github.com>
Co-authored-by: YeAnbang <44796419+YeAnbang@users.noreply.github.com>
Co-authored-by: Stephan Kö <stephankoe@users.noreply.github.com>
Co-authored-by: アマデウス <kurisusnowdeng@users.noreply.github.com>
Co-authored-by: Tong Li <tong.li352711588@gmail.com>
Co-authored-by: zhurunhua <1281592874@qq.com>
Co-authored-by: Insu Jang <insujang@umich.edu>
Co-authored-by: Gao, Ruiyuan <905370712@qq.com>
Co-authored-by: hxwang <wang1570@e.ntu.edu.sg>
Co-authored-by: Michelle <qianranma8@gmail.com>
Co-authored-by: Wang Binluo <32676639+wangbluo@users.noreply.github.com>
Co-authored-by: wangbluo <2538539015@qq.com>
Co-authored-by: root <root@notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9-0.notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9.colossal-ai.svc.cluster.local>
Co-authored-by: duanjunwen <935724073@qq.com>
Co-authored-by: Camille Zhong <44392324+Camille7777@users.noreply.github.com>
2024-10-08 07:58:00 +00:00
|
|
|
|
2023-08-04 06:55:31 +00:00
|
|
|
loss = None
|
|
|
|
if labels is not None:
|
[zerobubble] rebase main (#6075)
* fp8 operators for compressed communication
cast_to_fp8, cast_from_fp8, all_reduce_fp8
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix typo
* fix scaling algorithm in FP8 casting
* support fp8 communication in pipeline parallelism
* add fp8_communication flag in the script
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* shardformer fp8
* fix rebase
* remove all to all
* fix shardformer fp8 communication training degradation
* [fp8] support all-gather flat tensor (#5932)
* [fp8] add fp8 comm for low level zero
* [test] add zero fp8 test case
* [Feature] llama shardformer fp8 support (#5938)
* add llama shardformer fp8
* Llama Shardformer Parity
* fix typo
* fix all reduce
* fix pytest failure
* fix reduce op and move function to fp8.py
* fix typo
* [FP8] rebase main (#5963)
* add SimPO
* fix dataloader
* remove debug code
* add orpo
* fix style
* fix colossalai, transformers version
* fix colossalai, transformers version
* fix colossalai, transformers version
* fix torch colossalai version
* update transformers version
* [shardformer] DeepseekMoE support (#5871)
* [Feature] deepseek moe expert parallel implement
* [misc] fix typo, remove redundant file (#5867)
* [misc] fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [Feature] deepseek support & unit test
* [misc] remove debug code & useless print
* [misc] fix typos (#5872)
* [Feature] remove modeling file, use auto config. (#5884)
* [misc] fix typos
* [Feature] deepseek support via auto model, remove modeling file
* [misc] delete useless file
* [misc] fix typos
* [Deepseek] remove redundant code (#5888)
* [misc] fix typos
* [Feature] deepseek support via auto model, remove modeling file
* [misc] delete useless file
* [misc] fix typos
* [misc] remove redundant code
* [Feature/deepseek] resolve comment. (#5889)
* [misc] fix typos
* [Feature] deepseek support via auto model, remove modeling file
* [misc] delete useless file
* [misc] fix typos
* [misc] remove redundant code
* [misc] mv module replacement into if branch
* [misc] add some warning message and modify some code in unit test
* [misc] fix typos
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [Hoxfix] Fix CUDA_DEVICE_MAX_CONNECTIONS for comm overlap
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [Feat] Diffusion Model(PixArtAlpha/StableDiffusion3) Support (#5838)
* Diffusion Model Inference support
* Stable Diffusion 3 Support
* pixartalpha support
* [HotFix] CI,import,requirements-test for #5838 (#5892)
* [Hot Fix] CI,import,requirements-test
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [Feature] Enable PP + SP for llama (#5868)
* fix cross-PP-stage position id length diff bug
* fix typo
* fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* use a one cross entropy func for all shardformer models
---------
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [ShardFormer] Add Ulysses Sequence Parallelism support for Command-R, Qwen2 and ChatGLM (#5897)
* add benchmark for sft, dpo, simpo, orpo. Add benchmarking result. Support lora with gradient checkpoint
* fix style
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix eval
* hotfix citation
* [zero] support all-gather overlap (#5898)
* [zero] support all-gather overlap
* [zero] add overlap all-gather flag
* [misc] fix typo
* [zero] update api
* fix orpo cross entropy loss
* [Auto Parallel]: Speed up intra-op plan generation by 44% (#5446)
* Remove unnecessary calls to deepcopy
* Build DimSpec's difference dict only once
This change considerably speeds up construction speed of DimSpec objects. The difference_dict is the same for each DimSpec object, so a single copy of it is enough.
* Fix documentation of DimSpec's difference method
* [ShardFormer] fix qwen2 sp (#5903)
* [compatibility] support torch 2.2 (#5875)
* Support Pytorch 2.2.2
* keep build_on_pr file and update .compatibility
* fix object_to_tensor usage when torch>=2.3.0 (#5820)
* [misc] support torch2.3 (#5893)
* [misc] support torch2.3
* [devops] update compatibility ci
* [devops] update compatibility ci
* [devops] add debug
* [devops] add debug
* [devops] add debug
* [devops] add debug
* [devops] remove debug
* [devops] remove debug
* [release] update version (#5912)
* [plugin] support all-gather overlap for hybrid parallel (#5919)
* [plugin] fixed all-gather overlap support for hybrid parallel
* add kto
* fix style, add kto data sample
* [Examples] Add lazy init to OPT and GPT examples (#5924)
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [ColossalChat] Hotfix for ColossalChat (#5910)
* add ignore and tiny llama
* fix path issue
* run style
* fix issue
* update bash
* add ignore and tiny llama
* fix path issue
* run style
* fix issue
* update bash
* fix ddp issue
* add Qwen 1.5 32B
* refactor tokenization
* [FIX BUG] UnboundLocalError: cannot access local variable 'default_conversation' where it is not associated with a value (#5931)
* cannot access local variable 'default_conversation' where it is not associated with a value
set default value for 'default_conversation'
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* fix test data
* refactor evaluation
* remove real data path
* remove real data path
* Add n_fused as an input from native_module (#5894)
* [FIX BUG] convert env param to int in (#5934)
* [Hotfix] Fix ZeRO typo #5936
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [Feature] Add a switch to control whether the model checkpoint needs to be saved after each epoch ends (#5941)
* Add a switch to control whether the model checkpoint needs to be saved after each epoch ends
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* fix style
* fix style
* fix style
* [shardformer] hotfix attn mask (#5945)
* [shardformer] hotfix attn mask (#5947)
* [Feat] Distrifusion Acceleration Support for Diffusion Inference (#5895)
* Distrifusion Support source
* comp comm overlap optimization
* sd3 benchmark
* pixart distrifusion bug fix
* sd3 bug fix and benchmark
* generation bug fix
* naming fix
* add docstring, fix counter and shape error
* add reference
* readme and requirement
* [zero] hotfix update master params (#5951)
* [release] update version (#5952)
* [Chat] Fix lora (#5946)
* fix merging
* remove filepath
* fix style
* Update README.md (#5958)
* [hotfix] Remove unused plan section (#5957)
* remove readme
* fix readme
* update
* [test] add mixtral for sequence classification
* [test] add mixtral transformer test
* [moe] fix plugin
* [test] mixtra pp shard test
* [chore] handle non member group
* [zero] solve hang
* [test] pass mixtral shardformer test
* [moe] implement transit between non moe tp and ep
* [zero] solve hang
* [misc] solve booster hang by rename the variable
* solve hang when parallel mode = pp + dp
* [moe] implement submesh initialization
* [moe] add mixtral dp grad scaling when not all experts are activated
* [chore] manually revert unintended commit
* [chore] trivial fix
* [chore] arg pass & remove drop token
* [test] add mixtral modelling test
* [moe] implement tp
* [moe] test deepseek
* [moe] clean legacy code
* [Feature] MoE Ulysses Support (#5918)
* moe sp support
* moe sp bug solve
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [chore] minor fix
* [moe] init moe plugin comm setting with sp
* moe sp + ep bug fix
* [moe] finalize test (no pp)
* [moe] full test for deepseek and mixtral (pp + sp to fix)
* [chore] minor fix after rebase
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* [chore] solve moe ckpt test failure and some other arg pass failure
* [moe] remove ops
* [test] fix test: test_zero1_2
* [bug] fix: somehow logger hangs the program
* [moe] deepseek moe sp support
* [test] add check
* [deepseek] replace attn (a workaround for bug in transformers)
* [misc] skip redunant test
* [misc] remove debug/print code
* [moe] refactor mesh assignment
* Revert "[moe] implement submesh initialization"
This reverts commit 2f9bce6686d1415a83d5726dc5ff02222c742582.
* [chore] change moe_pg_mesh to private
* [misc] remove incompatible test config
* [misc] fix ci failure: change default value to false in moe plugin
* [misc] remove useless condition
* [chore] docstring
* [moe] remove force_overlap_comm flag and add warning instead
* [doc] add MoeHybridParallelPlugin docstring
* [moe] solve dp axis issue
* [chore] remove redundant test case, print string & reduce test tokens
* [feat] Dist Loader for Eval (#5950)
* support auto distributed data loader
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* support auto distributed data loader
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix tp error
* remove unused parameters
* remove unused
* update inference
* update docs
* update inference
---------
Co-authored-by: Michelle <qianranma8@gmail.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [lora] lora support hybrid parallel plugin (#5956)
* lora support hybrid plugin
* fix
* fix
* fix
* fix
* fp8 operators for compressed communication
cast_to_fp8, cast_from_fp8, all_reduce_fp8
* fix scaling algorithm in FP8 casting
* support fp8 communication in pipeline parallelism
* add fp8_communication flag in the script
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* shardformer fp8
* fix rebase
* remove all to all
* fix shardformer fp8 communication training degradation
* [fp8] support all-gather flat tensor (#5932)
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* Update low_level_optim.py
---------
Co-authored-by: YeAnbang <anbangy2@outlook.com>
Co-authored-by: Haze188 <haze188@qq.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Edenzzzz <wenxuan.tan@wisc.edu>
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: Runyu Lu <77330637+LRY89757@users.noreply.github.com>
Co-authored-by: Guangyao Zhang <xjtu521@qq.com>
Co-authored-by: YeAnbang <44796419+YeAnbang@users.noreply.github.com>
Co-authored-by: Hongxin Liu <lhx0217@gmail.com>
Co-authored-by: Stephan Kö <stephankoe@users.noreply.github.com>
Co-authored-by: アマデウス <kurisusnowdeng@users.noreply.github.com>
Co-authored-by: Tong Li <tong.li352711588@gmail.com>
Co-authored-by: zhurunhua <1281592874@qq.com>
Co-authored-by: Insu Jang <insujang@umich.edu>
Co-authored-by: Gao, Ruiyuan <905370712@qq.com>
Co-authored-by: hxwang <wang1570@e.ntu.edu.sg>
Co-authored-by: Michelle <qianranma8@gmail.com>
Co-authored-by: Wang Binluo <32676639+wangbluo@users.noreply.github.com>
Co-authored-by: HangXu <hangxu0304@gmail.com>
* [fp8]support all2all fp8 (#5953)
* support all2all fp8
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [fp8] add fp8 linear (#5967)
* [fp8] add fp8 linear
* [test] fix fp8 linear test condition
* [test] fix fp8 linear test condition
* [test] fix fp8 linear test condition
* [fp8] support fp8 amp for hybrid parallel plugin (#5975)
* [fp8] support fp8 amp for hybrid parallel plugin
* [test] add fp8 hook test
* [fp8] fix fp8 linear compatibility
* fix (#5976)
* [Feature]: support FP8 communication in DDP, FSDP, Gemini (#5928)
* support fp8_communication in the Torch DDP grad comm, FSDP grad comm, and FSDP params comm
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* implement communication hook for FSDP params all-gather
* added unit test for fp8 operators
* support fp8 communication in GeminiPlugin
* update training scripts to support fsdp and fp8 communication
* fixed some minor bugs observed in unit test
* add all_gather_into_tensor_flat_fp8
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* add skip the test if torch < 2.2.0
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* add skip the test if torch < 2.2.0
* add skip the test if torch < 2.2.0
* add fp8_comm flag
* rebase latest fp8 operators
* rebase latest fp8 operators
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [test ci]Feature/fp8 comm (#5981)
* fix
* fix
* fix
* [fp8] support gemini plugin (#5978)
* [fp8] refactor hook
* [fp8] support gemini plugin
* [example] add fp8 option for llama benchmark
* [fp8] use torch compile (torch >= 2.3.0) (#5979)
* [fp8] use torch compile (torch >= 2.4.0)
* [fp8] set use_fast_accum in linear
* [chore] formal version check
* [chore] fix sig
* [fp8]Moe support fp8 communication (#5977)
* fix
* support moe fp8
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
fix
fi
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [fp8] support hybrid parallel plugin (#5982)
* support fp8 comm for qwen2 model
* support fp8 comm for qwen2 model
* support fp8 comm for qwen2 model
* fp8
* fix
* bert and bloom
* chatglm and command
* gpt2,gptj,bert, falcon,blip2
* mistral,opy,sam,t5,vit,whisper
* fix
* fix
* fix
* [fp8] refactor fp8 linear with compile (#5993)
* [fp8] refactor fp8 linear with compile
* [fp8] fix linear test
* [fp8] fix linear test
* [fp8] support asynchronous FP8 communication (#5997)
* fix
* fix
* fix
* support async all2all
* support async op for all gather
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [fp8] update torch.compile for linear_fp8 to >= 2.4.0 (#6004)
* [fp8] linear perf enhancement
* [fp8]update reduce-scatter test (#6002)
* fix
* fix
* fix
* fix
* [fp8] add use_fp8 option for MoeHybridParallelPlugin (#6009)
* [fp8] zero support fp8 linear. (#6006)
* fix
* fix
* fix
* zero fp8
* zero fp8
* Update requirements.txt
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix the merge
* fix the merge
* fix the merge
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix the merge
* fix
* fix
* fix the merge
* fix
* fix
* fix
* fix
* fix
* fix the merge
* fix
* fix
* fix
* fix
* [fp8] Merge feature/fp8_comm to main branch of Colossalai (#6016)
* add SimPO
* fix dataloader
* remove debug code
* add orpo
* fix style
* fix colossalai, transformers version
* fix colossalai, transformers version
* fix colossalai, transformers version
* fix torch colossalai version
* update transformers version
* [shardformer] DeepseekMoE support (#5871)
* [Feature] deepseek moe expert parallel implement
* [misc] fix typo, remove redundant file (#5867)
* [misc] fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [Feature] deepseek support & unit test
* [misc] remove debug code & useless print
* [misc] fix typos (#5872)
* [Feature] remove modeling file, use auto config. (#5884)
* [misc] fix typos
* [Feature] deepseek support via auto model, remove modeling file
* [misc] delete useless file
* [misc] fix typos
* [Deepseek] remove redundant code (#5888)
* [misc] fix typos
* [Feature] deepseek support via auto model, remove modeling file
* [misc] delete useless file
* [misc] fix typos
* [misc] remove redundant code
* [Feature/deepseek] resolve comment. (#5889)
* [misc] fix typos
* [Feature] deepseek support via auto model, remove modeling file
* [misc] delete useless file
* [misc] fix typos
* [misc] remove redundant code
* [misc] mv module replacement into if branch
* [misc] add some warning message and modify some code in unit test
* [misc] fix typos
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [Hoxfix] Fix CUDA_DEVICE_MAX_CONNECTIONS for comm overlap
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [Feat] Diffusion Model(PixArtAlpha/StableDiffusion3) Support (#5838)
* Diffusion Model Inference support
* Stable Diffusion 3 Support
* pixartalpha support
* [HotFix] CI,import,requirements-test for #5838 (#5892)
* [Hot Fix] CI,import,requirements-test
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [Feature] Enable PP + SP for llama (#5868)
* fix cross-PP-stage position id length diff bug
* fix typo
* fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* use a one cross entropy func for all shardformer models
---------
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [ShardFormer] Add Ulysses Sequence Parallelism support for Command-R, Qwen2 and ChatGLM (#5897)
* add benchmark for sft, dpo, simpo, orpo. Add benchmarking result. Support lora with gradient checkpoint
* fix style
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix eval
* hotfix citation
* [zero] support all-gather overlap (#5898)
* [zero] support all-gather overlap
* [zero] add overlap all-gather flag
* [misc] fix typo
* [zero] update api
* fix orpo cross entropy loss
* [Auto Parallel]: Speed up intra-op plan generation by 44% (#5446)
* Remove unnecessary calls to deepcopy
* Build DimSpec's difference dict only once
This change considerably speeds up construction speed of DimSpec objects. The difference_dict is the same for each DimSpec object, so a single copy of it is enough.
* Fix documentation of DimSpec's difference method
* [ShardFormer] fix qwen2 sp (#5903)
* [compatibility] support torch 2.2 (#5875)
* Support Pytorch 2.2.2
* keep build_on_pr file and update .compatibility
* fix object_to_tensor usage when torch>=2.3.0 (#5820)
* [misc] support torch2.3 (#5893)
* [misc] support torch2.3
* [devops] update compatibility ci
* [devops] update compatibility ci
* [devops] add debug
* [devops] add debug
* [devops] add debug
* [devops] add debug
* [devops] remove debug
* [devops] remove debug
* [release] update version (#5912)
* [plugin] support all-gather overlap for hybrid parallel (#5919)
* [plugin] fixed all-gather overlap support for hybrid parallel
* add kto
* fix style, add kto data sample
* [Examples] Add lazy init to OPT and GPT examples (#5924)
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [ColossalChat] Hotfix for ColossalChat (#5910)
* add ignore and tiny llama
* fix path issue
* run style
* fix issue
* update bash
* add ignore and tiny llama
* fix path issue
* run style
* fix issue
* update bash
* fix ddp issue
* add Qwen 1.5 32B
* refactor tokenization
* [FIX BUG] UnboundLocalError: cannot access local variable 'default_conversation' where it is not associated with a value (#5931)
* cannot access local variable 'default_conversation' where it is not associated with a value
set default value for 'default_conversation'
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* fix test data
* refactor evaluation
* remove real data path
* remove real data path
* Add n_fused as an input from native_module (#5894)
* [FIX BUG] convert env param to int in (#5934)
* [Hotfix] Fix ZeRO typo #5936
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [Feature] Add a switch to control whether the model checkpoint needs to be saved after each epoch ends (#5941)
* Add a switch to control whether the model checkpoint needs to be saved after each epoch ends
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* fix style
* fix style
* fix style
* [shardformer] hotfix attn mask (#5945)
* [shardformer] hotfix attn mask (#5947)
* [Feat] Distrifusion Acceleration Support for Diffusion Inference (#5895)
* Distrifusion Support source
* comp comm overlap optimization
* sd3 benchmark
* pixart distrifusion bug fix
* sd3 bug fix and benchmark
* generation bug fix
* naming fix
* add docstring, fix counter and shape error
* add reference
* readme and requirement
* [zero] hotfix update master params (#5951)
* [release] update version (#5952)
* [Chat] Fix lora (#5946)
* fix merging
* remove filepath
* fix style
* Update README.md (#5958)
* [hotfix] Remove unused plan section (#5957)
* remove readme
* fix readme
* update
* [test] add mixtral for sequence classification
* [test] add mixtral transformer test
* [moe] fix plugin
* [test] mixtra pp shard test
* [chore] handle non member group
* [zero] solve hang
* [test] pass mixtral shardformer test
* [moe] implement transit between non moe tp and ep
* [zero] solve hang
* [misc] solve booster hang by rename the variable
* solve hang when parallel mode = pp + dp
* [moe] implement submesh initialization
* [moe] add mixtral dp grad scaling when not all experts are activated
* [chore] manually revert unintended commit
* [chore] trivial fix
* [chore] arg pass & remove drop token
* [test] add mixtral modelling test
* [moe] implement tp
* [moe] test deepseek
* [moe] clean legacy code
* [Feature] MoE Ulysses Support (#5918)
* moe sp support
* moe sp bug solve
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [chore] minor fix
* [moe] init moe plugin comm setting with sp
* moe sp + ep bug fix
* [moe] finalize test (no pp)
* [moe] full test for deepseek and mixtral (pp + sp to fix)
* [chore] minor fix after rebase
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* [chore] solve moe ckpt test failure and some other arg pass failure
* [moe] remove ops
* [test] fix test: test_zero1_2
* [bug] fix: somehow logger hangs the program
* [moe] deepseek moe sp support
* [test] add check
* [deepseek] replace attn (a workaround for bug in transformers)
* [misc] skip redunant test
* [misc] remove debug/print code
* [moe] refactor mesh assignment
* Revert "[moe] implement submesh initialization"
This reverts commit 2f9bce6686d1415a83d5726dc5ff02222c742582.
* [chore] change moe_pg_mesh to private
* [misc] remove incompatible test config
* [misc] fix ci failure: change default value to false in moe plugin
* [misc] remove useless condition
* [chore] docstring
* [moe] remove force_overlap_comm flag and add warning instead
* [doc] add MoeHybridParallelPlugin docstring
* [moe] solve dp axis issue
* [chore] remove redundant test case, print string & reduce test tokens
* [feat] Dist Loader for Eval (#5950)
* support auto distributed data loader
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* support auto distributed data loader
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix tp error
* remove unused parameters
* remove unused
* update inference
* update docs
* update inference
---------
Co-authored-by: Michelle <qianranma8@gmail.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [lora] lora support hybrid parallel plugin (#5956)
* lora support hybrid plugin
* fix
* fix
* fix
* fix
* Support overall loss, update KTO logging
* [Docs] clarify launch port
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [Hotfix] README link (#5966)
* update ignore
* update readme
* run style
* update readme
* [Hotfix] Avoid fused RMSnorm import error without apex (#5985)
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [Chat] fix readme (#5989)
* fix readme
* fix readme, tokenization fully tested
* fix readme, tokenization fully tested
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: root <root@notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9-0.notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9.colossal-ai.svc.cluster.local>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* fix sync condition (#6000)
* [plugin] add cast inputs option for zero (#6003)
* [pre-commit.ci] pre-commit autoupdate (#5995)
updates:
- [github.com/psf/black-pre-commit-mirror: 24.4.2 → 24.8.0](https://github.com/psf/black-pre-commit-mirror/compare/24.4.2...24.8.0)
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [misc] Bypass the huggingface bug to solve the mask mismatch problem (#5991)
* [Feature] Zigzag Ring attention (#5905)
* halfway
* fix cross-PP-stage position id length diff bug
* fix typo
* fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* unified cross entropy func for all shardformer models
* remove redundant lines
* add basic ring attn; debug cross entropy
* fwd bwd logic complete
* fwd bwd logic complete; add experimental triton rescale
* precision tests passed
* precision tests passed
* fix typos and remove misc files
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* add sp_mode to benchmark; fix varlen interface
* update softmax_lse shape by new interface
* change tester name
* remove buffer clone; support packed seq layout
* add varlen tests
* fix typo
* all tests passed
* add dkv_group; fix mask
* remove debug statements
---------
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [misc] update compatibility (#6008)
* [misc] update compatibility
* [misc] update requirements
* [devops] disable requirements cache
* [test] fix torch ddp test
* [test] fix rerun on address in use
* [test] fix lazy init
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix the merge
* fix the merge
* overlap kv comm with output rescale (#6017)
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* fix the merge
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix the merge
* fix
* fix
* fix the merge
* fix
* [misc] Use dist logger in plugins (#6011)
* use dist logger in plugins
* remove trash
* print on rank 0
---------
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* fix
* fix
* fix
* fix
* fix the merge
* fix
* fix
* fix
* fix
---------
Co-authored-by: YeAnbang <anbangy2@outlook.com>
Co-authored-by: Haze188 <haze188@qq.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Edenzzzz <wenxuan.tan@wisc.edu>
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: Runyu Lu <77330637+LRY89757@users.noreply.github.com>
Co-authored-by: Guangyao Zhang <xjtu521@qq.com>
Co-authored-by: YeAnbang <44796419+YeAnbang@users.noreply.github.com>
Co-authored-by: Hongxin Liu <lhx0217@gmail.com>
Co-authored-by: Stephan Kö <stephankoe@users.noreply.github.com>
Co-authored-by: アマデウス <kurisusnowdeng@users.noreply.github.com>
Co-authored-by: Tong Li <tong.li352711588@gmail.com>
Co-authored-by: zhurunhua <1281592874@qq.com>
Co-authored-by: Insu Jang <insujang@umich.edu>
Co-authored-by: Gao, Ruiyuan <905370712@qq.com>
Co-authored-by: hxwang <wang1570@e.ntu.edu.sg>
Co-authored-by: Michelle <qianranma8@gmail.com>
Co-authored-by: root <root@notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9-0.notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9.colossal-ai.svc.cluster.local>
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* Update train_dpo.py
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* Update low_level_zero_plugin.py
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* [CI] Remove triton version for compatibility bug; update req torch >=2.2 (#6018)
* remove triton version
* remove torch 2.2
* remove torch 2.1
* debug
* remove 2.1 build tests
* require torch >=2.2
---------
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [plugin] hotfix zero plugin (#6036)
* [plugin] hotfix zero plugin
* [plugin] hotfix zero plugin
* [Colossal-LLaMA] Refactor latest APIs (#6030)
* refactor latest code
* update api
* add dummy dataset
* update Readme
* add setup
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* update files
* add PP support
* update arguments
* update argument
* reorg folder
* update version
* remove IB infor
* update utils
* update readme
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* update save for zero
* update save
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* add apex
* update
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* add fused norm (#6038)
* [FP8] unsqueeze scale to make it compatible with torch.compile (#6040)
* [colossalai/checkpoint_io/...] fix bug in load_state_dict_into_model; format error msg (#6020)
* fix bug in load_state_dict_into_model; format error msg
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* Update utils.py
to support checking missing_keys
* Update general_checkpoint_io.py
fix bug in missing_keys error message
* retrigger tests
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [Hotfix] Remove deprecated install (#6042)
* remove deprecated install
* remove unused folder
* [fp8] optimize all-gather (#6043)
* [fp8] optimize all-gather
* [fp8] fix all gather fp8 ring
* [fp8] enable compile
* [fp8] fix all gather fp8 ring
* [fp8] fix linear hook (#6046)
* [fp8] disable all_to_all_fp8 in intranode (#6045)
* enhance all_to_all_fp8 with internode comm control
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* disable some fp8 ops due to performance issue
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [release] update version (#6041)
* [release] update version
* [devops] update comp test
* [devops] update comp test debug
* [devops] debug comp test
* [devops] debug comp test
* [devops] debug comp test
* [devops] debug comp test
* [devops] debug comp test
* [Feature] Split cross-entropy computation in SP (#5959)
* halfway
* fix cross-PP-stage position id length diff bug
* fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* unified cross entropy func for all shardformer models
* remove redundant lines
* add basic ring attn; debug cross entropy
* fwd bwd logic complete
* fwd bwd logic complete; add experimental triton rescale
* precision tests passed
* precision tests passed
* fix typos and remove misc files
* update softmax_lse shape by new interface
* change tester name
* remove buffer clone; support packed seq layout
* add varlen tests
* fix typo
* all tests passed
* add dkv_group; fix mask
* remove debug statements
* adapt chatglm, command-R, qwen
* debug
* halfway
* fix cross-PP-stage position id length diff bug
* fix typo
* fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* unified cross entropy func for all shardformer models
* remove redundant lines
* add basic ring attn; debug cross entropy
* fwd bwd logic complete
* fwd bwd logic complete; add experimental triton rescale
* precision tests passed
* precision tests passed
* fix typos and remove misc files
* add sp_mode to benchmark; fix varlen interface
* update softmax_lse shape by new interface
* add varlen tests
* fix typo
* all tests passed
* add dkv_group; fix mask
* remove debug statements
* add comments
* q1 index only once
* remove events to simplify stream sync
* simplify forward/backward logic
* 2d ring forward passed
* 2d ring backward passed
* fixes
* fix ring attn loss
* 2D ring backward + llama passed
* merge
* update logger
* fix typo
* rebase
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix typo
* remove typos
* fixes
* support GPT
---------
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [hotfix] moe hybrid parallelism benchmark & follow-up fix (#6048)
* [example] pass use_fp8_comm flag to all plugins
* [example] add mixtral benchmark
* [moe] refine assertion and check
* [moe] fix mixtral & add more tests
* [moe] consider checking dp * sp group and moe_dp_group
* [mixtral] remove gate tp & add more tests
* [deepseek] fix tp & sp for deepseek
* [mixtral] minor fix
* [deepseek] add deepseek benchmark
* [fp8] hotfix backward hook (#6053)
* [fp8] hotfix backward hook
* [fp8] hotfix pipeline loss accumulation
* [doc] update sp doc (#6055)
* update sp doc
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* fix the sp
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix the attn
* fix
* fix
* fix
* fix
* [zerobubble]Support ZeroBubble Pipeline (#6034)
* [feat] add zerobubble pp (just a frame now); add POC test for dx_dw; add test for zerobubble;
* [feat] add dw test;
* [fix] fix weight not close;
* [update] update text;
* [feat] add test run_fwd_bwd automatic scheduling;
* [feat] split communication and calculation; fix pop empty send_bwd_buffer error;
* [feat] add test for p & p grad;
* [feat] add comments for ZBV func;
* [fix] rm useless assign and comments;
* [fix] fix ci test; add pytest;
* [feat] add run_fwd_bwd_with_microbatch (replace input) & test; add p&p.grad assert close test & all pass;
* [feat] add apply v_schedule graph; p & p.grad assert err exist;
* [fix] update
* [feat] fix ci; add assert;
* [feat] fix poc format
* [feat] fix func name & ci; add comments;
* [fix] fix poc test; add comments in poc;
* [feat] add optim backward_b_by_grad
* [feat] fix optimizer bwd b & w; support return accum loss & output
* [feat] add fwd_bwd_step, run_fwd_only;
* [fix] fix optim bwd; add license for v_schedule; remove redundant attributes; fix schedule loop "while"--> "for"; add communication dict;
* [fix] fix communication_map;
* [feat] update test; rm comments;
* [fix] rm zbv in hybridplugin
* [fix] fix optim bwd;
* [fix] fix optim bwd;
* [fix] rm output.data after send fwd;
* [fix] fix bwd step if condition; remove useless comments and format info;
* [fix] fix detach output & release output;
* [fix] rm requir_grad for output;
* [fix] fix requir grad position and detach position and input&output local buffer append position;
* [feat] add memory assertation;
* [fix] fix mem check;
* [fix] mem assertation'
* [fix] fix mem assertation
* [fix] fix mem; use a new model shape; only assert mem less and equal than theo;
* [fix] fix model zoo import;
* [fix] fix redundant detach & clone; add buffer assertation in the end;
* [fix] add output_obj_grad assert None at bwd b step; replace input_obj.require_grad_ with treemap;
* [fix] update optim state dict assert (include param group & state); fix mem assert after add optim;
* [fix] add testcase with microbatch 4;
* [fp8] fix missing fp8_comm flag in mixtral (#6057)
* fix
* fix
* fix
* [fp8] Disable all_gather intranode. Disable Redundant all_gather fp8 (#6059)
* all_gather only internode, fix pytest
* fix cuda arch <89 compile pytest error
* fix pytest failure
* disable all_gather_into_tensor_flat_fp8
* fix fp8 format
* fix pytest
* fix conversations
* fix chunk tuple to list
* [doc] FP8 training and communication document (#6050)
* Add FP8 training and communication document
* add fp8 docstring for plugins
* fix typo
* fix typo
* fix
* fix
* [moe] add parallel strategy for shared_expert && fix test for deepseek (#6063)
* [ColossalEval] support for vllm (#6056)
* support vllm
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* modify vllm and update readme
* run pre-commit
* remove dupilicated lines and refine code
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* update param name
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* refine code
* update readme
* refine code
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [release] update version (#6062)
* [feat] add zerobubble pp (just a frame now); add POC test for dx_dw; add test for zerobubble;
* [update] update text;
* [feat] add test run_fwd_bwd automatic scheduling;
* [feat] fix poc format
* [fix] fix poc test; add comments in poc;
* [feat] add optim backward_b_by_grad
* [feat] fix optimizer bwd b & w; support return accum loss & output
* [fix] fix optim bwd; add license for v_schedule; remove redundant attributes; fix schedule loop "while"--> "for"; add communication dict;
* [feat] update test; rm comments;
* [fix] rm zbv in hybridplugin
* [fix] fix optim bwd;
* [fix] fix optim bwd;
* [fix] rm output.data after send fwd;
* [fix] fix bwd step if condition; remove useless comments and format info;
* [fix] fix mem check;
* [fix] fix mem assertation
* [fix] fix mem; use a new model shape; only assert mem less and equal than theo;
* [fix] fix model zoo import;
* [feat] moehybrid support zerobubble;
* [fix] fix zerobubble pp for shardformer type input;
* [fix] fix require_grad & deallocate call;
* [fix] fix mem assert;
* [fix] fix fwd branch, fwd pass both micro_batch & internal_inputs'
* [fix] fix pipeline util func deallocate --> release_tensor_data; fix bwd_b loss bwd branch;
* [fix] fix zerobubble; support shardformer model type;
* [fix] fix test_pipeline_utils ci;
* [plugin] hybrid support zero bubble pipeline (#6060)
* hybrid support zbv
* fix
fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* Update zero_bubble_pp.py
* fix
* fix-ci
* fix
[pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* fix
* [zerobubble]Support ZeroBubble Pipeline (#6034)
* [feat] add zerobubble pp (just a frame now); add POC test for dx_dw; add test for zerobubble;
* [feat] add dw test;
* [fix] fix weight not close;
* [update] update text;
* [feat] add test run_fwd_bwd automatic scheduling;
* [feat] split communication and calculation; fix pop empty send_bwd_buffer error;
* [feat] add test for p & p grad;
* [feat] add comments for ZBV func;
* [fix] rm useless assign and comments;
* [fix] fix ci test; add pytest;
* [feat] add run_fwd_bwd_with_microbatch (replace input) & test; add p&p.grad assert close test & all pass;
* [feat] add apply v_schedule graph; p & p.grad assert err exist;
* [fix] update
* [feat] fix ci; add assert;
* [feat] fix poc format
* [feat] fix func name & ci; add comments;
* [fix] fix poc test; add comments in poc;
* [feat] add optim backward_b_by_grad
* [feat] fix optimizer bwd b & w; support return accum loss & output
* [feat] add fwd_bwd_step, run_fwd_only;
* [fix] fix optim bwd; add license for v_schedule; remove redundant attributes; fix schedule loop "while"--> "for"; add communication dict;
* [fix] fix communication_map;
* [feat] update test; rm comments;
* [fix] rm zbv in hybridplugin
* [fix] fix optim bwd;
* [fix] fix optim bwd;
* [fix] rm output.data after send fwd;
* [fix] fix bwd step if condition; remove useless comments and format info;
* [fix] fix detach output & release output;
* [fix] rm requir_grad for output;
* [fix] fix requir grad position and detach position and input&output local buffer append position;
* [feat] add memory assertation;
* [fix] fix mem check;
* [fix] mem assertation'
* [fix] fix mem assertation
* [fix] fix mem; use a new model shape; only assert mem less and equal than theo;
* [fix] fix model zoo import;
* [fix] fix redundant detach & clone; add buffer assertation in the end;
* [fix] add output_obj_grad assert None at bwd b step; replace input_obj.require_grad_ with treemap;
* [fix] update optim state dict assert (include param group & state); fix mem assert after add optim;
* [fix] add testcase with microbatch 4;
* hybrid support zbv
* fix
fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* Update zero_bubble_pp.py
* fix
* fix-ci
* fix
[pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* fix
* fix
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* fix
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: duanjunwen <935724073@qq.com>
* [feat] add zerobubble pp (just a frame now); add POC test for dx_dw; add test for zerobubble;
* [update] update text;
* [feat] add test run_fwd_bwd automatic scheduling;
* [feat] fix poc format
* [fix] fix poc test; add comments in poc;
* [feat] add optim backward_b_by_grad
* [feat] fix optimizer bwd b & w; support return accum loss & output
* [fix] fix optim bwd; add license for v_schedule; remove redundant attributes; fix schedule loop "while"--> "for"; add communication dict;
* [feat] update test; rm comments;
* [fix] fix optim bwd;
* [fix] fix optim bwd;
* [fix] rm output.data after send fwd;
* [fix] fix bwd step if condition; remove useless comments and format info;
* [fix] fix mem check;
* [fix] fix mem assertation
* [fix] fix mem; use a new model shape; only assert mem less and equal than theo;
* [fix] fix model zoo import;
* [fix] fix mem assert;
* [fix] fix fwd branch, fwd pass both micro_batch & internal_inputs'
* [plugin] hybrid support zero bubble pipeline (#6060)
* hybrid support zbv
* fix
fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* Update zero_bubble_pp.py
* fix
* fix-ci
* fix
[pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* fix
* [zerobubble]Support ZeroBubble Pipeline (#6034)
* [feat] add zerobubble pp (just a frame now); add POC test for dx_dw; add test for zerobubble;
* [feat] add dw test;
* [fix] fix weight not close;
* [update] update text;
* [feat] add test run_fwd_bwd automatic scheduling;
* [feat] split communication and calculation; fix pop empty send_bwd_buffer error;
* [feat] add test for p & p grad;
* [feat] add comments for ZBV func;
* [fix] rm useless assign and comments;
* [fix] fix ci test; add pytest;
* [feat] add run_fwd_bwd_with_microbatch (replace input) & test; add p&p.grad assert close test & all pass;
* [feat] add apply v_schedule graph; p & p.grad assert err exist;
* [fix] update
* [feat] fix ci; add assert;
* [feat] fix poc format
* [feat] fix func name & ci; add comments;
* [fix] fix poc test; add comments in poc;
* [feat] add optim backward_b_by_grad
* [feat] fix optimizer bwd b & w; support return accum loss & output
* [feat] add fwd_bwd_step, run_fwd_only;
* [fix] fix optim bwd; add license for v_schedule; remove redundant attributes; fix schedule loop "while"--> "for"; add communication dict;
* [fix] fix communication_map;
* [feat] update test; rm comments;
* [fix] rm zbv in hybridplugin
* [fix] fix optim bwd;
* [fix] fix optim bwd;
* [fix] rm output.data after send fwd;
* [fix] fix bwd step if condition; remove useless comments and format info;
* [fix] fix detach output & release output;
* [fix] rm requir_grad for output;
* [fix] fix requir grad position and detach position and input&output local buffer append position;
* [feat] add memory assertation;
* [fix] fix mem check;
* [fix] mem assertation'
* [fix] fix mem assertation
* [fix] fix mem; use a new model shape; only assert mem less and equal than theo;
* [fix] fix model zoo import;
* [fix] fix redundant detach & clone; add buffer assertation in the end;
* [fix] add output_obj_grad assert None at bwd b step; replace input_obj.require_grad_ with treemap;
* [fix] update optim state dict assert (include param group & state); fix mem assert after add optim;
* [fix] add testcase with microbatch 4;
* hybrid support zbv
* fix
fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* Update zero_bubble_pp.py
* fix
* fix-ci
* fix
[pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* fix
* fix
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* fix
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: duanjunwen <935724073@qq.com>
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: HangXu <hangxu0304@gmail.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: GuangyaoZhang <xjtu521@qq.com>
Co-authored-by: Hongxin Liu <lhx0217@gmail.com>
Co-authored-by: YeAnbang <anbangy2@outlook.com>
Co-authored-by: Haze188 <haze188@qq.com>
Co-authored-by: Edenzzzz <wenxuan.tan@wisc.edu>
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: Runyu Lu <77330637+LRY89757@users.noreply.github.com>
Co-authored-by: YeAnbang <44796419+YeAnbang@users.noreply.github.com>
Co-authored-by: Stephan Kö <stephankoe@users.noreply.github.com>
Co-authored-by: アマデウス <kurisusnowdeng@users.noreply.github.com>
Co-authored-by: Tong Li <tong.li352711588@gmail.com>
Co-authored-by: zhurunhua <1281592874@qq.com>
Co-authored-by: Insu Jang <insujang@umich.edu>
Co-authored-by: Gao, Ruiyuan <905370712@qq.com>
Co-authored-by: hxwang <wang1570@e.ntu.edu.sg>
Co-authored-by: Michelle <qianranma8@gmail.com>
Co-authored-by: Wang Binluo <32676639+wangbluo@users.noreply.github.com>
Co-authored-by: wangbluo <2538539015@qq.com>
Co-authored-by: root <root@notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9-0.notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9.colossal-ai.svc.cluster.local>
Co-authored-by: duanjunwen <935724073@qq.com>
Co-authored-by: Camille Zhong <44392324+Camille7777@users.noreply.github.com>
2024-10-08 07:58:00 +00:00
|
|
|
# ChatGLM doesn't have lm_head split
|
|
|
|
enable_tp = shard_config.enable_tensor_parallelism
|
|
|
|
shard_config.enable_tensor_parallelism = False
|
|
|
|
loss = dist_cross_entropy(
|
|
|
|
labels,
|
|
|
|
lm_logits,
|
|
|
|
shard_config,
|
|
|
|
self.transformer.output_layer.out_features,
|
|
|
|
lm_logits.dtype,
|
|
|
|
)
|
|
|
|
shard_config.enable_tensor_parallelism = enable_tp
|
|
|
|
|
2023-08-04 06:55:31 +00:00
|
|
|
if not return_dict:
|
|
|
|
output = (lm_logits,) + transformer_outputs[1:]
|
|
|
|
return ((loss,) + output) if loss is not None else output
|
|
|
|
return CausalLMOutputWithPast(
|
|
|
|
loss=loss,
|
|
|
|
logits=lm_logits,
|
|
|
|
past_key_values=transformer_outputs.past_key_values,
|
|
|
|
hidden_states=transformer_outputs.hidden_states,
|
|
|
|
attentions=transformer_outputs.attentions,
|
|
|
|
)
|
|
|
|
else:
|
|
|
|
return transformer_outputs
|
2023-08-22 15:59:31 +00:00
|
|
|
|
|
|
|
|
2024-07-10 03:34:25 +00:00
|
|
|
def get_chatglm_sequence_parallel_forward_fn(shard_config: ShardConfig, sp_mode, sp_size, sp_group):
|
|
|
|
logger = logging.get_logger(__name__)
|
|
|
|
|
2023-08-22 15:59:31 +00:00
|
|
|
def forward(
|
|
|
|
self,
|
|
|
|
input_ids,
|
|
|
|
position_ids: Optional[torch.Tensor] = None,
|
|
|
|
attention_mask: Optional[torch.BoolTensor] = None,
|
|
|
|
full_attention_mask: Optional[torch.BoolTensor] = None,
|
|
|
|
past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None,
|
|
|
|
inputs_embeds: Optional[torch.Tensor] = None,
|
|
|
|
use_cache: Optional[bool] = None,
|
|
|
|
output_hidden_states: Optional[bool] = None,
|
|
|
|
return_dict: Optional[bool] = None,
|
[zerobubble] rebase main (#6075)
* fp8 operators for compressed communication
cast_to_fp8, cast_from_fp8, all_reduce_fp8
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix typo
* fix scaling algorithm in FP8 casting
* support fp8 communication in pipeline parallelism
* add fp8_communication flag in the script
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* shardformer fp8
* fix rebase
* remove all to all
* fix shardformer fp8 communication training degradation
* [fp8] support all-gather flat tensor (#5932)
* [fp8] add fp8 comm for low level zero
* [test] add zero fp8 test case
* [Feature] llama shardformer fp8 support (#5938)
* add llama shardformer fp8
* Llama Shardformer Parity
* fix typo
* fix all reduce
* fix pytest failure
* fix reduce op and move function to fp8.py
* fix typo
* [FP8] rebase main (#5963)
* add SimPO
* fix dataloader
* remove debug code
* add orpo
* fix style
* fix colossalai, transformers version
* fix colossalai, transformers version
* fix colossalai, transformers version
* fix torch colossalai version
* update transformers version
* [shardformer] DeepseekMoE support (#5871)
* [Feature] deepseek moe expert parallel implement
* [misc] fix typo, remove redundant file (#5867)
* [misc] fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [Feature] deepseek support & unit test
* [misc] remove debug code & useless print
* [misc] fix typos (#5872)
* [Feature] remove modeling file, use auto config. (#5884)
* [misc] fix typos
* [Feature] deepseek support via auto model, remove modeling file
* [misc] delete useless file
* [misc] fix typos
* [Deepseek] remove redundant code (#5888)
* [misc] fix typos
* [Feature] deepseek support via auto model, remove modeling file
* [misc] delete useless file
* [misc] fix typos
* [misc] remove redundant code
* [Feature/deepseek] resolve comment. (#5889)
* [misc] fix typos
* [Feature] deepseek support via auto model, remove modeling file
* [misc] delete useless file
* [misc] fix typos
* [misc] remove redundant code
* [misc] mv module replacement into if branch
* [misc] add some warning message and modify some code in unit test
* [misc] fix typos
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [Hoxfix] Fix CUDA_DEVICE_MAX_CONNECTIONS for comm overlap
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [Feat] Diffusion Model(PixArtAlpha/StableDiffusion3) Support (#5838)
* Diffusion Model Inference support
* Stable Diffusion 3 Support
* pixartalpha support
* [HotFix] CI,import,requirements-test for #5838 (#5892)
* [Hot Fix] CI,import,requirements-test
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [Feature] Enable PP + SP for llama (#5868)
* fix cross-PP-stage position id length diff bug
* fix typo
* fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* use a one cross entropy func for all shardformer models
---------
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [ShardFormer] Add Ulysses Sequence Parallelism support for Command-R, Qwen2 and ChatGLM (#5897)
* add benchmark for sft, dpo, simpo, orpo. Add benchmarking result. Support lora with gradient checkpoint
* fix style
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix eval
* hotfix citation
* [zero] support all-gather overlap (#5898)
* [zero] support all-gather overlap
* [zero] add overlap all-gather flag
* [misc] fix typo
* [zero] update api
* fix orpo cross entropy loss
* [Auto Parallel]: Speed up intra-op plan generation by 44% (#5446)
* Remove unnecessary calls to deepcopy
* Build DimSpec's difference dict only once
This change considerably speeds up construction speed of DimSpec objects. The difference_dict is the same for each DimSpec object, so a single copy of it is enough.
* Fix documentation of DimSpec's difference method
* [ShardFormer] fix qwen2 sp (#5903)
* [compatibility] support torch 2.2 (#5875)
* Support Pytorch 2.2.2
* keep build_on_pr file and update .compatibility
* fix object_to_tensor usage when torch>=2.3.0 (#5820)
* [misc] support torch2.3 (#5893)
* [misc] support torch2.3
* [devops] update compatibility ci
* [devops] update compatibility ci
* [devops] add debug
* [devops] add debug
* [devops] add debug
* [devops] add debug
* [devops] remove debug
* [devops] remove debug
* [release] update version (#5912)
* [plugin] support all-gather overlap for hybrid parallel (#5919)
* [plugin] fixed all-gather overlap support for hybrid parallel
* add kto
* fix style, add kto data sample
* [Examples] Add lazy init to OPT and GPT examples (#5924)
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [ColossalChat] Hotfix for ColossalChat (#5910)
* add ignore and tiny llama
* fix path issue
* run style
* fix issue
* update bash
* add ignore and tiny llama
* fix path issue
* run style
* fix issue
* update bash
* fix ddp issue
* add Qwen 1.5 32B
* refactor tokenization
* [FIX BUG] UnboundLocalError: cannot access local variable 'default_conversation' where it is not associated with a value (#5931)
* cannot access local variable 'default_conversation' where it is not associated with a value
set default value for 'default_conversation'
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* fix test data
* refactor evaluation
* remove real data path
* remove real data path
* Add n_fused as an input from native_module (#5894)
* [FIX BUG] convert env param to int in (#5934)
* [Hotfix] Fix ZeRO typo #5936
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [Feature] Add a switch to control whether the model checkpoint needs to be saved after each epoch ends (#5941)
* Add a switch to control whether the model checkpoint needs to be saved after each epoch ends
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* fix style
* fix style
* fix style
* [shardformer] hotfix attn mask (#5945)
* [shardformer] hotfix attn mask (#5947)
* [Feat] Distrifusion Acceleration Support for Diffusion Inference (#5895)
* Distrifusion Support source
* comp comm overlap optimization
* sd3 benchmark
* pixart distrifusion bug fix
* sd3 bug fix and benchmark
* generation bug fix
* naming fix
* add docstring, fix counter and shape error
* add reference
* readme and requirement
* [zero] hotfix update master params (#5951)
* [release] update version (#5952)
* [Chat] Fix lora (#5946)
* fix merging
* remove filepath
* fix style
* Update README.md (#5958)
* [hotfix] Remove unused plan section (#5957)
* remove readme
* fix readme
* update
* [test] add mixtral for sequence classification
* [test] add mixtral transformer test
* [moe] fix plugin
* [test] mixtra pp shard test
* [chore] handle non member group
* [zero] solve hang
* [test] pass mixtral shardformer test
* [moe] implement transit between non moe tp and ep
* [zero] solve hang
* [misc] solve booster hang by rename the variable
* solve hang when parallel mode = pp + dp
* [moe] implement submesh initialization
* [moe] add mixtral dp grad scaling when not all experts are activated
* [chore] manually revert unintended commit
* [chore] trivial fix
* [chore] arg pass & remove drop token
* [test] add mixtral modelling test
* [moe] implement tp
* [moe] test deepseek
* [moe] clean legacy code
* [Feature] MoE Ulysses Support (#5918)
* moe sp support
* moe sp bug solve
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [chore] minor fix
* [moe] init moe plugin comm setting with sp
* moe sp + ep bug fix
* [moe] finalize test (no pp)
* [moe] full test for deepseek and mixtral (pp + sp to fix)
* [chore] minor fix after rebase
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* [chore] solve moe ckpt test failure and some other arg pass failure
* [moe] remove ops
* [test] fix test: test_zero1_2
* [bug] fix: somehow logger hangs the program
* [moe] deepseek moe sp support
* [test] add check
* [deepseek] replace attn (a workaround for bug in transformers)
* [misc] skip redunant test
* [misc] remove debug/print code
* [moe] refactor mesh assignment
* Revert "[moe] implement submesh initialization"
This reverts commit 2f9bce6686d1415a83d5726dc5ff02222c742582.
* [chore] change moe_pg_mesh to private
* [misc] remove incompatible test config
* [misc] fix ci failure: change default value to false in moe plugin
* [misc] remove useless condition
* [chore] docstring
* [moe] remove force_overlap_comm flag and add warning instead
* [doc] add MoeHybridParallelPlugin docstring
* [moe] solve dp axis issue
* [chore] remove redundant test case, print string & reduce test tokens
* [feat] Dist Loader for Eval (#5950)
* support auto distributed data loader
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* support auto distributed data loader
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix tp error
* remove unused parameters
* remove unused
* update inference
* update docs
* update inference
---------
Co-authored-by: Michelle <qianranma8@gmail.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [lora] lora support hybrid parallel plugin (#5956)
* lora support hybrid plugin
* fix
* fix
* fix
* fix
* fp8 operators for compressed communication
cast_to_fp8, cast_from_fp8, all_reduce_fp8
* fix scaling algorithm in FP8 casting
* support fp8 communication in pipeline parallelism
* add fp8_communication flag in the script
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* shardformer fp8
* fix rebase
* remove all to all
* fix shardformer fp8 communication training degradation
* [fp8] support all-gather flat tensor (#5932)
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* Update low_level_optim.py
---------
Co-authored-by: YeAnbang <anbangy2@outlook.com>
Co-authored-by: Haze188 <haze188@qq.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Edenzzzz <wenxuan.tan@wisc.edu>
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: Runyu Lu <77330637+LRY89757@users.noreply.github.com>
Co-authored-by: Guangyao Zhang <xjtu521@qq.com>
Co-authored-by: YeAnbang <44796419+YeAnbang@users.noreply.github.com>
Co-authored-by: Hongxin Liu <lhx0217@gmail.com>
Co-authored-by: Stephan Kö <stephankoe@users.noreply.github.com>
Co-authored-by: アマデウス <kurisusnowdeng@users.noreply.github.com>
Co-authored-by: Tong Li <tong.li352711588@gmail.com>
Co-authored-by: zhurunhua <1281592874@qq.com>
Co-authored-by: Insu Jang <insujang@umich.edu>
Co-authored-by: Gao, Ruiyuan <905370712@qq.com>
Co-authored-by: hxwang <wang1570@e.ntu.edu.sg>
Co-authored-by: Michelle <qianranma8@gmail.com>
Co-authored-by: Wang Binluo <32676639+wangbluo@users.noreply.github.com>
Co-authored-by: HangXu <hangxu0304@gmail.com>
* [fp8]support all2all fp8 (#5953)
* support all2all fp8
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [fp8] add fp8 linear (#5967)
* [fp8] add fp8 linear
* [test] fix fp8 linear test condition
* [test] fix fp8 linear test condition
* [test] fix fp8 linear test condition
* [fp8] support fp8 amp for hybrid parallel plugin (#5975)
* [fp8] support fp8 amp for hybrid parallel plugin
* [test] add fp8 hook test
* [fp8] fix fp8 linear compatibility
* fix (#5976)
* [Feature]: support FP8 communication in DDP, FSDP, Gemini (#5928)
* support fp8_communication in the Torch DDP grad comm, FSDP grad comm, and FSDP params comm
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* implement communication hook for FSDP params all-gather
* added unit test for fp8 operators
* support fp8 communication in GeminiPlugin
* update training scripts to support fsdp and fp8 communication
* fixed some minor bugs observed in unit test
* add all_gather_into_tensor_flat_fp8
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* add skip the test if torch < 2.2.0
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* add skip the test if torch < 2.2.0
* add skip the test if torch < 2.2.0
* add fp8_comm flag
* rebase latest fp8 operators
* rebase latest fp8 operators
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [test ci]Feature/fp8 comm (#5981)
* fix
* fix
* fix
* [fp8] support gemini plugin (#5978)
* [fp8] refactor hook
* [fp8] support gemini plugin
* [example] add fp8 option for llama benchmark
* [fp8] use torch compile (torch >= 2.3.0) (#5979)
* [fp8] use torch compile (torch >= 2.4.0)
* [fp8] set use_fast_accum in linear
* [chore] formal version check
* [chore] fix sig
* [fp8]Moe support fp8 communication (#5977)
* fix
* support moe fp8
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
fix
fi
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [fp8] support hybrid parallel plugin (#5982)
* support fp8 comm for qwen2 model
* support fp8 comm for qwen2 model
* support fp8 comm for qwen2 model
* fp8
* fix
* bert and bloom
* chatglm and command
* gpt2,gptj,bert, falcon,blip2
* mistral,opy,sam,t5,vit,whisper
* fix
* fix
* fix
* [fp8] refactor fp8 linear with compile (#5993)
* [fp8] refactor fp8 linear with compile
* [fp8] fix linear test
* [fp8] fix linear test
* [fp8] support asynchronous FP8 communication (#5997)
* fix
* fix
* fix
* support async all2all
* support async op for all gather
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [fp8] update torch.compile for linear_fp8 to >= 2.4.0 (#6004)
* [fp8] linear perf enhancement
* [fp8]update reduce-scatter test (#6002)
* fix
* fix
* fix
* fix
* [fp8] add use_fp8 option for MoeHybridParallelPlugin (#6009)
* [fp8] zero support fp8 linear. (#6006)
* fix
* fix
* fix
* zero fp8
* zero fp8
* Update requirements.txt
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix the merge
* fix the merge
* fix the merge
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix the merge
* fix
* fix
* fix the merge
* fix
* fix
* fix
* fix
* fix
* fix the merge
* fix
* fix
* fix
* fix
* [fp8] Merge feature/fp8_comm to main branch of Colossalai (#6016)
* add SimPO
* fix dataloader
* remove debug code
* add orpo
* fix style
* fix colossalai, transformers version
* fix colossalai, transformers version
* fix colossalai, transformers version
* fix torch colossalai version
* update transformers version
* [shardformer] DeepseekMoE support (#5871)
* [Feature] deepseek moe expert parallel implement
* [misc] fix typo, remove redundant file (#5867)
* [misc] fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [Feature] deepseek support & unit test
* [misc] remove debug code & useless print
* [misc] fix typos (#5872)
* [Feature] remove modeling file, use auto config. (#5884)
* [misc] fix typos
* [Feature] deepseek support via auto model, remove modeling file
* [misc] delete useless file
* [misc] fix typos
* [Deepseek] remove redundant code (#5888)
* [misc] fix typos
* [Feature] deepseek support via auto model, remove modeling file
* [misc] delete useless file
* [misc] fix typos
* [misc] remove redundant code
* [Feature/deepseek] resolve comment. (#5889)
* [misc] fix typos
* [Feature] deepseek support via auto model, remove modeling file
* [misc] delete useless file
* [misc] fix typos
* [misc] remove redundant code
* [misc] mv module replacement into if branch
* [misc] add some warning message and modify some code in unit test
* [misc] fix typos
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [Hoxfix] Fix CUDA_DEVICE_MAX_CONNECTIONS for comm overlap
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [Feat] Diffusion Model(PixArtAlpha/StableDiffusion3) Support (#5838)
* Diffusion Model Inference support
* Stable Diffusion 3 Support
* pixartalpha support
* [HotFix] CI,import,requirements-test for #5838 (#5892)
* [Hot Fix] CI,import,requirements-test
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [Feature] Enable PP + SP for llama (#5868)
* fix cross-PP-stage position id length diff bug
* fix typo
* fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* use a one cross entropy func for all shardformer models
---------
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [ShardFormer] Add Ulysses Sequence Parallelism support for Command-R, Qwen2 and ChatGLM (#5897)
* add benchmark for sft, dpo, simpo, orpo. Add benchmarking result. Support lora with gradient checkpoint
* fix style
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix eval
* hotfix citation
* [zero] support all-gather overlap (#5898)
* [zero] support all-gather overlap
* [zero] add overlap all-gather flag
* [misc] fix typo
* [zero] update api
* fix orpo cross entropy loss
* [Auto Parallel]: Speed up intra-op plan generation by 44% (#5446)
* Remove unnecessary calls to deepcopy
* Build DimSpec's difference dict only once
This change considerably speeds up construction speed of DimSpec objects. The difference_dict is the same for each DimSpec object, so a single copy of it is enough.
* Fix documentation of DimSpec's difference method
* [ShardFormer] fix qwen2 sp (#5903)
* [compatibility] support torch 2.2 (#5875)
* Support Pytorch 2.2.2
* keep build_on_pr file and update .compatibility
* fix object_to_tensor usage when torch>=2.3.0 (#5820)
* [misc] support torch2.3 (#5893)
* [misc] support torch2.3
* [devops] update compatibility ci
* [devops] update compatibility ci
* [devops] add debug
* [devops] add debug
* [devops] add debug
* [devops] add debug
* [devops] remove debug
* [devops] remove debug
* [release] update version (#5912)
* [plugin] support all-gather overlap for hybrid parallel (#5919)
* [plugin] fixed all-gather overlap support for hybrid parallel
* add kto
* fix style, add kto data sample
* [Examples] Add lazy init to OPT and GPT examples (#5924)
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [ColossalChat] Hotfix for ColossalChat (#5910)
* add ignore and tiny llama
* fix path issue
* run style
* fix issue
* update bash
* add ignore and tiny llama
* fix path issue
* run style
* fix issue
* update bash
* fix ddp issue
* add Qwen 1.5 32B
* refactor tokenization
* [FIX BUG] UnboundLocalError: cannot access local variable 'default_conversation' where it is not associated with a value (#5931)
* cannot access local variable 'default_conversation' where it is not associated with a value
set default value for 'default_conversation'
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* fix test data
* refactor evaluation
* remove real data path
* remove real data path
* Add n_fused as an input from native_module (#5894)
* [FIX BUG] convert env param to int in (#5934)
* [Hotfix] Fix ZeRO typo #5936
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [Feature] Add a switch to control whether the model checkpoint needs to be saved after each epoch ends (#5941)
* Add a switch to control whether the model checkpoint needs to be saved after each epoch ends
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* fix style
* fix style
* fix style
* [shardformer] hotfix attn mask (#5945)
* [shardformer] hotfix attn mask (#5947)
* [Feat] Distrifusion Acceleration Support for Diffusion Inference (#5895)
* Distrifusion Support source
* comp comm overlap optimization
* sd3 benchmark
* pixart distrifusion bug fix
* sd3 bug fix and benchmark
* generation bug fix
* naming fix
* add docstring, fix counter and shape error
* add reference
* readme and requirement
* [zero] hotfix update master params (#5951)
* [release] update version (#5952)
* [Chat] Fix lora (#5946)
* fix merging
* remove filepath
* fix style
* Update README.md (#5958)
* [hotfix] Remove unused plan section (#5957)
* remove readme
* fix readme
* update
* [test] add mixtral for sequence classification
* [test] add mixtral transformer test
* [moe] fix plugin
* [test] mixtra pp shard test
* [chore] handle non member group
* [zero] solve hang
* [test] pass mixtral shardformer test
* [moe] implement transit between non moe tp and ep
* [zero] solve hang
* [misc] solve booster hang by rename the variable
* solve hang when parallel mode = pp + dp
* [moe] implement submesh initialization
* [moe] add mixtral dp grad scaling when not all experts are activated
* [chore] manually revert unintended commit
* [chore] trivial fix
* [chore] arg pass & remove drop token
* [test] add mixtral modelling test
* [moe] implement tp
* [moe] test deepseek
* [moe] clean legacy code
* [Feature] MoE Ulysses Support (#5918)
* moe sp support
* moe sp bug solve
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [chore] minor fix
* [moe] init moe plugin comm setting with sp
* moe sp + ep bug fix
* [moe] finalize test (no pp)
* [moe] full test for deepseek and mixtral (pp + sp to fix)
* [chore] minor fix after rebase
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* [chore] solve moe ckpt test failure and some other arg pass failure
* [moe] remove ops
* [test] fix test: test_zero1_2
* [bug] fix: somehow logger hangs the program
* [moe] deepseek moe sp support
* [test] add check
* [deepseek] replace attn (a workaround for bug in transformers)
* [misc] skip redunant test
* [misc] remove debug/print code
* [moe] refactor mesh assignment
* Revert "[moe] implement submesh initialization"
This reverts commit 2f9bce6686d1415a83d5726dc5ff02222c742582.
* [chore] change moe_pg_mesh to private
* [misc] remove incompatible test config
* [misc] fix ci failure: change default value to false in moe plugin
* [misc] remove useless condition
* [chore] docstring
* [moe] remove force_overlap_comm flag and add warning instead
* [doc] add MoeHybridParallelPlugin docstring
* [moe] solve dp axis issue
* [chore] remove redundant test case, print string & reduce test tokens
* [feat] Dist Loader for Eval (#5950)
* support auto distributed data loader
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* support auto distributed data loader
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix tp error
* remove unused parameters
* remove unused
* update inference
* update docs
* update inference
---------
Co-authored-by: Michelle <qianranma8@gmail.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [lora] lora support hybrid parallel plugin (#5956)
* lora support hybrid plugin
* fix
* fix
* fix
* fix
* Support overall loss, update KTO logging
* [Docs] clarify launch port
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [Hotfix] README link (#5966)
* update ignore
* update readme
* run style
* update readme
* [Hotfix] Avoid fused RMSnorm import error without apex (#5985)
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [Chat] fix readme (#5989)
* fix readme
* fix readme, tokenization fully tested
* fix readme, tokenization fully tested
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: root <root@notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9-0.notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9.colossal-ai.svc.cluster.local>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* fix sync condition (#6000)
* [plugin] add cast inputs option for zero (#6003)
* [pre-commit.ci] pre-commit autoupdate (#5995)
updates:
- [github.com/psf/black-pre-commit-mirror: 24.4.2 → 24.8.0](https://github.com/psf/black-pre-commit-mirror/compare/24.4.2...24.8.0)
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [misc] Bypass the huggingface bug to solve the mask mismatch problem (#5991)
* [Feature] Zigzag Ring attention (#5905)
* halfway
* fix cross-PP-stage position id length diff bug
* fix typo
* fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* unified cross entropy func for all shardformer models
* remove redundant lines
* add basic ring attn; debug cross entropy
* fwd bwd logic complete
* fwd bwd logic complete; add experimental triton rescale
* precision tests passed
* precision tests passed
* fix typos and remove misc files
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* add sp_mode to benchmark; fix varlen interface
* update softmax_lse shape by new interface
* change tester name
* remove buffer clone; support packed seq layout
* add varlen tests
* fix typo
* all tests passed
* add dkv_group; fix mask
* remove debug statements
---------
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [misc] update compatibility (#6008)
* [misc] update compatibility
* [misc] update requirements
* [devops] disable requirements cache
* [test] fix torch ddp test
* [test] fix rerun on address in use
* [test] fix lazy init
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix the merge
* fix the merge
* overlap kv comm with output rescale (#6017)
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* fix the merge
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix the merge
* fix
* fix
* fix the merge
* fix
* [misc] Use dist logger in plugins (#6011)
* use dist logger in plugins
* remove trash
* print on rank 0
---------
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* fix
* fix
* fix
* fix
* fix the merge
* fix
* fix
* fix
* fix
---------
Co-authored-by: YeAnbang <anbangy2@outlook.com>
Co-authored-by: Haze188 <haze188@qq.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Edenzzzz <wenxuan.tan@wisc.edu>
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: Runyu Lu <77330637+LRY89757@users.noreply.github.com>
Co-authored-by: Guangyao Zhang <xjtu521@qq.com>
Co-authored-by: YeAnbang <44796419+YeAnbang@users.noreply.github.com>
Co-authored-by: Hongxin Liu <lhx0217@gmail.com>
Co-authored-by: Stephan Kö <stephankoe@users.noreply.github.com>
Co-authored-by: アマデウス <kurisusnowdeng@users.noreply.github.com>
Co-authored-by: Tong Li <tong.li352711588@gmail.com>
Co-authored-by: zhurunhua <1281592874@qq.com>
Co-authored-by: Insu Jang <insujang@umich.edu>
Co-authored-by: Gao, Ruiyuan <905370712@qq.com>
Co-authored-by: hxwang <wang1570@e.ntu.edu.sg>
Co-authored-by: Michelle <qianranma8@gmail.com>
Co-authored-by: root <root@notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9-0.notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9.colossal-ai.svc.cluster.local>
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* Update train_dpo.py
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* Update low_level_zero_plugin.py
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* [CI] Remove triton version for compatibility bug; update req torch >=2.2 (#6018)
* remove triton version
* remove torch 2.2
* remove torch 2.1
* debug
* remove 2.1 build tests
* require torch >=2.2
---------
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [plugin] hotfix zero plugin (#6036)
* [plugin] hotfix zero plugin
* [plugin] hotfix zero plugin
* [Colossal-LLaMA] Refactor latest APIs (#6030)
* refactor latest code
* update api
* add dummy dataset
* update Readme
* add setup
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* update files
* add PP support
* update arguments
* update argument
* reorg folder
* update version
* remove IB infor
* update utils
* update readme
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* update save for zero
* update save
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* add apex
* update
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* add fused norm (#6038)
* [FP8] unsqueeze scale to make it compatible with torch.compile (#6040)
* [colossalai/checkpoint_io/...] fix bug in load_state_dict_into_model; format error msg (#6020)
* fix bug in load_state_dict_into_model; format error msg
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* Update utils.py
to support checking missing_keys
* Update general_checkpoint_io.py
fix bug in missing_keys error message
* retrigger tests
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [Hotfix] Remove deprecated install (#6042)
* remove deprecated install
* remove unused folder
* [fp8] optimize all-gather (#6043)
* [fp8] optimize all-gather
* [fp8] fix all gather fp8 ring
* [fp8] enable compile
* [fp8] fix all gather fp8 ring
* [fp8] fix linear hook (#6046)
* [fp8] disable all_to_all_fp8 in intranode (#6045)
* enhance all_to_all_fp8 with internode comm control
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* disable some fp8 ops due to performance issue
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [release] update version (#6041)
* [release] update version
* [devops] update comp test
* [devops] update comp test debug
* [devops] debug comp test
* [devops] debug comp test
* [devops] debug comp test
* [devops] debug comp test
* [devops] debug comp test
* [Feature] Split cross-entropy computation in SP (#5959)
* halfway
* fix cross-PP-stage position id length diff bug
* fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* unified cross entropy func for all shardformer models
* remove redundant lines
* add basic ring attn; debug cross entropy
* fwd bwd logic complete
* fwd bwd logic complete; add experimental triton rescale
* precision tests passed
* precision tests passed
* fix typos and remove misc files
* update softmax_lse shape by new interface
* change tester name
* remove buffer clone; support packed seq layout
* add varlen tests
* fix typo
* all tests passed
* add dkv_group; fix mask
* remove debug statements
* adapt chatglm, command-R, qwen
* debug
* halfway
* fix cross-PP-stage position id length diff bug
* fix typo
* fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* unified cross entropy func for all shardformer models
* remove redundant lines
* add basic ring attn; debug cross entropy
* fwd bwd logic complete
* fwd bwd logic complete; add experimental triton rescale
* precision tests passed
* precision tests passed
* fix typos and remove misc files
* add sp_mode to benchmark; fix varlen interface
* update softmax_lse shape by new interface
* add varlen tests
* fix typo
* all tests passed
* add dkv_group; fix mask
* remove debug statements
* add comments
* q1 index only once
* remove events to simplify stream sync
* simplify forward/backward logic
* 2d ring forward passed
* 2d ring backward passed
* fixes
* fix ring attn loss
* 2D ring backward + llama passed
* merge
* update logger
* fix typo
* rebase
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix typo
* remove typos
* fixes
* support GPT
---------
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [hotfix] moe hybrid parallelism benchmark & follow-up fix (#6048)
* [example] pass use_fp8_comm flag to all plugins
* [example] add mixtral benchmark
* [moe] refine assertion and check
* [moe] fix mixtral & add more tests
* [moe] consider checking dp * sp group and moe_dp_group
* [mixtral] remove gate tp & add more tests
* [deepseek] fix tp & sp for deepseek
* [mixtral] minor fix
* [deepseek] add deepseek benchmark
* [fp8] hotfix backward hook (#6053)
* [fp8] hotfix backward hook
* [fp8] hotfix pipeline loss accumulation
* [doc] update sp doc (#6055)
* update sp doc
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* fix the sp
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix the attn
* fix
* fix
* fix
* fix
* [zerobubble]Support ZeroBubble Pipeline (#6034)
* [feat] add zerobubble pp (just a frame now); add POC test for dx_dw; add test for zerobubble;
* [feat] add dw test;
* [fix] fix weight not close;
* [update] update text;
* [feat] add test run_fwd_bwd automatic scheduling;
* [feat] split communication and calculation; fix pop empty send_bwd_buffer error;
* [feat] add test for p & p grad;
* [feat] add comments for ZBV func;
* [fix] rm useless assign and comments;
* [fix] fix ci test; add pytest;
* [feat] add run_fwd_bwd_with_microbatch (replace input) & test; add p&p.grad assert close test & all pass;
* [feat] add apply v_schedule graph; p & p.grad assert err exist;
* [fix] update
* [feat] fix ci; add assert;
* [feat] fix poc format
* [feat] fix func name & ci; add comments;
* [fix] fix poc test; add comments in poc;
* [feat] add optim backward_b_by_grad
* [feat] fix optimizer bwd b & w; support return accum loss & output
* [feat] add fwd_bwd_step, run_fwd_only;
* [fix] fix optim bwd; add license for v_schedule; remove redundant attributes; fix schedule loop "while"--> "for"; add communication dict;
* [fix] fix communication_map;
* [feat] update test; rm comments;
* [fix] rm zbv in hybridplugin
* [fix] fix optim bwd;
* [fix] fix optim bwd;
* [fix] rm output.data after send fwd;
* [fix] fix bwd step if condition; remove useless comments and format info;
* [fix] fix detach output & release output;
* [fix] rm requir_grad for output;
* [fix] fix requir grad position and detach position and input&output local buffer append position;
* [feat] add memory assertation;
* [fix] fix mem check;
* [fix] mem assertation'
* [fix] fix mem assertation
* [fix] fix mem; use a new model shape; only assert mem less and equal than theo;
* [fix] fix model zoo import;
* [fix] fix redundant detach & clone; add buffer assertation in the end;
* [fix] add output_obj_grad assert None at bwd b step; replace input_obj.require_grad_ with treemap;
* [fix] update optim state dict assert (include param group & state); fix mem assert after add optim;
* [fix] add testcase with microbatch 4;
* [fp8] fix missing fp8_comm flag in mixtral (#6057)
* fix
* fix
* fix
* [fp8] Disable all_gather intranode. Disable Redundant all_gather fp8 (#6059)
* all_gather only internode, fix pytest
* fix cuda arch <89 compile pytest error
* fix pytest failure
* disable all_gather_into_tensor_flat_fp8
* fix fp8 format
* fix pytest
* fix conversations
* fix chunk tuple to list
* [doc] FP8 training and communication document (#6050)
* Add FP8 training and communication document
* add fp8 docstring for plugins
* fix typo
* fix typo
* fix
* fix
* [moe] add parallel strategy for shared_expert && fix test for deepseek (#6063)
* [ColossalEval] support for vllm (#6056)
* support vllm
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* modify vllm and update readme
* run pre-commit
* remove dupilicated lines and refine code
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* update param name
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* refine code
* update readme
* refine code
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [release] update version (#6062)
* [feat] add zerobubble pp (just a frame now); add POC test for dx_dw; add test for zerobubble;
* [update] update text;
* [feat] add test run_fwd_bwd automatic scheduling;
* [feat] fix poc format
* [fix] fix poc test; add comments in poc;
* [feat] add optim backward_b_by_grad
* [feat] fix optimizer bwd b & w; support return accum loss & output
* [fix] fix optim bwd; add license for v_schedule; remove redundant attributes; fix schedule loop "while"--> "for"; add communication dict;
* [feat] update test; rm comments;
* [fix] rm zbv in hybridplugin
* [fix] fix optim bwd;
* [fix] fix optim bwd;
* [fix] rm output.data after send fwd;
* [fix] fix bwd step if condition; remove useless comments and format info;
* [fix] fix mem check;
* [fix] fix mem assertation
* [fix] fix mem; use a new model shape; only assert mem less and equal than theo;
* [fix] fix model zoo import;
* [feat] moehybrid support zerobubble;
* [fix] fix zerobubble pp for shardformer type input;
* [fix] fix require_grad & deallocate call;
* [fix] fix mem assert;
* [fix] fix fwd branch, fwd pass both micro_batch & internal_inputs'
* [fix] fix pipeline util func deallocate --> release_tensor_data; fix bwd_b loss bwd branch;
* [fix] fix zerobubble; support shardformer model type;
* [fix] fix test_pipeline_utils ci;
* [plugin] hybrid support zero bubble pipeline (#6060)
* hybrid support zbv
* fix
fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* Update zero_bubble_pp.py
* fix
* fix-ci
* fix
[pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* fix
* [zerobubble]Support ZeroBubble Pipeline (#6034)
* [feat] add zerobubble pp (just a frame now); add POC test for dx_dw; add test for zerobubble;
* [feat] add dw test;
* [fix] fix weight not close;
* [update] update text;
* [feat] add test run_fwd_bwd automatic scheduling;
* [feat] split communication and calculation; fix pop empty send_bwd_buffer error;
* [feat] add test for p & p grad;
* [feat] add comments for ZBV func;
* [fix] rm useless assign and comments;
* [fix] fix ci test; add pytest;
* [feat] add run_fwd_bwd_with_microbatch (replace input) & test; add p&p.grad assert close test & all pass;
* [feat] add apply v_schedule graph; p & p.grad assert err exist;
* [fix] update
* [feat] fix ci; add assert;
* [feat] fix poc format
* [feat] fix func name & ci; add comments;
* [fix] fix poc test; add comments in poc;
* [feat] add optim backward_b_by_grad
* [feat] fix optimizer bwd b & w; support return accum loss & output
* [feat] add fwd_bwd_step, run_fwd_only;
* [fix] fix optim bwd; add license for v_schedule; remove redundant attributes; fix schedule loop "while"--> "for"; add communication dict;
* [fix] fix communication_map;
* [feat] update test; rm comments;
* [fix] rm zbv in hybridplugin
* [fix] fix optim bwd;
* [fix] fix optim bwd;
* [fix] rm output.data after send fwd;
* [fix] fix bwd step if condition; remove useless comments and format info;
* [fix] fix detach output & release output;
* [fix] rm requir_grad for output;
* [fix] fix requir grad position and detach position and input&output local buffer append position;
* [feat] add memory assertation;
* [fix] fix mem check;
* [fix] mem assertation'
* [fix] fix mem assertation
* [fix] fix mem; use a new model shape; only assert mem less and equal than theo;
* [fix] fix model zoo import;
* [fix] fix redundant detach & clone; add buffer assertation in the end;
* [fix] add output_obj_grad assert None at bwd b step; replace input_obj.require_grad_ with treemap;
* [fix] update optim state dict assert (include param group & state); fix mem assert after add optim;
* [fix] add testcase with microbatch 4;
* hybrid support zbv
* fix
fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* Update zero_bubble_pp.py
* fix
* fix-ci
* fix
[pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* fix
* fix
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* fix
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: duanjunwen <935724073@qq.com>
* [feat] add zerobubble pp (just a frame now); add POC test for dx_dw; add test for zerobubble;
* [update] update text;
* [feat] add test run_fwd_bwd automatic scheduling;
* [feat] fix poc format
* [fix] fix poc test; add comments in poc;
* [feat] add optim backward_b_by_grad
* [feat] fix optimizer bwd b & w; support return accum loss & output
* [fix] fix optim bwd; add license for v_schedule; remove redundant attributes; fix schedule loop "while"--> "for"; add communication dict;
* [feat] update test; rm comments;
* [fix] fix optim bwd;
* [fix] fix optim bwd;
* [fix] rm output.data after send fwd;
* [fix] fix bwd step if condition; remove useless comments and format info;
* [fix] fix mem check;
* [fix] fix mem assertation
* [fix] fix mem; use a new model shape; only assert mem less and equal than theo;
* [fix] fix model zoo import;
* [fix] fix mem assert;
* [fix] fix fwd branch, fwd pass both micro_batch & internal_inputs'
* [plugin] hybrid support zero bubble pipeline (#6060)
* hybrid support zbv
* fix
fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* Update zero_bubble_pp.py
* fix
* fix-ci
* fix
[pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* fix
* [zerobubble]Support ZeroBubble Pipeline (#6034)
* [feat] add zerobubble pp (just a frame now); add POC test for dx_dw; add test for zerobubble;
* [feat] add dw test;
* [fix] fix weight not close;
* [update] update text;
* [feat] add test run_fwd_bwd automatic scheduling;
* [feat] split communication and calculation; fix pop empty send_bwd_buffer error;
* [feat] add test for p & p grad;
* [feat] add comments for ZBV func;
* [fix] rm useless assign and comments;
* [fix] fix ci test; add pytest;
* [feat] add run_fwd_bwd_with_microbatch (replace input) & test; add p&p.grad assert close test & all pass;
* [feat] add apply v_schedule graph; p & p.grad assert err exist;
* [fix] update
* [feat] fix ci; add assert;
* [feat] fix poc format
* [feat] fix func name & ci; add comments;
* [fix] fix poc test; add comments in poc;
* [feat] add optim backward_b_by_grad
* [feat] fix optimizer bwd b & w; support return accum loss & output
* [feat] add fwd_bwd_step, run_fwd_only;
* [fix] fix optim bwd; add license for v_schedule; remove redundant attributes; fix schedule loop "while"--> "for"; add communication dict;
* [fix] fix communication_map;
* [feat] update test; rm comments;
* [fix] rm zbv in hybridplugin
* [fix] fix optim bwd;
* [fix] fix optim bwd;
* [fix] rm output.data after send fwd;
* [fix] fix bwd step if condition; remove useless comments and format info;
* [fix] fix detach output & release output;
* [fix] rm requir_grad for output;
* [fix] fix requir grad position and detach position and input&output local buffer append position;
* [feat] add memory assertation;
* [fix] fix mem check;
* [fix] mem assertation'
* [fix] fix mem assertation
* [fix] fix mem; use a new model shape; only assert mem less and equal than theo;
* [fix] fix model zoo import;
* [fix] fix redundant detach & clone; add buffer assertation in the end;
* [fix] add output_obj_grad assert None at bwd b step; replace input_obj.require_grad_ with treemap;
* [fix] update optim state dict assert (include param group & state); fix mem assert after add optim;
* [fix] add testcase with microbatch 4;
* hybrid support zbv
* fix
fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* Update zero_bubble_pp.py
* fix
* fix-ci
* fix
[pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* fix
* fix
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* fix
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: duanjunwen <935724073@qq.com>
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: HangXu <hangxu0304@gmail.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: GuangyaoZhang <xjtu521@qq.com>
Co-authored-by: Hongxin Liu <lhx0217@gmail.com>
Co-authored-by: YeAnbang <anbangy2@outlook.com>
Co-authored-by: Haze188 <haze188@qq.com>
Co-authored-by: Edenzzzz <wenxuan.tan@wisc.edu>
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: Runyu Lu <77330637+LRY89757@users.noreply.github.com>
Co-authored-by: YeAnbang <44796419+YeAnbang@users.noreply.github.com>
Co-authored-by: Stephan Kö <stephankoe@users.noreply.github.com>
Co-authored-by: アマデウス <kurisusnowdeng@users.noreply.github.com>
Co-authored-by: Tong Li <tong.li352711588@gmail.com>
Co-authored-by: zhurunhua <1281592874@qq.com>
Co-authored-by: Insu Jang <insujang@umich.edu>
Co-authored-by: Gao, Ruiyuan <905370712@qq.com>
Co-authored-by: hxwang <wang1570@e.ntu.edu.sg>
Co-authored-by: Michelle <qianranma8@gmail.com>
Co-authored-by: Wang Binluo <32676639+wangbluo@users.noreply.github.com>
Co-authored-by: wangbluo <2538539015@qq.com>
Co-authored-by: root <root@notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9-0.notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9.colossal-ai.svc.cluster.local>
Co-authored-by: duanjunwen <935724073@qq.com>
Co-authored-by: Camille Zhong <44392324+Camille7777@users.noreply.github.com>
2024-10-08 07:58:00 +00:00
|
|
|
force_sp_output_gather: Optional[bool] = True,
|
2023-08-22 15:59:31 +00:00
|
|
|
):
|
2023-09-19 06:20:26 +00:00
|
|
|
output_hidden_states = (
|
|
|
|
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
|
|
|
|
)
|
2023-08-22 15:59:31 +00:00
|
|
|
use_cache = use_cache if use_cache is not None else self.config.use_cache
|
2023-09-19 06:20:26 +00:00
|
|
|
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
2023-08-22 15:59:31 +00:00
|
|
|
|
|
|
|
batch_size, seq_length = input_ids.shape
|
|
|
|
|
|
|
|
if inputs_embeds is None:
|
|
|
|
inputs_embeds = self.embedding(input_ids)
|
|
|
|
|
|
|
|
if self.pre_seq_len is not None:
|
|
|
|
if past_key_values is None:
|
|
|
|
past_key_values = self.get_prompt(
|
|
|
|
batch_size=batch_size,
|
|
|
|
device=input_ids.device,
|
|
|
|
dtype=inputs_embeds.dtype,
|
|
|
|
)
|
|
|
|
if attention_mask is not None:
|
|
|
|
attention_mask = torch.cat(
|
|
|
|
[
|
|
|
|
attention_mask.new_ones((batch_size, self.pre_seq_len)),
|
|
|
|
attention_mask,
|
|
|
|
],
|
|
|
|
dim=-1,
|
|
|
|
)
|
|
|
|
|
|
|
|
if full_attention_mask is None:
|
|
|
|
if (attention_mask is not None and not attention_mask.all()) or (past_key_values and seq_length != 1):
|
|
|
|
full_attention_mask = self.get_masks(input_ids, past_key_values, padding_mask=attention_mask)
|
|
|
|
|
|
|
|
# Rotary positional embeddings
|
|
|
|
rotary_pos_emb = self.rotary_pos_emb(self.seq_length)
|
|
|
|
if position_ids is not None:
|
|
|
|
rotary_pos_emb = rotary_pos_emb[position_ids]
|
|
|
|
else:
|
|
|
|
rotary_pos_emb = rotary_pos_emb[None, :seq_length]
|
|
|
|
rotary_pos_emb = rotary_pos_emb.transpose(0, 1).contiguous()
|
|
|
|
|
[zerobubble] rebase main (#6075)
* fp8 operators for compressed communication
cast_to_fp8, cast_from_fp8, all_reduce_fp8
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix typo
* fix scaling algorithm in FP8 casting
* support fp8 communication in pipeline parallelism
* add fp8_communication flag in the script
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* shardformer fp8
* fix rebase
* remove all to all
* fix shardformer fp8 communication training degradation
* [fp8] support all-gather flat tensor (#5932)
* [fp8] add fp8 comm for low level zero
* [test] add zero fp8 test case
* [Feature] llama shardformer fp8 support (#5938)
* add llama shardformer fp8
* Llama Shardformer Parity
* fix typo
* fix all reduce
* fix pytest failure
* fix reduce op and move function to fp8.py
* fix typo
* [FP8] rebase main (#5963)
* add SimPO
* fix dataloader
* remove debug code
* add orpo
* fix style
* fix colossalai, transformers version
* fix colossalai, transformers version
* fix colossalai, transformers version
* fix torch colossalai version
* update transformers version
* [shardformer] DeepseekMoE support (#5871)
* [Feature] deepseek moe expert parallel implement
* [misc] fix typo, remove redundant file (#5867)
* [misc] fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [Feature] deepseek support & unit test
* [misc] remove debug code & useless print
* [misc] fix typos (#5872)
* [Feature] remove modeling file, use auto config. (#5884)
* [misc] fix typos
* [Feature] deepseek support via auto model, remove modeling file
* [misc] delete useless file
* [misc] fix typos
* [Deepseek] remove redundant code (#5888)
* [misc] fix typos
* [Feature] deepseek support via auto model, remove modeling file
* [misc] delete useless file
* [misc] fix typos
* [misc] remove redundant code
* [Feature/deepseek] resolve comment. (#5889)
* [misc] fix typos
* [Feature] deepseek support via auto model, remove modeling file
* [misc] delete useless file
* [misc] fix typos
* [misc] remove redundant code
* [misc] mv module replacement into if branch
* [misc] add some warning message and modify some code in unit test
* [misc] fix typos
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [Hoxfix] Fix CUDA_DEVICE_MAX_CONNECTIONS for comm overlap
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [Feat] Diffusion Model(PixArtAlpha/StableDiffusion3) Support (#5838)
* Diffusion Model Inference support
* Stable Diffusion 3 Support
* pixartalpha support
* [HotFix] CI,import,requirements-test for #5838 (#5892)
* [Hot Fix] CI,import,requirements-test
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [Feature] Enable PP + SP for llama (#5868)
* fix cross-PP-stage position id length diff bug
* fix typo
* fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* use a one cross entropy func for all shardformer models
---------
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [ShardFormer] Add Ulysses Sequence Parallelism support for Command-R, Qwen2 and ChatGLM (#5897)
* add benchmark for sft, dpo, simpo, orpo. Add benchmarking result. Support lora with gradient checkpoint
* fix style
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix eval
* hotfix citation
* [zero] support all-gather overlap (#5898)
* [zero] support all-gather overlap
* [zero] add overlap all-gather flag
* [misc] fix typo
* [zero] update api
* fix orpo cross entropy loss
* [Auto Parallel]: Speed up intra-op plan generation by 44% (#5446)
* Remove unnecessary calls to deepcopy
* Build DimSpec's difference dict only once
This change considerably speeds up construction speed of DimSpec objects. The difference_dict is the same for each DimSpec object, so a single copy of it is enough.
* Fix documentation of DimSpec's difference method
* [ShardFormer] fix qwen2 sp (#5903)
* [compatibility] support torch 2.2 (#5875)
* Support Pytorch 2.2.2
* keep build_on_pr file and update .compatibility
* fix object_to_tensor usage when torch>=2.3.0 (#5820)
* [misc] support torch2.3 (#5893)
* [misc] support torch2.3
* [devops] update compatibility ci
* [devops] update compatibility ci
* [devops] add debug
* [devops] add debug
* [devops] add debug
* [devops] add debug
* [devops] remove debug
* [devops] remove debug
* [release] update version (#5912)
* [plugin] support all-gather overlap for hybrid parallel (#5919)
* [plugin] fixed all-gather overlap support for hybrid parallel
* add kto
* fix style, add kto data sample
* [Examples] Add lazy init to OPT and GPT examples (#5924)
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [ColossalChat] Hotfix for ColossalChat (#5910)
* add ignore and tiny llama
* fix path issue
* run style
* fix issue
* update bash
* add ignore and tiny llama
* fix path issue
* run style
* fix issue
* update bash
* fix ddp issue
* add Qwen 1.5 32B
* refactor tokenization
* [FIX BUG] UnboundLocalError: cannot access local variable 'default_conversation' where it is not associated with a value (#5931)
* cannot access local variable 'default_conversation' where it is not associated with a value
set default value for 'default_conversation'
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* fix test data
* refactor evaluation
* remove real data path
* remove real data path
* Add n_fused as an input from native_module (#5894)
* [FIX BUG] convert env param to int in (#5934)
* [Hotfix] Fix ZeRO typo #5936
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [Feature] Add a switch to control whether the model checkpoint needs to be saved after each epoch ends (#5941)
* Add a switch to control whether the model checkpoint needs to be saved after each epoch ends
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* fix style
* fix style
* fix style
* [shardformer] hotfix attn mask (#5945)
* [shardformer] hotfix attn mask (#5947)
* [Feat] Distrifusion Acceleration Support for Diffusion Inference (#5895)
* Distrifusion Support source
* comp comm overlap optimization
* sd3 benchmark
* pixart distrifusion bug fix
* sd3 bug fix and benchmark
* generation bug fix
* naming fix
* add docstring, fix counter and shape error
* add reference
* readme and requirement
* [zero] hotfix update master params (#5951)
* [release] update version (#5952)
* [Chat] Fix lora (#5946)
* fix merging
* remove filepath
* fix style
* Update README.md (#5958)
* [hotfix] Remove unused plan section (#5957)
* remove readme
* fix readme
* update
* [test] add mixtral for sequence classification
* [test] add mixtral transformer test
* [moe] fix plugin
* [test] mixtra pp shard test
* [chore] handle non member group
* [zero] solve hang
* [test] pass mixtral shardformer test
* [moe] implement transit between non moe tp and ep
* [zero] solve hang
* [misc] solve booster hang by rename the variable
* solve hang when parallel mode = pp + dp
* [moe] implement submesh initialization
* [moe] add mixtral dp grad scaling when not all experts are activated
* [chore] manually revert unintended commit
* [chore] trivial fix
* [chore] arg pass & remove drop token
* [test] add mixtral modelling test
* [moe] implement tp
* [moe] test deepseek
* [moe] clean legacy code
* [Feature] MoE Ulysses Support (#5918)
* moe sp support
* moe sp bug solve
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [chore] minor fix
* [moe] init moe plugin comm setting with sp
* moe sp + ep bug fix
* [moe] finalize test (no pp)
* [moe] full test for deepseek and mixtral (pp + sp to fix)
* [chore] minor fix after rebase
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* [chore] solve moe ckpt test failure and some other arg pass failure
* [moe] remove ops
* [test] fix test: test_zero1_2
* [bug] fix: somehow logger hangs the program
* [moe] deepseek moe sp support
* [test] add check
* [deepseek] replace attn (a workaround for bug in transformers)
* [misc] skip redunant test
* [misc] remove debug/print code
* [moe] refactor mesh assignment
* Revert "[moe] implement submesh initialization"
This reverts commit 2f9bce6686d1415a83d5726dc5ff02222c742582.
* [chore] change moe_pg_mesh to private
* [misc] remove incompatible test config
* [misc] fix ci failure: change default value to false in moe plugin
* [misc] remove useless condition
* [chore] docstring
* [moe] remove force_overlap_comm flag and add warning instead
* [doc] add MoeHybridParallelPlugin docstring
* [moe] solve dp axis issue
* [chore] remove redundant test case, print string & reduce test tokens
* [feat] Dist Loader for Eval (#5950)
* support auto distributed data loader
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* support auto distributed data loader
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix tp error
* remove unused parameters
* remove unused
* update inference
* update docs
* update inference
---------
Co-authored-by: Michelle <qianranma8@gmail.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [lora] lora support hybrid parallel plugin (#5956)
* lora support hybrid plugin
* fix
* fix
* fix
* fix
* fp8 operators for compressed communication
cast_to_fp8, cast_from_fp8, all_reduce_fp8
* fix scaling algorithm in FP8 casting
* support fp8 communication in pipeline parallelism
* add fp8_communication flag in the script
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* shardformer fp8
* fix rebase
* remove all to all
* fix shardformer fp8 communication training degradation
* [fp8] support all-gather flat tensor (#5932)
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* Update low_level_optim.py
---------
Co-authored-by: YeAnbang <anbangy2@outlook.com>
Co-authored-by: Haze188 <haze188@qq.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Edenzzzz <wenxuan.tan@wisc.edu>
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: Runyu Lu <77330637+LRY89757@users.noreply.github.com>
Co-authored-by: Guangyao Zhang <xjtu521@qq.com>
Co-authored-by: YeAnbang <44796419+YeAnbang@users.noreply.github.com>
Co-authored-by: Hongxin Liu <lhx0217@gmail.com>
Co-authored-by: Stephan Kö <stephankoe@users.noreply.github.com>
Co-authored-by: アマデウス <kurisusnowdeng@users.noreply.github.com>
Co-authored-by: Tong Li <tong.li352711588@gmail.com>
Co-authored-by: zhurunhua <1281592874@qq.com>
Co-authored-by: Insu Jang <insujang@umich.edu>
Co-authored-by: Gao, Ruiyuan <905370712@qq.com>
Co-authored-by: hxwang <wang1570@e.ntu.edu.sg>
Co-authored-by: Michelle <qianranma8@gmail.com>
Co-authored-by: Wang Binluo <32676639+wangbluo@users.noreply.github.com>
Co-authored-by: HangXu <hangxu0304@gmail.com>
* [fp8]support all2all fp8 (#5953)
* support all2all fp8
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [fp8] add fp8 linear (#5967)
* [fp8] add fp8 linear
* [test] fix fp8 linear test condition
* [test] fix fp8 linear test condition
* [test] fix fp8 linear test condition
* [fp8] support fp8 amp for hybrid parallel plugin (#5975)
* [fp8] support fp8 amp for hybrid parallel plugin
* [test] add fp8 hook test
* [fp8] fix fp8 linear compatibility
* fix (#5976)
* [Feature]: support FP8 communication in DDP, FSDP, Gemini (#5928)
* support fp8_communication in the Torch DDP grad comm, FSDP grad comm, and FSDP params comm
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* implement communication hook for FSDP params all-gather
* added unit test for fp8 operators
* support fp8 communication in GeminiPlugin
* update training scripts to support fsdp and fp8 communication
* fixed some minor bugs observed in unit test
* add all_gather_into_tensor_flat_fp8
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* add skip the test if torch < 2.2.0
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* add skip the test if torch < 2.2.0
* add skip the test if torch < 2.2.0
* add fp8_comm flag
* rebase latest fp8 operators
* rebase latest fp8 operators
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [test ci]Feature/fp8 comm (#5981)
* fix
* fix
* fix
* [fp8] support gemini plugin (#5978)
* [fp8] refactor hook
* [fp8] support gemini plugin
* [example] add fp8 option for llama benchmark
* [fp8] use torch compile (torch >= 2.3.0) (#5979)
* [fp8] use torch compile (torch >= 2.4.0)
* [fp8] set use_fast_accum in linear
* [chore] formal version check
* [chore] fix sig
* [fp8]Moe support fp8 communication (#5977)
* fix
* support moe fp8
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
fix
fi
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [fp8] support hybrid parallel plugin (#5982)
* support fp8 comm for qwen2 model
* support fp8 comm for qwen2 model
* support fp8 comm for qwen2 model
* fp8
* fix
* bert and bloom
* chatglm and command
* gpt2,gptj,bert, falcon,blip2
* mistral,opy,sam,t5,vit,whisper
* fix
* fix
* fix
* [fp8] refactor fp8 linear with compile (#5993)
* [fp8] refactor fp8 linear with compile
* [fp8] fix linear test
* [fp8] fix linear test
* [fp8] support asynchronous FP8 communication (#5997)
* fix
* fix
* fix
* support async all2all
* support async op for all gather
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [fp8] update torch.compile for linear_fp8 to >= 2.4.0 (#6004)
* [fp8] linear perf enhancement
* [fp8]update reduce-scatter test (#6002)
* fix
* fix
* fix
* fix
* [fp8] add use_fp8 option for MoeHybridParallelPlugin (#6009)
* [fp8] zero support fp8 linear. (#6006)
* fix
* fix
* fix
* zero fp8
* zero fp8
* Update requirements.txt
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix the merge
* fix the merge
* fix the merge
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix the merge
* fix
* fix
* fix the merge
* fix
* fix
* fix
* fix
* fix
* fix the merge
* fix
* fix
* fix
* fix
* [fp8] Merge feature/fp8_comm to main branch of Colossalai (#6016)
* add SimPO
* fix dataloader
* remove debug code
* add orpo
* fix style
* fix colossalai, transformers version
* fix colossalai, transformers version
* fix colossalai, transformers version
* fix torch colossalai version
* update transformers version
* [shardformer] DeepseekMoE support (#5871)
* [Feature] deepseek moe expert parallel implement
* [misc] fix typo, remove redundant file (#5867)
* [misc] fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [Feature] deepseek support & unit test
* [misc] remove debug code & useless print
* [misc] fix typos (#5872)
* [Feature] remove modeling file, use auto config. (#5884)
* [misc] fix typos
* [Feature] deepseek support via auto model, remove modeling file
* [misc] delete useless file
* [misc] fix typos
* [Deepseek] remove redundant code (#5888)
* [misc] fix typos
* [Feature] deepseek support via auto model, remove modeling file
* [misc] delete useless file
* [misc] fix typos
* [misc] remove redundant code
* [Feature/deepseek] resolve comment. (#5889)
* [misc] fix typos
* [Feature] deepseek support via auto model, remove modeling file
* [misc] delete useless file
* [misc] fix typos
* [misc] remove redundant code
* [misc] mv module replacement into if branch
* [misc] add some warning message and modify some code in unit test
* [misc] fix typos
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [Hoxfix] Fix CUDA_DEVICE_MAX_CONNECTIONS for comm overlap
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [Feat] Diffusion Model(PixArtAlpha/StableDiffusion3) Support (#5838)
* Diffusion Model Inference support
* Stable Diffusion 3 Support
* pixartalpha support
* [HotFix] CI,import,requirements-test for #5838 (#5892)
* [Hot Fix] CI,import,requirements-test
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [Feature] Enable PP + SP for llama (#5868)
* fix cross-PP-stage position id length diff bug
* fix typo
* fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* use a one cross entropy func for all shardformer models
---------
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [ShardFormer] Add Ulysses Sequence Parallelism support for Command-R, Qwen2 and ChatGLM (#5897)
* add benchmark for sft, dpo, simpo, orpo. Add benchmarking result. Support lora with gradient checkpoint
* fix style
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix eval
* hotfix citation
* [zero] support all-gather overlap (#5898)
* [zero] support all-gather overlap
* [zero] add overlap all-gather flag
* [misc] fix typo
* [zero] update api
* fix orpo cross entropy loss
* [Auto Parallel]: Speed up intra-op plan generation by 44% (#5446)
* Remove unnecessary calls to deepcopy
* Build DimSpec's difference dict only once
This change considerably speeds up construction speed of DimSpec objects. The difference_dict is the same for each DimSpec object, so a single copy of it is enough.
* Fix documentation of DimSpec's difference method
* [ShardFormer] fix qwen2 sp (#5903)
* [compatibility] support torch 2.2 (#5875)
* Support Pytorch 2.2.2
* keep build_on_pr file and update .compatibility
* fix object_to_tensor usage when torch>=2.3.0 (#5820)
* [misc] support torch2.3 (#5893)
* [misc] support torch2.3
* [devops] update compatibility ci
* [devops] update compatibility ci
* [devops] add debug
* [devops] add debug
* [devops] add debug
* [devops] add debug
* [devops] remove debug
* [devops] remove debug
* [release] update version (#5912)
* [plugin] support all-gather overlap for hybrid parallel (#5919)
* [plugin] fixed all-gather overlap support for hybrid parallel
* add kto
* fix style, add kto data sample
* [Examples] Add lazy init to OPT and GPT examples (#5924)
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [ColossalChat] Hotfix for ColossalChat (#5910)
* add ignore and tiny llama
* fix path issue
* run style
* fix issue
* update bash
* add ignore and tiny llama
* fix path issue
* run style
* fix issue
* update bash
* fix ddp issue
* add Qwen 1.5 32B
* refactor tokenization
* [FIX BUG] UnboundLocalError: cannot access local variable 'default_conversation' where it is not associated with a value (#5931)
* cannot access local variable 'default_conversation' where it is not associated with a value
set default value for 'default_conversation'
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* fix test data
* refactor evaluation
* remove real data path
* remove real data path
* Add n_fused as an input from native_module (#5894)
* [FIX BUG] convert env param to int in (#5934)
* [Hotfix] Fix ZeRO typo #5936
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [Feature] Add a switch to control whether the model checkpoint needs to be saved after each epoch ends (#5941)
* Add a switch to control whether the model checkpoint needs to be saved after each epoch ends
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* fix style
* fix style
* fix style
* [shardformer] hotfix attn mask (#5945)
* [shardformer] hotfix attn mask (#5947)
* [Feat] Distrifusion Acceleration Support for Diffusion Inference (#5895)
* Distrifusion Support source
* comp comm overlap optimization
* sd3 benchmark
* pixart distrifusion bug fix
* sd3 bug fix and benchmark
* generation bug fix
* naming fix
* add docstring, fix counter and shape error
* add reference
* readme and requirement
* [zero] hotfix update master params (#5951)
* [release] update version (#5952)
* [Chat] Fix lora (#5946)
* fix merging
* remove filepath
* fix style
* Update README.md (#5958)
* [hotfix] Remove unused plan section (#5957)
* remove readme
* fix readme
* update
* [test] add mixtral for sequence classification
* [test] add mixtral transformer test
* [moe] fix plugin
* [test] mixtra pp shard test
* [chore] handle non member group
* [zero] solve hang
* [test] pass mixtral shardformer test
* [moe] implement transit between non moe tp and ep
* [zero] solve hang
* [misc] solve booster hang by rename the variable
* solve hang when parallel mode = pp + dp
* [moe] implement submesh initialization
* [moe] add mixtral dp grad scaling when not all experts are activated
* [chore] manually revert unintended commit
* [chore] trivial fix
* [chore] arg pass & remove drop token
* [test] add mixtral modelling test
* [moe] implement tp
* [moe] test deepseek
* [moe] clean legacy code
* [Feature] MoE Ulysses Support (#5918)
* moe sp support
* moe sp bug solve
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [chore] minor fix
* [moe] init moe plugin comm setting with sp
* moe sp + ep bug fix
* [moe] finalize test (no pp)
* [moe] full test for deepseek and mixtral (pp + sp to fix)
* [chore] minor fix after rebase
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* [chore] solve moe ckpt test failure and some other arg pass failure
* [moe] remove ops
* [test] fix test: test_zero1_2
* [bug] fix: somehow logger hangs the program
* [moe] deepseek moe sp support
* [test] add check
* [deepseek] replace attn (a workaround for bug in transformers)
* [misc] skip redunant test
* [misc] remove debug/print code
* [moe] refactor mesh assignment
* Revert "[moe] implement submesh initialization"
This reverts commit 2f9bce6686d1415a83d5726dc5ff02222c742582.
* [chore] change moe_pg_mesh to private
* [misc] remove incompatible test config
* [misc] fix ci failure: change default value to false in moe plugin
* [misc] remove useless condition
* [chore] docstring
* [moe] remove force_overlap_comm flag and add warning instead
* [doc] add MoeHybridParallelPlugin docstring
* [moe] solve dp axis issue
* [chore] remove redundant test case, print string & reduce test tokens
* [feat] Dist Loader for Eval (#5950)
* support auto distributed data loader
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* support auto distributed data loader
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix tp error
* remove unused parameters
* remove unused
* update inference
* update docs
* update inference
---------
Co-authored-by: Michelle <qianranma8@gmail.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [lora] lora support hybrid parallel plugin (#5956)
* lora support hybrid plugin
* fix
* fix
* fix
* fix
* Support overall loss, update KTO logging
* [Docs] clarify launch port
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [Hotfix] README link (#5966)
* update ignore
* update readme
* run style
* update readme
* [Hotfix] Avoid fused RMSnorm import error without apex (#5985)
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [Chat] fix readme (#5989)
* fix readme
* fix readme, tokenization fully tested
* fix readme, tokenization fully tested
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: root <root@notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9-0.notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9.colossal-ai.svc.cluster.local>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* fix sync condition (#6000)
* [plugin] add cast inputs option for zero (#6003)
* [pre-commit.ci] pre-commit autoupdate (#5995)
updates:
- [github.com/psf/black-pre-commit-mirror: 24.4.2 → 24.8.0](https://github.com/psf/black-pre-commit-mirror/compare/24.4.2...24.8.0)
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [misc] Bypass the huggingface bug to solve the mask mismatch problem (#5991)
* [Feature] Zigzag Ring attention (#5905)
* halfway
* fix cross-PP-stage position id length diff bug
* fix typo
* fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* unified cross entropy func for all shardformer models
* remove redundant lines
* add basic ring attn; debug cross entropy
* fwd bwd logic complete
* fwd bwd logic complete; add experimental triton rescale
* precision tests passed
* precision tests passed
* fix typos and remove misc files
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* add sp_mode to benchmark; fix varlen interface
* update softmax_lse shape by new interface
* change tester name
* remove buffer clone; support packed seq layout
* add varlen tests
* fix typo
* all tests passed
* add dkv_group; fix mask
* remove debug statements
---------
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [misc] update compatibility (#6008)
* [misc] update compatibility
* [misc] update requirements
* [devops] disable requirements cache
* [test] fix torch ddp test
* [test] fix rerun on address in use
* [test] fix lazy init
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix the merge
* fix the merge
* overlap kv comm with output rescale (#6017)
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* fix the merge
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix the merge
* fix
* fix
* fix the merge
* fix
* [misc] Use dist logger in plugins (#6011)
* use dist logger in plugins
* remove trash
* print on rank 0
---------
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* fix
* fix
* fix
* fix
* fix the merge
* fix
* fix
* fix
* fix
---------
Co-authored-by: YeAnbang <anbangy2@outlook.com>
Co-authored-by: Haze188 <haze188@qq.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Edenzzzz <wenxuan.tan@wisc.edu>
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: Runyu Lu <77330637+LRY89757@users.noreply.github.com>
Co-authored-by: Guangyao Zhang <xjtu521@qq.com>
Co-authored-by: YeAnbang <44796419+YeAnbang@users.noreply.github.com>
Co-authored-by: Hongxin Liu <lhx0217@gmail.com>
Co-authored-by: Stephan Kö <stephankoe@users.noreply.github.com>
Co-authored-by: アマデウス <kurisusnowdeng@users.noreply.github.com>
Co-authored-by: Tong Li <tong.li352711588@gmail.com>
Co-authored-by: zhurunhua <1281592874@qq.com>
Co-authored-by: Insu Jang <insujang@umich.edu>
Co-authored-by: Gao, Ruiyuan <905370712@qq.com>
Co-authored-by: hxwang <wang1570@e.ntu.edu.sg>
Co-authored-by: Michelle <qianranma8@gmail.com>
Co-authored-by: root <root@notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9-0.notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9.colossal-ai.svc.cluster.local>
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* Update train_dpo.py
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* Update low_level_zero_plugin.py
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* [CI] Remove triton version for compatibility bug; update req torch >=2.2 (#6018)
* remove triton version
* remove torch 2.2
* remove torch 2.1
* debug
* remove 2.1 build tests
* require torch >=2.2
---------
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [plugin] hotfix zero plugin (#6036)
* [plugin] hotfix zero plugin
* [plugin] hotfix zero plugin
* [Colossal-LLaMA] Refactor latest APIs (#6030)
* refactor latest code
* update api
* add dummy dataset
* update Readme
* add setup
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* update files
* add PP support
* update arguments
* update argument
* reorg folder
* update version
* remove IB infor
* update utils
* update readme
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* update save for zero
* update save
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* add apex
* update
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* add fused norm (#6038)
* [FP8] unsqueeze scale to make it compatible with torch.compile (#6040)
* [colossalai/checkpoint_io/...] fix bug in load_state_dict_into_model; format error msg (#6020)
* fix bug in load_state_dict_into_model; format error msg
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* Update utils.py
to support checking missing_keys
* Update general_checkpoint_io.py
fix bug in missing_keys error message
* retrigger tests
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [Hotfix] Remove deprecated install (#6042)
* remove deprecated install
* remove unused folder
* [fp8] optimize all-gather (#6043)
* [fp8] optimize all-gather
* [fp8] fix all gather fp8 ring
* [fp8] enable compile
* [fp8] fix all gather fp8 ring
* [fp8] fix linear hook (#6046)
* [fp8] disable all_to_all_fp8 in intranode (#6045)
* enhance all_to_all_fp8 with internode comm control
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* disable some fp8 ops due to performance issue
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [release] update version (#6041)
* [release] update version
* [devops] update comp test
* [devops] update comp test debug
* [devops] debug comp test
* [devops] debug comp test
* [devops] debug comp test
* [devops] debug comp test
* [devops] debug comp test
* [Feature] Split cross-entropy computation in SP (#5959)
* halfway
* fix cross-PP-stage position id length diff bug
* fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* unified cross entropy func for all shardformer models
* remove redundant lines
* add basic ring attn; debug cross entropy
* fwd bwd logic complete
* fwd bwd logic complete; add experimental triton rescale
* precision tests passed
* precision tests passed
* fix typos and remove misc files
* update softmax_lse shape by new interface
* change tester name
* remove buffer clone; support packed seq layout
* add varlen tests
* fix typo
* all tests passed
* add dkv_group; fix mask
* remove debug statements
* adapt chatglm, command-R, qwen
* debug
* halfway
* fix cross-PP-stage position id length diff bug
* fix typo
* fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* unified cross entropy func for all shardformer models
* remove redundant lines
* add basic ring attn; debug cross entropy
* fwd bwd logic complete
* fwd bwd logic complete; add experimental triton rescale
* precision tests passed
* precision tests passed
* fix typos and remove misc files
* add sp_mode to benchmark; fix varlen interface
* update softmax_lse shape by new interface
* add varlen tests
* fix typo
* all tests passed
* add dkv_group; fix mask
* remove debug statements
* add comments
* q1 index only once
* remove events to simplify stream sync
* simplify forward/backward logic
* 2d ring forward passed
* 2d ring backward passed
* fixes
* fix ring attn loss
* 2D ring backward + llama passed
* merge
* update logger
* fix typo
* rebase
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix typo
* remove typos
* fixes
* support GPT
---------
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [hotfix] moe hybrid parallelism benchmark & follow-up fix (#6048)
* [example] pass use_fp8_comm flag to all plugins
* [example] add mixtral benchmark
* [moe] refine assertion and check
* [moe] fix mixtral & add more tests
* [moe] consider checking dp * sp group and moe_dp_group
* [mixtral] remove gate tp & add more tests
* [deepseek] fix tp & sp for deepseek
* [mixtral] minor fix
* [deepseek] add deepseek benchmark
* [fp8] hotfix backward hook (#6053)
* [fp8] hotfix backward hook
* [fp8] hotfix pipeline loss accumulation
* [doc] update sp doc (#6055)
* update sp doc
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* fix the sp
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix the attn
* fix
* fix
* fix
* fix
* [zerobubble]Support ZeroBubble Pipeline (#6034)
* [feat] add zerobubble pp (just a frame now); add POC test for dx_dw; add test for zerobubble;
* [feat] add dw test;
* [fix] fix weight not close;
* [update] update text;
* [feat] add test run_fwd_bwd automatic scheduling;
* [feat] split communication and calculation; fix pop empty send_bwd_buffer error;
* [feat] add test for p & p grad;
* [feat] add comments for ZBV func;
* [fix] rm useless assign and comments;
* [fix] fix ci test; add pytest;
* [feat] add run_fwd_bwd_with_microbatch (replace input) & test; add p&p.grad assert close test & all pass;
* [feat] add apply v_schedule graph; p & p.grad assert err exist;
* [fix] update
* [feat] fix ci; add assert;
* [feat] fix poc format
* [feat] fix func name & ci; add comments;
* [fix] fix poc test; add comments in poc;
* [feat] add optim backward_b_by_grad
* [feat] fix optimizer bwd b & w; support return accum loss & output
* [feat] add fwd_bwd_step, run_fwd_only;
* [fix] fix optim bwd; add license for v_schedule; remove redundant attributes; fix schedule loop "while"--> "for"; add communication dict;
* [fix] fix communication_map;
* [feat] update test; rm comments;
* [fix] rm zbv in hybridplugin
* [fix] fix optim bwd;
* [fix] fix optim bwd;
* [fix] rm output.data after send fwd;
* [fix] fix bwd step if condition; remove useless comments and format info;
* [fix] fix detach output & release output;
* [fix] rm requir_grad for output;
* [fix] fix requir grad position and detach position and input&output local buffer append position;
* [feat] add memory assertation;
* [fix] fix mem check;
* [fix] mem assertation'
* [fix] fix mem assertation
* [fix] fix mem; use a new model shape; only assert mem less and equal than theo;
* [fix] fix model zoo import;
* [fix] fix redundant detach & clone; add buffer assertation in the end;
* [fix] add output_obj_grad assert None at bwd b step; replace input_obj.require_grad_ with treemap;
* [fix] update optim state dict assert (include param group & state); fix mem assert after add optim;
* [fix] add testcase with microbatch 4;
* [fp8] fix missing fp8_comm flag in mixtral (#6057)
* fix
* fix
* fix
* [fp8] Disable all_gather intranode. Disable Redundant all_gather fp8 (#6059)
* all_gather only internode, fix pytest
* fix cuda arch <89 compile pytest error
* fix pytest failure
* disable all_gather_into_tensor_flat_fp8
* fix fp8 format
* fix pytest
* fix conversations
* fix chunk tuple to list
* [doc] FP8 training and communication document (#6050)
* Add FP8 training and communication document
* add fp8 docstring for plugins
* fix typo
* fix typo
* fix
* fix
* [moe] add parallel strategy for shared_expert && fix test for deepseek (#6063)
* [ColossalEval] support for vllm (#6056)
* support vllm
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* modify vllm and update readme
* run pre-commit
* remove dupilicated lines and refine code
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* update param name
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* refine code
* update readme
* refine code
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [release] update version (#6062)
* [feat] add zerobubble pp (just a frame now); add POC test for dx_dw; add test for zerobubble;
* [update] update text;
* [feat] add test run_fwd_bwd automatic scheduling;
* [feat] fix poc format
* [fix] fix poc test; add comments in poc;
* [feat] add optim backward_b_by_grad
* [feat] fix optimizer bwd b & w; support return accum loss & output
* [fix] fix optim bwd; add license for v_schedule; remove redundant attributes; fix schedule loop "while"--> "for"; add communication dict;
* [feat] update test; rm comments;
* [fix] rm zbv in hybridplugin
* [fix] fix optim bwd;
* [fix] fix optim bwd;
* [fix] rm output.data after send fwd;
* [fix] fix bwd step if condition; remove useless comments and format info;
* [fix] fix mem check;
* [fix] fix mem assertation
* [fix] fix mem; use a new model shape; only assert mem less and equal than theo;
* [fix] fix model zoo import;
* [feat] moehybrid support zerobubble;
* [fix] fix zerobubble pp for shardformer type input;
* [fix] fix require_grad & deallocate call;
* [fix] fix mem assert;
* [fix] fix fwd branch, fwd pass both micro_batch & internal_inputs'
* [fix] fix pipeline util func deallocate --> release_tensor_data; fix bwd_b loss bwd branch;
* [fix] fix zerobubble; support shardformer model type;
* [fix] fix test_pipeline_utils ci;
* [plugin] hybrid support zero bubble pipeline (#6060)
* hybrid support zbv
* fix
fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* Update zero_bubble_pp.py
* fix
* fix-ci
* fix
[pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* fix
* [zerobubble]Support ZeroBubble Pipeline (#6034)
* [feat] add zerobubble pp (just a frame now); add POC test for dx_dw; add test for zerobubble;
* [feat] add dw test;
* [fix] fix weight not close;
* [update] update text;
* [feat] add test run_fwd_bwd automatic scheduling;
* [feat] split communication and calculation; fix pop empty send_bwd_buffer error;
* [feat] add test for p & p grad;
* [feat] add comments for ZBV func;
* [fix] rm useless assign and comments;
* [fix] fix ci test; add pytest;
* [feat] add run_fwd_bwd_with_microbatch (replace input) & test; add p&p.grad assert close test & all pass;
* [feat] add apply v_schedule graph; p & p.grad assert err exist;
* [fix] update
* [feat] fix ci; add assert;
* [feat] fix poc format
* [feat] fix func name & ci; add comments;
* [fix] fix poc test; add comments in poc;
* [feat] add optim backward_b_by_grad
* [feat] fix optimizer bwd b & w; support return accum loss & output
* [feat] add fwd_bwd_step, run_fwd_only;
* [fix] fix optim bwd; add license for v_schedule; remove redundant attributes; fix schedule loop "while"--> "for"; add communication dict;
* [fix] fix communication_map;
* [feat] update test; rm comments;
* [fix] rm zbv in hybridplugin
* [fix] fix optim bwd;
* [fix] fix optim bwd;
* [fix] rm output.data after send fwd;
* [fix] fix bwd step if condition; remove useless comments and format info;
* [fix] fix detach output & release output;
* [fix] rm requir_grad for output;
* [fix] fix requir grad position and detach position and input&output local buffer append position;
* [feat] add memory assertation;
* [fix] fix mem check;
* [fix] mem assertation'
* [fix] fix mem assertation
* [fix] fix mem; use a new model shape; only assert mem less and equal than theo;
* [fix] fix model zoo import;
* [fix] fix redundant detach & clone; add buffer assertation in the end;
* [fix] add output_obj_grad assert None at bwd b step; replace input_obj.require_grad_ with treemap;
* [fix] update optim state dict assert (include param group & state); fix mem assert after add optim;
* [fix] add testcase with microbatch 4;
* hybrid support zbv
* fix
fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* Update zero_bubble_pp.py
* fix
* fix-ci
* fix
[pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* fix
* fix
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* fix
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: duanjunwen <935724073@qq.com>
* [feat] add zerobubble pp (just a frame now); add POC test for dx_dw; add test for zerobubble;
* [update] update text;
* [feat] add test run_fwd_bwd automatic scheduling;
* [feat] fix poc format
* [fix] fix poc test; add comments in poc;
* [feat] add optim backward_b_by_grad
* [feat] fix optimizer bwd b & w; support return accum loss & output
* [fix] fix optim bwd; add license for v_schedule; remove redundant attributes; fix schedule loop "while"--> "for"; add communication dict;
* [feat] update test; rm comments;
* [fix] fix optim bwd;
* [fix] fix optim bwd;
* [fix] rm output.data after send fwd;
* [fix] fix bwd step if condition; remove useless comments and format info;
* [fix] fix mem check;
* [fix] fix mem assertation
* [fix] fix mem; use a new model shape; only assert mem less and equal than theo;
* [fix] fix model zoo import;
* [fix] fix mem assert;
* [fix] fix fwd branch, fwd pass both micro_batch & internal_inputs'
* [plugin] hybrid support zero bubble pipeline (#6060)
* hybrid support zbv
* fix
fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* Update zero_bubble_pp.py
* fix
* fix-ci
* fix
[pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* fix
* [zerobubble]Support ZeroBubble Pipeline (#6034)
* [feat] add zerobubble pp (just a frame now); add POC test for dx_dw; add test for zerobubble;
* [feat] add dw test;
* [fix] fix weight not close;
* [update] update text;
* [feat] add test run_fwd_bwd automatic scheduling;
* [feat] split communication and calculation; fix pop empty send_bwd_buffer error;
* [feat] add test for p & p grad;
* [feat] add comments for ZBV func;
* [fix] rm useless assign and comments;
* [fix] fix ci test; add pytest;
* [feat] add run_fwd_bwd_with_microbatch (replace input) & test; add p&p.grad assert close test & all pass;
* [feat] add apply v_schedule graph; p & p.grad assert err exist;
* [fix] update
* [feat] fix ci; add assert;
* [feat] fix poc format
* [feat] fix func name & ci; add comments;
* [fix] fix poc test; add comments in poc;
* [feat] add optim backward_b_by_grad
* [feat] fix optimizer bwd b & w; support return accum loss & output
* [feat] add fwd_bwd_step, run_fwd_only;
* [fix] fix optim bwd; add license for v_schedule; remove redundant attributes; fix schedule loop "while"--> "for"; add communication dict;
* [fix] fix communication_map;
* [feat] update test; rm comments;
* [fix] rm zbv in hybridplugin
* [fix] fix optim bwd;
* [fix] fix optim bwd;
* [fix] rm output.data after send fwd;
* [fix] fix bwd step if condition; remove useless comments and format info;
* [fix] fix detach output & release output;
* [fix] rm requir_grad for output;
* [fix] fix requir grad position and detach position and input&output local buffer append position;
* [feat] add memory assertation;
* [fix] fix mem check;
* [fix] mem assertation'
* [fix] fix mem assertation
* [fix] fix mem; use a new model shape; only assert mem less and equal than theo;
* [fix] fix model zoo import;
* [fix] fix redundant detach & clone; add buffer assertation in the end;
* [fix] add output_obj_grad assert None at bwd b step; replace input_obj.require_grad_ with treemap;
* [fix] update optim state dict assert (include param group & state); fix mem assert after add optim;
* [fix] add testcase with microbatch 4;
* hybrid support zbv
* fix
fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* Update zero_bubble_pp.py
* fix
* fix-ci
* fix
[pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* fix
* fix
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* fix
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: duanjunwen <935724073@qq.com>
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: HangXu <hangxu0304@gmail.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: GuangyaoZhang <xjtu521@qq.com>
Co-authored-by: Hongxin Liu <lhx0217@gmail.com>
Co-authored-by: YeAnbang <anbangy2@outlook.com>
Co-authored-by: Haze188 <haze188@qq.com>
Co-authored-by: Edenzzzz <wenxuan.tan@wisc.edu>
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: Runyu Lu <77330637+LRY89757@users.noreply.github.com>
Co-authored-by: YeAnbang <44796419+YeAnbang@users.noreply.github.com>
Co-authored-by: Stephan Kö <stephankoe@users.noreply.github.com>
Co-authored-by: アマデウス <kurisusnowdeng@users.noreply.github.com>
Co-authored-by: Tong Li <tong.li352711588@gmail.com>
Co-authored-by: zhurunhua <1281592874@qq.com>
Co-authored-by: Insu Jang <insujang@umich.edu>
Co-authored-by: Gao, Ruiyuan <905370712@qq.com>
Co-authored-by: hxwang <wang1570@e.ntu.edu.sg>
Co-authored-by: Michelle <qianranma8@gmail.com>
Co-authored-by: Wang Binluo <32676639+wangbluo@users.noreply.github.com>
Co-authored-by: wangbluo <2538539015@qq.com>
Co-authored-by: root <root@notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9-0.notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9.colossal-ai.svc.cluster.local>
Co-authored-by: duanjunwen <935724073@qq.com>
Co-authored-by: Camille Zhong <44392324+Camille7777@users.noreply.github.com>
2024-10-08 07:58:00 +00:00
|
|
|
if sp_mode in ["all_to_all"] and self.training:
|
|
|
|
if use_cache:
|
|
|
|
logger.warning_once(
|
|
|
|
"`use_cache=True` is incompatible with sp mode `{sp_mode}`. Setting `use_cache=False`..."
|
|
|
|
)
|
|
|
|
use_cache = False
|
2024-07-10 03:34:25 +00:00
|
|
|
if sp_mode in ["all_to_all"] and self.training:
|
|
|
|
if use_cache:
|
|
|
|
logger.warning_once(
|
|
|
|
"`use_cache=True` is incompatible with sp mode `{sp_mode}`. Setting `use_cache=False`..."
|
|
|
|
)
|
|
|
|
use_cache = False
|
2023-08-22 15:59:31 +00:00
|
|
|
# Run encoder.
|
|
|
|
# [seq_len, batch_size, hidden_size] -> [seq_len/TP_size, batch_size, hidden_size]
|
2024-07-10 03:34:25 +00:00
|
|
|
if sp_mode in ["split_gather"]:
|
|
|
|
inputs_embeds = split_forward_gather_backward(
|
|
|
|
inputs_embeds,
|
|
|
|
dim=0,
|
|
|
|
process_group=sp_group,
|
[zerobubble] rebase main (#6075)
* fp8 operators for compressed communication
cast_to_fp8, cast_from_fp8, all_reduce_fp8
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix typo
* fix scaling algorithm in FP8 casting
* support fp8 communication in pipeline parallelism
* add fp8_communication flag in the script
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* shardformer fp8
* fix rebase
* remove all to all
* fix shardformer fp8 communication training degradation
* [fp8] support all-gather flat tensor (#5932)
* [fp8] add fp8 comm for low level zero
* [test] add zero fp8 test case
* [Feature] llama shardformer fp8 support (#5938)
* add llama shardformer fp8
* Llama Shardformer Parity
* fix typo
* fix all reduce
* fix pytest failure
* fix reduce op and move function to fp8.py
* fix typo
* [FP8] rebase main (#5963)
* add SimPO
* fix dataloader
* remove debug code
* add orpo
* fix style
* fix colossalai, transformers version
* fix colossalai, transformers version
* fix colossalai, transformers version
* fix torch colossalai version
* update transformers version
* [shardformer] DeepseekMoE support (#5871)
* [Feature] deepseek moe expert parallel implement
* [misc] fix typo, remove redundant file (#5867)
* [misc] fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [Feature] deepseek support & unit test
* [misc] remove debug code & useless print
* [misc] fix typos (#5872)
* [Feature] remove modeling file, use auto config. (#5884)
* [misc] fix typos
* [Feature] deepseek support via auto model, remove modeling file
* [misc] delete useless file
* [misc] fix typos
* [Deepseek] remove redundant code (#5888)
* [misc] fix typos
* [Feature] deepseek support via auto model, remove modeling file
* [misc] delete useless file
* [misc] fix typos
* [misc] remove redundant code
* [Feature/deepseek] resolve comment. (#5889)
* [misc] fix typos
* [Feature] deepseek support via auto model, remove modeling file
* [misc] delete useless file
* [misc] fix typos
* [misc] remove redundant code
* [misc] mv module replacement into if branch
* [misc] add some warning message and modify some code in unit test
* [misc] fix typos
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [Hoxfix] Fix CUDA_DEVICE_MAX_CONNECTIONS for comm overlap
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [Feat] Diffusion Model(PixArtAlpha/StableDiffusion3) Support (#5838)
* Diffusion Model Inference support
* Stable Diffusion 3 Support
* pixartalpha support
* [HotFix] CI,import,requirements-test for #5838 (#5892)
* [Hot Fix] CI,import,requirements-test
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [Feature] Enable PP + SP for llama (#5868)
* fix cross-PP-stage position id length diff bug
* fix typo
* fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* use a one cross entropy func for all shardformer models
---------
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [ShardFormer] Add Ulysses Sequence Parallelism support for Command-R, Qwen2 and ChatGLM (#5897)
* add benchmark for sft, dpo, simpo, orpo. Add benchmarking result. Support lora with gradient checkpoint
* fix style
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix eval
* hotfix citation
* [zero] support all-gather overlap (#5898)
* [zero] support all-gather overlap
* [zero] add overlap all-gather flag
* [misc] fix typo
* [zero] update api
* fix orpo cross entropy loss
* [Auto Parallel]: Speed up intra-op plan generation by 44% (#5446)
* Remove unnecessary calls to deepcopy
* Build DimSpec's difference dict only once
This change considerably speeds up construction speed of DimSpec objects. The difference_dict is the same for each DimSpec object, so a single copy of it is enough.
* Fix documentation of DimSpec's difference method
* [ShardFormer] fix qwen2 sp (#5903)
* [compatibility] support torch 2.2 (#5875)
* Support Pytorch 2.2.2
* keep build_on_pr file and update .compatibility
* fix object_to_tensor usage when torch>=2.3.0 (#5820)
* [misc] support torch2.3 (#5893)
* [misc] support torch2.3
* [devops] update compatibility ci
* [devops] update compatibility ci
* [devops] add debug
* [devops] add debug
* [devops] add debug
* [devops] add debug
* [devops] remove debug
* [devops] remove debug
* [release] update version (#5912)
* [plugin] support all-gather overlap for hybrid parallel (#5919)
* [plugin] fixed all-gather overlap support for hybrid parallel
* add kto
* fix style, add kto data sample
* [Examples] Add lazy init to OPT and GPT examples (#5924)
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [ColossalChat] Hotfix for ColossalChat (#5910)
* add ignore and tiny llama
* fix path issue
* run style
* fix issue
* update bash
* add ignore and tiny llama
* fix path issue
* run style
* fix issue
* update bash
* fix ddp issue
* add Qwen 1.5 32B
* refactor tokenization
* [FIX BUG] UnboundLocalError: cannot access local variable 'default_conversation' where it is not associated with a value (#5931)
* cannot access local variable 'default_conversation' where it is not associated with a value
set default value for 'default_conversation'
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* fix test data
* refactor evaluation
* remove real data path
* remove real data path
* Add n_fused as an input from native_module (#5894)
* [FIX BUG] convert env param to int in (#5934)
* [Hotfix] Fix ZeRO typo #5936
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [Feature] Add a switch to control whether the model checkpoint needs to be saved after each epoch ends (#5941)
* Add a switch to control whether the model checkpoint needs to be saved after each epoch ends
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* fix style
* fix style
* fix style
* [shardformer] hotfix attn mask (#5945)
* [shardformer] hotfix attn mask (#5947)
* [Feat] Distrifusion Acceleration Support for Diffusion Inference (#5895)
* Distrifusion Support source
* comp comm overlap optimization
* sd3 benchmark
* pixart distrifusion bug fix
* sd3 bug fix and benchmark
* generation bug fix
* naming fix
* add docstring, fix counter and shape error
* add reference
* readme and requirement
* [zero] hotfix update master params (#5951)
* [release] update version (#5952)
* [Chat] Fix lora (#5946)
* fix merging
* remove filepath
* fix style
* Update README.md (#5958)
* [hotfix] Remove unused plan section (#5957)
* remove readme
* fix readme
* update
* [test] add mixtral for sequence classification
* [test] add mixtral transformer test
* [moe] fix plugin
* [test] mixtra pp shard test
* [chore] handle non member group
* [zero] solve hang
* [test] pass mixtral shardformer test
* [moe] implement transit between non moe tp and ep
* [zero] solve hang
* [misc] solve booster hang by rename the variable
* solve hang when parallel mode = pp + dp
* [moe] implement submesh initialization
* [moe] add mixtral dp grad scaling when not all experts are activated
* [chore] manually revert unintended commit
* [chore] trivial fix
* [chore] arg pass & remove drop token
* [test] add mixtral modelling test
* [moe] implement tp
* [moe] test deepseek
* [moe] clean legacy code
* [Feature] MoE Ulysses Support (#5918)
* moe sp support
* moe sp bug solve
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [chore] minor fix
* [moe] init moe plugin comm setting with sp
* moe sp + ep bug fix
* [moe] finalize test (no pp)
* [moe] full test for deepseek and mixtral (pp + sp to fix)
* [chore] minor fix after rebase
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* [chore] solve moe ckpt test failure and some other arg pass failure
* [moe] remove ops
* [test] fix test: test_zero1_2
* [bug] fix: somehow logger hangs the program
* [moe] deepseek moe sp support
* [test] add check
* [deepseek] replace attn (a workaround for bug in transformers)
* [misc] skip redunant test
* [misc] remove debug/print code
* [moe] refactor mesh assignment
* Revert "[moe] implement submesh initialization"
This reverts commit 2f9bce6686d1415a83d5726dc5ff02222c742582.
* [chore] change moe_pg_mesh to private
* [misc] remove incompatible test config
* [misc] fix ci failure: change default value to false in moe plugin
* [misc] remove useless condition
* [chore] docstring
* [moe] remove force_overlap_comm flag and add warning instead
* [doc] add MoeHybridParallelPlugin docstring
* [moe] solve dp axis issue
* [chore] remove redundant test case, print string & reduce test tokens
* [feat] Dist Loader for Eval (#5950)
* support auto distributed data loader
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* support auto distributed data loader
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix tp error
* remove unused parameters
* remove unused
* update inference
* update docs
* update inference
---------
Co-authored-by: Michelle <qianranma8@gmail.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [lora] lora support hybrid parallel plugin (#5956)
* lora support hybrid plugin
* fix
* fix
* fix
* fix
* fp8 operators for compressed communication
cast_to_fp8, cast_from_fp8, all_reduce_fp8
* fix scaling algorithm in FP8 casting
* support fp8 communication in pipeline parallelism
* add fp8_communication flag in the script
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* shardformer fp8
* fix rebase
* remove all to all
* fix shardformer fp8 communication training degradation
* [fp8] support all-gather flat tensor (#5932)
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* Update low_level_optim.py
---------
Co-authored-by: YeAnbang <anbangy2@outlook.com>
Co-authored-by: Haze188 <haze188@qq.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Edenzzzz <wenxuan.tan@wisc.edu>
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: Runyu Lu <77330637+LRY89757@users.noreply.github.com>
Co-authored-by: Guangyao Zhang <xjtu521@qq.com>
Co-authored-by: YeAnbang <44796419+YeAnbang@users.noreply.github.com>
Co-authored-by: Hongxin Liu <lhx0217@gmail.com>
Co-authored-by: Stephan Kö <stephankoe@users.noreply.github.com>
Co-authored-by: アマデウス <kurisusnowdeng@users.noreply.github.com>
Co-authored-by: Tong Li <tong.li352711588@gmail.com>
Co-authored-by: zhurunhua <1281592874@qq.com>
Co-authored-by: Insu Jang <insujang@umich.edu>
Co-authored-by: Gao, Ruiyuan <905370712@qq.com>
Co-authored-by: hxwang <wang1570@e.ntu.edu.sg>
Co-authored-by: Michelle <qianranma8@gmail.com>
Co-authored-by: Wang Binluo <32676639+wangbluo@users.noreply.github.com>
Co-authored-by: HangXu <hangxu0304@gmail.com>
* [fp8]support all2all fp8 (#5953)
* support all2all fp8
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [fp8] add fp8 linear (#5967)
* [fp8] add fp8 linear
* [test] fix fp8 linear test condition
* [test] fix fp8 linear test condition
* [test] fix fp8 linear test condition
* [fp8] support fp8 amp for hybrid parallel plugin (#5975)
* [fp8] support fp8 amp for hybrid parallel plugin
* [test] add fp8 hook test
* [fp8] fix fp8 linear compatibility
* fix (#5976)
* [Feature]: support FP8 communication in DDP, FSDP, Gemini (#5928)
* support fp8_communication in the Torch DDP grad comm, FSDP grad comm, and FSDP params comm
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* implement communication hook for FSDP params all-gather
* added unit test for fp8 operators
* support fp8 communication in GeminiPlugin
* update training scripts to support fsdp and fp8 communication
* fixed some minor bugs observed in unit test
* add all_gather_into_tensor_flat_fp8
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* add skip the test if torch < 2.2.0
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* add skip the test if torch < 2.2.0
* add skip the test if torch < 2.2.0
* add fp8_comm flag
* rebase latest fp8 operators
* rebase latest fp8 operators
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [test ci]Feature/fp8 comm (#5981)
* fix
* fix
* fix
* [fp8] support gemini plugin (#5978)
* [fp8] refactor hook
* [fp8] support gemini plugin
* [example] add fp8 option for llama benchmark
* [fp8] use torch compile (torch >= 2.3.0) (#5979)
* [fp8] use torch compile (torch >= 2.4.0)
* [fp8] set use_fast_accum in linear
* [chore] formal version check
* [chore] fix sig
* [fp8]Moe support fp8 communication (#5977)
* fix
* support moe fp8
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
fix
fi
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [fp8] support hybrid parallel plugin (#5982)
* support fp8 comm for qwen2 model
* support fp8 comm for qwen2 model
* support fp8 comm for qwen2 model
* fp8
* fix
* bert and bloom
* chatglm and command
* gpt2,gptj,bert, falcon,blip2
* mistral,opy,sam,t5,vit,whisper
* fix
* fix
* fix
* [fp8] refactor fp8 linear with compile (#5993)
* [fp8] refactor fp8 linear with compile
* [fp8] fix linear test
* [fp8] fix linear test
* [fp8] support asynchronous FP8 communication (#5997)
* fix
* fix
* fix
* support async all2all
* support async op for all gather
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [fp8] update torch.compile for linear_fp8 to >= 2.4.0 (#6004)
* [fp8] linear perf enhancement
* [fp8]update reduce-scatter test (#6002)
* fix
* fix
* fix
* fix
* [fp8] add use_fp8 option for MoeHybridParallelPlugin (#6009)
* [fp8] zero support fp8 linear. (#6006)
* fix
* fix
* fix
* zero fp8
* zero fp8
* Update requirements.txt
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix the merge
* fix the merge
* fix the merge
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix the merge
* fix
* fix
* fix the merge
* fix
* fix
* fix
* fix
* fix
* fix the merge
* fix
* fix
* fix
* fix
* [fp8] Merge feature/fp8_comm to main branch of Colossalai (#6016)
* add SimPO
* fix dataloader
* remove debug code
* add orpo
* fix style
* fix colossalai, transformers version
* fix colossalai, transformers version
* fix colossalai, transformers version
* fix torch colossalai version
* update transformers version
* [shardformer] DeepseekMoE support (#5871)
* [Feature] deepseek moe expert parallel implement
* [misc] fix typo, remove redundant file (#5867)
* [misc] fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [Feature] deepseek support & unit test
* [misc] remove debug code & useless print
* [misc] fix typos (#5872)
* [Feature] remove modeling file, use auto config. (#5884)
* [misc] fix typos
* [Feature] deepseek support via auto model, remove modeling file
* [misc] delete useless file
* [misc] fix typos
* [Deepseek] remove redundant code (#5888)
* [misc] fix typos
* [Feature] deepseek support via auto model, remove modeling file
* [misc] delete useless file
* [misc] fix typos
* [misc] remove redundant code
* [Feature/deepseek] resolve comment. (#5889)
* [misc] fix typos
* [Feature] deepseek support via auto model, remove modeling file
* [misc] delete useless file
* [misc] fix typos
* [misc] remove redundant code
* [misc] mv module replacement into if branch
* [misc] add some warning message and modify some code in unit test
* [misc] fix typos
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [Hoxfix] Fix CUDA_DEVICE_MAX_CONNECTIONS for comm overlap
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [Feat] Diffusion Model(PixArtAlpha/StableDiffusion3) Support (#5838)
* Diffusion Model Inference support
* Stable Diffusion 3 Support
* pixartalpha support
* [HotFix] CI,import,requirements-test for #5838 (#5892)
* [Hot Fix] CI,import,requirements-test
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [Feature] Enable PP + SP for llama (#5868)
* fix cross-PP-stage position id length diff bug
* fix typo
* fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* use a one cross entropy func for all shardformer models
---------
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [ShardFormer] Add Ulysses Sequence Parallelism support for Command-R, Qwen2 and ChatGLM (#5897)
* add benchmark for sft, dpo, simpo, orpo. Add benchmarking result. Support lora with gradient checkpoint
* fix style
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix eval
* hotfix citation
* [zero] support all-gather overlap (#5898)
* [zero] support all-gather overlap
* [zero] add overlap all-gather flag
* [misc] fix typo
* [zero] update api
* fix orpo cross entropy loss
* [Auto Parallel]: Speed up intra-op plan generation by 44% (#5446)
* Remove unnecessary calls to deepcopy
* Build DimSpec's difference dict only once
This change considerably speeds up construction speed of DimSpec objects. The difference_dict is the same for each DimSpec object, so a single copy of it is enough.
* Fix documentation of DimSpec's difference method
* [ShardFormer] fix qwen2 sp (#5903)
* [compatibility] support torch 2.2 (#5875)
* Support Pytorch 2.2.2
* keep build_on_pr file and update .compatibility
* fix object_to_tensor usage when torch>=2.3.0 (#5820)
* [misc] support torch2.3 (#5893)
* [misc] support torch2.3
* [devops] update compatibility ci
* [devops] update compatibility ci
* [devops] add debug
* [devops] add debug
* [devops] add debug
* [devops] add debug
* [devops] remove debug
* [devops] remove debug
* [release] update version (#5912)
* [plugin] support all-gather overlap for hybrid parallel (#5919)
* [plugin] fixed all-gather overlap support for hybrid parallel
* add kto
* fix style, add kto data sample
* [Examples] Add lazy init to OPT and GPT examples (#5924)
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [ColossalChat] Hotfix for ColossalChat (#5910)
* add ignore and tiny llama
* fix path issue
* run style
* fix issue
* update bash
* add ignore and tiny llama
* fix path issue
* run style
* fix issue
* update bash
* fix ddp issue
* add Qwen 1.5 32B
* refactor tokenization
* [FIX BUG] UnboundLocalError: cannot access local variable 'default_conversation' where it is not associated with a value (#5931)
* cannot access local variable 'default_conversation' where it is not associated with a value
set default value for 'default_conversation'
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* fix test data
* refactor evaluation
* remove real data path
* remove real data path
* Add n_fused as an input from native_module (#5894)
* [FIX BUG] convert env param to int in (#5934)
* [Hotfix] Fix ZeRO typo #5936
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [Feature] Add a switch to control whether the model checkpoint needs to be saved after each epoch ends (#5941)
* Add a switch to control whether the model checkpoint needs to be saved after each epoch ends
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* fix style
* fix style
* fix style
* [shardformer] hotfix attn mask (#5945)
* [shardformer] hotfix attn mask (#5947)
* [Feat] Distrifusion Acceleration Support for Diffusion Inference (#5895)
* Distrifusion Support source
* comp comm overlap optimization
* sd3 benchmark
* pixart distrifusion bug fix
* sd3 bug fix and benchmark
* generation bug fix
* naming fix
* add docstring, fix counter and shape error
* add reference
* readme and requirement
* [zero] hotfix update master params (#5951)
* [release] update version (#5952)
* [Chat] Fix lora (#5946)
* fix merging
* remove filepath
* fix style
* Update README.md (#5958)
* [hotfix] Remove unused plan section (#5957)
* remove readme
* fix readme
* update
* [test] add mixtral for sequence classification
* [test] add mixtral transformer test
* [moe] fix plugin
* [test] mixtra pp shard test
* [chore] handle non member group
* [zero] solve hang
* [test] pass mixtral shardformer test
* [moe] implement transit between non moe tp and ep
* [zero] solve hang
* [misc] solve booster hang by rename the variable
* solve hang when parallel mode = pp + dp
* [moe] implement submesh initialization
* [moe] add mixtral dp grad scaling when not all experts are activated
* [chore] manually revert unintended commit
* [chore] trivial fix
* [chore] arg pass & remove drop token
* [test] add mixtral modelling test
* [moe] implement tp
* [moe] test deepseek
* [moe] clean legacy code
* [Feature] MoE Ulysses Support (#5918)
* moe sp support
* moe sp bug solve
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [chore] minor fix
* [moe] init moe plugin comm setting with sp
* moe sp + ep bug fix
* [moe] finalize test (no pp)
* [moe] full test for deepseek and mixtral (pp + sp to fix)
* [chore] minor fix after rebase
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* [chore] solve moe ckpt test failure and some other arg pass failure
* [moe] remove ops
* [test] fix test: test_zero1_2
* [bug] fix: somehow logger hangs the program
* [moe] deepseek moe sp support
* [test] add check
* [deepseek] replace attn (a workaround for bug in transformers)
* [misc] skip redunant test
* [misc] remove debug/print code
* [moe] refactor mesh assignment
* Revert "[moe] implement submesh initialization"
This reverts commit 2f9bce6686d1415a83d5726dc5ff02222c742582.
* [chore] change moe_pg_mesh to private
* [misc] remove incompatible test config
* [misc] fix ci failure: change default value to false in moe plugin
* [misc] remove useless condition
* [chore] docstring
* [moe] remove force_overlap_comm flag and add warning instead
* [doc] add MoeHybridParallelPlugin docstring
* [moe] solve dp axis issue
* [chore] remove redundant test case, print string & reduce test tokens
* [feat] Dist Loader for Eval (#5950)
* support auto distributed data loader
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* support auto distributed data loader
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix tp error
* remove unused parameters
* remove unused
* update inference
* update docs
* update inference
---------
Co-authored-by: Michelle <qianranma8@gmail.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [lora] lora support hybrid parallel plugin (#5956)
* lora support hybrid plugin
* fix
* fix
* fix
* fix
* Support overall loss, update KTO logging
* [Docs] clarify launch port
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [Hotfix] README link (#5966)
* update ignore
* update readme
* run style
* update readme
* [Hotfix] Avoid fused RMSnorm import error without apex (#5985)
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [Chat] fix readme (#5989)
* fix readme
* fix readme, tokenization fully tested
* fix readme, tokenization fully tested
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: root <root@notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9-0.notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9.colossal-ai.svc.cluster.local>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* fix sync condition (#6000)
* [plugin] add cast inputs option for zero (#6003)
* [pre-commit.ci] pre-commit autoupdate (#5995)
updates:
- [github.com/psf/black-pre-commit-mirror: 24.4.2 → 24.8.0](https://github.com/psf/black-pre-commit-mirror/compare/24.4.2...24.8.0)
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [misc] Bypass the huggingface bug to solve the mask mismatch problem (#5991)
* [Feature] Zigzag Ring attention (#5905)
* halfway
* fix cross-PP-stage position id length diff bug
* fix typo
* fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* unified cross entropy func for all shardformer models
* remove redundant lines
* add basic ring attn; debug cross entropy
* fwd bwd logic complete
* fwd bwd logic complete; add experimental triton rescale
* precision tests passed
* precision tests passed
* fix typos and remove misc files
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* add sp_mode to benchmark; fix varlen interface
* update softmax_lse shape by new interface
* change tester name
* remove buffer clone; support packed seq layout
* add varlen tests
* fix typo
* all tests passed
* add dkv_group; fix mask
* remove debug statements
---------
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [misc] update compatibility (#6008)
* [misc] update compatibility
* [misc] update requirements
* [devops] disable requirements cache
* [test] fix torch ddp test
* [test] fix rerun on address in use
* [test] fix lazy init
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix the merge
* fix the merge
* overlap kv comm with output rescale (#6017)
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* fix the merge
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix the merge
* fix
* fix
* fix the merge
* fix
* [misc] Use dist logger in plugins (#6011)
* use dist logger in plugins
* remove trash
* print on rank 0
---------
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* fix
* fix
* fix
* fix
* fix the merge
* fix
* fix
* fix
* fix
---------
Co-authored-by: YeAnbang <anbangy2@outlook.com>
Co-authored-by: Haze188 <haze188@qq.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Edenzzzz <wenxuan.tan@wisc.edu>
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: Runyu Lu <77330637+LRY89757@users.noreply.github.com>
Co-authored-by: Guangyao Zhang <xjtu521@qq.com>
Co-authored-by: YeAnbang <44796419+YeAnbang@users.noreply.github.com>
Co-authored-by: Hongxin Liu <lhx0217@gmail.com>
Co-authored-by: Stephan Kö <stephankoe@users.noreply.github.com>
Co-authored-by: アマデウス <kurisusnowdeng@users.noreply.github.com>
Co-authored-by: Tong Li <tong.li352711588@gmail.com>
Co-authored-by: zhurunhua <1281592874@qq.com>
Co-authored-by: Insu Jang <insujang@umich.edu>
Co-authored-by: Gao, Ruiyuan <905370712@qq.com>
Co-authored-by: hxwang <wang1570@e.ntu.edu.sg>
Co-authored-by: Michelle <qianranma8@gmail.com>
Co-authored-by: root <root@notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9-0.notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9.colossal-ai.svc.cluster.local>
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* Update train_dpo.py
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* Update low_level_zero_plugin.py
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* [CI] Remove triton version for compatibility bug; update req torch >=2.2 (#6018)
* remove triton version
* remove torch 2.2
* remove torch 2.1
* debug
* remove 2.1 build tests
* require torch >=2.2
---------
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [plugin] hotfix zero plugin (#6036)
* [plugin] hotfix zero plugin
* [plugin] hotfix zero plugin
* [Colossal-LLaMA] Refactor latest APIs (#6030)
* refactor latest code
* update api
* add dummy dataset
* update Readme
* add setup
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* update files
* add PP support
* update arguments
* update argument
* reorg folder
* update version
* remove IB infor
* update utils
* update readme
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* update save for zero
* update save
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* add apex
* update
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* add fused norm (#6038)
* [FP8] unsqueeze scale to make it compatible with torch.compile (#6040)
* [colossalai/checkpoint_io/...] fix bug in load_state_dict_into_model; format error msg (#6020)
* fix bug in load_state_dict_into_model; format error msg
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* Update utils.py
to support checking missing_keys
* Update general_checkpoint_io.py
fix bug in missing_keys error message
* retrigger tests
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [Hotfix] Remove deprecated install (#6042)
* remove deprecated install
* remove unused folder
* [fp8] optimize all-gather (#6043)
* [fp8] optimize all-gather
* [fp8] fix all gather fp8 ring
* [fp8] enable compile
* [fp8] fix all gather fp8 ring
* [fp8] fix linear hook (#6046)
* [fp8] disable all_to_all_fp8 in intranode (#6045)
* enhance all_to_all_fp8 with internode comm control
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* disable some fp8 ops due to performance issue
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [release] update version (#6041)
* [release] update version
* [devops] update comp test
* [devops] update comp test debug
* [devops] debug comp test
* [devops] debug comp test
* [devops] debug comp test
* [devops] debug comp test
* [devops] debug comp test
* [Feature] Split cross-entropy computation in SP (#5959)
* halfway
* fix cross-PP-stage position id length diff bug
* fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* unified cross entropy func for all shardformer models
* remove redundant lines
* add basic ring attn; debug cross entropy
* fwd bwd logic complete
* fwd bwd logic complete; add experimental triton rescale
* precision tests passed
* precision tests passed
* fix typos and remove misc files
* update softmax_lse shape by new interface
* change tester name
* remove buffer clone; support packed seq layout
* add varlen tests
* fix typo
* all tests passed
* add dkv_group; fix mask
* remove debug statements
* adapt chatglm, command-R, qwen
* debug
* halfway
* fix cross-PP-stage position id length diff bug
* fix typo
* fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* unified cross entropy func for all shardformer models
* remove redundant lines
* add basic ring attn; debug cross entropy
* fwd bwd logic complete
* fwd bwd logic complete; add experimental triton rescale
* precision tests passed
* precision tests passed
* fix typos and remove misc files
* add sp_mode to benchmark; fix varlen interface
* update softmax_lse shape by new interface
* add varlen tests
* fix typo
* all tests passed
* add dkv_group; fix mask
* remove debug statements
* add comments
* q1 index only once
* remove events to simplify stream sync
* simplify forward/backward logic
* 2d ring forward passed
* 2d ring backward passed
* fixes
* fix ring attn loss
* 2D ring backward + llama passed
* merge
* update logger
* fix typo
* rebase
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix typo
* remove typos
* fixes
* support GPT
---------
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [hotfix] moe hybrid parallelism benchmark & follow-up fix (#6048)
* [example] pass use_fp8_comm flag to all plugins
* [example] add mixtral benchmark
* [moe] refine assertion and check
* [moe] fix mixtral & add more tests
* [moe] consider checking dp * sp group and moe_dp_group
* [mixtral] remove gate tp & add more tests
* [deepseek] fix tp & sp for deepseek
* [mixtral] minor fix
* [deepseek] add deepseek benchmark
* [fp8] hotfix backward hook (#6053)
* [fp8] hotfix backward hook
* [fp8] hotfix pipeline loss accumulation
* [doc] update sp doc (#6055)
* update sp doc
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* fix the sp
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix the attn
* fix
* fix
* fix
* fix
* [zerobubble]Support ZeroBubble Pipeline (#6034)
* [feat] add zerobubble pp (just a frame now); add POC test for dx_dw; add test for zerobubble;
* [feat] add dw test;
* [fix] fix weight not close;
* [update] update text;
* [feat] add test run_fwd_bwd automatic scheduling;
* [feat] split communication and calculation; fix pop empty send_bwd_buffer error;
* [feat] add test for p & p grad;
* [feat] add comments for ZBV func;
* [fix] rm useless assign and comments;
* [fix] fix ci test; add pytest;
* [feat] add run_fwd_bwd_with_microbatch (replace input) & test; add p&p.grad assert close test & all pass;
* [feat] add apply v_schedule graph; p & p.grad assert err exist;
* [fix] update
* [feat] fix ci; add assert;
* [feat] fix poc format
* [feat] fix func name & ci; add comments;
* [fix] fix poc test; add comments in poc;
* [feat] add optim backward_b_by_grad
* [feat] fix optimizer bwd b & w; support return accum loss & output
* [feat] add fwd_bwd_step, run_fwd_only;
* [fix] fix optim bwd; add license for v_schedule; remove redundant attributes; fix schedule loop "while"--> "for"; add communication dict;
* [fix] fix communication_map;
* [feat] update test; rm comments;
* [fix] rm zbv in hybridplugin
* [fix] fix optim bwd;
* [fix] fix optim bwd;
* [fix] rm output.data after send fwd;
* [fix] fix bwd step if condition; remove useless comments and format info;
* [fix] fix detach output & release output;
* [fix] rm requir_grad for output;
* [fix] fix requir grad position and detach position and input&output local buffer append position;
* [feat] add memory assertation;
* [fix] fix mem check;
* [fix] mem assertation'
* [fix] fix mem assertation
* [fix] fix mem; use a new model shape; only assert mem less and equal than theo;
* [fix] fix model zoo import;
* [fix] fix redundant detach & clone; add buffer assertation in the end;
* [fix] add output_obj_grad assert None at bwd b step; replace input_obj.require_grad_ with treemap;
* [fix] update optim state dict assert (include param group & state); fix mem assert after add optim;
* [fix] add testcase with microbatch 4;
* [fp8] fix missing fp8_comm flag in mixtral (#6057)
* fix
* fix
* fix
* [fp8] Disable all_gather intranode. Disable Redundant all_gather fp8 (#6059)
* all_gather only internode, fix pytest
* fix cuda arch <89 compile pytest error
* fix pytest failure
* disable all_gather_into_tensor_flat_fp8
* fix fp8 format
* fix pytest
* fix conversations
* fix chunk tuple to list
* [doc] FP8 training and communication document (#6050)
* Add FP8 training and communication document
* add fp8 docstring for plugins
* fix typo
* fix typo
* fix
* fix
* [moe] add parallel strategy for shared_expert && fix test for deepseek (#6063)
* [ColossalEval] support for vllm (#6056)
* support vllm
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* modify vllm and update readme
* run pre-commit
* remove dupilicated lines and refine code
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* update param name
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* refine code
* update readme
* refine code
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [release] update version (#6062)
* [feat] add zerobubble pp (just a frame now); add POC test for dx_dw; add test for zerobubble;
* [update] update text;
* [feat] add test run_fwd_bwd automatic scheduling;
* [feat] fix poc format
* [fix] fix poc test; add comments in poc;
* [feat] add optim backward_b_by_grad
* [feat] fix optimizer bwd b & w; support return accum loss & output
* [fix] fix optim bwd; add license for v_schedule; remove redundant attributes; fix schedule loop "while"--> "for"; add communication dict;
* [feat] update test; rm comments;
* [fix] rm zbv in hybridplugin
* [fix] fix optim bwd;
* [fix] fix optim bwd;
* [fix] rm output.data after send fwd;
* [fix] fix bwd step if condition; remove useless comments and format info;
* [fix] fix mem check;
* [fix] fix mem assertation
* [fix] fix mem; use a new model shape; only assert mem less and equal than theo;
* [fix] fix model zoo import;
* [feat] moehybrid support zerobubble;
* [fix] fix zerobubble pp for shardformer type input;
* [fix] fix require_grad & deallocate call;
* [fix] fix mem assert;
* [fix] fix fwd branch, fwd pass both micro_batch & internal_inputs'
* [fix] fix pipeline util func deallocate --> release_tensor_data; fix bwd_b loss bwd branch;
* [fix] fix zerobubble; support shardformer model type;
* [fix] fix test_pipeline_utils ci;
* [plugin] hybrid support zero bubble pipeline (#6060)
* hybrid support zbv
* fix
fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* Update zero_bubble_pp.py
* fix
* fix-ci
* fix
[pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* fix
* [zerobubble]Support ZeroBubble Pipeline (#6034)
* [feat] add zerobubble pp (just a frame now); add POC test for dx_dw; add test for zerobubble;
* [feat] add dw test;
* [fix] fix weight not close;
* [update] update text;
* [feat] add test run_fwd_bwd automatic scheduling;
* [feat] split communication and calculation; fix pop empty send_bwd_buffer error;
* [feat] add test for p & p grad;
* [feat] add comments for ZBV func;
* [fix] rm useless assign and comments;
* [fix] fix ci test; add pytest;
* [feat] add run_fwd_bwd_with_microbatch (replace input) & test; add p&p.grad assert close test & all pass;
* [feat] add apply v_schedule graph; p & p.grad assert err exist;
* [fix] update
* [feat] fix ci; add assert;
* [feat] fix poc format
* [feat] fix func name & ci; add comments;
* [fix] fix poc test; add comments in poc;
* [feat] add optim backward_b_by_grad
* [feat] fix optimizer bwd b & w; support return accum loss & output
* [feat] add fwd_bwd_step, run_fwd_only;
* [fix] fix optim bwd; add license for v_schedule; remove redundant attributes; fix schedule loop "while"--> "for"; add communication dict;
* [fix] fix communication_map;
* [feat] update test; rm comments;
* [fix] rm zbv in hybridplugin
* [fix] fix optim bwd;
* [fix] fix optim bwd;
* [fix] rm output.data after send fwd;
* [fix] fix bwd step if condition; remove useless comments and format info;
* [fix] fix detach output & release output;
* [fix] rm requir_grad for output;
* [fix] fix requir grad position and detach position and input&output local buffer append position;
* [feat] add memory assertation;
* [fix] fix mem check;
* [fix] mem assertation'
* [fix] fix mem assertation
* [fix] fix mem; use a new model shape; only assert mem less and equal than theo;
* [fix] fix model zoo import;
* [fix] fix redundant detach & clone; add buffer assertation in the end;
* [fix] add output_obj_grad assert None at bwd b step; replace input_obj.require_grad_ with treemap;
* [fix] update optim state dict assert (include param group & state); fix mem assert after add optim;
* [fix] add testcase with microbatch 4;
* hybrid support zbv
* fix
fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* Update zero_bubble_pp.py
* fix
* fix-ci
* fix
[pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* fix
* fix
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* fix
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: duanjunwen <935724073@qq.com>
* [feat] add zerobubble pp (just a frame now); add POC test for dx_dw; add test for zerobubble;
* [update] update text;
* [feat] add test run_fwd_bwd automatic scheduling;
* [feat] fix poc format
* [fix] fix poc test; add comments in poc;
* [feat] add optim backward_b_by_grad
* [feat] fix optimizer bwd b & w; support return accum loss & output
* [fix] fix optim bwd; add license for v_schedule; remove redundant attributes; fix schedule loop "while"--> "for"; add communication dict;
* [feat] update test; rm comments;
* [fix] fix optim bwd;
* [fix] fix optim bwd;
* [fix] rm output.data after send fwd;
* [fix] fix bwd step if condition; remove useless comments and format info;
* [fix] fix mem check;
* [fix] fix mem assertation
* [fix] fix mem; use a new model shape; only assert mem less and equal than theo;
* [fix] fix model zoo import;
* [fix] fix mem assert;
* [fix] fix fwd branch, fwd pass both micro_batch & internal_inputs'
* [plugin] hybrid support zero bubble pipeline (#6060)
* hybrid support zbv
* fix
fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* Update zero_bubble_pp.py
* fix
* fix-ci
* fix
[pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* fix
* [zerobubble]Support ZeroBubble Pipeline (#6034)
* [feat] add zerobubble pp (just a frame now); add POC test for dx_dw; add test for zerobubble;
* [feat] add dw test;
* [fix] fix weight not close;
* [update] update text;
* [feat] add test run_fwd_bwd automatic scheduling;
* [feat] split communication and calculation; fix pop empty send_bwd_buffer error;
* [feat] add test for p & p grad;
* [feat] add comments for ZBV func;
* [fix] rm useless assign and comments;
* [fix] fix ci test; add pytest;
* [feat] add run_fwd_bwd_with_microbatch (replace input) & test; add p&p.grad assert close test & all pass;
* [feat] add apply v_schedule graph; p & p.grad assert err exist;
* [fix] update
* [feat] fix ci; add assert;
* [feat] fix poc format
* [feat] fix func name & ci; add comments;
* [fix] fix poc test; add comments in poc;
* [feat] add optim backward_b_by_grad
* [feat] fix optimizer bwd b & w; support return accum loss & output
* [feat] add fwd_bwd_step, run_fwd_only;
* [fix] fix optim bwd; add license for v_schedule; remove redundant attributes; fix schedule loop "while"--> "for"; add communication dict;
* [fix] fix communication_map;
* [feat] update test; rm comments;
* [fix] rm zbv in hybridplugin
* [fix] fix optim bwd;
* [fix] fix optim bwd;
* [fix] rm output.data after send fwd;
* [fix] fix bwd step if condition; remove useless comments and format info;
* [fix] fix detach output & release output;
* [fix] rm requir_grad for output;
* [fix] fix requir grad position and detach position and input&output local buffer append position;
* [feat] add memory assertation;
* [fix] fix mem check;
* [fix] mem assertation'
* [fix] fix mem assertation
* [fix] fix mem; use a new model shape; only assert mem less and equal than theo;
* [fix] fix model zoo import;
* [fix] fix redundant detach & clone; add buffer assertation in the end;
* [fix] add output_obj_grad assert None at bwd b step; replace input_obj.require_grad_ with treemap;
* [fix] update optim state dict assert (include param group & state); fix mem assert after add optim;
* [fix] add testcase with microbatch 4;
* hybrid support zbv
* fix
fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* Update zero_bubble_pp.py
* fix
* fix-ci
* fix
[pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* fix
* fix
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* fix
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: duanjunwen <935724073@qq.com>
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: HangXu <hangxu0304@gmail.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: GuangyaoZhang <xjtu521@qq.com>
Co-authored-by: Hongxin Liu <lhx0217@gmail.com>
Co-authored-by: YeAnbang <anbangy2@outlook.com>
Co-authored-by: Haze188 <haze188@qq.com>
Co-authored-by: Edenzzzz <wenxuan.tan@wisc.edu>
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: Runyu Lu <77330637+LRY89757@users.noreply.github.com>
Co-authored-by: YeAnbang <44796419+YeAnbang@users.noreply.github.com>
Co-authored-by: Stephan Kö <stephankoe@users.noreply.github.com>
Co-authored-by: アマデウス <kurisusnowdeng@users.noreply.github.com>
Co-authored-by: Tong Li <tong.li352711588@gmail.com>
Co-authored-by: zhurunhua <1281592874@qq.com>
Co-authored-by: Insu Jang <insujang@umich.edu>
Co-authored-by: Gao, Ruiyuan <905370712@qq.com>
Co-authored-by: hxwang <wang1570@e.ntu.edu.sg>
Co-authored-by: Michelle <qianranma8@gmail.com>
Co-authored-by: Wang Binluo <32676639+wangbluo@users.noreply.github.com>
Co-authored-by: wangbluo <2538539015@qq.com>
Co-authored-by: root <root@notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9-0.notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9.colossal-ai.svc.cluster.local>
Co-authored-by: duanjunwen <935724073@qq.com>
Co-authored-by: Camille Zhong <44392324+Camille7777@users.noreply.github.com>
2024-10-08 07:58:00 +00:00
|
|
|
fp8_communication=shard_config.fp8_communication,
|
2024-07-10 03:34:25 +00:00
|
|
|
)
|
|
|
|
elif sp_mode == "all_to_all":
|
|
|
|
inputs_embeds = split_forward_gather_backward(
|
|
|
|
inputs_embeds,
|
|
|
|
dim=0,
|
|
|
|
process_group=sp_group,
|
|
|
|
grad_scale=1 / sp_size,
|
[zerobubble] rebase main (#6075)
* fp8 operators for compressed communication
cast_to_fp8, cast_from_fp8, all_reduce_fp8
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix typo
* fix scaling algorithm in FP8 casting
* support fp8 communication in pipeline parallelism
* add fp8_communication flag in the script
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* shardformer fp8
* fix rebase
* remove all to all
* fix shardformer fp8 communication training degradation
* [fp8] support all-gather flat tensor (#5932)
* [fp8] add fp8 comm for low level zero
* [test] add zero fp8 test case
* [Feature] llama shardformer fp8 support (#5938)
* add llama shardformer fp8
* Llama Shardformer Parity
* fix typo
* fix all reduce
* fix pytest failure
* fix reduce op and move function to fp8.py
* fix typo
* [FP8] rebase main (#5963)
* add SimPO
* fix dataloader
* remove debug code
* add orpo
* fix style
* fix colossalai, transformers version
* fix colossalai, transformers version
* fix colossalai, transformers version
* fix torch colossalai version
* update transformers version
* [shardformer] DeepseekMoE support (#5871)
* [Feature] deepseek moe expert parallel implement
* [misc] fix typo, remove redundant file (#5867)
* [misc] fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [Feature] deepseek support & unit test
* [misc] remove debug code & useless print
* [misc] fix typos (#5872)
* [Feature] remove modeling file, use auto config. (#5884)
* [misc] fix typos
* [Feature] deepseek support via auto model, remove modeling file
* [misc] delete useless file
* [misc] fix typos
* [Deepseek] remove redundant code (#5888)
* [misc] fix typos
* [Feature] deepseek support via auto model, remove modeling file
* [misc] delete useless file
* [misc] fix typos
* [misc] remove redundant code
* [Feature/deepseek] resolve comment. (#5889)
* [misc] fix typos
* [Feature] deepseek support via auto model, remove modeling file
* [misc] delete useless file
* [misc] fix typos
* [misc] remove redundant code
* [misc] mv module replacement into if branch
* [misc] add some warning message and modify some code in unit test
* [misc] fix typos
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [Hoxfix] Fix CUDA_DEVICE_MAX_CONNECTIONS for comm overlap
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [Feat] Diffusion Model(PixArtAlpha/StableDiffusion3) Support (#5838)
* Diffusion Model Inference support
* Stable Diffusion 3 Support
* pixartalpha support
* [HotFix] CI,import,requirements-test for #5838 (#5892)
* [Hot Fix] CI,import,requirements-test
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [Feature] Enable PP + SP for llama (#5868)
* fix cross-PP-stage position id length diff bug
* fix typo
* fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* use a one cross entropy func for all shardformer models
---------
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [ShardFormer] Add Ulysses Sequence Parallelism support for Command-R, Qwen2 and ChatGLM (#5897)
* add benchmark for sft, dpo, simpo, orpo. Add benchmarking result. Support lora with gradient checkpoint
* fix style
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix eval
* hotfix citation
* [zero] support all-gather overlap (#5898)
* [zero] support all-gather overlap
* [zero] add overlap all-gather flag
* [misc] fix typo
* [zero] update api
* fix orpo cross entropy loss
* [Auto Parallel]: Speed up intra-op plan generation by 44% (#5446)
* Remove unnecessary calls to deepcopy
* Build DimSpec's difference dict only once
This change considerably speeds up construction speed of DimSpec objects. The difference_dict is the same for each DimSpec object, so a single copy of it is enough.
* Fix documentation of DimSpec's difference method
* [ShardFormer] fix qwen2 sp (#5903)
* [compatibility] support torch 2.2 (#5875)
* Support Pytorch 2.2.2
* keep build_on_pr file and update .compatibility
* fix object_to_tensor usage when torch>=2.3.0 (#5820)
* [misc] support torch2.3 (#5893)
* [misc] support torch2.3
* [devops] update compatibility ci
* [devops] update compatibility ci
* [devops] add debug
* [devops] add debug
* [devops] add debug
* [devops] add debug
* [devops] remove debug
* [devops] remove debug
* [release] update version (#5912)
* [plugin] support all-gather overlap for hybrid parallel (#5919)
* [plugin] fixed all-gather overlap support for hybrid parallel
* add kto
* fix style, add kto data sample
* [Examples] Add lazy init to OPT and GPT examples (#5924)
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [ColossalChat] Hotfix for ColossalChat (#5910)
* add ignore and tiny llama
* fix path issue
* run style
* fix issue
* update bash
* add ignore and tiny llama
* fix path issue
* run style
* fix issue
* update bash
* fix ddp issue
* add Qwen 1.5 32B
* refactor tokenization
* [FIX BUG] UnboundLocalError: cannot access local variable 'default_conversation' where it is not associated with a value (#5931)
* cannot access local variable 'default_conversation' where it is not associated with a value
set default value for 'default_conversation'
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* fix test data
* refactor evaluation
* remove real data path
* remove real data path
* Add n_fused as an input from native_module (#5894)
* [FIX BUG] convert env param to int in (#5934)
* [Hotfix] Fix ZeRO typo #5936
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [Feature] Add a switch to control whether the model checkpoint needs to be saved after each epoch ends (#5941)
* Add a switch to control whether the model checkpoint needs to be saved after each epoch ends
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* fix style
* fix style
* fix style
* [shardformer] hotfix attn mask (#5945)
* [shardformer] hotfix attn mask (#5947)
* [Feat] Distrifusion Acceleration Support for Diffusion Inference (#5895)
* Distrifusion Support source
* comp comm overlap optimization
* sd3 benchmark
* pixart distrifusion bug fix
* sd3 bug fix and benchmark
* generation bug fix
* naming fix
* add docstring, fix counter and shape error
* add reference
* readme and requirement
* [zero] hotfix update master params (#5951)
* [release] update version (#5952)
* [Chat] Fix lora (#5946)
* fix merging
* remove filepath
* fix style
* Update README.md (#5958)
* [hotfix] Remove unused plan section (#5957)
* remove readme
* fix readme
* update
* [test] add mixtral for sequence classification
* [test] add mixtral transformer test
* [moe] fix plugin
* [test] mixtra pp shard test
* [chore] handle non member group
* [zero] solve hang
* [test] pass mixtral shardformer test
* [moe] implement transit between non moe tp and ep
* [zero] solve hang
* [misc] solve booster hang by rename the variable
* solve hang when parallel mode = pp + dp
* [moe] implement submesh initialization
* [moe] add mixtral dp grad scaling when not all experts are activated
* [chore] manually revert unintended commit
* [chore] trivial fix
* [chore] arg pass & remove drop token
* [test] add mixtral modelling test
* [moe] implement tp
* [moe] test deepseek
* [moe] clean legacy code
* [Feature] MoE Ulysses Support (#5918)
* moe sp support
* moe sp bug solve
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [chore] minor fix
* [moe] init moe plugin comm setting with sp
* moe sp + ep bug fix
* [moe] finalize test (no pp)
* [moe] full test for deepseek and mixtral (pp + sp to fix)
* [chore] minor fix after rebase
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* [chore] solve moe ckpt test failure and some other arg pass failure
* [moe] remove ops
* [test] fix test: test_zero1_2
* [bug] fix: somehow logger hangs the program
* [moe] deepseek moe sp support
* [test] add check
* [deepseek] replace attn (a workaround for bug in transformers)
* [misc] skip redunant test
* [misc] remove debug/print code
* [moe] refactor mesh assignment
* Revert "[moe] implement submesh initialization"
This reverts commit 2f9bce6686d1415a83d5726dc5ff02222c742582.
* [chore] change moe_pg_mesh to private
* [misc] remove incompatible test config
* [misc] fix ci failure: change default value to false in moe plugin
* [misc] remove useless condition
* [chore] docstring
* [moe] remove force_overlap_comm flag and add warning instead
* [doc] add MoeHybridParallelPlugin docstring
* [moe] solve dp axis issue
* [chore] remove redundant test case, print string & reduce test tokens
* [feat] Dist Loader for Eval (#5950)
* support auto distributed data loader
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* support auto distributed data loader
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix tp error
* remove unused parameters
* remove unused
* update inference
* update docs
* update inference
---------
Co-authored-by: Michelle <qianranma8@gmail.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [lora] lora support hybrid parallel plugin (#5956)
* lora support hybrid plugin
* fix
* fix
* fix
* fix
* fp8 operators for compressed communication
cast_to_fp8, cast_from_fp8, all_reduce_fp8
* fix scaling algorithm in FP8 casting
* support fp8 communication in pipeline parallelism
* add fp8_communication flag in the script
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* shardformer fp8
* fix rebase
* remove all to all
* fix shardformer fp8 communication training degradation
* [fp8] support all-gather flat tensor (#5932)
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* Update low_level_optim.py
---------
Co-authored-by: YeAnbang <anbangy2@outlook.com>
Co-authored-by: Haze188 <haze188@qq.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Edenzzzz <wenxuan.tan@wisc.edu>
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: Runyu Lu <77330637+LRY89757@users.noreply.github.com>
Co-authored-by: Guangyao Zhang <xjtu521@qq.com>
Co-authored-by: YeAnbang <44796419+YeAnbang@users.noreply.github.com>
Co-authored-by: Hongxin Liu <lhx0217@gmail.com>
Co-authored-by: Stephan Kö <stephankoe@users.noreply.github.com>
Co-authored-by: アマデウス <kurisusnowdeng@users.noreply.github.com>
Co-authored-by: Tong Li <tong.li352711588@gmail.com>
Co-authored-by: zhurunhua <1281592874@qq.com>
Co-authored-by: Insu Jang <insujang@umich.edu>
Co-authored-by: Gao, Ruiyuan <905370712@qq.com>
Co-authored-by: hxwang <wang1570@e.ntu.edu.sg>
Co-authored-by: Michelle <qianranma8@gmail.com>
Co-authored-by: Wang Binluo <32676639+wangbluo@users.noreply.github.com>
Co-authored-by: HangXu <hangxu0304@gmail.com>
* [fp8]support all2all fp8 (#5953)
* support all2all fp8
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [fp8] add fp8 linear (#5967)
* [fp8] add fp8 linear
* [test] fix fp8 linear test condition
* [test] fix fp8 linear test condition
* [test] fix fp8 linear test condition
* [fp8] support fp8 amp for hybrid parallel plugin (#5975)
* [fp8] support fp8 amp for hybrid parallel plugin
* [test] add fp8 hook test
* [fp8] fix fp8 linear compatibility
* fix (#5976)
* [Feature]: support FP8 communication in DDP, FSDP, Gemini (#5928)
* support fp8_communication in the Torch DDP grad comm, FSDP grad comm, and FSDP params comm
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* implement communication hook for FSDP params all-gather
* added unit test for fp8 operators
* support fp8 communication in GeminiPlugin
* update training scripts to support fsdp and fp8 communication
* fixed some minor bugs observed in unit test
* add all_gather_into_tensor_flat_fp8
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* add skip the test if torch < 2.2.0
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* add skip the test if torch < 2.2.0
* add skip the test if torch < 2.2.0
* add fp8_comm flag
* rebase latest fp8 operators
* rebase latest fp8 operators
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [test ci]Feature/fp8 comm (#5981)
* fix
* fix
* fix
* [fp8] support gemini plugin (#5978)
* [fp8] refactor hook
* [fp8] support gemini plugin
* [example] add fp8 option for llama benchmark
* [fp8] use torch compile (torch >= 2.3.0) (#5979)
* [fp8] use torch compile (torch >= 2.4.0)
* [fp8] set use_fast_accum in linear
* [chore] formal version check
* [chore] fix sig
* [fp8]Moe support fp8 communication (#5977)
* fix
* support moe fp8
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
fix
fi
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [fp8] support hybrid parallel plugin (#5982)
* support fp8 comm for qwen2 model
* support fp8 comm for qwen2 model
* support fp8 comm for qwen2 model
* fp8
* fix
* bert and bloom
* chatglm and command
* gpt2,gptj,bert, falcon,blip2
* mistral,opy,sam,t5,vit,whisper
* fix
* fix
* fix
* [fp8] refactor fp8 linear with compile (#5993)
* [fp8] refactor fp8 linear with compile
* [fp8] fix linear test
* [fp8] fix linear test
* [fp8] support asynchronous FP8 communication (#5997)
* fix
* fix
* fix
* support async all2all
* support async op for all gather
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [fp8] update torch.compile for linear_fp8 to >= 2.4.0 (#6004)
* [fp8] linear perf enhancement
* [fp8]update reduce-scatter test (#6002)
* fix
* fix
* fix
* fix
* [fp8] add use_fp8 option for MoeHybridParallelPlugin (#6009)
* [fp8] zero support fp8 linear. (#6006)
* fix
* fix
* fix
* zero fp8
* zero fp8
* Update requirements.txt
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix the merge
* fix the merge
* fix the merge
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix the merge
* fix
* fix
* fix the merge
* fix
* fix
* fix
* fix
* fix
* fix the merge
* fix
* fix
* fix
* fix
* [fp8] Merge feature/fp8_comm to main branch of Colossalai (#6016)
* add SimPO
* fix dataloader
* remove debug code
* add orpo
* fix style
* fix colossalai, transformers version
* fix colossalai, transformers version
* fix colossalai, transformers version
* fix torch colossalai version
* update transformers version
* [shardformer] DeepseekMoE support (#5871)
* [Feature] deepseek moe expert parallel implement
* [misc] fix typo, remove redundant file (#5867)
* [misc] fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [Feature] deepseek support & unit test
* [misc] remove debug code & useless print
* [misc] fix typos (#5872)
* [Feature] remove modeling file, use auto config. (#5884)
* [misc] fix typos
* [Feature] deepseek support via auto model, remove modeling file
* [misc] delete useless file
* [misc] fix typos
* [Deepseek] remove redundant code (#5888)
* [misc] fix typos
* [Feature] deepseek support via auto model, remove modeling file
* [misc] delete useless file
* [misc] fix typos
* [misc] remove redundant code
* [Feature/deepseek] resolve comment. (#5889)
* [misc] fix typos
* [Feature] deepseek support via auto model, remove modeling file
* [misc] delete useless file
* [misc] fix typos
* [misc] remove redundant code
* [misc] mv module replacement into if branch
* [misc] add some warning message and modify some code in unit test
* [misc] fix typos
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [Hoxfix] Fix CUDA_DEVICE_MAX_CONNECTIONS for comm overlap
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [Feat] Diffusion Model(PixArtAlpha/StableDiffusion3) Support (#5838)
* Diffusion Model Inference support
* Stable Diffusion 3 Support
* pixartalpha support
* [HotFix] CI,import,requirements-test for #5838 (#5892)
* [Hot Fix] CI,import,requirements-test
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [Feature] Enable PP + SP for llama (#5868)
* fix cross-PP-stage position id length diff bug
* fix typo
* fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* use a one cross entropy func for all shardformer models
---------
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [ShardFormer] Add Ulysses Sequence Parallelism support for Command-R, Qwen2 and ChatGLM (#5897)
* add benchmark for sft, dpo, simpo, orpo. Add benchmarking result. Support lora with gradient checkpoint
* fix style
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix eval
* hotfix citation
* [zero] support all-gather overlap (#5898)
* [zero] support all-gather overlap
* [zero] add overlap all-gather flag
* [misc] fix typo
* [zero] update api
* fix orpo cross entropy loss
* [Auto Parallel]: Speed up intra-op plan generation by 44% (#5446)
* Remove unnecessary calls to deepcopy
* Build DimSpec's difference dict only once
This change considerably speeds up construction speed of DimSpec objects. The difference_dict is the same for each DimSpec object, so a single copy of it is enough.
* Fix documentation of DimSpec's difference method
* [ShardFormer] fix qwen2 sp (#5903)
* [compatibility] support torch 2.2 (#5875)
* Support Pytorch 2.2.2
* keep build_on_pr file and update .compatibility
* fix object_to_tensor usage when torch>=2.3.0 (#5820)
* [misc] support torch2.3 (#5893)
* [misc] support torch2.3
* [devops] update compatibility ci
* [devops] update compatibility ci
* [devops] add debug
* [devops] add debug
* [devops] add debug
* [devops] add debug
* [devops] remove debug
* [devops] remove debug
* [release] update version (#5912)
* [plugin] support all-gather overlap for hybrid parallel (#5919)
* [plugin] fixed all-gather overlap support for hybrid parallel
* add kto
* fix style, add kto data sample
* [Examples] Add lazy init to OPT and GPT examples (#5924)
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [ColossalChat] Hotfix for ColossalChat (#5910)
* add ignore and tiny llama
* fix path issue
* run style
* fix issue
* update bash
* add ignore and tiny llama
* fix path issue
* run style
* fix issue
* update bash
* fix ddp issue
* add Qwen 1.5 32B
* refactor tokenization
* [FIX BUG] UnboundLocalError: cannot access local variable 'default_conversation' where it is not associated with a value (#5931)
* cannot access local variable 'default_conversation' where it is not associated with a value
set default value for 'default_conversation'
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* fix test data
* refactor evaluation
* remove real data path
* remove real data path
* Add n_fused as an input from native_module (#5894)
* [FIX BUG] convert env param to int in (#5934)
* [Hotfix] Fix ZeRO typo #5936
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [Feature] Add a switch to control whether the model checkpoint needs to be saved after each epoch ends (#5941)
* Add a switch to control whether the model checkpoint needs to be saved after each epoch ends
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* fix style
* fix style
* fix style
* [shardformer] hotfix attn mask (#5945)
* [shardformer] hotfix attn mask (#5947)
* [Feat] Distrifusion Acceleration Support for Diffusion Inference (#5895)
* Distrifusion Support source
* comp comm overlap optimization
* sd3 benchmark
* pixart distrifusion bug fix
* sd3 bug fix and benchmark
* generation bug fix
* naming fix
* add docstring, fix counter and shape error
* add reference
* readme and requirement
* [zero] hotfix update master params (#5951)
* [release] update version (#5952)
* [Chat] Fix lora (#5946)
* fix merging
* remove filepath
* fix style
* Update README.md (#5958)
* [hotfix] Remove unused plan section (#5957)
* remove readme
* fix readme
* update
* [test] add mixtral for sequence classification
* [test] add mixtral transformer test
* [moe] fix plugin
* [test] mixtra pp shard test
* [chore] handle non member group
* [zero] solve hang
* [test] pass mixtral shardformer test
* [moe] implement transit between non moe tp and ep
* [zero] solve hang
* [misc] solve booster hang by rename the variable
* solve hang when parallel mode = pp + dp
* [moe] implement submesh initialization
* [moe] add mixtral dp grad scaling when not all experts are activated
* [chore] manually revert unintended commit
* [chore] trivial fix
* [chore] arg pass & remove drop token
* [test] add mixtral modelling test
* [moe] implement tp
* [moe] test deepseek
* [moe] clean legacy code
* [Feature] MoE Ulysses Support (#5918)
* moe sp support
* moe sp bug solve
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [chore] minor fix
* [moe] init moe plugin comm setting with sp
* moe sp + ep bug fix
* [moe] finalize test (no pp)
* [moe] full test for deepseek and mixtral (pp + sp to fix)
* [chore] minor fix after rebase
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* [chore] solve moe ckpt test failure and some other arg pass failure
* [moe] remove ops
* [test] fix test: test_zero1_2
* [bug] fix: somehow logger hangs the program
* [moe] deepseek moe sp support
* [test] add check
* [deepseek] replace attn (a workaround for bug in transformers)
* [misc] skip redunant test
* [misc] remove debug/print code
* [moe] refactor mesh assignment
* Revert "[moe] implement submesh initialization"
This reverts commit 2f9bce6686d1415a83d5726dc5ff02222c742582.
* [chore] change moe_pg_mesh to private
* [misc] remove incompatible test config
* [misc] fix ci failure: change default value to false in moe plugin
* [misc] remove useless condition
* [chore] docstring
* [moe] remove force_overlap_comm flag and add warning instead
* [doc] add MoeHybridParallelPlugin docstring
* [moe] solve dp axis issue
* [chore] remove redundant test case, print string & reduce test tokens
* [feat] Dist Loader for Eval (#5950)
* support auto distributed data loader
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* support auto distributed data loader
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix tp error
* remove unused parameters
* remove unused
* update inference
* update docs
* update inference
---------
Co-authored-by: Michelle <qianranma8@gmail.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [lora] lora support hybrid parallel plugin (#5956)
* lora support hybrid plugin
* fix
* fix
* fix
* fix
* Support overall loss, update KTO logging
* [Docs] clarify launch port
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [Hotfix] README link (#5966)
* update ignore
* update readme
* run style
* update readme
* [Hotfix] Avoid fused RMSnorm import error without apex (#5985)
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [Chat] fix readme (#5989)
* fix readme
* fix readme, tokenization fully tested
* fix readme, tokenization fully tested
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: root <root@notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9-0.notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9.colossal-ai.svc.cluster.local>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* fix sync condition (#6000)
* [plugin] add cast inputs option for zero (#6003)
* [pre-commit.ci] pre-commit autoupdate (#5995)
updates:
- [github.com/psf/black-pre-commit-mirror: 24.4.2 → 24.8.0](https://github.com/psf/black-pre-commit-mirror/compare/24.4.2...24.8.0)
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [misc] Bypass the huggingface bug to solve the mask mismatch problem (#5991)
* [Feature] Zigzag Ring attention (#5905)
* halfway
* fix cross-PP-stage position id length diff bug
* fix typo
* fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* unified cross entropy func for all shardformer models
* remove redundant lines
* add basic ring attn; debug cross entropy
* fwd bwd logic complete
* fwd bwd logic complete; add experimental triton rescale
* precision tests passed
* precision tests passed
* fix typos and remove misc files
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* add sp_mode to benchmark; fix varlen interface
* update softmax_lse shape by new interface
* change tester name
* remove buffer clone; support packed seq layout
* add varlen tests
* fix typo
* all tests passed
* add dkv_group; fix mask
* remove debug statements
---------
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [misc] update compatibility (#6008)
* [misc] update compatibility
* [misc] update requirements
* [devops] disable requirements cache
* [test] fix torch ddp test
* [test] fix rerun on address in use
* [test] fix lazy init
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix the merge
* fix the merge
* overlap kv comm with output rescale (#6017)
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* fix the merge
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix the merge
* fix
* fix
* fix the merge
* fix
* [misc] Use dist logger in plugins (#6011)
* use dist logger in plugins
* remove trash
* print on rank 0
---------
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* fix
* fix
* fix
* fix
* fix the merge
* fix
* fix
* fix
* fix
---------
Co-authored-by: YeAnbang <anbangy2@outlook.com>
Co-authored-by: Haze188 <haze188@qq.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Edenzzzz <wenxuan.tan@wisc.edu>
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: Runyu Lu <77330637+LRY89757@users.noreply.github.com>
Co-authored-by: Guangyao Zhang <xjtu521@qq.com>
Co-authored-by: YeAnbang <44796419+YeAnbang@users.noreply.github.com>
Co-authored-by: Hongxin Liu <lhx0217@gmail.com>
Co-authored-by: Stephan Kö <stephankoe@users.noreply.github.com>
Co-authored-by: アマデウス <kurisusnowdeng@users.noreply.github.com>
Co-authored-by: Tong Li <tong.li352711588@gmail.com>
Co-authored-by: zhurunhua <1281592874@qq.com>
Co-authored-by: Insu Jang <insujang@umich.edu>
Co-authored-by: Gao, Ruiyuan <905370712@qq.com>
Co-authored-by: hxwang <wang1570@e.ntu.edu.sg>
Co-authored-by: Michelle <qianranma8@gmail.com>
Co-authored-by: root <root@notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9-0.notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9.colossal-ai.svc.cluster.local>
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* Update train_dpo.py
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* Update low_level_zero_plugin.py
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* [CI] Remove triton version for compatibility bug; update req torch >=2.2 (#6018)
* remove triton version
* remove torch 2.2
* remove torch 2.1
* debug
* remove 2.1 build tests
* require torch >=2.2
---------
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [plugin] hotfix zero plugin (#6036)
* [plugin] hotfix zero plugin
* [plugin] hotfix zero plugin
* [Colossal-LLaMA] Refactor latest APIs (#6030)
* refactor latest code
* update api
* add dummy dataset
* update Readme
* add setup
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* update files
* add PP support
* update arguments
* update argument
* reorg folder
* update version
* remove IB infor
* update utils
* update readme
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* update save for zero
* update save
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* add apex
* update
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* add fused norm (#6038)
* [FP8] unsqueeze scale to make it compatible with torch.compile (#6040)
* [colossalai/checkpoint_io/...] fix bug in load_state_dict_into_model; format error msg (#6020)
* fix bug in load_state_dict_into_model; format error msg
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* Update utils.py
to support checking missing_keys
* Update general_checkpoint_io.py
fix bug in missing_keys error message
* retrigger tests
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [Hotfix] Remove deprecated install (#6042)
* remove deprecated install
* remove unused folder
* [fp8] optimize all-gather (#6043)
* [fp8] optimize all-gather
* [fp8] fix all gather fp8 ring
* [fp8] enable compile
* [fp8] fix all gather fp8 ring
* [fp8] fix linear hook (#6046)
* [fp8] disable all_to_all_fp8 in intranode (#6045)
* enhance all_to_all_fp8 with internode comm control
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* disable some fp8 ops due to performance issue
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [release] update version (#6041)
* [release] update version
* [devops] update comp test
* [devops] update comp test debug
* [devops] debug comp test
* [devops] debug comp test
* [devops] debug comp test
* [devops] debug comp test
* [devops] debug comp test
* [Feature] Split cross-entropy computation in SP (#5959)
* halfway
* fix cross-PP-stage position id length diff bug
* fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* unified cross entropy func for all shardformer models
* remove redundant lines
* add basic ring attn; debug cross entropy
* fwd bwd logic complete
* fwd bwd logic complete; add experimental triton rescale
* precision tests passed
* precision tests passed
* fix typos and remove misc files
* update softmax_lse shape by new interface
* change tester name
* remove buffer clone; support packed seq layout
* add varlen tests
* fix typo
* all tests passed
* add dkv_group; fix mask
* remove debug statements
* adapt chatglm, command-R, qwen
* debug
* halfway
* fix cross-PP-stage position id length diff bug
* fix typo
* fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* unified cross entropy func for all shardformer models
* remove redundant lines
* add basic ring attn; debug cross entropy
* fwd bwd logic complete
* fwd bwd logic complete; add experimental triton rescale
* precision tests passed
* precision tests passed
* fix typos and remove misc files
* add sp_mode to benchmark; fix varlen interface
* update softmax_lse shape by new interface
* add varlen tests
* fix typo
* all tests passed
* add dkv_group; fix mask
* remove debug statements
* add comments
* q1 index only once
* remove events to simplify stream sync
* simplify forward/backward logic
* 2d ring forward passed
* 2d ring backward passed
* fixes
* fix ring attn loss
* 2D ring backward + llama passed
* merge
* update logger
* fix typo
* rebase
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix typo
* remove typos
* fixes
* support GPT
---------
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [hotfix] moe hybrid parallelism benchmark & follow-up fix (#6048)
* [example] pass use_fp8_comm flag to all plugins
* [example] add mixtral benchmark
* [moe] refine assertion and check
* [moe] fix mixtral & add more tests
* [moe] consider checking dp * sp group and moe_dp_group
* [mixtral] remove gate tp & add more tests
* [deepseek] fix tp & sp for deepseek
* [mixtral] minor fix
* [deepseek] add deepseek benchmark
* [fp8] hotfix backward hook (#6053)
* [fp8] hotfix backward hook
* [fp8] hotfix pipeline loss accumulation
* [doc] update sp doc (#6055)
* update sp doc
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* fix the sp
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix the attn
* fix
* fix
* fix
* fix
* [zerobubble]Support ZeroBubble Pipeline (#6034)
* [feat] add zerobubble pp (just a frame now); add POC test for dx_dw; add test for zerobubble;
* [feat] add dw test;
* [fix] fix weight not close;
* [update] update text;
* [feat] add test run_fwd_bwd automatic scheduling;
* [feat] split communication and calculation; fix pop empty send_bwd_buffer error;
* [feat] add test for p & p grad;
* [feat] add comments for ZBV func;
* [fix] rm useless assign and comments;
* [fix] fix ci test; add pytest;
* [feat] add run_fwd_bwd_with_microbatch (replace input) & test; add p&p.grad assert close test & all pass;
* [feat] add apply v_schedule graph; p & p.grad assert err exist;
* [fix] update
* [feat] fix ci; add assert;
* [feat] fix poc format
* [feat] fix func name & ci; add comments;
* [fix] fix poc test; add comments in poc;
* [feat] add optim backward_b_by_grad
* [feat] fix optimizer bwd b & w; support return accum loss & output
* [feat] add fwd_bwd_step, run_fwd_only;
* [fix] fix optim bwd; add license for v_schedule; remove redundant attributes; fix schedule loop "while"--> "for"; add communication dict;
* [fix] fix communication_map;
* [feat] update test; rm comments;
* [fix] rm zbv in hybridplugin
* [fix] fix optim bwd;
* [fix] fix optim bwd;
* [fix] rm output.data after send fwd;
* [fix] fix bwd step if condition; remove useless comments and format info;
* [fix] fix detach output & release output;
* [fix] rm requir_grad for output;
* [fix] fix requir grad position and detach position and input&output local buffer append position;
* [feat] add memory assertation;
* [fix] fix mem check;
* [fix] mem assertation'
* [fix] fix mem assertation
* [fix] fix mem; use a new model shape; only assert mem less and equal than theo;
* [fix] fix model zoo import;
* [fix] fix redundant detach & clone; add buffer assertation in the end;
* [fix] add output_obj_grad assert None at bwd b step; replace input_obj.require_grad_ with treemap;
* [fix] update optim state dict assert (include param group & state); fix mem assert after add optim;
* [fix] add testcase with microbatch 4;
* [fp8] fix missing fp8_comm flag in mixtral (#6057)
* fix
* fix
* fix
* [fp8] Disable all_gather intranode. Disable Redundant all_gather fp8 (#6059)
* all_gather only internode, fix pytest
* fix cuda arch <89 compile pytest error
* fix pytest failure
* disable all_gather_into_tensor_flat_fp8
* fix fp8 format
* fix pytest
* fix conversations
* fix chunk tuple to list
* [doc] FP8 training and communication document (#6050)
* Add FP8 training and communication document
* add fp8 docstring for plugins
* fix typo
* fix typo
* fix
* fix
* [moe] add parallel strategy for shared_expert && fix test for deepseek (#6063)
* [ColossalEval] support for vllm (#6056)
* support vllm
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* modify vllm and update readme
* run pre-commit
* remove dupilicated lines and refine code
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* update param name
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* refine code
* update readme
* refine code
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [release] update version (#6062)
* [feat] add zerobubble pp (just a frame now); add POC test for dx_dw; add test for zerobubble;
* [update] update text;
* [feat] add test run_fwd_bwd automatic scheduling;
* [feat] fix poc format
* [fix] fix poc test; add comments in poc;
* [feat] add optim backward_b_by_grad
* [feat] fix optimizer bwd b & w; support return accum loss & output
* [fix] fix optim bwd; add license for v_schedule; remove redundant attributes; fix schedule loop "while"--> "for"; add communication dict;
* [feat] update test; rm comments;
* [fix] rm zbv in hybridplugin
* [fix] fix optim bwd;
* [fix] fix optim bwd;
* [fix] rm output.data after send fwd;
* [fix] fix bwd step if condition; remove useless comments and format info;
* [fix] fix mem check;
* [fix] fix mem assertation
* [fix] fix mem; use a new model shape; only assert mem less and equal than theo;
* [fix] fix model zoo import;
* [feat] moehybrid support zerobubble;
* [fix] fix zerobubble pp for shardformer type input;
* [fix] fix require_grad & deallocate call;
* [fix] fix mem assert;
* [fix] fix fwd branch, fwd pass both micro_batch & internal_inputs'
* [fix] fix pipeline util func deallocate --> release_tensor_data; fix bwd_b loss bwd branch;
* [fix] fix zerobubble; support shardformer model type;
* [fix] fix test_pipeline_utils ci;
* [plugin] hybrid support zero bubble pipeline (#6060)
* hybrid support zbv
* fix
fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* Update zero_bubble_pp.py
* fix
* fix-ci
* fix
[pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* fix
* [zerobubble]Support ZeroBubble Pipeline (#6034)
* [feat] add zerobubble pp (just a frame now); add POC test for dx_dw; add test for zerobubble;
* [feat] add dw test;
* [fix] fix weight not close;
* [update] update text;
* [feat] add test run_fwd_bwd automatic scheduling;
* [feat] split communication and calculation; fix pop empty send_bwd_buffer error;
* [feat] add test for p & p grad;
* [feat] add comments for ZBV func;
* [fix] rm useless assign and comments;
* [fix] fix ci test; add pytest;
* [feat] add run_fwd_bwd_with_microbatch (replace input) & test; add p&p.grad assert close test & all pass;
* [feat] add apply v_schedule graph; p & p.grad assert err exist;
* [fix] update
* [feat] fix ci; add assert;
* [feat] fix poc format
* [feat] fix func name & ci; add comments;
* [fix] fix poc test; add comments in poc;
* [feat] add optim backward_b_by_grad
* [feat] fix optimizer bwd b & w; support return accum loss & output
* [feat] add fwd_bwd_step, run_fwd_only;
* [fix] fix optim bwd; add license for v_schedule; remove redundant attributes; fix schedule loop "while"--> "for"; add communication dict;
* [fix] fix communication_map;
* [feat] update test; rm comments;
* [fix] rm zbv in hybridplugin
* [fix] fix optim bwd;
* [fix] fix optim bwd;
* [fix] rm output.data after send fwd;
* [fix] fix bwd step if condition; remove useless comments and format info;
* [fix] fix detach output & release output;
* [fix] rm requir_grad for output;
* [fix] fix requir grad position and detach position and input&output local buffer append position;
* [feat] add memory assertation;
* [fix] fix mem check;
* [fix] mem assertation'
* [fix] fix mem assertation
* [fix] fix mem; use a new model shape; only assert mem less and equal than theo;
* [fix] fix model zoo import;
* [fix] fix redundant detach & clone; add buffer assertation in the end;
* [fix] add output_obj_grad assert None at bwd b step; replace input_obj.require_grad_ with treemap;
* [fix] update optim state dict assert (include param group & state); fix mem assert after add optim;
* [fix] add testcase with microbatch 4;
* hybrid support zbv
* fix
fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* Update zero_bubble_pp.py
* fix
* fix-ci
* fix
[pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* fix
* fix
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* fix
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: duanjunwen <935724073@qq.com>
* [feat] add zerobubble pp (just a frame now); add POC test for dx_dw; add test for zerobubble;
* [update] update text;
* [feat] add test run_fwd_bwd automatic scheduling;
* [feat] fix poc format
* [fix] fix poc test; add comments in poc;
* [feat] add optim backward_b_by_grad
* [feat] fix optimizer bwd b & w; support return accum loss & output
* [fix] fix optim bwd; add license for v_schedule; remove redundant attributes; fix schedule loop "while"--> "for"; add communication dict;
* [feat] update test; rm comments;
* [fix] fix optim bwd;
* [fix] fix optim bwd;
* [fix] rm output.data after send fwd;
* [fix] fix bwd step if condition; remove useless comments and format info;
* [fix] fix mem check;
* [fix] fix mem assertation
* [fix] fix mem; use a new model shape; only assert mem less and equal than theo;
* [fix] fix model zoo import;
* [fix] fix mem assert;
* [fix] fix fwd branch, fwd pass both micro_batch & internal_inputs'
* [plugin] hybrid support zero bubble pipeline (#6060)
* hybrid support zbv
* fix
fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* Update zero_bubble_pp.py
* fix
* fix-ci
* fix
[pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* fix
* [zerobubble]Support ZeroBubble Pipeline (#6034)
* [feat] add zerobubble pp (just a frame now); add POC test for dx_dw; add test for zerobubble;
* [feat] add dw test;
* [fix] fix weight not close;
* [update] update text;
* [feat] add test run_fwd_bwd automatic scheduling;
* [feat] split communication and calculation; fix pop empty send_bwd_buffer error;
* [feat] add test for p & p grad;
* [feat] add comments for ZBV func;
* [fix] rm useless assign and comments;
* [fix] fix ci test; add pytest;
* [feat] add run_fwd_bwd_with_microbatch (replace input) & test; add p&p.grad assert close test & all pass;
* [feat] add apply v_schedule graph; p & p.grad assert err exist;
* [fix] update
* [feat] fix ci; add assert;
* [feat] fix poc format
* [feat] fix func name & ci; add comments;
* [fix] fix poc test; add comments in poc;
* [feat] add optim backward_b_by_grad
* [feat] fix optimizer bwd b & w; support return accum loss & output
* [feat] add fwd_bwd_step, run_fwd_only;
* [fix] fix optim bwd; add license for v_schedule; remove redundant attributes; fix schedule loop "while"--> "for"; add communication dict;
* [fix] fix communication_map;
* [feat] update test; rm comments;
* [fix] rm zbv in hybridplugin
* [fix] fix optim bwd;
* [fix] fix optim bwd;
* [fix] rm output.data after send fwd;
* [fix] fix bwd step if condition; remove useless comments and format info;
* [fix] fix detach output & release output;
* [fix] rm requir_grad for output;
* [fix] fix requir grad position and detach position and input&output local buffer append position;
* [feat] add memory assertation;
* [fix] fix mem check;
* [fix] mem assertation'
* [fix] fix mem assertation
* [fix] fix mem; use a new model shape; only assert mem less and equal than theo;
* [fix] fix model zoo import;
* [fix] fix redundant detach & clone; add buffer assertation in the end;
* [fix] add output_obj_grad assert None at bwd b step; replace input_obj.require_grad_ with treemap;
* [fix] update optim state dict assert (include param group & state); fix mem assert after add optim;
* [fix] add testcase with microbatch 4;
* hybrid support zbv
* fix
fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* Update zero_bubble_pp.py
* fix
* fix-ci
* fix
[pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* fix
* fix
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* fix
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: duanjunwen <935724073@qq.com>
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: HangXu <hangxu0304@gmail.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: GuangyaoZhang <xjtu521@qq.com>
Co-authored-by: Hongxin Liu <lhx0217@gmail.com>
Co-authored-by: YeAnbang <anbangy2@outlook.com>
Co-authored-by: Haze188 <haze188@qq.com>
Co-authored-by: Edenzzzz <wenxuan.tan@wisc.edu>
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: Runyu Lu <77330637+LRY89757@users.noreply.github.com>
Co-authored-by: YeAnbang <44796419+YeAnbang@users.noreply.github.com>
Co-authored-by: Stephan Kö <stephankoe@users.noreply.github.com>
Co-authored-by: アマデウス <kurisusnowdeng@users.noreply.github.com>
Co-authored-by: Tong Li <tong.li352711588@gmail.com>
Co-authored-by: zhurunhua <1281592874@qq.com>
Co-authored-by: Insu Jang <insujang@umich.edu>
Co-authored-by: Gao, Ruiyuan <905370712@qq.com>
Co-authored-by: hxwang <wang1570@e.ntu.edu.sg>
Co-authored-by: Michelle <qianranma8@gmail.com>
Co-authored-by: Wang Binluo <32676639+wangbluo@users.noreply.github.com>
Co-authored-by: wangbluo <2538539015@qq.com>
Co-authored-by: root <root@notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9-0.notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9.colossal-ai.svc.cluster.local>
Co-authored-by: duanjunwen <935724073@qq.com>
Co-authored-by: Camille Zhong <44392324+Camille7777@users.noreply.github.com>
2024-10-08 07:58:00 +00:00
|
|
|
fp8_communication=shard_config.fp8_communication,
|
2024-07-10 03:34:25 +00:00
|
|
|
)
|
2023-08-22 15:59:31 +00:00
|
|
|
hidden_states, presents, all_hidden_states, all_self_attentions = self.encoder(
|
|
|
|
inputs_embeds,
|
|
|
|
full_attention_mask,
|
|
|
|
rotary_pos_emb=rotary_pos_emb,
|
|
|
|
kv_caches=past_key_values,
|
|
|
|
use_cache=use_cache,
|
|
|
|
output_hidden_states=output_hidden_states,
|
|
|
|
)
|
[zerobubble] rebase main (#6075)
* fp8 operators for compressed communication
cast_to_fp8, cast_from_fp8, all_reduce_fp8
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix typo
* fix scaling algorithm in FP8 casting
* support fp8 communication in pipeline parallelism
* add fp8_communication flag in the script
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* shardformer fp8
* fix rebase
* remove all to all
* fix shardformer fp8 communication training degradation
* [fp8] support all-gather flat tensor (#5932)
* [fp8] add fp8 comm for low level zero
* [test] add zero fp8 test case
* [Feature] llama shardformer fp8 support (#5938)
* add llama shardformer fp8
* Llama Shardformer Parity
* fix typo
* fix all reduce
* fix pytest failure
* fix reduce op and move function to fp8.py
* fix typo
* [FP8] rebase main (#5963)
* add SimPO
* fix dataloader
* remove debug code
* add orpo
* fix style
* fix colossalai, transformers version
* fix colossalai, transformers version
* fix colossalai, transformers version
* fix torch colossalai version
* update transformers version
* [shardformer] DeepseekMoE support (#5871)
* [Feature] deepseek moe expert parallel implement
* [misc] fix typo, remove redundant file (#5867)
* [misc] fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [Feature] deepseek support & unit test
* [misc] remove debug code & useless print
* [misc] fix typos (#5872)
* [Feature] remove modeling file, use auto config. (#5884)
* [misc] fix typos
* [Feature] deepseek support via auto model, remove modeling file
* [misc] delete useless file
* [misc] fix typos
* [Deepseek] remove redundant code (#5888)
* [misc] fix typos
* [Feature] deepseek support via auto model, remove modeling file
* [misc] delete useless file
* [misc] fix typos
* [misc] remove redundant code
* [Feature/deepseek] resolve comment. (#5889)
* [misc] fix typos
* [Feature] deepseek support via auto model, remove modeling file
* [misc] delete useless file
* [misc] fix typos
* [misc] remove redundant code
* [misc] mv module replacement into if branch
* [misc] add some warning message and modify some code in unit test
* [misc] fix typos
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [Hoxfix] Fix CUDA_DEVICE_MAX_CONNECTIONS for comm overlap
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [Feat] Diffusion Model(PixArtAlpha/StableDiffusion3) Support (#5838)
* Diffusion Model Inference support
* Stable Diffusion 3 Support
* pixartalpha support
* [HotFix] CI,import,requirements-test for #5838 (#5892)
* [Hot Fix] CI,import,requirements-test
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [Feature] Enable PP + SP for llama (#5868)
* fix cross-PP-stage position id length diff bug
* fix typo
* fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* use a one cross entropy func for all shardformer models
---------
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [ShardFormer] Add Ulysses Sequence Parallelism support for Command-R, Qwen2 and ChatGLM (#5897)
* add benchmark for sft, dpo, simpo, orpo. Add benchmarking result. Support lora with gradient checkpoint
* fix style
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix eval
* hotfix citation
* [zero] support all-gather overlap (#5898)
* [zero] support all-gather overlap
* [zero] add overlap all-gather flag
* [misc] fix typo
* [zero] update api
* fix orpo cross entropy loss
* [Auto Parallel]: Speed up intra-op plan generation by 44% (#5446)
* Remove unnecessary calls to deepcopy
* Build DimSpec's difference dict only once
This change considerably speeds up construction speed of DimSpec objects. The difference_dict is the same for each DimSpec object, so a single copy of it is enough.
* Fix documentation of DimSpec's difference method
* [ShardFormer] fix qwen2 sp (#5903)
* [compatibility] support torch 2.2 (#5875)
* Support Pytorch 2.2.2
* keep build_on_pr file and update .compatibility
* fix object_to_tensor usage when torch>=2.3.0 (#5820)
* [misc] support torch2.3 (#5893)
* [misc] support torch2.3
* [devops] update compatibility ci
* [devops] update compatibility ci
* [devops] add debug
* [devops] add debug
* [devops] add debug
* [devops] add debug
* [devops] remove debug
* [devops] remove debug
* [release] update version (#5912)
* [plugin] support all-gather overlap for hybrid parallel (#5919)
* [plugin] fixed all-gather overlap support for hybrid parallel
* add kto
* fix style, add kto data sample
* [Examples] Add lazy init to OPT and GPT examples (#5924)
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [ColossalChat] Hotfix for ColossalChat (#5910)
* add ignore and tiny llama
* fix path issue
* run style
* fix issue
* update bash
* add ignore and tiny llama
* fix path issue
* run style
* fix issue
* update bash
* fix ddp issue
* add Qwen 1.5 32B
* refactor tokenization
* [FIX BUG] UnboundLocalError: cannot access local variable 'default_conversation' where it is not associated with a value (#5931)
* cannot access local variable 'default_conversation' where it is not associated with a value
set default value for 'default_conversation'
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* fix test data
* refactor evaluation
* remove real data path
* remove real data path
* Add n_fused as an input from native_module (#5894)
* [FIX BUG] convert env param to int in (#5934)
* [Hotfix] Fix ZeRO typo #5936
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [Feature] Add a switch to control whether the model checkpoint needs to be saved after each epoch ends (#5941)
* Add a switch to control whether the model checkpoint needs to be saved after each epoch ends
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* fix style
* fix style
* fix style
* [shardformer] hotfix attn mask (#5945)
* [shardformer] hotfix attn mask (#5947)
* [Feat] Distrifusion Acceleration Support for Diffusion Inference (#5895)
* Distrifusion Support source
* comp comm overlap optimization
* sd3 benchmark
* pixart distrifusion bug fix
* sd3 bug fix and benchmark
* generation bug fix
* naming fix
* add docstring, fix counter and shape error
* add reference
* readme and requirement
* [zero] hotfix update master params (#5951)
* [release] update version (#5952)
* [Chat] Fix lora (#5946)
* fix merging
* remove filepath
* fix style
* Update README.md (#5958)
* [hotfix] Remove unused plan section (#5957)
* remove readme
* fix readme
* update
* [test] add mixtral for sequence classification
* [test] add mixtral transformer test
* [moe] fix plugin
* [test] mixtra pp shard test
* [chore] handle non member group
* [zero] solve hang
* [test] pass mixtral shardformer test
* [moe] implement transit between non moe tp and ep
* [zero] solve hang
* [misc] solve booster hang by rename the variable
* solve hang when parallel mode = pp + dp
* [moe] implement submesh initialization
* [moe] add mixtral dp grad scaling when not all experts are activated
* [chore] manually revert unintended commit
* [chore] trivial fix
* [chore] arg pass & remove drop token
* [test] add mixtral modelling test
* [moe] implement tp
* [moe] test deepseek
* [moe] clean legacy code
* [Feature] MoE Ulysses Support (#5918)
* moe sp support
* moe sp bug solve
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [chore] minor fix
* [moe] init moe plugin comm setting with sp
* moe sp + ep bug fix
* [moe] finalize test (no pp)
* [moe] full test for deepseek and mixtral (pp + sp to fix)
* [chore] minor fix after rebase
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* [chore] solve moe ckpt test failure and some other arg pass failure
* [moe] remove ops
* [test] fix test: test_zero1_2
* [bug] fix: somehow logger hangs the program
* [moe] deepseek moe sp support
* [test] add check
* [deepseek] replace attn (a workaround for bug in transformers)
* [misc] skip redunant test
* [misc] remove debug/print code
* [moe] refactor mesh assignment
* Revert "[moe] implement submesh initialization"
This reverts commit 2f9bce6686d1415a83d5726dc5ff02222c742582.
* [chore] change moe_pg_mesh to private
* [misc] remove incompatible test config
* [misc] fix ci failure: change default value to false in moe plugin
* [misc] remove useless condition
* [chore] docstring
* [moe] remove force_overlap_comm flag and add warning instead
* [doc] add MoeHybridParallelPlugin docstring
* [moe] solve dp axis issue
* [chore] remove redundant test case, print string & reduce test tokens
* [feat] Dist Loader for Eval (#5950)
* support auto distributed data loader
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* support auto distributed data loader
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix tp error
* remove unused parameters
* remove unused
* update inference
* update docs
* update inference
---------
Co-authored-by: Michelle <qianranma8@gmail.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [lora] lora support hybrid parallel plugin (#5956)
* lora support hybrid plugin
* fix
* fix
* fix
* fix
* fp8 operators for compressed communication
cast_to_fp8, cast_from_fp8, all_reduce_fp8
* fix scaling algorithm in FP8 casting
* support fp8 communication in pipeline parallelism
* add fp8_communication flag in the script
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* shardformer fp8
* fix rebase
* remove all to all
* fix shardformer fp8 communication training degradation
* [fp8] support all-gather flat tensor (#5932)
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* Update low_level_optim.py
---------
Co-authored-by: YeAnbang <anbangy2@outlook.com>
Co-authored-by: Haze188 <haze188@qq.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Edenzzzz <wenxuan.tan@wisc.edu>
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: Runyu Lu <77330637+LRY89757@users.noreply.github.com>
Co-authored-by: Guangyao Zhang <xjtu521@qq.com>
Co-authored-by: YeAnbang <44796419+YeAnbang@users.noreply.github.com>
Co-authored-by: Hongxin Liu <lhx0217@gmail.com>
Co-authored-by: Stephan Kö <stephankoe@users.noreply.github.com>
Co-authored-by: アマデウス <kurisusnowdeng@users.noreply.github.com>
Co-authored-by: Tong Li <tong.li352711588@gmail.com>
Co-authored-by: zhurunhua <1281592874@qq.com>
Co-authored-by: Insu Jang <insujang@umich.edu>
Co-authored-by: Gao, Ruiyuan <905370712@qq.com>
Co-authored-by: hxwang <wang1570@e.ntu.edu.sg>
Co-authored-by: Michelle <qianranma8@gmail.com>
Co-authored-by: Wang Binluo <32676639+wangbluo@users.noreply.github.com>
Co-authored-by: HangXu <hangxu0304@gmail.com>
* [fp8]support all2all fp8 (#5953)
* support all2all fp8
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [fp8] add fp8 linear (#5967)
* [fp8] add fp8 linear
* [test] fix fp8 linear test condition
* [test] fix fp8 linear test condition
* [test] fix fp8 linear test condition
* [fp8] support fp8 amp for hybrid parallel plugin (#5975)
* [fp8] support fp8 amp for hybrid parallel plugin
* [test] add fp8 hook test
* [fp8] fix fp8 linear compatibility
* fix (#5976)
* [Feature]: support FP8 communication in DDP, FSDP, Gemini (#5928)
* support fp8_communication in the Torch DDP grad comm, FSDP grad comm, and FSDP params comm
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* implement communication hook for FSDP params all-gather
* added unit test for fp8 operators
* support fp8 communication in GeminiPlugin
* update training scripts to support fsdp and fp8 communication
* fixed some minor bugs observed in unit test
* add all_gather_into_tensor_flat_fp8
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* add skip the test if torch < 2.2.0
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* add skip the test if torch < 2.2.0
* add skip the test if torch < 2.2.0
* add fp8_comm flag
* rebase latest fp8 operators
* rebase latest fp8 operators
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [test ci]Feature/fp8 comm (#5981)
* fix
* fix
* fix
* [fp8] support gemini plugin (#5978)
* [fp8] refactor hook
* [fp8] support gemini plugin
* [example] add fp8 option for llama benchmark
* [fp8] use torch compile (torch >= 2.3.0) (#5979)
* [fp8] use torch compile (torch >= 2.4.0)
* [fp8] set use_fast_accum in linear
* [chore] formal version check
* [chore] fix sig
* [fp8]Moe support fp8 communication (#5977)
* fix
* support moe fp8
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
fix
fi
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [fp8] support hybrid parallel plugin (#5982)
* support fp8 comm for qwen2 model
* support fp8 comm for qwen2 model
* support fp8 comm for qwen2 model
* fp8
* fix
* bert and bloom
* chatglm and command
* gpt2,gptj,bert, falcon,blip2
* mistral,opy,sam,t5,vit,whisper
* fix
* fix
* fix
* [fp8] refactor fp8 linear with compile (#5993)
* [fp8] refactor fp8 linear with compile
* [fp8] fix linear test
* [fp8] fix linear test
* [fp8] support asynchronous FP8 communication (#5997)
* fix
* fix
* fix
* support async all2all
* support async op for all gather
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [fp8] update torch.compile for linear_fp8 to >= 2.4.0 (#6004)
* [fp8] linear perf enhancement
* [fp8]update reduce-scatter test (#6002)
* fix
* fix
* fix
* fix
* [fp8] add use_fp8 option for MoeHybridParallelPlugin (#6009)
* [fp8] zero support fp8 linear. (#6006)
* fix
* fix
* fix
* zero fp8
* zero fp8
* Update requirements.txt
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix the merge
* fix the merge
* fix the merge
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix the merge
* fix
* fix
* fix the merge
* fix
* fix
* fix
* fix
* fix
* fix the merge
* fix
* fix
* fix
* fix
* [fp8] Merge feature/fp8_comm to main branch of Colossalai (#6016)
* add SimPO
* fix dataloader
* remove debug code
* add orpo
* fix style
* fix colossalai, transformers version
* fix colossalai, transformers version
* fix colossalai, transformers version
* fix torch colossalai version
* update transformers version
* [shardformer] DeepseekMoE support (#5871)
* [Feature] deepseek moe expert parallel implement
* [misc] fix typo, remove redundant file (#5867)
* [misc] fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [Feature] deepseek support & unit test
* [misc] remove debug code & useless print
* [misc] fix typos (#5872)
* [Feature] remove modeling file, use auto config. (#5884)
* [misc] fix typos
* [Feature] deepseek support via auto model, remove modeling file
* [misc] delete useless file
* [misc] fix typos
* [Deepseek] remove redundant code (#5888)
* [misc] fix typos
* [Feature] deepseek support via auto model, remove modeling file
* [misc] delete useless file
* [misc] fix typos
* [misc] remove redundant code
* [Feature/deepseek] resolve comment. (#5889)
* [misc] fix typos
* [Feature] deepseek support via auto model, remove modeling file
* [misc] delete useless file
* [misc] fix typos
* [misc] remove redundant code
* [misc] mv module replacement into if branch
* [misc] add some warning message and modify some code in unit test
* [misc] fix typos
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [Hoxfix] Fix CUDA_DEVICE_MAX_CONNECTIONS for comm overlap
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [Feat] Diffusion Model(PixArtAlpha/StableDiffusion3) Support (#5838)
* Diffusion Model Inference support
* Stable Diffusion 3 Support
* pixartalpha support
* [HotFix] CI,import,requirements-test for #5838 (#5892)
* [Hot Fix] CI,import,requirements-test
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [Feature] Enable PP + SP for llama (#5868)
* fix cross-PP-stage position id length diff bug
* fix typo
* fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* use a one cross entropy func for all shardformer models
---------
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [ShardFormer] Add Ulysses Sequence Parallelism support for Command-R, Qwen2 and ChatGLM (#5897)
* add benchmark for sft, dpo, simpo, orpo. Add benchmarking result. Support lora with gradient checkpoint
* fix style
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix eval
* hotfix citation
* [zero] support all-gather overlap (#5898)
* [zero] support all-gather overlap
* [zero] add overlap all-gather flag
* [misc] fix typo
* [zero] update api
* fix orpo cross entropy loss
* [Auto Parallel]: Speed up intra-op plan generation by 44% (#5446)
* Remove unnecessary calls to deepcopy
* Build DimSpec's difference dict only once
This change considerably speeds up construction speed of DimSpec objects. The difference_dict is the same for each DimSpec object, so a single copy of it is enough.
* Fix documentation of DimSpec's difference method
* [ShardFormer] fix qwen2 sp (#5903)
* [compatibility] support torch 2.2 (#5875)
* Support Pytorch 2.2.2
* keep build_on_pr file and update .compatibility
* fix object_to_tensor usage when torch>=2.3.0 (#5820)
* [misc] support torch2.3 (#5893)
* [misc] support torch2.3
* [devops] update compatibility ci
* [devops] update compatibility ci
* [devops] add debug
* [devops] add debug
* [devops] add debug
* [devops] add debug
* [devops] remove debug
* [devops] remove debug
* [release] update version (#5912)
* [plugin] support all-gather overlap for hybrid parallel (#5919)
* [plugin] fixed all-gather overlap support for hybrid parallel
* add kto
* fix style, add kto data sample
* [Examples] Add lazy init to OPT and GPT examples (#5924)
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [ColossalChat] Hotfix for ColossalChat (#5910)
* add ignore and tiny llama
* fix path issue
* run style
* fix issue
* update bash
* add ignore and tiny llama
* fix path issue
* run style
* fix issue
* update bash
* fix ddp issue
* add Qwen 1.5 32B
* refactor tokenization
* [FIX BUG] UnboundLocalError: cannot access local variable 'default_conversation' where it is not associated with a value (#5931)
* cannot access local variable 'default_conversation' where it is not associated with a value
set default value for 'default_conversation'
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* fix test data
* refactor evaluation
* remove real data path
* remove real data path
* Add n_fused as an input from native_module (#5894)
* [FIX BUG] convert env param to int in (#5934)
* [Hotfix] Fix ZeRO typo #5936
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [Feature] Add a switch to control whether the model checkpoint needs to be saved after each epoch ends (#5941)
* Add a switch to control whether the model checkpoint needs to be saved after each epoch ends
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* fix style
* fix style
* fix style
* [shardformer] hotfix attn mask (#5945)
* [shardformer] hotfix attn mask (#5947)
* [Feat] Distrifusion Acceleration Support for Diffusion Inference (#5895)
* Distrifusion Support source
* comp comm overlap optimization
* sd3 benchmark
* pixart distrifusion bug fix
* sd3 bug fix and benchmark
* generation bug fix
* naming fix
* add docstring, fix counter and shape error
* add reference
* readme and requirement
* [zero] hotfix update master params (#5951)
* [release] update version (#5952)
* [Chat] Fix lora (#5946)
* fix merging
* remove filepath
* fix style
* Update README.md (#5958)
* [hotfix] Remove unused plan section (#5957)
* remove readme
* fix readme
* update
* [test] add mixtral for sequence classification
* [test] add mixtral transformer test
* [moe] fix plugin
* [test] mixtra pp shard test
* [chore] handle non member group
* [zero] solve hang
* [test] pass mixtral shardformer test
* [moe] implement transit between non moe tp and ep
* [zero] solve hang
* [misc] solve booster hang by rename the variable
* solve hang when parallel mode = pp + dp
* [moe] implement submesh initialization
* [moe] add mixtral dp grad scaling when not all experts are activated
* [chore] manually revert unintended commit
* [chore] trivial fix
* [chore] arg pass & remove drop token
* [test] add mixtral modelling test
* [moe] implement tp
* [moe] test deepseek
* [moe] clean legacy code
* [Feature] MoE Ulysses Support (#5918)
* moe sp support
* moe sp bug solve
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [chore] minor fix
* [moe] init moe plugin comm setting with sp
* moe sp + ep bug fix
* [moe] finalize test (no pp)
* [moe] full test for deepseek and mixtral (pp + sp to fix)
* [chore] minor fix after rebase
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* [chore] solve moe ckpt test failure and some other arg pass failure
* [moe] remove ops
* [test] fix test: test_zero1_2
* [bug] fix: somehow logger hangs the program
* [moe] deepseek moe sp support
* [test] add check
* [deepseek] replace attn (a workaround for bug in transformers)
* [misc] skip redunant test
* [misc] remove debug/print code
* [moe] refactor mesh assignment
* Revert "[moe] implement submesh initialization"
This reverts commit 2f9bce6686d1415a83d5726dc5ff02222c742582.
* [chore] change moe_pg_mesh to private
* [misc] remove incompatible test config
* [misc] fix ci failure: change default value to false in moe plugin
* [misc] remove useless condition
* [chore] docstring
* [moe] remove force_overlap_comm flag and add warning instead
* [doc] add MoeHybridParallelPlugin docstring
* [moe] solve dp axis issue
* [chore] remove redundant test case, print string & reduce test tokens
* [feat] Dist Loader for Eval (#5950)
* support auto distributed data loader
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* support auto distributed data loader
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix tp error
* remove unused parameters
* remove unused
* update inference
* update docs
* update inference
---------
Co-authored-by: Michelle <qianranma8@gmail.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [lora] lora support hybrid parallel plugin (#5956)
* lora support hybrid plugin
* fix
* fix
* fix
* fix
* Support overall loss, update KTO logging
* [Docs] clarify launch port
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [Hotfix] README link (#5966)
* update ignore
* update readme
* run style
* update readme
* [Hotfix] Avoid fused RMSnorm import error without apex (#5985)
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [Chat] fix readme (#5989)
* fix readme
* fix readme, tokenization fully tested
* fix readme, tokenization fully tested
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: root <root@notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9-0.notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9.colossal-ai.svc.cluster.local>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* fix sync condition (#6000)
* [plugin] add cast inputs option for zero (#6003)
* [pre-commit.ci] pre-commit autoupdate (#5995)
updates:
- [github.com/psf/black-pre-commit-mirror: 24.4.2 → 24.8.0](https://github.com/psf/black-pre-commit-mirror/compare/24.4.2...24.8.0)
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [misc] Bypass the huggingface bug to solve the mask mismatch problem (#5991)
* [Feature] Zigzag Ring attention (#5905)
* halfway
* fix cross-PP-stage position id length diff bug
* fix typo
* fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* unified cross entropy func for all shardformer models
* remove redundant lines
* add basic ring attn; debug cross entropy
* fwd bwd logic complete
* fwd bwd logic complete; add experimental triton rescale
* precision tests passed
* precision tests passed
* fix typos and remove misc files
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* add sp_mode to benchmark; fix varlen interface
* update softmax_lse shape by new interface
* change tester name
* remove buffer clone; support packed seq layout
* add varlen tests
* fix typo
* all tests passed
* add dkv_group; fix mask
* remove debug statements
---------
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [misc] update compatibility (#6008)
* [misc] update compatibility
* [misc] update requirements
* [devops] disable requirements cache
* [test] fix torch ddp test
* [test] fix rerun on address in use
* [test] fix lazy init
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix the merge
* fix the merge
* overlap kv comm with output rescale (#6017)
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* fix the merge
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix the merge
* fix
* fix
* fix the merge
* fix
* [misc] Use dist logger in plugins (#6011)
* use dist logger in plugins
* remove trash
* print on rank 0
---------
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* fix
* fix
* fix
* fix
* fix the merge
* fix
* fix
* fix
* fix
---------
Co-authored-by: YeAnbang <anbangy2@outlook.com>
Co-authored-by: Haze188 <haze188@qq.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Edenzzzz <wenxuan.tan@wisc.edu>
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: Runyu Lu <77330637+LRY89757@users.noreply.github.com>
Co-authored-by: Guangyao Zhang <xjtu521@qq.com>
Co-authored-by: YeAnbang <44796419+YeAnbang@users.noreply.github.com>
Co-authored-by: Hongxin Liu <lhx0217@gmail.com>
Co-authored-by: Stephan Kö <stephankoe@users.noreply.github.com>
Co-authored-by: アマデウス <kurisusnowdeng@users.noreply.github.com>
Co-authored-by: Tong Li <tong.li352711588@gmail.com>
Co-authored-by: zhurunhua <1281592874@qq.com>
Co-authored-by: Insu Jang <insujang@umich.edu>
Co-authored-by: Gao, Ruiyuan <905370712@qq.com>
Co-authored-by: hxwang <wang1570@e.ntu.edu.sg>
Co-authored-by: Michelle <qianranma8@gmail.com>
Co-authored-by: root <root@notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9-0.notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9.colossal-ai.svc.cluster.local>
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* Update train_dpo.py
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* Update low_level_zero_plugin.py
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* [CI] Remove triton version for compatibility bug; update req torch >=2.2 (#6018)
* remove triton version
* remove torch 2.2
* remove torch 2.1
* debug
* remove 2.1 build tests
* require torch >=2.2
---------
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [plugin] hotfix zero plugin (#6036)
* [plugin] hotfix zero plugin
* [plugin] hotfix zero plugin
* [Colossal-LLaMA] Refactor latest APIs (#6030)
* refactor latest code
* update api
* add dummy dataset
* update Readme
* add setup
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* update files
* add PP support
* update arguments
* update argument
* reorg folder
* update version
* remove IB infor
* update utils
* update readme
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* update save for zero
* update save
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* add apex
* update
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* add fused norm (#6038)
* [FP8] unsqueeze scale to make it compatible with torch.compile (#6040)
* [colossalai/checkpoint_io/...] fix bug in load_state_dict_into_model; format error msg (#6020)
* fix bug in load_state_dict_into_model; format error msg
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* Update utils.py
to support checking missing_keys
* Update general_checkpoint_io.py
fix bug in missing_keys error message
* retrigger tests
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [Hotfix] Remove deprecated install (#6042)
* remove deprecated install
* remove unused folder
* [fp8] optimize all-gather (#6043)
* [fp8] optimize all-gather
* [fp8] fix all gather fp8 ring
* [fp8] enable compile
* [fp8] fix all gather fp8 ring
* [fp8] fix linear hook (#6046)
* [fp8] disable all_to_all_fp8 in intranode (#6045)
* enhance all_to_all_fp8 with internode comm control
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* disable some fp8 ops due to performance issue
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [release] update version (#6041)
* [release] update version
* [devops] update comp test
* [devops] update comp test debug
* [devops] debug comp test
* [devops] debug comp test
* [devops] debug comp test
* [devops] debug comp test
* [devops] debug comp test
* [Feature] Split cross-entropy computation in SP (#5959)
* halfway
* fix cross-PP-stage position id length diff bug
* fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* unified cross entropy func for all shardformer models
* remove redundant lines
* add basic ring attn; debug cross entropy
* fwd bwd logic complete
* fwd bwd logic complete; add experimental triton rescale
* precision tests passed
* precision tests passed
* fix typos and remove misc files
* update softmax_lse shape by new interface
* change tester name
* remove buffer clone; support packed seq layout
* add varlen tests
* fix typo
* all tests passed
* add dkv_group; fix mask
* remove debug statements
* adapt chatglm, command-R, qwen
* debug
* halfway
* fix cross-PP-stage position id length diff bug
* fix typo
* fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* unified cross entropy func for all shardformer models
* remove redundant lines
* add basic ring attn; debug cross entropy
* fwd bwd logic complete
* fwd bwd logic complete; add experimental triton rescale
* precision tests passed
* precision tests passed
* fix typos and remove misc files
* add sp_mode to benchmark; fix varlen interface
* update softmax_lse shape by new interface
* add varlen tests
* fix typo
* all tests passed
* add dkv_group; fix mask
* remove debug statements
* add comments
* q1 index only once
* remove events to simplify stream sync
* simplify forward/backward logic
* 2d ring forward passed
* 2d ring backward passed
* fixes
* fix ring attn loss
* 2D ring backward + llama passed
* merge
* update logger
* fix typo
* rebase
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix typo
* remove typos
* fixes
* support GPT
---------
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [hotfix] moe hybrid parallelism benchmark & follow-up fix (#6048)
* [example] pass use_fp8_comm flag to all plugins
* [example] add mixtral benchmark
* [moe] refine assertion and check
* [moe] fix mixtral & add more tests
* [moe] consider checking dp * sp group and moe_dp_group
* [mixtral] remove gate tp & add more tests
* [deepseek] fix tp & sp for deepseek
* [mixtral] minor fix
* [deepseek] add deepseek benchmark
* [fp8] hotfix backward hook (#6053)
* [fp8] hotfix backward hook
* [fp8] hotfix pipeline loss accumulation
* [doc] update sp doc (#6055)
* update sp doc
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* fix the sp
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix the attn
* fix
* fix
* fix
* fix
* [zerobubble]Support ZeroBubble Pipeline (#6034)
* [feat] add zerobubble pp (just a frame now); add POC test for dx_dw; add test for zerobubble;
* [feat] add dw test;
* [fix] fix weight not close;
* [update] update text;
* [feat] add test run_fwd_bwd automatic scheduling;
* [feat] split communication and calculation; fix pop empty send_bwd_buffer error;
* [feat] add test for p & p grad;
* [feat] add comments for ZBV func;
* [fix] rm useless assign and comments;
* [fix] fix ci test; add pytest;
* [feat] add run_fwd_bwd_with_microbatch (replace input) & test; add p&p.grad assert close test & all pass;
* [feat] add apply v_schedule graph; p & p.grad assert err exist;
* [fix] update
* [feat] fix ci; add assert;
* [feat] fix poc format
* [feat] fix func name & ci; add comments;
* [fix] fix poc test; add comments in poc;
* [feat] add optim backward_b_by_grad
* [feat] fix optimizer bwd b & w; support return accum loss & output
* [feat] add fwd_bwd_step, run_fwd_only;
* [fix] fix optim bwd; add license for v_schedule; remove redundant attributes; fix schedule loop "while"--> "for"; add communication dict;
* [fix] fix communication_map;
* [feat] update test; rm comments;
* [fix] rm zbv in hybridplugin
* [fix] fix optim bwd;
* [fix] fix optim bwd;
* [fix] rm output.data after send fwd;
* [fix] fix bwd step if condition; remove useless comments and format info;
* [fix] fix detach output & release output;
* [fix] rm requir_grad for output;
* [fix] fix requir grad position and detach position and input&output local buffer append position;
* [feat] add memory assertation;
* [fix] fix mem check;
* [fix] mem assertation'
* [fix] fix mem assertation
* [fix] fix mem; use a new model shape; only assert mem less and equal than theo;
* [fix] fix model zoo import;
* [fix] fix redundant detach & clone; add buffer assertation in the end;
* [fix] add output_obj_grad assert None at bwd b step; replace input_obj.require_grad_ with treemap;
* [fix] update optim state dict assert (include param group & state); fix mem assert after add optim;
* [fix] add testcase with microbatch 4;
* [fp8] fix missing fp8_comm flag in mixtral (#6057)
* fix
* fix
* fix
* [fp8] Disable all_gather intranode. Disable Redundant all_gather fp8 (#6059)
* all_gather only internode, fix pytest
* fix cuda arch <89 compile pytest error
* fix pytest failure
* disable all_gather_into_tensor_flat_fp8
* fix fp8 format
* fix pytest
* fix conversations
* fix chunk tuple to list
* [doc] FP8 training and communication document (#6050)
* Add FP8 training and communication document
* add fp8 docstring for plugins
* fix typo
* fix typo
* fix
* fix
* [moe] add parallel strategy for shared_expert && fix test for deepseek (#6063)
* [ColossalEval] support for vllm (#6056)
* support vllm
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* modify vllm and update readme
* run pre-commit
* remove dupilicated lines and refine code
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* update param name
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* refine code
* update readme
* refine code
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [release] update version (#6062)
* [feat] add zerobubble pp (just a frame now); add POC test for dx_dw; add test for zerobubble;
* [update] update text;
* [feat] add test run_fwd_bwd automatic scheduling;
* [feat] fix poc format
* [fix] fix poc test; add comments in poc;
* [feat] add optim backward_b_by_grad
* [feat] fix optimizer bwd b & w; support return accum loss & output
* [fix] fix optim bwd; add license for v_schedule; remove redundant attributes; fix schedule loop "while"--> "for"; add communication dict;
* [feat] update test; rm comments;
* [fix] rm zbv in hybridplugin
* [fix] fix optim bwd;
* [fix] fix optim bwd;
* [fix] rm output.data after send fwd;
* [fix] fix bwd step if condition; remove useless comments and format info;
* [fix] fix mem check;
* [fix] fix mem assertation
* [fix] fix mem; use a new model shape; only assert mem less and equal than theo;
* [fix] fix model zoo import;
* [feat] moehybrid support zerobubble;
* [fix] fix zerobubble pp for shardformer type input;
* [fix] fix require_grad & deallocate call;
* [fix] fix mem assert;
* [fix] fix fwd branch, fwd pass both micro_batch & internal_inputs'
* [fix] fix pipeline util func deallocate --> release_tensor_data; fix bwd_b loss bwd branch;
* [fix] fix zerobubble; support shardformer model type;
* [fix] fix test_pipeline_utils ci;
* [plugin] hybrid support zero bubble pipeline (#6060)
* hybrid support zbv
* fix
fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* Update zero_bubble_pp.py
* fix
* fix-ci
* fix
[pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* fix
* [zerobubble]Support ZeroBubble Pipeline (#6034)
* [feat] add zerobubble pp (just a frame now); add POC test for dx_dw; add test for zerobubble;
* [feat] add dw test;
* [fix] fix weight not close;
* [update] update text;
* [feat] add test run_fwd_bwd automatic scheduling;
* [feat] split communication and calculation; fix pop empty send_bwd_buffer error;
* [feat] add test for p & p grad;
* [feat] add comments for ZBV func;
* [fix] rm useless assign and comments;
* [fix] fix ci test; add pytest;
* [feat] add run_fwd_bwd_with_microbatch (replace input) & test; add p&p.grad assert close test & all pass;
* [feat] add apply v_schedule graph; p & p.grad assert err exist;
* [fix] update
* [feat] fix ci; add assert;
* [feat] fix poc format
* [feat] fix func name & ci; add comments;
* [fix] fix poc test; add comments in poc;
* [feat] add optim backward_b_by_grad
* [feat] fix optimizer bwd b & w; support return accum loss & output
* [feat] add fwd_bwd_step, run_fwd_only;
* [fix] fix optim bwd; add license for v_schedule; remove redundant attributes; fix schedule loop "while"--> "for"; add communication dict;
* [fix] fix communication_map;
* [feat] update test; rm comments;
* [fix] rm zbv in hybridplugin
* [fix] fix optim bwd;
* [fix] fix optim bwd;
* [fix] rm output.data after send fwd;
* [fix] fix bwd step if condition; remove useless comments and format info;
* [fix] fix detach output & release output;
* [fix] rm requir_grad for output;
* [fix] fix requir grad position and detach position and input&output local buffer append position;
* [feat] add memory assertation;
* [fix] fix mem check;
* [fix] mem assertation'
* [fix] fix mem assertation
* [fix] fix mem; use a new model shape; only assert mem less and equal than theo;
* [fix] fix model zoo import;
* [fix] fix redundant detach & clone; add buffer assertation in the end;
* [fix] add output_obj_grad assert None at bwd b step; replace input_obj.require_grad_ with treemap;
* [fix] update optim state dict assert (include param group & state); fix mem assert after add optim;
* [fix] add testcase with microbatch 4;
* hybrid support zbv
* fix
fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* Update zero_bubble_pp.py
* fix
* fix-ci
* fix
[pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* fix
* fix
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* fix
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: duanjunwen <935724073@qq.com>
* [feat] add zerobubble pp (just a frame now); add POC test for dx_dw; add test for zerobubble;
* [update] update text;
* [feat] add test run_fwd_bwd automatic scheduling;
* [feat] fix poc format
* [fix] fix poc test; add comments in poc;
* [feat] add optim backward_b_by_grad
* [feat] fix optimizer bwd b & w; support return accum loss & output
* [fix] fix optim bwd; add license for v_schedule; remove redundant attributes; fix schedule loop "while"--> "for"; add communication dict;
* [feat] update test; rm comments;
* [fix] fix optim bwd;
* [fix] fix optim bwd;
* [fix] rm output.data after send fwd;
* [fix] fix bwd step if condition; remove useless comments and format info;
* [fix] fix mem check;
* [fix] fix mem assertation
* [fix] fix mem; use a new model shape; only assert mem less and equal than theo;
* [fix] fix model zoo import;
* [fix] fix mem assert;
* [fix] fix fwd branch, fwd pass both micro_batch & internal_inputs'
* [plugin] hybrid support zero bubble pipeline (#6060)
* hybrid support zbv
* fix
fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* Update zero_bubble_pp.py
* fix
* fix-ci
* fix
[pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* fix
* [zerobubble]Support ZeroBubble Pipeline (#6034)
* [feat] add zerobubble pp (just a frame now); add POC test for dx_dw; add test for zerobubble;
* [feat] add dw test;
* [fix] fix weight not close;
* [update] update text;
* [feat] add test run_fwd_bwd automatic scheduling;
* [feat] split communication and calculation; fix pop empty send_bwd_buffer error;
* [feat] add test for p & p grad;
* [feat] add comments for ZBV func;
* [fix] rm useless assign and comments;
* [fix] fix ci test; add pytest;
* [feat] add run_fwd_bwd_with_microbatch (replace input) & test; add p&p.grad assert close test & all pass;
* [feat] add apply v_schedule graph; p & p.grad assert err exist;
* [fix] update
* [feat] fix ci; add assert;
* [feat] fix poc format
* [feat] fix func name & ci; add comments;
* [fix] fix poc test; add comments in poc;
* [feat] add optim backward_b_by_grad
* [feat] fix optimizer bwd b & w; support return accum loss & output
* [feat] add fwd_bwd_step, run_fwd_only;
* [fix] fix optim bwd; add license for v_schedule; remove redundant attributes; fix schedule loop "while"--> "for"; add communication dict;
* [fix] fix communication_map;
* [feat] update test; rm comments;
* [fix] rm zbv in hybridplugin
* [fix] fix optim bwd;
* [fix] fix optim bwd;
* [fix] rm output.data after send fwd;
* [fix] fix bwd step if condition; remove useless comments and format info;
* [fix] fix detach output & release output;
* [fix] rm requir_grad for output;
* [fix] fix requir grad position and detach position and input&output local buffer append position;
* [feat] add memory assertation;
* [fix] fix mem check;
* [fix] mem assertation'
* [fix] fix mem assertation
* [fix] fix mem; use a new model shape; only assert mem less and equal than theo;
* [fix] fix model zoo import;
* [fix] fix redundant detach & clone; add buffer assertation in the end;
* [fix] add output_obj_grad assert None at bwd b step; replace input_obj.require_grad_ with treemap;
* [fix] update optim state dict assert (include param group & state); fix mem assert after add optim;
* [fix] add testcase with microbatch 4;
* hybrid support zbv
* fix
fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* Update zero_bubble_pp.py
* fix
* fix-ci
* fix
[pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* fix
* fix
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* fix
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: duanjunwen <935724073@qq.com>
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: HangXu <hangxu0304@gmail.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: GuangyaoZhang <xjtu521@qq.com>
Co-authored-by: Hongxin Liu <lhx0217@gmail.com>
Co-authored-by: YeAnbang <anbangy2@outlook.com>
Co-authored-by: Haze188 <haze188@qq.com>
Co-authored-by: Edenzzzz <wenxuan.tan@wisc.edu>
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: Runyu Lu <77330637+LRY89757@users.noreply.github.com>
Co-authored-by: YeAnbang <44796419+YeAnbang@users.noreply.github.com>
Co-authored-by: Stephan Kö <stephankoe@users.noreply.github.com>
Co-authored-by: アマデウス <kurisusnowdeng@users.noreply.github.com>
Co-authored-by: Tong Li <tong.li352711588@gmail.com>
Co-authored-by: zhurunhua <1281592874@qq.com>
Co-authored-by: Insu Jang <insujang@umich.edu>
Co-authored-by: Gao, Ruiyuan <905370712@qq.com>
Co-authored-by: hxwang <wang1570@e.ntu.edu.sg>
Co-authored-by: Michelle <qianranma8@gmail.com>
Co-authored-by: Wang Binluo <32676639+wangbluo@users.noreply.github.com>
Co-authored-by: wangbluo <2538539015@qq.com>
Co-authored-by: root <root@notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9-0.notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9.colossal-ai.svc.cluster.local>
Co-authored-by: duanjunwen <935724073@qq.com>
Co-authored-by: Camille Zhong <44392324+Camille7777@users.noreply.github.com>
2024-10-08 07:58:00 +00:00
|
|
|
if shard_config.enable_sequence_parallelism:
|
|
|
|
if (not shard_config.parallel_output) or force_sp_output_gather or is_share_sp_tp(sp_mode):
|
|
|
|
hidden_states = gather_sp_output(hidden_states, shard_config, sp_dim=0)
|
2023-08-22 15:59:31 +00:00
|
|
|
|
|
|
|
if not return_dict:
|
2023-09-19 06:20:26 +00:00
|
|
|
return tuple(
|
|
|
|
v
|
|
|
|
for v in [
|
|
|
|
hidden_states,
|
|
|
|
presents,
|
|
|
|
all_hidden_states,
|
|
|
|
all_self_attentions,
|
|
|
|
]
|
|
|
|
if v is not None
|
|
|
|
)
|
2023-08-22 15:59:31 +00:00
|
|
|
|
|
|
|
return BaseModelOutputWithPast(
|
|
|
|
last_hidden_state=hidden_states,
|
|
|
|
past_key_values=presents,
|
|
|
|
hidden_states=all_hidden_states,
|
|
|
|
attentions=all_self_attentions,
|
|
|
|
)
|
|
|
|
|
|
|
|
return forward
|
2024-07-10 03:34:25 +00:00
|
|
|
|
|
|
|
|
|
|
|
def get_chatglm_sequence_parallel_attention_forward(shard_config: ShardConfig, sp_mode, sp_size, sp_group):
|
|
|
|
from .chatglm2_6b.modeling_chatglm import apply_rotary_pos_emb, split_tensor_along_last_dim
|
|
|
|
|
|
|
|
def forward(
|
|
|
|
self,
|
|
|
|
hidden_states,
|
|
|
|
attention_mask,
|
|
|
|
rotary_pos_emb,
|
|
|
|
kv_cache=None,
|
|
|
|
use_cache=True,
|
|
|
|
):
|
|
|
|
if sp_mode is not None:
|
|
|
|
assert sp_mode in ["all_to_all", "split_gather"], "Invalid sp_mode"
|
|
|
|
assert (sp_size is not None) and (
|
|
|
|
sp_group is not None
|
|
|
|
), "Must specify sp_size and sp_group for sequence parallel"
|
|
|
|
|
|
|
|
mixed_x_layer = self.query_key_value(hidden_states)
|
|
|
|
if self.multi_query_attention:
|
|
|
|
(query_layer, key_layer, value_layer) = mixed_x_layer.split(
|
|
|
|
[
|
|
|
|
self.num_attention_heads_per_partition * self.hidden_size_per_attention_head,
|
|
|
|
self.num_multi_query_groups_per_partition * self.hidden_size_per_attention_head,
|
|
|
|
self.num_multi_query_groups_per_partition * self.hidden_size_per_attention_head,
|
|
|
|
],
|
|
|
|
dim=-1,
|
|
|
|
)
|
|
|
|
query_layer = query_layer.view(
|
|
|
|
query_layer.size()[:-1]
|
|
|
|
+ (
|
|
|
|
self.num_attention_heads_per_partition,
|
|
|
|
self.hidden_size_per_attention_head,
|
|
|
|
)
|
|
|
|
)
|
|
|
|
key_layer = key_layer.view(
|
|
|
|
key_layer.size()[:-1]
|
|
|
|
+ (
|
|
|
|
self.num_multi_query_groups_per_partition,
|
|
|
|
self.hidden_size_per_attention_head,
|
|
|
|
)
|
|
|
|
)
|
|
|
|
value_layer = value_layer.view(
|
|
|
|
value_layer.size()[:-1]
|
|
|
|
+ (
|
|
|
|
self.num_multi_query_groups_per_partition,
|
|
|
|
self.hidden_size_per_attention_head,
|
|
|
|
)
|
|
|
|
)
|
|
|
|
else:
|
|
|
|
new_tensor_shape = mixed_x_layer.size()[:-1] + (
|
|
|
|
self.num_attention_heads_per_partition,
|
|
|
|
3 * self.hidden_size_per_attention_head,
|
|
|
|
)
|
|
|
|
mixed_x_layer = mixed_x_layer.view(*new_tensor_shape)
|
|
|
|
# [sq, b, np, 3 * hn] --> 3 [sq, b, np, hn]
|
|
|
|
(query_layer, key_layer, value_layer) = split_tensor_along_last_dim(mixed_x_layer, 3)
|
|
|
|
|
|
|
|
# sp: all-to-all comminucation when introducing sequence parallel
|
|
|
|
if sp_mode == "all_to_all":
|
|
|
|
sq, bs, _, _ = value_layer.size()
|
|
|
|
|
|
|
|
query_layer = query_layer.reshape(sq, bs, -1)
|
|
|
|
key_layer = key_layer.reshape(sq, bs, -1)
|
|
|
|
value_layer = value_layer.reshape(sq, bs, -1)
|
|
|
|
|
[zerobubble] rebase main (#6075)
* fp8 operators for compressed communication
cast_to_fp8, cast_from_fp8, all_reduce_fp8
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix typo
* fix scaling algorithm in FP8 casting
* support fp8 communication in pipeline parallelism
* add fp8_communication flag in the script
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* shardformer fp8
* fix rebase
* remove all to all
* fix shardformer fp8 communication training degradation
* [fp8] support all-gather flat tensor (#5932)
* [fp8] add fp8 comm for low level zero
* [test] add zero fp8 test case
* [Feature] llama shardformer fp8 support (#5938)
* add llama shardformer fp8
* Llama Shardformer Parity
* fix typo
* fix all reduce
* fix pytest failure
* fix reduce op and move function to fp8.py
* fix typo
* [FP8] rebase main (#5963)
* add SimPO
* fix dataloader
* remove debug code
* add orpo
* fix style
* fix colossalai, transformers version
* fix colossalai, transformers version
* fix colossalai, transformers version
* fix torch colossalai version
* update transformers version
* [shardformer] DeepseekMoE support (#5871)
* [Feature] deepseek moe expert parallel implement
* [misc] fix typo, remove redundant file (#5867)
* [misc] fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [Feature] deepseek support & unit test
* [misc] remove debug code & useless print
* [misc] fix typos (#5872)
* [Feature] remove modeling file, use auto config. (#5884)
* [misc] fix typos
* [Feature] deepseek support via auto model, remove modeling file
* [misc] delete useless file
* [misc] fix typos
* [Deepseek] remove redundant code (#5888)
* [misc] fix typos
* [Feature] deepseek support via auto model, remove modeling file
* [misc] delete useless file
* [misc] fix typos
* [misc] remove redundant code
* [Feature/deepseek] resolve comment. (#5889)
* [misc] fix typos
* [Feature] deepseek support via auto model, remove modeling file
* [misc] delete useless file
* [misc] fix typos
* [misc] remove redundant code
* [misc] mv module replacement into if branch
* [misc] add some warning message and modify some code in unit test
* [misc] fix typos
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [Hoxfix] Fix CUDA_DEVICE_MAX_CONNECTIONS for comm overlap
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [Feat] Diffusion Model(PixArtAlpha/StableDiffusion3) Support (#5838)
* Diffusion Model Inference support
* Stable Diffusion 3 Support
* pixartalpha support
* [HotFix] CI,import,requirements-test for #5838 (#5892)
* [Hot Fix] CI,import,requirements-test
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [Feature] Enable PP + SP for llama (#5868)
* fix cross-PP-stage position id length diff bug
* fix typo
* fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* use a one cross entropy func for all shardformer models
---------
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [ShardFormer] Add Ulysses Sequence Parallelism support for Command-R, Qwen2 and ChatGLM (#5897)
* add benchmark for sft, dpo, simpo, orpo. Add benchmarking result. Support lora with gradient checkpoint
* fix style
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix eval
* hotfix citation
* [zero] support all-gather overlap (#5898)
* [zero] support all-gather overlap
* [zero] add overlap all-gather flag
* [misc] fix typo
* [zero] update api
* fix orpo cross entropy loss
* [Auto Parallel]: Speed up intra-op plan generation by 44% (#5446)
* Remove unnecessary calls to deepcopy
* Build DimSpec's difference dict only once
This change considerably speeds up construction speed of DimSpec objects. The difference_dict is the same for each DimSpec object, so a single copy of it is enough.
* Fix documentation of DimSpec's difference method
* [ShardFormer] fix qwen2 sp (#5903)
* [compatibility] support torch 2.2 (#5875)
* Support Pytorch 2.2.2
* keep build_on_pr file and update .compatibility
* fix object_to_tensor usage when torch>=2.3.0 (#5820)
* [misc] support torch2.3 (#5893)
* [misc] support torch2.3
* [devops] update compatibility ci
* [devops] update compatibility ci
* [devops] add debug
* [devops] add debug
* [devops] add debug
* [devops] add debug
* [devops] remove debug
* [devops] remove debug
* [release] update version (#5912)
* [plugin] support all-gather overlap for hybrid parallel (#5919)
* [plugin] fixed all-gather overlap support for hybrid parallel
* add kto
* fix style, add kto data sample
* [Examples] Add lazy init to OPT and GPT examples (#5924)
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [ColossalChat] Hotfix for ColossalChat (#5910)
* add ignore and tiny llama
* fix path issue
* run style
* fix issue
* update bash
* add ignore and tiny llama
* fix path issue
* run style
* fix issue
* update bash
* fix ddp issue
* add Qwen 1.5 32B
* refactor tokenization
* [FIX BUG] UnboundLocalError: cannot access local variable 'default_conversation' where it is not associated with a value (#5931)
* cannot access local variable 'default_conversation' where it is not associated with a value
set default value for 'default_conversation'
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* fix test data
* refactor evaluation
* remove real data path
* remove real data path
* Add n_fused as an input from native_module (#5894)
* [FIX BUG] convert env param to int in (#5934)
* [Hotfix] Fix ZeRO typo #5936
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [Feature] Add a switch to control whether the model checkpoint needs to be saved after each epoch ends (#5941)
* Add a switch to control whether the model checkpoint needs to be saved after each epoch ends
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* fix style
* fix style
* fix style
* [shardformer] hotfix attn mask (#5945)
* [shardformer] hotfix attn mask (#5947)
* [Feat] Distrifusion Acceleration Support for Diffusion Inference (#5895)
* Distrifusion Support source
* comp comm overlap optimization
* sd3 benchmark
* pixart distrifusion bug fix
* sd3 bug fix and benchmark
* generation bug fix
* naming fix
* add docstring, fix counter and shape error
* add reference
* readme and requirement
* [zero] hotfix update master params (#5951)
* [release] update version (#5952)
* [Chat] Fix lora (#5946)
* fix merging
* remove filepath
* fix style
* Update README.md (#5958)
* [hotfix] Remove unused plan section (#5957)
* remove readme
* fix readme
* update
* [test] add mixtral for sequence classification
* [test] add mixtral transformer test
* [moe] fix plugin
* [test] mixtra pp shard test
* [chore] handle non member group
* [zero] solve hang
* [test] pass mixtral shardformer test
* [moe] implement transit between non moe tp and ep
* [zero] solve hang
* [misc] solve booster hang by rename the variable
* solve hang when parallel mode = pp + dp
* [moe] implement submesh initialization
* [moe] add mixtral dp grad scaling when not all experts are activated
* [chore] manually revert unintended commit
* [chore] trivial fix
* [chore] arg pass & remove drop token
* [test] add mixtral modelling test
* [moe] implement tp
* [moe] test deepseek
* [moe] clean legacy code
* [Feature] MoE Ulysses Support (#5918)
* moe sp support
* moe sp bug solve
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [chore] minor fix
* [moe] init moe plugin comm setting with sp
* moe sp + ep bug fix
* [moe] finalize test (no pp)
* [moe] full test for deepseek and mixtral (pp + sp to fix)
* [chore] minor fix after rebase
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* [chore] solve moe ckpt test failure and some other arg pass failure
* [moe] remove ops
* [test] fix test: test_zero1_2
* [bug] fix: somehow logger hangs the program
* [moe] deepseek moe sp support
* [test] add check
* [deepseek] replace attn (a workaround for bug in transformers)
* [misc] skip redunant test
* [misc] remove debug/print code
* [moe] refactor mesh assignment
* Revert "[moe] implement submesh initialization"
This reverts commit 2f9bce6686d1415a83d5726dc5ff02222c742582.
* [chore] change moe_pg_mesh to private
* [misc] remove incompatible test config
* [misc] fix ci failure: change default value to false in moe plugin
* [misc] remove useless condition
* [chore] docstring
* [moe] remove force_overlap_comm flag and add warning instead
* [doc] add MoeHybridParallelPlugin docstring
* [moe] solve dp axis issue
* [chore] remove redundant test case, print string & reduce test tokens
* [feat] Dist Loader for Eval (#5950)
* support auto distributed data loader
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* support auto distributed data loader
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix tp error
* remove unused parameters
* remove unused
* update inference
* update docs
* update inference
---------
Co-authored-by: Michelle <qianranma8@gmail.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [lora] lora support hybrid parallel plugin (#5956)
* lora support hybrid plugin
* fix
* fix
* fix
* fix
* fp8 operators for compressed communication
cast_to_fp8, cast_from_fp8, all_reduce_fp8
* fix scaling algorithm in FP8 casting
* support fp8 communication in pipeline parallelism
* add fp8_communication flag in the script
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* shardformer fp8
* fix rebase
* remove all to all
* fix shardformer fp8 communication training degradation
* [fp8] support all-gather flat tensor (#5932)
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* Update low_level_optim.py
---------
Co-authored-by: YeAnbang <anbangy2@outlook.com>
Co-authored-by: Haze188 <haze188@qq.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Edenzzzz <wenxuan.tan@wisc.edu>
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: Runyu Lu <77330637+LRY89757@users.noreply.github.com>
Co-authored-by: Guangyao Zhang <xjtu521@qq.com>
Co-authored-by: YeAnbang <44796419+YeAnbang@users.noreply.github.com>
Co-authored-by: Hongxin Liu <lhx0217@gmail.com>
Co-authored-by: Stephan Kö <stephankoe@users.noreply.github.com>
Co-authored-by: アマデウス <kurisusnowdeng@users.noreply.github.com>
Co-authored-by: Tong Li <tong.li352711588@gmail.com>
Co-authored-by: zhurunhua <1281592874@qq.com>
Co-authored-by: Insu Jang <insujang@umich.edu>
Co-authored-by: Gao, Ruiyuan <905370712@qq.com>
Co-authored-by: hxwang <wang1570@e.ntu.edu.sg>
Co-authored-by: Michelle <qianranma8@gmail.com>
Co-authored-by: Wang Binluo <32676639+wangbluo@users.noreply.github.com>
Co-authored-by: HangXu <hangxu0304@gmail.com>
* [fp8]support all2all fp8 (#5953)
* support all2all fp8
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [fp8] add fp8 linear (#5967)
* [fp8] add fp8 linear
* [test] fix fp8 linear test condition
* [test] fix fp8 linear test condition
* [test] fix fp8 linear test condition
* [fp8] support fp8 amp for hybrid parallel plugin (#5975)
* [fp8] support fp8 amp for hybrid parallel plugin
* [test] add fp8 hook test
* [fp8] fix fp8 linear compatibility
* fix (#5976)
* [Feature]: support FP8 communication in DDP, FSDP, Gemini (#5928)
* support fp8_communication in the Torch DDP grad comm, FSDP grad comm, and FSDP params comm
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* implement communication hook for FSDP params all-gather
* added unit test for fp8 operators
* support fp8 communication in GeminiPlugin
* update training scripts to support fsdp and fp8 communication
* fixed some minor bugs observed in unit test
* add all_gather_into_tensor_flat_fp8
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* add skip the test if torch < 2.2.0
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* add skip the test if torch < 2.2.0
* add skip the test if torch < 2.2.0
* add fp8_comm flag
* rebase latest fp8 operators
* rebase latest fp8 operators
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [test ci]Feature/fp8 comm (#5981)
* fix
* fix
* fix
* [fp8] support gemini plugin (#5978)
* [fp8] refactor hook
* [fp8] support gemini plugin
* [example] add fp8 option for llama benchmark
* [fp8] use torch compile (torch >= 2.3.0) (#5979)
* [fp8] use torch compile (torch >= 2.4.0)
* [fp8] set use_fast_accum in linear
* [chore] formal version check
* [chore] fix sig
* [fp8]Moe support fp8 communication (#5977)
* fix
* support moe fp8
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
fix
fi
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [fp8] support hybrid parallel plugin (#5982)
* support fp8 comm for qwen2 model
* support fp8 comm for qwen2 model
* support fp8 comm for qwen2 model
* fp8
* fix
* bert and bloom
* chatglm and command
* gpt2,gptj,bert, falcon,blip2
* mistral,opy,sam,t5,vit,whisper
* fix
* fix
* fix
* [fp8] refactor fp8 linear with compile (#5993)
* [fp8] refactor fp8 linear with compile
* [fp8] fix linear test
* [fp8] fix linear test
* [fp8] support asynchronous FP8 communication (#5997)
* fix
* fix
* fix
* support async all2all
* support async op for all gather
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [fp8] update torch.compile for linear_fp8 to >= 2.4.0 (#6004)
* [fp8] linear perf enhancement
* [fp8]update reduce-scatter test (#6002)
* fix
* fix
* fix
* fix
* [fp8] add use_fp8 option for MoeHybridParallelPlugin (#6009)
* [fp8] zero support fp8 linear. (#6006)
* fix
* fix
* fix
* zero fp8
* zero fp8
* Update requirements.txt
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix the merge
* fix the merge
* fix the merge
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix the merge
* fix
* fix
* fix the merge
* fix
* fix
* fix
* fix
* fix
* fix the merge
* fix
* fix
* fix
* fix
* [fp8] Merge feature/fp8_comm to main branch of Colossalai (#6016)
* add SimPO
* fix dataloader
* remove debug code
* add orpo
* fix style
* fix colossalai, transformers version
* fix colossalai, transformers version
* fix colossalai, transformers version
* fix torch colossalai version
* update transformers version
* [shardformer] DeepseekMoE support (#5871)
* [Feature] deepseek moe expert parallel implement
* [misc] fix typo, remove redundant file (#5867)
* [misc] fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [Feature] deepseek support & unit test
* [misc] remove debug code & useless print
* [misc] fix typos (#5872)
* [Feature] remove modeling file, use auto config. (#5884)
* [misc] fix typos
* [Feature] deepseek support via auto model, remove modeling file
* [misc] delete useless file
* [misc] fix typos
* [Deepseek] remove redundant code (#5888)
* [misc] fix typos
* [Feature] deepseek support via auto model, remove modeling file
* [misc] delete useless file
* [misc] fix typos
* [misc] remove redundant code
* [Feature/deepseek] resolve comment. (#5889)
* [misc] fix typos
* [Feature] deepseek support via auto model, remove modeling file
* [misc] delete useless file
* [misc] fix typos
* [misc] remove redundant code
* [misc] mv module replacement into if branch
* [misc] add some warning message and modify some code in unit test
* [misc] fix typos
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [Hoxfix] Fix CUDA_DEVICE_MAX_CONNECTIONS for comm overlap
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [Feat] Diffusion Model(PixArtAlpha/StableDiffusion3) Support (#5838)
* Diffusion Model Inference support
* Stable Diffusion 3 Support
* pixartalpha support
* [HotFix] CI,import,requirements-test for #5838 (#5892)
* [Hot Fix] CI,import,requirements-test
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [Feature] Enable PP + SP for llama (#5868)
* fix cross-PP-stage position id length diff bug
* fix typo
* fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* use a one cross entropy func for all shardformer models
---------
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [ShardFormer] Add Ulysses Sequence Parallelism support for Command-R, Qwen2 and ChatGLM (#5897)
* add benchmark for sft, dpo, simpo, orpo. Add benchmarking result. Support lora with gradient checkpoint
* fix style
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix eval
* hotfix citation
* [zero] support all-gather overlap (#5898)
* [zero] support all-gather overlap
* [zero] add overlap all-gather flag
* [misc] fix typo
* [zero] update api
* fix orpo cross entropy loss
* [Auto Parallel]: Speed up intra-op plan generation by 44% (#5446)
* Remove unnecessary calls to deepcopy
* Build DimSpec's difference dict only once
This change considerably speeds up construction speed of DimSpec objects. The difference_dict is the same for each DimSpec object, so a single copy of it is enough.
* Fix documentation of DimSpec's difference method
* [ShardFormer] fix qwen2 sp (#5903)
* [compatibility] support torch 2.2 (#5875)
* Support Pytorch 2.2.2
* keep build_on_pr file and update .compatibility
* fix object_to_tensor usage when torch>=2.3.0 (#5820)
* [misc] support torch2.3 (#5893)
* [misc] support torch2.3
* [devops] update compatibility ci
* [devops] update compatibility ci
* [devops] add debug
* [devops] add debug
* [devops] add debug
* [devops] add debug
* [devops] remove debug
* [devops] remove debug
* [release] update version (#5912)
* [plugin] support all-gather overlap for hybrid parallel (#5919)
* [plugin] fixed all-gather overlap support for hybrid parallel
* add kto
* fix style, add kto data sample
* [Examples] Add lazy init to OPT and GPT examples (#5924)
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [ColossalChat] Hotfix for ColossalChat (#5910)
* add ignore and tiny llama
* fix path issue
* run style
* fix issue
* update bash
* add ignore and tiny llama
* fix path issue
* run style
* fix issue
* update bash
* fix ddp issue
* add Qwen 1.5 32B
* refactor tokenization
* [FIX BUG] UnboundLocalError: cannot access local variable 'default_conversation' where it is not associated with a value (#5931)
* cannot access local variable 'default_conversation' where it is not associated with a value
set default value for 'default_conversation'
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* fix test data
* refactor evaluation
* remove real data path
* remove real data path
* Add n_fused as an input from native_module (#5894)
* [FIX BUG] convert env param to int in (#5934)
* [Hotfix] Fix ZeRO typo #5936
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [Feature] Add a switch to control whether the model checkpoint needs to be saved after each epoch ends (#5941)
* Add a switch to control whether the model checkpoint needs to be saved after each epoch ends
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* fix style
* fix style
* fix style
* [shardformer] hotfix attn mask (#5945)
* [shardformer] hotfix attn mask (#5947)
* [Feat] Distrifusion Acceleration Support for Diffusion Inference (#5895)
* Distrifusion Support source
* comp comm overlap optimization
* sd3 benchmark
* pixart distrifusion bug fix
* sd3 bug fix and benchmark
* generation bug fix
* naming fix
* add docstring, fix counter and shape error
* add reference
* readme and requirement
* [zero] hotfix update master params (#5951)
* [release] update version (#5952)
* [Chat] Fix lora (#5946)
* fix merging
* remove filepath
* fix style
* Update README.md (#5958)
* [hotfix] Remove unused plan section (#5957)
* remove readme
* fix readme
* update
* [test] add mixtral for sequence classification
* [test] add mixtral transformer test
* [moe] fix plugin
* [test] mixtra pp shard test
* [chore] handle non member group
* [zero] solve hang
* [test] pass mixtral shardformer test
* [moe] implement transit between non moe tp and ep
* [zero] solve hang
* [misc] solve booster hang by rename the variable
* solve hang when parallel mode = pp + dp
* [moe] implement submesh initialization
* [moe] add mixtral dp grad scaling when not all experts are activated
* [chore] manually revert unintended commit
* [chore] trivial fix
* [chore] arg pass & remove drop token
* [test] add mixtral modelling test
* [moe] implement tp
* [moe] test deepseek
* [moe] clean legacy code
* [Feature] MoE Ulysses Support (#5918)
* moe sp support
* moe sp bug solve
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [chore] minor fix
* [moe] init moe plugin comm setting with sp
* moe sp + ep bug fix
* [moe] finalize test (no pp)
* [moe] full test for deepseek and mixtral (pp + sp to fix)
* [chore] minor fix after rebase
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* [chore] solve moe ckpt test failure and some other arg pass failure
* [moe] remove ops
* [test] fix test: test_zero1_2
* [bug] fix: somehow logger hangs the program
* [moe] deepseek moe sp support
* [test] add check
* [deepseek] replace attn (a workaround for bug in transformers)
* [misc] skip redunant test
* [misc] remove debug/print code
* [moe] refactor mesh assignment
* Revert "[moe] implement submesh initialization"
This reverts commit 2f9bce6686d1415a83d5726dc5ff02222c742582.
* [chore] change moe_pg_mesh to private
* [misc] remove incompatible test config
* [misc] fix ci failure: change default value to false in moe plugin
* [misc] remove useless condition
* [chore] docstring
* [moe] remove force_overlap_comm flag and add warning instead
* [doc] add MoeHybridParallelPlugin docstring
* [moe] solve dp axis issue
* [chore] remove redundant test case, print string & reduce test tokens
* [feat] Dist Loader for Eval (#5950)
* support auto distributed data loader
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* support auto distributed data loader
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix tp error
* remove unused parameters
* remove unused
* update inference
* update docs
* update inference
---------
Co-authored-by: Michelle <qianranma8@gmail.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [lora] lora support hybrid parallel plugin (#5956)
* lora support hybrid plugin
* fix
* fix
* fix
* fix
* Support overall loss, update KTO logging
* [Docs] clarify launch port
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [Hotfix] README link (#5966)
* update ignore
* update readme
* run style
* update readme
* [Hotfix] Avoid fused RMSnorm import error without apex (#5985)
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [Chat] fix readme (#5989)
* fix readme
* fix readme, tokenization fully tested
* fix readme, tokenization fully tested
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: root <root@notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9-0.notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9.colossal-ai.svc.cluster.local>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* fix sync condition (#6000)
* [plugin] add cast inputs option for zero (#6003)
* [pre-commit.ci] pre-commit autoupdate (#5995)
updates:
- [github.com/psf/black-pre-commit-mirror: 24.4.2 → 24.8.0](https://github.com/psf/black-pre-commit-mirror/compare/24.4.2...24.8.0)
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [misc] Bypass the huggingface bug to solve the mask mismatch problem (#5991)
* [Feature] Zigzag Ring attention (#5905)
* halfway
* fix cross-PP-stage position id length diff bug
* fix typo
* fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* unified cross entropy func for all shardformer models
* remove redundant lines
* add basic ring attn; debug cross entropy
* fwd bwd logic complete
* fwd bwd logic complete; add experimental triton rescale
* precision tests passed
* precision tests passed
* fix typos and remove misc files
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* add sp_mode to benchmark; fix varlen interface
* update softmax_lse shape by new interface
* change tester name
* remove buffer clone; support packed seq layout
* add varlen tests
* fix typo
* all tests passed
* add dkv_group; fix mask
* remove debug statements
---------
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [misc] update compatibility (#6008)
* [misc] update compatibility
* [misc] update requirements
* [devops] disable requirements cache
* [test] fix torch ddp test
* [test] fix rerun on address in use
* [test] fix lazy init
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix the merge
* fix the merge
* overlap kv comm with output rescale (#6017)
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* fix the merge
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix the merge
* fix
* fix
* fix the merge
* fix
* [misc] Use dist logger in plugins (#6011)
* use dist logger in plugins
* remove trash
* print on rank 0
---------
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* fix
* fix
* fix
* fix
* fix the merge
* fix
* fix
* fix
* fix
---------
Co-authored-by: YeAnbang <anbangy2@outlook.com>
Co-authored-by: Haze188 <haze188@qq.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Edenzzzz <wenxuan.tan@wisc.edu>
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: Runyu Lu <77330637+LRY89757@users.noreply.github.com>
Co-authored-by: Guangyao Zhang <xjtu521@qq.com>
Co-authored-by: YeAnbang <44796419+YeAnbang@users.noreply.github.com>
Co-authored-by: Hongxin Liu <lhx0217@gmail.com>
Co-authored-by: Stephan Kö <stephankoe@users.noreply.github.com>
Co-authored-by: アマデウス <kurisusnowdeng@users.noreply.github.com>
Co-authored-by: Tong Li <tong.li352711588@gmail.com>
Co-authored-by: zhurunhua <1281592874@qq.com>
Co-authored-by: Insu Jang <insujang@umich.edu>
Co-authored-by: Gao, Ruiyuan <905370712@qq.com>
Co-authored-by: hxwang <wang1570@e.ntu.edu.sg>
Co-authored-by: Michelle <qianranma8@gmail.com>
Co-authored-by: root <root@notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9-0.notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9.colossal-ai.svc.cluster.local>
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* Update train_dpo.py
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* Update low_level_zero_plugin.py
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* [CI] Remove triton version for compatibility bug; update req torch >=2.2 (#6018)
* remove triton version
* remove torch 2.2
* remove torch 2.1
* debug
* remove 2.1 build tests
* require torch >=2.2
---------
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [plugin] hotfix zero plugin (#6036)
* [plugin] hotfix zero plugin
* [plugin] hotfix zero plugin
* [Colossal-LLaMA] Refactor latest APIs (#6030)
* refactor latest code
* update api
* add dummy dataset
* update Readme
* add setup
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* update files
* add PP support
* update arguments
* update argument
* reorg folder
* update version
* remove IB infor
* update utils
* update readme
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* update save for zero
* update save
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* add apex
* update
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* add fused norm (#6038)
* [FP8] unsqueeze scale to make it compatible with torch.compile (#6040)
* [colossalai/checkpoint_io/...] fix bug in load_state_dict_into_model; format error msg (#6020)
* fix bug in load_state_dict_into_model; format error msg
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* Update utils.py
to support checking missing_keys
* Update general_checkpoint_io.py
fix bug in missing_keys error message
* retrigger tests
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [Hotfix] Remove deprecated install (#6042)
* remove deprecated install
* remove unused folder
* [fp8] optimize all-gather (#6043)
* [fp8] optimize all-gather
* [fp8] fix all gather fp8 ring
* [fp8] enable compile
* [fp8] fix all gather fp8 ring
* [fp8] fix linear hook (#6046)
* [fp8] disable all_to_all_fp8 in intranode (#6045)
* enhance all_to_all_fp8 with internode comm control
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* disable some fp8 ops due to performance issue
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [release] update version (#6041)
* [release] update version
* [devops] update comp test
* [devops] update comp test debug
* [devops] debug comp test
* [devops] debug comp test
* [devops] debug comp test
* [devops] debug comp test
* [devops] debug comp test
* [Feature] Split cross-entropy computation in SP (#5959)
* halfway
* fix cross-PP-stage position id length diff bug
* fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* unified cross entropy func for all shardformer models
* remove redundant lines
* add basic ring attn; debug cross entropy
* fwd bwd logic complete
* fwd bwd logic complete; add experimental triton rescale
* precision tests passed
* precision tests passed
* fix typos and remove misc files
* update softmax_lse shape by new interface
* change tester name
* remove buffer clone; support packed seq layout
* add varlen tests
* fix typo
* all tests passed
* add dkv_group; fix mask
* remove debug statements
* adapt chatglm, command-R, qwen
* debug
* halfway
* fix cross-PP-stage position id length diff bug
* fix typo
* fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* unified cross entropy func for all shardformer models
* remove redundant lines
* add basic ring attn; debug cross entropy
* fwd bwd logic complete
* fwd bwd logic complete; add experimental triton rescale
* precision tests passed
* precision tests passed
* fix typos and remove misc files
* add sp_mode to benchmark; fix varlen interface
* update softmax_lse shape by new interface
* add varlen tests
* fix typo
* all tests passed
* add dkv_group; fix mask
* remove debug statements
* add comments
* q1 index only once
* remove events to simplify stream sync
* simplify forward/backward logic
* 2d ring forward passed
* 2d ring backward passed
* fixes
* fix ring attn loss
* 2D ring backward + llama passed
* merge
* update logger
* fix typo
* rebase
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix typo
* remove typos
* fixes
* support GPT
---------
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [hotfix] moe hybrid parallelism benchmark & follow-up fix (#6048)
* [example] pass use_fp8_comm flag to all plugins
* [example] add mixtral benchmark
* [moe] refine assertion and check
* [moe] fix mixtral & add more tests
* [moe] consider checking dp * sp group and moe_dp_group
* [mixtral] remove gate tp & add more tests
* [deepseek] fix tp & sp for deepseek
* [mixtral] minor fix
* [deepseek] add deepseek benchmark
* [fp8] hotfix backward hook (#6053)
* [fp8] hotfix backward hook
* [fp8] hotfix pipeline loss accumulation
* [doc] update sp doc (#6055)
* update sp doc
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* fix the sp
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix the attn
* fix
* fix
* fix
* fix
* [zerobubble]Support ZeroBubble Pipeline (#6034)
* [feat] add zerobubble pp (just a frame now); add POC test for dx_dw; add test for zerobubble;
* [feat] add dw test;
* [fix] fix weight not close;
* [update] update text;
* [feat] add test run_fwd_bwd automatic scheduling;
* [feat] split communication and calculation; fix pop empty send_bwd_buffer error;
* [feat] add test for p & p grad;
* [feat] add comments for ZBV func;
* [fix] rm useless assign and comments;
* [fix] fix ci test; add pytest;
* [feat] add run_fwd_bwd_with_microbatch (replace input) & test; add p&p.grad assert close test & all pass;
* [feat] add apply v_schedule graph; p & p.grad assert err exist;
* [fix] update
* [feat] fix ci; add assert;
* [feat] fix poc format
* [feat] fix func name & ci; add comments;
* [fix] fix poc test; add comments in poc;
* [feat] add optim backward_b_by_grad
* [feat] fix optimizer bwd b & w; support return accum loss & output
* [feat] add fwd_bwd_step, run_fwd_only;
* [fix] fix optim bwd; add license for v_schedule; remove redundant attributes; fix schedule loop "while"--> "for"; add communication dict;
* [fix] fix communication_map;
* [feat] update test; rm comments;
* [fix] rm zbv in hybridplugin
* [fix] fix optim bwd;
* [fix] fix optim bwd;
* [fix] rm output.data after send fwd;
* [fix] fix bwd step if condition; remove useless comments and format info;
* [fix] fix detach output & release output;
* [fix] rm requir_grad for output;
* [fix] fix requir grad position and detach position and input&output local buffer append position;
* [feat] add memory assertation;
* [fix] fix mem check;
* [fix] mem assertation'
* [fix] fix mem assertation
* [fix] fix mem; use a new model shape; only assert mem less and equal than theo;
* [fix] fix model zoo import;
* [fix] fix redundant detach & clone; add buffer assertation in the end;
* [fix] add output_obj_grad assert None at bwd b step; replace input_obj.require_grad_ with treemap;
* [fix] update optim state dict assert (include param group & state); fix mem assert after add optim;
* [fix] add testcase with microbatch 4;
* [fp8] fix missing fp8_comm flag in mixtral (#6057)
* fix
* fix
* fix
* [fp8] Disable all_gather intranode. Disable Redundant all_gather fp8 (#6059)
* all_gather only internode, fix pytest
* fix cuda arch <89 compile pytest error
* fix pytest failure
* disable all_gather_into_tensor_flat_fp8
* fix fp8 format
* fix pytest
* fix conversations
* fix chunk tuple to list
* [doc] FP8 training and communication document (#6050)
* Add FP8 training and communication document
* add fp8 docstring for plugins
* fix typo
* fix typo
* fix
* fix
* [moe] add parallel strategy for shared_expert && fix test for deepseek (#6063)
* [ColossalEval] support for vllm (#6056)
* support vllm
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* modify vllm and update readme
* run pre-commit
* remove dupilicated lines and refine code
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* update param name
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* refine code
* update readme
* refine code
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [release] update version (#6062)
* [feat] add zerobubble pp (just a frame now); add POC test for dx_dw; add test for zerobubble;
* [update] update text;
* [feat] add test run_fwd_bwd automatic scheduling;
* [feat] fix poc format
* [fix] fix poc test; add comments in poc;
* [feat] add optim backward_b_by_grad
* [feat] fix optimizer bwd b & w; support return accum loss & output
* [fix] fix optim bwd; add license for v_schedule; remove redundant attributes; fix schedule loop "while"--> "for"; add communication dict;
* [feat] update test; rm comments;
* [fix] rm zbv in hybridplugin
* [fix] fix optim bwd;
* [fix] fix optim bwd;
* [fix] rm output.data after send fwd;
* [fix] fix bwd step if condition; remove useless comments and format info;
* [fix] fix mem check;
* [fix] fix mem assertation
* [fix] fix mem; use a new model shape; only assert mem less and equal than theo;
* [fix] fix model zoo import;
* [feat] moehybrid support zerobubble;
* [fix] fix zerobubble pp for shardformer type input;
* [fix] fix require_grad & deallocate call;
* [fix] fix mem assert;
* [fix] fix fwd branch, fwd pass both micro_batch & internal_inputs'
* [fix] fix pipeline util func deallocate --> release_tensor_data; fix bwd_b loss bwd branch;
* [fix] fix zerobubble; support shardformer model type;
* [fix] fix test_pipeline_utils ci;
* [plugin] hybrid support zero bubble pipeline (#6060)
* hybrid support zbv
* fix
fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* Update zero_bubble_pp.py
* fix
* fix-ci
* fix
[pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* fix
* [zerobubble]Support ZeroBubble Pipeline (#6034)
* [feat] add zerobubble pp (just a frame now); add POC test for dx_dw; add test for zerobubble;
* [feat] add dw test;
* [fix] fix weight not close;
* [update] update text;
* [feat] add test run_fwd_bwd automatic scheduling;
* [feat] split communication and calculation; fix pop empty send_bwd_buffer error;
* [feat] add test for p & p grad;
* [feat] add comments for ZBV func;
* [fix] rm useless assign and comments;
* [fix] fix ci test; add pytest;
* [feat] add run_fwd_bwd_with_microbatch (replace input) & test; add p&p.grad assert close test & all pass;
* [feat] add apply v_schedule graph; p & p.grad assert err exist;
* [fix] update
* [feat] fix ci; add assert;
* [feat] fix poc format
* [feat] fix func name & ci; add comments;
* [fix] fix poc test; add comments in poc;
* [feat] add optim backward_b_by_grad
* [feat] fix optimizer bwd b & w; support return accum loss & output
* [feat] add fwd_bwd_step, run_fwd_only;
* [fix] fix optim bwd; add license for v_schedule; remove redundant attributes; fix schedule loop "while"--> "for"; add communication dict;
* [fix] fix communication_map;
* [feat] update test; rm comments;
* [fix] rm zbv in hybridplugin
* [fix] fix optim bwd;
* [fix] fix optim bwd;
* [fix] rm output.data after send fwd;
* [fix] fix bwd step if condition; remove useless comments and format info;
* [fix] fix detach output & release output;
* [fix] rm requir_grad for output;
* [fix] fix requir grad position and detach position and input&output local buffer append position;
* [feat] add memory assertation;
* [fix] fix mem check;
* [fix] mem assertation'
* [fix] fix mem assertation
* [fix] fix mem; use a new model shape; only assert mem less and equal than theo;
* [fix] fix model zoo import;
* [fix] fix redundant detach & clone; add buffer assertation in the end;
* [fix] add output_obj_grad assert None at bwd b step; replace input_obj.require_grad_ with treemap;
* [fix] update optim state dict assert (include param group & state); fix mem assert after add optim;
* [fix] add testcase with microbatch 4;
* hybrid support zbv
* fix
fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* Update zero_bubble_pp.py
* fix
* fix-ci
* fix
[pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* fix
* fix
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* fix
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: duanjunwen <935724073@qq.com>
* [feat] add zerobubble pp (just a frame now); add POC test for dx_dw; add test for zerobubble;
* [update] update text;
* [feat] add test run_fwd_bwd automatic scheduling;
* [feat] fix poc format
* [fix] fix poc test; add comments in poc;
* [feat] add optim backward_b_by_grad
* [feat] fix optimizer bwd b & w; support return accum loss & output
* [fix] fix optim bwd; add license for v_schedule; remove redundant attributes; fix schedule loop "while"--> "for"; add communication dict;
* [feat] update test; rm comments;
* [fix] fix optim bwd;
* [fix] fix optim bwd;
* [fix] rm output.data after send fwd;
* [fix] fix bwd step if condition; remove useless comments and format info;
* [fix] fix mem check;
* [fix] fix mem assertation
* [fix] fix mem; use a new model shape; only assert mem less and equal than theo;
* [fix] fix model zoo import;
* [fix] fix mem assert;
* [fix] fix fwd branch, fwd pass both micro_batch & internal_inputs'
* [plugin] hybrid support zero bubble pipeline (#6060)
* hybrid support zbv
* fix
fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* Update zero_bubble_pp.py
* fix
* fix-ci
* fix
[pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* fix
* [zerobubble]Support ZeroBubble Pipeline (#6034)
* [feat] add zerobubble pp (just a frame now); add POC test for dx_dw; add test for zerobubble;
* [feat] add dw test;
* [fix] fix weight not close;
* [update] update text;
* [feat] add test run_fwd_bwd automatic scheduling;
* [feat] split communication and calculation; fix pop empty send_bwd_buffer error;
* [feat] add test for p & p grad;
* [feat] add comments for ZBV func;
* [fix] rm useless assign and comments;
* [fix] fix ci test; add pytest;
* [feat] add run_fwd_bwd_with_microbatch (replace input) & test; add p&p.grad assert close test & all pass;
* [feat] add apply v_schedule graph; p & p.grad assert err exist;
* [fix] update
* [feat] fix ci; add assert;
* [feat] fix poc format
* [feat] fix func name & ci; add comments;
* [fix] fix poc test; add comments in poc;
* [feat] add optim backward_b_by_grad
* [feat] fix optimizer bwd b & w; support return accum loss & output
* [feat] add fwd_bwd_step, run_fwd_only;
* [fix] fix optim bwd; add license for v_schedule; remove redundant attributes; fix schedule loop "while"--> "for"; add communication dict;
* [fix] fix communication_map;
* [feat] update test; rm comments;
* [fix] rm zbv in hybridplugin
* [fix] fix optim bwd;
* [fix] fix optim bwd;
* [fix] rm output.data after send fwd;
* [fix] fix bwd step if condition; remove useless comments and format info;
* [fix] fix detach output & release output;
* [fix] rm requir_grad for output;
* [fix] fix requir grad position and detach position and input&output local buffer append position;
* [feat] add memory assertation;
* [fix] fix mem check;
* [fix] mem assertation'
* [fix] fix mem assertation
* [fix] fix mem; use a new model shape; only assert mem less and equal than theo;
* [fix] fix model zoo import;
* [fix] fix redundant detach & clone; add buffer assertation in the end;
* [fix] add output_obj_grad assert None at bwd b step; replace input_obj.require_grad_ with treemap;
* [fix] update optim state dict assert (include param group & state); fix mem assert after add optim;
* [fix] add testcase with microbatch 4;
* hybrid support zbv
* fix
fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* Update zero_bubble_pp.py
* fix
* fix-ci
* fix
[pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* fix
* fix
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* fix
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: duanjunwen <935724073@qq.com>
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: HangXu <hangxu0304@gmail.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: GuangyaoZhang <xjtu521@qq.com>
Co-authored-by: Hongxin Liu <lhx0217@gmail.com>
Co-authored-by: YeAnbang <anbangy2@outlook.com>
Co-authored-by: Haze188 <haze188@qq.com>
Co-authored-by: Edenzzzz <wenxuan.tan@wisc.edu>
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: Runyu Lu <77330637+LRY89757@users.noreply.github.com>
Co-authored-by: YeAnbang <44796419+YeAnbang@users.noreply.github.com>
Co-authored-by: Stephan Kö <stephankoe@users.noreply.github.com>
Co-authored-by: アマデウス <kurisusnowdeng@users.noreply.github.com>
Co-authored-by: Tong Li <tong.li352711588@gmail.com>
Co-authored-by: zhurunhua <1281592874@qq.com>
Co-authored-by: Insu Jang <insujang@umich.edu>
Co-authored-by: Gao, Ruiyuan <905370712@qq.com>
Co-authored-by: hxwang <wang1570@e.ntu.edu.sg>
Co-authored-by: Michelle <qianranma8@gmail.com>
Co-authored-by: Wang Binluo <32676639+wangbluo@users.noreply.github.com>
Co-authored-by: wangbluo <2538539015@qq.com>
Co-authored-by: root <root@notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9-0.notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9.colossal-ai.svc.cluster.local>
Co-authored-by: duanjunwen <935724073@qq.com>
Co-authored-by: Camille Zhong <44392324+Camille7777@users.noreply.github.com>
2024-10-08 07:58:00 +00:00
|
|
|
query_layer = all_to_all_comm(
|
|
|
|
query_layer,
|
|
|
|
sp_group,
|
|
|
|
gather_dim=0,
|
|
|
|
fp8_communication=shard_config.fp8_communication,
|
|
|
|
)
|
|
|
|
key_layer = all_to_all_comm(
|
|
|
|
key_layer,
|
|
|
|
sp_group,
|
|
|
|
gather_dim=0,
|
|
|
|
fp8_communication=shard_config.fp8_communication,
|
|
|
|
)
|
|
|
|
value_layer = all_to_all_comm(
|
|
|
|
value_layer,
|
|
|
|
sp_group,
|
|
|
|
gather_dim=0,
|
|
|
|
fp8_communication=shard_config.fp8_communication,
|
|
|
|
)
|
2024-07-10 03:34:25 +00:00
|
|
|
|
|
|
|
query_layer = query_layer.view(
|
|
|
|
sq * sp_size,
|
|
|
|
bs,
|
|
|
|
self.num_attention_heads_per_partition // sp_size,
|
|
|
|
self.hidden_size_per_attention_head,
|
|
|
|
).contiguous()
|
|
|
|
|
|
|
|
key_layer = key_layer.view(
|
|
|
|
sq * sp_size,
|
|
|
|
bs,
|
|
|
|
self.num_attention_heads_per_partition // sp_size,
|
|
|
|
self.hidden_size_per_attention_head,
|
|
|
|
).contiguous()
|
|
|
|
|
|
|
|
value_layer = value_layer.view(
|
|
|
|
sq * sp_size,
|
|
|
|
bs,
|
|
|
|
self.num_attention_heads_per_partition // sp_size,
|
|
|
|
self.hidden_size_per_attention_head,
|
|
|
|
).contiguous()
|
|
|
|
|
|
|
|
# apply relative positional encoding (rotary embedding)
|
|
|
|
if rotary_pos_emb is not None:
|
|
|
|
query_layer = apply_rotary_pos_emb(query_layer, rotary_pos_emb)
|
|
|
|
key_layer = apply_rotary_pos_emb(key_layer, rotary_pos_emb)
|
|
|
|
|
|
|
|
# adjust key and value for inference
|
|
|
|
if kv_cache is not None:
|
|
|
|
cache_k, cache_v = kv_cache
|
|
|
|
key_layer = torch.cat((cache_k, key_layer), dim=0)
|
|
|
|
value_layer = torch.cat((cache_v, value_layer), dim=0)
|
|
|
|
if use_cache:
|
|
|
|
kv_cache = (key_layer, value_layer)
|
|
|
|
else:
|
|
|
|
kv_cache = None
|
|
|
|
|
|
|
|
if self.multi_query_attention:
|
|
|
|
key_layer = key_layer.unsqueeze(-2)
|
|
|
|
key_layer = key_layer.expand(
|
|
|
|
-1,
|
|
|
|
-1,
|
|
|
|
-1,
|
|
|
|
self.num_attention_heads_per_partition // self.num_multi_query_groups_per_partition,
|
|
|
|
-1,
|
|
|
|
)
|
|
|
|
key_layer = key_layer.contiguous().view(
|
|
|
|
key_layer.size()[:2]
|
|
|
|
+ (
|
|
|
|
self.num_attention_heads_per_partition,
|
|
|
|
self.hidden_size_per_attention_head,
|
|
|
|
)
|
|
|
|
)
|
|
|
|
value_layer = value_layer.unsqueeze(-2)
|
|
|
|
value_layer = value_layer.expand(
|
|
|
|
-1,
|
|
|
|
-1,
|
|
|
|
-1,
|
|
|
|
self.num_attention_heads_per_partition // self.num_multi_query_groups_per_partition,
|
|
|
|
-1,
|
|
|
|
)
|
|
|
|
value_layer = value_layer.contiguous().view(
|
|
|
|
value_layer.size()[:2]
|
|
|
|
+ (
|
|
|
|
self.num_attention_heads_per_partition // sp_size,
|
|
|
|
self.hidden_size_per_attention_head,
|
|
|
|
)
|
|
|
|
)
|
|
|
|
|
|
|
|
# ==================================
|
|
|
|
# core attention computation
|
|
|
|
# ==================================
|
|
|
|
|
|
|
|
context_layer = self.core_attention(query_layer, key_layer, value_layer, attention_mask)
|
|
|
|
if sp_mode == "all_to_all":
|
[zerobubble] rebase main (#6075)
* fp8 operators for compressed communication
cast_to_fp8, cast_from_fp8, all_reduce_fp8
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix typo
* fix scaling algorithm in FP8 casting
* support fp8 communication in pipeline parallelism
* add fp8_communication flag in the script
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* shardformer fp8
* fix rebase
* remove all to all
* fix shardformer fp8 communication training degradation
* [fp8] support all-gather flat tensor (#5932)
* [fp8] add fp8 comm for low level zero
* [test] add zero fp8 test case
* [Feature] llama shardformer fp8 support (#5938)
* add llama shardformer fp8
* Llama Shardformer Parity
* fix typo
* fix all reduce
* fix pytest failure
* fix reduce op and move function to fp8.py
* fix typo
* [FP8] rebase main (#5963)
* add SimPO
* fix dataloader
* remove debug code
* add orpo
* fix style
* fix colossalai, transformers version
* fix colossalai, transformers version
* fix colossalai, transformers version
* fix torch colossalai version
* update transformers version
* [shardformer] DeepseekMoE support (#5871)
* [Feature] deepseek moe expert parallel implement
* [misc] fix typo, remove redundant file (#5867)
* [misc] fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [Feature] deepseek support & unit test
* [misc] remove debug code & useless print
* [misc] fix typos (#5872)
* [Feature] remove modeling file, use auto config. (#5884)
* [misc] fix typos
* [Feature] deepseek support via auto model, remove modeling file
* [misc] delete useless file
* [misc] fix typos
* [Deepseek] remove redundant code (#5888)
* [misc] fix typos
* [Feature] deepseek support via auto model, remove modeling file
* [misc] delete useless file
* [misc] fix typos
* [misc] remove redundant code
* [Feature/deepseek] resolve comment. (#5889)
* [misc] fix typos
* [Feature] deepseek support via auto model, remove modeling file
* [misc] delete useless file
* [misc] fix typos
* [misc] remove redundant code
* [misc] mv module replacement into if branch
* [misc] add some warning message and modify some code in unit test
* [misc] fix typos
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [Hoxfix] Fix CUDA_DEVICE_MAX_CONNECTIONS for comm overlap
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [Feat] Diffusion Model(PixArtAlpha/StableDiffusion3) Support (#5838)
* Diffusion Model Inference support
* Stable Diffusion 3 Support
* pixartalpha support
* [HotFix] CI,import,requirements-test for #5838 (#5892)
* [Hot Fix] CI,import,requirements-test
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [Feature] Enable PP + SP for llama (#5868)
* fix cross-PP-stage position id length diff bug
* fix typo
* fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* use a one cross entropy func for all shardformer models
---------
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [ShardFormer] Add Ulysses Sequence Parallelism support for Command-R, Qwen2 and ChatGLM (#5897)
* add benchmark for sft, dpo, simpo, orpo. Add benchmarking result. Support lora with gradient checkpoint
* fix style
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix eval
* hotfix citation
* [zero] support all-gather overlap (#5898)
* [zero] support all-gather overlap
* [zero] add overlap all-gather flag
* [misc] fix typo
* [zero] update api
* fix orpo cross entropy loss
* [Auto Parallel]: Speed up intra-op plan generation by 44% (#5446)
* Remove unnecessary calls to deepcopy
* Build DimSpec's difference dict only once
This change considerably speeds up construction speed of DimSpec objects. The difference_dict is the same for each DimSpec object, so a single copy of it is enough.
* Fix documentation of DimSpec's difference method
* [ShardFormer] fix qwen2 sp (#5903)
* [compatibility] support torch 2.2 (#5875)
* Support Pytorch 2.2.2
* keep build_on_pr file and update .compatibility
* fix object_to_tensor usage when torch>=2.3.0 (#5820)
* [misc] support torch2.3 (#5893)
* [misc] support torch2.3
* [devops] update compatibility ci
* [devops] update compatibility ci
* [devops] add debug
* [devops] add debug
* [devops] add debug
* [devops] add debug
* [devops] remove debug
* [devops] remove debug
* [release] update version (#5912)
* [plugin] support all-gather overlap for hybrid parallel (#5919)
* [plugin] fixed all-gather overlap support for hybrid parallel
* add kto
* fix style, add kto data sample
* [Examples] Add lazy init to OPT and GPT examples (#5924)
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [ColossalChat] Hotfix for ColossalChat (#5910)
* add ignore and tiny llama
* fix path issue
* run style
* fix issue
* update bash
* add ignore and tiny llama
* fix path issue
* run style
* fix issue
* update bash
* fix ddp issue
* add Qwen 1.5 32B
* refactor tokenization
* [FIX BUG] UnboundLocalError: cannot access local variable 'default_conversation' where it is not associated with a value (#5931)
* cannot access local variable 'default_conversation' where it is not associated with a value
set default value for 'default_conversation'
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* fix test data
* refactor evaluation
* remove real data path
* remove real data path
* Add n_fused as an input from native_module (#5894)
* [FIX BUG] convert env param to int in (#5934)
* [Hotfix] Fix ZeRO typo #5936
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [Feature] Add a switch to control whether the model checkpoint needs to be saved after each epoch ends (#5941)
* Add a switch to control whether the model checkpoint needs to be saved after each epoch ends
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* fix style
* fix style
* fix style
* [shardformer] hotfix attn mask (#5945)
* [shardformer] hotfix attn mask (#5947)
* [Feat] Distrifusion Acceleration Support for Diffusion Inference (#5895)
* Distrifusion Support source
* comp comm overlap optimization
* sd3 benchmark
* pixart distrifusion bug fix
* sd3 bug fix and benchmark
* generation bug fix
* naming fix
* add docstring, fix counter and shape error
* add reference
* readme and requirement
* [zero] hotfix update master params (#5951)
* [release] update version (#5952)
* [Chat] Fix lora (#5946)
* fix merging
* remove filepath
* fix style
* Update README.md (#5958)
* [hotfix] Remove unused plan section (#5957)
* remove readme
* fix readme
* update
* [test] add mixtral for sequence classification
* [test] add mixtral transformer test
* [moe] fix plugin
* [test] mixtra pp shard test
* [chore] handle non member group
* [zero] solve hang
* [test] pass mixtral shardformer test
* [moe] implement transit between non moe tp and ep
* [zero] solve hang
* [misc] solve booster hang by rename the variable
* solve hang when parallel mode = pp + dp
* [moe] implement submesh initialization
* [moe] add mixtral dp grad scaling when not all experts are activated
* [chore] manually revert unintended commit
* [chore] trivial fix
* [chore] arg pass & remove drop token
* [test] add mixtral modelling test
* [moe] implement tp
* [moe] test deepseek
* [moe] clean legacy code
* [Feature] MoE Ulysses Support (#5918)
* moe sp support
* moe sp bug solve
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [chore] minor fix
* [moe] init moe plugin comm setting with sp
* moe sp + ep bug fix
* [moe] finalize test (no pp)
* [moe] full test for deepseek and mixtral (pp + sp to fix)
* [chore] minor fix after rebase
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* [chore] solve moe ckpt test failure and some other arg pass failure
* [moe] remove ops
* [test] fix test: test_zero1_2
* [bug] fix: somehow logger hangs the program
* [moe] deepseek moe sp support
* [test] add check
* [deepseek] replace attn (a workaround for bug in transformers)
* [misc] skip redunant test
* [misc] remove debug/print code
* [moe] refactor mesh assignment
* Revert "[moe] implement submesh initialization"
This reverts commit 2f9bce6686d1415a83d5726dc5ff02222c742582.
* [chore] change moe_pg_mesh to private
* [misc] remove incompatible test config
* [misc] fix ci failure: change default value to false in moe plugin
* [misc] remove useless condition
* [chore] docstring
* [moe] remove force_overlap_comm flag and add warning instead
* [doc] add MoeHybridParallelPlugin docstring
* [moe] solve dp axis issue
* [chore] remove redundant test case, print string & reduce test tokens
* [feat] Dist Loader for Eval (#5950)
* support auto distributed data loader
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* support auto distributed data loader
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix tp error
* remove unused parameters
* remove unused
* update inference
* update docs
* update inference
---------
Co-authored-by: Michelle <qianranma8@gmail.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [lora] lora support hybrid parallel plugin (#5956)
* lora support hybrid plugin
* fix
* fix
* fix
* fix
* fp8 operators for compressed communication
cast_to_fp8, cast_from_fp8, all_reduce_fp8
* fix scaling algorithm in FP8 casting
* support fp8 communication in pipeline parallelism
* add fp8_communication flag in the script
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* shardformer fp8
* fix rebase
* remove all to all
* fix shardformer fp8 communication training degradation
* [fp8] support all-gather flat tensor (#5932)
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* Update low_level_optim.py
---------
Co-authored-by: YeAnbang <anbangy2@outlook.com>
Co-authored-by: Haze188 <haze188@qq.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Edenzzzz <wenxuan.tan@wisc.edu>
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: Runyu Lu <77330637+LRY89757@users.noreply.github.com>
Co-authored-by: Guangyao Zhang <xjtu521@qq.com>
Co-authored-by: YeAnbang <44796419+YeAnbang@users.noreply.github.com>
Co-authored-by: Hongxin Liu <lhx0217@gmail.com>
Co-authored-by: Stephan Kö <stephankoe@users.noreply.github.com>
Co-authored-by: アマデウス <kurisusnowdeng@users.noreply.github.com>
Co-authored-by: Tong Li <tong.li352711588@gmail.com>
Co-authored-by: zhurunhua <1281592874@qq.com>
Co-authored-by: Insu Jang <insujang@umich.edu>
Co-authored-by: Gao, Ruiyuan <905370712@qq.com>
Co-authored-by: hxwang <wang1570@e.ntu.edu.sg>
Co-authored-by: Michelle <qianranma8@gmail.com>
Co-authored-by: Wang Binluo <32676639+wangbluo@users.noreply.github.com>
Co-authored-by: HangXu <hangxu0304@gmail.com>
* [fp8]support all2all fp8 (#5953)
* support all2all fp8
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [fp8] add fp8 linear (#5967)
* [fp8] add fp8 linear
* [test] fix fp8 linear test condition
* [test] fix fp8 linear test condition
* [test] fix fp8 linear test condition
* [fp8] support fp8 amp for hybrid parallel plugin (#5975)
* [fp8] support fp8 amp for hybrid parallel plugin
* [test] add fp8 hook test
* [fp8] fix fp8 linear compatibility
* fix (#5976)
* [Feature]: support FP8 communication in DDP, FSDP, Gemini (#5928)
* support fp8_communication in the Torch DDP grad comm, FSDP grad comm, and FSDP params comm
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* implement communication hook for FSDP params all-gather
* added unit test for fp8 operators
* support fp8 communication in GeminiPlugin
* update training scripts to support fsdp and fp8 communication
* fixed some minor bugs observed in unit test
* add all_gather_into_tensor_flat_fp8
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* add skip the test if torch < 2.2.0
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* add skip the test if torch < 2.2.0
* add skip the test if torch < 2.2.0
* add fp8_comm flag
* rebase latest fp8 operators
* rebase latest fp8 operators
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [test ci]Feature/fp8 comm (#5981)
* fix
* fix
* fix
* [fp8] support gemini plugin (#5978)
* [fp8] refactor hook
* [fp8] support gemini plugin
* [example] add fp8 option for llama benchmark
* [fp8] use torch compile (torch >= 2.3.0) (#5979)
* [fp8] use torch compile (torch >= 2.4.0)
* [fp8] set use_fast_accum in linear
* [chore] formal version check
* [chore] fix sig
* [fp8]Moe support fp8 communication (#5977)
* fix
* support moe fp8
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
fix
fi
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [fp8] support hybrid parallel plugin (#5982)
* support fp8 comm for qwen2 model
* support fp8 comm for qwen2 model
* support fp8 comm for qwen2 model
* fp8
* fix
* bert and bloom
* chatglm and command
* gpt2,gptj,bert, falcon,blip2
* mistral,opy,sam,t5,vit,whisper
* fix
* fix
* fix
* [fp8] refactor fp8 linear with compile (#5993)
* [fp8] refactor fp8 linear with compile
* [fp8] fix linear test
* [fp8] fix linear test
* [fp8] support asynchronous FP8 communication (#5997)
* fix
* fix
* fix
* support async all2all
* support async op for all gather
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [fp8] update torch.compile for linear_fp8 to >= 2.4.0 (#6004)
* [fp8] linear perf enhancement
* [fp8]update reduce-scatter test (#6002)
* fix
* fix
* fix
* fix
* [fp8] add use_fp8 option for MoeHybridParallelPlugin (#6009)
* [fp8] zero support fp8 linear. (#6006)
* fix
* fix
* fix
* zero fp8
* zero fp8
* Update requirements.txt
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix the merge
* fix the merge
* fix the merge
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix the merge
* fix
* fix
* fix the merge
* fix
* fix
* fix
* fix
* fix
* fix the merge
* fix
* fix
* fix
* fix
* [fp8] Merge feature/fp8_comm to main branch of Colossalai (#6016)
* add SimPO
* fix dataloader
* remove debug code
* add orpo
* fix style
* fix colossalai, transformers version
* fix colossalai, transformers version
* fix colossalai, transformers version
* fix torch colossalai version
* update transformers version
* [shardformer] DeepseekMoE support (#5871)
* [Feature] deepseek moe expert parallel implement
* [misc] fix typo, remove redundant file (#5867)
* [misc] fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [Feature] deepseek support & unit test
* [misc] remove debug code & useless print
* [misc] fix typos (#5872)
* [Feature] remove modeling file, use auto config. (#5884)
* [misc] fix typos
* [Feature] deepseek support via auto model, remove modeling file
* [misc] delete useless file
* [misc] fix typos
* [Deepseek] remove redundant code (#5888)
* [misc] fix typos
* [Feature] deepseek support via auto model, remove modeling file
* [misc] delete useless file
* [misc] fix typos
* [misc] remove redundant code
* [Feature/deepseek] resolve comment. (#5889)
* [misc] fix typos
* [Feature] deepseek support via auto model, remove modeling file
* [misc] delete useless file
* [misc] fix typos
* [misc] remove redundant code
* [misc] mv module replacement into if branch
* [misc] add some warning message and modify some code in unit test
* [misc] fix typos
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [Hoxfix] Fix CUDA_DEVICE_MAX_CONNECTIONS for comm overlap
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [Feat] Diffusion Model(PixArtAlpha/StableDiffusion3) Support (#5838)
* Diffusion Model Inference support
* Stable Diffusion 3 Support
* pixartalpha support
* [HotFix] CI,import,requirements-test for #5838 (#5892)
* [Hot Fix] CI,import,requirements-test
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [Feature] Enable PP + SP for llama (#5868)
* fix cross-PP-stage position id length diff bug
* fix typo
* fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* use a one cross entropy func for all shardformer models
---------
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [ShardFormer] Add Ulysses Sequence Parallelism support for Command-R, Qwen2 and ChatGLM (#5897)
* add benchmark for sft, dpo, simpo, orpo. Add benchmarking result. Support lora with gradient checkpoint
* fix style
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix eval
* hotfix citation
* [zero] support all-gather overlap (#5898)
* [zero] support all-gather overlap
* [zero] add overlap all-gather flag
* [misc] fix typo
* [zero] update api
* fix orpo cross entropy loss
* [Auto Parallel]: Speed up intra-op plan generation by 44% (#5446)
* Remove unnecessary calls to deepcopy
* Build DimSpec's difference dict only once
This change considerably speeds up construction speed of DimSpec objects. The difference_dict is the same for each DimSpec object, so a single copy of it is enough.
* Fix documentation of DimSpec's difference method
* [ShardFormer] fix qwen2 sp (#5903)
* [compatibility] support torch 2.2 (#5875)
* Support Pytorch 2.2.2
* keep build_on_pr file and update .compatibility
* fix object_to_tensor usage when torch>=2.3.0 (#5820)
* [misc] support torch2.3 (#5893)
* [misc] support torch2.3
* [devops] update compatibility ci
* [devops] update compatibility ci
* [devops] add debug
* [devops] add debug
* [devops] add debug
* [devops] add debug
* [devops] remove debug
* [devops] remove debug
* [release] update version (#5912)
* [plugin] support all-gather overlap for hybrid parallel (#5919)
* [plugin] fixed all-gather overlap support for hybrid parallel
* add kto
* fix style, add kto data sample
* [Examples] Add lazy init to OPT and GPT examples (#5924)
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [ColossalChat] Hotfix for ColossalChat (#5910)
* add ignore and tiny llama
* fix path issue
* run style
* fix issue
* update bash
* add ignore and tiny llama
* fix path issue
* run style
* fix issue
* update bash
* fix ddp issue
* add Qwen 1.5 32B
* refactor tokenization
* [FIX BUG] UnboundLocalError: cannot access local variable 'default_conversation' where it is not associated with a value (#5931)
* cannot access local variable 'default_conversation' where it is not associated with a value
set default value for 'default_conversation'
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* fix test data
* refactor evaluation
* remove real data path
* remove real data path
* Add n_fused as an input from native_module (#5894)
* [FIX BUG] convert env param to int in (#5934)
* [Hotfix] Fix ZeRO typo #5936
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [Feature] Add a switch to control whether the model checkpoint needs to be saved after each epoch ends (#5941)
* Add a switch to control whether the model checkpoint needs to be saved after each epoch ends
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* fix style
* fix style
* fix style
* [shardformer] hotfix attn mask (#5945)
* [shardformer] hotfix attn mask (#5947)
* [Feat] Distrifusion Acceleration Support for Diffusion Inference (#5895)
* Distrifusion Support source
* comp comm overlap optimization
* sd3 benchmark
* pixart distrifusion bug fix
* sd3 bug fix and benchmark
* generation bug fix
* naming fix
* add docstring, fix counter and shape error
* add reference
* readme and requirement
* [zero] hotfix update master params (#5951)
* [release] update version (#5952)
* [Chat] Fix lora (#5946)
* fix merging
* remove filepath
* fix style
* Update README.md (#5958)
* [hotfix] Remove unused plan section (#5957)
* remove readme
* fix readme
* update
* [test] add mixtral for sequence classification
* [test] add mixtral transformer test
* [moe] fix plugin
* [test] mixtra pp shard test
* [chore] handle non member group
* [zero] solve hang
* [test] pass mixtral shardformer test
* [moe] implement transit between non moe tp and ep
* [zero] solve hang
* [misc] solve booster hang by rename the variable
* solve hang when parallel mode = pp + dp
* [moe] implement submesh initialization
* [moe] add mixtral dp grad scaling when not all experts are activated
* [chore] manually revert unintended commit
* [chore] trivial fix
* [chore] arg pass & remove drop token
* [test] add mixtral modelling test
* [moe] implement tp
* [moe] test deepseek
* [moe] clean legacy code
* [Feature] MoE Ulysses Support (#5918)
* moe sp support
* moe sp bug solve
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [chore] minor fix
* [moe] init moe plugin comm setting with sp
* moe sp + ep bug fix
* [moe] finalize test (no pp)
* [moe] full test for deepseek and mixtral (pp + sp to fix)
* [chore] minor fix after rebase
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* [chore] solve moe ckpt test failure and some other arg pass failure
* [moe] remove ops
* [test] fix test: test_zero1_2
* [bug] fix: somehow logger hangs the program
* [moe] deepseek moe sp support
* [test] add check
* [deepseek] replace attn (a workaround for bug in transformers)
* [misc] skip redunant test
* [misc] remove debug/print code
* [moe] refactor mesh assignment
* Revert "[moe] implement submesh initialization"
This reverts commit 2f9bce6686d1415a83d5726dc5ff02222c742582.
* [chore] change moe_pg_mesh to private
* [misc] remove incompatible test config
* [misc] fix ci failure: change default value to false in moe plugin
* [misc] remove useless condition
* [chore] docstring
* [moe] remove force_overlap_comm flag and add warning instead
* [doc] add MoeHybridParallelPlugin docstring
* [moe] solve dp axis issue
* [chore] remove redundant test case, print string & reduce test tokens
* [feat] Dist Loader for Eval (#5950)
* support auto distributed data loader
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* support auto distributed data loader
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix tp error
* remove unused parameters
* remove unused
* update inference
* update docs
* update inference
---------
Co-authored-by: Michelle <qianranma8@gmail.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [lora] lora support hybrid parallel plugin (#5956)
* lora support hybrid plugin
* fix
* fix
* fix
* fix
* Support overall loss, update KTO logging
* [Docs] clarify launch port
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [Hotfix] README link (#5966)
* update ignore
* update readme
* run style
* update readme
* [Hotfix] Avoid fused RMSnorm import error without apex (#5985)
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [Chat] fix readme (#5989)
* fix readme
* fix readme, tokenization fully tested
* fix readme, tokenization fully tested
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: root <root@notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9-0.notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9.colossal-ai.svc.cluster.local>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* fix sync condition (#6000)
* [plugin] add cast inputs option for zero (#6003)
* [pre-commit.ci] pre-commit autoupdate (#5995)
updates:
- [github.com/psf/black-pre-commit-mirror: 24.4.2 → 24.8.0](https://github.com/psf/black-pre-commit-mirror/compare/24.4.2...24.8.0)
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [misc] Bypass the huggingface bug to solve the mask mismatch problem (#5991)
* [Feature] Zigzag Ring attention (#5905)
* halfway
* fix cross-PP-stage position id length diff bug
* fix typo
* fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* unified cross entropy func for all shardformer models
* remove redundant lines
* add basic ring attn; debug cross entropy
* fwd bwd logic complete
* fwd bwd logic complete; add experimental triton rescale
* precision tests passed
* precision tests passed
* fix typos and remove misc files
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* add sp_mode to benchmark; fix varlen interface
* update softmax_lse shape by new interface
* change tester name
* remove buffer clone; support packed seq layout
* add varlen tests
* fix typo
* all tests passed
* add dkv_group; fix mask
* remove debug statements
---------
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [misc] update compatibility (#6008)
* [misc] update compatibility
* [misc] update requirements
* [devops] disable requirements cache
* [test] fix torch ddp test
* [test] fix rerun on address in use
* [test] fix lazy init
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix the merge
* fix the merge
* overlap kv comm with output rescale (#6017)
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* fix the merge
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix the merge
* fix
* fix
* fix the merge
* fix
* [misc] Use dist logger in plugins (#6011)
* use dist logger in plugins
* remove trash
* print on rank 0
---------
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* fix
* fix
* fix
* fix
* fix the merge
* fix
* fix
* fix
* fix
---------
Co-authored-by: YeAnbang <anbangy2@outlook.com>
Co-authored-by: Haze188 <haze188@qq.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Edenzzzz <wenxuan.tan@wisc.edu>
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: Runyu Lu <77330637+LRY89757@users.noreply.github.com>
Co-authored-by: Guangyao Zhang <xjtu521@qq.com>
Co-authored-by: YeAnbang <44796419+YeAnbang@users.noreply.github.com>
Co-authored-by: Hongxin Liu <lhx0217@gmail.com>
Co-authored-by: Stephan Kö <stephankoe@users.noreply.github.com>
Co-authored-by: アマデウス <kurisusnowdeng@users.noreply.github.com>
Co-authored-by: Tong Li <tong.li352711588@gmail.com>
Co-authored-by: zhurunhua <1281592874@qq.com>
Co-authored-by: Insu Jang <insujang@umich.edu>
Co-authored-by: Gao, Ruiyuan <905370712@qq.com>
Co-authored-by: hxwang <wang1570@e.ntu.edu.sg>
Co-authored-by: Michelle <qianranma8@gmail.com>
Co-authored-by: root <root@notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9-0.notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9.colossal-ai.svc.cluster.local>
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* Update train_dpo.py
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* Update low_level_zero_plugin.py
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* [CI] Remove triton version for compatibility bug; update req torch >=2.2 (#6018)
* remove triton version
* remove torch 2.2
* remove torch 2.1
* debug
* remove 2.1 build tests
* require torch >=2.2
---------
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [plugin] hotfix zero plugin (#6036)
* [plugin] hotfix zero plugin
* [plugin] hotfix zero plugin
* [Colossal-LLaMA] Refactor latest APIs (#6030)
* refactor latest code
* update api
* add dummy dataset
* update Readme
* add setup
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* update files
* add PP support
* update arguments
* update argument
* reorg folder
* update version
* remove IB infor
* update utils
* update readme
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* update save for zero
* update save
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* add apex
* update
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* add fused norm (#6038)
* [FP8] unsqueeze scale to make it compatible with torch.compile (#6040)
* [colossalai/checkpoint_io/...] fix bug in load_state_dict_into_model; format error msg (#6020)
* fix bug in load_state_dict_into_model; format error msg
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* Update utils.py
to support checking missing_keys
* Update general_checkpoint_io.py
fix bug in missing_keys error message
* retrigger tests
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [Hotfix] Remove deprecated install (#6042)
* remove deprecated install
* remove unused folder
* [fp8] optimize all-gather (#6043)
* [fp8] optimize all-gather
* [fp8] fix all gather fp8 ring
* [fp8] enable compile
* [fp8] fix all gather fp8 ring
* [fp8] fix linear hook (#6046)
* [fp8] disable all_to_all_fp8 in intranode (#6045)
* enhance all_to_all_fp8 with internode comm control
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* disable some fp8 ops due to performance issue
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [release] update version (#6041)
* [release] update version
* [devops] update comp test
* [devops] update comp test debug
* [devops] debug comp test
* [devops] debug comp test
* [devops] debug comp test
* [devops] debug comp test
* [devops] debug comp test
* [Feature] Split cross-entropy computation in SP (#5959)
* halfway
* fix cross-PP-stage position id length diff bug
* fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* unified cross entropy func for all shardformer models
* remove redundant lines
* add basic ring attn; debug cross entropy
* fwd bwd logic complete
* fwd bwd logic complete; add experimental triton rescale
* precision tests passed
* precision tests passed
* fix typos and remove misc files
* update softmax_lse shape by new interface
* change tester name
* remove buffer clone; support packed seq layout
* add varlen tests
* fix typo
* all tests passed
* add dkv_group; fix mask
* remove debug statements
* adapt chatglm, command-R, qwen
* debug
* halfway
* fix cross-PP-stage position id length diff bug
* fix typo
* fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* unified cross entropy func for all shardformer models
* remove redundant lines
* add basic ring attn; debug cross entropy
* fwd bwd logic complete
* fwd bwd logic complete; add experimental triton rescale
* precision tests passed
* precision tests passed
* fix typos and remove misc files
* add sp_mode to benchmark; fix varlen interface
* update softmax_lse shape by new interface
* add varlen tests
* fix typo
* all tests passed
* add dkv_group; fix mask
* remove debug statements
* add comments
* q1 index only once
* remove events to simplify stream sync
* simplify forward/backward logic
* 2d ring forward passed
* 2d ring backward passed
* fixes
* fix ring attn loss
* 2D ring backward + llama passed
* merge
* update logger
* fix typo
* rebase
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix typo
* remove typos
* fixes
* support GPT
---------
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [hotfix] moe hybrid parallelism benchmark & follow-up fix (#6048)
* [example] pass use_fp8_comm flag to all plugins
* [example] add mixtral benchmark
* [moe] refine assertion and check
* [moe] fix mixtral & add more tests
* [moe] consider checking dp * sp group and moe_dp_group
* [mixtral] remove gate tp & add more tests
* [deepseek] fix tp & sp for deepseek
* [mixtral] minor fix
* [deepseek] add deepseek benchmark
* [fp8] hotfix backward hook (#6053)
* [fp8] hotfix backward hook
* [fp8] hotfix pipeline loss accumulation
* [doc] update sp doc (#6055)
* update sp doc
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* fix the sp
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix the attn
* fix
* fix
* fix
* fix
* [zerobubble]Support ZeroBubble Pipeline (#6034)
* [feat] add zerobubble pp (just a frame now); add POC test for dx_dw; add test for zerobubble;
* [feat] add dw test;
* [fix] fix weight not close;
* [update] update text;
* [feat] add test run_fwd_bwd automatic scheduling;
* [feat] split communication and calculation; fix pop empty send_bwd_buffer error;
* [feat] add test for p & p grad;
* [feat] add comments for ZBV func;
* [fix] rm useless assign and comments;
* [fix] fix ci test; add pytest;
* [feat] add run_fwd_bwd_with_microbatch (replace input) & test; add p&p.grad assert close test & all pass;
* [feat] add apply v_schedule graph; p & p.grad assert err exist;
* [fix] update
* [feat] fix ci; add assert;
* [feat] fix poc format
* [feat] fix func name & ci; add comments;
* [fix] fix poc test; add comments in poc;
* [feat] add optim backward_b_by_grad
* [feat] fix optimizer bwd b & w; support return accum loss & output
* [feat] add fwd_bwd_step, run_fwd_only;
* [fix] fix optim bwd; add license for v_schedule; remove redundant attributes; fix schedule loop "while"--> "for"; add communication dict;
* [fix] fix communication_map;
* [feat] update test; rm comments;
* [fix] rm zbv in hybridplugin
* [fix] fix optim bwd;
* [fix] fix optim bwd;
* [fix] rm output.data after send fwd;
* [fix] fix bwd step if condition; remove useless comments and format info;
* [fix] fix detach output & release output;
* [fix] rm requir_grad for output;
* [fix] fix requir grad position and detach position and input&output local buffer append position;
* [feat] add memory assertation;
* [fix] fix mem check;
* [fix] mem assertation'
* [fix] fix mem assertation
* [fix] fix mem; use a new model shape; only assert mem less and equal than theo;
* [fix] fix model zoo import;
* [fix] fix redundant detach & clone; add buffer assertation in the end;
* [fix] add output_obj_grad assert None at bwd b step; replace input_obj.require_grad_ with treemap;
* [fix] update optim state dict assert (include param group & state); fix mem assert after add optim;
* [fix] add testcase with microbatch 4;
* [fp8] fix missing fp8_comm flag in mixtral (#6057)
* fix
* fix
* fix
* [fp8] Disable all_gather intranode. Disable Redundant all_gather fp8 (#6059)
* all_gather only internode, fix pytest
* fix cuda arch <89 compile pytest error
* fix pytest failure
* disable all_gather_into_tensor_flat_fp8
* fix fp8 format
* fix pytest
* fix conversations
* fix chunk tuple to list
* [doc] FP8 training and communication document (#6050)
* Add FP8 training and communication document
* add fp8 docstring for plugins
* fix typo
* fix typo
* fix
* fix
* [moe] add parallel strategy for shared_expert && fix test for deepseek (#6063)
* [ColossalEval] support for vllm (#6056)
* support vllm
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* modify vllm and update readme
* run pre-commit
* remove dupilicated lines and refine code
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* update param name
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* refine code
* update readme
* refine code
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [release] update version (#6062)
* [feat] add zerobubble pp (just a frame now); add POC test for dx_dw; add test for zerobubble;
* [update] update text;
* [feat] add test run_fwd_bwd automatic scheduling;
* [feat] fix poc format
* [fix] fix poc test; add comments in poc;
* [feat] add optim backward_b_by_grad
* [feat] fix optimizer bwd b & w; support return accum loss & output
* [fix] fix optim bwd; add license for v_schedule; remove redundant attributes; fix schedule loop "while"--> "for"; add communication dict;
* [feat] update test; rm comments;
* [fix] rm zbv in hybridplugin
* [fix] fix optim bwd;
* [fix] fix optim bwd;
* [fix] rm output.data after send fwd;
* [fix] fix bwd step if condition; remove useless comments and format info;
* [fix] fix mem check;
* [fix] fix mem assertation
* [fix] fix mem; use a new model shape; only assert mem less and equal than theo;
* [fix] fix model zoo import;
* [feat] moehybrid support zerobubble;
* [fix] fix zerobubble pp for shardformer type input;
* [fix] fix require_grad & deallocate call;
* [fix] fix mem assert;
* [fix] fix fwd branch, fwd pass both micro_batch & internal_inputs'
* [fix] fix pipeline util func deallocate --> release_tensor_data; fix bwd_b loss bwd branch;
* [fix] fix zerobubble; support shardformer model type;
* [fix] fix test_pipeline_utils ci;
* [plugin] hybrid support zero bubble pipeline (#6060)
* hybrid support zbv
* fix
fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* Update zero_bubble_pp.py
* fix
* fix-ci
* fix
[pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* fix
* [zerobubble]Support ZeroBubble Pipeline (#6034)
* [feat] add zerobubble pp (just a frame now); add POC test for dx_dw; add test for zerobubble;
* [feat] add dw test;
* [fix] fix weight not close;
* [update] update text;
* [feat] add test run_fwd_bwd automatic scheduling;
* [feat] split communication and calculation; fix pop empty send_bwd_buffer error;
* [feat] add test for p & p grad;
* [feat] add comments for ZBV func;
* [fix] rm useless assign and comments;
* [fix] fix ci test; add pytest;
* [feat] add run_fwd_bwd_with_microbatch (replace input) & test; add p&p.grad assert close test & all pass;
* [feat] add apply v_schedule graph; p & p.grad assert err exist;
* [fix] update
* [feat] fix ci; add assert;
* [feat] fix poc format
* [feat] fix func name & ci; add comments;
* [fix] fix poc test; add comments in poc;
* [feat] add optim backward_b_by_grad
* [feat] fix optimizer bwd b & w; support return accum loss & output
* [feat] add fwd_bwd_step, run_fwd_only;
* [fix] fix optim bwd; add license for v_schedule; remove redundant attributes; fix schedule loop "while"--> "for"; add communication dict;
* [fix] fix communication_map;
* [feat] update test; rm comments;
* [fix] rm zbv in hybridplugin
* [fix] fix optim bwd;
* [fix] fix optim bwd;
* [fix] rm output.data after send fwd;
* [fix] fix bwd step if condition; remove useless comments and format info;
* [fix] fix detach output & release output;
* [fix] rm requir_grad for output;
* [fix] fix requir grad position and detach position and input&output local buffer append position;
* [feat] add memory assertation;
* [fix] fix mem check;
* [fix] mem assertation'
* [fix] fix mem assertation
* [fix] fix mem; use a new model shape; only assert mem less and equal than theo;
* [fix] fix model zoo import;
* [fix] fix redundant detach & clone; add buffer assertation in the end;
* [fix] add output_obj_grad assert None at bwd b step; replace input_obj.require_grad_ with treemap;
* [fix] update optim state dict assert (include param group & state); fix mem assert after add optim;
* [fix] add testcase with microbatch 4;
* hybrid support zbv
* fix
fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* Update zero_bubble_pp.py
* fix
* fix-ci
* fix
[pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* fix
* fix
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* fix
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: duanjunwen <935724073@qq.com>
* [feat] add zerobubble pp (just a frame now); add POC test for dx_dw; add test for zerobubble;
* [update] update text;
* [feat] add test run_fwd_bwd automatic scheduling;
* [feat] fix poc format
* [fix] fix poc test; add comments in poc;
* [feat] add optim backward_b_by_grad
* [feat] fix optimizer bwd b & w; support return accum loss & output
* [fix] fix optim bwd; add license for v_schedule; remove redundant attributes; fix schedule loop "while"--> "for"; add communication dict;
* [feat] update test; rm comments;
* [fix] fix optim bwd;
* [fix] fix optim bwd;
* [fix] rm output.data after send fwd;
* [fix] fix bwd step if condition; remove useless comments and format info;
* [fix] fix mem check;
* [fix] fix mem assertation
* [fix] fix mem; use a new model shape; only assert mem less and equal than theo;
* [fix] fix model zoo import;
* [fix] fix mem assert;
* [fix] fix fwd branch, fwd pass both micro_batch & internal_inputs'
* [plugin] hybrid support zero bubble pipeline (#6060)
* hybrid support zbv
* fix
fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* Update zero_bubble_pp.py
* fix
* fix-ci
* fix
[pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* fix
* [zerobubble]Support ZeroBubble Pipeline (#6034)
* [feat] add zerobubble pp (just a frame now); add POC test for dx_dw; add test for zerobubble;
* [feat] add dw test;
* [fix] fix weight not close;
* [update] update text;
* [feat] add test run_fwd_bwd automatic scheduling;
* [feat] split communication and calculation; fix pop empty send_bwd_buffer error;
* [feat] add test for p & p grad;
* [feat] add comments for ZBV func;
* [fix] rm useless assign and comments;
* [fix] fix ci test; add pytest;
* [feat] add run_fwd_bwd_with_microbatch (replace input) & test; add p&p.grad assert close test & all pass;
* [feat] add apply v_schedule graph; p & p.grad assert err exist;
* [fix] update
* [feat] fix ci; add assert;
* [feat] fix poc format
* [feat] fix func name & ci; add comments;
* [fix] fix poc test; add comments in poc;
* [feat] add optim backward_b_by_grad
* [feat] fix optimizer bwd b & w; support return accum loss & output
* [feat] add fwd_bwd_step, run_fwd_only;
* [fix] fix optim bwd; add license for v_schedule; remove redundant attributes; fix schedule loop "while"--> "for"; add communication dict;
* [fix] fix communication_map;
* [feat] update test; rm comments;
* [fix] rm zbv in hybridplugin
* [fix] fix optim bwd;
* [fix] fix optim bwd;
* [fix] rm output.data after send fwd;
* [fix] fix bwd step if condition; remove useless comments and format info;
* [fix] fix detach output & release output;
* [fix] rm requir_grad for output;
* [fix] fix requir grad position and detach position and input&output local buffer append position;
* [feat] add memory assertation;
* [fix] fix mem check;
* [fix] mem assertation'
* [fix] fix mem assertation
* [fix] fix mem; use a new model shape; only assert mem less and equal than theo;
* [fix] fix model zoo import;
* [fix] fix redundant detach & clone; add buffer assertation in the end;
* [fix] add output_obj_grad assert None at bwd b step; replace input_obj.require_grad_ with treemap;
* [fix] update optim state dict assert (include param group & state); fix mem assert after add optim;
* [fix] add testcase with microbatch 4;
* hybrid support zbv
* fix
fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* Update zero_bubble_pp.py
* fix
* fix-ci
* fix
[pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* fix
* fix
* fix
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* fix
* fix
* fix
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: duanjunwen <935724073@qq.com>
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: HangXu <hangxu0304@gmail.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: GuangyaoZhang <xjtu521@qq.com>
Co-authored-by: Hongxin Liu <lhx0217@gmail.com>
Co-authored-by: YeAnbang <anbangy2@outlook.com>
Co-authored-by: Haze188 <haze188@qq.com>
Co-authored-by: Edenzzzz <wenxuan.tan@wisc.edu>
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: Runyu Lu <77330637+LRY89757@users.noreply.github.com>
Co-authored-by: YeAnbang <44796419+YeAnbang@users.noreply.github.com>
Co-authored-by: Stephan Kö <stephankoe@users.noreply.github.com>
Co-authored-by: アマデウス <kurisusnowdeng@users.noreply.github.com>
Co-authored-by: Tong Li <tong.li352711588@gmail.com>
Co-authored-by: zhurunhua <1281592874@qq.com>
Co-authored-by: Insu Jang <insujang@umich.edu>
Co-authored-by: Gao, Ruiyuan <905370712@qq.com>
Co-authored-by: hxwang <wang1570@e.ntu.edu.sg>
Co-authored-by: Michelle <qianranma8@gmail.com>
Co-authored-by: Wang Binluo <32676639+wangbluo@users.noreply.github.com>
Co-authored-by: wangbluo <2538539015@qq.com>
Co-authored-by: root <root@notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9-0.notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9.colossal-ai.svc.cluster.local>
Co-authored-by: duanjunwen <935724073@qq.com>
Co-authored-by: Camille Zhong <44392324+Camille7777@users.noreply.github.com>
2024-10-08 07:58:00 +00:00
|
|
|
context_layer = all_to_all_comm(
|
|
|
|
context_layer,
|
|
|
|
sp_group,
|
|
|
|
gather_dim=2,
|
|
|
|
scatter_dim=0,
|
|
|
|
fp8_communication=shard_config.fp8_communication,
|
|
|
|
)
|
2024-07-10 03:34:25 +00:00
|
|
|
|
|
|
|
# =================
|
|
|
|
# Output. [sq, b, h]
|
|
|
|
# =================
|
|
|
|
output = self.dense(context_layer)
|
|
|
|
|
|
|
|
return output, kv_cache
|
|
|
|
|
|
|
|
return forward
|