import copy
import logging
import os
from functools import reduce
from pathlib import Path
from shutil import rmtree
from typing import Dict, Iterator, Optional, OrderedDict, Tuple

import torch
import torch.distributed as dist
import torch.nn as nn
from torch.distributed import ProcessGroup
from torch.optim.lr_scheduler import _LRScheduler as LRScheduler

from colossalai.cluster import DistCoordinator
from colossalai.interface import ModelWrapper, OptimizerWrapper
from colossalai.tensor.padded_tensor import (
    init_as_padded_tensor,
    is_padded_tensor,
    to_padded_tensor,
    to_unpadded_tensor,
)
from colossalai.utils import get_current_device

from .general_checkpoint_io import GeneralCheckpointIO
from .index_file import CheckpointIndexFile
from .utils import (
    StateDictSharder,
    gather_distributed_param,
    get_model_base_filenames,
    get_optimizer_base_filenames,
    is_safetensors_available,
    load_shard_state_dict,
    load_state_dict,
    load_state_dict_into_model,
    load_states_into_optimizer,
    save_config_file,
    save_param_groups,
    save_state_dict,
    save_state_dict_shards,
    search_padding_dim,
    search_tp_partition_dim,
    sharded_optimizer_loading_epilogue,
)

try:
    from torch.nn.modules.module import _EXTRA_STATE_KEY_SUFFIX
except ImportError:
    _EXTRA_STATE_KEY_SUFFIX = "_extra_state"


class HybridParallelCheckpointIO(GeneralCheckpointIO):
    """
    CheckpointIO for Hybrid Parallel Training.

    Args:
        dp_group (ProcessGroup): Process group along data parallel dimension.
        pp_group (ProcessGroup): Process group along pipeline parallel dimension.
        tp_group (ProcessGroup): Process group along tensor parallel dimension.
        zero_stage (int): The zero stage of plugin. Should be in [0, 1, 2].
        verbose (bool, optional): Whether to print a logging message when saving/loading has been successfully executed. Defaults to True.
    """

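    # A minimal usage sketch (illustrative only: the process groups and checkpoint path are
    # assumptions, not values prescribed by this class; in practice the hybrid-parallel
    # plugin constructs this object from the groups it already manages):
    #
    #     ckpt_io = HybridParallelCheckpointIO(dp_group, pp_group, tp_group, zero_stage=1)
    #     ckpt_io.save_sharded_model(boosted_model, checkpoint="./ckpt", size_per_shard=1024)
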
    def __init__(
        self,
        dp_group: ProcessGroup,
        pp_group: ProcessGroup,
        tp_group: ProcessGroup,
        zero_stage: int,
        verbose: bool = True,
    ) -> None:
        super().__init__()
        self.dp_group = dp_group
        self.pp_group = pp_group
        self.tp_group = tp_group
        self.dp_rank = dist.get_rank(self.dp_group)
        self.tp_rank = dist.get_rank(self.tp_group)
        self.pp_rank = dist.get_rank(self.pp_group)
        self.dp_size = dist.get_world_size(dp_group)
        self.pp_size = dist.get_world_size(pp_group)
        self.tp_size = dist.get_world_size(tp_group)
        self.use_zero = zero_stage > 0
        self.verbose = verbose
        self.coordinator = DistCoordinator()

    @staticmethod
    def _model_sharder(
        model: nn.Module, prefix: str = "", keep_vars: bool = False, size_per_shard: int = 1024
    ) -> Iterator[Tuple[OrderedDict, int]]:
        # An internal method that breaks the state_dict of a model into shards within a limited size.
        state_dict_sharder = StateDictSharder(size_per_shard)

        # Save parameters.
        for name, param in model.named_parameters():
            if param is None:
                continue
            # Gather tensor pieces when using tensor parallel.
            if is_padded_tensor(param):
                param = to_unpadded_tensor(param)
            param_ = gather_distributed_param(param, keep_vars=False)
            block, block_size = state_dict_sharder.append_param(prefix + name, param_)
            if block is not None:
                yield block, block_size

        # Save buffers.
        for name, buf in model.named_buffers():
            if buf is not None and name not in model._non_persistent_buffers_set:
                buffer = buf if keep_vars else buf.detach()
                block, block_size = state_dict_sharder.append_param(prefix + name, buffer)
                if block is not None:
                    yield block, block_size

        # Save extra states.
        extra_state_key = prefix + _EXTRA_STATE_KEY_SUFFIX
        if (
            getattr(model.__class__, "get_extra_state", torch.nn.Module.get_extra_state)
            is not torch.nn.Module.get_extra_state
        ):
            extra_state = model.get_extra_state()
            block, block_size = state_dict_sharder.append_param(extra_state_key, extra_state)
            if block is not None:
                yield block, block_size

        # Return the last block in sharder.
        yield state_dict_sharder.current_block, state_dict_sharder.current_block_size

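    # Consumption sketch for `_model_sharder` (illustrative only): the generator yields
    # (shard, shard_size) pairs, where each shard is an OrderedDict of gathered, unpadded
    # tensors packed to respect `size_per_shard`. A hypothetical caller could do:
    #
    #     for shard, size in HybridParallelCheckpointIO._model_sharder(model):
    #         num_entries = len(shard)  # params/buffers packed into this shard
    #
    # In this class the pairs are streamed into `save_state_dict_shards`, which writes one
    # file per shard and records the param-to-file mapping in the index file.
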
    @staticmethod
    def _optimizer_sharder(
        optimizer: OptimizerWrapper,
        use_zero: bool,
        dp_group: ProcessGroup,
        tp_group: ProcessGroup,
        size_per_shard: int = 1024,
    ):
        # An internal method that breaks the state_dict of an optimizer into shards within a limited size.
        state_dict_sharder = StateDictSharder(size_per_shard)
        param_info = optimizer.param_info
        master_to_working_map = optimizer.get_master_to_working_map()

        for param, state in optimizer.optim.state.items():
            if param is None:
                continue

            if master_to_working_map is not None:
                working_param = master_to_working_map[id(param)]
            else:
                working_param = param

            param_id = param_info["param2id"][id(working_param)]
            original_shape = param_info["param2shape"][id(working_param)]
            state_ = HybridParallelCheckpointIO.gather_from_sharded_optimizer_state(
                state,
                working_param,
                original_shape=original_shape,
                dp_group=dp_group,
                tp_group=tp_group,
                use_zero=use_zero,
                inplace=False,
            )

            block, block_size = state_dict_sharder.append_optim_state(param_id, state_)
            if block is not None:
                yield block, block_size

        # Return the last block in sharder.
        yield state_dict_sharder.current_block, state_dict_sharder.current_block_size

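    # Shape sketch for `_optimizer_sharder` output (an assumption about typical contents,
    # not a guaranteed schema): each yielded shard maps a parameter id to that parameter's
    # gathered optimizer state, e.g. for Adam roughly
    #
    #     {0: {"step": ..., "exp_avg": <tensor of original_shape>, "exp_avg_sq": <tensor>}, ...}
    #
    # with tensors collected back from dp (ZeRO) and tp shards before being packed.
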
    def save_sharded_model(
        self,
        model: ModelWrapper,
        checkpoint: str,
        gather_dtensor: bool = True,
        prefix: Optional[str] = None,
        size_per_shard: int = 1024,
        use_safetensors: bool = False,
    ) -> None:
        """
        Save sharded model checkpoint under the given checkpointing path.
        The following files will be created under the path:
        - An index file (pytorch_model.bin.index.json) containing a map between model params/buffers and file names.
        - Multiple files that store state tensors of models.
          If pipeline parallelism is used, the filenames are in the form of "pytorch_model.<prefix>-stage-000XX-shard-000XX.bin".
          If pipeline parallelism is not used, "pytorch_model.<prefix>-000XX.bin".

        Args:
            model (nn.Module): Model on local device to be saved.
            checkpoint (str): Checkpointing path which should be a directory path.
            gather_dtensor (bool, optional): Whether to gather dtensor; currently not used. Defaults to True.
            prefix (str, optional): Prefix of file to save. Defaults to None.
            size_per_shard (int, optional): Size per shard in MB. Defaults to 1024.
            use_safetensors (bool, optional): Whether to use safe tensors. Defaults to False.
        """
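        # Illustrative layout for a 2-stage pipeline run (an example following the naming
        # pattern documented above; actual shard counts depend on model size and size_per_shard):
        #
        #     checkpoint/
        #         pytorch_model.bin.index.json
        #         pytorch_model-stage-00001-shard-00001.bin
        #         pytorch_model-stage-00002-shard-00001.bin
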
assert isinstance(model, ModelWrapper), "Please boost the model before saving!"
|
|
|
|
model = model.unwrap()
|
|
|
|
|
2023-08-25 14:04:57 +00:00
|
|
|
if os.path.isfile(checkpoint):
|
|
|
|
logging.error(f"Provided path ({checkpoint}) should be a directory, not a file")
|
|
|
|
return
|
|
|
|
|
|
|
|
Path(checkpoint).mkdir(parents=True, exist_ok=True)
|
|
|
|
|
|
|
|
# Devices along the same dp_group share the same copies of model.
|
|
|
|
# So only let the device with dp_rank == 0 save the model.
|
|
|
|
if self.dp_rank != 0:
|
|
|
|
return
|
|
|
|
|
|
|
|
# Then collect the sharded parameters & buffers along tp_group.
|
2023-08-31 06:50:47 +00:00
|
|
|
# Only devices with tp_rank == 0 are responsible for model saving.
|
2023-09-12 09:32:19 +00:00
|
|
|
state_dict_shard = HybridParallelCheckpointIO._model_sharder(model, size_per_shard=size_per_shard)
|
2023-08-25 14:04:57 +00:00
|
|
|
weights_name, save_index_file = get_model_base_filenames(prefix, use_safetensors)
|
|
|
|
index_file = CheckpointIndexFile(checkpoint)
|
2023-09-19 06:20:26 +00:00
|
|
|
control_saving = self.tp_rank == 0
|
2023-08-25 14:04:57 +00:00
|
|
|
|
|
|
|
if self.pp_size == 1:
|
|
|
|
# When pipeline is not used, save the model shards as in general checkpointIO
|
2023-09-19 06:20:26 +00:00
|
|
|
total_size = save_state_dict_shards(
|
|
|
|
sharded_state_dict=state_dict_shard,
|
|
|
|
checkpoint=checkpoint,
|
|
|
|
index_file=index_file,
|
|
|
|
base_filename=weights_name,
|
|
|
|
is_master=control_saving,
|
|
|
|
use_safetensors=use_safetensors,
|
|
|
|
)
|
2023-08-25 14:04:57 +00:00
|
|
|
if control_saving:
|
|
|
|
index_file.append_meta_data("total_size", total_size)
|
|
|
|
index_file.write_index_file(save_index_file)
|
2023-09-01 09:40:01 +00:00
|
|
|
save_config_file(model, checkpoint)
|
2023-09-26 02:58:03 +00:00
|
|
|
if self.verbose and self.coordinator.is_master():
|
2023-09-19 06:20:26 +00:00
|
|
|
logging.info(
|
|
|
|
f"The model is split into checkpoint shards. "
|
|
|
|
f"You can find where each parameters has been saved in the "
|
|
|
|
f"index located at {save_index_file}."
|
|
|
|
)
|
2023-08-25 14:04:57 +00:00
|
|
|
|
|
|
|
else:
|
|
|
|
# When pipeline is used, each stage produces its own shard files and index files.
|
|
|
|
# Index files belonging to each stage are saved under a temporary folder ./tmp_index_files/
|
|
|
|
# After all the state_dicts have been saved, the master rank integrates all the index files into one final index file and deletes the tmp folder.
|
|
|
|
final_index_file_path = copy.deepcopy(save_index_file)
|
|
|
|
tmp_index_file_folder = os.path.join(checkpoint, "tmp_index_files")
|
|
|
|
Path(tmp_index_file_folder).mkdir(parents=True, exist_ok=True)
|
|
|
|
|
|
|
|
# Manage filenames of sharded weights and index file for each pipeline stage.
|
2023-09-01 09:40:01 +00:00
|
|
|
weights_name = weights_name.replace(".bin", f"-stage-{self.pp_rank+1:05d}-shard.bin")
|
|
|
|
weights_name = weights_name.replace(".safetensors", f"-stage-{self.pp_rank+1:05d}-shard.safetensors")
|
|
|
|
save_index_file = save_index_file.replace(".json", f"-stage-{self.pp_rank+1:05d}.json")
|
2023-08-25 14:04:57 +00:00
|
|
|
save_index_file = os.path.join("tmp_index_files", save_index_file)
|
|
|
|
|
2023-09-19 06:20:26 +00:00
|
|
|
total_size = save_state_dict_shards(
|
|
|
|
sharded_state_dict=state_dict_shard,
|
|
|
|
checkpoint=checkpoint,
|
|
|
|
index_file=index_file,
|
|
|
|
base_filename=weights_name,
|
|
|
|
is_master=control_saving,
|
|
|
|
use_safetensors=use_safetensors,
|
|
|
|
use_pp_format=True,
|
|
|
|
)
            if control_saving:
                assert (
                    self.dp_rank == 0 and self.tp_rank == 0
                ), "The saving process should have both dp_rank and tp_rank as 0."
                index_file.append_meta_data("total_size", total_size)
                index_file.write_index_file(save_index_file)
            else:
                return

            dist.barrier(self.pp_group)

            # The global master rank integrates the index files and cleans up the tmp folder.
            if self.pp_rank == 0:
                final_index_file = CheckpointIndexFile(checkpoint)
                final_index_file.append_meta_data("total_size", 0)

                for filename in os.listdir(tmp_index_file_folder):
                    stage_index_file = CheckpointIndexFile.from_file(os.path.join(tmp_index_file_folder, filename))
                    final_index_file.metadata["total_size"] += stage_index_file.metadata["total_size"]
                    for weight, weight_filename in stage_index_file.weight_map.items():
                        final_index_file.append_weight_map(weight, weight_filename)

                final_index_file.write_index_file(final_index_file_path)
                save_config_file(model, checkpoint)
                rmtree(tmp_index_file_folder)
                if self.verbose and self.coordinator.is_master():
                    logging.info(
                        f"The model is split into checkpoint shards. "
                        f"You can find where each parameter has been saved in the "
                        f"index located at {final_index_file_path}."
                    )
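
    # Illustrative usage sketch (assumes the standard Booster wiring; `booster` and the
    # plugin arguments are placeholders, not values taken from this file):
    #
    #   from colossalai.booster import Booster
    #   from colossalai.booster.plugin import HybridParallelPlugin
    #
    #   booster = Booster(plugin=HybridParallelPlugin(tp_size=2, pp_size=2))
    #   model, optimizer, *_ = booster.boost(model, optimizer)
    #   booster.save_model(model, "./ckpt_dir", shard=True, size_per_shard=1024)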

    def load_sharded_model(self, model: ModelWrapper, checkpoint_index_file: Path, strict: bool = False):
        """
        Load sharded model with the given path to index file of checkpoint folder.

        Args:
            model (nn.Module): The model to be loaded.
            checkpoint_index_file (str): Path to the index file of checkpointing folder.
            strict (bool, optional): For name matching during loading state_dict. Defaults to False.
                This argument should be manually set to False since params on the same device might be stored in different files.
        """
        assert isinstance(model, ModelWrapper), "Please boost the model before loading!"
        model_before_wrapping = model  # backup for model before wrapping
        model = model.unwrap()

        # Check whether the checkpoint uses safetensors.
        use_safetensors = False
        if "safetensors" in checkpoint_index_file.name:
            use_safetensors = True

        if use_safetensors and not is_safetensors_available():
            raise ImportError("`safe_serialization` requires the `safetensors` library: `pip install safetensors`.")

        # Read checkpoint index file.
        ckpt_index_file = CheckpointIndexFile.from_file(checkpoint_index_file)
        ckpt_root_path = ckpt_index_file.root_path
        weight_map = ckpt_index_file.weight_map
        strict = False

        # Load params & buffers to model.
        # Keep a record of loaded files so that a file will not be repeatedly loaded.
        loaded_file = set()

        missing_keys = []
        missing_file_keys = []

        def _load(name: str):
            if name not in weight_map:
                missing_file_keys.append(name)
                return
            filename = weight_map[name]

            # If this param/buffer has been loaded before, directly return.
            if filename in loaded_file:
                return

            file_path = os.path.join(ckpt_root_path, filename)
            state_dict = load_shard_state_dict(Path(file_path), use_safetensors)

            load_state_dict_into_model(
                model, state_dict, missing_keys=missing_keys, strict=strict, load_sub_module=True
            )
            loaded_file.add(filename)

        # Load parameters.
        for name, _ in model.named_parameters():
            _load(name)

        # Load buffers.
        non_persistent_buffers = set()
        for n, m in model.named_modules():
            non_persistent_buffers |= set(".".join((n, b)) for b in m._non_persistent_buffers_set)
        for name, buf in model.named_buffers():
            if buf is not None and name not in non_persistent_buffers:
                _load(name)

        # Load extra states.
        extra_state_key = _EXTRA_STATE_KEY_SUFFIX
        if (
            getattr(model.__class__, "get_extra_state", torch.nn.Module.get_extra_state)
            is not torch.nn.Module.get_extra_state
        ):
            _load(extra_state_key)

        # Update master params if mixed-precision training is enabled.
        model_before_wrapping.update_master_params()

        if self.verbose and self.coordinator.is_master():
            logging.info(f"The model has been successfully loaded from sharded checkpoint: {ckpt_root_path}.")

        if len(missing_keys) == 0:
            raise RuntimeError(
                "No weight is loaded into the model. Please check the checkpoint files and the model structure."
            )

        remain_keys = reduce(lambda a, b: a & b, map(set, missing_keys))
        remain_keys = remain_keys.union(set(missing_file_keys))
        if len(remain_keys) > 0:
            if strict:
                error_msgs = [
                    "Missing key(s) in state_dict: {}. ".format(", ".join('"{}"'.format(k) for k in missing_keys))
                ]
                raise RuntimeError(
                    "Error(s) in loading state_dict for {}:\n\t{}".format(
                        self.__class__.__name__, "\n\t".join(error_msgs)
                    )
                )
            else:
                if self.coordinator.is_master():
                    logging.info(f"The following keys are not loaded from checkpoint: {remain_keys}")

    def save_sharded_optimizer(
        self,
        optimizer: OptimizerWrapper,
        checkpoint: str,
        gather_dtensor: bool = True,
        prefix: Optional[str] = None,
        size_per_shard: int = 1024,
    ):
        """
        Save sharded optimizer checkpoint under the given checkpointing path.
        The following files will be created under the path:
        - An index file (pytorch_optim.bin.index.json) containing a map between optimizer states and file names
        - A group file (pytorch_optim_group.bin) recording information of param_groups
        - Multiple files that store state tensors of optimizers.
          If pipeline parallelism is used, the filenames are in the form of "pytorch_optim.<prefix>-stage-000XX-shard-000XX.bin".
          If pipeline parallelism is not used, "pytorch_optim.<prefix>-000XX.bin".

        Args:
            optimizer (OptimizerWrapper): Optimizer to save sharded state_dict
            checkpoint (str): Path to save optimizer state_dict
            gather_dtensor (bool): Whether to gather_dtensor, not used
            prefix (str): Prefix of file to save
            size_per_shard (int): Max file size of each file shard that stores state tensors
        """
        assert isinstance(optimizer, OptimizerWrapper), "Please boost the optimizer before saving!"
        if os.path.isfile(checkpoint):
            logging.error(f"Provided path ({checkpoint}) should be a directory, not a file")
            return

        Path(checkpoint).mkdir(parents=True, exist_ok=True)

        # Devices along the same dp_group share the same copies of states when zero is not used.
        # In this case only let the device with dp_rank == 0 save the states.
        if not self.use_zero and self.dp_rank != 0:
            return

        # Then collect the sharded states along dp_group(if using zero)/tp_group.
        # Only devices with (dp_rank == 0 and tp_rank == 0) are responsible for states saving.
        state_dict_shard = HybridParallelCheckpointIO._optimizer_sharder(
            optimizer,
            use_zero=self.use_zero,
            dp_group=self.dp_group,
            tp_group=self.tp_group,
            size_per_shard=size_per_shard,
        )
        states_name, save_index_file, param_group_file = get_optimizer_base_filenames(prefix)
        index_file = CheckpointIndexFile(checkpoint)
        control_saving = self.dp_rank == 0 and self.tp_rank == 0

        if self.pp_size == 1:
            # When pipeline is not used, save the optimizer shards as in general checkpointIO
            total_size = save_state_dict_shards(
                sharded_state_dict=state_dict_shard,
                checkpoint=checkpoint,
                index_file=index_file,
                base_filename=states_name,
                is_master=control_saving,
            )

            if control_saving:
                # Store param groups.
                index_file.append_meta_data("param_groups", param_group_file)
                group_file_path = os.path.join(checkpoint, param_group_file)
                param_groups = [
                    {**group, "params": group_info["params"]}
                    for group, group_info in zip(optimizer.param_groups, optimizer.param_info["param_groups"])
                ]
                save_param_groups({"param_groups": param_groups}, group_file_path)
                # Store index file.
                index_file.append_meta_data("total_size", total_size)
                index_file.write_index_file(save_index_file)
                if self.verbose and self.coordinator.is_master():
                    logging.info(
                        f"The optimizer is split into checkpoint shards. "
                        f"You can find where each parameter has been saved in the "
                        f"index located at {save_index_file}."
                    )

        else:
            # When pipeline is used, each stage produces its own shard files and index files.
            # Index files belonging to each stage are saved under a temporary folder ./tmp_index_files/
            # After all the state_dicts have been saved, the master rank integrates all the index files into one final index file and deletes the tmp folder.

            final_index_file_path = copy.deepcopy(save_index_file)
            tmp_index_file_folder = os.path.join(checkpoint, "tmp_index_files")
            Path(tmp_index_file_folder).mkdir(parents=True, exist_ok=True)

            # Manage filenames of sharded weights and index file for each pipeline stage.
            states_name = states_name.replace(".bin", f"-stage-{self.pp_rank+1:05d}-shard.bin")
            save_index_file = save_index_file.replace(".json", f"-stage-{self.pp_rank+1:05d}.json")
            save_index_file = os.path.join("tmp_index_files", save_index_file)

            total_size = save_state_dict_shards(
                sharded_state_dict=state_dict_shard,
                checkpoint=checkpoint,
                index_file=index_file,
                base_filename=states_name,
                is_master=control_saving,
                use_pp_format=True,
            )

            if control_saving:
                assert (
                    self.dp_rank == 0 and self.tp_rank == 0
                ), "The saving process should have both dp_rank and tp_rank as 0."
                index_file.append_meta_data("total_size", total_size)
                index_file.write_index_file(save_index_file)
            else:
                return

            dist.barrier(self.pp_group)

            # The global master rank integrates the index files and cleans up the tmp folder.
            if self.pp_rank == 0:
                final_index_file = CheckpointIndexFile(checkpoint)
                final_index_file.append_meta_data("total_size", 0)

                for filename in os.listdir(tmp_index_file_folder):
                    stage_index_file = CheckpointIndexFile.from_file(os.path.join(tmp_index_file_folder, filename))
                    final_index_file.metadata["total_size"] += stage_index_file.metadata["total_size"]
                    for param_id, state_filename in stage_index_file.weight_map.items():
                        final_index_file.append_weight_map(param_id, state_filename)

                # Store param groups.
                final_index_file.append_meta_data("param_groups", param_group_file)
                group_file_path = os.path.join(checkpoint, param_group_file)
                param_groups = [
                    {**group, "params": group_info["params"]}
                    for group, group_info in zip(optimizer.param_groups, optimizer.param_info["param_groups"])
                ]
                save_param_groups({"param_groups": param_groups}, group_file_path)

                final_index_file.write_index_file(final_index_file_path)
                rmtree(tmp_index_file_folder)

                if self.verbose and self.coordinator.is_master():
                    logging.info(
                        f"The optimizer is split into checkpoint shards. "
                        f"You can find where each parameter has been saved in the "
                        f"index located at {final_index_file_path}."
                    )
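
    # Illustrative usage sketch (assumes the Booster wiring shown earlier):
    #
    #   booster.save_optimizer(optimizer, "./ckpt_optim_dir", shard=True, size_per_shard=1024)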

    def load_sharded_optimizer(self, optimizer: OptimizerWrapper, checkpoint_index_file: str, prefix: str = ""):
        """
        Load sharded optimizer with the given path to index file of checkpoint folder.

        Args:
            optimizer (OptimizerWrapper): The optimizer to be loaded.
            checkpoint_index_file (str): Path to the index file of checkpointing folder.
            prefix (str): Not used.
        """
        assert isinstance(optimizer, OptimizerWrapper), "Please boost the optimizer before loading!"

        def _get_param_id_from_optimizer_param(
            param: torch.Tensor, master_to_working_map: Optional[Dict[int, torch.Tensor]] = None
        ):
            if master_to_working_map is not None:
                working_param = master_to_working_map[id(param)]
            else:
                working_param = param
            return optimizer.param_info["param2id"][id(working_param)]

        # id_map maps the param ids kept by the current pipeline stage to their corresponding parameter objects.
        # When Zero is used, the mapped parameter objects should be fp32 master parameters.
        # IDs are obtained from the param2id mapping saved earlier in optimizer.param_info.
        id_map = {}
        master_to_working_map = optimizer.get_master_to_working_map()
        for pg in optimizer.optim.param_groups:
            for param in pg["params"]:
                param_id = _get_param_id_from_optimizer_param(param, master_to_working_map)
                id_map[param_id] = param

        # Read checkpoint index file.
        ckpt_index_file = CheckpointIndexFile.from_file(checkpoint_index_file)
        ckpt_root_path = ckpt_index_file.root_path
        weight_map = ckpt_index_file.weight_map
        weight_map = {int(k): v for k, v in weight_map.items()}  # convert saved id from str to int

        # Load param_groups
        param_group_path = ckpt_index_file.get_param_group_filename()
        if param_group_path is None:
            raise RuntimeError(
                f"Invalid index file path {checkpoint_index_file} for an optimizer. \
                Lacking param group file under current directory."
            )
        saved_groups = torch.load(param_group_path)

        updated_groups = []
        for old_pg, saved_pg in zip(optimizer.optim.param_groups, saved_groups):
            # obtain updated param group
            new_pg = copy.deepcopy(saved_pg)
            new_pg["params"] = old_pg["params"]  # The parameters in the same group shouldn't change.
            updated_groups.append(new_pg)
        optimizer.optim.__dict__.update({"param_groups": updated_groups})

        # Load saved states to optimizer.
        # Keep a record of loaded files so that a file will not be repeatedly loaded.
        loaded_file = set()
        for pg in optimizer.optim.param_groups:
            for param in pg["params"]:
                if param is None:
                    continue
                param_id = _get_param_id_from_optimizer_param(param, master_to_working_map)
                if param_id not in weight_map:
                    continue
                filename = weight_map[param_id]

                # If this param's states have been loaded before, skip this file.
                if filename in loaded_file:
                    continue

                file_path = os.path.join(ckpt_root_path, filename)
                state_dict = load_shard_state_dict(Path(file_path), use_safetensors=False)
                load_states_into_optimizer(optimizer.optim, state_dict, id_map, strict=True)
                loaded_file.add(filename)

        # Then shard the loaded optimizer states if using tp/zero.
        for param, state in optimizer.optim.state.items():
            device = param.device
            if master_to_working_map is not None:
                working_param = master_to_working_map[id(param)]
            else:
                working_param = param
            original_shape = optimizer.param_info["param2shape"][id(working_param)]
            sharded_state = self.shard_from_complete_optimizer_state(
                state, current_shape=working_param.shape, original_shape=original_shape, device=device, inplace=True
            )
            optimizer.optim.state[param] = sharded_state

        sharded_optimizer_loading_epilogue(optimizer.optim)
        if self.verbose and self.coordinator.is_master():
            logging.info(f"The optimizer has been successfully loaded from sharded checkpoint: {ckpt_root_path}.")

    def save_unsharded_model(self, model: ModelWrapper, checkpoint: str, gather_dtensor: bool, use_safetensors: bool):
        """
        Save model state dict to a single file with given checkpointing path.

        Args:
            model (nn.Module): Model on local device to be saved.
            checkpoint (str): Checkpointing path which should be a file path. Can be absolute or relative path.
            gather_dtensor (bool, optional): Whether to gather dtensor, currently not used. Defaults to True.
            use_safetensors (bool, optional): Whether to use safe tensors. Defaults to False.
        """
        if self.coordinator.is_master():
            logging.warning("Please avoid using unsharded checkpointing methods when dealing with large models!")

        assert isinstance(model, ModelWrapper), "Please boost the model before saving!"
        model = model.unwrap()

        if self.dp_rank != 0:
            return

        # The logic of collecting parameter shards along tp degree
        # has been implemented by _save_to_state_dict method of ParallelModule in Shardformer.
        state_dict = model.state_dict()

        if self.pp_size == 1:
            # When pipeline is not used, let master rank directly save the collected state_dict.
            if self.tp_rank == 0:
                save_state_dict(state_dict, checkpoint, use_safetensors)
        else:
            # When pipeline is used, first collect state_dict from every pipeline stage, then save the complete state_dict.
            state_dict_list = [None for _ in range(self.pp_size)]
            dist.barrier(self.pp_group)
            dist.all_gather_object(state_dict_list, state_dict, self.pp_group)

            # Only the master rank does the saving.
            if self.coordinator.is_master():
                complete_state_dict = dict()
                for _state_dict in state_dict_list:
                    complete_state_dict.update(_state_dict)
                save_state_dict(complete_state_dict, checkpoint, use_safetensors)
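
    # Illustrative usage sketch (assumes the Booster wiring shown earlier): unsharded
    # saving writes a single file, so it only suits models that fit on one device.
    #
    #   booster.save_model(model, "./model.bin", shard=False)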

    def load_unsharded_model(self, model: ModelWrapper, checkpoint: str, strict: bool = False):
        """
        Load model from a single file with the given path of checkpoint.

        Args:
            model (nn.Module): The model to be loaded.
            checkpoint (str): Path to the checkpoint file.
            strict (bool, optional): For name matching during loading state_dict. Defaults to False.
                This argument should be manually set to False since not all params in checkpoint are needed for each device when pipeline is enabled.
        """
        if self.coordinator.is_master():
            logging.warning("Please avoid using unsharded checkpointing methods when dealing with large models!")

        assert isinstance(model, ModelWrapper), "Please boost the model before loading!"
        strict = False
        model_before_wrapping = model
        model = model.unwrap()

        # Load from checkpoint. Since the logic of breaking parameter shards along tp degree
        # has been implemented by _load_from_state_dict method of ParallelModule in Shardformer,
        # model.load_state_dict can be directly called.
        state_dict = load_state_dict(checkpoint)
        model.load_state_dict(state_dict, strict=strict)

        # Update master params if mixed-precision training is enabled.
        model_before_wrapping.update_master_params()
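
    # Illustrative usage sketch (assumes the Booster wiring shown earlier): `strict` is
    # forced to False because each pipeline stage only needs part of the checkpoint.
    #
    #   booster.load_model(model, "./model.bin")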

    def save_unsharded_optimizer(self, optimizer: OptimizerWrapper, checkpoint: str, gather_dtensor: bool):
        """
        Save optimizer state dict to a file with given path.

        Args:
            optimizer (OptimizerWrapper): Optimizer to save sharded state_dict.
            checkpoint (str): Path to save optimizer state_dict.
            gather_dtensor (bool): Whether to gather_dtensor, not used.
        """
        if self.coordinator.is_master():
            logging.warning("Please avoid using unsharded checkpointing methods when dealing with large models!")

        assert isinstance(optimizer, OptimizerWrapper), "Please boost the optimizer before saving!"

        # Optimizer states of parameters kept by the local device (i.e. the current pipeline stage).
        local_states = dict()

        for param, state in optimizer.optim.state.items():
            if param is None:
                continue

            # working param is needed for obtaining correct param_id
            master_to_working_map = optimizer.get_master_to_working_map()
            if master_to_working_map is not None:
                working_param = master_to_working_map[id(param)]
            else:
                working_param = param

            # gather complete state from tp shards & dp shards
            param_id = optimizer.param_info["param2id"][id(working_param)]
            original_shape = optimizer.param_info["param2shape"][id(working_param)]
            local_states[param_id] = HybridParallelCheckpointIO.gather_from_sharded_optimizer_state(
                state,
                working_param,
                original_shape=original_shape,
                dp_group=self.dp_group,
                tp_group=self.tp_group,
                use_zero=self.use_zero,
                inplace=False,
                device=get_current_device(),
            )

        if self.pp_size == 1:
            # When pipeline is not used, let master rank directly save the collected state_dict.
            param_groups = [
                {**group, "params": group_info["params"]}
                for group, group_info in zip(optimizer.param_groups, optimizer.param_info["param_groups"])
            ]
            state_dict = {"param_groups": param_groups, "state": local_states}
            if self.coordinator.is_master():
                save_state_dict(state_dict, checkpoint, use_safetensors=False)
        else:
            # When pipeline is used, first collect state_dict from every pipeline stage, then save the complete state_dict.
            states_list = [None for _ in range(self.pp_size)]
            dist.barrier(self.pp_group)
            dist.all_gather_object(states_list, local_states, self.pp_group)

            # Only the master rank does the saving.
            if self.coordinator.is_master():
                param_groups = [
                    {**group, "params": group_info["params"]}
                    for group, group_info in zip(optimizer.param_groups, optimizer.param_info["param_groups"])
                ]
                state_dict = {"param_groups": param_groups, "state": dict()}
                for _states in states_list:
                    state_dict["state"].update(_states)
                save_state_dict(state_dict, checkpoint, use_safetensors=False)
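
    # Illustrative usage sketch (assumes the Booster wiring shown earlier):
    #
    #   booster.save_optimizer(optimizer, "./optim.bin", shard=False)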

    def load_unsharded_optimizer(self, optimizer: OptimizerWrapper, checkpoint: str):
        """
        Load optimizer from a file with given path.

        Args:
            optimizer (OptimizerWrapper): The optimizer to be loaded.
            checkpoint (str): Path to the checkpoint file.
        """

        def _get_param_id_from_optimizer_param(
            param: torch.Tensor, master_to_working_map: Optional[Dict[int, torch.Tensor]] = None
        ):
            if master_to_working_map is not None:
                working_param = master_to_working_map[id(param)]
            else:
                working_param = param
            return optimizer.param_info["param2id"][id(working_param)]

        if self.coordinator.is_master():
            logging.warning("Please avoid using unsharded checkpointing methods when dealing with large models!")

        assert isinstance(optimizer, OptimizerWrapper), "Please boost the optimizer before loading!"

        # Complete optimizer state_dict loaded from checkpoint, needs to be processed later.
        state_dict = load_state_dict(checkpoint)

        # Load param_groups.
        updated_groups = []
        saved_groups = state_dict["param_groups"]
        for old_pg, saved_pg in zip(optimizer.optim.param_groups, saved_groups):
            new_pg = copy.deepcopy(saved_pg)
            new_pg["params"] = old_pg["params"]  # Only keep the parameters kept by current pipeline stage.
            updated_groups.append(new_pg)
        optimizer.optim.__dict__.update({"param_groups": updated_groups})

        # Load saved states to optimizer. First discard those states not belonging to current pipeline stage.
        master_to_working_map = optimizer.get_master_to_working_map()
        id_map = {}
        for pg in optimizer.optim.param_groups:
            for param in pg["params"]:
                param_id = _get_param_id_from_optimizer_param(param, master_to_working_map)
                id_map[param_id] = param
        load_states_into_optimizer(optimizer.optim, state_dict["state"], id_map, strict=True)

        # Then shard the loaded optimizer states if using tp/zero.
        for param, state in optimizer.optim.state.items():
            if param is None:
                continue
            device = param.device
            if master_to_working_map is not None:
                working_param = master_to_working_map[id(param)]
            else:
                working_param = param
            original_shape = optimizer.param_info["param2shape"][id(working_param)]
            sharded_state = self.shard_from_complete_optimizer_state(
                state, current_shape=working_param.shape, original_shape=original_shape, device=device, inplace=True
            )
            optimizer.optim.state[param] = sharded_state

        sharded_optimizer_loading_epilogue(optimizer.optim)
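
    # Illustrative usage sketch (assumes the Booster wiring shown earlier):
    #
    #   booster.load_optimizer(optimizer, "./optim.bin")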

    def save_lr_scheduler(self, lr_scheduler: LRScheduler, checkpoint: str):
        """
        Save lr scheduler to checkpoint but only on master process.
        """
        if self.coordinator.is_master():
            super().save_lr_scheduler(lr_scheduler, checkpoint)
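
    # Illustrative usage sketch (assumes the Booster wiring shown earlier): the scheduler
    # state is identical on every rank, so only the master process writes it.
    #
    #   booster.save_lr_scheduler(lr_scheduler, "./lr_scheduler.bin")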

    @staticmethod
    def gather_from_sharded_optimizer_state(
        state: OrderedDict,
        param: torch.Tensor,
        original_shape: torch.Size,
        dp_group: ProcessGroup,
        tp_group: ProcessGroup,
        use_zero: bool,
        inplace: bool,
        device: torch.device = torch.device("cpu"),
    ) -> OrderedDict:
        """
        With given parameter and its optimizer states, gather the complete optimizer state for saving.

        Args:
            state (OrderedDict): Optimizer states of given parameter, might be distributed among tp/dp group if using TP/Zero.
            param (torch.Tensor): The given parameter. It should be working_param when using Zero.
            original_shape (torch.Size): The size of parameter before sharding.
            dp_group (ProcessGroup): The process group of data parallel.
            tp_group (ProcessGroup): The process group of tensor parallel.
            use_zero (bool): Whether Zero is used.
            inplace (bool): If set to True, will update the values of argument 'state' in place. Else will make a copy of state.
            device (torch.device): The destination device of loaded optimizer states. Defaults to torch.device('cpu').

        Returns:
            OrderedDict: The complete optimizer state of given parameter.
        """
        dp_size = dist.get_world_size(dp_group)
        tp_size = dist.get_world_size(tp_group)
        current_shape = param.shape
        state_ = state if inplace else copy.deepcopy(state)

        for k, v in state_.items():
            if isinstance(v, torch.Tensor) and k != "step":
                # First gather Zero shards.
                if use_zero:
                    v = v.to(get_current_device())
                    gather_tensor = [torch.zeros_like(v) for _ in range(dp_size)]
                    dist.all_gather(gather_tensor, v, group=dp_group)
                    v = torch.stack(gather_tensor).view(-1)[: param.numel()].reshape_as(param)

                # Then gather TP shards.
                partition_dim = search_tp_partition_dim(current_shape, original_shape, tp_size)
                if partition_dim is not None:
                    gather_tensor = [torch.zeros_like(v) for _ in range(tp_size)]
                    dist.all_gather(gather_tensor, v, group=tp_group)
                    v = torch.cat(gather_tensor, dim=partition_dim)
* polish code (part 1)
* polish code (part 2)
* polish code (part 2.5)
* polish code (part 3)
* sequence parallel: inside text split
* miscellaneous minor fixes
* polish code
* fix ulysses style ZeRO
* sequence parallel: inside text split
* miscellaneous minor fixes
* disaggregate sp group and dp group for sp
* fix llama and gpt sp
* polish code
* move ulysses grad sync to ddp (#9)
* remove zero_stage and unbind the grad sync for alltoall sp
* add 2d group creation test
* move ulysses grad sync to ddp
* add 2d group creation test
* remove useless code
* change shard config not to enable sp when enable_all_optimizations
* add sp warnings for several model
* remove useless code
---------
Co-authored-by: linsj20 <linsj20@mails.tsinghua.edu.cn>
* [hotfix] quick fixes to make legacy tutorials runnable (#5559)
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
* [fix] fix typo s/muiti-node /multi-node etc. (#5448)
* [hotfix] fix typo s/get_defualt_parser /get_default_parser (#5548)
* [devops] remove post commit ci (#5566)
* [devops] remove post commit ci
* [misc] run pre-commit on all files
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---------
Co-authored-by: binmakeswell <binmakeswell@gmail.com>
Co-authored-by: Yuanheng Zhao <54058983+yuanheng-zhao@users.noreply.github.com>
Co-authored-by: Wenhao Chen <cwher@outlook.com>
Co-authored-by: Hongxin Liu <lhx0217@gmail.com>
Co-authored-by: Rocky Duan <dementrock@users.noreply.github.com>
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: Edenzzzz <wenxuan.tan@wisc.edu>
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: github-actions <github-actions@github.com>
Co-authored-by: Insu Jang <insujang@umich.edu>
Co-authored-by: YeAnbang <44796419+YeAnbang@users.noreply.github.com>
Co-authored-by: Tong Li <tong.li352711588@gmail.com>
Co-authored-by: Zhongkai Zhao <kanezz620@gmail.com>
Co-authored-by: linsj20 <linsj20@mails.tsinghua.edu.cn>
Co-authored-by: digger yu <digger-yu@outlook.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [shardformer]enable padding vocabulary size. (#5489)
* padding vocab_size when using pipeline parallellism
padding vocab_size when using pipeline parallellism
fix
fix
* fix
* fix
fix
fix
* fix gather output
* fix
* fix
* fix
fix resize embedding
fix resize embedding
* fix resize embedding
fix
* revert
* revert
* revert
* padding vocab
* padding vocabe
* fix
* fix
* fxi
* test ci
* fix
fix
fix
fix
* fix
fix
* fix
* fix
* Update hybrid_parallel_plugin.py
fix
fix
fix
* fix
fix
* fix
fix
* fix
* resolve super init
resolve super init
resolve super init
resolve super init
* resolve comments
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* vocab checkpointio
* padding vocab_size when using pipeline parallellism
padding vocab_size when using pipeline parallellism
fix
fix
* fix
fix
fix
* fix
* fix
fix resize embedding
fix resize embedding
* fix resize embedding
fix
* revert
* revert
* padding vocab
* fix
* fix
fix
* fix
fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix ci
* fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
* cherry-pick
* revert moe modify
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix
fix
fix
fix
fix
fix
fix
fix
* resolve comments
resolve comments
resolve comments
resolve comments
resolve comments
* ptensor
ptensor
resolve comments
fix
fix
fix
fix
fix
resolve comments
resolve comments
resolve comments
resolve comments
resolve comments
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Hongxin Liu <lhx0217@gmail.com>
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix rebase
* fix rebase
---------
Co-authored-by: Hongxin Liu <lhx0217@gmail.com>
Co-authored-by: digger yu <digger-yu@outlook.com>
Co-authored-by: binmakeswell <binmakeswell@gmail.com>
Co-authored-by: Yuanheng Zhao <54058983+yuanheng-zhao@users.noreply.github.com>
Co-authored-by: Wenhao Chen <cwher@outlook.com>
Co-authored-by: Rocky Duan <dementrock@users.noreply.github.com>
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: Edenzzzz <wenxuan.tan@wisc.edu>
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: github-actions <github-actions@github.com>
Co-authored-by: Insu Jang <insujang@umich.edu>
Co-authored-by: YeAnbang <44796419+YeAnbang@users.noreply.github.com>
Co-authored-by: Tong Li <tong.li352711588@gmail.com>
Co-authored-by: Zhongkai Zhao <kanezz620@gmail.com>
Co-authored-by: linsj20 <linsj20@mails.tsinghua.edu.cn>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2024-04-18 08:10:18 +00:00
                padding_dim = search_padding_dim(v.shape, original_shape)
                if padding_dim is not None:
                    v = init_as_padded_tensor(v, v.shape[padding_dim], original_shape[padding_dim], padding_dim)
                    v = to_unpadded_tensor(v)

                state_[k] = v.detach().clone().to(device)

        return state_
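    # Illustrative note (added for exposition, not part of the library API): the unpadding step
    # above, `init_as_padded_tensor` followed by `to_unpadded_tensor`, in effect trims a gathered
    # state tensor back to its original length along the padded dimension. With hypothetical
    # sizes, a vocab-padded embedding state of shape (50304, 768) restored to an original vocab
    # size of 50257 is, in effect, equivalent to:
    #
    #     unpadded = padded_state.narrow(dim=0, start=0, length=50257)
    #
    # The padding helpers additionally record which dimension was padded and by how much.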
    def shard_from_complete_optimizer_state(
        self,
        state: OrderedDict,
        current_shape: torch.Size,
        original_shape: torch.Size,
        device: torch.device,
        inplace: bool,
    ) -> OrderedDict:
        """
        Given the complete optimizer states of a specific parameter loaded from a checkpoint,
        slice out the sharded optimizer states kept by the current device.

        Args:
            state (OrderedDict): Complete optimizer states of a given parameter, loaded from the checkpoint.
            current_shape (torch.Size): The size of the parameter after sharding.
            original_shape (torch.Size): The size of the parameter before sharding.
            device (torch.device): The destination device of the loaded optimizer states.
            inplace (bool): If True, update the values of the 'state' argument in place; otherwise operate on a copy.

        Returns:
            OrderedDict: The sharded optimizer states of the given parameter.
        """
        state_ = state if inplace else copy.deepcopy(state)

        for k, v in state_.items():
            if isinstance(v, torch.Tensor) and k != "step":
                # Shard state along the tensor parallel group.
                partition_dim = search_tp_partition_dim(current_shape, original_shape, self.tp_size)
                global_shape = current_shape
                if partition_dim is not None:
                    # Recover the global (padded) shape along the partition dim, e.g. for padded embedding params.
                    global_shape = (
                        *current_shape[:partition_dim],
                        current_shape[partition_dim] * self.tp_size,
                        *current_shape[partition_dim + 1 :],
                    )

                padding_dim = search_padding_dim(global_shape, original_shape)
                if padding_dim is not None:
                    v = to_padded_tensor(v, global_shape[padding_dim], padding_dim)

                if partition_dim is not None:
                    slice_size = current_shape[partition_dim]
                    v = v.split(slice_size, dim=partition_dim)[self.tp_rank]

                # Shard state along the data parallel group when using ZeRO.
                if self.use_zero:
                    padding_size = (self.dp_size - v.numel() % self.dp_size) % self.dp_size
                    with torch.no_grad():
                        v = v.flatten()
                        if padding_size > 0:
                            v = torch.nn.functional.pad(v, [0, padding_size])
                        slice_size = v.numel() // self.dp_size
                        v = v.split(slice_size, dim=0)[self.dp_rank]

                state_[k] = v.detach().clone().to(device)

        return state_
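    # Illustrative sketch (added for exposition): the helper below mirrors the slicing logic of
    # `shard_from_complete_optimizer_state` with plain torch ops, minus the padding bookkeeping.
    # It is a hypothetical example and is not used anywhere in the checkpoint IO code path; all
    # rank and size arguments are assumed toy values supplied by the caller.
    @staticmethod
    def _example_shard_slice(
        full_state: torch.Tensor,
        partition_dim: int,
        tp_size: int,
        tp_rank: int,
        dp_size: int,
        dp_rank: int,
    ) -> torch.Tensor:
        # Tensor-parallel slice: keep this rank's chunk along the partition dimension.
        v = full_state.chunk(tp_size, dim=partition_dim)[tp_rank]
        # ZeRO slice: flatten, pad to a multiple of dp_size, then keep this rank's flat chunk.
        with torch.no_grad():
            v = v.flatten()
            padding_size = (dp_size - v.numel() % dp_size) % dp_size
            if padding_size > 0:
                v = torch.nn.functional.pad(v, [0, padding_size])
            return v.chunk(dp_size, dim=0)[dp_rank]

    # For example, with partition_dim=0, tp_size=2 and dp_size=4, an (8, 4) Adam "exp_avg" tensor
    # yields a flat shard of 4 elements per data-parallel rank on each tensor-parallel rank.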