pull/5994/head
Tong Li 2024-08-13 09:35:03 +00:00
parent 22218d31e1
commit 2422341d03
2 changed files with 2 additions and 2 deletions

@@ -30,7 +30,7 @@ MODEL_SAVE_PATH=$TEMP_DIR/rlhf_models
MODELS_DIR=$TEMP_DIR/models_config
# Skip those tests due to CI tests timeout
MODELS=('llama')
-ADVANCED_PLUGINS=('zero2' 'sp_split_gather' 'sp_ring' 'sp_all_to_all' 'tp_zero2' '3d' 'gemini' 'gemini_auto' 'zero2_cpu' 'pp')
+ADVANCED_PLUGINS=('zero2' 'sp_split_gather' 'sp_ring' 'sp_all_to_all' 'tp_zero2' '3d' 'gemini' 'gemini_auto' 'zero2_cpu' 'pp' 'tp_pp')
PLUGINS=('zero2' '3d' 'gemini' 'gemini_auto' 'zero2_cpu')
LORA_RANK=('0') # skip to reduce CI execution time, can pass all locally
LORA_CONFIG_ENABLE="--lora_config $BASE_DIR/examples/training_scripts/lora_config.json"
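
Note on the first hunk: the arrays above define the CI test matrix, and this commit adds the 'tp_pp' configuration (presumably tensor parallelism combined with pipeline parallelism) to ADVANCED_PLUGINS. A minimal, purely illustrative bash sketch of how such a matrix is typically iterated; the loop body and variable names are assumptions, not the repository's actual test script.

#!/usr/bin/env bash
# Illustrative sketch only: run each model against every advanced plugin,
# which now includes the newly added 'tp_pp' entry.
for model in "${MODELS[@]}"; do
  for plugin in "${ADVANCED_PLUGINS[@]}"; do
    # The real CI script invokes the training test here.
    echo "testing model=$model with plugin=$plugin"
  done
done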

@@ -1328,7 +1328,7 @@ class HybridParallelPlugin(PipelinePluginBase):
        # run with gradients accumulation
        if model.require_grad_sync == False or (
            isinstance(optimizer, HybridParallelZeroOptimizer) and optimizer.require_grad_sync == False
-        ):
+        ) or not torch.is_grad_enabled():
            return outputs
        # Synchronize the grads of shared parameters of the model.
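
Note on the second hunk: the added clause makes the early return also fire whenever autograd is disabled, for example inside a torch.no_grad() block during evaluation or generation, so no gradient synchronization is attempted in that case. A small self-contained sketch of the torch.is_grad_enabled() behavior the check relies on (the prints are illustrative only):

import torch

# torch.is_grad_enabled() reports whether autograd is currently recording operations.
print(torch.is_grad_enabled())      # True during an ordinary training step

with torch.no_grad():
    # Inside no_grad (typical for evaluation/generation) it returns False,
    # so the patched condition above returns early and skips grad synchronization.
    print(torch.is_grad_enabled())  # False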