mirror of https://github.com/hpcaitech/ColossalAI
[fix] rm output.data after send fwd;
parent a48afc4a66
commit ab643c9af7
@@ -25,6 +25,24 @@ def _wait_p2p(wait_handles: List[torch.cuda.Event]) -> None:
         req.wait()


+def deallocate_output_tensor(out, deallocate_pipeline_outputs=False):
+    """Pseudo-deallocate (i.e., set to scalar) the output tensor's '.data' field.
+
+    This method should be called right after the output tensor has been
+    sent to the next pipeline stage. At this point, the output tensor is
+    only useful for its '.grad_fn' field, and not its '.data'.
+    """
+    if (out is None) or (not deallocate_pipeline_outputs):
+        print(
+            f"(out is None) or (not deallocate_pipeline_outputs): {(out is None) or (not deallocate_pipeline_outputs)}"
+        )
+        return
+    assert isinstance(out, torch.Tensor), "expected Tensor, found %s." % type(out).__name__
+    assert out._base is None, "counter-productive to free a view of another tensor."
+    # out.data = torch.empty((1,), device=out.device, dtype=out.dtype,)
+    out.data.storage().resize_(0)
+
+
 class ZeroBubbleVPipeScheduler(PipelineSchedule):
     def __init__(
         self,
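Why this works: freeing a tensor's storage does not free the autograd graph reachable through its grad_fn, and storage().resize_(0) leaves the shape metadata in place, so a plain torch.autograd.backward on the freed tensor still passes its shape check. (The commented-out alternative, out.data = torch.empty((1,), ...), is the Megatron-LM variant; it changes the tensor's shape and therefore needs a custom backward entry point.) A minimal standalone sketch of the idea, not part of the commit:

    import torch

    x = torch.randn(4, 8, requires_grad=True)
    y = (x * 2).clone()             # stand-in for a model chunk's output

    y.data.storage().resize_(0)     # pseudo-deallocate after the send
    print(y.data.storage().size())  # 0 -- activation memory released
    print(y.shape, y.grad_fn)       # torch.Size([4, 8]), <CloneBackward0> -- metadata and graph intact

    # Backward still works: CloneBackward only routes the incoming gradient
    # to its input and never reads y's (now freed) values.
    torch.autograd.backward(y, grad_tensors=torch.ones(4, 8))
    print(x.grad[0, 0].item())      # 2.0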
@@ -562,10 +580,13 @@ class ZeroBubbleVPipeScheduler(PipelineSchedule):
         )
         # add input and output object for backward b
         self.input_tensors[model_chunk_id].append(input_obj)
-        self.output_tensors[model_chunk_id].append(output_obj)
+        # detached output; for bwd b&w, we only need the graph(grad_fn) of output_obj
+        detached_output_obj = output_obj.clone()
+        deallocate_output_tensor(detached_output_obj, deallocate_pipeline_outputs=True)
+        self.output_tensors[model_chunk_id].append(detached_output_obj)
         # add output object for backward w
-        self.output_tensors_dw[model_chunk_id].append(output_obj)
+        self.output_tensors_dw[model_chunk_id].append(detached_output_obj)

         # Step3: send fwd
         # add output to send_fwd_buffer

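Note on the change above: previously both output_tensors and output_tensors_dw held output_obj itself, pinning the full activation until backward b and w had run, even after the data had been sent to the next stage; now both buffers share one storage-freed clone whose grad_fn still chains into the model chunk's graph. A rough sketch of the bookkeeping pattern, with a hypothetical helper name and plain lists standing in for the scheduler's per-chunk buffers:

    import torch

    input_tensors, output_tensors, output_tensors_dw = [], [], []

    def buffer_for_backward(input_obj, output_obj):
        """Hypothetical helper: keep what backward b/w need while letting
        output_obj's data be freed once it has been sent forward."""
        input_tensors.append(input_obj)                # for backward b
        # The clone owns fresh storage, so freeing it leaves the real output
        # (still queued for the forward send) untouched, while its
        # CloneBackward node keeps a path into the model chunk's graph.
        detached_output_obj = output_obj.clone()
        detached_output_obj.data.storage().resize_(0)  # as deallocate_output_tensor does
        output_tensors.append(detached_output_obj)
        # backward w reuses the same graph handle instead of pinning a
        # second reference to the full activation.
        output_tensors_dw.append(detached_output_obj)
        return output_obj                              # data intact for the send

    x = torch.randn(2, 4, requires_grad=True)
    sent = buffer_for_backward(x, x * 2)
    print(sent.storage().size())               # 8 -- real output keeps its data
    print(output_tensors[0].storage().size())  # 0 -- buffered copy holds only the graph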
@@ -2,8 +2,7 @@ from .albert import *
 from .bert import *
 from .blip2 import *
 from .bloom import *
-from .chatglm2 import *
+# from .chatglm2 import *
 from .command import *
 from .deepseek import *
 from .falcon import *

@@ -14,7 +14,6 @@ from colossalai.logging import disable_existing_loggers
 from colossalai.pipeline.schedule.v_schedule import PipelineGraph, ScheduledNode
 from colossalai.pipeline.schedule.zero_bubble_pp import ZeroBubbleVPipeScheduler
 from colossalai.pipeline.stage_manager import PipelineStageManager
-from colossalai.tensor.d_tensor.api import clear_layout_converter
 from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn
 from tests.kit.model_zoo import model_zoo

@@ -701,56 +700,13 @@ def run_with_hybridplugin(test_config):
     ],
 )
 def run_with_moehybridplugin(test_config):
-    sub_model_zoo = model_zoo.get_sub_registry("transformers_bert")
+    model_zoo.get_sub_registry("transformers_bert")
     test_config["use_lazy_init"] = False
     test_config["pp_size"] = 1  # Do NOT test Pipeline Parallel
     test_config["initial_scale"] = 2**16  # avoid overflow
     model_list = [
         "transformers_bert",
     ]
-    clear_layout_converter()
-    torch.set_default_dtype(torch.bfloat16)
-    for name, (model_fn, data_gen_fn, output_transform_fn, loss_fn, _) in sub_model_zoo.items():
-        data_gen_fn()
-        # print(f"data {data}")
-        # if name in model_list:
-        #     (
-        #         org_model,
-        #         org_optimizer,
-        #         sharded_model,
-        #         sharded_optimizer,
-        #         criterion,
-        #         booster,
-        #     ) = build_model_from_hybrid_plugin(model_fn, loss_fn, test_config, torch.optim.SGD, torch.optim.SGD)
-
-        #     org_loss, org_output, sharded_loss, sharded_output = run_forward_backward_with_hybrid_plugin(
-        #         org_model, sharded_model, sharded_optimizer, data_gen_fn, output_transform_fn, criterion, booster
-        #     )
-
-        #     stage_manager = booster.plugin.stage_manager
-        #     tp_group = booster.plugin.tp_group
-
-        #     bert = unwrap_model(org_model, "BertModel", "bert")
-        #     sharded_bert = unwrap_model(sharded_model, "BertModel", "bert")
-        #     weight_layer_for_check = ["encoder.layer[0].output.dense", "encoder.layer[1].output.dense"]
-
-        #     org_optimizer.step()
-        #     sharded_optimizer.step()
-
-        #     # check weights
-        #     if test_config["precision"] == "bf16":
-        #         atol, rtol = 5e-4, 5e-4
-        #     else:
-        #         atol, rtol = 5e-4, 5e-4
-        #     if stage_manager is None or stage_manager.is_first_stage(ignore_chunk=True):
-        #         check_weight(bert, sharded_bert, weight_layer_for_check, tp_group, atol=atol, rtol=rtol, dim=1)
-        #     # check optim states
-        #     # check_dist_optim_state(org_optimizer, sharded_optimizer.optim)
-
-        # clear_layout_converter()
-        # Randomizer.reset_index()
-        # torch.cuda.empty_cache()
-        # print(f"Bert Model Zoo Test Passed")


 # TODO:6) support booster & Hybrid base 4)