diff --git a/colossalai/utils/profiler/comm_profiler.py b/colossalai/utils/profiler/comm_profiler.py
index 9fea2a1cf..09264f94a 100644
--- a/colossalai/utils/profiler/comm_profiler.py
+++ b/colossalai/utils/profiler/comm_profiler.py
@@ -8,6 +8,7 @@ from torch.distributed import ReduceOp
 from colossalai.utils import get_current_device
 from .prof_utils import BaseProfiler, _format_time, _format_memory, _format_bandwidth
 from typing import List, Optional
+from colossalai.core import global_context as gpc
 import json
 
 def _get_code_location(depth: int):
@@ -109,8 +110,9 @@ class CommProfiler(BaseProfiler):
             })
 
         data["events"] = events_list
-
-        with open(json_dir.joinpath("communication.json"), "w") as f:
+        rank = gpc.get_global_rank()
+
+        with open(json_dir.joinpath(f"worker{rank}.communication.json"), "w") as f:
             json.dump(data, f)
 
     def to_file(self, filename: Path):
diff --git a/colossalai/utils/profiler/mem_profiler.py b/colossalai/utils/profiler/mem_profiler.py
index 85469bb49..9ff11b1b4 100644
--- a/colossalai/utils/profiler/mem_profiler.py
+++ b/colossalai/utils/profiler/mem_profiler.py
@@ -4,6 +4,7 @@ from colossalai.engine import Engine
 from colossalai.engine.ophooks import MemTracerOpHook
 from colossalai.utils.profiler import BaseProfiler
 import json
+from colossalai.core import global_context as gpc
 
 class MemProfiler(BaseProfiler):
     """Wraper of MemOpHook, used to show GPU memory usage through each iteration
@@ -46,7 +47,7 @@ class MemProfiler(BaseProfiler):
                 "cuda_usage": stats
             }
         }
-
+        rank = gpc.get_global_rank()
         with open(log_dir.joinpath(f"worker{rank}.memory.json"), "w") as f:
             json.dump(data, f)
 
diff --git a/colossalai/utils/profiler/pcie_profiler.py b/colossalai/utils/profiler/pcie_profiler.py
index d7b0a312e..b4bb2ff62 100644
--- a/colossalai/utils/profiler/pcie_profiler.py
+++ b/colossalai/utils/profiler/pcie_profiler.py
@@ -3,6 +3,7 @@ from torch.autograd.profiler import profile
 from .prof_utils import BaseProfiler, _format_time, _format_memory, _format_bandwidth
 from typing import List
 import json
+from colossalai.core import global_context as gpc
 
 def _get_size(dtype: str):
     if dtype == "fp16":
@@ -121,8 +122,9 @@ class PcieProfiler(BaseProfiler):
                 "count": event.count
             })
         data["events"] = events_list
-
-        with open(json_dir.joinpath("pcie.json"), "w") as f:
+        rank = gpc.get_global_rank()
+
+        with open(json_dir.joinpath(f"worker{rank}.pcie.json"), "w") as f:
             json.dump(data, f)
 
     def to_file(self, filename: Path):