use rank-based JSON files to avoid inconsistency

pull/717/head
Jie Zhu 2022-04-11 17:00:47 +08:00
parent 9ac531aba5
commit a1d7ab041d
3 changed files with 10 additions and 5 deletions
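Before this change, every rank dumped its profiling results to the same JSON path (e.g. communication.json or pcie.json), so concurrent writers could clobber one another and leave an inconsistent file. Writing to a per-rank path such as worker{rank}.communication.json gives each process its own output. A minimal sketch of the pattern, assuming a plain rank integer in place of colossalai's gpc.get_global_rank() and a hypothetical dump_profile helper:

    import json
    from pathlib import Path

    def dump_profile(data: dict, json_dir: Path, rank: int) -> None:
        # One file per rank, so no two processes ever write to the same path.
        json_dir.mkdir(parents=True, exist_ok=True)
        with open(json_dir.joinpath(f"worker{rank}.communication.json"), "w") as f:
            json.dump(data, f)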


@@ -8,6 +8,7 @@ from torch.distributed import ReduceOp
 from colossalai.utils import get_current_device
 from .prof_utils import BaseProfiler, _format_time, _format_memory, _format_bandwidth
 from typing import List, Optional
+from colossalai.core import global_context as gpc
 import json

 def _get_code_location(depth: int):
@@ -109,8 +110,9 @@ class CommProfiler(BaseProfiler):
             })
         data["events"] = events_list

-        with open(json_dir.joinpath("communication.json"), "w") as f:
+        rank = gpc.get_global_rank()
+        with open(json_dir.joinpath(f"worker{rank}.communication.json"), "w") as f:
             json.dump(data, f)

     def to_file(self, filename: Path):


@@ -4,6 +4,7 @@ from colossalai.engine import Engine
 from colossalai.engine.ophooks import MemTracerOpHook
 from colossalai.utils.profiler import BaseProfiler
 import json
+from colossalai.core import global_context as gpc

 class MemProfiler(BaseProfiler):
     """Wraper of MemOpHook, used to show GPU memory usage through each iteration
@@ -46,7 +47,7 @@ class MemProfiler(BaseProfiler):
                 "cuda_usage": stats
             }
         }
+        rank = gpc.get_global_rank()
+        with open(log_dir.joinpath(f"worker{rank}.memory.json"), "w") as f:
             json.dump(data, f)


@@ -3,6 +3,7 @@ from torch.autograd.profiler import profile
 from .prof_utils import BaseProfiler, _format_time, _format_memory, _format_bandwidth
 from typing import List
 import json
+from colossalai.core import global_context as gpc

 def _get_size(dtype: str):
     if dtype == "fp16":
@@ -121,8 +122,9 @@ class PcieProfiler(BaseProfiler):
                 "count": event.count
             })
         data["events"] = events_list

-        with open(json_dir.joinpath("pcie.json"), "w") as f:
+        rank = gpc.get_global_rank()
+        with open(json_dir.joinpath(f"worker{rank}.pcie.json"), "w") as f:
             json.dump(data, f)

     def to_file(self, filename: Path):
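The per-worker files written above can be merged back together after a run; a minimal sketch, assuming all ranks share one output directory and using a hypothetical collect_worker_reports helper (not part of this commit):

    import json
    from pathlib import Path

    def collect_worker_reports(log_dir: Path, kind: str = "communication") -> dict:
        # Gather the worker0, worker1, ... dumps of one profiler kind into a single dict.
        reports = {}
        for path in sorted(log_dir.glob(f"worker*.{kind}.json")):
            worker_id = path.name.split(".")[0]
            with open(path) as f:
                reports[worker_id] = json.load(f)
        return reports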