mirror of https://github.com/hpcaitech/ColossalAI
use rank-based JSON file to avoid inconsistency
parent
9ac531aba5
commit
a1d7ab041d
|
@ -8,6 +8,7 @@ from torch.distributed import ReduceOp
|
|||
from colossalai.utils import get_current_device
|
||||
from .prof_utils import BaseProfiler, _format_time, _format_memory, _format_bandwidth
|
||||
from typing import List, Optional
|
||||
from colossalai.core import global_context as gpc
|
||||
import json
|
||||
|
||||
def _get_code_location(depth: int):
|
||||
|
@ -109,8 +110,9 @@ class CommProfiler(BaseProfiler):
|
|||
})
|
||||
|
||||
data["events"] = events_list
|
||||
|
||||
with open(json_dir.joinpath("communication.json"), "w") as f:
|
||||
rank = gpc.get_global_rank()
|
||||
|
||||
with open(json_dir.joinpath(f"worker{rank}.communication.json"), "w") as f:
|
||||
json.dump(data, f)
|
||||
|
||||
def to_file(self, filename: Path):
|
||||
|
|
|
@ -4,6 +4,7 @@ from colossalai.engine import Engine
|
|||
from colossalai.engine.ophooks import MemTracerOpHook
|
||||
from colossalai.utils.profiler import BaseProfiler
|
||||
import json
|
||||
from colossalai.core import global_context as gpc
|
||||
|
||||
class MemProfiler(BaseProfiler):
|
||||
"""Wrapper of MemOpHook, used to show GPU memory usage through each iteration
|
||||
|
@ -46,7 +47,7 @@ class MemProfiler(BaseProfiler):
|
|||
"cuda_usage": stats
|
||||
}
|
||||
}
|
||||
|
||||
rank = gpc.get_global_rank()
|
||||
with open(log_dir.joinpath(f"worker{rank}.memory.json"), "w") as f:
|
||||
json.dump(data, f)
|
||||
|
||||
|
|
|
@ -3,6 +3,7 @@ from torch.autograd.profiler import profile
|
|||
from .prof_utils import BaseProfiler, _format_time, _format_memory, _format_bandwidth
|
||||
from typing import List
|
||||
import json
|
||||
from colossalai.core import global_context as gpc
|
||||
|
||||
def _get_size(dtype: str):
|
||||
if dtype == "fp16":
|
||||
|
@ -121,8 +122,9 @@ class PcieProfiler(BaseProfiler):
|
|||
"count": event.count
|
||||
})
|
||||
data["events"] = events_list
|
||||
|
||||
with open(json_dir.joinpath("pcie.json"), "w") as f:
|
||||
rank = gpc.get_global_rank()
|
||||
|
||||
with open(json_dir.joinpath(f"worker{rank}.pcie.json"), "w") as f:
|
||||
json.dump(data, f)
|
||||
|
||||
def to_file(self, filename: Path):
|
||||
|
|
Loading…
Reference in New Issue