mirror of https://github.com/hpcaitech/ColossalAI
use rank-based JSON file to avoid inconsistency
parent 9ac531aba5
commit a1d7ab041d
@@ -8,6 +8,7 @@ from torch.distributed import ReduceOp
 from colossalai.utils import get_current_device
 from .prof_utils import BaseProfiler, _format_time, _format_memory, _format_bandwidth
 from typing import List, Optional
+from colossalai.core import global_context as gpc
 import json

 def _get_code_location(depth: int):
@@ -109,8 +110,9 @@ class CommProfiler(BaseProfiler):
             })

         data["events"] = events_list
+        rank = gpc.get_global_rank()

-        with open(json_dir.joinpath("communication.json"), "w") as f:
+        with open(json_dir.joinpath(f"worker{rank}.communication.json"), "w") as f:
             json.dump(data, f)

     def to_file(self, filename: Path):
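Before this change, every rank in a distributed job dumped its profile to the same communication.json, so concurrent writers could clobber one another; the rank-suffixed filename gives each process its own output file. Below is a minimal sketch of the resulting pattern, with the profiler's surrounding to_json method reduced to a standalone function — the data payload and the directory argument are illustrative assumptions, only the rank-suffixed filename is taken verbatim from the hunk:

import json
from pathlib import Path


def dump_per_rank(data: dict, json_dir: Path, rank: int) -> None:
    # One file per rank (worker0.communication.json, worker1..., ...),
    # so no two processes ever open the same path for writing.
    with open(json_dir.joinpath(f"worker{rank}.communication.json"), "w") as f:
        json.dump(data, f)


if __name__ == "__main__":
    # In-tree the rank comes from gpc.get_global_rank(); outside a
    # launched ColossalAI job we stub it with 0 for illustration.
    dump_per_rank({"events": []}, Path("."), rank=0)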
@@ -4,6 +4,7 @@ from colossalai.engine import Engine
 from colossalai.engine.ophooks import MemTracerOpHook
 from colossalai.utils.profiler import BaseProfiler
 import json
+from colossalai.core import global_context as gpc

 class MemProfiler(BaseProfiler):
     """Wraper of MemOpHook, used to show GPU memory usage through each iteration
@@ -46,7 +47,7 @@ class MemProfiler(BaseProfiler):
                 "cuda_usage": stats
             }
         }
-
+        rank = gpc.get_global_rank()
         with open(log_dir.joinpath(f"worker{rank}.memory.json"), "w") as f:
             json.dump(data, f)
@@ -3,6 +3,7 @@ from torch.autograd.profiler import profile
 from .prof_utils import BaseProfiler, _format_time, _format_memory, _format_bandwidth
 from typing import List
 import json
+from colossalai.core import global_context as gpc

 def _get_size(dtype: str):
     if dtype == "fp16":
@@ -121,8 +122,9 @@ class PcieProfiler(BaseProfiler):
                 "count": event.count
             })
         data["events"] = events_list
+        rank = gpc.get_global_rank()

-        with open(json_dir.joinpath("pcie.json"), "w") as f:
+        with open(json_dir.joinpath(f"worker{rank}.pcie.json"), "w") as f:
             json.dump(data, f)

     def to_file(self, filename: Path):
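Since the results are now spread across worker{rank}.*.json files, any post-processing step has to gather them itself. A hypothetical reader is sketched below, assuming only the naming scheme introduced above; the merge shape (a rank-to-payload dict) is an illustrative choice, not part of this commit:

import json
from pathlib import Path


def load_all_ranks(json_dir: Path, kind: str = "communication") -> dict:
    """Map global rank -> parsed payload for files named worker{rank}.{kind}.json."""
    merged = {}
    for path in sorted(json_dir.glob(f"worker*.{kind}.json")):
        # A name like "worker3.communication.json" encodes the rank
        # in its first dotted component.
        rank = int(path.name.split(".")[0][len("worker"):])
        with open(path) as f:
            merged[rank] = json.load(f)
    return merged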