Merge pull request #409 from 1SAA/develop

[hotfix] fixed error when no collective communication in CommProfiler
pull/413/head
Frank Lee 2022-03-14 17:43:45 +08:00 committed by GitHub
commit 32296cf462
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 11 additions and 8 deletions

View File

@@ -93,16 +93,16 @@ class CommProfiler(BaseProfiler):
 dist.reduce = torch_reduce
 def to_tensorboard(self, writer):
-writer.add_text(tag="Collective Communication", text_string=self.result_list("\n\n"))
+writer.add_text(tag="Collective Communication", text_string=self.result_str("\n\n"))
 def to_file(self, filename: Path):
 with open(filename, "w") as f:
-f.write(self.result_list())
+f.write(self.result_str())
 def show(self):
-print(self.result_list())
+print(self.result_str())
-def result_list(self, sep: str = "\n"):
+def result_str(self, sep: str = "\n"):
 res = []
 def append(s: str = None):
@@ -114,6 +114,9 @@ class CommProfiler(BaseProfiler):
 append("Warnning: there exists multiple communication operations in the same time. As a result, "
 "the profiling result is not accurate.")
+if self.total_cuda_time == 0:
+return "No collective communication has been called yet!"
 append("Collective communication profiling result:")
 append("total cuda time: {}".format(_format_time(self.total_cuda_time)))
 append("average bandwidth: {}".format(_format_bandwidth(self.total_comm_vol, self.total_cuda_time)))

View File

@@ -105,16 +105,16 @@ class PcieProfiler(BaseProfiler):
 self.profiler = None
 def to_tensorboard(self, writer):
-writer.add_text(tag="Data Transmission", text_string=self.result_list("\n\n"))
+writer.add_text(tag="Data Transmission", text_string=self.result_str("\n\n"))
 def to_file(self, filename: Path):
 with open(filename, "w") as f:
-f.write(self.result_list())
+f.write(self.result_str())
 def show(self):
-print(self.result_list())
+print(self.result_str())
-def result_list(self, sep: str = "\n"):
+def result_str(self, sep: str = "\n"):
 res = []
 def append(s: str = None):