diff --git a/colossalai/utils/profiler/comm_profiler.py b/colossalai/utils/profiler/comm_profiler.py
index d672a8e20..93c72cc65 100644
--- a/colossalai/utils/profiler/comm_profiler.py
+++ b/colossalai/utils/profiler/comm_profiler.py
@@ -6,20 +6,29 @@ from torch.autograd.profiler import profile
 import torch.distributed as dist
 from torch.distributed import ReduceOp
 from colossalai.utils import get_current_device
-from .prof_utils import BaseProfiler, _format_time, _format_memory, _format_bandwith
+from .prof_utils import BaseProfiler, _format_time, _format_memory, _format_bandwidth
 from typing import List, Optional
 
 
 def _get_code_location(depth: int):
-    ret = ""
-    length = len(inspect.stack())
-    for i in range(3, min(length, depth + 1)):
-        upper_frame = inspect.stack()[i]
-        function_name = inspect.stack()[i - 1].function
-        info = upper_frame.filename + "(" + str(upper_frame.lineno) + "): " + function_name + "\n"
-        ret += info
+    """Render the caller chain of a profiled op as "file(line): function" rows.
+
+    Frames 3 through ``depth`` of the current stack (fewer if the stack is
+    shallower) are reported, one row each: the file and line of the call site
+    in that frame, then the name of the function one frame closer to this
+    helper. Rows are joined with newlines, with no trailing newline.
+    """
+    # inspect.stack() walks the whole stack on every call; snapshot it once
+    # instead of calling it twice per loop iteration.
+    stack = inspect.stack()
+    length = min(len(stack), depth + 1)
+    rows = []
+    for i in range(3, length):
+        upper_frame = stack[i]
+        function_name = stack[i - 1].function
+        rows.append('{}({}): {}'.format(upper_frame.filename, upper_frame.lineno, function_name))
 
-    return ret
+    return '\n'.join(rows)
 
 
 torch_all_reduce = dist.all_reduce
@@ -100,8 +105,9 @@ class CommProfiler(BaseProfiler):
     def result_list(self, sep: str = "\n"):
         res = []
 
-        def append(s: str):
-            res.append(s)
+        def append(s: str = None):
+            if s is not None:
+                res.append(s)
             res.append(sep)
 
         if self.warn_flag:
@@ -110,19 +116,26 @@ class CommProfiler(BaseProfiler):
 
         append("Collective communication profiling result:")
         append("total cuda time: {}".format(_format_time(self.total_cuda_time)))
-        append("average bandwith: {}".format(_format_bandwith(self.total_comm_vol, self.total_cuda_time)))
+        append("average bandwidth: {}".format(_format_bandwidth(self.total_comm_vol, self.total_cuda_time)))
         append("total number of calls: {}".format(self.total_count))
-        append("All events:\n----------------------------------------")
+        append("All events:")
+
+        seperation = '-' * 74
+        row_format = '{:^10}' + '{:^12}' * 2 + '{:^16}' + '{:^12}' * 2
+
+        append(seperation)
+        append(row_format.format('Location', 'GPU time', 'Percentage', 'Comm volume', 'Bandwidth', 'Num of calls'))
+        append(seperation)
 
         show_list = sorted(self.ops_record.items(), key=lambda kv: -kv[1].self_cuda_time)
         for location, event in show_list:
             append(location)
-            append("self cuda time: {}".format(_format_time(event.self_cuda_time)))
-            append("{:.1f}% of total communication time".format(event.self_cuda_time / self.total_cuda_time * 100.0))
-            append("self communication volme: {}".format(_format_memory(event.self_comm_vol)))
-            append("average bandwith: {}".format(_format_bandwith(event.self_comm_vol, event.self_cuda_time)))
-            append("number of calls: {}".format(event.self_count))
-            append("----------------------------------------")
+            append(
+                row_format.format('', _format_time(event.self_cuda_time),
+                                  '{:.1f}%'.format(event.self_cuda_time / self.total_cuda_time * 100.0),
+                                  _format_memory(event.self_comm_vol),
+                                  _format_bandwidth(event.self_comm_vol, event.self_cuda_time), event.self_count))
+            append()
 
         return ''.join(res)
 
diff --git a/colossalai/utils/profiler/pcie_profiler.py b/colossalai/utils/profiler/pcie_profiler.py
index a01a37489..3a9ec95b4 100644
--- a/colossalai/utils/profiler/pcie_profiler.py
+++ b/colossalai/utils/profiler/pcie_profiler.py
@@ -1,6 +1,6 @@
 from pathlib import Path
 from torch.autograd.profiler import profile
-from .prof_utils import BaseProfiler, _format_time, _format_memory, _format_bandwith
+from .prof_utils import BaseProfiler, _format_time, _format_memory, _format_bandwidth
 from typing import List
 
 
@@ -24,6 +24,7 @@ def _reduce_location(locations: List[str]) -> str:
     for lo in locations:
         ret.append(lo)
         ret.append("\n")
+    ret = ret[:-1]
     return ''.join(ret)
 
 
@@ -48,18 +49,27 @@ class PcieProfiler(BaseProfiler):
     TODO: Merge pcie profiler into communication profiler
     """
 
-    def __init__(self,
-                 dtype: str = "fp32",
-                 depth: int = 1,
-                 total_count: int = 0,
-                 total_pcie_vol: int = 0,
-                 total_cuda_time: int = 0):
+    def __init__(self, dtype: str = "fp32", depth: int = 1):
+        """Set up the profiler configuration and start from a clean state.
+
+        Args:
+            dtype: passed to ``_get_size``; the result is kept as ``data_size``.
+            depth: how many leading entries of an event's stack are kept when
+                building its location key.
+        """
         super().__init__(profiler_name="Pcie", priority=10)
         self.depth = depth
         self.data_size = _get_size(dtype)
-        self.total_count = total_count
-        self.total_pcie_vol = total_pcie_vol
-        self.total_cuda_time = total_cuda_time
+        # Counters and records live in reset() so there is one definition of
+        # the initial state.
+        self.reset()
+
+    def reset(self):
+        """Zero the HtoD/DtoH counters and timers, and drop recorded ops and the profiler."""
+        self.h2d_count = 0
+        self.h2d_time = 0
+        self.d2h_count = 0
+        self.d2h_time = 0
 
         self.ops_record = dict()
         self.profiler = None
@@ -81,17 +87,20 @@ class PcieProfiler(BaseProfiler):
             for event in events:
                 if event.name == "aten::copy_":
                     t_shape = event.input_shapes[0]
-                    if len(t_shape) == 0 or event.cuda_time_total == 0:
+                    if len(t_shape) == 0 or event.cuda_time_total == 0 or len(event.stack) == 0:
                         continue
                     current_comm_event = PcieEvent(1, self.data_size * _get_numel(t_shape), event.cuda_time_total)
-                    self.total_count += current_comm_event.count
-                    self.total_pcie_vol += current_comm_event.pcie_vol
-                    self.total_cuda_time += current_comm_event.cuda_time
                     code_location = _reduce_location(event.stack[:self.depth])
                     if code_location in self.ops_record:
                         self.ops_record[code_location].add(current_comm_event)
                     else:
                         self.ops_record[code_location] = current_comm_event
+                elif 'Memcpy HtoD' in event.name:
+                    self.h2d_count += 1
+                    self.h2d_time += event.cuda_time_total
+                elif 'Memcpy DtoH' in event.name:
+                    self.d2h_count += 1
+                    self.d2h_time += event.cuda_time_total
 
         self.profiler = None
 
@@ -108,24 +117,41 @@ class PcieProfiler(BaseProfiler):
     def result_list(self, sep: str = "\n"):
+        """Build the PCIe profiling report as a single string.
+
+        Every emitted piece is followed by ``sep``. The report lists the
+        HtoD/DtoH copy time and counts, then one table row per recorded
+        location, ordered by descending ``cuda_time``.
+        """
         res = []
 
-        def append(s: str):
-            res.append(s)
+        def append(s: str = None):
+            # Called with no argument, this emits just the separator.
+            if s is not None:
+                res.append(s)
             res.append(sep)
 
         append("Pcie profiling result:")
-        append("total cuda time: {}".format(_format_time(self.total_cuda_time)))
-        append("average bandwith: {}".format(_format_bandwith(self.total_pcie_vol, self.total_cuda_time)))
-        append("total number of calls: {}".format(self.total_count))
-        append("All events:\n----------------------------------------")
+        append("time of data transmission (CPU -> GPU): {}".format(_format_time(self.h2d_time)))
+        append("number of transmission (CPU -> GPU): {}".format(self.h2d_count))
+        append("time of data transmission (GPU -> CPU): {}".format(_format_time(self.d2h_time)))
+        append("number of transmission (GPU -> CPU): {}".format(self.d2h_count))
+
+        append("Possible data transmission events in PCIE:")
+
+        # 62 == 10 + 12 + 16 + 12 + 12, the summed column widths of row_format.
+        seperation = '-' * 62
+        row_format = '{:^10}' + '{:^12}' + '{:^16}' + '{:^12}' * 2
+
+        append(seperation)
+        append(row_format.format('Location', 'GPU time', 'Trans volume', 'Bandwidth', 'Num of calls'))
+        append(seperation)
 
         show_list = sorted(self.ops_record.items(), key=lambda kv: -kv[1].cuda_time)
         for location, event in show_list:
             append(location)
-            append("cuda time: {}".format(_format_time(event.cuda_time)))
-            append("{:.1f}% of total pcie time".format(event.cuda_time / self.total_cuda_time * 100.0))
-            append("pcie volme: {}".format(_format_memory(event.pcie_vol)))
-            append("average bandwith: {}".format(_format_bandwith(event.pcie_vol, event.cuda_time)))
-            append("number of calls: {}".format(event.count))
-            append("----------------------------------------")
+            # Location was printed on its own line above, so its column is left blank here.
+            append(
+                row_format.format('', _format_time(event.cuda_time), _format_memory(event.pcie_vol),
+                                  _format_bandwidth(event.pcie_vol, event.cuda_time), event.count))
+            append()
 
         return ''.join(res)
diff --git a/colossalai/utils/profiler/prof_utils.py b/colossalai/utils/profiler/prof_utils.py
index d71906868..641a514cf 100644
--- a/colossalai/utils/profiler/prof_utils.py
+++ b/colossalai/utils/profiler/prof_utils.py
@@ -32,7 +32,7 @@ def _format_memory(nbytes):
         return str(nbytes) + ' B'
 
 
-def _format_bandwith(volme: float or int, time_us: int):
+def _format_bandwidth(volme: float or int, time_us: int):
     sec_div_mb = (1000.0 / 1024.0)**2
     mb_per_sec = volme / time_us * sec_div_mb