mirror of https://github.com/hpcaitech/ColossalAI
polish utils docstring (#620)
parent e619a651fb
commit 369a288bf3
@@ -175,7 +175,7 @@ def load_checkpoint(checkpoint_path: str,
     If strict is True, then the keys of state_dict must exactly match the keys returned
     by this module’s state_dict() function.

     Args:
         checkpoint_path (str): The exact and matched checkpoint_path directory to retrieve appropriate state_dict.
         model (:class:`torch.nn.Module`): Model to reload parameters and buffers.
         optimizer (Union[:class:`torch.optim.Optimizer`, :class:`colossalai.nn.optimizer`]): Optimizer to recuperate.
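For readers skimming the hunk, a minimal sketch of how this checkpoint API is typically driven. Only the positional form `load_checkpoint(checkpoint_path, model, optimizer)` and the strict key-matching behaviour are taken from the docstring above; the import path and checkpoint location are assumptions, and the return value is not captured because the hunk does not document it.

```python
# Hedged usage sketch: only load_checkpoint(checkpoint_path, model, optimizer)
# is visible in the hunk above; the import path and checkpoint path are assumed.
import torch
from colossalai.utils import load_checkpoint  # assumed import path

model = torch.nn.Linear(20, 30)
optimizer = torch.optim.Adam(model.parameters())

# Restores parameters/buffers into `model` and state into `optimizer`.
# With strict=True the checkpoint's keys must exactly match model.state_dict().
load_checkpoint('./ckpt', model, optimizer)
```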
@@ -11,32 +11,31 @@ from colossalai.utils import get_current_device
 class AsyncMemoryMonitor:
     """
     An Async Memory Monitor running during computing. Sampling memory usage of the current GPU
-    at interval of 1/(10**power) sec.
+    at interval of `1/(10**power)` sec.

     The idea comes from Runtime Memory Tracer of PatrickStar
-    PatrickStar: Parallel Training of Pre-trained Models via Chunk-based Memory Management
-    https://arxiv.org/abs/2108.05818
-
-    :param power: the power of time interval, defaults to 10
-    :type power: int
-
-    Usage:
-    ::
-
-    ```python
-    async_mem_monitor = AsyncMemoryMonitor()
-    input = torch.randn(2, 20).cuda()
-    OP1 = torch.nn.Linear(20, 30).cuda()
-    OP2 = torch.nn.Linear(30, 40).cuda()
-
-    async_mem_monitor.start()
-    output = OP1(input)
-    async_mem_monitor.finish()
-    async_mem_monitor.start()
-    output = OP2(output)
-    async_mem_monitor.finish()
-    async_mem_monitor.save('log.pkl')
-    ```
+    `PatrickStar: Parallel Training of Pre-trained Models via Chunk-based Memory Management`_
+
+    Usage::
+
+        async_mem_monitor = AsyncMemoryMonitor()
+        input = torch.randn(2, 20).cuda()
+        OP1 = torch.nn.Linear(20, 30).cuda()
+        OP2 = torch.nn.Linear(30, 40).cuda()
+
+        async_mem_monitor.start()
+        output = OP1(input)
+        async_mem_monitor.finish()
+        async_mem_monitor.start()
+        output = OP2(output)
+        async_mem_monitor.finish()
+        async_mem_monitor.save('log.pkl')
+
+    Args:
+        power (int, optional): the power of the time interval. Defaults to 10.
+
+    .. _PatrickStar\: Parallel Training of Pre-trained Models via Chunk-based Memory Management:
+        https://arxiv.org/abs/2108.05818
     """

     def __init__(self, power: int = 10):
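The docstring describes the contract (start sampling, run an op, finish, save) but not the mechanics. Below is a minimal sketch of how such a sampler can be built with a one-worker thread pool polling `torch.cuda.max_memory_allocated()` at the documented `1/(10**power)` period; it illustrates the sampling idea under those assumptions and is not ColossalAI's implementation.

```python
# Illustrative async GPU-memory sampler; not ColossalAI's AsyncMemoryMonitor.
import pickle
import time
from concurrent.futures import ThreadPoolExecutor

import torch


class SimpleMemoryMonitor:
    def __init__(self, power: int = 10):
        self.interval = 1 / (10 ** power)      # sampling period in seconds
        self.executor = ThreadPoolExecutor(max_workers=1)
        self.keep_measuring = False
        self.mem_stats = []

    def start(self) -> None:
        self.keep_measuring = True
        self.future = self.executor.submit(self._measure)

    def finish(self) -> int:
        self.keep_measuring = False
        peak = self.future.result()            # join the sampling worker
        self.mem_stats.append(peak)
        return peak

    def _measure(self) -> int:
        max_usage = 0
        while self.keep_measuring:
            max_usage = max(max_usage, torch.cuda.max_memory_allocated())
            torch.cuda.reset_peak_memory_stats()
            time.sleep(self.interval)
        return max_usage

    def save(self, filename: str) -> None:
        with open(filename, 'wb') as f:
            pickle.dump(self.mem_stats, f)
```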
@@ -12,6 +12,8 @@ class MemProfiler(BaseProfiler):
     To use this profiler, you need to pass an `engine` instance, and the usage is the same as
     CommProfiler.

+    Usage::
+
     mm_prof = MemProfiler(engine)
     with ProfilerContext([mm_prof]) as prof:
         writer = SummaryWriter("mem")
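The diff context cuts the usage example off right after the `SummaryWriter` line. Given the `to_tensorboard` and `to_file` methods in the next hunk, a plausible continuation looks like the sketch below; the workload step and file name are assumptions.

```python
# Assumed continuation of the truncated usage block above; `engine` comes from
# colossalai initialization and the workload step is hypothetical.
from pathlib import Path

from torch.utils.tensorboard import SummaryWriter

mm_prof = MemProfiler(engine)
with ProfilerContext([mm_prof]) as prof:
    writer = SummaryWriter("mem")
    # ... run forward/backward steps with `engine` here ...
    mm_prof.to_tensorboard(writer)            # write sampled memory stats as scalars
    mm_prof.to_file(Path("mem_profile.pkl"))  # or persist them to disk
```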
@@ -36,11 +38,7 @@ class MemProfiler(BaseProfiler):
     def to_tensorboard(self, writer: SummaryWriter) -> None:
         stats = self._mem_tracer.async_mem_monitor.state_dict['mem_stats']
         for info, i in enumerate(stats):
-            writer.add_scalar(
-                "memory_usage/GPU",
-                info,
-                i
-            )
+            writer.add_scalar("memory_usage/GPU", info, i)

     def to_file(self, data_file: Path) -> None:
         self._mem_tracer.save_results(data_file)
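One wart the polish leaves in place: `enumerate(stats)` yields `(index, value)` pairs, so `info` is bound to the loop index and `i` to the sampled value, meaning `add_scalar("memory_usage/GPU", info, i)` logs the index as the scalar and the measurement as the step. Assuming `stats` is a sequence of per-step memory readings, the likely intent is:

```python
# enumerate() yields (index, value); naming them accordingly logs the
# memory reading as the scalar and the sample index as the global step.
for step, mem_usage in enumerate(stats):
    writer.add_scalar("memory_usage/GPU", mem_usage, step)
```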
@@ -70,29 +70,26 @@ class BaseProfiler(ABC):
 class ProfilerContext(object):
-    """
-    Profiler context manager
-    Usage:
-    ::
-
-    ```python
-    world_size = 4
-    inputs = torch.randn(10, 10, dtype=torch.float32, device=get_current_device())
-    outputs = torch.empty(world_size, 10, 10, dtype=torch.float32, device=get_current_device())
-    outputs_list = list(torch.chunk(outputs, chunks=world_size, dim=0))
-
-    cc_prof = CommProfiler()
-
-    with ProfilerContext([cc_prof]) as prof:
-        op = dist.all_reduce(inputs, async_op=True)
-        dist.all_gather(outputs_list, inputs)
-        op.wait()
-        dist.reduce_scatter(inputs, outputs_list)
-        dist.broadcast(inputs, 0)
-        dist.reduce(inputs, 0)
-
-    prof.show()
-    ```
+    """Profiler context manager
+
+    Usage::
+
+        world_size = 4
+        inputs = torch.randn(10, 10, dtype=torch.float32, device=get_current_device())
+        outputs = torch.empty(world_size, 10, 10, dtype=torch.float32, device=get_current_device())
+        outputs_list = list(torch.chunk(outputs, chunks=world_size, dim=0))
+
+        cc_prof = CommProfiler()
+
+        with ProfilerContext([cc_prof]) as prof:
+            op = dist.all_reduce(inputs, async_op=True)
+            dist.all_gather(outputs_list, inputs)
+            op.wait()
+            dist.reduce_scatter(inputs, outputs_list)
+            dist.broadcast(inputs, 0)
+            dist.reduce(inputs, 0)
+
+        prof.show()
     """

     def __init__(self, profilers: List[BaseProfiler] = None, enable: bool = True):
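To make the docstring's `with ProfilerContext([cc_prof]) as prof:` pattern concrete, here is a minimal sketch of a profiler context manager with the same constructor shape. The `enable()`/`disable()`/`show()` hook names are assumptions about the `BaseProfiler` interface, not taken from this diff.

```python
# Illustrative sketch only; the hook names on the profiler objects are assumed.
from typing import List


class SketchProfilerContext:
    """Enables every profiler on entry and disables them on exit."""

    def __init__(self, profilers: List = None, enable: bool = True):
        self.enable = enable
        self.profilers = profilers if profilers is not None else []

    def __enter__(self):
        if self.enable:
            for prof in self.profilers:
                prof.enable()              # assumed BaseProfiler hook
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        if self.enable:
            for prof in self.profilers:
                prof.disable()             # assumed BaseProfiler hook
        return False                       # propagate any exception

    def show(self):
        for prof in self.profilers:
            prof.show()                    # assumed BaseProfiler hook
```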