mirror of https://github.com/hpcaitech/ColossalAI
72 lines
1.8 KiB
Markdown
72 lines
1.8 KiB
Markdown
|
# Rank Recorder
|
||
|
This is a useful tool for recording the execution of selected procedures on each rank. The records of every rank are dumped into a JSON file when the multi-process program ends, and the JSON file is easy to parse and visualise.
|
||
|
|
||
|
Before using the tool, make sure that `dist.is_initialized()` returns `True` before the program exits.
|
||
|
|
||
|
## Usage
|
||
|
|
||
|
It is very simple:
|
||
|
|
||
|
```python
|
||
|
from colossalai.utils.rank_recorder import recorder
|
||
|
|
||
|
...
|
||
|
...
|
||
|
|
||
|
with recorder(record_name, current_rank) as r:
|
||
|
"""procedure to record
|
||
|
"""
|
||
|
|
||
|
```
|
||
|
|
||
|
## Example
|
||
|
This demo shows kernel selection in CUDA and visualises the time cost of several procedures on each rank.
|
||
|
|
||
|
```python
|
||
|
import time
|
||
|
import os
|
||
|
import logging
|
||
|
logging.disable(logging.INFO)
|
||
|
|
||
|
import torch
|
||
|
import torch.distributed as dist
|
||
|
import torch.multiprocessing as mp
|
||
|
|
||
|
from colossalai.utils.rank_recorder import recorder
|
||
|
|
||
|
|
||
|
WORLD_SIZE = 4
|
||
|
|
||
|
# config the export image here
|
||
|
# If you want to dive into the detail, format 'svg' is recommended
|
||
|
recorder.export_format = 'png'
|
||
|
recorder.export_name = 'kernel_select'
|
||
|
recorder.dpi = 500
|
||
|
|
||
|
def calc(x, y):
|
||
|
a = torch.randn(x, y).cuda()
|
||
|
b = torch.randn(x, y).cuda()
|
||
|
c = sum(a * b)
|
||
|
return c
|
||
|
|
||
|
def worker(rank):
|
||
|
os.environ['MASTER_ADDR'] = 'localhost'
|
||
|
os.environ['MASTER_PORT'] = '29020'
|
||
|
dist.init_process_group(backend='nccl', world_size=WORLD_SIZE, rank=rank)
|
||
|
print(dist.get_rank(), "enter")
|
||
|
time.sleep(0.1 * rank)
|
||
|
|
||
|
with recorder("calc_1(x100)", rank) as r:
|
||
|
calc(100, 100)
|
||
|
|
||
|
with recorder("calc_2(x400)", rank) as r:
|
||
|
calc(400, 400)
|
||
|
|
||
|
with recorder("calc_2(x200)", rank) as r:
|
||
|
calc(200, 200)
|
||
|
|
||
|
if __name__ == "__main__":
|
||
|
mp.spawn(worker, nprocs=WORLD_SIZE)
|
||
|
```
|
||
|
|
||
|
Run the script directly, and you will get `kernel_select.json` and `kernel_select.png` in your current folder.
|