mirror of https://github.com/InternLM/InternLM
feat: bind numa
parent
ea3d333144
commit
cde4c57ce9
|
@ -133,6 +133,7 @@ RUN /opt/conda/bin/pip --no-cache-dir install \
|
||||||
botocore \
|
botocore \
|
||||||
torch-scatter \
|
torch-scatter \
|
||||||
pyecharts \
|
pyecharts \
|
||||||
|
py-libnuma \
|
||||||
-f https://data.pyg.org/whl/torch-${PYTORCH_VERSION}+cu117.html \
|
-f https://data.pyg.org/whl/torch-${PYTORCH_VERSION}+cu117.html \
|
||||||
&& /opt/conda/bin/pip --no-cache-dir install \
|
&& /opt/conda/bin/pip --no-cache-dir install \
|
||||||
--extra-index-url https://download.pytorch.org/whl/cu117 \
|
--extra-index-url https://download.pytorch.org/whl/cu117 \
|
||||||
|
|
|
@ -114,6 +114,7 @@ RUN /opt/conda/bin/pip --no-cache-dir install \
|
||||||
botocore \
|
botocore \
|
||||||
torch-scatter \
|
torch-scatter \
|
||||||
pyecharts \
|
pyecharts \
|
||||||
|
py-libnuma \
|
||||||
-f https://data.pyg.org/whl/torch-${PYTORCH_VERSION}+cu117.html \
|
-f https://data.pyg.org/whl/torch-${PYTORCH_VERSION}+cu117.html \
|
||||||
&& /opt/conda/bin/pip --no-cache-dir install \
|
&& /opt/conda/bin/pip --no-cache-dir install \
|
||||||
--extra-index-url https://download.pytorch.org/whl/cu117 \
|
--extra-index-url https://download.pytorch.org/whl/cu117 \
|
||||||
|
|
|
@ -6,9 +6,7 @@ import os
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Dict, Union
|
from typing import Dict, Union
|
||||||
|
|
||||||
import numa
|
|
||||||
import torch
|
import torch
|
||||||
from pynvml.smi import nvidia_smi
|
|
||||||
|
|
||||||
from internlm.core.context import Config
|
from internlm.core.context import Config
|
||||||
from internlm.core.context import global_context as gpc
|
from internlm.core.context import global_context as gpc
|
||||||
|
@ -480,6 +478,17 @@ def get_config_value(config, key, defalut):
|
||||||
|
|
||||||
|
|
||||||
def try_bind_numa(launcher):
|
def try_bind_numa(launcher):
|
||||||
|
try:
|
||||||
|
import numa
|
||||||
|
from numa import memory, schedule
|
||||||
|
from pynvml.smi import nvidia_smi
|
||||||
|
except (AttributeError, ImportError):
|
||||||
|
logger.info(
|
||||||
|
"Try bind numa failed! Package import error, if numa is not installed, "
|
||||||
|
"please implement: pip install --upgrade py-libnuma"
|
||||||
|
)
|
||||||
|
return
|
||||||
|
|
||||||
# get numa node number
|
# get numa node number
|
||||||
numa_node_num = numa.info.get_max_node() + 1
|
numa_node_num = numa.info.get_max_node() + 1
|
||||||
# get total gpu number of current node
|
# get total gpu number of current node
|
||||||
|
@ -509,12 +518,7 @@ def try_bind_numa(launcher):
|
||||||
per_numa = total_GPU_per_node // numa_node_num
|
per_numa = total_GPU_per_node // numa_node_num
|
||||||
numa_id = local_rank // per_numa
|
numa_id = local_rank // per_numa
|
||||||
|
|
||||||
try:
|
# bind numa node
|
||||||
from numa import memory, schedule
|
schedule.run_on_nodes(numa_id)
|
||||||
|
memory.set_membind_nodes(numa_id)
|
||||||
schedule.run_on_nodes(numa_id)
|
logger.info(f"Rank: {global_rank} success bind process to numa node: {numa_id}")
|
||||||
memory.set_membind_nodes(numa_id)
|
|
||||||
except (AttributeError, ImportError):
|
|
||||||
return
|
|
||||||
else:
|
|
||||||
print(f"Rank: {global_rank} success bind process to numa node: {numa_id}", flush=True)
|
|
||||||
|
|
|
@ -13,4 +13,5 @@ boto3
|
||||||
botocore
|
botocore
|
||||||
torch-scatter
|
torch-scatter
|
||||||
pyecharts
|
pyecharts
|
||||||
|
py-libnuma
|
||||||
-f https://data.pyg.org/whl/torch-1.13.1+cu117.html
|
-f https://data.pyg.org/whl/torch-1.13.1+cu117.html
|
Loading…
Reference in New Issue