feat: bind numa

pull/320/head
li126com 2023-09-22 15:32:52 +08:00
parent ea3d333144
commit cde4c57ce9
4 changed files with 18 additions and 11 deletions

View File

@@ -133,6 +133,7 @@ RUN /opt/conda/bin/pip --no-cache-dir install \
botocore \
torch-scatter \
pyecharts \
py-libnuma \
-f https://data.pyg.org/whl/torch-${PYTORCH_VERSION}+cu117.html \
&& /opt/conda/bin/pip --no-cache-dir install \
--extra-index-url https://download.pytorch.org/whl/cu117 \

View File

@@ -114,6 +114,7 @@ RUN /opt/conda/bin/pip --no-cache-dir install \
botocore \
torch-scatter \
pyecharts \
py-libnuma \
-f https://data.pyg.org/whl/torch-${PYTORCH_VERSION}+cu117.html \
&& /opt/conda/bin/pip --no-cache-dir install \
--extra-index-url https://download.pytorch.org/whl/cu117 \

View File

@@ -6,9 +6,7 @@ import os
from pathlib import Path
from typing import Dict, Union
import numa
import torch
from pynvml.smi import nvidia_smi
from internlm.core.context import Config
from internlm.core.context import global_context as gpc
@@ -480,6 +478,17 @@ def get_config_value(config, key, defalut):
def try_bind_numa(launcher):
try:
import numa
from numa import memory, schedule
from pynvml.smi import nvidia_smi
except (AttributeError, ImportError):
logger.info(
"Try bind numa failed! Package import error, if numa is not installed, "
"please implement: pip install --upgrade py-libnuma"
)
return
# get numa node number
numa_node_num = numa.info.get_max_node() + 1
# get total gpu number of current node
@@ -509,12 +518,7 @@ def try_bind_numa(launcher):
per_numa = total_GPU_per_node // numa_node_num
numa_id = local_rank // per_numa
try:
from numa import memory, schedule
schedule.run_on_nodes(numa_id)
memory.set_membind_nodes(numa_id)
except (AttributeError, ImportError):
return
else:
print(f"Rank: {global_rank} success bind process to numa node: {numa_id}", flush=True)
# bind numa node
schedule.run_on_nodes(numa_id)
memory.set_membind_nodes(numa_id)
logger.info(f"Rank: {global_rank} success bind process to numa node: {numa_id}")

View File

@@ -13,4 +13,5 @@ boto3
botocore
torch-scatter
pyecharts
py-libnuma
-f https://data.pyg.org/whl/torch-1.13.1+cu117.html