mirror of https://github.com/InternLM/InternLM
feat(numa): bind numa if possible (#320)
* feat:add numa
* feat:add bind numa
* feat:add bind numa
* feat:add bind numa
* feat: bind numa
* feat: bind numa
* feat: add numa
* feat:add numa
* feat:add numa
* try_bind_numa should not raise exception

---------

Co-authored-by: 877825076@qq.com <877825076@qq.com>
parent 9284303a6d
commit c1e30cff2c
@@ -133,6 +133,7 @@ RUN /opt/conda/bin/pip --no-cache-dir install \
    botocore \
    torch-scatter \
    pyecharts \
    py-libnuma \
    -f https://data.pyg.org/whl/torch-${PYTORCH_VERSION}+cu117.html \
    && /opt/conda/bin/pip --no-cache-dir install \
    --extra-index-url https://download.pytorch.org/whl/cu117 \
@@ -114,6 +114,7 @@ RUN /opt/conda/bin/pip --no-cache-dir install \
    botocore \
    torch-scatter \
    pyecharts \
    py-libnuma \
    -f https://data.pyg.org/whl/torch-${PYTORCH_VERSION}+cu117.html \
    && /opt/conda/bin/pip --no-cache-dir install \
    --extra-index-url https://download.pytorch.org/whl/cu117 \
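Both Dockerfile hunks above add py-libnuma to the image. A small sanity-check sketch (not part of the diff) that mirrors the import guard added later in this commit; if any of these imports fails, NUMA binding is simply skipped at launch time:

# Sketch only: mirrors the try/except guard added to the launch module below.
try:
    import numa  # provided by py-libnuma
    from numa import memory, schedule
    from pynvml.smi import nvidia_smi
except (AttributeError, ImportError) as exc:
    print(f"NUMA binding unavailable: {exc}")
else:
    print(f"NUMA binding available, max NUMA node id: {numa.info.get_max_node()}")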
@@ -4,6 +4,7 @@ from .launch import (
    initialize_distributed_env,
    launch_from_slurm,
    launch_from_torch,
    try_bind_numa,
)

__all__ = [
@@ -12,4 +13,5 @@ __all__ = [
    "launch_from_slurm",
    "launch_from_torch",
    "initialize_distributed_env",
    "try_bind_numa",
]
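With the re-export above, try_bind_numa can be imported from the package root. A minimal standalone sketch of calling it (illustration only: RANK, WORLD_SIZE, and LOCAL_RANK are assumed torchrun-style variables, and the launchers changed below normally make this call for you):

# Illustration only; launch_from_slurm / launch_from_torch call try_bind_numa themselves.
import os

from internlm.initialize import try_bind_numa

try_bind_numa(
    global_rank=int(os.environ.get("RANK", "0")),
    world_size=int(os.environ.get("WORLD_SIZE", "1")),
    local_rank=int(os.environ.get("LOCAL_RANK", "0")),
)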
@@ -16,6 +16,16 @@ from internlm.utils.common import get_master_node
from internlm.utils.logger import get_logger
from internlm.utils.timeout import llm_timeout

# check package
try:
    import numa
    from numa import memory, schedule
    from pynvml.smi import nvidia_smi
except (AttributeError, ImportError):
    get_numa = False
else:
    get_numa = True

logger = get_logger(__file__)

@@ -385,6 +395,8 @@ def launch_from_slurm(
    except KeyError as e:
        raise RuntimeError(f"Could not find {e} in the SLURM environment")

    try_bind_numa(global_rank=rank, world_size=world_size)

    launch(
        config=config,
        rank=rank,
@@ -418,6 +430,8 @@ def launch_from_torch(
    except KeyError as e:
        raise RuntimeError(f"Could not find {e} in the torch environment")

    try_bind_numa(global_rank=rank, world_size=world_size, local_rank=local_rank)

    launch(
        config=config,
        local_rank=local_rank,
@@ -447,6 +461,7 @@ def initialize_distributed_env(
        master_port (str): The master port for distributed training. 8888 by default.
        seed (int, optional): Specified random seed for every process. 1024 by default.
    """

    # close automatic garbage collection
    gc.disable()

@@ -484,3 +499,45 @@ def get_config_value(config, key, defalut):
    except KeyError:
        value = defalut
    return value


def try_bind_numa(global_rank, world_size, local_rank=None):
    # early return if the numa module is not available
    if not get_numa:
        if global_rank == 0:
            logger.info(
                "Try bind numa failed! Package import error, if numa is not installed, "
                "please run: pip install --upgrade py-libnuma, Ref: https://pypi.org/project/py-libnuma/"
            )
        return

    # get the number of NUMA nodes
    try:
        numa_node_num = numa.info.get_max_node() + 1
        # get the total number of GPUs on the current node
        nvsmi = nvidia_smi.getInstance()
        total_GPU_per_node = len(nvsmi.DeviceQuery("memory.total")["gpu"])

        # return if total_GPU_per_node is not larger than numa_node_num, or is not divisible by it
        if total_GPU_per_node <= numa_node_num:
            return
        if total_GPU_per_node % numa_node_num != 0:
            return
        # return if the number of processes is smaller than the GPU count of one node
        if world_size < total_GPU_per_node:
            return

        if local_rank is None:
            devices_per_node = torch.cuda.device_count()
            local_rank = global_rank % devices_per_node

        # compute the NUMA node id for each local rank
        per_numa = total_GPU_per_node // numa_node_num
        numa_id = local_rank // per_numa

        # bind the process and its memory allocations to the NUMA node
        schedule.run_on_nodes(numa_id)
        memory.set_membind_nodes(numa_id)
    except Exception:
        return  # try_bind_numa should not raise an exception
    else:
        logger.info(f"Rank: {global_rank} successfully bound process to NUMA node: {numa_id}")
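For intuition, the local-rank-to-NUMA-node mapping computed above works out as follows on an assumed topology of 8 GPUs and 2 NUMA nodes per host (the numbers are illustrative, not taken from this diff):

# Illustrative only: assumes 8 GPUs and 2 NUMA nodes per host.
total_gpu_per_node = 8
numa_node_num = 2
per_numa = total_gpu_per_node // numa_node_num  # 4 local ranks share each NUMA node

for local_rank in range(total_gpu_per_node):
    numa_id = local_rank // per_numa
    print(f"local_rank {local_rank} -> NUMA node {numa_id}")
# local ranks 0-3 bind to NUMA node 0; local ranks 4-7 bind to NUMA node 1.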
@@ -13,4 +13,5 @@ boto3
botocore
torch-scatter
pyecharts
py-libnuma
-f https://data.pyg.org/whl/torch-1.13.1+cu117.html