From cde4c57ce98dc8ee7147b34a96a450b2de49f5c8 Mon Sep 17 00:00:00 2001 From: li126com Date: Fri, 22 Sep 2023 15:32:52 +0800 Subject: [PATCH] feat: bind numa --- experiment/Dockerfile-centos | 1 + experiment/Dockerfile-ubuntu | 1 + internlm/initialize/launch.py | 26 +++++++++++++++----------- requirements/runtime.txt | 1 + 4 files changed, 18 insertions(+), 11 deletions(-) diff --git a/experiment/Dockerfile-centos b/experiment/Dockerfile-centos index 31ffc19..71c63e4 100644 --- a/experiment/Dockerfile-centos +++ b/experiment/Dockerfile-centos @@ -133,6 +133,7 @@ RUN /opt/conda/bin/pip --no-cache-dir install \ botocore \ torch-scatter \ pyecharts \ + py-libnuma \ -f https://data.pyg.org/whl/torch-${PYTORCH_VERSION}+cu117.html \ && /opt/conda/bin/pip --no-cache-dir install \ --extra-index-url https://download.pytorch.org/whl/cu117 \ diff --git a/experiment/Dockerfile-ubuntu b/experiment/Dockerfile-ubuntu index 230a3b5..0675c2c 100644 --- a/experiment/Dockerfile-ubuntu +++ b/experiment/Dockerfile-ubuntu @@ -114,6 +114,7 @@ RUN /opt/conda/bin/pip --no-cache-dir install \ botocore \ torch-scatter \ pyecharts \ + py-libnuma \ -f https://data.pyg.org/whl/torch-${PYTORCH_VERSION}+cu117.html \ && /opt/conda/bin/pip --no-cache-dir install \ --extra-index-url https://download.pytorch.org/whl/cu117 \ diff --git a/internlm/initialize/launch.py b/internlm/initialize/launch.py index e2be1fe..ee0cbad 100644 --- a/internlm/initialize/launch.py +++ b/internlm/initialize/launch.py @@ -6,9 +6,7 @@ import os from pathlib import Path from typing import Dict, Union -import numa import torch -from pynvml.smi import nvidia_smi from internlm.core.context import Config from internlm.core.context import global_context as gpc @@ -480,6 +478,17 @@ def get_config_value(config, key, defalut): def try_bind_numa(launcher): + try: + import numa + from numa import memory, schedule + from pynvml.smi import nvidia_smi + except (AttributeError, ImportError): + logger.info( + "Try bind numa failed! Package import error, if numa is not installed, " + "please implement: pip install --upgrade py-libnuma" + ) + return + # get numa node number numa_node_num = numa.info.get_max_node() + 1 # get total gpu number of current node @@ -509,12 +518,7 @@ def try_bind_numa(launcher): per_numa = total_GPU_per_node // numa_node_num numa_id = local_rank // per_numa - try: - from numa import memory, schedule - - schedule.run_on_nodes(numa_id) - memory.set_membind_nodes(numa_id) - except (AttributeError, ImportError): - return - else: - print(f"Rank: {global_rank} success bind process to numa node: {numa_id}", flush=True) + # bind numa node + schedule.run_on_nodes(numa_id) + memory.set_membind_nodes(numa_id) + logger.info(f"Rank: {global_rank} success bind process to numa node: {numa_id}") diff --git a/requirements/runtime.txt b/requirements/runtime.txt index f46d7ad..2fbef4a 100644 --- a/requirements/runtime.txt +++ b/requirements/runtime.txt @@ -13,4 +13,5 @@ boto3 botocore torch-scatter pyecharts +py-libnuma -f https://data.pyg.org/whl/torch-1.13.1+cu117.html \ No newline at end of file