From cde4c57ce98dc8ee7147b34a96a450b2de49f5c8 Mon Sep 17 00:00:00 2001
From: li126com <li126com2@126.com>
Date: Fri, 22 Sep 2023 15:32:52 +0800
Subject: [PATCH] feat: bind numa

---
 experiment/Dockerfile-centos  |  1 +
 experiment/Dockerfile-ubuntu  |  1 +
 internlm/initialize/launch.py | 26 +++++++++++++++-----------
 requirements/runtime.txt      |  1 +
 4 files changed, 18 insertions(+), 11 deletions(-)

diff --git a/experiment/Dockerfile-centos b/experiment/Dockerfile-centos
index 31ffc19..71c63e4 100644
--- a/experiment/Dockerfile-centos
+++ b/experiment/Dockerfile-centos
@@ -133,6 +133,7 @@ RUN /opt/conda/bin/pip --no-cache-dir install \
     botocore \
     torch-scatter \
     pyecharts \
+    py-libnuma \
     -f https://data.pyg.org/whl/torch-${PYTORCH_VERSION}+cu117.html \
     && /opt/conda/bin/pip --no-cache-dir install \
     --extra-index-url https://download.pytorch.org/whl/cu117 \
diff --git a/experiment/Dockerfile-ubuntu b/experiment/Dockerfile-ubuntu
index 230a3b5..0675c2c 100644
--- a/experiment/Dockerfile-ubuntu
+++ b/experiment/Dockerfile-ubuntu
@@ -114,6 +114,7 @@ RUN /opt/conda/bin/pip --no-cache-dir install \
     botocore \
     torch-scatter \
     pyecharts \
+    py-libnuma \
     -f https://data.pyg.org/whl/torch-${PYTORCH_VERSION}+cu117.html \
     && /opt/conda/bin/pip --no-cache-dir install \
     --extra-index-url https://download.pytorch.org/whl/cu117 \
diff --git a/internlm/initialize/launch.py b/internlm/initialize/launch.py
index e2be1fe..ee0cbad 100644
--- a/internlm/initialize/launch.py
+++ b/internlm/initialize/launch.py
@@ -6,9 +6,7 @@ import os
 from pathlib import Path
 from typing import Dict, Union
 
-import numa
 import torch
-from pynvml.smi import nvidia_smi
 
 from internlm.core.context import Config
 from internlm.core.context import global_context as gpc
@@ -480,6 +478,17 @@ def get_config_value(config, key, defalut):
 
 
 def try_bind_numa(launcher):
+    try:
+        import numa
+        from numa import memory, schedule
+        from pynvml.smi import nvidia_smi
+    except (AttributeError, ImportError):
+        logger.info(
+            "Try bind numa failed! Package import error, if numa is not installed, "
+            "please implement: pip install --upgrade py-libnuma"
+        )
+        return
+
     # get numa node number
     numa_node_num = numa.info.get_max_node() + 1
     # get total gpu number of current node
@@ -509,12 +518,7 @@ def try_bind_numa(launcher):
     per_numa = total_GPU_per_node // numa_node_num
     numa_id = local_rank // per_numa
 
-    try:
-        from numa import memory, schedule
-
-        schedule.run_on_nodes(numa_id)
-        memory.set_membind_nodes(numa_id)
-    except (AttributeError, ImportError):
-        return
-    else:
-        print(f"Rank: {global_rank} success bind process to numa node: {numa_id}", flush=True)
+    # bind numa node
+    schedule.run_on_nodes(numa_id)
+    memory.set_membind_nodes(numa_id)
+    logger.info(f"Rank: {global_rank} success bind process to numa node: {numa_id}")
diff --git a/requirements/runtime.txt b/requirements/runtime.txt
index f46d7ad..2fbef4a 100644
--- a/requirements/runtime.txt
+++ b/requirements/runtime.txt
@@ -13,4 +13,5 @@ boto3
 botocore
 torch-scatter
 pyecharts
+py-libnuma
 -f https://data.pyg.org/whl/torch-1.13.1+cu117.html
\ No newline at end of file