Add multi-gpu deployment

dev_multi_gpu
duzx16 2023-04-16 21:37:33 +08:00
parent 90f2e47f54
commit 4f95c09a3b
5 changed files with 38 additions and 51 deletions


@@ -176,28 +176,26 @@ model = AutoModel.from_pretrained("your local path", trust_remote_code=True).hal
Then you can use GPU-accelerated model inference on Mac.
### Multi-GPU Deployment
```shell
pip install accelerate
```
If you have multiple GPUs but no single GPU has enough memory to hold the full model, you can split the model across multiple GPUs. First install accelerate: `pip install accelerate`, then load the model as follows:
```python
from utils import load_model_and_tokenizer
model, tokenizer = load_model_and_tokenizer("your local path", num_gpus=2)
from utils import load_model_on_gpus
model = load_model_on_gpus("THUDM/chatglm-6b", num_gpus=2)
```
This deploys the model onto multiple GPUs for inference.
This deploys the model onto two GPUs for inference. You can change `num_gpus` to the number of GPUs you want to use. The model is split evenly by default; you can also pass a `device_map` argument to specify the split yourself.
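If you want to tweak the split, one option is to start from the map produced by `auto_configure_device_map` in `utils.py` and adjust it before passing it in. This is only an illustrative sketch; the overridden key name is hypothetical and depends on the model's actual module names:
```python
from utils import auto_configure_device_map, load_model_on_gpus

# Start from the evenly balanced map and override individual entries,
# e.g. to keep one more transformer layer on GPU 0 (key name is hypothetical).
device_map = auto_configure_device_map(num_gpus=2)
device_map["transformer.layers.14"] = 0
model = load_model_on_gpus("THUDM/chatglm-6b", num_gpus=2, device_map=device_map)
```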
## Parameter-efficient Tuning
Parameter-efficient tuning based on [P-tuning v2](https://github.com/THUDM/P-tuning-v2). See [ptuning/README.md](ptuning/README.md) for details on usage.
## Updates
**[2023/04/16]** Added the INT8-quantized model [ChatGLM-6B-INT8](https://huggingface.co/THUDM/chatglm-6b-int8). Added multi-GPU deployment (thanks to [@Cherrysaber](https://github.com/Cherrysaber)).
**[2023/04/06]** Improved the web demo interface (thanks to [@tuteng0915](https://github.com/tuteng0915)). Removed the image tokens from the embedding to reduce GPU memory usage (requires updating the model files `pytorch_model-00001-of-00008.bin` and `pytorch_model-00008-of-00008.bin`; thanks to [@silverriver](https://github.com/silverriver) for the idea). Removed the dependency on `icetk` (requires updating the model file `ice_text.model`).
**[2023/03/31]** Added a parameter-efficient tuning implementation based on [P-Tuning-v2](https://github.com/THUDM/P-tuning-v2); with INT4 quantization, fine-tuning needs as little as 7GB of GPU memory. See [Parameter-efficient Tuning](ptuning/README.md) for details.
**[2023/03/23]** Added API deployment (thanks to [@LemonQu-GIT](https://github.com/LemonQu-GIT)). Added the embedding-quantized model [ChatGLM-6B-INT4-QE](https://huggingface.co/THUDM/chatglm-6b-int4-qe). Added support for GPU acceleration on Macs with Apple Silicon.
**[2023/03/19]** Added the streaming output interface `stream_chat`, now used in the web and command-line demos. Fixed Chinese punctuation in the output. Added the quantized model [ChatGLM-6B-INT4](https://huggingface.co/THUDM/chatglm-6b-int4)
**[2023/03/19]** Added the streaming output interface `stream_chat`, now used in the web and command-line demos. Fixed Chinese punctuation in the output. Added the INT4-quantized model [ChatGLM-6B-INT4](https://huggingface.co/THUDM/chatglm-6b-int4)
## ChatGLM-6B Examples


@@ -175,22 +175,20 @@ model = AutoModel.from_pretrained("your local path", trust_remote_code=True).hal
```
Then you can use GPU-accelerated model inference on Mac.
### Multi-GPU Deployment
If you have multiple GPUs but no single GPU has enough memory to hold the entire model, you can split the model across multiple GPUs.
First, install accelerate: `pip install accelerate`, then load the model as follows:
```python
from utils import load_model_on_gpus
model = load_model_on_gpus("THUDM/chatglm-6b", num_gpus=2)
```
This will deploy the model onto two GPUs for inference. You can change `num_gpus` to the number of GPUs you want to use. By default, the model is split evenly, but you can also specify the `device_map` parameter to customize the splitting.
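As a quick sanity check that the split took effect, you can inspect which device individual layers landed on. The attribute path below assumes ChatGLM-6B's usual module layout (`model.transformer.layers`), so treat it as a sketch rather than guaranteed API:
```python
from utils import load_model_on_gpus

model = load_model_on_gpus("THUDM/chatglm-6b", num_gpus=2)
# With the default even split across two GPUs, the first and last
# transformer layers should report different CUDA devices.
print(next(model.transformer.layers[0].parameters()).device)   # e.g. cuda:0
print(next(model.transformer.layers[-1].parameters()).device)  # e.g. cuda:1
```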
## Parameter-efficient Tuning
Parameter-efficient tuning based on [P-tuning v2](https://github.com/THUDM/P-tuning-v2). See [ptuning/README.md](ptuning/README.md) for details on how to use it.
### Multi-GPU Deployment
```shell
pip install accelerate
```
```python
from utils import load_model_and_tokenizer
model, tokenizer = load_model_and_tokenizer("your local path", num_gpus=2)
```
## ChatGLM-6B Examples
The following are some Chinese examples with `web_demo.py`. You are welcome to explore more possibilities with ChatGLM-6B.


@@ -2,8 +2,7 @@ import os
from typing import Dict, Tuple, Union, Optional
from torch.nn import Module
from transformers import AutoModel, AutoTokenizer
from transformers.tokenization_utils import PreTrainedTokenizer
from transformers import AutoModel
def auto_configure_device_map(num_gpus: int) -> Dict[str, int]:
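The hunk above ends at the signature of `auto_configure_device_map`; its body is not shown in this diff. For orientation, a helper of this shape returns a mapping from module names to GPU indices. The sketch below is an assumption about how such a map could be built for ChatGLM-6B (the module names and 28-layer count are not taken from this diff):
```python
def sketch_device_map(num_gpus: int) -> Dict[str, int]:
    # Hypothetical illustration, not the repository's implementation:
    # keep embeddings, final layernorm and lm_head on GPU 0, then assign the
    # 28 transformer layers to GPUs in contiguous blocks.
    num_layers = 28
    device_map = {"transformer.word_embeddings": 0,
                  "transformer.final_layernorm": 0,
                  "lm_head": 0}
    per_gpu = (num_layers + num_gpus - 1) // num_gpus
    for i in range(num_layers):
        device_map[f"transformer.layers.{i}"] = min(i // per_gpu, num_gpus - 1)
    return device_map
```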
@@ -37,32 +36,21 @@ def auto_configure_device_map(num_gpus: int) -> Dict[str, int]:
def load_model_on_gpus(checkpoint_path: Union[str, os.PathLike], num_gpus: int = 2,
                       multi_gpu_model_cache_dir: Union[str, os.PathLike] = "./temp_model_dir",
                       device_map: Optional[Dict[str, int]] = None,
                       tokenizer: Optional[PreTrainedTokenizer] = None, **kwargs) -> Module:
    from accelerate import load_checkpoint_and_dispatch
                       device_map: Optional[Dict[str, int]] = None, **kwargs) -> Module:
    if num_gpus < 2 and device_map is None:
        model = AutoModel.from_pretrained(checkpoint_path, trust_remote_code=True, **kwargs).half().cuda()
    else:
        from accelerate import load_checkpoint_and_dispatch
    model = AutoModel.from_pretrained(checkpoint_path, trust_remote_code=True, **kwargs)
    model = model.eval()
        model = AutoModel.from_pretrained(checkpoint_path, trust_remote_code=True, **kwargs)
        model = model.eval()
    if device_map is None:
        device_map = auto_configure_device_map(num_gpus)
        if device_map is None:
            device_map = auto_configure_device_map(num_gpus)
    model = load_checkpoint_and_dispatch(
        model, checkpoint_path, device_map=device_map, offload_folder="offload", offload_state_dict=True).half()
        model = load_checkpoint_and_dispatch(
            model, checkpoint_path, device_map=device_map, offload_folder="offload", offload_state_dict=True).half()
    return model
def load_model_and_tokenizer(checkpoint_path: Union[str, os.PathLike], num_gpus: int = 1,
                             multi_gpu_model_cache_dir: Union[str, os.PathLike] = "./temp_model_dir",
                             **kwargs) -> Tuple[Module, PreTrainedTokenizer]:
    tokenizer = AutoTokenizer.from_pretrained(checkpoint_path, trust_remote_code=True, **kwargs)
    if num_gpus < 2:
        model = AutoModel.from_pretrained(checkpoint_path, trust_remote_code=True, **kwargs).half().cuda()
        model = model.eval()
    else:
        model = load_model_on_gpus(checkpoint_path, num_gpus=num_gpus,
                                   multi_gpu_model_cache_dir=multi_gpu_model_cache_dir,
                                   tokenizer=tokenizer, **kwargs)
    return model, tokenizer
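With `load_model_and_tokenizer` removed, callers now load the tokenizer themselves and pair it with `load_model_on_gpus`, as the demo scripts below do. A minimal usage sketch (the `chat` call follows the usual ChatGLM-6B interface):
```python
from transformers import AutoTokenizer
from utils import load_model_on_gpus

tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)
model = load_model_on_gpus("THUDM/chatglm-6b", num_gpus=2)
model = model.eval()

# history carries previous turns of the conversation
response, history = model.chat(tokenizer, "你好", history=[])
print(response)
```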


@@ -1,9 +1,10 @@
from transformers import AutoModel, AutoTokenizer
import gradio as gr
import mdtex2html
from utils import load_model_and_tokenizer
model, tokenizer = load_model_and_tokenizer("THUDM/chatglm-6b", num_gpus=1)
tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)
model = AutoModel.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True).half().cuda()
model = model.eval()
"""Override Chatbot.postprocess"""


@@ -1,7 +1,7 @@
from transformers import AutoModel, AutoTokenizer
import streamlit as st
from streamlit_chat import message
from utils import load_model_and_tokenizer
st.set_page_config(
page_title="ChatGLM-6b 演示",
@@ -11,7 +11,9 @@ st.set_page_config(
@st.cache_resource
def get_model():
    model, tokenizer = load_model_and_tokenizer("THUDM/chatglm-6b", num_gpus=1)
    tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)
    model = AutoModel.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True).half().cuda()
    model = model.eval()
    return tokenizer, model