mirror of https://github.com/THUDM/ChatGLM-6B
Add multi-gpu deployment
parent 90f2e47f54
commit 4f95c09a3b
README.md — 16 changed lines

````diff
@@ -176,28 +176,26 @@ model = AutoModel.from_pretrained("your local path", trust_remote_code=True).hal
 Then you can use GPU-accelerated model inference on Mac.
 
 ### Multi-GPU Deployment
-```shell
-pip install accelerate
-```
+If you have multiple GPUs, but the memory size of each GPU is not sufficient to accommodate the entire model, you can split the model across multiple GPUs. First, install accelerate: `pip install accelerate`, and then load the model using the following method:
 
 ```python
-from utils import load_model_and_tokenizer
-model, tokenizer = load_model_and_tokenizer("your local path", num_gpus=2)
+from utils import load_model_on_gpus
+model = load_model_on_gpus("THUDM/chatglm-6b", num_gpus=2)
 ```
-This will deploy the model onto multiple GPUs for inference.
+This will deploy the model onto two GPUs for inference. You can change `num_gpus` to the number of GPUs you want to use. By default, the model is split evenly, but you can also specify the `device_map` parameter to customize the splitting.
 
 ## Parameter-efficient Tuning
 Parameter-efficient tuning based on [P-tuning v2](https://github.com/THUDM/P-tuning-v2). See [ptuning/README.md](ptuning/README.md) for details on how to use it.
 
 ## Updates
+**[2023/04/16]** Added the INT8-quantized model [ChatGLM-6B-INT8](https://huggingface.co/THUDM/chatglm-6b-int8). Added multi-GPU deployment (thanks to [@Cherrysaber](https://github.com/Cherrysaber)).
 
 **[2023/04/06]** Improved the web demo interface (thanks to [@tuteng0915](https://github.com/tuteng0915)). Removed the image tokens from the embedding to reduce GPU memory usage (requires updating the model files `pytorch_model-00001-of-00008.bin` and `pytorch_model-00008-of-00008.bin`; thanks to [@silverriver](https://github.com/silverriver) for proposing the idea). Removed the dependency on `icetk` (requires updating the model file `ice_text.model`).
 
 **[2023/03/31]** Added a parameter-efficient tuning implementation based on [P-Tuning-v2](https://github.com/THUDM/P-tuning-v2); at the INT4 quantization level, fine-tuning needs as little as 7GB of GPU memory. See [the parameter-efficient tuning method](ptuning/README.md) for details.
 
 **[2023/03/23]** Added API deployment (thanks to [@LemonQu-GIT](https://github.com/LemonQu-GIT)). Added the embedding-quantized model [ChatGLM-6B-INT4-QE](https://huggingface.co/THUDM/chatglm-6b-int4-qe). Added support for GPU acceleration on Apple Silicon Macs.
 
-**[2023/03/19]** Added the streaming output interface `stream_chat`, already applied in the web and command-line demos. Fixed Chinese punctuation in the output. Added the quantized model [ChatGLM-6B-INT4](https://huggingface.co/THUDM/chatglm-6b-int4).
+**[2023/03/19]** Added the streaming output interface `stream_chat`, already applied in the web and command-line demos. Fixed Chinese punctuation in the output. Added the INT4-quantized model [ChatGLM-6B-INT4](https://huggingface.co/THUDM/chatglm-6b-int4).
 
 ## ChatGLM-6B Examples
````
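The new README text notes that the default split is even, but that a `device_map` can be passed to control placement yourself. A minimal sketch of what a hand-written map could look like; the module names below follow the usual ChatGLM-6B layout (word embeddings, 28 transformer layers, final layernorm, `lm_head`) and are assumptions for illustration, not taken from this commit:

```python
from utils import load_model_on_gpus

# Hypothetical split: keep the embedding, final layernorm and lm_head on GPU 0,
# put the first half of the transformer layers on GPU 0 and the rest on GPU 1.
device_map = {
    "transformer.word_embeddings": 0,   # assumed module name
    "transformer.final_layernorm": 0,   # assumed module name
    "lm_head": 0,                       # assumed module name
}
for i in range(28):  # ChatGLM-6B has 28 transformer layers
    device_map[f"transformer.layers.{i}"] = 0 if i < 14 else 1

model = load_model_on_gpus("THUDM/chatglm-6b", num_gpus=2, device_map=device_map)
```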
README_en.md — 24 changed lines

````diff
@@ -175,22 +175,20 @@ model = AutoModel.from_pretrained("your local path", trust_remote_code=True).hal
 ```
 Then you can use GPU-accelerated model inference on Mac.
 
+### Multi-GPU Deployment
+If you have multiple GPUs, but the memory size of each GPU is not sufficient to accommodate the entire model, you can split the model across multiple GPUs.
+
+First, install accelerate: `pip install accelerate`, and then load the model using the following method:
+```python
+from utils import load_model_on_gpus
+model = load_model_on_gpus("THUDM/chatglm-6b", num_gpus=2)
+```
+
+This will deploy the model onto two GPUs for inference. You can change `num_gpus` to the number of GPUs you want to use. By default, the model is split evenly, but you can also specify the `device_map` parameter to customize the splitting.
+
 ## Parameter-efficient Tuning
 Parameter-efficient tuning based on [P-tuning v2](https://github.com/THUDM/P-tuning-v2). See [ptuning/README.md](ptuning/README.md) for details on how to use it.
 
-### Multi-GPU Deployment
-```shell
-pip install accelerate
-```
-```python
-from utils import load_model_and_tokenizer
-model, tokenizer = load_model_and_tokenizer("your local path", num_gpus=2)
-```
-
 ## ChatGLM-6B Examples
 
 The following are some Chinese examples with `web_demo.py`. Welcome to explore more possibility with ChatGLM-6B.
````
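One thing the README section leaves implicit is that `load_model_on_gpus` returns only the model, so the tokenizer has to be loaded separately (as the demo scripts in this commit now do). A minimal end-to-end sketch; the `model.chat` call is the usual ChatGLM-6B inference API and is assumed here, it is not part of this diff:

```python
from transformers import AutoTokenizer
from utils import load_model_on_gpus

# Load the tokenizer separately; load_model_on_gpus no longer returns it.
tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)
model = load_model_on_gpus("THUDM/chatglm-6b", num_gpus=2)
model = model.eval()

# Assumed usage of the standard ChatGLM-6B chat interface.
response, history = model.chat(tokenizer, "你好", history=[])
print(response)
```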
utils.py — 22 changed lines

````diff
@@ -2,8 +2,7 @@ import os
 from typing import Dict, Tuple, Union, Optional
 
 from torch.nn import Module
-from transformers import AutoModel, AutoTokenizer
-from transformers.tokenization_utils import PreTrainedTokenizer
+from transformers import AutoModel
 
 
 def auto_configure_device_map(num_gpus: int) -> Dict[str, int]:
@@ -37,9 +36,10 @@ def auto_configure_device_map(num_gpus: int) -> Dict[str, int]:
 def load_model_on_gpus(checkpoint_path: Union[str, os.PathLike], num_gpus: int = 2,
-                       multi_gpu_model_cache_dir: Union[str, os.PathLike] = "./temp_model_dir",
-                       device_map: Optional[Dict[str, int]] = None,
-                       tokenizer: Optional[PreTrainedTokenizer] = None, **kwargs) -> Module:
+                       device_map: Optional[Dict[str, int]] = None, **kwargs) -> Module:
+    if num_gpus < 2 and device_map is None:
+        model = AutoModel.from_pretrained(checkpoint_path, trust_remote_code=True, **kwargs).half().cuda()
+    else:
     from accelerate import load_checkpoint_and_dispatch
 
     model = AutoModel.from_pretrained(checkpoint_path, trust_remote_code=True, **kwargs)
@@ -54,15 +54,3 @@ def load_model_on_gpus(checkpoint_path: Union[str, os.PathLike], num_gpus: int =
     return model
 
 
-def load_model_and_tokenizer(checkpoint_path: Union[str, os.PathLike], num_gpus: int = 1,
-                             multi_gpu_model_cache_dir: Union[str, os.PathLike] = "./temp_model_dir",
-                             **kwargs) -> Tuple[Module, PreTrainedTokenizer]:
-    tokenizer = AutoTokenizer.from_pretrained(checkpoint_path, trust_remote_code=True, **kwargs)
-    if num_gpus < 2:
-        model = AutoModel.from_pretrained(checkpoint_path, trust_remote_code=True, **kwargs).half().cuda()
-        model = model.eval()
-    else:
-        model = load_model_on_gpus(checkpoint_path, num_gpus=num_gpus,
-                                   multi_gpu_model_cache_dir=multi_gpu_model_cache_dir,
-                                   tokenizer=tokenizer, **kwargs)
-    return model, tokenizer
````
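After this change `load_model_on_gpus` falls back to plain single-GPU loading when `num_gpus < 2` and no `device_map` is given, and the tokenizer-returning helper `load_model_and_tokenizer` is removed. A minimal sketch of how a caller might combine it with `auto_configure_device_map` (both functions are defined in this file; treating the returned dict as a module-name-to-GPU-index mapping follows the accelerate convention and is an assumption, since the dict contents are not shown in this hunk):

```python
from utils import auto_configure_device_map, load_model_on_gpus

# Generate the automatic split, inspect it, and optionally adjust it before loading.
device_map = auto_configure_device_map(num_gpus=2)
print(device_map)  # assumed to map submodule names to GPU indices

model = load_model_on_gpus("THUDM/chatglm-6b", num_gpus=2, device_map=device_map)

# With num_gpus=1 and no device_map, the function now degrades to the usual
# .half().cuda() single-GPU path:
# model = load_model_on_gpus("THUDM/chatglm-6b", num_gpus=1)
```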
web_demo.py (Gradio demo)

````diff
@@ -1,9 +1,10 @@
+from transformers import AutoModel, AutoTokenizer
 import gradio as gr
 import mdtex2html
 
-from utils import load_model_and_tokenizer
-model, tokenizer = load_model_and_tokenizer("THUDM/chatglm-6b", num_gpus=1)
+tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)
+model = AutoModel.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True).half().cuda()
+model = model.eval()
 
 """Override Chatbot.postprocess"""
````
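The Gradio demo now loads the model on a single GPU directly. To run it across multiple GPUs, a plausible adaptation (a sketch, not part of this commit) is to swap the loading lines for `load_model_on_gpus` from utils.py:

```python
from transformers import AutoTokenizer
from utils import load_model_on_gpus

tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)
# Replaces AutoModel.from_pretrained(...).half().cuda() with the multi-GPU loader.
model = load_model_on_gpus("THUDM/chatglm-6b", num_gpus=2)
model = model.eval()
```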
web_demo2.py (Streamlit demo)

````diff
@@ -1,7 +1,7 @@
 from transformers import AutoModel, AutoTokenizer
 import streamlit as st
 from streamlit_chat import message
-from utils import load_model_and_tokenizer
 
 st.set_page_config(
     page_title="ChatGLM-6b 演示",
@@ -11,7 +11,9 @@ st.set_page_config(
 @st.cache_resource
 def get_model():
-    model, tokenizer = load_model_and_tokenizer("THUDM/chatglm-6b", num_gpus=1)
+    tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)
+    model = AutoModel.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True).half().cuda()
+    model = model.eval()
     return tokenizer, model
````
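Likewise for the Streamlit demo: `get_model` could return a multi-GPU model by replacing its loading line, a sketch under the same assumption that utils.py is importable from the demo script:

```python
import streamlit as st
from transformers import AutoTokenizer
from utils import load_model_on_gpus

@st.cache_resource
def get_model():
    tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)
    # Replaces the single-GPU .half().cuda() load with the multi-GPU loader.
    model = load_model_on_gpus("THUDM/chatglm-6b", num_gpus=2)
    model = model.eval()
    return tokenizer, model
```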
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue