diff --git a/README.md b/README.md
index 9b08ff7..6144f3f 100644
--- a/README.md
+++ b/README.md
@@ -176,28 +176,26 @@ model = AutoModel.from_pretrained("your local path", trust_remote_code=True).hal
 即可使用在 Mac 上使用 GPU 加速模型推理。
 
 ### 多卡部署
-```shell
-pip install accelerate
-```
-
+如果你有多张 GPU,但是每张 GPU 的显存大小都不足以容纳完整的模型,那么可以将模型切分在多张GPU上。首先安装 accelerate: `pip install accelerate`,然后通过如下方法加载模型:
 ```python
-from utils import load_model_and_tokenizer
-
-model, tokenizer = load_model_and_tokenizer("your local path", num_gpus=2)
+from utils import load_model_on_gpus
+model = load_model_on_gpus("THUDM/chatglm-6b", num_gpus=2)
 ```
-即可将模型部署到多卡上进行推理。
+即可将模型部署到两张 GPU 上进行推理。你可以将 `num_gpus` 改为你希望使用的 GPU 数。默认是均匀切分的,你也可以传入 `device_map` 参数来自己指定。
 
 ## 高效参数微调
 基于 [P-tuning v2](https://github.com/THUDM/P-tuning-v2) 的高效参数微调。具体使用方法详见 [ptuning/README.md](ptuning/README.md)。
 
 ## 更新信息
+**[2023/04/16]** 增加 INT8 量化后的模型 [ChatGLM-6B-INT8](https://huggingface.co/THUDM/chatglm-6b-int8)。增加多卡部署(感谢 [@Cherrysaber](https://github.com/Cherrysaber))。
+
 **[2023/04/06]** 优化web demo的界面(感谢 [@tuteng0915](https://github.com/tuteng0915))。移除embedding中的image token以减小显存占用(需要更新模型文件`pytorch_model-00001-of-00008.bin`和`pytorch_model-00008-of-00008.bin`,感谢 [@silverriver](https://github.com/silverriver) 提出的想法)。去掉了对 `icetk` 的依赖(需要更新模型文件`ice_text.model`)。
 
 **[2023/03/31]** 增加基于 [P-Tuning-v2](https://github.com/THUDM/P-tuning-v2) 的高效参数微调实现,INT4 量化级别下最低只需 7GB 显存即可进行模型微调。详见[高效参数微调方法](ptuning/README.md)。
 
 **[2023/03/23]** 增加 API 部署(感谢 [@LemonQu-GIT](https://github.com/LemonQu-GIT))。增加 Embedding 量化模型 [ChatGLM-6B-INT4-QE](https://huggingface.co/THUDM/chatglm-6b-int4-qe)。增加配备 Apple Silicon 芯片的 Mac 上 GPU 加速的支持。
 
-**[2023/03/19]** 增加流式输出接口 `stream_chat`,已更新到网页版和命令行 Demo。修复输出中的中文标点。增加量化后的模型 [ChatGLM-6B-INT4](https://huggingface.co/THUDM/chatglm-6b-int4)
+**[2023/03/19]** 增加流式输出接口 `stream_chat`,已更新到网页版和命令行 Demo。修复输出中的中文标点。增加 INT4 量化后的模型 [ChatGLM-6B-INT4](https://huggingface.co/THUDM/chatglm-6b-int4)
 
 ## ChatGLM-6B 示例
diff --git a/README_en.md b/README_en.md
index 257776e..e9929f2 100644
--- a/README_en.md
+++ b/README_en.md
@@ -175,22 +175,20 @@ model = AutoModel.from_pretrained("your local path", trust_remote_code=True).hal
 ```
 Then you can use GPU-accelerated model inference on Mac.
 
+### Multi-GPU Deployment
+If you have multiple GPUs, but the memory size of each GPU is not sufficient to accommodate the entire model, you can split the model across multiple GPUs.
+
+First, install accelerate: `pip install accelerate`, and then load the model using the following method:
+```python
+from utils import load_model_on_gpus
+model = load_model_on_gpus("THUDM/chatglm-6b", num_gpus=2)
+```
+
+This will deploy the model onto two GPUs for inference. You can change `num_gpus` to the number of GPUs you want to use. By default, the model is split evenly, but you can also specify the `device_map` parameter to customize the splitting.
+
 ## Parameter-efficient Tuning
 Parameter-efficient tuning based on [P-tuning v2](https://github.com/THUDM/P-tuning-v2). See [ptuning/README.md](ptuning/README.md) for details on how to use it.
-
-### Multi-GPU Deployment
-
-```shell
-pip install accelerate
-```
-
-```python
-from utils import load_model_and_tokenizer
-
-model, tokenizer = load_model_and_tokenizer("your local path", num_gpus=2)
-```
-
 ## ChatGLM-6B Examples
 
 The following are some Chinese examples with `web_demo.py`. Welcome to explore more possibility with ChatGLM-6B.
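The README hunks above say the split is even by default but that a custom `device_map` can be passed to `load_model_on_gpus` to control it. As a rough illustration only (the module names and the 28-layer count are assumed from ChatGLM-6B's architecture and from what `auto_configure_device_map` produces; they are not spelled out in this patch), such a map might look like:

```python
# Illustrative sketch, not part of this patch: a hand-written device_map for 2 GPUs.
# Module names and the 28-layer count are assumptions based on ChatGLM-6B's structure.
from utils import load_model_on_gpus

device_map = {
    "transformer.word_embeddings": 0,  # input embeddings on GPU 0
    "transformer.final_layernorm": 0,  # final layer norm on GPU 0
    "lm_head": 0,                      # output head on GPU 0
}
for i in range(28):                    # assumed 28 transformer blocks
    device_map[f"transformer.layers.{i}"] = 0 if i < 14 else 1

model = load_model_on_gpus("THUDM/chatglm-6b", num_gpus=2, device_map=device_map)
```

The even 14/14 split is only an example; adjust it, or which card holds the embeddings and `lm_head`, to match the memory actually free on each GPU.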
diff --git a/utils.py b/utils.py
index 7d78362..bfb20b3 100644
--- a/utils.py
+++ b/utils.py
@@ -2,8 +2,7 @@ import os
 from typing import Dict, Tuple, Union, Optional
 
 from torch.nn import Module
-from transformers import AutoModel, AutoTokenizer
-from transformers.tokenization_utils import PreTrainedTokenizer
+from transformers import AutoModel
 
 
 def auto_configure_device_map(num_gpus: int) -> Dict[str, int]:
@@ -37,32 +36,21 @@ def auto_configure_device_map(num_gpus: int) -> Dict[str, int]:
 
 
 def load_model_on_gpus(checkpoint_path: Union[str, os.PathLike], num_gpus: int = 2,
-                       multi_gpu_model_cache_dir: Union[str, os.PathLike] = "./temp_model_dir",
-                       device_map: Optional[Dict[str, int]] = None,
-                       tokenizer: Optional[PreTrainedTokenizer] = None, **kwargs) -> Module:
-    from accelerate import load_checkpoint_and_dispatch
+                       device_map: Optional[Dict[str, int]] = None, **kwargs) -> Module:
+    if num_gpus < 2 and device_map is None:
+        model = AutoModel.from_pretrained(checkpoint_path, trust_remote_code=True, **kwargs).half().cuda()
+    else:
+        from accelerate import load_checkpoint_and_dispatch
 
-    model = AutoModel.from_pretrained(checkpoint_path, trust_remote_code=True, **kwargs)
-    model = model.eval()
+        model = AutoModel.from_pretrained(checkpoint_path, trust_remote_code=True, **kwargs)
+        model = model.eval()
 
-    if device_map is None:
-        device_map = auto_configure_device_map(num_gpus)
+        if device_map is None:
+            device_map = auto_configure_device_map(num_gpus)
 
-    model = load_checkpoint_and_dispatch(
-        model, checkpoint_path, device_map=device_map, offload_folder="offload", offload_state_dict=True).half()
+        model = load_checkpoint_and_dispatch(
+            model, checkpoint_path, device_map=device_map, offload_folder="offload", offload_state_dict=True).half()
 
     return model
-
-
-def load_model_and_tokenizer(checkpoint_path: Union[str, os.PathLike], num_gpus: int = 1,
-                             multi_gpu_model_cache_dir: Union[str, os.PathLike] = "./temp_model_dir",
-                             **kwargs) -> Tuple[Module, PreTrainedTokenizer]:
-    tokenizer = AutoTokenizer.from_pretrained(checkpoint_path, trust_remote_code=True, **kwargs)
-    if num_gpus < 2:
-        model = AutoModel.from_pretrained(checkpoint_path, trust_remote_code=True, **kwargs).half().cuda()
-        model = model.eval()
-    else:
-        model = load_model_on_gpus(checkpoint_path, num_gpus=num_gpus,
-                                   multi_gpu_model_cache_dir=multi_gpu_model_cache_dir,
-                                   tokenizer=tokenizer, **kwargs)
-    return model, tokenizer
diff --git a/web_demo.py b/web_demo.py
index 4d9171b..97ea622 100644
--- a/web_demo.py
+++ b/web_demo.py
@@ -1,9 +1,10 @@
+from transformers import AutoModel, AutoTokenizer
 import gradio as gr
 import mdtex2html
 
-from utils import load_model_and_tokenizer
-
-model, tokenizer = load_model_and_tokenizer("THUDM/chatglm-6b", num_gpus=1)
+tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)
+model = AutoModel.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True).half().cuda()
+model = model.eval()
 
 """Override Chatbot.postprocess"""
 
diff --git a/web_demo2.py b/web_demo2.py
index ebc9b8a..226682e 100644
--- a/web_demo2.py
+++ b/web_demo2.py
@@ -1,7 +1,7 @@
 from transformers import AutoModel, AutoTokenizer
 import streamlit as st
 from streamlit_chat import message
-from utils import load_model_and_tokenizer
+
 
 st.set_page_config(
     page_title="ChatGLM-6b 演示",
@@ -11,7 +11,9 @@ st.set_page_config(
 
 @st.cache_resource
 def get_model():
-    model, tokenizer = load_model_and_tokenizer("THUDM/chatglm-6b", num_gpus=1)
+    tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)
+    model = AutoModel.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True).half().cuda()
+    model = model.eval()
     return tokenizer, model
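Because `load_model_and_tokenizer` is removed, the web demos above now load everything directly; a caller that still wants multi-GPU inference loads the tokenizer itself and pairs it with `load_model_on_gpus`. Below is a minimal sketch of how either demo could be adapted for two GPUs; the final `chat` call follows the model's usual remote-code interface and is not part of this diff.

```python
# Sketch of a multi-GPU variant of the demo loading code (assumes 2 visible GPUs).
from transformers import AutoTokenizer

from utils import load_model_on_gpus

tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)
# Splits the model across 2 GPUs; with num_gpus < 2 and no device_map the function
# falls back to a plain .half().cuda() load, matching the single-GPU demos above.
model = load_model_on_gpus("THUDM/chatglm-6b", num_gpus=2)

# Inference as usual (interface assumed from the model's remote code, not this diff).
response, history = model.chat(tokenizer, "你好", history=[])
print(response)
```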