Mirror of https://github.com/THUDM/ChatGLM-6B, commit dc1a3df1ec
README.md
@@ -1,5 +1,11 @@
# ChatGLM-6B

## Modification Notes

Loads the model onto multiple GPU cards, splitting the GPU memory usage evenly according to the number of GPUs. Requires installing accelerate:

```shell
python -m pip install accelerate
```

Note that 24GB of CPU memory is still required; reducing this is a TODO.

## Introduction

ChatGLM-6B is an open-source dialogue language model that supports both Chinese and English, based on the [General Language Model (GLM)](https://github.com/THUDM/GLM) architecture with 6.2 billion parameters. Combined with model quantization, users can deploy it locally on consumer-grade graphics cards (as little as 6GB of GPU memory at the INT4 quantization level).
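For reference, a minimal sketch of the INT4 deployment path mentioned above (this assumes the `quantize` helper exposed by the model's `trust_remote_code` implementation; the call is not part of this commit):

```python
from transformers import AutoModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)
# quantize(4) converts the weights to INT4, after which roughly 6GB of
# GPU memory is enough for inference on a single consumer-grade card
model = AutoModel.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True).half().quantize(4).cuda()
model = model.eval()
```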
README_en.md
@@ -1,5 +1,12 @@
# ChatGLM-6B

## Modification

Loads the model across multiple GPUs, splitting the GPU memory usage evenly according to the number of GPUs. Requires installing accelerate:

```shell
python -m pip install accelerate
```

Please note that 24GB of CPU memory is still required; reducing this is a TODO.
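As a minimal usage sketch (it assumes `chatglm_parallel.py` from this commit is on the Python path, and that the model exposes the ChatGLM-6B `chat` API):

```python
from transformers import AutoTokenizer
from chatglm_parallel import load_model_on_gpus

tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)
# spread the 28 transformer layers evenly across two cards
model = load_model_on_gpus("THUDM/chatglm-6b", num_gpus=2)
response, history = model.chat(tokenizer, "Hello", history=[])
print(response)
```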

## Introduction

ChatGLM-6B is an open bilingual language model based on the [General Language Model (GLM)](https://github.com/THUDM/GLM) framework, with 6.2 billion parameters. With model quantization, users can deploy it locally on consumer-grade graphics cards (only 6GB of GPU memory is required at the INT4 quantization level).
chatglm_parallel.py (new file)
@@ -0,0 +1,34 @@
'''
Author: lichuang
Date: 2023-03-23 09:18:13
Description: load the model onto multiple GPU cards, splitting the GPU
memory usage evenly according to the number of GPUs
'''

from transformers import AutoModel
from accelerate import load_checkpoint_and_dispatch


def load_model_on_gpus(checkpoint_path, num_gpus=2):
    # The model takes about 13GB of GPU memory in total: each of the 28
    # transformer layers needs roughly 0.39GB, and the word_embeddings
    # layer at the front and the lm_head layer at the end need about
    # 1.2GB each.
    num_trans_layers = 28
    vram_per_layer = 0.39
    average = 13 / num_gpus
    used = 1.2  # word_embeddings has already been placed on GPU 0
    device_map = {'transformer.word_embeddings': 0,
                  'transformer.final_layernorm': num_gpus - 1,
                  'lm_head': num_gpus - 1}
    gpu_target = 0
    for i in range(num_trans_layers):
        # move to the next GPU once the current one's share is used up
        # (gpu_target < num_gpus - 1 keeps the device index in range)
        if used > average - vram_per_layer / 2 and gpu_target < num_gpus - 1:
            gpu_target += 1
            used = 0
        else:
            used += vram_per_layer
        device_map['transformer.layers.%d' % i] = gpu_target

    model = AutoModel.from_pretrained(checkpoint_path, trust_remote_code=True)
    model = model.eval()
    model = load_checkpoint_and_dispatch(
        model, checkpoint_path, device_map=device_map,
        offload_folder="offload", offload_state_dict=True).half()
    return model
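To sanity-check the split, here is a small hypothetical inspection snippet (not part of the commit) that replays the allocation loop without loading any weights and counts how many transformer layers land on each GPU:

```python
from collections import Counter

def layer_counts(num_gpus=2, num_layers=28, vram_per_layer=0.39):
    # replays the placement loop from load_model_on_gpus
    average, used, gpu_target = 13 / num_gpus, 1.2, 0
    placement = []
    for _ in range(num_layers):
        if used > average - vram_per_layer / 2 and gpu_target < num_gpus - 1:
            gpu_target, used = gpu_target + 1, 0
        else:
            used += vram_per_layer
        placement.append(gpu_target)
    return Counter(placement)

print(layer_counts(num_gpus=2))  # Counter({0: 14, 1: 14}): an even 14/14 split
```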
cli_demo.py
@@ -1,10 +1,10 @@
 import os
 import platform
 from transformers import AutoTokenizer, AutoModel
+from chatglm_parallel import load_model_on_gpus

 tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)
-model = AutoModel.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True).half().cuda()
-model = model.eval()
+model = load_model_on_gpus("THUDM/chatglm-6b", num_gpus=2)

 os_name = platform.system()
 clear_command = 'cls' if os_name == 'Windows' else 'clear'
web_demo.py
@@ -1,9 +1,9 @@
 from transformers import AutoModel, AutoTokenizer
 import gradio as gr
+from chatglm_parallel import load_model_on_gpus

 tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)
-model = AutoModel.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True).half().cuda()
-model = model.eval()
+model = load_model_on_gpus("THUDM/chatglm-6b", num_gpus=2)

 MAX_TURNS = 20
 MAX_BOXES = MAX_TURNS * 2
web_demo2.py
@@ -1,6 +1,7 @@
 from transformers import AutoModel, AutoTokenizer
 import streamlit as st
 from streamlit_chat import message
+from chatglm_parallel import load_model_on_gpus


 st.set_page_config(

@@ -12,8 +13,7 @@ st.set_page_config(
 @st.cache_resource
 def get_model():
     tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)
-    model = AutoModel.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True).half().cuda()
-    model = model.eval()
+    model = load_model_on_gpus("THUDM/chatglm-6b", num_gpus=2)
     return tokenizer, model