feat: Implement GPU layer allocation based on memory ratio

pull/265/head
saber 2023-03-28 21:57:49 +08:00
parent c313af0639
commit 8c16099064
1 changed file with 63 additions and 12 deletions

@@ -1,18 +1,64 @@
 import os
-from typing import Dict, Tuple, Union, Optional
+from typing import Dict, Tuple, Union, Optional, List
+import torch
 from torch.nn import Module
 from transformers import AutoModel, AutoTokenizer
 from transformers.tokenization_utils import PreTrainedTokenizer


-def auto_configure_device_map(num_gpus: int) -> Dict[str, int]:
+def calculate_per_gpu_layers(gpu_list: List[int], total_layers: int) -> Dict[int, int]:
+    # Split total_layers across the given GPUs in proportion to each GPU's memory size.
+    # Total memory of each GPU.
+    gpu_memory_map = {
+        gpu: torch.cuda.get_device_properties(gpu).total_memory
+        for gpu in gpu_list
+    }
+    # Combined memory of all listed GPUs.
+    total_memory = sum(gpu_memory_map.values())
+    # Memory share of each GPU.
+    gpu_memory_ratios = {
+        gpu: memory / total_memory
+        for gpu, memory in gpu_memory_map.items()
+    }
+    # Layers per GPU, proportional to its memory share.
+    per_gpu_layers = {
+        gpu: int(round(total_layers * ratio))
+        for gpu, ratio in gpu_memory_ratios.items()
+    }
+    # Correct the rounding error so the layer counts sum to exactly total_layers.
+    while True:
+        diff = total_layers - sum(per_gpu_layers.values())
+        if diff > 0:
+            # Too few layers assigned: give the surplus to the GPU with the most memory.
+            gpu_with_max_memory = max(gpu_memory_ratios, key=gpu_memory_ratios.get)
+            per_gpu_layers[gpu_with_max_memory] += diff
+        elif diff < 0:
+            # Too many layers assigned: take the excess from the GPU with the least memory.
+            gpu_with_min_memory = min(gpu_memory_ratios, key=gpu_memory_ratios.get)
+            per_gpu_layers[gpu_with_min_memory] += diff
+        else:
+            break
+    return per_gpu_layers
+
+
+def auto_configure_device_map(num_gpus: int, gpu_list: Optional[List[int]] = None) -> Dict[str, int]:
     # transformer.word_embeddings takes 1 layer
     # transformer.final_layernorm and lm_head take 1 layer
     # transformer.layers take 28 layers
     # 30 layers in total, distributed across num_gpus cards
     num_trans_layers = 28
-    per_gpu_layers = 30 / num_gpus
+    if gpu_list is None:
+        gpu_list = list(range(num_gpus))
+    assert len(gpu_list) <= torch.cuda.device_count(), "more GPUs requested than are actually available"
+    current_gpu_index = 0
+    # Number of layers each GPU will hold, based on its memory share.
+    per_gpu_layer_dict = calculate_per_gpu_layers(gpu_list, total_layers=num_trans_layers + 2)

     # bugfix: on Linux, calling torch.embedding with the weight and the input on different devices raises a RuntimeError
     # on Windows, model.device is set to transformer.word_embeddings.device
@@ -20,18 +66,23 @@ def auto_configure_device_map(num_gpus: int) -> Dict[str, int]:
     # when chat or stream_chat is called, input_ids is placed on model.device
     # if transformer.word_embeddings.device differs from model.device, a RuntimeError is raised
     # so transformer.word_embeddings, transformer.final_layernorm and lm_head are all placed on the first card
-    device_map = {'transformer.word_embeddings': 0,
-                  'transformer.final_layernorm': 0, 'lm_head': 0}
+    device_map = {'transformer.word_embeddings': gpu_list[current_gpu_index],
+                  'transformer.final_layernorm': gpu_list[current_gpu_index],
+                  'lm_head': gpu_list[current_gpu_index]}

     used = 2
-    gpu_target = 0
+    # Distribute the remaining transformer layers.
+    current_gpu = gpu_list[current_gpu_index]
     for i in range(num_trans_layers):
-        if used >= per_gpu_layers:
-            gpu_target += 1
-            used = 0
-        assert gpu_target < num_gpus
-        device_map[f'transformer.layers.{i}'] = gpu_target
-        used += 1
+        if used < per_gpu_layer_dict[current_gpu]:
+            device_map[f"transformer.layers.{i}"] = current_gpu
+            used += 1
+        else:
+            # The current GPU has received its full share; switch to the next GPU,
+            # which takes this layer as its first one.
+            current_gpu_index += 1
+            current_gpu = gpu_list[current_gpu_index]
+            device_map[f"transformer.layers.{i}"] = current_gpu
+            used = 1

     return device_map
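
To see what the new allocation does in practice, here is a minimal sketch of the same memory-ratio math with made-up memory sizes, so it runs without CUDA; the helper name split_layers_by_memory and the 24 GB / 8 GB figures are illustrative only and are not part of this commit.

from typing import Dict


def split_layers_by_memory(memory_by_gpu: Dict[int, int], total_layers: int) -> Dict[int, int]:
    # Same idea as calculate_per_gpu_layers, but the memory sizes are passed in
    # instead of being read from torch.cuda.get_device_properties(gpu).total_memory.
    total_memory = sum(memory_by_gpu.values())
    ratios = {gpu: mem / total_memory for gpu, mem in memory_by_gpu.items()}
    layers = {gpu: int(round(total_layers * ratio)) for gpu, ratio in ratios.items()}
    # Rounding can leave the total off by a layer or two; like the patch, adjust the
    # largest (or smallest) GPU until the counts add up exactly.
    diff = total_layers - sum(layers.values())
    if diff > 0:
        layers[max(ratios, key=ratios.get)] += diff
    elif diff < 0:
        layers[min(ratios, key=ratios.get)] += diff
    return layers


# Hypothetical 24 GB + 8 GB machine and the 30 slots counted by
# auto_configure_device_map (28 transformer blocks + 2 for embeddings/head).
print(split_layers_by_memory({0: 24 * 2 ** 30, 1: 8 * 2 ** 30}, total_layers=30))
# -> {0: 22, 1: 8}

The correction step matters: int(round(...)) taken per GPU can sum to 29 or 31, and without the adjustment auto_configure_device_map could run past the end of gpu_list or leave the split slightly uneven.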
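
And a sketch of how the new signature might be consumed when loading the model; the checkpoint name, the GPU ids, the module path, and the use of accelerate's dispatch_model are assumptions for illustration and are not shown in this diff.

from accelerate import dispatch_model
from transformers import AutoModel

from utils import auto_configure_device_map  # hypothetical module path; the diff does not show the file name

# Load a ChatGLM-style checkpoint in fp16, then place its layers according to
# the memory-ratio device map computed for GPUs 0 and 1.
model = AutoModel.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True).half()
device_map = auto_configure_device_map(num_gpus=2, gpu_list=[0, 1])
model = dispatch_model(model, device_map=device_map).eval()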