Merge branch 'feature_add_moe' of https://github.com/blankde/InternLM into feature_add_moe

pull/375/head
Wenwen Qu 2023-08-15 17:58:52 +08:00
commit 3bfaad895a
3 changed files with 7 additions and 7 deletions


@@ -5,7 +5,7 @@ import torch
 from internlm.core.context import global_context as gpc
-DATASET_TYPE_IDS_MAP = {"en": 0, "cn": 1, "code": 2, "ja": 3, "ar": 4, "kaoshi": 5}
+DATASET_TYPE_IDS_MAP = {"en": 0, "cn": 1}
 def get_dataset_type_id(path):
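The shrunken map drops the "code", "ja", "ar", and "kaoshi" dataset types, leaving only English and Chinese corpora tagged. The body of get_dataset_type_id is not part of this diff; the sketch below is a hypothetical illustration only, assuming the function resolves the type id by matching a map key appearing as a path component:

    import re

    DATASET_TYPE_IDS_MAP = {"en": 0, "cn": 1}

    def get_dataset_type_id(path):
        # Hypothetical matching logic (not the repo's actual implementation):
        # a key found as a path component names the dataset type,
        # e.g. "/data/en/train.bin" -> 0.
        matches = [i for k, i in DATASET_TYPE_IDS_MAP.items() if re.search(rf"/{k}/", path)]
        assert len(matches) == 1, f"no unique dataset type id for path: {path}"
        return matches[0]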


@@ -39,7 +39,7 @@ def get_default_parser():
     parser.add_argument("--local_rank", type=int, help="local rank on the node")
     parser.add_argument("--backend", type=str, default="nccl", help="backend for distributed communication")
     parser.add_argument("--seed", type=int, default=1024)
-    parser.add_argument("--profiling", default=True, action="store_true", help="enable/diable profiling.")
+    parser.add_argument("--profiling", default=False, action="store_true", help="enable/disable profiling.")
     return parser
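Besides fixing the "diable" typo in the help string, this change makes the flag functional: with action="store_true", passing --profiling stores True, and omitting it falls back to the default, which was also True, so profiling could never be switched off. A minimal standalone demonstration of the argparse behavior:

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--profiling", default=True, action="store_true")
    assert parser.parse_args([]).profiling is True               # flag omitted: default True
    assert parser.parse_args(["--profiling"]).profiling is True  # flag given: stores True

    # With default=False, the flag becomes a real toggle:
    parser = argparse.ArgumentParser()
    parser.add_argument("--profiling", default=False, action="store_true")
    assert parser.parse_args([]).profiling is False
    assert parser.parse_args(["--profiling"]).profiling is True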


@@ -579,14 +579,16 @@ class HybridZeroOptimizer(BaseOptimizer):
             # The following operations are performed only on the rank to which parameters are assigned.
             if not self.param_group_has_params[group_id]:
                 continue
-            gradients = self._grad_store.get_averaged_gradients_by_group(group_id)
             # create flat gradient for the flat fp32 params
-            fp16_avg_grads = gradients
-            flat_fp16_avg_grads = flatten(fp16_avg_grads)
+            gradients = self._grad_store.get_averaged_gradients_by_group(group_id)
+            flat_fp16_avg_grads = flatten(gradients)
+            self._grad_store.reset_average_gradients_by_group(group_id)
+            del gradients  # release cuda memory
             dtype = self._fp32_flat_param_groups_of_current_rank[group_id].dtype
             flat_fp32_avg_grads = flat_fp16_avg_grads.to(dtype)
+            del flat_fp16_avg_grads  # release cuda memory
             param_shape = self._fp32_flat_param_groups_of_current_rank[group_id].shape
             assert (
@@ -596,8 +598,6 @@ class HybridZeroOptimizer(BaseOptimizer):
             single_grad_partition_groups.append(flat_fp32_avg_grads)
             device = self._fp32_flat_param_groups_of_current_rank[group_id].device
             self._fp32_flat_param_groups_of_current_rank[group_id].grad = flat_fp32_avg_grads.to(device)
-            self._grad_store._averaged_gradients[group_id] = []
-            self._grad_store._averaged_gradients[group_id] = []
             # unscale and clip grads
             # get the global norm
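The reordered block frees each intermediate buffer as soon as it has been consumed: the per-group gradient list is cleared in the grad store and del'd right after flattening, and the flat fp16 copy is del'd once it has been cast to the fp32 working dtype, trimming peak CUDA memory during the optimizer step. The second hunk then drops the now-redundant (and duplicated) manual clear of _averaged_gradients, which reset_average_gradients_by_group already covers. Below is a standalone sketch of the same release-early pattern, assuming flatten behaves like torch's _flatten_dense_tensors; the tensor shapes are illustrative only:

    import torch
    from torch._utils import _flatten_dense_tensors as flatten

    device = "cuda" if torch.cuda.is_available() else "cpu"
    # Stand-ins for the averaged fp16 gradients of one param group.
    gradients = [torch.randn(1024, 1024, dtype=torch.float16, device=device) for _ in range(4)]

    flat_fp16_avg_grads = flatten(gradients)  # one contiguous fp16 buffer
    del gradients  # drop the per-tensor refs so the caching allocator can reuse them

    flat_fp32_avg_grads = flat_fp16_avg_grads.to(torch.float32)  # master-precision copy for the update
    del flat_fp16_avg_grads  # the fp16 flat copy is no longer needed after the cast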