From c7221cb2d457b8ac57e457a08171bce80f86bc1e Mon Sep 17 00:00:00 2001
From: HELSON
Date: Fri, 29 Jul 2022 19:33:24 +0800
Subject: [PATCH] [hotfix] adapt ProcessGroup and Optimizer to ColoTensor
 (#1388)

---
 colossalai/nn/optimizer/cpu_adam.py    |  4 ++--
 colossalai/nn/optimizer/fused_adam.py  |  4 ++--
 colossalai/nn/optimizer/fused_lamb.py  |  4 ++--
 colossalai/nn/optimizer/fused_sgd.py   |  2 +-
 colossalai/nn/optimizer/hybrid_adam.py |  4 ++--
 colossalai/nn/optimizer/lamb.py        |  4 ++--
 colossalai/tensor/process_group.py     | 14 +++++++++-----
 7 files changed, 20 insertions(+), 16 deletions(-)

diff --git a/colossalai/nn/optimizer/cpu_adam.py b/colossalai/nn/optimizer/cpu_adam.py
index 886265ab5..9b21337d0 100644
--- a/colossalai/nn/optimizer/cpu_adam.py
+++ b/colossalai/nn/optimizer/cpu_adam.py
@@ -143,9 +143,9 @@ class CPUAdam(NVMeOptimizer):
                     state['step'] = 0
 
                     # gradient momentums
-                    state['exp_avg'] = torch.zeros_like(p.data, dtype=torch.float, device=target_device)
+                    state['exp_avg'] = torch.zeros_like(p, dtype=torch.float, device=target_device)
                     # gradient variances
-                    state['exp_avg_sq'] = torch.zeros_like(p.data, dtype=torch.float, device=target_device)
+                    state['exp_avg_sq'] = torch.zeros_like(p, dtype=torch.float, device=target_device)
                     self._post_state_init(p)
 
                 state['step'] += 1
diff --git a/colossalai/nn/optimizer/fused_adam.py b/colossalai/nn/optimizer/fused_adam.py
index 57908e19f..5814c28bd 100644
--- a/colossalai/nn/optimizer/fused_adam.py
+++ b/colossalai/nn/optimizer/fused_adam.py
@@ -122,9 +122,9 @@ class FusedAdam(torch.optim.Optimizer):
                 # State initialization
                 if len(state) == 0:
                     # Exponential moving average of gradient values
-                    state['exp_avg'] = torch.zeros_like(p.data)
+                    state['exp_avg'] = torch.zeros_like(p)
                     # Exponential moving average of squared gradient values
-                    state['exp_avg_sq'] = torch.zeros_like(p.data)
+                    state['exp_avg_sq'] = torch.zeros_like(p)
 
                 if p.dtype not in [torch.float16, torch.float32]:
                     raise RuntimeError('FusedAdam only support fp16 and fp32.')
diff --git a/colossalai/nn/optimizer/fused_lamb.py b/colossalai/nn/optimizer/fused_lamb.py
index 324d9ea1c..be12e6c62 100644
--- a/colossalai/nn/optimizer/fused_lamb.py
+++ b/colossalai/nn/optimizer/fused_lamb.py
@@ -162,9 +162,9 @@ class FusedLAMB(torch.optim.Optimizer):
                 # State initialization
                 if len(state) == 0:
                     # Exponential moving average of gradient values
-                    state['exp_avg'] = torch.zeros_like(p.data)
+                    state['exp_avg'] = torch.zeros_like(p)
                     # Exponential moving average of gradient values
-                    state['exp_avg_sq'] = torch.zeros_like(p.data)
+                    state['exp_avg_sq'] = torch.zeros_like(p)
 
                 if p.dtype == torch.float16:
                     g_16.append(p.grad.data)
diff --git a/colossalai/nn/optimizer/fused_sgd.py b/colossalai/nn/optimizer/fused_sgd.py
index b948c5eaf..1185eef81 100644
--- a/colossalai/nn/optimizer/fused_sgd.py
+++ b/colossalai/nn/optimizer/fused_sgd.py
@@ -104,7 +104,7 @@ class FusedSGD(Optimizer):
                 # momentum application can be skipped in the main kernel.
                 if 'momentum_buffer' not in param_state:
                     first_run = True
-                    buf = param_state['momentum_buffer'] = torch.zeros_like(p.data)
+                    buf = param_state['momentum_buffer'] = torch.zeros_like(p)
                     momentums.append(buf)
                 else:
                     first_run = False
diff --git a/colossalai/nn/optimizer/hybrid_adam.py b/colossalai/nn/optimizer/hybrid_adam.py
index 2d33a62b1..73fabd865 100644
--- a/colossalai/nn/optimizer/hybrid_adam.py
+++ b/colossalai/nn/optimizer/hybrid_adam.py
@@ -116,9 +116,9 @@ class HybridAdam(NVMeOptimizer):
                     state['step'] = 0
 
                     # gradient momentums
-                    state['exp_avg'] = torch.zeros_like(p.data, dtype=torch.float, device=target_device)
+                    state['exp_avg'] = torch.zeros_like(p, dtype=torch.float, device=target_device)
                     # gradient variances
-                    state['exp_avg_sq'] = torch.zeros_like(p.data, dtype=torch.float, device=target_device)
+                    state['exp_avg_sq'] = torch.zeros_like(p, dtype=torch.float, device=target_device)
                     self._post_state_init(p)
 
                 state['step'] += 1
diff --git a/colossalai/nn/optimizer/lamb.py b/colossalai/nn/optimizer/lamb.py
index bcca990f0..7ac210957 100644
--- a/colossalai/nn/optimizer/lamb.py
+++ b/colossalai/nn/optimizer/lamb.py
@@ -67,9 +67,9 @@ class Lamb(Optimizer):
                 if len(state) == 0:
                     state['step'] = 0
                     # Exponential moving average of gradient values
-                    state['exp_avg'] = torch.zeros_like(p.data)
+                    state['exp_avg'] = torch.zeros_like(p)
                     # Exponential moving average of squared gradient values
-                    state['exp_avg_sq'] = torch.zeros_like(p.data)
+                    state['exp_avg_sq'] = torch.zeros_like(p)
 
                 exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                 beta1, beta2 = group['betas']
diff --git a/colossalai/tensor/process_group.py b/colossalai/tensor/process_group.py
index 454998c04..59d423b5b 100644
--- a/colossalai/tensor/process_group.py
+++ b/colossalai/tensor/process_group.py
@@ -22,7 +22,6 @@ class PyTorchProcessGroupDict(metaclass=SingletonMeta):
 
             self.logger = get_dist_logger('ProcessGroup')
             self.logger.info(f'NCCL initialize ProcessGroup on {rank_list}', ranks=[0])
-
             self.dict[pg_key] = torch.distributed.new_group(ranks=rank_list, backend=backend)
         return self.dict[pg_key]
 
@@ -104,10 +103,15 @@ class ProcessGroup:
     def set_cpu_groups(self):
        if self.has_cpu_groups:
            return
-        # self.logger.info(
-        #     f'{self._rank} Gloo initialize TP group on {self._tp_rank_list}, DP group on {self._dp_rank_list}')
-        PYTORCHPGDICT_.get(self._tp_rank_list, 'gloo')
-        PYTORCHPGDICT_.get(self._dp_rank_list, 'gloo')
+
+        for i in range(self._dp_degree):
+            i_tp_list = [self._rank_list[i * self._tp_degree + j] for j in range(self._tp_degree)]
+            PYTORCHPGDICT_.get(i_tp_list, 'gloo')
+
+        for j in range(self._tp_degree):
+            j_dp_list = [self._rank_list[i * self._tp_degree + j] for i in range(self._dp_degree)]
+            PYTORCHPGDICT_.get(j_dp_list, 'gloo')
+
         self._has_cpu_groups = True
 
     @property
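
A note on the optimizer hunks above: switching torch.zeros_like(p.data) to torch.zeros_like(p) matters because factory calls on a tensor subclass dispatch through that subclass's __torch_function__, so the optimizer state can be built with the same subclass behaviour as the parameter (here, presumably, a ColoTensor carrying its distributed spec). The minimal sketch below is not ColossalAI code; TracedTensor is a hypothetical subclass used only to make the dispatch visible.

# Hypothetical subclass, for illustration only: it records which torch functions are
# routed through it, the mechanism ColoTensor presumably relies on so that state
# tensors created from a parameter keep the parameter's subclass behaviour.
import torch


class TracedTensor(torch.Tensor):
    """torch.Tensor subclass that logs every torch function dispatched through it."""
    seen = []

    @classmethod
    def __torch_function__(cls, func, types, args=(), kwargs=None):
        cls.seen.append(getattr(func, '__name__', str(func)))
        return super().__torch_function__(func, types, args, kwargs or {})


p = torch.randn(4).as_subclass(TracedTensor)
state = torch.zeros_like(p, dtype=torch.float)    # routed through TracedTensor.__torch_function__
print(type(state).__name__)                       # -> TracedTensor
print('zeros_like' in TracedTensor.seen)          # -> True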
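
The rewritten set_cpu_groups no longer registers a single Gloo group from self._tp_rank_list / self._dp_rank_list; it derives one TP sub-list per data-parallel replica and one DP sub-list per tensor-parallel rank from the flat rank list. The sketch below reproduces only that index arithmetic, assuming the DP-major / TP-minor layout implied by the patch; cpu_group_rank_lists is a hypothetical helper, while the real code hands each list to PYTORCHPGDICT_.get(ranks, 'gloo').

# Illustrative helper (not ColossalAI code): splits a flat rank list into the TP and DP
# sub-lists that the new set_cpu_groups loops register as Gloo groups.
def cpu_group_rank_lists(rank_list, tp_degree, dp_degree):
    tp_groups = [[rank_list[i * tp_degree + j] for j in range(tp_degree)]
                 for i in range(dp_degree)]
    dp_groups = [[rank_list[i * tp_degree + j] for i in range(dp_degree)]
                 for j in range(tp_degree)]
    return tp_groups, dp_groups


# Example: 8 ranks, tp_degree=4, dp_degree=2
tp_groups, dp_groups = cpu_group_rank_lists(list(range(8)), tp_degree=4, dp_degree=2)
print(tp_groups)    # [[0, 1, 2, 3], [4, 5, 6, 7]]      -> one Gloo TP group per DP replica
print(dp_groups)    # [[0, 4], [1, 5], [2, 6], [3, 7]]  -> one Gloo DP group per TP rank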