mirror of https://github.com/hpcaitech/ColossalAI
[hotfix] adapt ProcessGroup and Optimizer to ColoTensor (#1388)
parent ad678921db
commit c7221cb2d4
@@ -143,9 +143,9 @@ class CPUAdam(NVMeOptimizer):
                     state['step'] = 0

                     # gradient momentums
-                    state['exp_avg'] = torch.zeros_like(p.data, dtype=torch.float, device=target_device)
+                    state['exp_avg'] = torch.zeros_like(p, dtype=torch.float, device=target_device)
                     # gradient variances
-                    state['exp_avg_sq'] = torch.zeros_like(p.data, dtype=torch.float, device=target_device)
+                    state['exp_avg_sq'] = torch.zeros_like(p, dtype=torch.float, device=target_device)
                     self._post_state_init(p)

                 state['step'] += 1
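Every optimizer hunk in this commit makes the same one-token change: torch.zeros_like(p.data, ...) becomes torch.zeros_like(p, ...); the identical swap recurs below in FusedAdam, FusedLAMB, FusedSGD, HybridAdam, and Lamb. The difference matters for ColoTensor: .data returns a plain torch.Tensor, so optimizer state created from it loses any tensor-subclass information, while passing the parameter itself lets zeros_like dispatch through the subclass. A minimal sketch with a toy subclass standing in for ColoTensor:

import torch

class TaggedTensor(torch.Tensor):
    # toy stand-in for a tensor subclass such as ColoTensor
    pass

p = torch.ones(2, 3).as_subclass(TaggedTensor)

# Passing the parameter itself preserves the subclass; the dtype/device
# overrides used in the hunk above still apply on top of it.
print(type(torch.zeros_like(p)).__name__)       # TaggedTensor
# .data yields a plain torch.Tensor, so the subclass is stripped.
print(type(torch.zeros_like(p.data)).__name__)  # Tensor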
@@ -122,9 +122,9 @@ class FusedAdam(torch.optim.Optimizer):
                 # State initialization
                 if len(state) == 0:
                     # Exponential moving average of gradient values
-                    state['exp_avg'] = torch.zeros_like(p.data)
+                    state['exp_avg'] = torch.zeros_like(p)
                     # Exponential moving average of squared gradient values
-                    state['exp_avg_sq'] = torch.zeros_like(p.data)
+                    state['exp_avg_sq'] = torch.zeros_like(p)

                 if p.dtype not in [torch.float16, torch.float32]:
                     raise RuntimeError('FusedAdam only support fp16 and fp32.')
@@ -162,9 +162,9 @@ class FusedLAMB(torch.optim.Optimizer):
                 # State initialization
                 if len(state) == 0:
                     # Exponential moving average of gradient values
-                    state['exp_avg'] = torch.zeros_like(p.data)
+                    state['exp_avg'] = torch.zeros_like(p)
                     # Exponential moving average of gradient values
-                    state['exp_avg_sq'] = torch.zeros_like(p.data)
+                    state['exp_avg_sq'] = torch.zeros_like(p)

                 if p.dtype == torch.float16:
                     g_16.append(p.grad.data)
@@ -104,7 +104,7 @@ class FusedSGD(Optimizer):
                 # momentum application can be skipped in the main kernel.
                 if 'momentum_buffer' not in param_state:
                     first_run = True
-                    buf = param_state['momentum_buffer'] = torch.zeros_like(p.data)
+                    buf = param_state['momentum_buffer'] = torch.zeros_like(p)
                     momentums.append(buf)
                 else:
                     first_run = False
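For context, FusedSGD creates the momentum buffer lazily and uses first_run so the fused kernel can skip the momentum term on the step where the buffer is still all zeros. A minimal single-tensor sketch of that pattern (not the fused kernel itself):

import torch

def sgd_momentum_step(p: torch.Tensor, state: dict, lr: float = 0.01, momentum: float = 0.9) -> None:
    # Lazily create the buffer, mirroring the hunk above: on the first step
    # the buffer is all zeros, so buf.mul_(momentum) contributes nothing,
    # which is exactly why first_run lets the fused kernel skip momentum.
    if 'momentum_buffer' not in state:
        buf = state['momentum_buffer'] = torch.zeros_like(p)
    else:
        buf = state['momentum_buffer']
    buf.mul_(momentum).add_(p.grad)
    with torch.no_grad():
        p.add_(buf, alpha=-lr)

# usage
w = torch.randn(4, requires_grad=True)
w.grad = torch.ones_like(w)
sgd_momentum_step(w, {})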
@@ -116,9 +116,9 @@ class HybridAdam(NVMeOptimizer):
                     state['step'] = 0

                     # gradient momentums
-                    state['exp_avg'] = torch.zeros_like(p.data, dtype=torch.float, device=target_device)
+                    state['exp_avg'] = torch.zeros_like(p, dtype=torch.float, device=target_device)
                     # gradient variances
-                    state['exp_avg_sq'] = torch.zeros_like(p.data, dtype=torch.float, device=target_device)
+                    state['exp_avg_sq'] = torch.zeros_like(p, dtype=torch.float, device=target_device)
                     self._post_state_init(p)

                 state['step'] += 1
@@ -67,9 +67,9 @@ class Lamb(Optimizer):
                 if len(state) == 0:
                     state['step'] = 0
                     # Exponential moving average of gradient values
-                    state['exp_avg'] = torch.zeros_like(p.data)
+                    state['exp_avg'] = torch.zeros_like(p)
                     # Exponential moving average of squared gradient values
-                    state['exp_avg_sq'] = torch.zeros_like(p.data)
+                    state['exp_avg_sq'] = torch.zeros_like(p)

                 exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                 beta1, beta2 = group['betas']
@@ -22,7 +22,6 @@ class PyTorchProcessGroupDict(metaclass=SingletonMeta):

             self.logger = get_dist_logger('ProcessGroup')
             self.logger.info(f'NCCL initialize ProcessGroup on {rank_list}', ranks=[0])
-
             self.dict[pg_key] = torch.distributed.new_group(ranks=rank_list, backend=backend)
         return self.dict[pg_key]

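The PyTorchProcessGroupDict singleton above exists so that each (backend, ranks) combination is turned into a torch.distributed group exactly once. A minimal standalone sketch of that caching pattern (names here are illustrative, not the library's API):

import torch.distributed as dist
from typing import Dict, List, Tuple

_PG_CACHE: Dict[Tuple[str, Tuple[int, ...]], object] = {}

def get_cached_group(rank_list: List[int], backend: str = 'nccl'):
    # Lists are unhashable, so the cache key uses a tuple of ranks.
    key = (backend, tuple(rank_list))
    if key not in _PG_CACHE:
        # new_group is collective: every rank in the default group must call
        # it in the same order, even ranks that are not in rank_list.
        _PG_CACHE[key] = dist.new_group(ranks=rank_list, backend=backend)
    return _PG_CACHE[key]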
@@ -104,10 +103,15 @@ class ProcessGroup:
     def set_cpu_groups(self):
         if self.has_cpu_groups:
             return
-        # self.logger.info(
-        #     f'{self._rank} Gloo initialize TP group on {self._tp_rank_list}, DP group on {self._dp_rank_list}')
-        PYTORCHPGDICT_.get(self._tp_rank_list, 'gloo')
-        PYTORCHPGDICT_.get(self._dp_rank_list, 'gloo')
+        for i in range(self._dp_degree):
+            i_tp_list = [self._rank_list[i * self._tp_degree + j] for j in range(self._tp_degree)]
+            PYTORCHPGDICT_.get(i_tp_list, 'gloo')
+
+        for j in range(self._tp_degree):
+            j_dp_list = [self._rank_list[i * self._tp_degree + j] for i in range(self._dp_degree)]
+            PYTORCHPGDICT_.get(j_dp_list, 'gloo')
+
         self._has_cpu_groups = True

     @property
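set_cpu_groups now derives one gloo TP group per data-parallel replica and one gloo DP group per tensor-parallel slice, instead of registering a single pair of precomputed lists. Assuming the flat rank list is laid out TP-major, so rank_list[i * tp_degree + j] holds DP index i and TP index j as in the indexing above, the bookkeeping can be checked in isolation:

def cpu_group_rank_lists(rank_list, dp_degree, tp_degree):
    # One TP group per DP replica i; one DP group per TP position j.
    tp_groups = [[rank_list[i * tp_degree + j] for j in range(tp_degree)]
                 for i in range(dp_degree)]
    dp_groups = [[rank_list[i * tp_degree + j] for i in range(dp_degree)]
                 for j in range(tp_degree)]
    return tp_groups, dp_groups

# dp_degree=2, tp_degree=4 over ranks 0..7:
#   tp_groups -> [[0, 1, 2, 3], [4, 5, 6, 7]]
#   dp_groups -> [[0, 4], [1, 5], [2, 6], [3, 7]]
print(cpu_group_rank_lists(list(range(8)), 2, 4))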