diff --git a/colossalai/zero/sharded_optim/low_level_optim.py b/colossalai/zero/sharded_optim/low_level_optim.py
index d174fc6ac..89f5f9fad 100644
--- a/colossalai/zero/sharded_optim/low_level_optim.py
+++ b/colossalai/zero/sharded_optim/low_level_optim.py
@@ -131,7 +131,9 @@ class LowLevelZeroOptimizer(ColossalaiOptimizer):
         # partition these param groups for data parallel training
         # and add buffers to parameter store for future access
         for group_id, param_group in enumerate(self.optim.param_groups):
-            group_params = param_group['params']
+            # keep only trainable parameters: frozen (requires_grad=False) params
+            # produce no gradients, so there is nothing for ZeRO to partition
+            group_params = [param for param in param_group['params'] if param.requires_grad]
 
             # add the fp16 params to fp16_param_groups for bookkeeping
             self._fp16_param_groups[group_id] = group_params