mirror of https://github.com/InternLM/InternLM

Merge branch 'develop' into feature_add_moe
commit d8e5397159
@@ -5,7 +5,7 @@ import torch
 from internlm.core.context import global_context as gpc

-DATASET_TYPE_IDS_MAP = {"en": 0, "cn": 1, "code": 2, "ja": 3, "ar": 4, "kaoshi": 5}
+DATASET_TYPE_IDS_MAP = {"en": 0, "cn": 1}


 def get_dataset_type_id(path):
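Note: DATASET_TYPE_IDS_MAP maps a dataset's directory name to an integer type id; the body of get_dataset_type_id is not part of this hunk. As a rough, self-contained illustration (the helper name resolve_dataset_type_id and the path convention below are assumptions, not the repository's implementation), such a lookup can be sketched as:

import re

DATASET_TYPE_IDS_MAP = {"en": 0, "cn": 1}  # trimmed map from this diff


def resolve_dataset_type_id(path: str) -> int:
    # Collect every type id whose key appears as a directory component of `path`.
    matches = [idx for key, idx in DATASET_TYPE_IDS_MAP.items() if re.search(rf"/{key}/", path)]
    assert len(matches) == 1, f"expected exactly one dataset type in {path}, got {matches}"
    return matches[0]


print(resolve_dataset_type_id("/data/tokenized/en/part-000.bin"))  # -> 0
print(resolve_dataset_type_id("/data/tokenized/cn/part-042.bin"))  # -> 1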
				
			
@@ -39,7 +39,7 @@ def get_default_parser():
     parser.add_argument("--local_rank", type=int, help="local rank on the node")
     parser.add_argument("--backend", type=str, default="nccl", help="backend for distributed communication")
     parser.add_argument("--seed", type=int, default=1024)
-    parser.add_argument("--profiling", default=True, action="store_true", help="enable/diable profiling.")
+    parser.add_argument("--profiling", default=False, action="store_true", help="enable/disable profiling.")
     return parser

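The profiling change fixes more than the "diable" typo: with action="store_true", a default of True means the flag is always on and can never be switched off from the command line, so False is the only default that makes the switch meaningful. A minimal stand-alone sketch (not InternLM's get_default_parser) of the difference:

import argparse

buggy = argparse.ArgumentParser()
buggy.add_argument("--profiling", default=True, action="store_true")   # always True

fixed = argparse.ArgumentParser()
fixed.add_argument("--profiling", default=False, action="store_true")  # off unless requested

print(buggy.parse_args([]).profiling)               # True, even without the flag
print(buggy.parse_args(["--profiling"]).profiling)  # True
print(fixed.parse_args([]).profiling)               # False
print(fixed.parse_args(["--profiling"]).profiling)  # True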
				
			
@@ -576,14 +576,16 @@ class HybridZeroOptimizer(BaseOptimizer):
             # The following operations are performed only on the rank to which parameters are assigned.
             if not self.param_group_has_params[group_id]:
                 continue
-            gradients = self._grad_store.get_averaged_gradients_by_group(group_id)

             # create flat gradient for the flat fp32 params
-            fp16_avg_grads = gradients
-            flat_fp16_avg_grads = flatten(fp16_avg_grads)
+            gradients = self._grad_store.get_averaged_gradients_by_group(group_id)
+            flat_fp16_avg_grads = flatten(gradients)
+            self._grad_store.reset_average_gradients_by_group(group_id)
+            del gradients  # release cuda memory

             dtype = self._fp32_flat_param_groups_of_current_rank[group_id].dtype
             flat_fp32_avg_grads = flat_fp16_avg_grads.to(dtype)
+            del flat_fp16_avg_grads  # release cuda memory

             param_shape = self._fp32_flat_param_groups_of_current_rank[group_id].shape
             assert (
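The reshuffled block above flattens the fp16 averaged gradients of a param group, converts the flat buffer to the fp32 master dtype, and drops each intermediate as soon as it is no longer referenced so its CUDA memory can be reclaimed before the optimizer step. A toy sketch of that pattern outside the optimizer (the local flatten helper below is a stand-in for the one imported by hybrid_zero_optim, and the shapes are made up):

import torch


def flatten(tensors):
    # Stand-in for the optimizer's flatten helper: concatenate into one 1-D tensor.
    return torch.cat([t.reshape(-1) for t in tensors])


# Toy per-parameter fp16 "averaged gradients" for one param group.
gradients = [torch.randn(4, 4, dtype=torch.float16), torch.randn(8, dtype=torch.float16)]

flat_fp16_avg_grads = flatten(gradients)
del gradients  # drop the per-parameter list once the flat copy exists

flat_fp32_avg_grads = flat_fp16_avg_grads.to(torch.float32)  # master-param dtype
del flat_fp16_avg_grads  # the flat fp16 buffer is no longer needed either

print(flat_fp32_avg_grads.shape, flat_fp32_avg_grads.dtype)  # torch.Size([24]) torch.float32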
				
			
@@ -593,8 +595,6 @@ class HybridZeroOptimizer(BaseOptimizer):
             single_grad_partition_groups.append(flat_fp32_avg_grads)
             device = self._fp32_flat_param_groups_of_current_rank[group_id].device
             self._fp32_flat_param_groups_of_current_rank[group_id].grad = flat_fp32_avg_grads.to(device)
-            self._grad_store._averaged_gradients[group_id] = []
-            self._grad_store._averaged_gradients[group_id] = []

         # unscale and clip grads
         # get the global norm
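Taken together, the two hunks move the buffer cleanup from a pair of duplicated `self._grad_store._averaged_gradients[group_id] = []` assignments at the end of the loop body to a single reset right after the gradients are flattened. A simplified, hypothetical stand-in for the gradient store (not the real class from the repository) shows the shape of that API:

import torch


class ToyGradStore:
    # Hypothetical, stripped-down stand-in for the optimizer's gradient store.
    def __init__(self):
        self._averaged_gradients = {}

    def add(self, group_id, grad):
        self._averaged_gradients.setdefault(group_id, []).append(grad)

    def get_averaged_gradients_by_group(self, group_id):
        return self._averaged_gradients.get(group_id, [])

    def reset_average_gradients_by_group(self, group_id):
        # A single explicit reset replaces the duplicated list assignments
        # that the second hunk deletes from the step loop.
        self._averaged_gradients[group_id] = []


store = ToyGradStore()
store.add(0, torch.ones(3, dtype=torch.float16))
flat = torch.cat([g.reshape(-1) for g in store.get_averaged_gradients_by_group(0)])
store.reset_average_gradients_by_group(0)
print(flat.dtype, store.get_averaged_gradients_by_group(0))  # torch.float16 []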