mirror of https://github.com/InternLM/InternLM
fix(moe): fix moe zero mode bug (#548)

* fix moe zero mode bugs
* update moe config to fit training on 8 GPUs

Branch: pull/562/head
Parent: bbb5651582
Commit: 513ebb9c3a
@@ -141,7 +141,7 @@ model = dict(
     layer_norm_epsilon=1e-5,
     use_flash_attn=True,
     num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
-    num_experts=8,
+    num_experts=4,
     moe_use_residual=False,
     moe_gate_k=2,
 )
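The hunk above lowers num_experts from 8 to 4, which the commit message ties to fitting MoE training on 8 GPUs; moe_gate_k=2 keeps top-2 routing, i.e. each token is sent to its two highest-scoring experts. As a rough illustration only (this is not InternLM's placement code; the helper and its names are hypothetical), experts are typically sharded evenly across an expert-parallel group, so the expert count bounds both per-rank memory and the valid group sizes:

# Hypothetical helper, for illustration only -- not part of InternLM.
# Assumption: experts are split evenly across an expert-parallel group.
def experts_per_rank(num_experts: int, expert_parallel_size: int) -> int:
    if num_experts % expert_parallel_size != 0:
        raise ValueError("num_experts must divide evenly across the expert-parallel group")
    return num_experts // expert_parallel_size

# With the updated config value:
print(experts_per_rank(num_experts=4, expert_parallel_size=4))  # 1 expert per rank
print(experts_per_rank(num_experts=4, expert_parallel_size=2))  # 2 experts per rank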
@@ -150,7 +150,7 @@ class HybridZeroOptimizer(BaseOptimizer):
             # if zero is used, expert dp group will use ParallelMode.EXPERT_DATA as the real zero mode
             zero_mode = (
                 ParallelMode.ZERO1
-                if param_group["dp_mode"] == gpc.get_world_size(ParallelMode.ZERO1) == 1 or ParallelMode.DATA
+                if gpc.get_world_size(ParallelMode.ZERO1) == 1 or param_group["dp_mode"] == ParallelMode.DATA
                 else ParallelMode.EXPERT_DATA
             )
             self._zero_local_rank.append(gpc.get_local_rank(zero_mode))
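Why the one-line change above matters: Python parses the old condition as (param_group["dp_mode"] == gpc.get_world_size(ParallelMode.ZERO1) == 1) or ParallelMode.DATA, and an enum member is always truthy, so the expression evaluated True for every parameter group and zero_mode never became ParallelMode.EXPERT_DATA. The fixed condition checks the ZeRO world size and the group's dp_mode separately. A minimal, self-contained reproduction of the pitfall (the enum below is a stand-in, not the real ParallelMode):

# Stand-in enum to reproduce the pitfall; the real code uses InternLM's ParallelMode.
from enum import Enum

class ParallelMode(Enum):
    DATA = "data"
    ZERO1 = "zero1"
    EXPERT_DATA = "expert_data"

dp_mode = ParallelMode.EXPERT_DATA   # an expert parameter group
zero1_world_size = 8                 # ZeRO is actually enabled

# Old condition: chained comparison plus a trailing `or <enum member>`.
# The enum member is always truthy, so the whole expression is always truthy.
old = dp_mode == zero1_world_size == 1 or ParallelMode.DATA
print(bool(old))  # True -- expert groups wrongly fall through to ZERO1

# Fixed condition: check the ZeRO world size and the group's dp_mode separately.
new = zero1_world_size == 1 or dp_mode == ParallelMode.DATA
print(new)        # False -- expert groups correctly use EXPERT_DATA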