mirror of https://github.com/InternLM/InternLM
				
				
				
			fix demo config to avoid implicit behavior
							parent
							
								
									649af64c59
								
							
						
					
					
						commit
						347370a58a
					
				| 
						 | 
				
			
			@ -145,18 +145,18 @@ model = dict(
 | 
			
		|||
    moe_use_residual=False,
 | 
			
		||||
    moe_gate_k=2,
 | 
			
		||||
)
 | 
			
		||||
"""
 | 
			
		||||
zero1 parallel:
 | 
			
		||||
    1. if zero1 <= 0, The size of the zero process group is equal to the size of the dp process group,
 | 
			
		||||
        so parameters will be divided within the range of dp.
 | 
			
		||||
    2. if zero1 == 1, zero is not used, and all dp groups retain the full amount of model parameters.
 | 
			
		||||
    3. zero1 > 1 and zero1 <= dp world size, the world size of zero is a subset of dp world size.
 | 
			
		||||
        For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
 | 
			
		||||
pipeline parallel (dict):
 | 
			
		||||
    1. size: int, the size of pipeline parallel.
 | 
			
		||||
    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler.
 | 
			
		||||
tensor parallel: tensor parallel size, usually the number of GPUs per node.
 | 
			
		||||
"""
 | 
			
		||||
 | 
			
		||||
# zero1 parallel:
 | 
			
		||||
#     1. if zero1 <= 0, the size of the zero process group is equal to the size of the dp process group,
 | 
			
		||||
#         so parameters will be divided within the range of dp.
 | 
			
		||||
#     2. if zero1 == 1, zero is not used, and all dp groups retain the full amount of model parameters.
 | 
			
		||||
#     3. zero1 > 1 and zero1 <= dp world size, the world size of zero is a subset of dp world size.
 | 
			
		||||
#         For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
 | 
			
		||||
# pipeline parallel (dict):
 | 
			
		||||
#     1. size: int, the size of pipeline parallel.
 | 
			
		||||
#     2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler.
 | 
			
		||||
# tensor parallel: tensor parallel size, usually the number of GPUs per node.
 | 
			
		||||
 | 
			
		||||
parallel = dict(
 | 
			
		||||
    zero1=dict(size=-1, fsdp=False),
 | 
			
		||||
    tensor=1,
 | 
			
		||||
| 
						 | 
				
			
			@ -176,4 +176,8 @@ monitor = dict(
 | 
			
		|||
    ),
 | 
			
		||||
)
 | 
			
		||||
 | 
			
		||||
model_type = "INTERNLM_MoE"
 | 
			
		||||
model_type = "INTERNLM_MoE"
 | 
			
		||||
 | 
			
		||||
# metric_dtype can be "fp32" or other string
 | 
			
		||||
# fp32 is used for metric calculation only when metric_dtype is set to "fp32" or left unset
 | 
			
		||||
metric_dtype = "fp32"
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -146,18 +146,18 @@ model = dict(
 | 
			
		|||
    use_flash_attn=True,
 | 
			
		||||
    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
 | 
			
		||||
)
 | 
			
		||||
"""
 | 
			
		||||
zero1 parallel:
 | 
			
		||||
    1. if zero1 <= 0, The size of the zero process group is equal to the size of the dp process group,
 | 
			
		||||
        so parameters will be divided within the range of dp.
 | 
			
		||||
    2. if zero1 == 1, zero is not used, and all dp groups retain the full amount of model parameters.
 | 
			
		||||
    3. zero1 > 1 and zero1 <= dp world size, the world size of zero is a subset of dp world size.
 | 
			
		||||
        For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
 | 
			
		||||
pipeline parallel (dict):
 | 
			
		||||
    1. size: int, the size of pipeline parallel.
 | 
			
		||||
    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler.
 | 
			
		||||
tensor parallel: tensor parallel size, usually the number of GPUs per node.
 | 
			
		||||
"""
 | 
			
		||||
 | 
			
		||||
# zero1 parallel:
 | 
			
		||||
#     1. if zero1 <= 0, the size of the zero process group is equal to the size of the dp process group,
 | 
			
		||||
#         so parameters will be divided within the range of dp.
 | 
			
		||||
#     2. if zero1 == 1, zero is not used, and all dp groups retain the full amount of model parameters.
 | 
			
		||||
#     3. zero1 > 1 and zero1 <= dp world size, the world size of zero is a subset of dp world size.
 | 
			
		||||
#         For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
 | 
			
		||||
# pipeline parallel (dict):
 | 
			
		||||
#     1. size: int, the size of pipeline parallel.
 | 
			
		||||
#     2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler.
 | 
			
		||||
# tensor parallel: tensor parallel size, usually the number of GPUs per node.
 | 
			
		||||
 | 
			
		||||
parallel = dict(
 | 
			
		||||
    zero1=dict(size=8, fsdp=False),
 | 
			
		||||
    tensor=1,
 | 
			
		||||
| 
						 | 
				
			
			@ -177,3 +177,7 @@ monitor = dict(
 | 
			
		|||
        alert_file_path=f"llm_alter/{JOB_NAME}_alert.log",
 | 
			
		||||
    ),
 | 
			
		||||
)
 | 
			
		||||
 | 
			
		||||
# metric_dtype can be "fp32" or other string
 | 
			
		||||
# fp32 is used for metric calculation only when metric_dtype is set to "fp32" or left unset
 | 
			
		||||
metric_dtype = "fp32"
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
		Reference in New Issue