mirror of https://github.com/InternLM/InternLM
				
				
				
			fix/ci train error (#200)
* fix(ci): fix ci train error
* fix(ci): fix ci train error
* fix(ci): fix ci train error

pull/204/head
							parent
							
								
									db13bc46bc
								
							
						
					
					
						commit
						5f2381af62
					
				| 
						 | 
				
			
			@ -15,6 +15,7 @@ MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
 | 
			
		|||
SAVE_CKPT_FOLDER = "local:llm_ckpts"
 | 
			
		||||
# LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
 | 
			
		||||
ckpt = dict(
 | 
			
		||||
    enable_save_ckpt=True,
 | 
			
		||||
    # Path to save training ckpt.
 | 
			
		||||
    save_ckpt_folder=SAVE_CKPT_FOLDER,
 | 
			
		||||
    # Path to continue training ckpt (load model weights and scheduler/context states).
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -108,6 +108,9 @@ def args_sanity_check():
 | 
			
		|||
        logger.info(f"valid_every: {data.valid_every}")
 | 
			
		||||
 | 
			
		||||
    # processing the checkpoint config
 | 
			
		||||
    if "enable_save_ckpt" not in gpc.config.ckpt:
 | 
			
		||||
        gpc.config.ckpt._add_item("enable_save_ckpt", False)
 | 
			
		||||
 | 
			
		||||
    if "checkpoint_every" not in gpc.config.ckpt or gpc.config.ckpt.checkpoint_every <= 0:
 | 
			
		||||
        gpc.config.ckpt._add_item("checkpoint_every", float("inf"))
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -125,18 +128,16 @@ def args_sanity_check():
 | 
			
		|||
 | 
			
		||||
    if "async_upload" not in gpc.config.ckpt:
 | 
			
		||||
        gpc.config.ckpt._add_item("async_upload", False)
 | 
			
		||||
    else:
 | 
			
		||||
        if gpc.config.ckpt.async_upload:
 | 
			
		||||
            assert "save_ckpt_folder" in gpc.config.ckpt
 | 
			
		||||
            if "boto3:" not in gpc.config.ckpt.save_ckpt_folder:
 | 
			
		||||
                if gpc.is_rank_for_log():
 | 
			
		||||
                    logger.warning(
 | 
			
		||||
                        "Storing ckpt on file system does not support asynchronous storage, will use sync save!"
 | 
			
		||||
                    )
 | 
			
		||||
                gpc.config.ckpt.async_upload = False
 | 
			
		||||
            else:
 | 
			
		||||
                if "async_upload_tmp_folder" not in gpc.config.ckpt:
 | 
			
		||||
                    gpc.config.ckpt._add_item("async_upload_tmp_folder", "/dev/shm/internlm_tmp_ckpt/")
 | 
			
		||||
 | 
			
		||||
    if "async_upload_tmp_folder" not in gpc.config.ckpt:
 | 
			
		||||
        gpc.config.ckpt._add_item("async_upload_tmp_folder", "/dev/shm/internlm_tmp_ckpt/")
 | 
			
		||||
 | 
			
		||||
    if gpc.config.ckpt.async_upload:
 | 
			
		||||
        assert "save_ckpt_folder" in gpc.config.ckpt
 | 
			
		||||
        if "boto3:" not in gpc.config.ckpt.save_ckpt_folder:
 | 
			
		||||
            if gpc.is_rank_for_log():
 | 
			
		||||
                logger.warning("Storing ckpt on file system does not support asynchronous storage, will use sync save!")
 | 
			
		||||
            gpc.config.ckpt.async_upload = False
 | 
			
		||||
 | 
			
		||||
    if "snapshot_ckpt_folder" not in gpc.config.ckpt:
 | 
			
		||||
        gpc.config.ckpt._add_item("snapshot_ckpt_folder", os.path.join(gpc.config.ckpt.save_ckpt_folder, "snapshot"))
 | 
			
		||||
| 
						 | 
				
			
			@ -149,14 +150,14 @@ def args_sanity_check():
 | 
			
		|||
        gpc.config.ckpt.load_ckpt_folder is not None and gpc.config.ckpt.load_model_only_folder is not None
 | 
			
		||||
    ), "'load_ckpt_folder' and 'load_model_only_folder' cannot be set at the same time."
 | 
			
		||||
 | 
			
		||||
    if "enable_save_ckpt" not in gpc.config.ckpt:
 | 
			
		||||
        gpc.config.ckpt._add_item("enable_save_ckpt", False)
 | 
			
		||||
 | 
			
		||||
    if gpc.is_rank_for_log():
 | 
			
		||||
        logger.info("+" * 15 + " Ckpt Info " + "+" * 15)  # pylint: disable=W1201
 | 
			
		||||
        logger.info(f"is enable save ckpt: {gpc.config.ckpt.enable_save_ckpt}")
 | 
			
		||||
        logger.info(f"save_ckpt_folder: {gpc.config.ckpt.save_ckpt_folder}")
 | 
			
		||||
        logger.info(f"checkpoint_every: {gpc.config.ckpt.checkpoint_every}")
 | 
			
		||||
        logger.info(f"async_upload: {gpc.config.ckpt.async_upload}")
 | 
			
		||||
        if gpc.config.ckpt.async_upload:
 | 
			
		||||
            logger.info(f"async_upload_tmp_folder: {gpc.config.ckpt.async_upload_tmp_folder}")
 | 
			
		||||
 | 
			
		||||
    # initialization storage manager
 | 
			
		||||
    init_storage_manager(gpc.config.ckpt)
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -383,12 +383,12 @@ class StorageManager(metaclass=SingletonMeta):
 | 
			
		|||
    }
 | 
			
		||||
    CLI_DICT = {}
 | 
			
		||||
 | 
			
		||||
    def __init__(self, enable_save, tmp_local_folde="/dev/shm/test/", async_mode=True, n_async_workers=8) -> None:
 | 
			
		||||
    def __init__(self, enable_save, tmp_local_folder="/dev/shm/test/", async_mode=True, n_async_workers=8) -> None:
 | 
			
		||||
        self._exception_list = []
 | 
			
		||||
        self._to_be_del_files = []
 | 
			
		||||
        self._async_stack = []
 | 
			
		||||
        self.upload_count = 0
 | 
			
		||||
        self.tmp_local_folder = tmp_local_folde
 | 
			
		||||
        self.tmp_local_folder = tmp_local_folder
 | 
			
		||||
        self.async_mode = async_mode
 | 
			
		||||
        self.has_warning = False
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -523,7 +523,6 @@ class StorageManager(metaclass=SingletonMeta):
 | 
			
		|||
                    pass
 | 
			
		||||
 | 
			
		||||
    async def _sync_tasks(self) -> Awaitable[None]:
 | 
			
		||||
 | 
			
		||||
        if not self._async_stack:
 | 
			
		||||
            return
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -591,7 +590,7 @@ def init_storage_manager(ckpt_config):
 | 
			
		|||
    global storage_manager
 | 
			
		||||
    storage_manager = StorageManager(
 | 
			
		||||
        ckpt_config.enable_save_ckpt,
 | 
			
		||||
        tmp_local_folde=ckpt_config.async_upload_tmp_folder,
 | 
			
		||||
        tmp_local_folder=ckpt_config.async_upload_tmp_folder,
 | 
			
		||||
        async_mode=ckpt_config.async_upload,
 | 
			
		||||
    )
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
		Reference in New Issue