diff --git a/ci_scripts/train/ci_7B_sft.py b/ci_scripts/train/ci_7B_sft.py index bc881c0..4f8477f 100644 --- a/ci_scripts/train/ci_7B_sft.py +++ b/ci_scripts/train/ci_7B_sft.py @@ -15,6 +15,7 @@ MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" SAVE_CKPT_FOLDER = "local:llm_ckpts" # LOAD_CKPT_FOLDER = "local:llm_ckpts/49" ckpt = dict( + enable_save_ckpt=True, # Path to save training ckpt. save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to continue training ckpt (load model weights and scheduler/context states). diff --git a/internlm/initialize/launch.py b/internlm/initialize/launch.py index 9955a96..a2bc833 100644 --- a/internlm/initialize/launch.py +++ b/internlm/initialize/launch.py @@ -108,6 +108,9 @@ def args_sanity_check(): logger.info(f"valid_every: {data.valid_every}") # processing the checkpoint config + if "enable_save_ckpt" not in gpc.config.ckpt: + gpc.config.ckpt._add_item("enable_save_ckpt", False) + if "checkpoint_every" not in gpc.config.ckpt or gpc.config.ckpt.checkpoint_every <= 0: gpc.config.ckpt._add_item("checkpoint_every", float("inf")) @@ -125,18 +128,16 @@ def args_sanity_check(): if "async_upload" not in gpc.config.ckpt: gpc.config.ckpt._add_item("async_upload", False) - else: - if gpc.config.ckpt.async_upload: - assert "save_ckpt_folder" in gpc.config.ckpt - if "boto3:" not in gpc.config.ckpt.save_ckpt_folder: - if gpc.is_rank_for_log(): - logger.warning( - "Storing ckpt on file system does not support asynchronous storage, will use sync save!" - ) - gpc.config.ckpt.async_upload = False - else: - if "async_upload_tmp_folder" not in gpc.config.ckpt: - gpc.config.ckpt._add_item("async_upload_tmp_folder", "/dev/shm/internlm_tmp_ckpt/") + + if "async_upload_tmp_folder" not in gpc.config.ckpt: + gpc.config.ckpt._add_item("async_upload_tmp_folder", "/dev/shm/internlm_tmp_ckpt/") + + if gpc.config.ckpt.async_upload: + assert "save_ckpt_folder" in gpc.config.ckpt + if "boto3:" not in gpc.config.ckpt.save_ckpt_folder: + if gpc.is_rank_for_log(): + logger.warning("Storing ckpt on file system does not support asynchronous storage, will use sync save!") + gpc.config.ckpt.async_upload = False if "snapshot_ckpt_folder" not in gpc.config.ckpt: gpc.config.ckpt._add_item("snapshot_ckpt_folder", os.path.join(gpc.config.ckpt.save_ckpt_folder, "snapshot")) @@ -149,14 +150,14 @@ def args_sanity_check(): gpc.config.ckpt.load_ckpt_folder is not None and gpc.config.ckpt.load_model_only_folder is not None ), "'load_ckpt_folder' and 'load_model_only_folder' cannot be set at the same time." - if "enable_save_ckpt" not in gpc.config.ckpt: - gpc.config.ckpt._add_item("enable_save_ckpt", False) - if gpc.is_rank_for_log(): logger.info("+" * 15 + " Ckpt Info " + "+" * 15) # pylint: disable=W1201 logger.info(f"is enable save ckpt: {gpc.config.ckpt.enable_save_ckpt}") logger.info(f"save_ckpt_folder: {gpc.config.ckpt.save_ckpt_folder}") logger.info(f"checkpoint_every: {gpc.config.ckpt.checkpoint_every}") + logger.info(f"async_upload: {gpc.config.ckpt.async_upload}") + if gpc.config.ckpt.async_upload: + logger.info(f"async_upload_tmp_folder: {gpc.config.ckpt.async_upload_tmp_folder}") # initialization storage manager init_storage_manager(gpc.config.ckpt) diff --git a/internlm/utils/storage_manager.py b/internlm/utils/storage_manager.py index 481bd28..c9b42ea 100644 --- a/internlm/utils/storage_manager.py +++ b/internlm/utils/storage_manager.py @@ -383,12 +383,12 @@ class StorageManager(metaclass=SingletonMeta): } CLI_DICT = {} - def __init__(self, enable_save, tmp_local_folde="/dev/shm/test/", async_mode=True, n_async_workers=8) -> None: + def __init__(self, enable_save, tmp_local_folder="/dev/shm/test/", async_mode=True, n_async_workers=8) -> None: self._exception_list = [] self._to_be_del_files = [] self._async_stack = [] self.upload_count = 0 - self.tmp_local_folder = tmp_local_folde + self.tmp_local_folder = tmp_local_folder self.async_mode = async_mode self.has_warning = False @@ -523,7 +523,6 @@ class StorageManager(metaclass=SingletonMeta): pass async def _sync_tasks(self) -> Awaitable[None]: - if not self._async_stack: return @@ -591,7 +590,7 @@ def init_storage_manager(ckpt_config): global storage_manager storage_manager = StorageManager( ckpt_config.enable_save_ckpt, - tmp_local_folde=ckpt_config.async_upload_tmp_folder, + tmp_local_folder=ckpt_config.async_upload_tmp_folder, async_mode=ckpt_config.async_upload, )