fix/ci train error (#200)

* fix(ci): fix ci train error

* fix(ci): fix ci train error

* fix(ci): fix ci train error
pull/204/head
huangting4201 2023-08-16 11:11:27 +08:00 committed by GitHub
parent db13bc46bc
commit 5f2381af62
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 20 additions and 19 deletions

View File

@ -15,6 +15,7 @@ MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
SAVE_CKPT_FOLDER = "local:llm_ckpts"
# LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
ckpt = dict(
enable_save_ckpt=True,
# Path to save training ckpt.
save_ckpt_folder=SAVE_CKPT_FOLDER,
# Path of the ckpt to resume training from (loads model weights and scheduler/context states).

View File

@ -108,6 +108,9 @@ def args_sanity_check():
logger.info(f"valid_every: {data.valid_every}")
# processing the checkpoint config
if "enable_save_ckpt" not in gpc.config.ckpt:
gpc.config.ckpt._add_item("enable_save_ckpt", False)
if "checkpoint_every" not in gpc.config.ckpt or gpc.config.ckpt.checkpoint_every <= 0:
gpc.config.ckpt._add_item("checkpoint_every", float("inf"))
@ -125,18 +128,16 @@ def args_sanity_check():
if "async_upload" not in gpc.config.ckpt:
gpc.config.ckpt._add_item("async_upload", False)
else:
if gpc.config.ckpt.async_upload:
assert "save_ckpt_folder" in gpc.config.ckpt
if "boto3:" not in gpc.config.ckpt.save_ckpt_folder:
if gpc.is_rank_for_log():
logger.warning(
"Storing ckpt on file system does not support asynchronous storage, will use sync save!"
)
gpc.config.ckpt.async_upload = False
else:
if "async_upload_tmp_folder" not in gpc.config.ckpt:
gpc.config.ckpt._add_item("async_upload_tmp_folder", "/dev/shm/internlm_tmp_ckpt/")
if "async_upload_tmp_folder" not in gpc.config.ckpt:
gpc.config.ckpt._add_item("async_upload_tmp_folder", "/dev/shm/internlm_tmp_ckpt/")
if gpc.config.ckpt.async_upload:
assert "save_ckpt_folder" in gpc.config.ckpt
if "boto3:" not in gpc.config.ckpt.save_ckpt_folder:
if gpc.is_rank_for_log():
logger.warning("Storing ckpt on file system does not support asynchronous storage, will use sync save!")
gpc.config.ckpt.async_upload = False
if "snapshot_ckpt_folder" not in gpc.config.ckpt:
gpc.config.ckpt._add_item("snapshot_ckpt_folder", os.path.join(gpc.config.ckpt.save_ckpt_folder, "snapshot"))
@ -149,14 +150,14 @@ def args_sanity_check():
gpc.config.ckpt.load_ckpt_folder is not None and gpc.config.ckpt.load_model_only_folder is not None
), "'load_ckpt_folder' and 'load_model_only_folder' cannot be set at the same time."
if "enable_save_ckpt" not in gpc.config.ckpt:
gpc.config.ckpt._add_item("enable_save_ckpt", False)
if gpc.is_rank_for_log():
logger.info("+" * 15 + " Ckpt Info " + "+" * 15) # pylint: disable=W1201
logger.info(f"is enable save ckpt: {gpc.config.ckpt.enable_save_ckpt}")
logger.info(f"save_ckpt_folder: {gpc.config.ckpt.save_ckpt_folder}")
logger.info(f"checkpoint_every: {gpc.config.ckpt.checkpoint_every}")
logger.info(f"async_upload: {gpc.config.ckpt.async_upload}")
if gpc.config.ckpt.async_upload:
logger.info(f"async_upload_tmp_folder: {gpc.config.ckpt.async_upload_tmp_folder}")
# initialize the storage manager
init_storage_manager(gpc.config.ckpt)

View File

@ -383,12 +383,12 @@ class StorageManager(metaclass=SingletonMeta):
}
CLI_DICT = {}
def __init__(self, enable_save, tmp_local_folde="/dev/shm/test/", async_mode=True, n_async_workers=8) -> None:
def __init__(self, enable_save, tmp_local_folder="/dev/shm/test/", async_mode=True, n_async_workers=8) -> None:
self._exception_list = []
self._to_be_del_files = []
self._async_stack = []
self.upload_count = 0
self.tmp_local_folder = tmp_local_folde
self.tmp_local_folder = tmp_local_folder
self.async_mode = async_mode
self.has_warning = False
@ -523,7 +523,6 @@ class StorageManager(metaclass=SingletonMeta):
pass
async def _sync_tasks(self) -> Awaitable[None]:
if not self._async_stack:
return
@ -591,7 +590,7 @@ def init_storage_manager(ckpt_config):
global storage_manager
storage_manager = StorageManager(
ckpt_config.enable_save_ckpt,
tmp_local_folde=ckpt_config.async_upload_tmp_folder,
tmp_local_folder=ckpt_config.async_upload_tmp_folder,
async_mode=ckpt_config.async_upload,
)