mirror of https://github.com/InternLM/InternLM
auto resume
parent
68159c22a4
commit
66e4a8a847
|
@ -820,7 +820,8 @@ now step_count is {train_state.step_count}",
|
||||||
torch.distributed.barrier()
|
torch.distributed.barrier()
|
||||||
|
|
||||||
def query_latest_snapshot_step_boto3(self):
|
def query_latest_snapshot_step_boto3(self):
|
||||||
"""query_latest_snapshot_step_boto3
|
"""Query the latest snapshot step from the storage backend.
|
||||||
|
Currently, we only support the following storage backends: boto3, oss2 and volc.
|
||||||
Returns:
|
Returns:
|
||||||
Tuple(str, int): path of latest ckpt and ckpt step, if not found, None will return.
|
Tuple(str, int): path of latest ckpt and ckpt step, if not found, None will return.
|
||||||
"""
|
"""
|
||||||
|
@ -878,6 +879,7 @@ now step_count is {train_state.step_count}",
|
||||||
return load_path, max(snap_step, max_normal_step)
|
return load_path, max(snap_step, max_normal_step)
|
||||||
|
|
||||||
def query_latest_snapshot_step_local(self):
|
def query_latest_snapshot_step_local(self):
|
||||||
|
"""Query the latest snapshot step from the local file system."""
|
||||||
max_step, max_step_path = 0, None
|
max_step, max_step_path = 0, None
|
||||||
save_ckpt_folder = self.save_ckpt_folder.split(":")[1]
|
save_ckpt_folder = self.save_ckpt_folder.split(":")[1]
|
||||||
for root, _, files in os.walk(save_ckpt_folder, followlinks=True):
|
for root, _, files in os.walk(save_ckpt_folder, followlinks=True):
|
||||||
|
@ -894,26 +896,22 @@ now step_count is {train_state.step_count}",
|
||||||
return max_step_path, max_step
|
return max_step_path, max_step
|
||||||
|
|
||||||
def query_lastest_ckpt(self):
|
def query_lastest_ckpt(self):
|
||||||
|
"""Query the latest ckpt via the storage backend."""
|
||||||
latest_ckpt, step = None, -1
|
latest_ckpt, step = None, -1
|
||||||
# Training was automatically restarted by the process, forcing the latest snapshot to be read.
|
# Training was automatically restarted by the process, forcing the latest snapshot to be read.
|
||||||
if self.save_ckpt_folder:
|
if self.save_ckpt_folder:
|
||||||
backend, _ = try_get_storage_backend(self.save_ckpt_folder)
|
backend, _ = try_get_storage_backend(self.save_ckpt_folder)
|
||||||
if backend == "boto3":
|
if backend in ["boto3", "oss2", "volc"]:
|
||||||
latest_ckpt, step = self.query_latest_snapshot_step_boto3()
|
latest_ckpt, step = self.query_latest_snapshot_step_boto3()
|
||||||
if latest_ckpt and not latest_ckpt.startswith("boto3:"):
|
|
||||||
latest_ckpt = ":".join(["boto3", latest_ckpt])
|
|
||||||
elif backend == "oss2":
|
|
||||||
latest_ckpt, step = self.query_latest_snapshot_step_boto3()
|
|
||||||
if latest_ckpt and not latest_ckpt.startswith("oss2:"):
|
|
||||||
latest_ckpt = ":".join(["oss2", latest_ckpt])
|
|
||||||
elif backend == "volc":
|
|
||||||
latest_ckpt, step = self.query_latest_snapshot_step_boto3()
|
|
||||||
if latest_ckpt and not latest_ckpt.startswith("volc:"):
|
|
||||||
latest_ckpt = ":".join(["volc", latest_ckpt])
|
|
||||||
elif backend == "local":
|
elif backend == "local":
|
||||||
latest_ckpt, step = self.query_latest_snapshot_step_local()
|
latest_ckpt, step = self.query_latest_snapshot_step_local()
|
||||||
if latest_ckpt and not latest_ckpt.startswith("local:"):
|
else:
|
||||||
latest_ckpt = ":".join(["local", latest_ckpt])
|
raise NotImplementedError(
|
||||||
|
f"Unsupported backend: {backend}, " "Currently only support `boto3`, `oss2`, `volc` and `local`"
|
||||||
|
)
|
||||||
|
|
||||||
|
if latest_ckpt and not latest_ckpt.startswith(backend):
|
||||||
|
latest_ckpt = ":".join([backend, latest_ckpt])
|
||||||
|
|
||||||
if gpc.is_rank_for_log():
|
if gpc.is_rank_for_log():
|
||||||
logger.info(f"Found latest ckpt {latest_ckpt if latest_ckpt else 'None'}, step: {step}...")
|
logger.info(f"Found latest ckpt {latest_ckpt if latest_ckpt else 'None'}, step: {step}...")
|
||||||
|
|
Loading…
Reference in New Issue