mirror of https://github.com/InternLM/InternLM
volc_path (#454)
parent
87a3c5c374
commit
e6d8ebc3e5
|
@ -4,6 +4,7 @@
|
||||||
# adopted from https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/engine
|
# adopted from https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/engine
|
||||||
|
|
||||||
import json
|
import json
|
||||||
|
import os
|
||||||
from collections import deque
|
from collections import deque
|
||||||
from typing import Iterable, Optional
|
from typing import Iterable, Optional
|
||||||
|
|
||||||
|
@ -120,13 +121,17 @@ class TrainState:
|
||||||
self.resume_tb_folder = other_stuffs.get("tensorboard_folder", None)
|
self.resume_tb_folder = other_stuffs.get("tensorboard_folder", None)
|
||||||
|
|
||||||
def state_dict(self):
|
def state_dict(self):
|
||||||
|
if os.environ.get("CLUSTER_NAME") == "volc" and os.environ.get("petrelfs_tb_path") is not None:
|
||||||
|
tensorboard_folder = os.path.join(os.environ["petrelfs_tb_path"], os.environ["MLP_TASK_ID"])
|
||||||
|
else:
|
||||||
|
tensorboard_folder = self.tensorboard_folder
|
||||||
return {
|
return {
|
||||||
"batch_count": self.batch_count,
|
"batch_count": self.batch_count,
|
||||||
"num_consumed_samples_in_epoch": self.num_consumed_samples_in_epoch,
|
"num_consumed_samples_in_epoch": self.num_consumed_samples_in_epoch,
|
||||||
"num_consumed_tokens": self.num_consumed_tokens,
|
"num_consumed_tokens": self.num_consumed_tokens,
|
||||||
"inf_nan_skip_batches": self.inf_nan_skip_batches,
|
"inf_nan_skip_batches": self.inf_nan_skip_batches,
|
||||||
"step_count": self.step_count,
|
"step_count": self.step_count,
|
||||||
"tensorboard_folder": self.tensorboard_folder,
|
"tensorboard_folder": tensorboard_folder,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue