fix num_train_step update

branch grpo_dev
YeAnbang 2025-02-20 18:24:04 +08:00
parent 0171884664
commit 53834b74b9
7 changed files with 10 additions and 10 deletions
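The pattern is the same across all seven trainers: self.num_train_step is now incremented exactly once per optimizer step, at the end of the step after logging and the accumulative-meter reset, instead of at the top of the step or inside a conditional block, and the step-gated conditions (sample logging every 10 steps, checkpointing every save_interval steps) are adjusted to match the new counting.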


@@ -380,9 +380,9 @@ class DPOTrainer(SLTrainer):
                        self.accumulative_meter.get("accuracy"),
                        global_step,
                    )
-                   self.num_train_step += 1
                    self.accumulative_meter.reset()
+                   self.num_train_step += 1

                if self.save_dir is not None and self.num_train_step > 0 and self.num_train_step % self.save_interval == 0:
                    # save checkpoint
                    self.coordinator.print_on_master("\nStart saving model checkpoint with running states")
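With the increment moved after the meter reset but before the save check, num_train_step counts completed optimizer steps, so the modulo condition fires every save_interval-th completed step. A standalone sketch of the cadence (hypothetical save_interval value, not the trainer code):

    save_interval = 3  # hypothetical value for illustration
    num_train_step = 0
    for _ in range(6):
        # ... optimizer step, logging, accumulative_meter.reset() ...
        num_train_step += 1  # counts completed optimizer steps
        if num_train_step > 0 and num_train_step % save_interval == 0:
            print(f"save checkpoint at step {num_train_step}")  # fires at steps 3 and 6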


@@ -231,7 +231,6 @@ class GRPOTrainer(OLTrainer):
         experience:
             sequences: [batch_size, prompt_length + response_length] --- <PAD>...<PAD><PROMPT>...<PROMPT><RESPONSE>...<RESPONSE><PAD>...<PAD>
         """
-        self.num_train_step += 1
         self.actor.train()
         num_actions = experience.action_log_probs.size(1)
         # policy loss
@@ -294,7 +293,7 @@ class GRPOTrainer(OLTrainer):
             self.temperature_annealing_scheduler.step_forward()
         # preparing logging model output and corresponding rewards.
-        if self.num_train_step % 10 == 1:
+        if self.num_train_step % 10 == 0:
             response_text = self.experience_maker.tokenizer.batch_decode(
                 experience.sequences, skip_special_tokens=True
             )
@@ -327,6 +326,7 @@ class GRPOTrainer(OLTrainer):
             self.writer.add_scalar("approx_kl", self.accumulative_meter.get("kl"), global_step)
             self.writer.add_scalar("advantages", self.accumulative_meter.get("advantages"), global_step)
         self.accumulative_meter.reset()
+        self.num_train_step += 1

     def _learn(self, update_step: int):
         """


@@ -256,7 +256,7 @@ class KTOTrainer(SLTrainer):
                    self.coordinator.print_on_master(
                        f"Saved checkpoint at epoch {epoch} step {self.save_interval} at folder {self.save_dir}"
                    )
-           self.num_train_step += 1
+               self.num_train_step += 1

        step_bar.close()
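The removed and added lines here are textually identical; the change is indentation-only (whitespace in the rendered diff). Read together with the other hunks, the increment apparently moves into the gradient-accumulation block so that num_train_step advances once per optimizer step rather than once per micro-batch; the ORPOTrainer and RewardModelTrainer hunks below carry the same whitespace-only move.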


@@ -233,7 +233,7 @@ class ORPOTrainer(SLTrainer):
                    self.coordinator.print_on_master(
                        f"Saved checkpoint at epoch {epoch} step {self.save_interval} at folder {self.save_dir}"
                    )
-           self.num_train_step += 1
+               self.num_train_step += 1

        step_bar.close()


@@ -220,7 +220,6 @@ class PPOTrainer(OLTrainer):
         experience:
             sequences: [batch_size, prompt_length + response_length] --- <PAD>...<PAD><PROMPT>...<PROMPT><RESPONSE>...<RESPONSE><PAD>...<PAD>
         """
-        self.num_train_step += 1
         self.actor.train()
         self.critic.train()
         num_actions = experience.action_log_probs.size(1)
@@ -294,7 +293,7 @@ class PPOTrainer(OLTrainer):
             self.critic_scheduler.step()
         # preparing logging model output and corresponding rewards.
-        if self.num_train_step % 10 == 1:
+        if self.num_train_step % 10 == 0:
             response_text = self.experience_maker.tokenizer.batch_decode(
                 experience.sequences, skip_special_tokens=True
             )
@@ -336,6 +335,7 @@ class PPOTrainer(OLTrainer):
             self.writer.add_scalar("value", self.accumulative_meter.get("value"), self.num_train_step)
             self.writer.add_scalar("advantages", self.accumulative_meter.get("advantages"), self.num_train_step)
         self.accumulative_meter.reset()
+        self.num_train_step += 1

     def _learn(self, update_step: int):
         """


@@ -193,7 +193,7 @@ class RewardModelTrainer(SLTrainer):
                    self.coordinator.print_on_master(
                        f"Saved checkpoint at epoch {epoch} step {(i + 1)/self.accumulation_steps} at folder {self.save_dir}"
                    )
-           self.num_train_step += 1
+               self.num_train_step += 1

        step_bar.close()

    def _eval(self, epoch):


@@ -152,9 +152,9 @@ class SFTTrainer(SLTrainer):
                if self.writer:
                    self.writer.add_scalar("train/loss", self.accumulative_meter.get("loss"), global_step)
                    self.writer.add_scalar("train/lr", self.scheduler.get_last_lr()[0], global_step)
-                   self.num_train_step += 1
                self.accumulative_meter.reset()
                step_bar.update()
+               self.num_train_step += 1

            # Save checkpoint
            if (
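This hunk also fixes a latent counting bug: the old increment sat inside the if self.writer: block, so a run without a writer never advanced num_train_step and any step-gated logic downstream could never fire. A standalone sketch of the failure mode (hypothetical save_interval, not the trainer code):

    def run(writer, steps=4, save_interval=2):
        num_train_step = 0
        saved = []
        for _ in range(steps):
            # ... training step ...
            if writer:  # old placement: counter only advances when logging
                num_train_step += 1
            if num_train_step > 0 and num_train_step % save_interval == 0:
                saved.append(num_train_step)
        return saved

    print(run(writer=object()))  # [2, 4] -- checkpoints fire as expected
    print(run(writer=None))      # []     -- counter stuck at 0, nothing fires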