[chat] typo accimulation_steps -> accumulation_steps (#3662)

pull/3679/head
tanitna 2023-04-28 00:42:57 -07:00 committed by GitHub
parent 816add7e7f
commit 1a60dc07a8
6 changed files with 18 additions and 18 deletions

@@ -251,7 +251,7 @@ trainer = SFTTrainer(model=model,
eval_dataloader=eval_dataloader,
batch_size=args.batch_size,
max_epochs=args.max_epochs,
- accimulation_steps = args.accimulation_steps
+ accumulation_steps = args.accumulation_steps
)
trainer.fit()
@@ -278,7 +278,7 @@ torchrun --standalone --nproc_per_node=1 train_sft.py \
--save_path /path/to/Coati-7B \
--dataset /path/to/data.json \
--batch_size 1 \
- --accimulation_steps 8 \
+ --accumulation_steps 8 \
--lr 2e-5 \
--max_datasets_size 512 \
--max_epochs 1 \
@@ -296,7 +296,7 @@ torchrun --standalone --nproc_per_node=1 train_sft.py \
--save_path /path/to/Coati-7B \
--dataset /path/to/data.json \
--batch_size 1 \
- --accimulation_steps 8 \
+ --accumulation_steps 8 \
--lr 2e-5 \
--max_datasets_size 512 \
--max_epochs 1 \
@@ -313,7 +313,7 @@ torchrun --standalone --nproc_per_node=4 train_sft.py \
--save_path /path/to/Coati-7B \
--dataset /path/to/data.json \
--batch_size 1 \
- --accimulation_steps 8 \
+ --accumulation_steps 8 \
--lr 2e-5 \
--max_datasets_size 512 \
--max_epochs 1 \

@@ -41,10 +41,10 @@ class SFTTrainer(Trainer):
train_dataloader: DataLoader,
eval_dataloader: DataLoader = None,
max_epochs: int = 2,
- accimulation_steps: int = 8,
+ accumulation_steps: int = 8,
callbacks: List[Callback] = [],
) -> None:
- if accimulation_steps > 1 and isinstance(strategy, ColossalAIStrategy) and strategy.stage == 3:
+ if accumulation_steps > 1 and isinstance(strategy, ColossalAIStrategy) and strategy.stage == 3:
raise ValueError("Accumulation steps are not supported in stage 3 of ColossalAI")
super().__init__(strategy, max_epochs, callbacks=callbacks)
self.train_dataloader = train_dataloader
@@ -52,8 +52,8 @@ class SFTTrainer(Trainer):
self.model = model
self.optimizer = optim
- self.accimulation_steps = accimulation_steps
- num_update_steps_per_epoch = len(train_dataloader) // self.accimulation_steps
+ self.accumulation_steps = accumulation_steps
+ num_update_steps_per_epoch = len(train_dataloader) // self.accumulation_steps
max_steps = math.ceil(self.max_epochs * num_update_steps_per_epoch)
self.scheduler = get_scheduler("cosine",
@@ -67,7 +67,7 @@
wandb.watch(self.model)
total_loss = 0
# epoch_bar = tqdm(range(self.epochs), desc='Epochs', disable=not is_rank_0())
- step_bar = tqdm(range(len(self.train_dataloader) // self.accimulation_steps * self.max_epochs),
+ step_bar = tqdm(range(len(self.train_dataloader) // self.accumulation_steps * self.max_epochs),
desc=f'steps',
disable=not is_rank_0())
for epoch in range(self.max_epochs):
@@ -85,20 +85,20 @@
if loss >= 2.5 and is_rank_0():
logger.warning(f"batch_id:{batch_id}, abnormal loss: {loss}")
- loss = loss / self.accimulation_steps
+ loss = loss / self.accumulation_steps
self.strategy.backward(loss, self.model, self.optimizer)
total_loss += loss.item()
# gradient accumulation
- if (batch_id + 1) % self.accimulation_steps == 0:
+ if (batch_id + 1) % self.accumulation_steps == 0:
self.strategy.optimizer_step(self.optimizer)
self.optimizer.zero_grad()
self.scheduler.step()
if is_rank_0() and use_wandb:
wandb.log({
"loss": total_loss / self.accimulation_steps,
"loss": total_loss / self.accumulation_steps,
"lr": self.scheduler.get_last_lr()[0],
"epoch": epoch,
"batch_id": batch_id

@@ -62,7 +62,7 @@ torchrun --standalone --nproc_per_node=4 train_sft.py \
--save_path /path/to/Coati-7B \
--dataset /path/to/data.json \
--batch_size 4 \
- --accimulation_steps 8 \
+ --accumulation_steps 8 \
--lr 2e-5 \
--max_datasets_size 512 \
--max_epochs 1 \

@@ -154,7 +154,7 @@ def train(args):
eval_dataloader=eval_dataloader,
batch_size=args.batch_size,
max_epochs=args.max_epochs,
- accimulation_steps=args.accimulation_steps)
+ accumulation_steps=args.accumulation_steps)
trainer.fit(logger=logger, log_interval=args.log_interval)
@@ -183,7 +183,7 @@ if __name__ == '__main__':
parser.add_argument('--lora_rank', type=int, default=0, help="low-rank adaptation matrices rank")
parser.add_argument('--log_interval', type=int, default=100, help="how many steps to log")
parser.add_argument('--lr', type=float, default=5e-6)
- parser.add_argument('--accimulation_steps', type=int, default=8)
+ parser.add_argument('--accumulation_steps', type=int, default=8)
parser.add_argument('--enable_peft_lora', action='store_true', default=False)
parser.add_argument("--is_short_text", action='store_true', default=False)
args = parser.parse_args()

@@ -159,7 +159,7 @@ def train(args):
train_dataloader=train_dataloader,
eval_dataloader=eval_dataloader,
max_epochs=args.max_epochs,
- accimulation_steps=args.accimulation_steps)
+ accumulation_steps=args.accumulation_steps)
trainer.fit(logger=logger, use_wandb=args.use_wandb)
@@ -189,7 +189,7 @@ if __name__ == '__main__':
parser.add_argument('--lora_rank', type=int, default=0, help="low-rank adaptation matrices rank")
parser.add_argument('--log_interval', type=int, default=100, help="how many steps to log")
parser.add_argument('--lr', type=float, default=5e-6)
- parser.add_argument('--accimulation_steps', type=int, default=8)
+ parser.add_argument('--accumulation_steps', type=int, default=8)
parser.add_argument('--use_wandb', default=False, action='store_true')
parser.add_argument('--grad_checkpoint', default=False, action='store_true')
args = parser.parse_args()

@@ -6,7 +6,7 @@ torchrun --standalone --nproc_per_node=4 train_sft.py \
--save_path /path/to/Coati-7B \
--dataset /path/to/data.json \
--batch_size 4 \
- --accimulation_steps 8 \
+ --accumulation_steps 8 \
--lr 2e-5 \
--max_datasets_size 512 \
--max_epochs 1 \