[example] add profile util for llama

Author: hxwang
Branch: pull/5751/head
Commit: 63c057cd8e

Files changed:
  1. examples/language/llama/benchmark.py (55 lines changed)
  2. examples/language/performance_evaluator.py (22 lines changed)
examples/language/llama/benchmark.py

@@ -1,11 +1,12 @@
 import argparse
 import resource
+import time
 from contextlib import nullcontext

 import torch
 from data_utils import RandomDataset
 from model_utils import format_numel_str, get_model_numel
-from performance_evaluator import PerformanceEvaluator
+from performance_evaluator import PerformanceEvaluator, get_profile_context
 from torch.distributed.fsdp.fully_sharded_data_parallel import CPUOffload, MixedPrecision
 from tqdm import tqdm
 from transformers import AutoConfig, AutoModelForCausalLM
@@ -76,6 +77,7 @@ def main():
     parser.add_argument("--mbs", type=int, default=1, help="Micro batch size of pipeline parallel")
     parser.add_argument("--zero", type=int, default=0, help="Zero Stage when hybrid plugin is enabled")
     parser.add_argument("--custom-ckpt", action="store_true", help="Customize checkpoint", default=False)
+    parser.add_argument("--profile", action="store_true", help="Enable profiling", default=False)
     args = parser.parse_args()

     colossalai.launch_from_torch()
@@ -110,6 +112,7 @@ def main():
             extra_dp_size=args.extra_dp,
             enable_fused_normalization=torch.cuda.is_available(),
             enable_flash_attention=args.xformers,
+            max_prefetch=10,
         )
     elif args.plugin == "gemini_auto":
         plugin = GeminiPlugin(
@@ -246,25 +249,37 @@ def main():
         f"Booster init max CPU memory: {resource.getrusage(resource.RUSAGE_SELF).ru_maxrss/1024:.2f} MB"
     )

-    if isinstance(plugin, HybridParallelPlugin) and args.pp > 1:
-        data_iter = iter(dataloader)
-        for step in tqdm(range(len(dataloader)), desc="Step", disable=not coordinator.is_master()):
-            performance_evaluator.on_step_start(step)
-            booster.execute_pipeline(
-                data_iter, model, criterion=lambda outputs, inputs: outputs[0], optimizer=optimizer, return_loss=False
-            )
-            optimizer.step()
-            optimizer.zero_grad()
-            performance_evaluator.on_step_end(input_ids=torch.empty(args.batch_size, args.max_length))
-    else:
-        for step, batch in enumerate(tqdm(dataloader, desc="Step", disable=not coordinator.is_master())):
-            performance_evaluator.on_step_start(step)
-            outputs = model(**batch)
-            loss = outputs[0]
-            booster.backward(loss, optimizer)
-            optimizer.step()
-            optimizer.zero_grad()
-            performance_evaluator.on_step_end(**batch)
+    with get_profile_context(
+        args.profile,
+        1,
+        len(dataloader) - 1,
+        save_dir=f"profile/{time.strftime('%H:%M', time.localtime())}-{args.plugin}-llama-{args.config}",
+    ) as prof:
+        if isinstance(plugin, HybridParallelPlugin) and args.pp > 1:
+            data_iter = iter(dataloader)
+            for step in tqdm(range(len(dataloader)), desc="Step", disable=not coordinator.is_master()):
+                performance_evaluator.on_step_start(step)
+                booster.execute_pipeline(
+                    data_iter,
+                    model,
+                    criterion=lambda outputs, inputs: outputs[0],
+                    optimizer=optimizer,
+                    return_loss=False,
+                )
+                optimizer.step()
+                optimizer.zero_grad()
+                performance_evaluator.on_step_end(input_ids=torch.empty(args.batch_size, args.max_length))
+                prof.step()
+        else:
+            for step, batch in enumerate(tqdm(dataloader, desc="Step", disable=not coordinator.is_master())):
+                performance_evaluator.on_step_start(step)
+                outputs = model(**batch)
+                loss = outputs[0]
+                booster.backward(loss, optimizer)
+                optimizer.step()
+                optimizer.zero_grad()
+                performance_evaluator.on_step_end(**batch)
+                prof.step()

     performance_evaluator.on_fit_end()
     coordinator.print_on_master(f"Max CUDA memory usage: {get_accelerator().max_memory_allocated()/1024**2:.2f} MB")

examples/language/performance_evaluator.py

@@ -4,6 +4,7 @@ from typing import Optional
 import torch
 import torch.distributed as dist
 from torch import Tensor
+from torch.profiler import ProfilerActivity, profile, schedule, tensorboard_trace_handler

 from colossalai.accelerator import get_accelerator
 from colossalai.cluster import DistCoordinator
@@ -27,6 +28,27 @@ def all_reduce_mean(x: float, world_size: int) -> float:
     return tensor.item()


+def get_profile_context(enable_flag, warmup_steps, active_steps, save_dir):
+    class DummyProfiler:
+        def __init__(self):
+            self.step_number = 0
+
+        def step(self):
+            self.step_number += 1
+
+    if enable_flag:
+        return profile(
+            activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
+            schedule=schedule(wait=0, warmup=warmup_steps, active=active_steps),
+            on_trace_ready=tensorboard_trace_handler(save_dir),
+            # record_shapes=True,
+            # profile_memory=True,
+            with_stack=True,
+        )
+    else:
+        return DummyProfiler()
+
+
 class Timer:
     def __init__(self) -> None:
         self.start_time: Optional[float] = None
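Note that the DummyProfiler returned on the disabled path only implements step(), while benchmark.py enters the helper's return value with "with ... as prof:"; presumably the dummy also needs the context-manager protocol for runs without --profile. A minimal sketch of such a variant (an assumption, not part of this commit):

    class DummyProfiler:
        """No-op stand-in for torch.profiler.profile when profiling is disabled."""

        def __init__(self):
            self.step_number = 0

        def __enter__(self):
            # Allow `with get_profile_context(False, ...) as prof:` to work.
            return self

        def __exit__(self, exc_type, exc_value, traceback):
            # Returning None propagates any exception raised inside the with-block.
            return None

        def step(self):
            self.step_number += 1

With that addition the training loop can call prof.step() unconditionally, whether or not profiling is enabled.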
