import time from typing import Any, Dict, List import torch import torch.fx import colossalai from colossalai.autochunk.autochunk_codegen import AUTOCHUNK_AVAILABLE from colossalai.fx.graph_module import ColoGraphModule from colossalai.fx.passes.meta_info_prop import MetaInfoProp from colossalai.fx.profiler import parameter_size from colossalai.utils import free_port if AUTOCHUNK_AVAILABLE: from colossalai.autochunk.autochunk_codegen import AutoChunkCodeGen from colossalai.fx.profiler import MetaTensor from colossalai.fx.tracer.experimental import ColoTracer, symbolic_trace def _benchmark_autochunk_unet_gm( model: Any, data: tuple, max_memory: int = None, ) -> None: model = model.cuda().eval() # build model and input meta_args, concrete_args = data if concrete_args is None: concrete_args = {} # trace the meta graph and setup codegen meta_graph = symbolic_trace( model, meta_args={k: v.to(torch.device("meta")) for k, v in meta_args}, concrete_args={k: v for k, v in concrete_args}, ) model = model.cuda().eval() interp = MetaInfoProp(meta_graph) meta_tensors = [i[1] for i in meta_args] + [i[1] for i in concrete_args] meta_tensors = [MetaTensor(i, fake_device="cuda:0") if isinstance(i, torch.Tensor) else i for i in meta_tensors] interp.propagate(*meta_tensors) codegen = AutoChunkCodeGen( meta_graph, max_memory=max_memory, ) # trace and recompile # MetaInfoProp requires symbolic_trace but CodeGen requires ColoTracer graph = ColoTracer().trace( model.cuda().eval(), meta_args={k: v.to(torch.device("meta")) for k, v in meta_args}, concrete_args={k: v for k, v in concrete_args}, ) graph.set_codegen(codegen) gm = ColoGraphModule(model, graph, ckpt_codegen=False) gm.recompile() # init inputs inputs = [i[1] for i in meta_args] + [i[1] for i in concrete_args] inputs = [i.cuda() if isinstance(i, torch.Tensor) else i for i in inputs] model.cuda().eval() # bench para_mem = float(parameter_size(model)) / 1024**2 act_mem = _benchmark_memory(gm, inputs) speed = _benchmark_speed(gm, inputs) print("unet autochunk, time: %.4fs, act mem: %.2fMB, para mem: %.2fMB, all mem: %.2fMB" % (speed, act_mem, para_mem, act_mem + para_mem)) def _benchmark_autochunk_unet_origin( model: Any, data: tuple, ) -> None: # build model and input meta_args, concrete_args = data if concrete_args is None: concrete_args = {} # init inputs inputs = [i[1] for i in meta_args] + [i[1] for i in concrete_args] inputs = [i.cuda() if isinstance(i, torch.Tensor) else i for i in inputs] model.cuda().eval() # bench para_mem = float(parameter_size(model)) / 1024**2 act_mem = _benchmark_memory(model, inputs) speed = _benchmark_speed(model, inputs) print("unet origin, time: %.4fs, act mem: %.2fMB, para mem: %.2fMB, all mem: %.2fMB" % (speed, act_mem, para_mem, act_mem + para_mem)) return act_mem def _benchmark_memory(model, inputs): with torch.no_grad(): torch.cuda.reset_peak_memory_stats() now_mem = float(torch.cuda.memory_allocated()) / 1024**2 model(*inputs) new_max_mem = float(torch.cuda.max_memory_allocated()) / 1024**2 return new_max_mem - now_mem def _benchmark_speed(model, inputs, loop=5): with torch.no_grad(): for _ in range(loop // 2 + 1): model(*inputs) torch.cuda.synchronize() time1 = time.time() for _ in range(loop): model(*inputs) torch.cuda.synchronize() time2 = time.time() return (time2 - time1) / loop def benchmark_autochunk_unet(batch=1, height=448, width=448): from test_autochunk_unet import UNet2DModel, get_data model = UNet2DModel() latent_shape = (batch, 3, height // 7, width // 7) print("\nbatch: %d, height: %d, width: %d" % (batch, height, width)) max_mem = _benchmark_autochunk_unet_origin(model, get_data(latent_shape)) for ratio in [0.5, 0.4, 0.3, 0.2]: try: _benchmark_autochunk_unet_gm(model, get_data(latent_shape), max_mem * ratio) except RuntimeError as e: if e.args[0] == 'Search failed. Try a larger memory threshold.': break except Exception as e: raise e _benchmark_autochunk_unet_gm(model, get_data(latent_shape), None) if __name__ == "__main__": # launch colossalai colossalai.launch( config={}, rank=0, world_size=1, host="localhost", port=free_port(), backend="nccl", ) benchmark_autochunk_unet(batch=1, height=224 * 2, width=224 * 2) benchmark_autochunk_unet(batch=1, height=224 * 3, width=224 * 3) benchmark_autochunk_unet(batch=1, height=224 * 4, width=224 * 4)