import time
from typing import Any

import torch
import torch.fx

import colossalai
from colossalai.autochunk.autochunk_codegen import AUTOCHUNK_AVAILABLE
from colossalai.fx.graph_module import ColoGraphModule
from colossalai.fx.passes.meta_info_prop import MetaInfoProp
from colossalai.testing import free_port

if AUTOCHUNK_AVAILABLE:
    from colossalai.autochunk.autochunk_codegen import AutoChunkCodeGen
    from colossalai.fx.profiler import MetaTensor
    from colossalai.fx.tracer.experimental import ColoTracer, symbolic_trace


def _benchmark_evoformer_stack_gm(
    data_args: tuple,
    max_memory: int,
    get_model: Any,
    get_data: Any,
) -> None:
    """Benchmark the autochunk-transformed GraphModule under a given memory budget (MB)."""
    # build model and input
    model = get_model().cpu().eval()
    meta_args, concrete_args = get_data(*data_args)
    if concrete_args is None:
        concrete_args = []

    # trace the meta graph and setup codegen
    meta_graph = symbolic_trace(
        model,
        meta_args={k: v.to(torch.device("meta")) for k, v in meta_args},
        concrete_args={k: v for k, v in concrete_args},
    )
    interp = MetaInfoProp(meta_graph)
    meta_tensors = [MetaTensor(i[1], fake_device="cpu") for i in meta_args] + [i[1] for i in concrete_args]
    interp.propagate(*meta_tensors)
    codegen = AutoChunkCodeGen(
        meta_graph,
        max_memory=max_memory,
    )

    # trace and recompile
    # MetaInfoProp requires symbolic_trace but CodeGen requires ColoTracer
    graph = ColoTracer().trace(
        model,
        meta_args={k: v.to(torch.device("meta")) for k, v in meta_args},
        concrete_args={k: v for k, v in concrete_args},
    )
    graph.set_codegen(codegen)
    gm = ColoGraphModule(model, graph, ckpt_codegen=False)
    gm.recompile()

    # init inputs
    inputs = [i[1] for i in meta_args] + [i[1] for i in concrete_args]
    inputs = [i.cuda() if isinstance(i, torch.Tensor) else i for i in inputs]
    model.cuda()

    # bench
    mem = _benchmark_memory(gm, inputs)
    speed = _benchmark_speed(gm, inputs)
    print("evoformer stack gm, mem: %.2fMB, time: %.4fs" % (mem, speed))


def _benchmark_evoformer_stack_origin(
    data_args: tuple,
    get_model: Any,
    get_data: Any,
) -> float:
    """Benchmark the unmodified model and return its peak forward memory (MB)."""
    # build model and input
    model = get_model()
    meta_args, concrete_args = get_data(*data_args)
    if concrete_args is None:
        concrete_args = []

    # init inputs
    inputs = [i[1] for i in meta_args] + [i[1] for i in concrete_args]
    inputs = [i.cuda() if isinstance(i, torch.Tensor) else i for i in inputs]
    model.cuda()

    # bench
    mem = _benchmark_memory(model, inputs)
    speed = _benchmark_speed(model, inputs)
    print("evoformer stack origin, mem: %.2fMB, time: %.4fs" % (mem, speed))
    return mem


def _benchmark_memory(model, inputs):
    """Extra peak memory (MB) allocated during one forward pass."""
    with torch.no_grad():
        torch.cuda.reset_peak_memory_stats()
        now_mem = torch.cuda.memory_allocated() / 1024**2
        model(*inputs)
        new_max_mem = torch.cuda.max_memory_allocated() / 1024**2
    return new_max_mem - now_mem


def _benchmark_speed(model, inputs, loop=5):
    """Average forward latency (s) over `loop` timed iterations, after warm-up runs."""
    with torch.no_grad():
        for _ in range(loop // 2 + 1):
            model(*inputs)
        torch.cuda.synchronize()
        time1 = time.time()
        for _ in range(loop):
            model(*inputs)
        torch.cuda.synchronize()
        time2 = time.time()
    return (time2 - time1) / loop


def benchmark_evoformer_stack(data_args):
    from test_autochunk_evoformer_stack import get_data, get_model

    print("\nmsa len: %d, pair len: %d" % (data_args[0], data_args[1]))
    max_mem = _benchmark_evoformer_stack_origin(data_args, get_model, get_data)
    for ratio in [0.5, 0.4, 0.3, 0.2, 0.1]:
        try:
            _benchmark_evoformer_stack_gm(data_args, max_mem * ratio, get_model, get_data)
        except RuntimeError as e:
            if e.args[0] == "Search failed. Try a larger memory threshold.":
                break
        except Exception as e:
            raise e
    _benchmark_evoformer_stack_gm(data_args, None, get_model, get_data)


if __name__ == "__main__":
    # launch colossalai
    colossalai.launch(
        config={},
        rank=0,
        world_size=1,
        host="localhost",
        port=free_port(),
        backend="nccl",
    )
    benchmark_evoformer_stack((256, 256))
    benchmark_evoformer_stack((256, 512))
    benchmark_evoformer_stack((256, 1024))
    benchmark_evoformer_stack((256, 1280))
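
# Usage notes (assumptions, not part of the original script):
# - This script appears to be meant to run standalone on a single CUDA GPU; the
#   "nccl" backend used in colossalai.launch requires a visible GPU.
# - `test_autochunk_evoformer_stack.py` (providing get_model/get_data) is assumed
#   to be importable from the working directory, e.g.:
#       python benchmark_autochunk_evoformer_stack.py   # hypothetical filename
# - The memory budgets passed to AutoChunkCodeGen are fractions (0.5 .. 0.1) of the
#   origin model's measured peak forward memory; the final call passes None, which
#   presumably lets the autochunk search run without an explicit memory cap.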