mirror of https://github.com/hpcaitech/ColossalAI
[autochunk] support complete benchmark (#3121)
* refactor memory code
* don't log free var memory
* add memory align
* update chunk target
* update setting for new memory
* finish test
* update tracer
* fix typo
* update test
* add unet test
* add bench
* update bench
* update bench
* init
* support vit
* move to cpu
* add cpu benchmark

branch: pull/3128/head
parent 68577fbc43
commit 30dd13c450
@@ -23,7 +23,7 @@ def _benchmark_evoformer_stack_gm(
     get_data: Any,
 ) -> None:
     # build model and input
-    model = get_model()
+    model = get_model().cpu().eval()
     meta_args, concrete_args = get_data(*data_args)
     if concrete_args is None:
         concrete_args = []
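The first hunk keeps the freshly built model on the CPU and switches it to inference mode before the graph is traced. A minimal sketch of the pattern, with a stand-in get_model (the real benchmark builds an Evoformer stack):

import torch

# Stand-in for the benchmark's get_model(); any nn.Module illustrates the point.
def get_model() -> torch.nn.Module:
    return torch.nn.Sequential(torch.nn.Linear(16, 16), torch.nn.Dropout(0.1))

# .eval() freezes dropout and batch-norm behavior for reproducible runs;
# .cpu() keeps graph construction off the GPU, matching the new CPU benchmark mode.
model = get_model().cpu().eval()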
@@ -35,7 +35,7 @@ def _benchmark_evoformer_stack_gm(
         concrete_args={k: v for k, v in concrete_args},
     )
     interp = MetaInfoProp(meta_graph)
-    meta_tensors = [MetaTensor(i[1], fake_device="cuda:0") for i in meta_args] + [i[1] for i in concrete_args]
+    meta_tensors = [MetaTensor(i[1], fake_device="cpu") for i in meta_args] + [i[1] for i in concrete_args]
     interp.propagate(*meta_tensors)
     codegen = AutoChunkCodeGen(
         meta_graph,
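Switching fake_device from "cuda:0" to "cpu" makes the shape and memory propagation pretend to run on the CPU, consistent with the model move above. A short sketch of what a MetaTensor provides; the import path is an assumption and may differ between ColossalAI versions:

import torch
from colossalai.fx.profiler import MetaTensor  # import path assumed

# A MetaTensor wraps a tensor on PyTorch's "meta" device and records a
# fake_device, so MetaInfoProp can propagate shapes and memory statistics
# through the graph without allocating real storage on that device.
t = MetaTensor(torch.empty(2, 64, device="meta"), fake_device="cpu")
print(t.shape)  # torch.Size([2, 64]); no 2x64 buffer actually exists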
@@ -35,10 +35,9 @@ def _benchmark_autochunk_unet_gm(
         meta_args={k: v.to(torch.device("meta")) for k, v in meta_args},
         concrete_args={k: v for k, v in concrete_args},
     )
-    model = model.cuda().eval()
     interp = MetaInfoProp(meta_graph)
     meta_tensors = [i[1] for i in meta_args] + [i[1] for i in concrete_args]
-    meta_tensors = [MetaTensor(i, fake_device="cuda:0") if isinstance(i, torch.Tensor) else i for i in meta_tensors]
+    meta_tensors = [MetaTensor(i, fake_device="cpu") if isinstance(i, torch.Tensor) else i for i in meta_tensors]
     interp.propagate(*meta_tensors)
     codegen = AutoChunkCodeGen(
         meta_graph,
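Both hunks stop mid-call at AutoChunkCodeGen(meta_graph, ...). For context, a sketch of how such a codegen is typically wired into the traced module, assuming AutoChunkCodeGen subclasses torch.fx's CodeGen and accepts a max_memory budget (both assumptions; check the benchmark source):

from torch.fx.graph_module import GraphModule

# Continues from the hunk above: meta_graph and model come from the benchmark.
codegen = AutoChunkCodeGen(meta_graph, max_memory=max_memory)  # budget assumed in MB
graph = meta_graph.graph
graph.set_codegen(codegen)  # emit chunked forward code instead of the default
gm = GraphModule(model, graph)
gm.recompile()  # gm.forward now computes its activations in memory-bounded chunks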
@@ -142,6 +141,7 @@ if __name__ == "__main__":
         port=free_port(),
         backend="nccl",
     )
     benchmark_autochunk_unet(batch=1, height=224 * 2, width=224 * 2)
+    benchmark_autochunk_unet(batch=1, height=224 * 3, width=224 * 3)
     benchmark_autochunk_unet(batch=1, height=224 * 4, width=224 * 4)
     benchmark_autochunk_unet(batch=1, height=224 * 5, width=224 * 5)
     benchmark_autochunk_unet(batch=1, height=224 * 6, width=224 * 6)
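The __main__ driver now sweeps square UNet inputs over every multiple of the 224-pixel base size from 2 to 6; the same sweep written as a loop:

# Equivalent loop form: activation memory grows roughly quadratically with the
# side length, which is exactly the pressure autochunk is meant to absorb.
for scale in (2, 3, 4, 5, 6):
    benchmark_autochunk_unet(batch=1, height=224 * scale, width=224 * scale)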
@@ -22,7 +22,7 @@ def _benchmark_autochunk_gpt_gm(
     data: tuple,
     max_memory: int = None,
 ) -> None:
-    model = model.cuda().eval()
+    model = model.eval().cpu()
 
     # build model and input
     meta_args, concrete_args, sequence = data
@@ -37,7 +37,7 @@ def _benchmark_autochunk_gpt_gm(
     )
     interp = MetaInfoProp(meta_graph)
     meta_tensors = [meta_args[i] if i in meta_args else concrete_args[i] for i in sequence]
-    meta_tensors = [MetaTensor(i, fake_device="cuda:0") if isinstance(i, torch.Tensor) else i for i in meta_tensors]
+    meta_tensors = [MetaTensor(i, fake_device="cpu") if isinstance(i, torch.Tensor) else i for i in meta_tensors]
     interp.propagate(*meta_tensors)
     codegen = AutoChunkCodeGen(
         meta_graph,
@@ -58,7 +58,7 @@ def _benchmark_autochunk_gpt_gm(
     # init inputs
     inputs = [meta_args[i] if i in meta_args else concrete_args[i] for i in sequence]
     inputs = [i.cuda() if isinstance(i, torch.Tensor) else i for i in inputs]
-    model.cuda().eval()
+    model.cuda()
 
     # bench
     para_mem = float(parameter_size(model)) / 1024**2 * 6
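Here model.cuda() drops the redundant .eval() (already applied on the CPU) and moves the model to the GPU only for the timed run, after codegen has finished. parameter_size is the benchmark's own helper; a plausible stand-in, not the repository's verbatim code, sums raw parameter bytes, which / 1024**2 converts to MB before the benchmark's x6 scaling:

import torch

# Hypothetical stand-in for the benchmark's parameter_size helper:
# total parameter storage in bytes.
def parameter_size(model: torch.nn.Module) -> int:
    return sum(p.numel() * p.element_size() for p in model.parameters())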