mirror of https://github.com/hpcaitech/ColossalAI
polish code
parent 7e2bd1e428
commit fad3b6d1a6
474 chunk_codegen.py
@@ -10,6 +10,13 @@ CODEGEN_AVAILABLE = True
__all__ = ['ChunkCodeGen']

def _delete_free_var_from_last_use(user_to_last_uses):
for key, value in user_to_last_uses.items():
for n in value:
if n.op == 'placeholder':
user_to_last_uses[key].remove(n)

class NodeIndexTracer(object):
def __init__(self, gm) -> None:
self.gm = gm

@@ -19,7 +26,7 @@ class NodeIndexTracer(object):
self.idx_view_list = []
self.idx_count = -1

def add_index(self):
def _add_index(self):
"""
Update the count and return it. To record the idx number.

@@ -29,7 +36,7 @@ class NodeIndexTracer(object):
self.idx_count += 1
return self.idx_count

def inherit_computation(self, node_from, node_to):
def _inherit_computation(self, node_from, node_to):
"""
Inherit computed dim from node_from to node_to.
If a dim in node_from is marked as computed and exists in node_to,

@@ -39,13 +46,13 @@ class NodeIndexTracer(object):
node_from (node): node to be inherited
node_to (node): new node to inherit
"""
_, compute_from = self.find_trace_from_node(node_from)
idx_to, compute_to = self.find_trace_from_node(node_to)
_, compute_from = self._find_trace_from_node(node_from)
idx_to, compute_to = self._find_trace_from_node(node_to)
for i in compute_from:
if i in idx_to and i not in compute_to:
compute_to.append(i)

def mark_idx_equal(self, idx1, idx2):
def _mark_idx_equal(self, idx1, idx2):
"""
Mark 2 index to be equal.

@@ -55,7 +62,7 @@ class NodeIndexTracer(object):
"""
self.idx_trace_equal.append((idx1, idx2))

def mark_computation(self, node, idx, dim):
def _mark_computation(self, node, idx, dim):
"""
Mark some dims of node as computed.

@@ -64,7 +71,7 @@ class NodeIndexTracer(object):
idx (int): node index
dim (list or int): dims to be marked as computed
"""
input_node_idx_trace = self.find_idx_trace_from_node(node)
input_node_idx_trace = self._find_idx_trace_from_node(node)
if isinstance(dim, int):
dim = [dim]
for d in dim:

@@ -72,7 +79,7 @@ class NodeIndexTracer(object):
if cur_idx not in self.idx_trace_list[idx]['compute']:
self.idx_trace_list[idx]['compute'].append(cur_idx)

def find_trace_from_node(self, node):
def _find_trace_from_node(self, node):
"""
Find node idx and compute trace by the node.
@@ -86,7 +93,7 @@ class NodeIndexTracer(object):
node_dict = self.idx_trace_list[node_idx]
return node_dict['idx'], node_dict['compute']

def find_idx_trace_from_node(self, node):
def _find_idx_trace_from_node(self, node):
"""
Find node idx trace by the node.

@@ -98,7 +105,7 @@ class NodeIndexTracer(object):
node_idx = _find_idx_by_name(node.name, self.nodes_list)
return self.idx_trace_list[node_idx]['idx']

def find_compute_trace_from_node(self, node):
def _find_compute_trace_from_node(self, node):
"""
Find node compute trace by the node.

@@ -110,7 +117,7 @@ class NodeIndexTracer(object):
node_idx = _find_idx_by_name(node.name, self.nodes_list)
return self.idx_trace_list[node_idx]['compute']

def assign_index_as_input(self, node, node_idx):
def _assign_index_as_input(self, node, node_idx):
"""
Assign node's trace as its input node.

@@ -124,7 +131,7 @@ class NodeIndexTracer(object):
new_idx_trace = copy.deepcopy(input_node_idx_trace)
self.idx_trace_list[node_idx]['idx'] = new_idx_trace

def assign_all_index(self, node, node_idx):
def _assign_all_index(self, node, node_idx):
"""
Add new index for all node's dims.

@@ -135,10 +142,10 @@ class NodeIndexTracer(object):
shape = node.meta['tensor_meta'].shape
new_trace = []
for _ in shape:
new_trace.append(self.add_index())
new_trace.append(self._add_index())
self.idx_trace_list[node_idx]['idx'] = new_trace

def assign_transpose_index(self, node, node_idx):
def _assign_transpose_index(self, node, node_idx):
"""
Assign index for transpose op.
1. swap input's dim according to transpose args

@@ -149,16 +156,16 @@ class NodeIndexTracer(object):
node_idx (int)
"""
tranpose_dim = node.args[1:]
input_node_idx_trace = self.find_idx_trace_from_node(node.args[0])
input_node_idx_trace = self._find_idx_trace_from_node(node.args[0])

new_idx_trace = copy.deepcopy(input_node_idx_trace)
new_idx_trace[tranpose_dim[0]] = input_node_idx_trace[tranpose_dim[1]]
new_idx_trace[tranpose_dim[1]] = input_node_idx_trace[tranpose_dim[0]]

self.idx_trace_list[node_idx]['idx'] = new_idx_trace
self.inherit_computation(node.args[0], node)
self._inherit_computation(node.args[0], node)

def assign_permute_index(self, node, node_idx):
def _assign_permute_index(self, node, node_idx):
"""
Assign index for permute op.
1. swap input's dim according to permute args

@@ -169,16 +176,16 @@ class NodeIndexTracer(object):
node_idx (int)
"""
permute_dim = node.args[1:]
input_node_idx_trace = self.find_idx_trace_from_node(node.args[0])
input_node_idx_trace = self._find_idx_trace_from_node(node.args[0])

new_idx_trace = copy.deepcopy(input_node_idx_trace)
for idx, d in enumerate(permute_dim):
new_idx_trace[idx] = input_node_idx_trace[d]

self.idx_trace_list[node_idx]['idx'] = new_idx_trace
self.inherit_computation(node.args[0], node)
self._inherit_computation(node.args[0], node)

def assign_linear_index(self, node, node_idx):
def _assign_linear_index(self, node, node_idx):
"""
Assign index for linear op.
1. copy trace from input node and change last index accroding to weight
@@ -190,22 +197,22 @@ class NodeIndexTracer(object):
node_idx (int)
"""
input_node, weight, bias = node.args
input_node_idx_trace = self.find_idx_trace_from_node(input_node)
weight_idx_trace = self.find_idx_trace_from_node(weight)
input_node_idx_trace = self._find_idx_trace_from_node(input_node)
weight_idx_trace = self._find_idx_trace_from_node(weight)

new_idx_trace = copy.deepcopy(input_node_idx_trace)
new_idx_trace[-1] = weight_idx_trace[1]
self.idx_trace_list[node_idx]['idx'] = new_idx_trace

self.inherit_computation(input_node, node)
self.mark_computation(node, node_idx, [-1])
self.mark_idx_equal(input_node_idx_trace[-1], weight_idx_trace[0])
self._inherit_computation(input_node, node)
self._mark_computation(node, node_idx, [-1])
self._mark_idx_equal(input_node_idx_trace[-1], weight_idx_trace[0])

if bias:
bias_idx_trace = self.find_idx_trace_from_node(bias)
self.mark_idx_equal(input_node_idx_trace[-1], bias_idx_trace[0])
bias_idx_trace = self._find_idx_trace_from_node(bias)
self._mark_idx_equal(input_node_idx_trace[-1], bias_idx_trace[0])

def assign_matmul_index(self, node, node_idx):
def _assign_matmul_index(self, node, node_idx):
"""
Assign index for matmul op.
1. copy trace from matmul_left and change last index accroding to matmul_right. (assert they have same length)

@@ -217,20 +224,20 @@ class NodeIndexTracer(object):
node_idx (int)
"""
matmul_left, matmul_right = node.args
matmul_left_idx_trace = self.find_idx_trace_from_node(matmul_left)
matmul_right_idx_trace = self.find_idx_trace_from_node(matmul_right)
matmul_left_idx_trace = self._find_idx_trace_from_node(matmul_left)
matmul_right_idx_trace = self._find_idx_trace_from_node(matmul_right)

assert(len(matmul_left_idx_trace) == len(matmul_right_idx_trace))
new_idx_trace = copy.deepcopy(matmul_left_idx_trace)
new_idx_trace[-1] = matmul_right_idx_trace[-1]
self.idx_trace_list[node_idx]['idx'] = new_idx_trace

self.inherit_computation(matmul_left, node)
self.inherit_computation(matmul_right, node)
self.mark_computation(node, node_idx, [-1])
self.mark_idx_equal(matmul_left_idx_trace[-1], matmul_right_idx_trace[-2])
self._inherit_computation(matmul_left, node)
self._inherit_computation(matmul_right, node)
self._mark_computation(node, node_idx, [-1])
self._mark_idx_equal(matmul_left_idx_trace[-1], matmul_right_idx_trace[-2])

def assign_layernorm_index(self, node, idx):
def _assign_layernorm_index(self, node, idx):
"""
Assign index for layernorm op.
1. assign index as input node

@@ -240,11 +247,11 @@ class NodeIndexTracer(object):
node (node)
node_idx (int)
"""
self.assign_index_as_input(node, idx)
self.inherit_computation(node.args[0], node)
self.mark_computation(node, idx, [-1, -2])
self._assign_index_as_input(node, idx)
self._inherit_computation(node.args[0], node)
self._mark_computation(node, idx, [-1, -2])

def assign_elementwise_index(self, node, idx):
def _assign_elementwise_index(self, node, idx):
"""
Assign index for element-wise op (eg. relu sigmoid add mul).
1. assign index as input node

@@ -254,12 +261,12 @@ class NodeIndexTracer(object):
node (node)
node_idx (int)
"""
self.assign_index_as_input(node, idx)
self._assign_index_as_input(node, idx)
for node_in in node.args:
if type(node_in) not in (int, float):
self.inherit_computation(node_in, node)
self._inherit_computation(node_in, node)

def assign_softmax_index(self, node, idx):
def _assign_softmax_index(self, node, idx):
"""
Assign index for softmax op.
1. assign index as input node

@@ -269,11 +276,11 @@ class NodeIndexTracer(object):
node (node)
node_idx (int)
"""
self.assign_index_as_input(node, idx)
self.inherit_computation(node.args[0], node)
self.mark_computation(node, idx, [node.kwargs['dim']])
self._assign_index_as_input(node, idx)
self._inherit_computation(node.args[0], node)
self._mark_computation(node, idx, [node.kwargs['dim']])

def assign_view_reshape_index(self, node, node_idx):
def _assign_view_reshape_index(self, node, node_idx):
"""
Assign index for view and reshape op.
1. get origin shape and target shape by meta info.
@@ -325,22 +332,22 @@ class NodeIndexTracer(object):
raise NotImplementedError("shape" + str(origin_shape) + 'and' + str(target_shape) + "view not implemented")

# get new index
origin_trace = self.find_idx_trace_from_node(origin_node)
origin_trace = self._find_idx_trace_from_node(origin_node)
new_trace = copy.deepcopy(origin_trace)
dim_from.reverse()
for i in dim_from:
new_trace.pop(i)
for i in dim_to:
new_trace.insert(i, self.add_index())
new_trace.insert(i, self._add_index())
self.idx_trace_list[node_idx]['idx'] = new_trace

# inherit computation
self.inherit_computation(origin_node, node)
compute_log = self.find_compute_trace_from_node(origin_node)
self._inherit_computation(origin_node, node)
compute_log = self._find_compute_trace_from_node(origin_node)
for i in dim_from:
if origin_trace[i] in compute_log:
for j in dim_to:
self.mark_computation(node, node_idx, [j])
self._mark_computation(node, node_idx, [j])
break

# log view, not used now

@@ -353,25 +360,25 @@ class NodeIndexTracer(object):
def trace_node_idx(self):
for idx, node in enumerate(self.nodes_list):
if node.op == 'placeholder':
self.assign_all_index(node, idx)
self._assign_all_index(node, idx)
elif node.op == 'call_method':
if 'transpose' in node.name:
self.assign_transpose_index(node, idx)
self._assign_transpose_index(node, idx)
elif 'permute' in node.name:
self.assign_permute_index(node, idx)
self._assign_permute_index(node, idx)
elif 'view' in node.name or 'reshape' in node.name:
self.assign_view_reshape_index(node, idx)
self._assign_view_reshape_index(node, idx)
else:
raise NotImplementedError(node.name, "method not implemented yet!")
elif node.op == 'call_function':
if 'linear' in node.name:
self.assign_linear_index(node, idx)
self._assign_linear_index(node, idx)
elif 'matmul' in node.name:
self.assign_matmul_index(node, idx)
self._assign_matmul_index(node, idx)
elif 'softmax' in node.name:
self.assign_softmax_index(node, idx)
self._assign_softmax_index(node, idx)
elif any(n in node.name for n in ['mul', 'add', 'sigmoid', 'relu']):
self.assign_elementwise_index(node, idx)
self._assign_elementwise_index(node, idx)
elif 'getattr' in node.name:
continue # get attr like shape
elif 'getitem' in node.name:
@@ -380,206 +387,198 @@ class NodeIndexTracer(object):
raise NotImplementedError(node.name, "function not implemented yet!")
elif node.op == 'call_module':
if any(n in node.name for n in ['layernorm', 'norm']):
self.assign_layernorm_index(node, idx)
self._assign_layernorm_index(node, idx)
else:
raise NotImplementedError(node.name, "module not implemented yet!")
elif node.op == 'get_attr':
self.assign_all_index(node, idx) # get param
self._assign_all_index(node, idx) # get param
elif node.op == 'output':
continue
else:
raise NotImplementedError(node.op, "op not implemented yet!")

def _get_meta_node_size(x):
x = x.meta['tensor_meta']
x = x.numel * torch.tensor([], dtype=x.dtype).element_size()
return x
class MemoryEstimator(object):
def __init__(self) -> None:
pass

def _get_meta_node_size(self, x):
x = x.meta['tensor_meta']
x = x.numel * torch.tensor([], dtype=x.dtype).element_size()
return x

def _get_output_node_size(n):
fwd_out = {x.uuid: x for x in n.meta["fwd_out"] if isinstance(x, torch.Tensor) and hasattr(x, 'uuid')}
return activation_size(fwd_out)
def _get_output_node_size(self, n):
fwd_out = {x.uuid: x for x in n.meta["fwd_out"] if isinstance(x, torch.Tensor) and hasattr(x, 'uuid')}
return activation_size(fwd_out)

def _get_delete_node_size(user, user_to_last_uses):
if user.op in ('placeholder', 'output'):
def _get_delete_node_size(self, user, user_to_last_uses):
if user.op in ('placeholder', 'output'):
return 0
nodes_to_delete = user_to_last_uses.get(user, [])
if len(nodes_to_delete):
delete_size = sum([self._get_output_node_size(i) for i in nodes_to_delete])
return delete_size
return 0
nodes_to_delete = user_to_last_uses.get(user, [])
if len(nodes_to_delete):
delete_size = sum([_get_output_node_size(i) for i in nodes_to_delete])
return delete_size
return 0

def _get_last_usr(self, nodes):
node_to_last_use: Dict[Node, Node] = {}
user_to_last_uses: Dict[Node, List[Node]] = {}

def _get_last_usr(nodes):
node_to_last_use: Dict[Node, Node] = {}
user_to_last_uses: Dict[Node, List[Node]] = {}
def register_last_uses(n: Node, user: Node):
if n not in node_to_last_use:
node_to_last_use[n] = user
user_to_last_uses.setdefault(user, []).append(n)

def register_last_uses(n: Node, user: Node):
if n not in node_to_last_use:
node_to_last_use[n] = user
user_to_last_uses.setdefault(user, []).append(n)
for node in reversed(nodes):
map_arg(node.args, lambda n: register_last_uses(n, node))
map_arg(node.kwargs, lambda n: register_last_uses(n, node))
return user_to_last_uses

for node in reversed(nodes):
map_arg(node.args, lambda n: register_last_uses(n, node))
map_arg(node.kwargs, lambda n: register_last_uses(n, node))
return user_to_last_uses
def _get_contiguous_memory(self, node, not_contiguous_list, delete=False):
mem = 0
not_contiguous_ops = ['transpose', 'permute']

if node.op == 'call_function' and 'matmul' in node.name:
for n in node.args:
if n in not_contiguous_list:
# matmul won't change origin tensor, but create a tmp copy
mem += self._get_output_node_size(n)
elif node.op == 'call_module':
for n in node.args:
if n in not_contiguous_list:
# module will just make origin tensor to contiguous
if delete:
not_contiguous_list.remove(n)
elif node.op == 'call_method' and any(i in node.name for i in not_contiguous_ops):
if node not in not_contiguous_list:
not_contiguous_list.append(node)
elif any(i in node.args for i in not_contiguous_list):
if node not in not_contiguous_list:
not_contiguous_list.append(node)

def _delete_free_var_from_last_use(user_to_last_uses):
for key, value in user_to_last_uses.items():
for n in value:
if n.op == 'placeholder':
user_to_last_uses[key].remove(n)
return mem

def _get_contiguous_memory(node, not_contiguous_list, delete=False):
mem = 0
not_contiguous_ops = ['transpose', 'permute']

if node.op == 'call_function' and 'matmul' in node.name:
for n in node.args:
if n in not_contiguous_list:
# matmul won't change origin tensor, but create a tmp copy
mem += _get_output_node_size(n)
elif node.op == 'call_module':
for n in node.args:
if n in not_contiguous_list:
# module will just make origin tensor to contiguous
if delete:
not_contiguous_list.remove(n)
elif node.op == 'call_method' and any(i in node.name for i in not_contiguous_ops):
if node not in not_contiguous_list:
not_contiguous_list.append(node)
elif any(i in node.args for i in not_contiguous_list):
if node not in not_contiguous_list:
not_contiguous_list.append(node)

return mem
def _estimate_inference_mem(gm: torch.fx.GraphModule):
act_memory = 0.0
act_memory_peak_log = []
act_memory_after_node_log = []
not_contiguous_list = []
user_to_last_uses = _get_last_usr(list(gm.graph.nodes))
_delete_free_var_from_last_use(user_to_last_uses)
for node in gm.graph.nodes:
# if node is placeholder, just add the size of the node
if node.op == 'placeholder':
act_memory += _get_meta_node_size(node) / (1024 ** 2)
act_memory_peak_log.append(act_memory)
act_memory_after_node_log.append(act_memory)
# skip output
elif node.op == 'output':
continue
# node is an operation, calculate tmp, output node and delete node memory
else:
# forward memory
act_memory += _get_contiguous_memory(node, not_contiguous_list) / (1024 ** 2)
act_memory += _get_output_node_size(node) / (1024 ** 2)
# record max act memory
act_memory_peak_log.append(act_memory)
# delete useless memory
act_memory -= _get_delete_node_size(node, user_to_last_uses) / (1024 ** 2)
act_memory -= _get_contiguous_memory(node, not_contiguous_list, delete=True) / (1024 ** 2)
act_memory_after_node_log.append(act_memory)

print("no chunk")
_print_mem_log(act_memory_peak_log, list(gm.graph.nodes), "peak")
_print_mem_log(act_memory_after_node_log, list(gm.graph.nodes), "after")

param_memory = parameter_size(gm)
return act_memory + param_memory, param_memory

def _get_chunk_ratio(node, chunk_dim, chunk_size):
shape = node.meta['tensor_meta'].shape
chunk_ratio = float(chunk_size) / shape[chunk_dim]
return chunk_ratio

def _get_chunk_delete_node_size(user, user_to_last_uses, chunk_ratio, node_list, start_node, end_node):
if user.op in ('placeholder', 'output'):
return 0
nodes_to_delete = user_to_last_uses.get(user, [])
delete_size = 0
for n in nodes_to_delete:
node_idx = _find_idx_by_name(n.name, node_list)
if start_node <= node_idx < end_node:
delete_size += _get_output_node_size(n) * chunk_ratio
return delete_size

def _print_mem_log(log, nodes, title=None):
if title:
print(title)
for idx, (l, n) in enumerate(zip(log, nodes)):
print("%s:%.2f \t" % (n.name, l), end='')
if (idx + 1) % 3 == 0:
print("")
print("\n")

def _estimate_chunk_inference_mem(gm: torch.fx.GraphModule, start_nodes, end_nodes, chunk_dims, chunk_sizes):
act_memory = 0.0
act_memory_peak_log = []
act_memory_after_node_log = []
not_contiguous_list = []
user_to_last_uses = _get_last_usr(list(gm.graph.nodes))
_delete_free_var_from_last_use(user_to_last_uses)
within_chunk = False
region_idx = 0
chunk_ratio = 1 # use it to estimate chunk mem
node_list = list(gm.graph.nodes)

for idx, node in enumerate(node_list):
# if node in chunk start nodes, change chunk ratio and add chunk_tensor
if idx in start_nodes:
within_chunk = True
chunk_ratio = _get_chunk_ratio(node, chunk_dims[region_idx], chunk_sizes[region_idx])
act_memory += _get_output_node_size(node_list[end_nodes[region_idx]]) / (1024 ** 2)

# if node is placeholder, just add the size of the node
if node.op == 'placeholder':
act_memory += _get_meta_node_size(node) * chunk_ratio / (1024 ** 2)
act_memory_peak_log.append(act_memory)
# skip output
elif node.op == 'output':
continue
# node is an operation, calculate tmp, output node and delete node memory
else:
# forward memory
# TODO: permute will create a tmp copy if not contiguous
act_memory += _get_contiguous_memory(node, not_contiguous_list) * chunk_ratio / (1024 ** 2)
act_memory += _get_output_node_size(node) * chunk_ratio / (1024 ** 2)
# record max act memory
act_memory_peak_log.append(act_memory)
# delete useless memory
act_memory -= _get_contiguous_memory(node, not_contiguous_list, delete=True) * chunk_ratio / (1024 ** 2)
if within_chunk:
act_memory -= _get_chunk_delete_node_size(
node, user_to_last_uses, chunk_ratio, node_list,
start_nodes[region_idx], end_nodes[region_idx]) / (1024 ** 2)
def estimate_inference_mem(self, gm: torch.fx.GraphModule):
act_memory = 0.0
act_memory_peak_log = []
act_memory_after_node_log = []
not_contiguous_list = []
user_to_last_uses = self._get_last_usr(list(gm.graph.nodes))
_delete_free_var_from_last_use(user_to_last_uses)
for node in gm.graph.nodes:
# if node is placeholder, just add the size of the node
if node.op == 'placeholder':
act_memory += self._get_meta_node_size(node) / (1024 ** 2)
act_memory_peak_log.append(act_memory)
act_memory_after_node_log.append(act_memory)
# skip output
elif node.op == 'output':
continue
# node is an operation, calculate tmp, output node and delete node memory
else:
act_memory -= _get_delete_node_size(node, user_to_last_uses) / (1024 ** 2)

if idx in end_nodes:
act_memory -= _get_output_node_size(node) * chunk_ratio / (1024 ** 2)
within_chunk = False
chunk_ratio = 1
region_idx += 1
# forward memory
act_memory += self._get_contiguous_memory(node, not_contiguous_list) / (1024 ** 2)
act_memory += self._get_output_node_size(node) / (1024 ** 2)
# record max act memory
act_memory_peak_log.append(act_memory)
# delete useless memory
act_memory -= self._get_delete_node_size(node, user_to_last_uses) / (1024 ** 2)
act_memory -= self._get_contiguous_memory(node, not_contiguous_list, delete=True) / (1024 ** 2)
act_memory_after_node_log.append(act_memory)

print("no chunk")
self._print_mem_log(act_memory_peak_log, list(gm.graph.nodes), "peak")
self._print_mem_log(act_memory_after_node_log, list(gm.graph.nodes), "after")

act_memory_after_node_log.append(act_memory)
param_memory = parameter_size(gm)
return act_memory + param_memory, param_memory

print("chunk")
_print_mem_log(act_memory_peak_log, node_list, "peak")
_print_mem_log(act_memory_after_node_log, node_list, "after")

param_memory = parameter_size(gm)
return act_memory + param_memory, param_memory
def _get_chunk_ratio(self, node, chunk_dim, chunk_size):
shape = node.meta['tensor_meta'].shape
chunk_ratio = float(chunk_size) / shape[chunk_dim]
return chunk_ratio

def _get_chunk_delete_node_size(self, user, user_to_last_uses, chunk_ratio, node_list, start_node, end_node):
if user.op in ('placeholder', 'output'):
return 0
nodes_to_delete = user_to_last_uses.get(user, [])
delete_size = 0
for n in nodes_to_delete:
node_idx = _find_idx_by_name(n.name, node_list)
if start_node <= node_idx < end_node:
delete_size += self._get_output_node_size(n) * chunk_ratio
return delete_size

def _print_mem_log(self, log, nodes, title=None):
if title:
print(title)
for idx, (l, n) in enumerate(zip(log, nodes)):
print("%s:%.2f \t" % (n.name, l), end='')
if (idx + 1) % 3 == 0:
print("")
print("\n")

def estimate_chunk_inference_mem(self, gm: torch.fx.GraphModule, start_nodes, end_nodes, chunk_dims, chunk_sizes):
act_memory = 0.0
act_memory_peak_log = []
act_memory_after_node_log = []
not_contiguous_list = []
user_to_last_uses = self._get_last_usr(list(gm.graph.nodes))
_delete_free_var_from_last_use(user_to_last_uses)
within_chunk = False
region_idx = 0
chunk_ratio = 1 # use it to estimate chunk mem
node_list = list(gm.graph.nodes)

for idx, node in enumerate(node_list):
# if node in chunk start nodes, change chunk ratio and add chunk_tensor
if idx in start_nodes:
within_chunk = True
chunk_ratio = self._get_chunk_ratio(node, chunk_dims[region_idx], chunk_sizes[region_idx])
act_memory += self._get_output_node_size(node_list[end_nodes[region_idx]]) / (1024 ** 2)

# if node is placeholder, just add the size of the node
if node.op == 'placeholder':
act_memory += self._get_meta_node_size(node) * chunk_ratio / (1024 ** 2)
act_memory_peak_log.append(act_memory)
# skip output
elif node.op == 'output':
continue
# node is an operation, calculate tmp, output node and delete node memory
else:
# forward memory
# TODO: permute will create a tmp copy if not contiguous
act_memory += self._get_contiguous_memory(node, not_contiguous_list) * chunk_ratio / (1024 ** 2)
act_memory += self._get_output_node_size(node) * chunk_ratio / (1024 ** 2)
# record max act memory
act_memory_peak_log.append(act_memory)
# delete useless memory
act_memory -= self._get_contiguous_memory(node, not_contiguous_list, delete=True) * chunk_ratio / (1024 ** 2)
if within_chunk:
act_memory -= self._get_chunk_delete_node_size(
node, user_to_last_uses, chunk_ratio, node_list,
start_nodes[region_idx], end_nodes[region_idx]) / (1024 ** 2)
else:
act_memory -= self._get_delete_node_size(node, user_to_last_uses) / (1024 ** 2)

if idx in end_nodes:
act_memory -= self._get_output_node_size(node) * chunk_ratio / (1024 ** 2)
within_chunk = False
chunk_ratio = 1
region_idx += 1

act_memory_after_node_log.append(act_memory)

print("chunk")
self._print_mem_log(act_memory_peak_log, node_list, "peak")
self._print_mem_log(act_memory_after_node_log, node_list, "after")

param_memory = parameter_size(gm)
return act_memory + param_memory, param_memory
def _gen_chunk_slice_dim(chunk_dim, chunk_idx_name, shape):

@@ -695,8 +694,9 @@ def emit_code_with_chunk(body, ckpt_func, nodes, emit_node_func, delete_unused_v
within_chunk_region = False

node_list = list(nodes)
_estimate_chunk_inference_mem(meta_graph, chunk_starts, chunk_ends, [1], [2])
_estimate_inference_mem(meta_graph)
memory_estimator = MemoryEstimator()
memory_estimator.estimate_chunk_inference_mem(meta_graph, chunk_starts, chunk_ends, [1], [2])
memory_estimator.estimate_inference_mem(meta_graph)
node_index_tracer = NodeIndexTracer(meta_graph)
node_index_tracer.trace_node_idx()
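
A minimal usage sketch (not part of this commit) of how the refactored classes are meant to be driven, mirroring the call sites in emit_code_with_chunk above. The TinyModel, the input shape, the ShapeProp pass, and the chunk_codegen import path are illustrative assumptions, and the example assumes the rest of chunk_codegen.py as committed; MemoryEstimator additionally needs node.meta['fwd_out'] from ColossalAI's meta profiling and codegen-chosen chunk_starts/chunk_ends, so those calls are shown commented out.

import torch
import torch.nn as nn
from torch.fx import symbolic_trace
from torch.fx.passes.shape_prop import ShapeProp
# assumed import path; adjust to wherever chunk_codegen.py lives in your checkout
from chunk_codegen import NodeIndexTracer, MemoryEstimator

class TinyModel(nn.Module):
    # toy module built only from ops the tracer currently dispatches on
    def __init__(self):
        super().__init__()
        self.weight = nn.Parameter(torch.randn(16, 16))
        self.bias = nn.Parameter(torch.randn(16))

    def forward(self, x):
        x = nn.functional.linear(x, self.weight, self.bias)
        return torch.relu(x)

meta_graph = symbolic_trace(TinyModel())
# NodeIndexTracer reads node.meta['tensor_meta'], which ShapeProp fills in
ShapeProp(meta_graph).propagate(torch.randn(4, 16))

node_index_tracer = NodeIndexTracer(meta_graph)
node_index_tracer.trace_node_idx()

# MemoryEstimator expects node.meta['fwd_out'] (ColossalAI meta profiling) and
# chunk region indices from the codegen pass, so these calls are not executed here.
memory_estimator = MemoryEstimator()
# memory_estimator.estimate_inference_mem(meta_graph)
# memory_estimator.estimate_chunk_inference_mem(meta_graph, chunk_starts, chunk_ends, [1], [2])

The renaming to underscore-prefixed helpers and the move of the free module-level functions into MemoryEstimator keep the public surface of the file down to the tracer, the estimator, and ChunkCodeGen, which is what the commit message "polish code" refers to.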