[inference] streaming Linear 1D Row inference (#1874)

2022-11-10 17:03:21 +08:00 · 2022-11-10 17:03:21 +08:00 · c2947dadf1
parent a141681260
commit c2947dadf1
4 changed files with 629 additions and 554 deletions
--- a/colossalai/nn/layer/parallel_1d/layers.py
+++ b/colossalai/nn/layer/parallel_1d/layers.py
@ -597,9 +597,12 @@ class Linear1D_Row(ParallelLayer):
                 parallel_input: bool = True,
                 skip_bias_add: bool = False,
                 weight_initializer: Callable = init.kaiming_uniform_(a=math.sqrt(5)),
-                 bias_initializer: Callable = init.xavier_uniform_(a=1, scale=1)):
+                 bias_initializer: Callable = init.xavier_uniform_(a=1, scale=1),
+                 stream_chunk_num: int = 1):
        super().__init__()

+        self.stream_chunk_num = stream_chunk_num
+
        # Keep input parameters
        self.in_features = in_features
        self.out_features = out_features
@ -617,6 +620,9 @@ class Linear1D_Row(ParallelLayer):
        factory_kwargs = {'device': get_current_device(), 'dtype': dtype}
        self.weight = Parameter(torch.empty(self.out_features, self.input_size_per_partition, **factory_kwargs))

+        if self.stream_chunk_num > 1:
+            # TODO() work for inference only
+            self.chunk_weight()
        if bias:
            self.bias = Parameter(torch.empty(self.out_features, **factory_kwargs))
        else:
@ -626,6 +632,9 @@ class Linear1D_Row(ParallelLayer):
        self._set_tensor_parallel_attributes()
        set_parallel_input(False)

+    def chunk_weight(self):
+        """Split ``self.weight`` along dim 0 into chunks for the chunked forward path.
+
+        ``torch.chunk`` may return fewer chunks than requested when the
+        dimension does not divide evenly. ``forward`` indexes
+        ``self.weight_list`` with ``range(self.stream_chunk_num)``, so
+        ``self.stream_chunk_num`` is synced to the number of chunks actually
+        produced to keep that indexing in range.
+        """
+        self.weight_list = torch.chunk(self.weight, self.stream_chunk_num, dim=0)
+        # Effective chunk count; can be smaller than the requested one.
+        self.stream_chunk_num = len(self.weight_list)
+
    def reset_parameters(self, weight_initializer, bias_initializer) -> None:
        fan_in, fan_out = self.in_features, self.out_features
        weight_initializer(self.weight, fan_in=fan_in, fan_out=fan_out)
@ -696,10 +705,17 @@ class Linear1D_Row(ParallelLayer):
                input_.shape, self.weight.shape, self.weight.shape[-1] * gpc.tensor_parallel_size)
            input_ = split_forward_gather_backward(input_, ParallelMode.PARALLEL_1D, dim=-1)

-        output_parallel = F.linear(input_, self.weight)
-        # output_parallel = linear_with_async_comm(input_, self.weight, None, ParallelMode.PARALLEL_1D, False)
-        output = reduce_input(output_parallel, ParallelMode.PARALLEL_1D)
-
+        if self.stream_chunk_num > 1:
+            output_parallel_list = [None for i in range(self.stream_chunk_num)]
+            for i in range(self.stream_chunk_num):
+                output_parallel_list[i] = F.linear(input_, self.weight_list[i])
+                output_parallel_list[i] = reduce_input(output_parallel_list[i], ParallelMode.PARALLEL_1D)
+            output = torch.cat(output_parallel_list, dim=-1)
+        else:
+            print(input_.shape, self.weight.shape)
+            output_parallel = F.linear(input_, self.weight)
+            # output_parallel = linear_with_async_comm(input_, self.weight, None, ParallelMode.PARALLEL_1D, False)
+            output = reduce_input(output_parallel, ParallelMode.PARALLEL_1D)
        if not self.skip_bias_add:
            if self.bias is not None:
                output = output + self.bias
--- a/tests/test_fx/test_complete_workflow.py
+++ b/tests/test_fx/test_complete_workflow.py
@ -32,7 +32,7 @@ class MLP(torch.nn.Module):
        return x


-def run_workflow(world_size):
+def run_workflow(world_size, dev):
    # initailization
    with LazyInitContext() as ctx:
        model = MLP(16)
@ -46,7 +46,7 @@ def run_workflow(world_size):
    gm = torch.fx.GraphModule(model, graph, model.__class__.__name__)

    # annotate
-    annotated_gm = transformer_mlp_pass(gm, process_group=ProcessGroup())
+    annotated_gm = transformer_mlp_pass(gm, process_group=ProcessGroup(tp_degree=world_size))
    annotated_gm.recompile()

    # materialization and sharding
@ -61,22 +61,25 @@ def run_workflow(world_size):

    # test forward to make sure that IR transform will produce the same results
    # like how ColoTensor would do it normally
-    data = torch.rand(4, 16)
+    data = torch.rand(4, 16, device=dev)
    non_fx_out = model(data)
    fx_out = annotated_gm(data)
    assert torch.equal(non_fx_out, fx_out), f'{non_fx_out} vs {fx_out}'


-def run_dist(rank, world_size, port):
+def run_dist(rank, world_size, dev, port):
+    """Per-process entry point: call ``colossalai.launch`` (empty config, localhost,
+    NCCL backend) and then run ``run_workflow`` with ``world_size`` and ``dev``."""
    colossalai.launch(config={}, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
-    run_workflow(world_size)
+    run_workflow(world_size, dev)


@pytest.mark.dist
@pytest.mark.parametrize('world_size', [1, 2])
+@pytest.mark.parametrize('dev', ['cuda', 'cpu'])
@rerun_if_address_is_in_use()
-def test_complete_workflow(world_size):
-    run_func = partial(run_dist, world_size=world_size, port=free_port())
+def test_complete_workflow(world_size, dev):
+    """Spawn ``world_size`` processes that each execute ``run_dist`` on ``dev``.
+
+    The ``cpu`` device is only exercised with a single process; the
+    multi-process ``cpu`` combination is reported as skipped.
+    """
+    if dev == 'cpu' and world_size > 1:
+        # Report the combination as skipped instead of silently passing,
+        # so the test report reflects that nothing was executed.
+        pytest.skip('cpu with world_size > 1 is not covered by this test')
+    run_func = partial(run_dist, world_size=world_size, dev=dev, port=free_port())
    mp.spawn(run_func, nprocs=world_size)


--- a/tests/test_layers/test_1d/checks_1d/check_layer_1d.py
+++ b/tests/test_layers/test_1d/checks_1d/check_layer_1d.py
--- a/tests/test_layers/test_1d/test_1d.py
+++ b/tests/test_layers/test_1d/test_1d.py
@ -1,46 +1,49 @@
-#!/usr/bin/env python
-# -*- encoding: utf-8 -*-
-
-from functools import partial
-
-import pytest
-import torch
-import torch.multiprocessing as mp
-from colossalai.core import global_context as gpc
-from colossalai.logging import disable_existing_loggers
-from colossalai.initialize import launch
-from colossalai.utils import free_port
-from colossalai.testing import rerun_if_address_is_in_use
-from checks_1d.check_layer_1d import *
-
-CONFIG = dict(parallel=dict(pipeline=dict(size=1), tensor=dict(size=4, mode='1d')),)
-
-
-def check_layer(rank, world_size, port):
-    disable_existing_loggers()
-    launch(config=CONFIG, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
-
-    check_linear_col()
-    check_linear_row()
-    check_embed()
-    check_vocab_parallel_embed()
-    check_classifier_no_given_weight()
-    check_vocab_parallel_classifier_no_given_weight()
-    check_classifier_given_embed_weight()
-    check_vocab_parallel_classifier_given_embed_weight()
-    check_vocab_parallel_loss()
-
-    gpc.destroy()
-    torch.cuda.empty_cache()
-
-
-@pytest.mark.dist
-@rerun_if_address_is_in_use()
-def test_1d():
-    world_size = 4
-    run_func = partial(check_layer, world_size=world_size, port=free_port())
-    mp.spawn(run_func, nprocs=world_size)
-
-
-if __name__ == '__main__':
-    test_1d()
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+
+from functools import partial
+
+import pytest
+import torch
+import torch.multiprocessing as mp
+from checks_1d.check_layer_1d import *
+
+from colossalai.core import global_context as gpc
+from colossalai.initialize import launch
+from colossalai.logging import disable_existing_loggers
+from colossalai.testing import rerun_if_address_is_in_use
+from colossalai.utils import free_port
+
+# Parallel settings handed to ``launch``: pipeline size 1, tensor parallel size 4 in '1d' mode.
+CONFIG = dict(parallel=dict(pipeline=dict(size=1), tensor=dict(size=4, mode='1d')),)
+
+
+def check_layer(rank, world_size, port):
+    """Worker body: launch with ``CONFIG``, run every 1D layer check, then tear down.
+
+    Args:
+        rank: rank of this process, forwarded to ``launch``.
+        world_size: total number of processes, forwarded to ``launch``.
+        port: port forwarded to ``launch`` (host is ``'localhost'``).
+    """
+    disable_existing_loggers()
+    launch(config=CONFIG, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
+
+    # The checks run one after another in exactly this order.
+    layer_checks = (
+        check_linear_col,
+        check_linear_row,
+        check_embed,
+        check_vocab_parallel_embed,
+        check_classifier_no_given_weight,
+        check_vocab_parallel_classifier_no_given_weight,
+        check_classifier_given_embed_weight,
+        check_vocab_parallel_classifier_given_embed_weight,
+        check_vocab_parallel_loss,
+        check_linear_row_stream_inference,
+    )
+    for run_check in layer_checks:
+        run_check()
+
+    gpc.destroy()
+    torch.cuda.empty_cache()
+
+
+@pytest.mark.dist
+@rerun_if_address_is_in_use()
+def test_1d():
+    """Spawn four worker processes, each running ``check_layer`` on a freshly picked port."""
+    nprocs = 4
+    worker = partial(check_layer, world_size=nprocs, port=free_port())
+    mp.spawn(worker, nprocs=nprocs)
+
+
+# Allow running this test module directly as a script.
+if __name__ == '__main__':
+    test_1d()