diff --git a/colossalai/tensor/_ops/linear.py b/colossalai/tensor/_ops/linear.py
index 32b9b1b74..a57599e6e 100644
--- a/colossalai/tensor/_ops/linear.py
+++ b/colossalai/tensor/_ops/linear.py
@@ -6,6 +6,7 @@ from colossalai.nn.layer.parallel_1d._utils import split_forward_gather_backward
 from colossalai.nn.layer.utils import divide
 from colossalai.core import global_context as gpc
 from packaging import version
+from colossalai.utils.cuda import get_current_device
 
 @colo_op_impl(torch.nn.functional.linear)
 def colo_linear(types, args, kwargs, pg):
@@ -39,12 +40,15 @@ def colo_linear(types, args, kwargs, pg):
         # Input:S[1]
         input_per_partition = split_forward_gather_backward(input_tensor, ParallelMode.PARALLEL_1D, dim=-1)
         # Output:P
-        partial_output = torch.nn.functional.linear(input_per_partition, weight.torch_tensor())
+        device = get_current_device()    # TODO where to put to(device)?
+        weight_ = weight.torch_tensor().to(device)
+        partial_output = torch.nn.functional.linear(input_per_partition, weight_)
         # Reduce(Output)
         output = reduce_input(partial_output, ParallelMode.PARALLEL_1D)
         # Bias
         if bias is not None:
-            output = output + bias
+            bias_ = bias.to(device)
+            output = output + bias_
         return output
 
     else:
diff --git a/colossalai/tensor/colo_tensor.py b/colossalai/tensor/colo_tensor.py
index ad2b28e7f..9a99d4c7d 100644
--- a/colossalai/tensor/colo_tensor.py
+++ b/colossalai/tensor/colo_tensor.py
@@ -3,7 +3,10 @@ from .op_wrapper import _COLOSSAL_OPS
 import torch
 from typing import Tuple, Optional
 from numpy import product
-
+from colossalai.core import global_context as gpc
+from colossalai.context import ParallelMode
+from colossalai.nn.layer.utils import divide
+from colossalai.utils.cuda import get_current_device
 
 class ColoTensor(object):
     """ Data Structure for Tensor in Colossal-AI
@@ -85,6 +88,28 @@ class ColoTensor(object):
                                              device=self._device)
         return self._torch_tensor
 
+    def set_spec(self, spec: str, lazy_shard: bool = False) -> None:
+        self._shard_spec = spec
+        if not lazy_shard:
+            self._shard()
+
+    def _shard(self):
+        assert self._shard_spec is not None, 'You should call set_spec() before calling _shard() on a ColoTensor.'
+        if self._shard_spec == "1Drow":    # TODO this actually represents the sharding layout of a Linear-1Drow weight, but we keep it simple for now
+            num_partition = gpc.get_world_size(ParallelMode.TENSOR)
+            local_rank = gpc.get_local_rank(ParallelMode.TENSOR)
+            dim = -1
+            chunk_size = divide(self._size[dim], num_partition)
+            device = get_current_device()
+            # Reshape to get the shard for this rank. We don't want autograd to
+            # record the narrow op here, and the local shard should be a
+            # leaf variable in the autograd graph.
+            self._torch_tensor = self._torch_tensor.narrow(
+                dim, local_rank * chunk_size, chunk_size).detach().contiguous()    # TODO shall we clone() here, since detach() still points to the old tensor?
+            self._torch_tensor.requires_grad = self._requires_grad
+            self._size = self._torch_tensor.size()
+            self._device = device    # TODO a `fake` device for now, because torch_tensor.device is always cpu
+
     @classmethod
     def __torch_function__(cls, func, types, args=(), kwargs=None):
         global _COLOSSAL_OPS
diff --git a/tests/components_to_test/simple_net.py b/tests/components_to_test/simple_net.py
index 487de2062..00a05440c 100644
--- a/tests/components_to_test/simple_net.py
+++ b/tests/components_to_test/simple_net.py
@@ -1,10 +1,10 @@
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 from colossalai.nn import CheckpointModule
 from .utils.dummy_data_generator import DummyDataGenerator
 from .registry import non_distributed_component_funcs
-
+from colossalai.utils.cuda import get_current_device
 
 class SimpleNet(CheckpointModule):
     """
@@ -25,8 +25,8 @@ class SimpleNet(CheckpointModule):
 class DummyDataLoader(DummyDataGenerator):
 
     def generate(self):
-        data = torch.rand(16, 4)
-        label = torch.randint(low=0, high=2, size=(16,))
+        data = torch.rand(16, 4, device=get_current_device())
+        label = torch.randint(low=0, high=2, size=(16,), device=get_current_device())
         return data, label
 
diff --git a/tests/test_tensor/test_linear_tp.py b/tests/test_tensor/test_linear_tp.py
index 4119d60b3..760818efc 100644
--- a/tests/test_tensor/test_linear_tp.py
+++ b/tests/test_tensor/test_linear_tp.py
@@ -35,7 +35,7 @@ def run_linear_tp1d_row_test():
 
     W_shape = (out_features, in_features)
     W_master = torch.randn(W_shape, dtype=dtype, device=device)
-    W = broadcast_tensor_chunk(W_master, chunk_size=DEPTH, local_rank=local_rank)
+    W = broadcast_tensor_chunk(W_master, chunk_size=1)
     W.requires_grad = True
 
     B_shape = (out_features)
@@ -45,7 +45,7 @@ def run_linear_tp1d_row_test():
 
     # replace the torch nn.Parameters with ColoTensor
     sharded_weight = ColoTensor.init_from_torch_tensor(W)
-    sharded_weight._shard_spec = "1Drow"
+    sharded_weight.set_spec(spec="1Drow")    # reshard
     sharded_bias = ColoTensor.init_from_torch_tensor(B)
     replace_parameter_add_grad(layer, sharded_weight, sharded_bias)
     out = layer(A)
diff --git a/tests/test_tensor/test_net_tp.py b/tests/test_tensor/test_net_tp.py
index c39fa34c5..e63e786a2 100644
--- a/tests/test_tensor/test_net_tp.py
+++ b/tests/test_tensor/test_net_tp.py
@@ -23,9 +23,9 @@ def run_simple_net():
     with ColoInitContext():
         model = model_builder(checkpoint=True)
 
-    # TODO(jzy) we set the Specs for weight of each linear.
-    # model.proj1.weight.set_spec('1Drow')
-    # model.proj2.weight.set_spec('1Drow')
+    # set the spec for the weight of each linear layer
+    model.proj1.weight.set_spec('1Drow')
+    model.proj2.weight.set_spec('1Drow')
 
     for i, (data, label) in enumerate(train_dataloader):
         output = model(data)
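For reference, here is a minimal standalone sketch (not part of the patch) of the 1Drow sharding that the new `ColoTensor._shard()` performs: each rank keeps one contiguous chunk of the tensor's last dimension, detached from autograd and copied into its own storage. The helper name `shard_1drow` and the plain `rank`/`world_size` arguments are illustrative stand-ins for the values the real code obtains from `gpc`.

```python
import torch


def shard_1drow(tensor: torch.Tensor, rank: int, world_size: int) -> torch.Tensor:
    """Illustrative stand-in for ColoTensor._shard(); rank/world_size replace
    gpc.get_local_rank()/gpc.get_world_size() used in the patch above."""
    dim = -1
    assert tensor.size(dim) % world_size == 0, 'last dim must be divisible by world size'
    chunk_size = tensor.size(dim) // world_size
    # narrow() returns a view; detach() removes it from the autograd graph and
    # contiguous() copies the (non-contiguous) shard into its own storage, so
    # the result is a leaf tensor that no longer aliases the full weight.
    return tensor.narrow(dim, rank * chunk_size, chunk_size).detach().contiguous()


full = torch.randn(6, 8, requires_grad=True)
shard = shard_1drow(full, rank=1, world_size=4)
assert shard.shape == (6, 2) and shard.is_leaf and not shard.requires_grad
```

In the patch, `_shard()` then restores `requires_grad` on the shard and updates the tensor's recorded size and device.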