[doc] add deepspeed citation and copyright (#2996)

* [doc] add deepspeed citation and copyright * [doc] add deepspeed citation and copyright * [doc] add deepspeed citation and copyright
2023-03-04 20:08:11 +08:00 · 2023-03-04 20:08:11 +08:00 · 823f3b9cf4
parent e0a1c1321c
commit 823f3b9cf4
19 changed files with 336 additions and 289 deletions
--- a/colossalai/engine/_base_engine.py
+++ b/colossalai/engine/_base_engine.py
@ -1,16 +1,16 @@
 #!/usr/bin/env python
 # -*- encoding: utf-8 -*-
+# this code is inspired by the DeepSpeed library and implemented with our own design from scratch

-from typing import List, Iterable
+from typing import Iterable, List, Optional, Type
+
+from torch import Tensor
 from torch.nn import Module
 from torch.nn.modules.loss import _Loss

-from colossalai.logging import get_dist_logger
-from torch import Tensor
-from colossalai.gemini.ophooks import register_ophooks_recursively, BaseOpHook
-from colossalai.engine.schedule import BaseSchedule, NonPipelineSchedule, PipelineSchedule, InterleavedPipelineSchedule
-from typing import Optional, Type
 from colossalai.engine.gradient_handler import BaseGradientHandler
+from colossalai.engine.schedule import BaseSchedule, InterleavedPipelineSchedule, NonPipelineSchedule, PipelineSchedule
+from colossalai.gemini.ophooks import BaseOpHook, register_ophooks_recursively
 from colossalai.logging import get_dist_logger


@ -93,7 +93,7 @@ class Engine:
        if self.uses_pipeline:
            self._schedule.pre_processing(self)

-        #register hook if any
+        # register hook if any
        if len(self._ophook_list) > 0:
            register_ophooks_recursively(self._model, self._ophook_list)

--- a/colossalai/fx/profiler/experimental/profiler_function/arithmetic.py
+++ b/colossalai/fx/profiler/experimental/profiler_function/arithmetic.py
@ -1,7 +1,12 @@
+# Copyright (c) Microsoft Corporation.
+
+# Licensed under the MIT License.
 import operator
 from functools import reduce
 from typing import Any, Optional, Tuple, Union
+
 import torch
+
 from ..registry import meta_profiler_function


--- a/colossalai/fx/profiler/experimental/profiler_module/convolution.py
+++ b/colossalai/fx/profiler/experimental/profiler_module/convolution.py
@ -1,8 +1,13 @@
+# Copyright (c) Microsoft Corporation.
+
+# Licensed under the MIT License.
+import math
 import operator
 from functools import reduce
-import math
 from typing import Tuple
+
 import torch
+
 from ..registry import meta_profiler_module


--- a/colossalai/fx/profiler/experimental/profiler_module/normalization.py
+++ b/colossalai/fx/profiler/experimental/profiler_module/normalization.py
@ -1,5 +1,10 @@
+# Copyright (c) Microsoft Corporation.
+
+# Licensed under the MIT License.
 from typing import Tuple, Union
+
 import torch
+
 from ..registry import meta_profiler_module


--- a/colossalai/gemini/ophooks/utils.py
+++ b/colossalai/gemini/ophooks/utils.py
@ -1,7 +1,7 @@
-import torch
-from typing import List, Callable, Optional
-
+# this code is inspired by the DeepSpeed library and implemented with our own design from scratch
 from abc import ABC, abstractmethod
+from typing import Callable, List, Optional
+
 import torch


--- a/colossalai/kernel/cuda_native/csrc/kernels/cublas_wrappers.cu
+++ b/colossalai/kernel/cuda_native/csrc/kernels/cublas_wrappers.cu
@ -1,6 +1,7 @@
 /* Copyright 2021 The LightSeq Team
   Copyright Microsoft DeepSpeed
   This file is adapted from Microsoft DeepSpeed
+   Licensed under the MIT License.
 */
 #include "cublas_wrappers.h"

--- a/colossalai/kernel/cuda_native/csrc/kernels/include/cublas_wrappers.h
+++ b/colossalai/kernel/cuda_native/csrc/kernels/include/cublas_wrappers.h
@ -1,6 +1,7 @@
 /* Copyright 2021 The LightSeq Team
   Copyright Microsoft DeepSpeed
   This file is adapted from Microsoft DeepSpeed
+   Licensed under the MIT License.
 */
 #pragma once

--- a/colossalai/kernel/cuda_native/csrc/kernels/include/feed_forward.h
+++ b/colossalai/kernel/cuda_native/csrc/kernels/include/feed_forward.h
@ -1,68 +1,69 @@
-#pragma once
-
-/* Copyright 2021 The LightSeq Team
-   Copyright Microsoft DeepSpeed
-   This file is adapted from Microsoft DeepSpeed
-*/
-#include <cuda.h>
-#include <cuda_fp16.h>
-#include <stdio.h>
-
-#include <array>
-
-#include "cublas_wrappers.h"
-#include "kernels.h"
-
-template <typename T>
-class FeedForward {
- public:
-  struct Config {
-    int outputSize;
-    int inputSize;
-    std::array<int, 3> gemm_algos;
-    Config(int outputs, int inputs)
-        : outputSize(outputs),
-          inputSize(inputs),
-          gemm_algos(std::array<int, 3>({99, 99, 99})) {}
-  };
-
-  FeedForward(Config config) : config_(config) {}
-
-  ~FeedForward() {}
-
-  void Forward(int bsz, const T *input_ptr, const T *weights, T *out,
-               cublasHandle_t &_cublasHandle) {
-    float alpha = T(1.);
-    float beta = T(0.);
-
-    cublas_gemm_ex(_cublasHandle, CUBLAS_OP_T, CUBLAS_OP_N, config_.outputSize,
-                   bsz, config_.inputSize, &alpha, &beta, weights, input_ptr,
-                   out, cublasGemmAlgo_t(config_.gemm_algos[0]));
-  }
-  void Backward(int bsz, const T *out_grad, const T *input_ptr,
-                const T *weights, T *weights_grad, T *bias_grad,
-                cublasHandle_t &_cublasHandle, cudaStream_t &stream,
-                T *inp_grad_out = nullptr, T *out_grad_trans_out = nullptr,
-                bool compute_bias = true) {
-    float alpha = (T)1.0, beta = (T)0.0;
-    cublas_gemm_ex(_cublasHandle, CUBLAS_OP_N, CUBLAS_OP_T, config_.inputSize,
-                   config_.outputSize, bsz, &alpha, &beta, input_ptr, out_grad,
-                   weights_grad, cublasGemmAlgo_t(config_.gemm_algos[1]));
-
-    cublas_gemm_ex(_cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, config_.inputSize,
-                   bsz, config_.outputSize, &alpha, &beta, weights, out_grad,
-                   inp_grad_out, cublasGemmAlgo_t(config_.gemm_algos[2]));
-    if (compute_bias) {
-      launch_fuse_transpose_bias_kernel<T>(out_grad, bias_grad, bsz,
-                                           config_.outputSize, stream);
-    }
-  }
-
-  void reset_size(int outputSize, int inputSize) {
-    config_.outputSize = outputSize;
-    config_.inputSize = inputSize;
-  }
-
- private:
-  Config config_;
-};
+#pragma once
+
+/* Copyright 2021 The LightSeq Team
+   Copyright Microsoft DeepSpeed
+   This file is adapted from Microsoft DeepSpeed
+   Licensed under the MIT License.
+*/
+#include <cuda.h>
+#include <cuda_fp16.h>
+#include <stdio.h>
+
+#include <array>
+
+#include "cublas_wrappers.h"
+#include "kernels.h"
+
+template <typename T>
+class FeedForward {
+ public:
+  struct Config {
+    int outputSize;
+    int inputSize;
+    std::array<int, 3> gemm_algos;
+    Config(int outputs, int inputs)
+        : outputSize(outputs),
+          inputSize(inputs),
+          gemm_algos(std::array<int, 3>({99, 99, 99})) {}
+  };
+
+  FeedForward(Config config) : config_(config) {}
+
+  ~FeedForward() {}
+
+  void Forward(int bsz, const T *input_ptr, const T *weights, T *out,
+               cublasHandle_t &_cublasHandle) {
+    float alpha = T(1.);
+    float beta = T(0.);
+
+    cublas_gemm_ex(_cublasHandle, CUBLAS_OP_T, CUBLAS_OP_N, config_.outputSize,
+                   bsz, config_.inputSize, &alpha, &beta, weights, input_ptr,
+                   out, cublasGemmAlgo_t(config_.gemm_algos[0]));
+  }
+  void Backward(int bsz, const T *out_grad, const T *input_ptr,
+                const T *weights, T *weights_grad, T *bias_grad,
+                cublasHandle_t &_cublasHandle, cudaStream_t &stream,
+                T *inp_grad_out = nullptr, T *out_grad_trans_out = nullptr,
+                bool compute_bias = true) {
+    float alpha = (T)1.0, beta = (T)0.0;
+    cublas_gemm_ex(_cublasHandle, CUBLAS_OP_N, CUBLAS_OP_T, config_.inputSize,
+                   config_.outputSize, bsz, &alpha, &beta, input_ptr, out_grad,
+                   weights_grad, cublasGemmAlgo_t(config_.gemm_algos[1]));
+
+    cublas_gemm_ex(_cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, config_.inputSize,
+                   bsz, config_.outputSize, &alpha, &beta, weights, out_grad,
+                   inp_grad_out, cublasGemmAlgo_t(config_.gemm_algos[2]));
+    if (compute_bias) {
+      launch_fuse_transpose_bias_kernel<T>(out_grad, bias_grad, bsz,
+                                           config_.outputSize, stream);
+    }
+  }
+
+  void reset_size(int outputSize, int inputSize) {
+    config_.outputSize = outputSize;
+    config_.inputSize = inputSize;
+  }
+
+ private:
+  Config config_;
+};
--- a/colossalai/kernel/cuda_native/csrc/kernels/include/strided_batch_gemm.h
+++ b/colossalai/kernel/cuda_native/csrc/kernels/include/strided_batch_gemm.h
@ -1,99 +1,100 @@
-/* Copyright 2021 The LightSeq Team
-   Copyright Microsoft DeepSpeed
-   This file is adapted from Microsoft DeepSpeed
-*/
-#pragma once
-
-#include <cuda.h>
-#include <cuda_fp16.h>
-#include <stdio.h>
-
-#include <array>
-
-#include "cublas_wrappers.h"
-
-template <typename T>
-class StridedBatchGemm {
- public:
-  struct Config {
-    int m;
-    int n;
-    int k;
-    float alpha;
-    float beta;
-    cublasOperation_t op_A;
-    cublasOperation_t op_B;
-    std::array<int, 3> gemm_algos;
-
-    Config(float param_alpha, float param_beta, cublasOperation_t opA,
-           cublasOperation_t opB)
-        : alpha(param_alpha),
-          beta(param_beta),
-          op_A(opA),
-          op_B(opB),
-          gemm_algos(std::array<int, 3>({99, 99, 99})) {}
-    void SetConfig(int mm, int nn, int kk) {
-      m = mm;
-      n = nn;
-      k = kk;
-    }
-  };
-
-  StridedBatchGemm(const Config &config) : _config(config) {}
-
-  virtual ~StridedBatchGemm() {}
-
-  void Forward(int bsz, T *output, const T *_buffer_a, const T *_buffer_b,
-               cublasHandle_t handle) {
-    int stride_a = _config.m * _config.k;
-    int stride_b = _config.n * _config.k;
-    int stride_c = _config.m * _config.n;
-
-    cublas_strided_batched_gemm(
-        handle, _config.m, _config.n, _config.k, &_config.alpha, &_config.beta,
-        _buffer_a, _buffer_b, output, _config.op_A, _config.op_B, stride_a,
-        stride_b, stride_c, bsz, cublasGemmAlgo_t(_config.gemm_algos[0]));
-  }
-
-  void Backward(int bsz, const T *d_output, const T *_buffer_a,
-                const T *_buffer_b, cublasHandle_t handle,
-                T *inpGradA = nullptr, T *inpGradB = nullptr) {
-    int mb = (_config.op_A == CUBLAS_OP_T ? _config.k : _config.m);
-    int kb = (_config.op_A == CUBLAS_OP_T ? _config.m : _config.k);
-
-    int stride_a = mb * _config.n;
-    int stride_b = _config.n * kb;
-    int stride_c = _config.m * _config.k;
-
-    // B need to transpose.
-    cublasOperation_t op_b =
-        (_config.op_B == CUBLAS_OP_T ? CUBLAS_OP_N : CUBLAS_OP_T);
-
-    // Calculate d_A.
-    cublas_strided_batched_gemm(
-        handle, mb, kb, _config.n, &_config.alpha, &_config.beta,
-        (_config.op_A == CUBLAS_OP_T ? _buffer_b : d_output),
-        (_config.op_A == CUBLAS_OP_T ? d_output : _buffer_b), inpGradA,
-        CUBLAS_OP_N, op_b, stride_a, stride_b, stride_c, bsz,
-        cublasGemmAlgo_t(_config.gemm_algos[1]));
-
-    // A need to transpose.
-    cublasOperation_t op_a =
-        (_config.op_A == CUBLAS_OP_T ? CUBLAS_OP_N : CUBLAS_OP_T);
-
-    stride_a = _config.m * _config.k;
-    stride_b = _config.m * _config.n;
-    stride_c = _config.n * _config.k;
-
-    // Calculate d_B.
-    cublas_strided_batched_gemm(
-        handle, _config.k, _config.n, _config.m, &_config.alpha, &_config.beta,
-        _buffer_a, d_output, inpGradB, op_a, CUBLAS_OP_N, stride_a, stride_b,
-        stride_c, bsz, cublasGemmAlgo_t(_config.gemm_algos[2]));
-  }
-
-  inline void SetConfig(int m, int n, int k) { _config.SetConfig(m, n, k); }
-
- private:
-  Config _config;
-};
+/* Copyright 2021 The LightSeq Team
+   Copyright Microsoft DeepSpeed
+   This file is adapted from Microsoft DeepSpeed
+   Licensed under the MIT License.
+*/
+#pragma once
+
+#include <cuda.h>
+#include <cuda_fp16.h>
+#include <stdio.h>
+
+#include <array>
+
+#include "cublas_wrappers.h"
+
+template <typename T>
+class StridedBatchGemm {
+ public:
+  struct Config {
+    int m;
+    int n;
+    int k;
+    float alpha;
+    float beta;
+    cublasOperation_t op_A;
+    cublasOperation_t op_B;
+    std::array<int, 3> gemm_algos;
+
+    Config(float param_alpha, float param_beta, cublasOperation_t opA,
+           cublasOperation_t opB)
+        : alpha(param_alpha),
+          beta(param_beta),
+          op_A(opA),
+          op_B(opB),
+          gemm_algos(std::array<int, 3>({99, 99, 99})) {}
+    void SetConfig(int mm, int nn, int kk) {
+      m = mm;
+      n = nn;
+      k = kk;
+    }
+  };
+
+  StridedBatchGemm(const Config &config) : _config(config) {}
+
+  virtual ~StridedBatchGemm() {}
+
+  void Forward(int bsz, T *output, const T *_buffer_a, const T *_buffer_b,
+               cublasHandle_t handle) {
+    int stride_a = _config.m * _config.k;
+    int stride_b = _config.n * _config.k;
+    int stride_c = _config.m * _config.n;
+
+    cublas_strided_batched_gemm(
+        handle, _config.m, _config.n, _config.k, &_config.alpha, &_config.beta,
+        _buffer_a, _buffer_b, output, _config.op_A, _config.op_B, stride_a,
+        stride_b, stride_c, bsz, cublasGemmAlgo_t(_config.gemm_algos[0]));
+  }
+
+  void Backward(int bsz, const T *d_output, const T *_buffer_a,
+                const T *_buffer_b, cublasHandle_t handle,
+                T *inpGradA = nullptr, T *inpGradB = nullptr) {
+    int mb = (_config.op_A == CUBLAS_OP_T ? _config.k : _config.m);
+    int kb = (_config.op_A == CUBLAS_OP_T ? _config.m : _config.k);
+
+    int stride_a = mb * _config.n;
+    int stride_b = _config.n * kb;
+    int stride_c = _config.m * _config.k;
+
+    // B need to transpose.
+    cublasOperation_t op_b =
+        (_config.op_B == CUBLAS_OP_T ? CUBLAS_OP_N : CUBLAS_OP_T);
+
+    // Calculate d_A.
+    cublas_strided_batched_gemm(
+        handle, mb, kb, _config.n, &_config.alpha, &_config.beta,
+        (_config.op_A == CUBLAS_OP_T ? _buffer_b : d_output),
+        (_config.op_A == CUBLAS_OP_T ? d_output : _buffer_b), inpGradA,
+        CUBLAS_OP_N, op_b, stride_a, stride_b, stride_c, bsz,
+        cublasGemmAlgo_t(_config.gemm_algos[1]));
+
+    // A need to transpose.
+    cublasOperation_t op_a =
+        (_config.op_A == CUBLAS_OP_T ? CUBLAS_OP_N : CUBLAS_OP_T);
+
+    stride_a = _config.m * _config.k;
+    stride_b = _config.m * _config.n;
+    stride_c = _config.n * _config.k;
+
+    // Calculate d_B.
+    cublas_strided_batched_gemm(
+        handle, _config.k, _config.n, _config.m, &_config.alpha, &_config.beta,
+        _buffer_a, d_output, inpGradB, op_a, CUBLAS_OP_N, stride_a, stride_b,
+        stride_c, bsz, cublasGemmAlgo_t(_config.gemm_algos[2]));
+  }
+
+  inline void SetConfig(int m, int n, int k) { _config.SetConfig(m, n, k); }
+
+ private:
+  Config _config;
+};
--- a/colossalai/kernel/cuda_native/csrc/multi_tensor_adam.cu
+++ b/colossalai/kernel/cuda_native/csrc/multi_tensor_adam.cu
@ -1,5 +1,10 @@
 // modified from
 // https://github.com/NVIDIA/apex/blob/master/csrc/multi_tensor_adam.cu
+/* Copyright 2020 The Microsoft DeepSpeed Team
+   Copyright NVIDIA/apex
+   This file is adapted from fused adam in NVIDIA/apex, commit a109f85
+   Licensed under the MIT License.
+*/
 #include <ATen/ATen.h>
 #include <ATen/AccumulateType.h>
 #include <ATen/cuda/CUDAContext.h>
--- a/colossalai/kernel/cuda_native/csrc/multi_tensor_apply.cuh
+++ b/colossalai/kernel/cuda_native/csrc/multi_tensor_apply.cuh
@ -1,12 +1,18 @@
-// modified from https://github.com/NVIDIA/apex/blob/master/csrc/multi_tensor_apply.cuh
+// modified from
+// https://github.com/NVIDIA/apex/blob/master/csrc/multi_tensor_apply.cuh
+/* Copyright 2020 The Microsoft DeepSpeed Team
+   Copyright NVIDIA/apex
+   This file is adapted from fused adam in NVIDIA/apex, commit a109f85
+   Licensed under the MIT License.
+*/
 #include <ATen/ATen.h>
 #include <ATen/AccumulateType.h>
 #include <ATen/cuda/CUDAContext.h>
 #include <ATen/cuda/Exceptions.h>
-#include <c10/cuda/CUDAGuard.h>
-#include "compat.h"
-
 #include <assert.h>
+#include <c10/cuda/CUDAGuard.h>
+
+#include "compat.h"

 // #include <iostream>

@ -17,117 +23,108 @@ constexpr int depth_to_max_tensors[5] = {110, 64, 48, 36, 30};
 constexpr int depth_to_max_blocks[5] = {320, 320, 320, 320, 320};

 template <int n>
-struct TensorListMetadata
-{
-    void *addresses[n][depth_to_max_tensors[n - 1]];
-    int sizes[depth_to_max_tensors[n - 1]];
-    unsigned char block_to_tensor[depth_to_max_blocks[n - 1]];
-    int block_to_chunk[depth_to_max_blocks[n - 1]]; // I fear this needs to be a full int.
-    int start_tensor_this_launch;
+struct TensorListMetadata {
+  void *addresses[n][depth_to_max_tensors[n - 1]];
+  int sizes[depth_to_max_tensors[n - 1]];
+  unsigned char block_to_tensor[depth_to_max_blocks[n - 1]];
+  int block_to_chunk[depth_to_max_blocks[n - 1]];  // I fear this needs to be a
+                                                   // full int.
+  int start_tensor_this_launch;
 };

 template <typename T, typename U, typename... ArgTypes>
-__global__ void multi_tensor_apply_kernel(
-    int chunk_size,
-    volatile int *noop_flag,
-    T tl,
-    U callable,
-    ArgTypes... args)
-{
-    // Hand the chunk information to the user-supplied functor to process however it likes.
-    callable(chunk_size, noop_flag, tl, args...);
+__global__ void multi_tensor_apply_kernel(int chunk_size,
+                                          volatile int *noop_flag, T tl,
+                                          U callable, ArgTypes... args) {
+  // Hand the chunk information to the user-supplied functor to process however
+  // it likes.
+  callable(chunk_size, noop_flag, tl, args...);
 }

 template <int depth, typename T, typename... ArgTypes>
 void multi_tensor_apply(
-    int block_size,
-    int chunk_size,
-    const at::Tensor &noop_flag,
-    const std::vector<std::vector<at::Tensor>> &tensor_lists,
-    T callable,
-    ArgTypes... args)
-{
-    TORCH_CHECK(tensor_lists.size() == depth, "tensor_lists.size() != depth");
-    int len0 = tensor_lists[0].size();
-    TORCH_CHECK(len0 > 0, "tensor_lists[0].size() is not > 0");
-    auto ref_device = tensor_lists[0][0].device();
-    TORCH_CHECK(ref_device.type() == at::kCUDA, "expected input to be on cuda");
-    for (int l = 0; l < tensor_lists.size(); l++) // No range-based for because I need indices
-    {
-        TORCH_CHECK(tensor_lists[l].size() == len0, "Size mismatch among tensor lists");
-        for (int t = 0; t < tensor_lists[l].size(); t++)
-        {
-            // TODO:  Print which tensor fails.
-            bool contiguous_memory = tensor_lists[l][t].is_contiguous();
+    int block_size, int chunk_size, const at::Tensor &noop_flag,
+    const std::vector<std::vector<at::Tensor>> &tensor_lists, T callable,
+    ArgTypes... args) {
+  TORCH_CHECK(tensor_lists.size() == depth, "tensor_lists.size() != depth");
+  int len0 = tensor_lists[0].size();
+  TORCH_CHECK(len0 > 0, "tensor_lists[0].size() is not > 0");
+  auto ref_device = tensor_lists[0][0].device();
+  TORCH_CHECK(ref_device.type() == at::kCUDA, "expected input to be on cuda");
+  for (int l = 0; l < tensor_lists.size();
+       l++)  // No range-based for because I need indices
+  {
+    TORCH_CHECK(tensor_lists[l].size() == len0,
+                "Size mismatch among tensor lists");
+    for (int t = 0; t < tensor_lists[l].size(); t++) {
+      // TODO:  Print which tensor fails.
+      bool contiguous_memory = tensor_lists[l][t].is_contiguous();
 #ifdef VERSION_GE_1_5
-            contiguous_memory = (contiguous_memory || tensor_lists[l][t].is_contiguous(at::MemoryFormat::ChannelsLast));
+      contiguous_memory =
+          (contiguous_memory ||
+           tensor_lists[l][t].is_contiguous(at::MemoryFormat::ChannelsLast));
 #endif
-            TORCH_CHECK(contiguous_memory, "A tensor was not contiguous.");
-            TORCH_CHECK(tensor_lists[l][t].device() == ref_device, "A tensor was not on the same device as the first tensor");
-            TORCH_CHECK(tensor_lists[l][t].numel() == tensor_lists[0][t].numel(), "Size mismatch");
-        }
+      TORCH_CHECK(contiguous_memory, "A tensor was not contiguous.");
+      TORCH_CHECK(tensor_lists[l][t].device() == ref_device,
+                  "A tensor was not on the same device as the first tensor");
+      TORCH_CHECK(tensor_lists[l][t].numel() == tensor_lists[0][t].numel(),
+                  "Size mismatch");
    }
+  }

-    int ntensors = tensor_lists[0].size();
+  int ntensors = tensor_lists[0].size();

-    TensorListMetadata<depth> tl;
+  TensorListMetadata<depth> tl;

-    const at::cuda::OptionalCUDAGuard device_guard(device_of(tensor_lists[0][0]));
-    auto stream = at::cuda::getCurrentCUDAStream();
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(tensor_lists[0][0]));
+  auto stream = at::cuda::getCurrentCUDAStream();

-    tl.start_tensor_this_launch = 0;
-    int loc_block_info = 0;
-    int loc_tensor_info = 0;
-    for (int t = 0; t < ntensors; t++)
-    {
-        tl.sizes[loc_tensor_info] = tensor_lists[0][t].numel();
-        for (int d = 0; d < depth; d++)
-            tl.addresses[d][loc_tensor_info] = tensor_lists[d][t].data_ptr();
-        loc_tensor_info++;
+  tl.start_tensor_this_launch = 0;
+  int loc_block_info = 0;
+  int loc_tensor_info = 0;
+  for (int t = 0; t < ntensors; t++) {
+    tl.sizes[loc_tensor_info] = tensor_lists[0][t].numel();
+    for (int d = 0; d < depth; d++)
+      tl.addresses[d][loc_tensor_info] = tensor_lists[d][t].data_ptr();
+    loc_tensor_info++;

-        int chunks_this_tensor = (tensor_lists[0][t].numel() + chunk_size - 1) / chunk_size;
+    int chunks_this_tensor =
+        (tensor_lists[0][t].numel() + chunk_size - 1) / chunk_size;

-        for (int chunk = 0; chunk < chunks_this_tensor; chunk++)
-        {
-            // std::cout << chunks_this_tensor << std::endl;
-            tl.block_to_tensor[loc_block_info] = loc_tensor_info - 1;
-            tl.block_to_chunk[loc_block_info] = chunk;
-            loc_block_info++;
+    for (int chunk = 0; chunk < chunks_this_tensor; chunk++) {
+      // std::cout << chunks_this_tensor << std::endl;
+      tl.block_to_tensor[loc_block_info] = loc_tensor_info - 1;
+      tl.block_to_chunk[loc_block_info] = chunk;
+      loc_block_info++;

-            bool tensors_full = (loc_tensor_info == depth_to_max_tensors[depth - 1] &&
-                                 chunk == chunks_this_tensor - 1);
-            bool blocks_full = (loc_block_info == depth_to_max_blocks[depth - 1]);
-            bool last_chunk = (t == ntensors - 1 && chunk == chunks_this_tensor - 1);
-            if (tensors_full || blocks_full || last_chunk)
-            {
-                // using accscalar_t = acc_type<scalar_t, true>;
-                multi_tensor_apply_kernel<<<loc_block_info, block_size, 0, stream>>>(
-                    chunk_size,
-                    noop_flag.DATA_PTR<int>(),
-                    tl,
-                    callable,
-                    args...);
+      bool tensors_full = (loc_tensor_info == depth_to_max_tensors[depth - 1] &&
+                           chunk == chunks_this_tensor - 1);
+      bool blocks_full = (loc_block_info == depth_to_max_blocks[depth - 1]);
+      bool last_chunk = (t == ntensors - 1 && chunk == chunks_this_tensor - 1);
+      if (tensors_full || blocks_full || last_chunk) {
+        // using accscalar_t = acc_type<scalar_t, true>;
+        multi_tensor_apply_kernel<<<loc_block_info, block_size, 0, stream>>>(
+            chunk_size, noop_flag.DATA_PTR<int>(), tl, callable, args...);

-                AT_CUDA_CHECK(cudaGetLastError());
+        AT_CUDA_CHECK(cudaGetLastError());

-                // Reset.  The control flow possibilities here make my brain hurt.
-                loc_block_info = 0;
-                if (chunk == chunks_this_tensor - 1)
-                {
-                    // std::cout << "Hit case 1 " << cond1 << " " << cond2 << " " << cond3 << std::endl;
-                    loc_tensor_info = 0;
-                    tl.start_tensor_this_launch = t + 1;
-                }
-                else
-                {
-                    // std::cout << "Hit case 2 " << cond1 << " " << cond2 << " " << cond3 << std::endl;
-                    tl.sizes[0] = tl.sizes[loc_tensor_info - 1];
-                    for (int d = 0; d < depth; d++)
-                        tl.addresses[d][0] = tl.addresses[d][loc_tensor_info - 1];
-                    loc_tensor_info = 1;
-                    tl.start_tensor_this_launch = t;
-                }
-            }
+        // Reset.  The control flow possibilities here make my brain hurt.
+        loc_block_info = 0;
+        if (chunk == chunks_this_tensor - 1) {
+          // std::cout << "Hit case 1 " << cond1 << " " << cond2 << " " << cond3
+          // << std::endl;
+          loc_tensor_info = 0;
+          tl.start_tensor_this_launch = t + 1;
+        } else {
+          // std::cout << "Hit case 2 " << cond1 << " " << cond2 << " " << cond3
+          // << std::endl;
+          tl.sizes[0] = tl.sizes[loc_tensor_info - 1];
+          for (int d = 0; d < depth; d++)
+            tl.addresses[d][0] = tl.addresses[d][loc_tensor_info - 1];
+          loc_tensor_info = 1;
+          tl.start_tensor_this_launch = t;
        }
+      }
    }
-}
+  }
+}
--- a/colossalai/kernel/cuda_native/csrc/type_shim.h
+++ b/colossalai/kernel/cuda_native/csrc/type_shim.h
@ -1,4 +1,9 @@
 /* Taken from NVIDIA/apex commit 855808f3fc268e9715d613f3c2e56469d8c986d8 */
+/* Copyright 2020 The Microsoft DeepSpeed Team
+   Copyright NVIDIA/apex
+   This file is adapted from fused adam in NVIDIA/apex, commit a109f85
+   Licensed under the MIT License.
+*/
 #include <ATen/ATen.h>

 #include "compat.h"
--- a/colossalai/nn/optimizer/fused_adam.py
+++ b/colossalai/nn/optimizer/fused_adam.py
@ -1,4 +1,11 @@
 # modified from https://github.com/NVIDIA/apex/blob/master/apex/optimizers/fused_adam.py
+'''
+Copyright 2020 The Microsoft DeepSpeed Team
+
+Copyright NVIDIA/apex
+This file is adapted from fused adam in NVIDIA/apex, commit a109f85
+Licensed under the MIT License.
+'''
 import torch

 from colossalai.registry import OPTIMIZERS
--- a/colossalai/nn/optimizer/zero_optimizer.py
+++ b/colossalai/nn/optimizer/zero_optimizer.py
@ -1,3 +1,4 @@
+# this code is inspired by the DeepSpeed library and implemented with our own design from scratch
 import math
 import warnings
 from enum import Enum
--- a/colossalai/utils/model/utils.py
+++ b/colossalai/utils/model/utils.py
@ -1,7 +1,12 @@
-import torch
+# This code has been adapted from the DeepSpeed library.
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
 import functools
 from typing import Optional

+import torch
+

 def substitute_init_recursively(cls, func, visited: set):
    for subcls in cls.__subclasses__():
--- a/colossalai/zero/sharded_model/sharded_model_v2.py
+++ b/colossalai/zero/sharded_model/sharded_model_v2.py
@ -1,3 +1,4 @@
+# this code is inspired by the DeepSpeed library and implemented with our own design from scratch
 import functools
 import itertools
 from collections import OrderedDict
--- a/colossalai/zero/sharded_optim/low_level_optim.py
+++ b/colossalai/zero/sharded_optim/low_level_optim.py
@ -1,3 +1,4 @@
+# this code is inspired by the DeepSpeed library and implemented with our own design from scratch
 from functools import partial
 from typing import Optional

--- a/colossalai/zero/sharded_optim/sharded_optim_v2.py
+++ b/colossalai/zero/sharded_optim/sharded_optim_v2.py
@ -1,3 +1,4 @@
+# this code is inspired by the DeepSpeed library and implemented with our own design from scratch
 from enum import Enum
 from os import stat
 from typing import Dict, Optional, Tuple
@ -5,20 +6,21 @@ from typing import Dict, Optional, Tuple
 import torch
 import torch.distributed as dist
 import torch.nn as nn
-from colossalai.amp.naive_amp.grad_scaler import DynamicGradScaler
-from colossalai.context.parallel_mode import ParallelMode
-from colossalai.core import global_context as gpc
-from colossalai.logging import get_dist_logger
-from colossalai.nn.optimizer import ColossalaiOptimizer
-from colossalai.gemini.tensor_utils import (colo_model_data_tensor_move_inline, colo_tensor_mem_usage)
-from colossalai.zero.sharded_model import ShardedModelV2
-from colossalai.zero.sharded_model._utils import cast_tensor_to_fp32
 from torch import Tensor
 from torch.distributed import ProcessGroup
 from torch.nn.parameter import Parameter
 from torch.optim import Optimizer
-from colossalai.gemini.stateful_tensor import (StatefulTensor, TensorState)
+
+from colossalai.amp.naive_amp.grad_scaler import DynamicGradScaler
+from colossalai.context.parallel_mode import ParallelMode
+from colossalai.core import global_context as gpc
+from colossalai.gemini.stateful_tensor import StatefulTensor, TensorState
 from colossalai.gemini.tensor_placement_policy import AutoTensorPlacementPolicy
+from colossalai.gemini.tensor_utils import colo_model_data_tensor_move_inline, colo_tensor_mem_usage
+from colossalai.logging import get_dist_logger
+from colossalai.nn.optimizer import ColossalaiOptimizer
+from colossalai.zero.sharded_model import ShardedModelV2
+from colossalai.zero.sharded_model._utils import cast_tensor_to_fp32


 class OptimState(Enum):
@ -36,9 +38,9 @@ class ShardedOptimizerV2(ColossalaiOptimizer):
    `PatrickStar: Parallel Training of Pre-trained Models via Chunk-based Memory Management`_

    GPU margin space is the remaining space after removing peak non-model data from the overall GPU memory,
-    which is detected by a runtime memory tracer. 
+    which is detected by a runtime memory tracer.

-    We place as many OS chunks in the margin space as possible. 
+    We place as many OS chunks in the margin space as possible.

    The size of margin space can be controlled by ``gpu_margin_mem_ratio``.
    If it is set as ``0.0``, it is the same as classical ZeRO optimizer.
@ -54,8 +56,8 @@ class ShardedOptimizerV2(ColossalaiOptimizer):
        sharded_model (ShardedModelV2): A sharded model initialized by class ShardedModelV2. The optimizer will use the
            shard strategy provided by sharded model to shard param fp32 tensors.
        optimizer (Optimizer): An Optimizer instance.
-        gpu_margin_mem_ratio (float, optional): The ratio of GPU remaining memory (after the first forward-backward) 
-            which will be used when using hybrid CPU optimizer. 
+        gpu_margin_mem_ratio (float, optional): The ratio of GPU remaining memory (after the first forward-backward)
+            which will be used when using hybrid CPU optimizer.
            This argument is meaningless when `tensor_placement_policy` of `ShardedModelV2` is not "auto".
            Defaults to 0.0.
        initial_scale (float, optional): Initial scale used by DynamicGradScaler. Defaults to 2**32.
--- a/op_builder/builder.py
+++ b/op_builder/builder.py
@ -1,3 +1,7 @@
+# This code has been adapted from the DeepSpeed library.
+# Copyright (c) Microsoft Corporation.
+
+# Licensed under the MIT License.
 import importlib
 import os
 import time