[doc] add deepspeed citation and copyright (#2996)

* [doc] add deepspeed citation and copyright

* [doc] add deepspeed citation and copyright

* [doc] add deepspeed citation and copyright
ver217 authored 2023-03-04 20:08:11 +08:00, committed by GitHub
parent e0a1c1321c
commit 823f3b9cf4
19 changed files with 336 additions and 289 deletions


@@ -1,16 +1,16 @@
 #!/usr/bin/env python
 # -*- encoding: utf-8 -*-
+# this code is inspired by the DeepSpeed library and implemented with our own design from scratch
-from typing import List, Iterable
+from typing import Iterable, List, Optional, Type
+
+from torch import Tensor
 from torch.nn import Module
 from torch.nn.modules.loss import _Loss
-from colossalai.logging import get_dist_logger
-from torch import Tensor
-from colossalai.gemini.ophooks import register_ophooks_recursively, BaseOpHook
-from colossalai.engine.schedule import BaseSchedule, NonPipelineSchedule, PipelineSchedule, InterleavedPipelineSchedule
-from typing import Optional, Type
+
 from colossalai.engine.gradient_handler import BaseGradientHandler
+from colossalai.engine.schedule import BaseSchedule, InterleavedPipelineSchedule, NonPipelineSchedule, PipelineSchedule
+from colossalai.gemini.ophooks import BaseOpHook, register_ophooks_recursively
+from colossalai.logging import get_dist_logger
@@ -93,7 +93,7 @@ class Engine:
         if self.uses_pipeline:
             self._schedule.pre_processing(self)
-        #register hook if any
+        # register hook if any
         if len(self._ophook_list) > 0:
             register_ophooks_recursively(self._model, self._ophook_list)
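Note on the hunk above: register_ophooks_recursively walks the model and attaches each registered op hook to every submodule before training starts. The sketch below only illustrates that recursive-registration pattern using plain PyTorch forward pre-hooks; it is an analogy, not ColossalAI's actual BaseOpHook interface.

import torch
import torch.nn as nn


def register_hooks_recursively(module: nn.Module, hooks):
    # Attach each hook to this module and, recursively, to all of its children.
    # This only mimics the shape of register_ophooks_recursively; the real
    # ColossalAI op hooks expose richer pre/post forward and backward callbacks.
    for child in module.children():
        register_hooks_recursively(child, hooks)
    for hook in hooks:
        module.register_forward_pre_hook(hook)


if __name__ == "__main__":
    model = nn.Sequential(nn.Linear(4, 4), nn.ReLU(), nn.Linear(4, 2))

    def log_forward(module, inputs):
        print(f"entering forward of {module.__class__.__name__}")

    register_hooks_recursively(model, [log_forward])
    model(torch.randn(1, 4))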


@@ -1,7 +1,12 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
 import operator
 from functools import reduce
 from typing import Any, Optional, Tuple, Union
 
 import torch
 
 from ..registry import meta_profiler_function


@@ -1,8 +1,13 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+import math
 import operator
 from functools import reduce
-import math
 from typing import Tuple
 
 import torch
 
 from ..registry import meta_profiler_module


@@ -1,5 +1,10 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
 from typing import Tuple, Union
 
 import torch
 
 from ..registry import meta_profiler_module


@@ -1,7 +1,7 @@
-import torch
-from typing import List, Callable, Optional
+# this code is inspired by the DeepSpeed library and implemented with our own design from scratch
 from abc import ABC, abstractmethod
+from typing import Callable, List, Optional
+
+import torch


@ -1,6 +1,7 @@
/* Copyright 2021 The LightSeq Team /* Copyright 2021 The LightSeq Team
Copyright Microsoft DeepSpeed Copyright Microsoft DeepSpeed
This file is adapted from Microsoft DeepSpeed This file is adapted from Microsoft DeepSpeed
Licensed under the MIT License.
*/ */
#include "cublas_wrappers.h" #include "cublas_wrappers.h"


@@ -1,6 +1,7 @@
 /* Copyright 2021 The LightSeq Team
    Copyright Microsoft DeepSpeed
    This file is adapted from Microsoft DeepSpeed
+   Licensed under the MIT License.
 */
 
 #pragma once


@@ -3,6 +3,7 @@
 /* Copyright 2021 The LightSeq Team
    Copyright Microsoft DeepSpeed
    This file is adapted from Microsoft DeepSpeed
+   Licensed under the MIT License.
 */
 #include <cuda.h>
 #include <cuda_fp16.h>


@@ -1,6 +1,7 @@
 /* Copyright 2021 The LightSeq Team
    Copyright Microsoft DeepSpeed
    This file is adapted from Microsoft DeepSpeed
+   Licensed under the MIT License.
 */
 
 #pragma once


@@ -1,5 +1,10 @@
 // modified from
 // https://github.com/NVIDIA/apex/blob/master/csrc/multi_tensor_adam.cu
+/* Copyright 2020 The Microsoft DeepSpeed Team
+   Copyright NVIDIA/apex
+   This file is adapted from fused adam in NVIDIA/apex, commit a109f85
+   Licensed under the MIT License.
+*/
 #include <ATen/ATen.h>
 #include <ATen/AccumulateType.h>
 #include <ATen/cuda/CUDAContext.h>


@@ -1,12 +1,18 @@
-// modified from https://github.com/NVIDIA/apex/blob/master/csrc/multi_tensor_apply.cuh
+// modified from
+// https://github.com/NVIDIA/apex/blob/master/csrc/multi_tensor_apply.cuh
+/* Copyright 2020 The Microsoft DeepSpeed Team
+   Copyright NVIDIA/apex
+   This file is adapted from fused adam in NVIDIA/apex, commit a109f85
+   Licensed under the MIT License.
+*/
 #include <ATen/ATen.h>
 #include <ATen/AccumulateType.h>
 #include <ATen/cuda/CUDAContext.h>
 #include <ATen/cuda/Exceptions.h>
-#include <c10/cuda/CUDAGuard.h>
-#include "compat.h"
 #include <assert.h>
+#include <c10/cuda/CUDAGuard.h>
+#include "compat.h"
 // #include <iostream>
@@ -17,54 +23,52 @@ constexpr int depth_to_max_tensors[5] = {110, 64, 48, 36, 30};
 constexpr int depth_to_max_blocks[5] = {320, 320, 320, 320, 320};
 
 template <int n>
-struct TensorListMetadata
-{
+struct TensorListMetadata {
   void *addresses[n][depth_to_max_tensors[n - 1]];
   int sizes[depth_to_max_tensors[n - 1]];
   unsigned char block_to_tensor[depth_to_max_blocks[n - 1]];
-  int block_to_chunk[depth_to_max_blocks[n - 1]]; // I fear this needs to be a full int.
+  int block_to_chunk[depth_to_max_blocks[n - 1]];  // I fear this needs to be a
+                                                   // full int.
   int start_tensor_this_launch;
 };
 
 template <typename T, typename U, typename... ArgTypes>
-__global__ void multi_tensor_apply_kernel(
-    int chunk_size,
-    volatile int *noop_flag,
-    T tl,
-    U callable,
-    ArgTypes... args)
-{
-  // Hand the chunk information to the user-supplied functor to process however it likes.
+__global__ void multi_tensor_apply_kernel(int chunk_size,
+                                          volatile int *noop_flag, T tl,
+                                          U callable, ArgTypes... args) {
+  // Hand the chunk information to the user-supplied functor to process however
+  // it likes.
   callable(chunk_size, noop_flag, tl, args...);
 }
 
 template <int depth, typename T, typename... ArgTypes>
 void multi_tensor_apply(
-    int block_size,
-    int chunk_size,
-    const at::Tensor &noop_flag,
-    const std::vector<std::vector<at::Tensor>> &tensor_lists,
-    T callable,
-    ArgTypes... args)
-{
+    int block_size, int chunk_size, const at::Tensor &noop_flag,
+    const std::vector<std::vector<at::Tensor>> &tensor_lists, T callable,
+    ArgTypes... args) {
  TORCH_CHECK(tensor_lists.size() == depth, "tensor_lists.size() != depth");
  int len0 = tensor_lists[0].size();
  TORCH_CHECK(len0 > 0, "tensor_lists[0].size() is not > 0");
  auto ref_device = tensor_lists[0][0].device();
  TORCH_CHECK(ref_device.type() == at::kCUDA, "expected input to be on cuda");
-  for (int l = 0; l < tensor_lists.size(); l++) // No range-based for because I need indices
+  for (int l = 0; l < tensor_lists.size();
+       l++)  // No range-based for because I need indices
   {
-    TORCH_CHECK(tensor_lists[l].size() == len0, "Size mismatch among tensor lists");
-    for (int t = 0; t < tensor_lists[l].size(); t++)
-    {
+    TORCH_CHECK(tensor_lists[l].size() == len0,
+                "Size mismatch among tensor lists");
+    for (int t = 0; t < tensor_lists[l].size(); t++) {
       // TODO: Print which tensor fails.
       bool contiguous_memory = tensor_lists[l][t].is_contiguous();
 #ifdef VERSION_GE_1_5
-      contiguous_memory = (contiguous_memory || tensor_lists[l][t].is_contiguous(at::MemoryFormat::ChannelsLast));
+      contiguous_memory =
+          (contiguous_memory ||
+           tensor_lists[l][t].is_contiguous(at::MemoryFormat::ChannelsLast));
 #endif
       TORCH_CHECK(contiguous_memory, "A tensor was not contiguous.");
-      TORCH_CHECK(tensor_lists[l][t].device() == ref_device, "A tensor was not on the same device as the first tensor");
-      TORCH_CHECK(tensor_lists[l][t].numel() == tensor_lists[0][t].numel(), "Size mismatch");
+      TORCH_CHECK(tensor_lists[l][t].device() == ref_device,
+                  "A tensor was not on the same device as the first tensor");
+      TORCH_CHECK(tensor_lists[l][t].numel() == tensor_lists[0][t].numel(),
+                  "Size mismatch");
     }
   }
@@ -78,17 +82,16 @@ void multi_tensor_apply(
   tl.start_tensor_this_launch = 0;
   int loc_block_info = 0;
   int loc_tensor_info = 0;
-  for (int t = 0; t < ntensors; t++)
-  {
+  for (int t = 0; t < ntensors; t++) {
     tl.sizes[loc_tensor_info] = tensor_lists[0][t].numel();
     for (int d = 0; d < depth; d++)
       tl.addresses[d][loc_tensor_info] = tensor_lists[d][t].data_ptr();
     loc_tensor_info++;
-    int chunks_this_tensor = (tensor_lists[0][t].numel() + chunk_size - 1) / chunk_size;
-    for (int chunk = 0; chunk < chunks_this_tensor; chunk++)
-    {
+    int chunks_this_tensor =
+        (tensor_lists[0][t].numel() + chunk_size - 1) / chunk_size;
+    for (int chunk = 0; chunk < chunks_this_tensor; chunk++) {
       // std::cout << chunks_this_tensor << std::endl;
       tl.block_to_tensor[loc_block_info] = loc_tensor_info - 1;
       tl.block_to_chunk[loc_block_info] = chunk;
@@ -98,29 +101,23 @@
                            chunk == chunks_this_tensor - 1);
       bool blocks_full = (loc_block_info == depth_to_max_blocks[depth - 1]);
       bool last_chunk = (t == ntensors - 1 && chunk == chunks_this_tensor - 1);
-      if (tensors_full || blocks_full || last_chunk)
-      {
+      if (tensors_full || blocks_full || last_chunk) {
         // using accscalar_t = acc_type<scalar_t, true>;
         multi_tensor_apply_kernel<<<loc_block_info, block_size, 0, stream>>>(
-            chunk_size,
-            noop_flag.DATA_PTR<int>(),
-            tl,
-            callable,
-            args...);
+            chunk_size, noop_flag.DATA_PTR<int>(), tl, callable, args...);
         AT_CUDA_CHECK(cudaGetLastError());
         // Reset. The control flow possibilities here make my brain hurt.
         loc_block_info = 0;
-        if (chunk == chunks_this_tensor - 1)
-        {
-          // std::cout << "Hit case 1 " << cond1 << " " << cond2 << " " << cond3 << std::endl;
+        if (chunk == chunks_this_tensor - 1) {
+          // std::cout << "Hit case 1 " << cond1 << " " << cond2 << " " << cond3
+          // << std::endl;
           loc_tensor_info = 0;
           tl.start_tensor_this_launch = t + 1;
-        }
-        else
-        {
-          // std::cout << "Hit case 2 " << cond1 << " " << cond2 << " " << cond3 << std::endl;
+        } else {
+          // std::cout << "Hit case 2 " << cond1 << " " << cond2 << " " << cond3
+          // << std::endl;
           tl.sizes[0] = tl.sizes[loc_tensor_info - 1];
           for (int d = 0; d < depth; d++)
             tl.addresses[d][0] = tl.addresses[d][loc_tensor_info - 1];
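The multi_tensor_apply driver above splits every tensor into fixed-size chunks and flushes a kernel launch whenever the per-launch tensor or block metadata tables fill up. The Python sketch below is a hypothetical helper written only to illustrate that chunk-partitioning bookkeeping; it is not part of the extension.

def plan_launches(numels, chunk_size, max_tensors, max_blocks):
    # Each launch is a list of (tensor_index, chunk_index) pairs, mirroring the
    # block_to_tensor / block_to_chunk tables in TensorListMetadata.
    launches = []
    blocks, tensors_in_launch = [], 0
    for t, numel in enumerate(numels):
        tensors_in_launch += 1
        chunks_this_tensor = (numel + chunk_size - 1) // chunk_size
        for chunk in range(chunks_this_tensor):
            blocks.append((t, chunk))
            tensors_full = tensors_in_launch == max_tensors and chunk == chunks_this_tensor - 1
            blocks_full = len(blocks) == max_blocks
            last_chunk = t == len(numels) - 1 and chunk == chunks_this_tensor - 1
            if tensors_full or blocks_full or last_chunk:
                launches.append(blocks)  # one kernel launch handles these blocks
                blocks = []
                # If we flushed mid-tensor, the current tensor carries over into
                # the next launch; otherwise the next launch starts empty.
                tensors_in_launch = 0 if chunk == chunks_this_tensor - 1 else 1
    return launches

# Example: three tensors, 1024-element chunks, deliberately tiny metadata tables.
print(plan_launches([3000, 500, 2048], chunk_size=1024, max_tensors=2, max_blocks=4))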


@@ -1,4 +1,9 @@
 /* Taken from NVIDIA/apex commit 855808f3fc268e9715d613f3c2e56469d8c986d8 */
+/* Copyright 2020 The Microsoft DeepSpeed Team
+   Copyright NVIDIA/apex
+   This file is adapted from fused adam in NVIDIA/apex, commit a109f85
+   Licensed under the MIT License.
+*/
 #include <ATen/ATen.h>
 
 #include "compat.h"


@@ -1,4 +1,11 @@
 # modified from https://github.com/NVIDIA/apex/blob/master/apex/optimizers/fused_adam.py
+'''
+Copyright 2020 The Microsoft DeepSpeed Team
+Copyright NVIDIA/apex
+This file is adapted from fused adam in NVIDIA/apex, commit a109f85
+Licensed under the MIT License.
+'''
 import torch
 
 from colossalai.registry import OPTIMIZERS
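The file above registers the fused Adam optimizer (adapted from NVIDIA/apex) with ColossalAI's OPTIMIZERS registry. A hedged usage sketch follows, assuming the optimizer is exposed as colossalai.nn.optimizer.FusedAdam with apex-style keyword arguments; exact arguments may differ between versions, and the fused kernels require CUDA tensors.

import torch
import torch.nn as nn

from colossalai.nn.optimizer import FusedAdam  # assumed import path

model = nn.Linear(16, 16).cuda()
optimizer = FusedAdam(model.parameters(), lr=1e-3, betas=(0.9, 0.999), weight_decay=0.01)

for _ in range(10):
    optimizer.zero_grad()
    loss = model(torch.randn(4, 16, device="cuda")).sum()
    loss.backward()
    optimizer.step()  # multi-tensor kernels update all parameters in few launches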


@@ -1,3 +1,4 @@
+# this code is inspired by the DeepSpeed library and implemented with our own design from scratch
 import math
 import warnings
 from enum import Enum


@@ -1,7 +1,12 @@
-import torch
+# This code has been adapted from the DeepSpeed library.
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
 import functools
 from typing import Optional
+
+import torch
 
 
 def substitute_init_recursively(cls, func, visited: set):
     for subcls in cls.__subclasses__():
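substitute_init_recursively walks the subclass tree of a base class so that each subclass's __init__ can be wrapped, which is how ZeRO-style initialization intercepts model construction. The sketch below is a rough, self-contained illustration of that pattern under an assumed wrapper function; the real helper in the file above differs in its details.

import functools

def substitute_init_recursively(cls, func, visited: set):
    # Visit every (transitive) subclass once and let `func` patch its __init__.
    for subcls in cls.__subclasses__():
        if subcls not in visited:
            visited.add(subcls)
            substitute_init_recursively(subcls, func, visited)
            func(subcls)

def wrap_init(subcls):
    original_init = subcls.__init__

    @functools.wraps(original_init)
    def wrapped(self, *args, **kwargs):
        print(f"constructing {subcls.__name__}")  # e.g. intercept allocation here
        original_init(self, *args, **kwargs)

    subcls.__init__ = wrapped

class Base:
    pass

class Child(Base):
    def __init__(self, x):
        self.x = x

substitute_init_recursively(Base, wrap_init, visited=set())
Child(1)  # prints "constructing Child"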


@@ -1,3 +1,4 @@
+# this code is inspired by the DeepSpeed library and implemented with our own design from scratch
 import functools
 import itertools
 from collections import OrderedDict


@@ -1,3 +1,4 @@
+# this code is inspired by the DeepSpeed library and implemented with our own design from scratch
 from functools import partial
 from typing import Optional


@@ -1,3 +1,4 @@
+# this code is inspired by the DeepSpeed library and implemented with our own design from scratch
 from enum import Enum
 from os import stat
 from typing import Dict, Optional, Tuple
@@ -5,20 +6,21 @@ from typing import Dict, Optional, Tuple
 import torch
 import torch.distributed as dist
 import torch.nn as nn
-from colossalai.amp.naive_amp.grad_scaler import DynamicGradScaler
-from colossalai.context.parallel_mode import ParallelMode
-from colossalai.core import global_context as gpc
-from colossalai.logging import get_dist_logger
-from colossalai.nn.optimizer import ColossalaiOptimizer
-from colossalai.gemini.tensor_utils import (colo_model_data_tensor_move_inline, colo_tensor_mem_usage)
-from colossalai.zero.sharded_model import ShardedModelV2
-from colossalai.zero.sharded_model._utils import cast_tensor_to_fp32
 from torch import Tensor
 from torch.distributed import ProcessGroup
 from torch.nn.parameter import Parameter
 from torch.optim import Optimizer
-from colossalai.gemini.stateful_tensor import (StatefulTensor, TensorState)
+
+from colossalai.amp.naive_amp.grad_scaler import DynamicGradScaler
+from colossalai.context.parallel_mode import ParallelMode
+from colossalai.core import global_context as gpc
+from colossalai.gemini.stateful_tensor import StatefulTensor, TensorState
 from colossalai.gemini.tensor_placement_policy import AutoTensorPlacementPolicy
+from colossalai.gemini.tensor_utils import colo_model_data_tensor_move_inline, colo_tensor_mem_usage
+from colossalai.logging import get_dist_logger
+from colossalai.nn.optimizer import ColossalaiOptimizer
+from colossalai.zero.sharded_model import ShardedModelV2
+from colossalai.zero.sharded_model._utils import cast_tensor_to_fp32
 
 
 class OptimState(Enum):


@@ -1,3 +1,7 @@
+# This code has been adapted from the DeepSpeed library.
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
 import importlib
 import os
 import time