[doc] add deepspeed citation and copyright (#2996)

* [doc] add deepspeed citation and copyright

* [doc] add deepspeed citation and copyright

* [doc] add deepspeed citation and copyright
ver217 authored 2023-03-04 20:08:11 +08:00, committed by GitHub
parent e0a1c1321c
commit 823f3b9cf4
19 changed files with 336 additions and 289 deletions


@@ -1,16 +1,16 @@
 #!/usr/bin/env python
 # -*- encoding: utf-8 -*-
+# this code is inspired by the DeepSpeed library and implemented with our own design from scratch
-from typing import List, Iterable
+from typing import Iterable, List, Optional, Type
+
+from torch import Tensor
 from torch.nn import Module
 from torch.nn.modules.loss import _Loss
-from colossalai.logging import get_dist_logger
-from torch import Tensor
-from colossalai.gemini.ophooks import register_ophooks_recursively, BaseOpHook
-from colossalai.engine.schedule import BaseSchedule, NonPipelineSchedule, PipelineSchedule, InterleavedPipelineSchedule
-from typing import Optional, Type
+
 from colossalai.engine.gradient_handler import BaseGradientHandler
+from colossalai.engine.schedule import BaseSchedule, InterleavedPipelineSchedule, NonPipelineSchedule, PipelineSchedule
+from colossalai.gemini.ophooks import BaseOpHook, register_ophooks_recursively
+from colossalai.logging import get_dist_logger
@@ -93,7 +93,7 @@ class Engine:
         if self.uses_pipeline:
             self._schedule.pre_processing(self)
-        #register hook if any
+        # register hook if any
         if len(self._ophook_list) > 0:
             register_ophooks_recursively(self._model, self._ophook_list)
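Note on the hunk above: register_ophooks_recursively walks the model and attaches each registered op hook to every submodule before training starts. The sketch below only illustrates that recursive-registration pattern using plain PyTorch forward pre-hooks; it is an analogy, not ColossalAI's actual BaseOpHook interface.

import torch
import torch.nn as nn


def register_hooks_recursively(module: nn.Module, hooks):
    # Attach each hook to this module and, recursively, to all of its children.
    # This only mimics the shape of register_ophooks_recursively; the real
    # ColossalAI op hooks expose richer pre/post forward and backward callbacks.
    for child in module.children():
        register_hooks_recursively(child, hooks)
    for hook in hooks:
        module.register_forward_pre_hook(hook)


if __name__ == "__main__":
    model = nn.Sequential(nn.Linear(4, 4), nn.ReLU(), nn.Linear(4, 2))

    def log_forward(module, inputs):
        print(f"entering forward of {module.__class__.__name__}")

    register_hooks_recursively(model, [log_forward])
    model(torch.randn(1, 4))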


@@ -1,7 +1,12 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
 import operator
 from functools import reduce
 from typing import Any, Optional, Tuple, Union
 
 import torch
 
 from ..registry import meta_profiler_function


@@ -1,8 +1,13 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+import math
 import operator
 from functools import reduce
-import math
 from typing import Tuple
 
 import torch
 
 from ..registry import meta_profiler_module


@@ -1,5 +1,10 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
 from typing import Tuple, Union
 
 import torch
 
 from ..registry import meta_profiler_module


@@ -1,7 +1,7 @@
-import torch
-from typing import List, Callable, Optional
+# this code is inspired by the DeepSpeed library and implemented with our own design from scratch
 from abc import ABC, abstractmethod
+from typing import Callable, List, Optional
+
+import torch


@ -1,6 +1,7 @@
/* Copyright 2021 The LightSeq Team /* Copyright 2021 The LightSeq Team
Copyright Microsoft DeepSpeed Copyright Microsoft DeepSpeed
This file is adapted from Microsoft DeepSpeed This file is adapted from Microsoft DeepSpeed
Licensed under the MIT License.
*/ */
#include "cublas_wrappers.h" #include "cublas_wrappers.h"


@@ -1,6 +1,7 @@
 /* Copyright 2021 The LightSeq Team
    Copyright Microsoft DeepSpeed
    This file is adapted from Microsoft DeepSpeed
+   Licensed under the MIT License.
 */
 
 #pragma once


@@ -3,6 +3,7 @@
 /* Copyright 2021 The LightSeq Team
    Copyright Microsoft DeepSpeed
    This file is adapted from Microsoft DeepSpeed
+   Licensed under the MIT License.
 */
 #include <cuda.h>
 #include <cuda_fp16.h>


@@ -1,6 +1,7 @@
 /* Copyright 2021 The LightSeq Team
    Copyright Microsoft DeepSpeed
    This file is adapted from Microsoft DeepSpeed
+   Licensed under the MIT License.
 */
 
 #pragma once


@@ -1,5 +1,10 @@
 // modified from
 // https://github.com/NVIDIA/apex/blob/master/csrc/multi_tensor_adam.cu
+/* Copyright 2020 The Microsoft DeepSpeed Team
+   Copyright NVIDIA/apex
+   This file is adapted from fused adam in NVIDIA/apex, commit a109f85
+   Licensed under the MIT License.
+*/
 #include <ATen/ATen.h>
 #include <ATen/AccumulateType.h>
 #include <ATen/cuda/CUDAContext.h>


@@ -1,12 +1,18 @@
-// modified from https://github.com/NVIDIA/apex/blob/master/csrc/multi_tensor_apply.cuh
+// modified from
+// https://github.com/NVIDIA/apex/blob/master/csrc/multi_tensor_apply.cuh
+/* Copyright 2020 The Microsoft DeepSpeed Team
+   Copyright NVIDIA/apex
+   This file is adapted from fused adam in NVIDIA/apex, commit a109f85
+   Licensed under the MIT License.
+*/
 #include <ATen/ATen.h>
 #include <ATen/AccumulateType.h>
 #include <ATen/cuda/CUDAContext.h>
 #include <ATen/cuda/Exceptions.h>
-#include <c10/cuda/CUDAGuard.h>
-#include "compat.h"
 #include <assert.h>
+#include <c10/cuda/CUDAGuard.h>
+#include "compat.h"
 // #include <iostream>
@@ -17,54 +23,52 @@ constexpr int depth_to_max_tensors[5] = {110, 64, 48, 36, 30};
 constexpr int depth_to_max_blocks[5] = {320, 320, 320, 320, 320};
 
 template <int n>
-struct TensorListMetadata
-{
+struct TensorListMetadata {
   void *addresses[n][depth_to_max_tensors[n - 1]];
   int sizes[depth_to_max_tensors[n - 1]];
   unsigned char block_to_tensor[depth_to_max_blocks[n - 1]];
-  int block_to_chunk[depth_to_max_blocks[n - 1]]; // I fear this needs to be a full int.
+  int block_to_chunk[depth_to_max_blocks[n - 1]];  // I fear this needs to be a
+                                                   // full int.
   int start_tensor_this_launch;
 };
 
 template <typename T, typename U, typename... ArgTypes>
-__global__ void multi_tensor_apply_kernel(
-    int chunk_size,
-    volatile int *noop_flag,
-    T tl,
-    U callable,
-    ArgTypes... args)
-{
-  // Hand the chunk information to the user-supplied functor to process however it likes.
+__global__ void multi_tensor_apply_kernel(int chunk_size,
+                                          volatile int *noop_flag, T tl,
+                                          U callable, ArgTypes... args) {
+  // Hand the chunk information to the user-supplied functor to process however
+  // it likes.
   callable(chunk_size, noop_flag, tl, args...);
 }
 
 template <int depth, typename T, typename... ArgTypes>
 void multi_tensor_apply(
-    int block_size,
-    int chunk_size,
-    const at::Tensor &noop_flag,
-    const std::vector<std::vector<at::Tensor>> &tensor_lists,
-    T callable,
-    ArgTypes... args)
-{
+    int block_size, int chunk_size, const at::Tensor &noop_flag,
+    const std::vector<std::vector<at::Tensor>> &tensor_lists, T callable,
+    ArgTypes... args) {
  TORCH_CHECK(tensor_lists.size() == depth, "tensor_lists.size() != depth");
  int len0 = tensor_lists[0].size();
  TORCH_CHECK(len0 > 0, "tensor_lists[0].size() is not > 0");
  auto ref_device = tensor_lists[0][0].device();
  TORCH_CHECK(ref_device.type() == at::kCUDA, "expected input to be on cuda");
-  for (int l = 0; l < tensor_lists.size(); l++) // No range-based for because I need indices
+  for (int l = 0; l < tensor_lists.size();
+       l++)  // No range-based for because I need indices
   {
-    TORCH_CHECK(tensor_lists[l].size() == len0, "Size mismatch among tensor lists");
-    for (int t = 0; t < tensor_lists[l].size(); t++)
-    {
+    TORCH_CHECK(tensor_lists[l].size() == len0,
+                "Size mismatch among tensor lists");
+    for (int t = 0; t < tensor_lists[l].size(); t++) {
       // TODO: Print which tensor fails.
       bool contiguous_memory = tensor_lists[l][t].is_contiguous();
 #ifdef VERSION_GE_1_5
-      contiguous_memory = (contiguous_memory || tensor_lists[l][t].is_contiguous(at::MemoryFormat::ChannelsLast));
+      contiguous_memory =
+          (contiguous_memory ||
+           tensor_lists[l][t].is_contiguous(at::MemoryFormat::ChannelsLast));
 #endif
       TORCH_CHECK(contiguous_memory, "A tensor was not contiguous.");
-      TORCH_CHECK(tensor_lists[l][t].device() == ref_device, "A tensor was not on the same device as the first tensor");
-      TORCH_CHECK(tensor_lists[l][t].numel() == tensor_lists[0][t].numel(), "Size mismatch");
+      TORCH_CHECK(tensor_lists[l][t].device() == ref_device,
+                  "A tensor was not on the same device as the first tensor");
+      TORCH_CHECK(tensor_lists[l][t].numel() == tensor_lists[0][t].numel(),
+                  "Size mismatch");
     }
   }
@@ -78,17 +82,16 @@ void multi_tensor_apply(
   tl.start_tensor_this_launch = 0;
   int loc_block_info = 0;
   int loc_tensor_info = 0;
-  for (int t = 0; t < ntensors; t++)
-  {
+  for (int t = 0; t < ntensors; t++) {
     tl.sizes[loc_tensor_info] = tensor_lists[0][t].numel();
     for (int d = 0; d < depth; d++)
       tl.addresses[d][loc_tensor_info] = tensor_lists[d][t].data_ptr();
     loc_tensor_info++;
-    int chunks_this_tensor = (tensor_lists[0][t].numel() + chunk_size - 1) / chunk_size;
-    for (int chunk = 0; chunk < chunks_this_tensor; chunk++)
-    {
+    int chunks_this_tensor =
+        (tensor_lists[0][t].numel() + chunk_size - 1) / chunk_size;
+    for (int chunk = 0; chunk < chunks_this_tensor; chunk++) {
       // std::cout << chunks_this_tensor << std::endl;
       tl.block_to_tensor[loc_block_info] = loc_tensor_info - 1;
       tl.block_to_chunk[loc_block_info] = chunk;
@@ -98,29 +101,23 @@
                            chunk == chunks_this_tensor - 1);
       bool blocks_full = (loc_block_info == depth_to_max_blocks[depth - 1]);
       bool last_chunk = (t == ntensors - 1 && chunk == chunks_this_tensor - 1);
-      if (tensors_full || blocks_full || last_chunk)
-      {
+      if (tensors_full || blocks_full || last_chunk) {
         // using accscalar_t = acc_type<scalar_t, true>;
         multi_tensor_apply_kernel<<<loc_block_info, block_size, 0, stream>>>(
-            chunk_size,
-            noop_flag.DATA_PTR<int>(),
-            tl,
-            callable,
-            args...);
+            chunk_size, noop_flag.DATA_PTR<int>(), tl, callable, args...);
         AT_CUDA_CHECK(cudaGetLastError());
         // Reset. The control flow possibilities here make my brain hurt.
         loc_block_info = 0;
-        if (chunk == chunks_this_tensor - 1)
-        {
-          // std::cout << "Hit case 1 " << cond1 << " " << cond2 << " " << cond3 << std::endl;
+        if (chunk == chunks_this_tensor - 1) {
+          // std::cout << "Hit case 1 " << cond1 << " " << cond2 << " " << cond3
+          // << std::endl;
           loc_tensor_info = 0;
           tl.start_tensor_this_launch = t + 1;
-        }
-        else
-        {
-          // std::cout << "Hit case 2 " << cond1 << " " << cond2 << " " << cond3 << std::endl;
+        } else {
+          // std::cout << "Hit case 2 " << cond1 << " " << cond2 << " " << cond3
+          // << std::endl;
           tl.sizes[0] = tl.sizes[loc_tensor_info - 1];
           for (int d = 0; d < depth; d++)
             tl.addresses[d][0] = tl.addresses[d][loc_tensor_info - 1];
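The multi_tensor_apply driver above splits every tensor into fixed-size chunks and flushes a kernel launch whenever the per-launch tensor or block metadata tables fill up. The Python sketch below is a hypothetical helper written only to illustrate that chunk-partitioning bookkeeping; it is not part of the extension.

def plan_launches(numels, chunk_size, max_tensors, max_blocks):
    # Each launch is a list of (tensor_index, chunk_index) pairs, mirroring the
    # block_to_tensor / block_to_chunk tables in TensorListMetadata.
    launches = []
    blocks, tensors_in_launch = [], 0
    for t, numel in enumerate(numels):
        tensors_in_launch += 1
        chunks_this_tensor = (numel + chunk_size - 1) // chunk_size
        for chunk in range(chunks_this_tensor):
            blocks.append((t, chunk))
            tensors_full = tensors_in_launch == max_tensors and chunk == chunks_this_tensor - 1
            blocks_full = len(blocks) == max_blocks
            last_chunk = t == len(numels) - 1 and chunk == chunks_this_tensor - 1
            if tensors_full or blocks_full or last_chunk:
                launches.append(blocks)  # one kernel launch handles these blocks
                blocks = []
                # If we flushed mid-tensor, the current tensor carries over into
                # the next launch; otherwise the next launch starts empty.
                tensors_in_launch = 0 if chunk == chunks_this_tensor - 1 else 1
    return launches

# Example: three tensors, 1024-element chunks, deliberately tiny metadata tables.
print(plan_launches([3000, 500, 2048], chunk_size=1024, max_tensors=2, max_blocks=4))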


@@ -1,4 +1,9 @@
 /* Taken from NVIDIA/apex commit 855808f3fc268e9715d613f3c2e56469d8c986d8 */
+/* Copyright 2020 The Microsoft DeepSpeed Team
+   Copyright NVIDIA/apex
+   This file is adapted from fused adam in NVIDIA/apex, commit a109f85
+   Licensed under the MIT License.
+*/
 #include <ATen/ATen.h>
 
 #include "compat.h"


@@ -1,4 +1,11 @@
 # modified from https://github.com/NVIDIA/apex/blob/master/apex/optimizers/fused_adam.py
+'''
+Copyright 2020 The Microsoft DeepSpeed Team
+Copyright NVIDIA/apex
+This file is adapted from fused adam in NVIDIA/apex, commit a109f85
+Licensed under the MIT License.
+'''
 import torch
 
 from colossalai.registry import OPTIMIZERS
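The file above registers the fused Adam optimizer (adapted from NVIDIA/apex) with ColossalAI's OPTIMIZERS registry. A hedged usage sketch follows, assuming the optimizer is exposed as colossalai.nn.optimizer.FusedAdam with apex-style keyword arguments; exact arguments may differ between versions, and the fused kernels require CUDA tensors.

import torch
import torch.nn as nn

from colossalai.nn.optimizer import FusedAdam  # assumed import path

model = nn.Linear(16, 16).cuda()
optimizer = FusedAdam(model.parameters(), lr=1e-3, betas=(0.9, 0.999), weight_decay=0.01)

for _ in range(10):
    optimizer.zero_grad()
    loss = model(torch.randn(4, 16, device="cuda")).sum()
    loss.backward()
    optimizer.step()  # multi-tensor kernels update all parameters in few launches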


@@ -1,3 +1,4 @@
+# this code is inspired by the DeepSpeed library and implemented with our own design from scratch
 import math
 import warnings
 from enum import Enum


@@ -1,7 +1,12 @@
-import torch
+# This code has been adapted from the DeepSpeed library.
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
 import functools
 from typing import Optional
+
+import torch
 
 
 def substitute_init_recursively(cls, func, visited: set):
     for subcls in cls.__subclasses__():
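substitute_init_recursively walks the subclass tree of a base class so that each subclass's __init__ can be wrapped, which is how ZeRO-style initialization intercepts model construction. The sketch below is a rough, self-contained illustration of that pattern under an assumed wrapper function; the real helper in the file above differs in its details.

import functools

def substitute_init_recursively(cls, func, visited: set):
    # Visit every (transitive) subclass once and let `func` patch its __init__.
    for subcls in cls.__subclasses__():
        if subcls not in visited:
            visited.add(subcls)
            substitute_init_recursively(subcls, func, visited)
            func(subcls)

def wrap_init(subcls):
    original_init = subcls.__init__

    @functools.wraps(original_init)
    def wrapped(self, *args, **kwargs):
        print(f"constructing {subcls.__name__}")  # e.g. intercept allocation here
        original_init(self, *args, **kwargs)

    subcls.__init__ = wrapped

class Base:
    pass

class Child(Base):
    def __init__(self, x):
        self.x = x

substitute_init_recursively(Base, wrap_init, visited=set())
Child(1)  # prints "constructing Child"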


@@ -1,3 +1,4 @@
+# this code is inspired by the DeepSpeed library and implemented with our own design from scratch
 import functools
 import itertools
 from collections import OrderedDict


@@ -1,3 +1,4 @@
+# this code is inspired by the DeepSpeed library and implemented with our own design from scratch
 from functools import partial
 from typing import Optional


@@ -1,3 +1,4 @@
+# this code is inspired by the DeepSpeed library and implemented with our own design from scratch
 from enum import Enum
 from os import stat
 from typing import Dict, Optional, Tuple
@@ -5,20 +6,21 @@ from typing import Dict, Optional, Tuple
 import torch
 import torch.distributed as dist
 import torch.nn as nn
-from colossalai.amp.naive_amp.grad_scaler import DynamicGradScaler
-from colossalai.context.parallel_mode import ParallelMode
-from colossalai.core import global_context as gpc
-from colossalai.logging import get_dist_logger
-from colossalai.nn.optimizer import ColossalaiOptimizer
-from colossalai.gemini.tensor_utils import (colo_model_data_tensor_move_inline, colo_tensor_mem_usage)
-from colossalai.zero.sharded_model import ShardedModelV2
-from colossalai.zero.sharded_model._utils import cast_tensor_to_fp32
 from torch import Tensor
 from torch.distributed import ProcessGroup
 from torch.nn.parameter import Parameter
 from torch.optim import Optimizer
-from colossalai.gemini.stateful_tensor import (StatefulTensor, TensorState)
+
+from colossalai.amp.naive_amp.grad_scaler import DynamicGradScaler
+from colossalai.context.parallel_mode import ParallelMode
+from colossalai.core import global_context as gpc
+from colossalai.gemini.stateful_tensor import StatefulTensor, TensorState
 from colossalai.gemini.tensor_placement_policy import AutoTensorPlacementPolicy
+from colossalai.gemini.tensor_utils import colo_model_data_tensor_move_inline, colo_tensor_mem_usage
+from colossalai.logging import get_dist_logger
+from colossalai.nn.optimizer import ColossalaiOptimizer
+from colossalai.zero.sharded_model import ShardedModelV2
+from colossalai.zero.sharded_model._utils import cast_tensor_to_fp32
 
 
 class OptimState(Enum):


@@ -1,3 +1,7 @@
+# This code has been adapted from the DeepSpeed library.
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
 import importlib
 import os
 import time