mirror of https://github.com/hpcaitech/ColossalAI
[doc] add deepspeed citation and copyright (#2996)
* [doc] add deepspeed citation and copyright
* [doc] add deepspeed citation and copyright
* [doc] add deepspeed citation and copyright

pull/2999/head
parent e0a1c1321c
commit 823f3b9cf4
@@ -1,16 +1,16 @@
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# this code is inspired by the DeepSpeed library and implemented with our own design from scratch

from typing import List, Iterable
from typing import Iterable, List, Optional, Type

from torch import Tensor
from torch.nn import Module
from torch.nn.modules.loss import _Loss

from colossalai.logging import get_dist_logger
from torch import Tensor
from colossalai.gemini.ophooks import register_ophooks_recursively, BaseOpHook
from colossalai.engine.schedule import BaseSchedule, NonPipelineSchedule, PipelineSchedule, InterleavedPipelineSchedule
from typing import Optional, Type
from colossalai.engine.gradient_handler import BaseGradientHandler
from colossalai.engine.schedule import BaseSchedule, InterleavedPipelineSchedule, NonPipelineSchedule, PipelineSchedule
from colossalai.gemini.ophooks import BaseOpHook, register_ophooks_recursively
from colossalai.logging import get_dist_logger

@@ -93,7 +93,7 @@ class Engine:
if self.uses_pipeline:
self._schedule.pre_processing(self)

#register hook if any
# register hook if any
if len(self._ophook_list) > 0:
register_ophooks_recursively(self._model, self._ophook_list)

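A rough sketch of what the op-hook registration called above amounts to (simplified, hypothetical names, not the actual colossalai.gemini.ophooks API): walk the module tree and attach each hook's pre/post callbacks as PyTorch forward hooks.

import torch.nn as nn

class SimpleOpHook:
    # Toy stand-in for an op hook with pre/post forward callbacks.
    def pre_fwd_exec(self, module, *inputs):
        print(f"before {module.__class__.__name__}")

    def post_fwd_exec(self, module, output):
        print(f"after {module.__class__.__name__}")

def register_hooks_recursively(module: nn.Module, hooks):
    # Register on this module, then recurse into every child module.
    for hook in hooks:
        module.register_forward_pre_hook(lambda m, inp, h=hook: h.pre_fwd_exec(m, *inp))
        module.register_forward_hook(lambda m, inp, out, h=hook: h.post_fwd_exec(m, out))
    for child in module.children():
        register_hooks_recursively(child, hooks)
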
@@ -1,7 +1,12 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
import operator
from functools import reduce
from typing import Any, Optional, Tuple, Union

import torch

from ..registry import meta_profiler_function

@@ -1,8 +1,13 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
import math
import operator
from functools import reduce
import math
from typing import Tuple

import torch

from ..registry import meta_profiler_module

@@ -1,5 +1,10 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
from typing import Tuple, Union

import torch

from ..registry import meta_profiler_module

@@ -1,7 +1,7 @@
import torch
from typing import List, Callable, Optional

# this code is inspired by the DeepSpeed library and implemented with our own design from scratch
from abc import ABC, abstractmethod
from typing import Callable, List, Optional

import torch

@@ -1,6 +1,7 @@
/* Copyright 2021 The LightSeq Team
Copyright Microsoft DeepSpeed
This file is adapted from Microsoft DeepSpeed
Licensed under the MIT License.
*/
#include "cublas_wrappers.h"

@@ -1,6 +1,7 @@
/* Copyright 2021 The LightSeq Team
Copyright Microsoft DeepSpeed
This file is adapted from Microsoft DeepSpeed
Licensed under the MIT License.
*/
#pragma once

@@ -3,6 +3,7 @@
/* Copyright 2021 The LightSeq Team
Copyright Microsoft DeepSpeed
This file is adapted from Microsoft DeepSpeed
Licensed under the MIT License.
*/
#include <cuda.h>
#include <cuda_fp16.h>

@@ -1,6 +1,7 @@
/* Copyright 2021 The LightSeq Team
Copyright Microsoft DeepSpeed
This file is adapted from Microsoft DeepSpeed
Licensed under the MIT License.
*/
#pragma once

@@ -1,5 +1,10 @@
// modified from
// https://github.com/NVIDIA/apex/blob/master/csrc/multi_tensor_adam.cu
/* Copyright 2020 The Microsoft DeepSpeed Team
Copyright NVIDIA/apex
This file is adapted from fused adam in NVIDIA/apex, commit a109f85
Licensed under the MIT License.
*/
#include <ATen/ATen.h>
#include <ATen/AccumulateType.h>
#include <ATen/cuda/CUDAContext.h>

@@ -1,12 +1,18 @@
// modified from https://github.com/NVIDIA/apex/blob/master/csrc/multi_tensor_apply.cuh
// modified from
// https://github.com/NVIDIA/apex/blob/master/csrc/multi_tensor_apply.cuh
/* Copyright 2020 The Microsoft DeepSpeed Team
Copyright NVIDIA/apex
This file is adapted from fused adam in NVIDIA/apex, commit a109f85
Licensed under the MIT License.
*/
#include <ATen/ATen.h>
#include <ATen/AccumulateType.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/Exceptions.h>
#include <c10/cuda/CUDAGuard.h>
#include "compat.h"

#include <assert.h>
#include <c10/cuda/CUDAGuard.h>

#include "compat.h"

// #include <iostream>

@@ -17,54 +23,52 @@ constexpr int depth_to_max_tensors[5] = {110, 64, 48, 36, 30};
constexpr int depth_to_max_blocks[5] = {320, 320, 320, 320, 320};

template <int n>
struct TensorListMetadata
{
struct TensorListMetadata {
void *addresses[n][depth_to_max_tensors[n - 1]];
int sizes[depth_to_max_tensors[n - 1]];
unsigned char block_to_tensor[depth_to_max_blocks[n - 1]];
int block_to_chunk[depth_to_max_blocks[n - 1]]; // I fear this needs to be a full int.
int block_to_chunk[depth_to_max_blocks[n - 1]]; // I fear this needs to be a
// full int.
int start_tensor_this_launch;
};

template <typename T, typename U, typename... ArgTypes>
__global__ void multi_tensor_apply_kernel(
int chunk_size,
volatile int *noop_flag,
T tl,
U callable,
ArgTypes... args)
{
// Hand the chunk information to the user-supplied functor to process however it likes.
__global__ void multi_tensor_apply_kernel(int chunk_size,
volatile int *noop_flag, T tl,
U callable, ArgTypes... args) {
// Hand the chunk information to the user-supplied functor to process however
// it likes.
callable(chunk_size, noop_flag, tl, args...);
}

template <int depth, typename T, typename... ArgTypes>
void multi_tensor_apply(
int block_size,
int chunk_size,
const at::Tensor &noop_flag,
const std::vector<std::vector<at::Tensor>> &tensor_lists,
T callable,
ArgTypes... args)
{
int block_size, int chunk_size, const at::Tensor &noop_flag,
const std::vector<std::vector<at::Tensor>> &tensor_lists, T callable,
ArgTypes... args) {
TORCH_CHECK(tensor_lists.size() == depth, "tensor_lists.size() != depth");
int len0 = tensor_lists[0].size();
TORCH_CHECK(len0 > 0, "tensor_lists[0].size() is not > 0");
auto ref_device = tensor_lists[0][0].device();
TORCH_CHECK(ref_device.type() == at::kCUDA, "expected input to be on cuda");
for (int l = 0; l < tensor_lists.size(); l++) // No range-based for because I need indices
{
TORCH_CHECK(tensor_lists[l].size() == len0, "Size mismatch among tensor lists");
for (int t = 0; t < tensor_lists[l].size(); t++)
for (int l = 0; l < tensor_lists.size();
l++) // No range-based for because I need indices
{
TORCH_CHECK(tensor_lists[l].size() == len0,
"Size mismatch among tensor lists");
for (int t = 0; t < tensor_lists[l].size(); t++) {
// TODO: Print which tensor fails.
bool contiguous_memory = tensor_lists[l][t].is_contiguous();
#ifdef VERSION_GE_1_5
contiguous_memory = (contiguous_memory || tensor_lists[l][t].is_contiguous(at::MemoryFormat::ChannelsLast));
contiguous_memory =
(contiguous_memory ||
tensor_lists[l][t].is_contiguous(at::MemoryFormat::ChannelsLast));
#endif
TORCH_CHECK(contiguous_memory, "A tensor was not contiguous.");
TORCH_CHECK(tensor_lists[l][t].device() == ref_device, "A tensor was not on the same device as the first tensor");
TORCH_CHECK(tensor_lists[l][t].numel() == tensor_lists[0][t].numel(), "Size mismatch");
TORCH_CHECK(tensor_lists[l][t].device() == ref_device,
"A tensor was not on the same device as the first tensor");
TORCH_CHECK(tensor_lists[l][t].numel() == tensor_lists[0][t].numel(),
"Size mismatch");
}
}

@@ -78,17 +82,16 @@ void multi_tensor_apply(
tl.start_tensor_this_launch = 0;
int loc_block_info = 0;
int loc_tensor_info = 0;
for (int t = 0; t < ntensors; t++)
{
for (int t = 0; t < ntensors; t++) {
tl.sizes[loc_tensor_info] = tensor_lists[0][t].numel();
for (int d = 0; d < depth; d++)
tl.addresses[d][loc_tensor_info] = tensor_lists[d][t].data_ptr();
loc_tensor_info++;

int chunks_this_tensor = (tensor_lists[0][t].numel() + chunk_size - 1) / chunk_size;
int chunks_this_tensor =
(tensor_lists[0][t].numel() + chunk_size - 1) / chunk_size;

for (int chunk = 0; chunk < chunks_this_tensor; chunk++)
{
for (int chunk = 0; chunk < chunks_this_tensor; chunk++) {
// std::cout << chunks_this_tensor << std::endl;
tl.block_to_tensor[loc_block_info] = loc_tensor_info - 1;
tl.block_to_chunk[loc_block_info] = chunk;

@@ -98,29 +101,23 @@ void multi_tensor_apply(
chunk == chunks_this_tensor - 1);
bool blocks_full = (loc_block_info == depth_to_max_blocks[depth - 1]);
bool last_chunk = (t == ntensors - 1 && chunk == chunks_this_tensor - 1);
if (tensors_full || blocks_full || last_chunk)
{
if (tensors_full || blocks_full || last_chunk) {
// using accscalar_t = acc_type<scalar_t, true>;
multi_tensor_apply_kernel<<<loc_block_info, block_size, 0, stream>>>(
chunk_size,
noop_flag.DATA_PTR<int>(),
tl,
callable,
args...);
chunk_size, noop_flag.DATA_PTR<int>(), tl, callable, args...);

AT_CUDA_CHECK(cudaGetLastError());

// Reset. The control flow possibilities here make my brain hurt.
loc_block_info = 0;
if (chunk == chunks_this_tensor - 1)
{
// std::cout << "Hit case 1 " << cond1 << " " << cond2 << " " << cond3 << std::endl;
if (chunk == chunks_this_tensor - 1) {
// std::cout << "Hit case 1 " << cond1 << " " << cond2 << " " << cond3
// << std::endl;
loc_tensor_info = 0;
tl.start_tensor_this_launch = t + 1;
}
else
{
// std::cout << "Hit case 2 " << cond1 << " " << cond2 << " " << cond3 << std::endl;
} else {
// std::cout << "Hit case 2 " << cond1 << " " << cond2 << " " << cond3
// << std::endl;
tl.sizes[0] = tl.sizes[loc_tensor_info - 1];
for (int d = 0; d < depth; d++)
tl.addresses[d][0] = tl.addresses[d][loc_tensor_info - 1];

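The hunks above only change formatting; the underlying logic is the multi_tensor_apply bookkeeping adapted from NVIDIA/apex: every tensor is cut into fixed-size chunks, each CUDA block is mapped back to a (tensor index, chunk index) pair, and a kernel launch is flushed whenever the per-launch tensor or block limits are hit. A rough Python sketch of the chunk-planning step (illustration only, ignoring the flush limits):

def plan_chunks(numels, chunk_size):
    # Return one (tensor_idx, chunk_idx) pair per kernel block.
    blocks = []
    for t, numel in enumerate(numels):
        chunks_this_tensor = (numel + chunk_size - 1) // chunk_size  # ceiling division
        for chunk in range(chunks_this_tensor):
            blocks.append((t, chunk))
    return blocks

# Two tensors of 5 and 3 elements with chunk size 2 -> 3 + 2 blocks.
assert plan_chunks([5, 3], 2) == [(0, 0), (0, 1), (0, 2), (1, 0), (1, 1)]
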
@@ -1,4 +1,9 @@
/* Taken from NVIDIA/apex commit 855808f3fc268e9715d613f3c2e56469d8c986d8 */
/* Copyright 2020 The Microsoft DeepSpeed Team
Copyright NVIDIA/apex
This file is adapted from fused adam in NVIDIA/apex, commit a109f85
Licensed under the MIT License.
*/
#include <ATen/ATen.h>

#include "compat.h"

@@ -1,4 +1,11 @@
# modified from https://github.com/NVIDIA/apex/blob/master/apex/optimizers/fused_adam.py
'''
Copyright 2020 The Microsoft DeepSpeed Team

Copyright NVIDIA/apex
This file is adapted from fused adam in NVIDIA/apex, commit a109f85
Licensed under the MIT License.
'''
import torch

from colossalai.registry import OPTIMIZERS

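For context, the optimizer in this file is a fused Adam variant registered through the OPTIMIZERS registry. A hedged usage sketch, assuming it keeps the torch.optim.Adam-style constructor of the NVIDIA/apex original it is adapted from (the commented import path and signature are assumptions, not verified against this commit):

import torch
import torch.nn as nn

model = nn.Linear(16, 4).cuda()
# from colossalai.nn.optimizer import FusedAdam            # assumed import path
# optimizer = FusedAdam(model.parameters(), lr=1e-3)       # assumed Adam-like signature
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)  # stand-in with the same interface

loss = model(torch.randn(8, 16, device="cuda")).sum()
loss.backward()
optimizer.step()
optimizer.zero_grad()
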
@@ -1,3 +1,4 @@
# this code is inspired by the DeepSpeed library and implemented with our own design from scratch
import math
import warnings
from enum import Enum

@@ -1,7 +1,12 @@
import torch
# This code has been adapted from the DeepSpeed library.
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

import functools
from typing import Optional

import torch


def substitute_init_recursively(cls, func, visited: set):
for subcls in cls.__subclasses__():

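The snippet above is cut off by the hunk boundary. A self-contained sketch of the pattern it begins (illustrative toy classes, not the library's exact implementation): visit every subclass of a base class exactly once and apply a substitution function, for example wrapping __init__ so construction can be intercepted.

def substitute_init_recursively(cls, func, visited: set):
    for subcls in cls.__subclasses__():
        if subcls not in visited:
            visited.add(subcls)
            substitute_init_recursively(subcls, func, visited)
            func(subcls)

def log_init(subcls):
    # Wrap subcls.__init__ to announce each construction.
    original = subcls.__init__
    def wrapped(self, *args, **kwargs):
        print(f"initializing {subcls.__name__}")
        original(self, *args, **kwargs)
    subcls.__init__ = wrapped

class Base: pass
class A(Base): pass
class B(A): pass

substitute_init_recursively(Base, log_init, set())
B()  # prints "initializing B"
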
@@ -1,3 +1,4 @@
# this code is inspired by the DeepSpeed library and implemented with our own design from scratch
import functools
import itertools
from collections import OrderedDict

@@ -1,3 +1,4 @@
# this code is inspired by the DeepSpeed library and implemented with our own design from scratch
from functools import partial
from typing import Optional

@@ -1,3 +1,4 @@
# this code is inspired by the DeepSpeed library and implemented with our own design from scratch
from enum import Enum
from os import stat
from typing import Dict, Optional, Tuple

@@ -5,20 +6,21 @@ from typing import Dict, Optional, Tuple
import torch
import torch.distributed as dist
import torch.nn as nn
from colossalai.amp.naive_amp.grad_scaler import DynamicGradScaler
from colossalai.context.parallel_mode import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.logging import get_dist_logger
from colossalai.nn.optimizer import ColossalaiOptimizer
from colossalai.gemini.tensor_utils import (colo_model_data_tensor_move_inline, colo_tensor_mem_usage)
from colossalai.zero.sharded_model import ShardedModelV2
from colossalai.zero.sharded_model._utils import cast_tensor_to_fp32
from torch import Tensor
from torch.distributed import ProcessGroup
from torch.nn.parameter import Parameter
from torch.optim import Optimizer
from colossalai.gemini.stateful_tensor import (StatefulTensor, TensorState)

from colossalai.amp.naive_amp.grad_scaler import DynamicGradScaler
from colossalai.context.parallel_mode import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.gemini.stateful_tensor import StatefulTensor, TensorState
from colossalai.gemini.tensor_placement_policy import AutoTensorPlacementPolicy
from colossalai.gemini.tensor_utils import colo_model_data_tensor_move_inline, colo_tensor_mem_usage
from colossalai.logging import get_dist_logger
from colossalai.nn.optimizer import ColossalaiOptimizer
from colossalai.zero.sharded_model import ShardedModelV2
from colossalai.zero.sharded_model._utils import cast_tensor_to_fp32


class OptimState(Enum):

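This file is the sharded (ZeRO-style) FP16 optimizer, which relies on DynamicGradScaler for dynamic loss scaling. A minimal sketch of that idea, with assumed parameter names rather than the library's actual API: shrink the scale and skip the step when gradients overflow in fp16, and cautiously grow it again after a long run of stable steps.

class ToyDynamicScaler:
    def __init__(self, init_scale=2.0**16, growth_factor=2.0, backoff_factor=0.5, growth_interval=1000):
        self.scale = init_scale
        self.growth_factor = growth_factor
        self.backoff_factor = backoff_factor
        self.growth_interval = growth_interval
        self._good_steps = 0

    def update(self, found_overflow: bool):
        if found_overflow:
            # Overflow: back off the scale; the caller skips this optimizer step.
            self.scale *= self.backoff_factor
            self._good_steps = 0
        else:
            self._good_steps += 1
            if self._good_steps % self.growth_interval == 0:
                # A long stretch of finite gradients: try a larger scale.
                self.scale *= self.growth_factor
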
@@ -1,3 +1,7 @@
# This code has been adapted from the DeepSpeed library.
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
import importlib
import os
import time