diff --git a/colossalai/engine/_base_engine.py b/colossalai/engine/_base_engine.py
index 146a29669..59d8e1058 100644
--- a/colossalai/engine/_base_engine.py
+++ b/colossalai/engine/_base_engine.py
@@ -1,16 +1,16 @@
 #!/usr/bin/env python
 # -*- encoding: utf-8 -*-
+# this code is inspired by the DeepSpeed library and implemented with our own design from scratch
 
-from typing import List, Iterable
+from typing import Iterable, List, Optional, Type
+
+from torch import Tensor
 from torch.nn import Module
 from torch.nn.modules.loss import _Loss
 
-from colossalai.logging import get_dist_logger
-from torch import Tensor
-from colossalai.gemini.ophooks import register_ophooks_recursively, BaseOpHook
-from colossalai.engine.schedule import BaseSchedule, NonPipelineSchedule, PipelineSchedule, InterleavedPipelineSchedule
-from typing import Optional, Type
 from colossalai.engine.gradient_handler import BaseGradientHandler
+from colossalai.engine.schedule import BaseSchedule, InterleavedPipelineSchedule, NonPipelineSchedule, PipelineSchedule
+from colossalai.gemini.ophooks import BaseOpHook, register_ophooks_recursively
 from colossalai.logging import get_dist_logger
@@ -93,7 +93,7 @@ class Engine:
         if self.uses_pipeline:
             self._schedule.pre_processing(self)
 
-        #register hook if any
+        # register hook if any
         if len(self._ophook_list) > 0:
             register_ophooks_recursively(self._model, self._ophook_list)
 
diff --git a/colossalai/fx/profiler/experimental/profiler_function/arithmetic.py b/colossalai/fx/profiler/experimental/profiler_function/arithmetic.py
index 2cf50133d..8d1c8a8c6 100644
--- a/colossalai/fx/profiler/experimental/profiler_function/arithmetic.py
+++ b/colossalai/fx/profiler/experimental/profiler_function/arithmetic.py
@@ -1,7 +1,12 @@
+# Copyright (c) Microsoft Corporation.
+
+# Licensed under the MIT License.
 import operator
 from functools import reduce
 from typing import Any, Optional, Tuple, Union
+
 import torch
+
 from ..registry import meta_profiler_function
 
diff --git a/colossalai/fx/profiler/experimental/profiler_module/convolution.py b/colossalai/fx/profiler/experimental/profiler_module/convolution.py
index 3193489fe..a4c15b91e 100644
--- a/colossalai/fx/profiler/experimental/profiler_module/convolution.py
+++ b/colossalai/fx/profiler/experimental/profiler_module/convolution.py
@@ -1,8 +1,13 @@
+# Copyright (c) Microsoft Corporation.
+
+# Licensed under the MIT License.
+import math
 import operator
 from functools import reduce
-import math
 from typing import Tuple
+
 import torch
+
 from ..registry import meta_profiler_module
 
diff --git a/colossalai/fx/profiler/experimental/profiler_module/normalization.py b/colossalai/fx/profiler/experimental/profiler_module/normalization.py
index e9939da7b..49e5e6fa5 100644
--- a/colossalai/fx/profiler/experimental/profiler_module/normalization.py
+++ b/colossalai/fx/profiler/experimental/profiler_module/normalization.py
@@ -1,5 +1,10 @@
+# Copyright (c) Microsoft Corporation.
+
+# Licensed under the MIT License.
 from typing import Tuple, Union
+
 import torch
+
 from ..registry import meta_profiler_module
 
diff --git a/colossalai/gemini/ophooks/utils.py b/colossalai/gemini/ophooks/utils.py
index fe08405c8..84e8298c1 100644
--- a/colossalai/gemini/ophooks/utils.py
+++ b/colossalai/gemini/ophooks/utils.py
@@ -1,7 +1,7 @@
-import torch
-from typing import List, Callable, Optional
-
+# this code is inspired by the DeepSpeed library and implemented with our own design from scratch
 from abc import ABC, abstractmethod
+from typing import Callable, List, Optional
+
+import torch
 
diff --git a/colossalai/kernel/cuda_native/csrc/kernels/cublas_wrappers.cu b/colossalai/kernel/cuda_native/csrc/kernels/cublas_wrappers.cu
index 68be1f6d7..09f34763f 100644
--- a/colossalai/kernel/cuda_native/csrc/kernels/cublas_wrappers.cu
+++ b/colossalai/kernel/cuda_native/csrc/kernels/cublas_wrappers.cu
@@ -1,6 +1,7 @@
 /* Copyright 2021 The LightSeq Team
    Copyright Microsoft DeepSpeed
    This file is adapted from Microsoft DeepSpeed
+   Licensed under the MIT License.
 */
 
 #include "cublas_wrappers.h"
diff --git a/colossalai/kernel/cuda_native/csrc/kernels/include/cublas_wrappers.h b/colossalai/kernel/cuda_native/csrc/kernels/include/cublas_wrappers.h
index 7ebb9ce48..90255152b 100644
--- a/colossalai/kernel/cuda_native/csrc/kernels/include/cublas_wrappers.h
+++ b/colossalai/kernel/cuda_native/csrc/kernels/include/cublas_wrappers.h
@@ -1,6 +1,7 @@
 /* Copyright 2021 The LightSeq Team
    Copyright Microsoft DeepSpeed
    This file is adapted from Microsoft DeepSpeed
+   Licensed under the MIT License.
 */
 
 #pragma once
diff --git a/colossalai/kernel/cuda_native/csrc/kernels/include/feed_forward.h b/colossalai/kernel/cuda_native/csrc/kernels/include/feed_forward.h
index ec963259f..8186da1ee 100644
--- a/colossalai/kernel/cuda_native/csrc/kernels/include/feed_forward.h
+++ b/colossalai/kernel/cuda_native/csrc/kernels/include/feed_forward.h
@@ -1,68 +1,69 @@
-#pragma once
-
-/* Copyright 2021 The LightSeq Team
-   Copyright Microsoft DeepSpeed
-   This file is adapted from Microsoft DeepSpeed
-*/
-#include
-#include
-#include
-
-#include
-
-#include "cublas_wrappers.h"
-#include "kernels.h"
-
-template
-class FeedForward {
- public:
-  struct Config {
-    int outputSize;
-    int inputSize;
-    std::array gemm_algos;
-    Config(int outputs, int inputs)
-        : outputSize(outputs),
-          inputSize(inputs),
-          gemm_algos(std::array({99, 99, 99})) {}
-  };
-
-  FeedForward(Config config) : config_(config) {}
-
-  ~FeedForward() {}
-
-  void Forward(int bsz, const T *input_ptr, const T *weights, T *out,
-               cublasHandle_t &_cublasHandle) {
-    float alpha = T(1.);
-    float beta = T(0.);
-
-    cublas_gemm_ex(_cublasHandle, CUBLAS_OP_T, CUBLAS_OP_N, config_.outputSize,
-                   bsz, config_.inputSize, &alpha, &beta, weights, input_ptr,
-                   out, cublasGemmAlgo_t(config_.gemm_algos[0]));
-  }
-  void Backward(int bsz, const T *out_grad, const T *input_ptr,
-                const T *weights, T *weights_grad, T *bias_grad,
-                cublasHandle_t &_cublasHandle, cudaStream_t &stream,
-                T *inp_grad_out = nullptr, T *out_grad_trans_out = nullptr,
-                bool compute_bias = true) {
-    float alpha = (T)1.0, beta = (T)0.0;
-    cublas_gemm_ex(_cublasHandle, CUBLAS_OP_N, CUBLAS_OP_T, config_.inputSize,
-                   config_.outputSize, bsz, &alpha, &beta, input_ptr, out_grad,
-                   weights_grad, cublasGemmAlgo_t(config_.gemm_algos[1]));
-
-    cublas_gemm_ex(_cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, config_.inputSize,
-                   bsz, config_.outputSize, &alpha, &beta, weights, out_grad,
-                   inp_grad_out, cublasGemmAlgo_t(config_.gemm_algos[2]));
-    if (compute_bias) {
-      launch_fuse_transpose_bias_kernel(out_grad, bias_grad, bsz,
-                                        config_.outputSize, stream);
-    }
-  }
-
-  void reset_size(int outputSize, int inputSize) {
-    config_.outputSize = outputSize;
-    config_.inputSize = inputSize;
-  }
-
- private:
-  Config config_;
-};
+#pragma once
+
+/* Copyright 2021 The LightSeq Team
+   Copyright Microsoft DeepSpeed
+   This file is adapted from Microsoft DeepSpeed
+   Licensed under the MIT License.
+*/
+#include
+#include
+#include
+
+#include
+
+#include "cublas_wrappers.h"
+#include "kernels.h"
+
+template
+class FeedForward {
+ public:
+  struct Config {
+    int outputSize;
+    int inputSize;
+    std::array gemm_algos;
+    Config(int outputs, int inputs)
+        : outputSize(outputs),
+          inputSize(inputs),
+          gemm_algos(std::array({99, 99, 99})) {}
+  };
+
+  FeedForward(Config config) : config_(config) {}
+
+  ~FeedForward() {}
+
+  void Forward(int bsz, const T *input_ptr, const T *weights, T *out,
+               cublasHandle_t &_cublasHandle) {
+    float alpha = T(1.);
+    float beta = T(0.);
+
+    cublas_gemm_ex(_cublasHandle, CUBLAS_OP_T, CUBLAS_OP_N, config_.outputSize,
+                   bsz, config_.inputSize, &alpha, &beta, weights, input_ptr,
+                   out, cublasGemmAlgo_t(config_.gemm_algos[0]));
+  }
+  void Backward(int bsz, const T *out_grad, const T *input_ptr,
+                const T *weights, T *weights_grad, T *bias_grad,
+                cublasHandle_t &_cublasHandle, cudaStream_t &stream,
+                T *inp_grad_out = nullptr, T *out_grad_trans_out = nullptr,
+                bool compute_bias = true) {
+    float alpha = (T)1.0, beta = (T)0.0;
+    cublas_gemm_ex(_cublasHandle, CUBLAS_OP_N, CUBLAS_OP_T, config_.inputSize,
+                   config_.outputSize, bsz, &alpha, &beta, input_ptr, out_grad,
+                   weights_grad, cublasGemmAlgo_t(config_.gemm_algos[1]));
+
+    cublas_gemm_ex(_cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, config_.inputSize,
+                   bsz, config_.outputSize, &alpha, &beta, weights, out_grad,
+                   inp_grad_out, cublasGemmAlgo_t(config_.gemm_algos[2]));
+    if (compute_bias) {
+      launch_fuse_transpose_bias_kernel(out_grad, bias_grad, bsz,
+                                        config_.outputSize, stream);
+    }
+  }
+
+  void reset_size(int outputSize, int inputSize) {
+    config_.outputSize = outputSize;
+    config_.inputSize = inputSize;
+  }
+
+ private:
+  Config config_;
+};
diff --git a/colossalai/kernel/cuda_native/csrc/kernels/include/strided_batch_gemm.h b/colossalai/kernel/cuda_native/csrc/kernels/include/strided_batch_gemm.h
index 3120660b9..d386650e8 100644
--- a/colossalai/kernel/cuda_native/csrc/kernels/include/strided_batch_gemm.h
+++ b/colossalai/kernel/cuda_native/csrc/kernels/include/strided_batch_gemm.h
@@ -1,99 +1,100 @@
-/* Copyright 2021 The LightSeq Team
-   Copyright Microsoft DeepSpeed
-   This file is adapted from Microsoft DeepSpeed
-*/
-#pragma once
-
-#include
-#include
-#include
-
-#include
-
-#include "cublas_wrappers.h"
-
-template
-class StridedBatchGemm {
- public:
-  struct Config {
-    int m;
-    int n;
-    int k;
-    float alpha;
-    float beta;
-    cublasOperation_t op_A;
-    cublasOperation_t op_B;
-    std::array gemm_algos;
-
-    Config(float param_alpha, float param_beta, cublasOperation_t opA,
-           cublasOperation_t opB)
-        : alpha(param_alpha),
-          beta(param_beta),
-          op_A(opA),
-          op_B(opB),
-          gemm_algos(std::array({99, 99, 99})) {}
-    void SetConfig(int mm, int nn, int kk) {
-      m = mm;
-      n = nn;
-      k = kk;
-    }
-  };
-
-  StridedBatchGemm(const Config &config) : _config(config) {}
-
-  virtual ~StridedBatchGemm() {}
-
-  void Forward(int bsz, T *output, const T *_buffer_a, const T *_buffer_b,
-               cublasHandle_t handle) {
-    int stride_a = _config.m * _config.k;
-    int stride_b = _config.n * _config.k;
-    int stride_c = _config.m * _config.n;
-
-    cublas_strided_batched_gemm(
-        handle, _config.m, _config.n, _config.k, &_config.alpha, &_config.beta,
-        _buffer_a, _buffer_b, output, _config.op_A, _config.op_B, stride_a,
-        stride_b, stride_c, bsz, cublasGemmAlgo_t(_config.gemm_algos[0]));
-  }
-
-  void Backward(int bsz, const T *d_output, const T *_buffer_a,
-                const T *_buffer_b, cublasHandle_t handle,
-                T *inpGradA = nullptr, T *inpGradB = nullptr) {
-    int mb = (_config.op_A == CUBLAS_OP_T ? _config.k : _config.m);
-    int kb = (_config.op_A == CUBLAS_OP_T ? _config.m : _config.k);
-
-    int stride_a = mb * _config.n;
-    int stride_b = _config.n * kb;
-    int stride_c = _config.m * _config.k;
-
-    // B need to transpose.
-    cublasOperation_t op_b =
-        (_config.op_B == CUBLAS_OP_T ? CUBLAS_OP_N : CUBLAS_OP_T);
-
-    // Calculate d_A.
-    cublas_strided_batched_gemm(
-        handle, mb, kb, _config.n, &_config.alpha, &_config.beta,
-        (_config.op_A == CUBLAS_OP_T ? _buffer_b : d_output),
-        (_config.op_A == CUBLAS_OP_T ? d_output : _buffer_b), inpGradA,
-        CUBLAS_OP_N, op_b, stride_a, stride_b, stride_c, bsz,
-        cublasGemmAlgo_t(_config.gemm_algos[1]));
-
-    // A need to transpose.
-    cublasOperation_t op_a =
-        (_config.op_A == CUBLAS_OP_T ? CUBLAS_OP_N : CUBLAS_OP_T);
-
-    stride_a = _config.m * _config.k;
-    stride_b = _config.m * _config.n;
-    stride_c = _config.n * _config.k;
-
-    // Calculate d_B.
-    cublas_strided_batched_gemm(
-        handle, _config.k, _config.n, _config.m, &_config.alpha, &_config.beta,
-        _buffer_a, d_output, inpGradB, op_a, CUBLAS_OP_N, stride_a, stride_b,
-        stride_c, bsz, cublasGemmAlgo_t(_config.gemm_algos[2]));
-  }
-
-  inline void SetConfig(int m, int n, int k) { _config.SetConfig(m, n, k); }
-
- private:
-  Config _config;
-};
+/* Copyright 2021 The LightSeq Team
+   Copyright Microsoft DeepSpeed
+   This file is adapted from Microsoft DeepSpeed
+   Licensed under the MIT License.
+*/
+#pragma once
+
+#include
+#include
+#include
+
+#include
+
+#include "cublas_wrappers.h"
+
+template
+class StridedBatchGemm {
+ public:
+  struct Config {
+    int m;
+    int n;
+    int k;
+    float alpha;
+    float beta;
+    cublasOperation_t op_A;
+    cublasOperation_t op_B;
+    std::array gemm_algos;
+
+    Config(float param_alpha, float param_beta, cublasOperation_t opA,
+           cublasOperation_t opB)
+        : alpha(param_alpha),
+          beta(param_beta),
+          op_A(opA),
+          op_B(opB),
+          gemm_algos(std::array({99, 99, 99})) {}
+    void SetConfig(int mm, int nn, int kk) {
+      m = mm;
+      n = nn;
+      k = kk;
+    }
+  };
+
+  StridedBatchGemm(const Config &config) : _config(config) {}
+
+  virtual ~StridedBatchGemm() {}
+
+  void Forward(int bsz, T *output, const T *_buffer_a, const T *_buffer_b,
+               cublasHandle_t handle) {
+    int stride_a = _config.m * _config.k;
+    int stride_b = _config.n * _config.k;
+    int stride_c = _config.m * _config.n;
+
+    cublas_strided_batched_gemm(
+        handle, _config.m, _config.n, _config.k, &_config.alpha, &_config.beta,
+        _buffer_a, _buffer_b, output, _config.op_A, _config.op_B, stride_a,
+        stride_b, stride_c, bsz, cublasGemmAlgo_t(_config.gemm_algos[0]));
+  }
+
+  void Backward(int bsz, const T *d_output, const T *_buffer_a,
+                const T *_buffer_b, cublasHandle_t handle,
+                T *inpGradA = nullptr, T *inpGradB = nullptr) {
+    int mb = (_config.op_A == CUBLAS_OP_T ? _config.k : _config.m);
+    int kb = (_config.op_A == CUBLAS_OP_T ? _config.m : _config.k);
+
+    int stride_a = mb * _config.n;
+    int stride_b = _config.n * kb;
+    int stride_c = _config.m * _config.k;
+
+    // B need to transpose.
+    cublasOperation_t op_b =
+        (_config.op_B == CUBLAS_OP_T ? CUBLAS_OP_N : CUBLAS_OP_T);
+
+    // Calculate d_A.
+    cublas_strided_batched_gemm(
+        handle, mb, kb, _config.n, &_config.alpha, &_config.beta,
+        (_config.op_A == CUBLAS_OP_T ? _buffer_b : d_output),
+        (_config.op_A == CUBLAS_OP_T ? d_output : _buffer_b), inpGradA,
+        CUBLAS_OP_N, op_b, stride_a, stride_b, stride_c, bsz,
+        cublasGemmAlgo_t(_config.gemm_algos[1]));
+
+    // A need to transpose.
+    cublasOperation_t op_a =
+        (_config.op_A == CUBLAS_OP_T ? CUBLAS_OP_N : CUBLAS_OP_T);
+
+    stride_a = _config.m * _config.k;
+    stride_b = _config.m * _config.n;
+    stride_c = _config.n * _config.k;
+
+    // Calculate d_B.
+    cublas_strided_batched_gemm(
+        handle, _config.k, _config.n, _config.m, &_config.alpha, &_config.beta,
+        _buffer_a, d_output, inpGradB, op_a, CUBLAS_OP_N, stride_a, stride_b,
+        stride_c, bsz, cublasGemmAlgo_t(_config.gemm_algos[2]));
+  }
+
+  inline void SetConfig(int m, int n, int k) { _config.SetConfig(m, n, k); }
+
+ private:
+  Config _config;
+};
diff --git a/colossalai/kernel/cuda_native/csrc/multi_tensor_adam.cu b/colossalai/kernel/cuda_native/csrc/multi_tensor_adam.cu
index afd34bb96..9cc3ae1ea 100644
--- a/colossalai/kernel/cuda_native/csrc/multi_tensor_adam.cu
+++ b/colossalai/kernel/cuda_native/csrc/multi_tensor_adam.cu
@@ -1,5 +1,10 @@
 // modified from
 // https://github.com/NVIDIA/apex/blob/master/csrc/multi_tensor_adam.cu
+/* Copyright 2020 The Microsoft DeepSpeed Team
+   Copyright NVIDIA/apex
+   This file is adapted from fused adam in NVIDIA/apex, commit a109f85
+   Licensed under the MIT License.
+*/
 #include
 #include
 #include
diff --git a/colossalai/kernel/cuda_native/csrc/multi_tensor_apply.cuh b/colossalai/kernel/cuda_native/csrc/multi_tensor_apply.cuh
index 9ce411911..ec55dd320 100644
--- a/colossalai/kernel/cuda_native/csrc/multi_tensor_apply.cuh
+++ b/colossalai/kernel/cuda_native/csrc/multi_tensor_apply.cuh
@@ -1,12 +1,18 @@
-// modified from https://github.com/NVIDIA/apex/blob/master/csrc/multi_tensor_apply.cuh
+// modified from
+// https://github.com/NVIDIA/apex/blob/master/csrc/multi_tensor_apply.cuh
+/* Copyright 2020 The Microsoft DeepSpeed Team
+   Copyright NVIDIA/apex
+   This file is adapted from fused adam in NVIDIA/apex, commit a109f85
+   Licensed under the MIT License.
+*/
 #include
 #include
 #include
 #include
-#include
-#include "compat.h"
-
 #include
+#include
+
+#include "compat.h"
 
 // #include
 
@@ -17,117 +23,108 @@
 constexpr int depth_to_max_tensors[5] = {110, 64, 48, 36, 30};
 constexpr int depth_to_max_blocks[5] = {320, 320, 320, 320, 320};
 
 template
-struct TensorListMetadata
-{
-    void *addresses[n][depth_to_max_tensors[n - 1]];
-    int sizes[depth_to_max_tensors[n - 1]];
-    unsigned char block_to_tensor[depth_to_max_blocks[n - 1]];
-    int block_to_chunk[depth_to_max_blocks[n - 1]]; // I fear this needs to be a full int.
-    int start_tensor_this_launch;
+struct TensorListMetadata {
+  void *addresses[n][depth_to_max_tensors[n - 1]];
+  int sizes[depth_to_max_tensors[n - 1]];
+  unsigned char block_to_tensor[depth_to_max_blocks[n - 1]];
+  int block_to_chunk[depth_to_max_blocks[n - 1]];  // I fear this needs to be a
+                                                   // full int.
+  int start_tensor_this_launch;
 };
 
 template
-__global__ void multi_tensor_apply_kernel(
-    int chunk_size,
-    volatile int *noop_flag,
-    T tl,
-    U callable,
-    ArgTypes... args)
-{
-    // Hand the chunk information to the user-supplied functor to process however it likes.
-    callable(chunk_size, noop_flag, tl, args...);
+__global__ void multi_tensor_apply_kernel(int chunk_size,
+                                          volatile int *noop_flag, T tl,
+                                          U callable, ArgTypes... args) {
+  // Hand the chunk information to the user-supplied functor to process however
+  // it likes.
+  callable(chunk_size, noop_flag, tl, args...);
 }
 
 template
 void multi_tensor_apply(
-    int block_size,
-    int chunk_size,
-    const at::Tensor &noop_flag,
-    const std::vector> &tensor_lists,
-    T callable,
-    ArgTypes... args)
-{
-    TORCH_CHECK(tensor_lists.size() == depth, "tensor_lists.size() != depth");
-    int len0 = tensor_lists[0].size();
-    TORCH_CHECK(len0 > 0, "tensor_lists[0].size() is not > 0");
-    auto ref_device = tensor_lists[0][0].device();
-    TORCH_CHECK(ref_device.type() == at::kCUDA, "expected input to be on cuda");
-    for (int l = 0; l < tensor_lists.size(); l++) // No range-based for because I need indices
-    {
-        TORCH_CHECK(tensor_lists[l].size() == len0, "Size mismatch among tensor lists");
-        for (int t = 0; t < tensor_lists[l].size(); t++)
-        {
-            // TODO: Print which tensor fails.
-            bool contiguous_memory = tensor_lists[l][t].is_contiguous();
+    int block_size, int chunk_size, const at::Tensor &noop_flag,
+    const std::vector> &tensor_lists, T callable,
+    ArgTypes... args) {
+  TORCH_CHECK(tensor_lists.size() == depth, "tensor_lists.size() != depth");
+  int len0 = tensor_lists[0].size();
+  TORCH_CHECK(len0 > 0, "tensor_lists[0].size() is not > 0");
+  auto ref_device = tensor_lists[0][0].device();
+  TORCH_CHECK(ref_device.type() == at::kCUDA, "expected input to be on cuda");
+  for (int l = 0; l < tensor_lists.size();
+       l++)  // No range-based for because I need indices
+  {
+    TORCH_CHECK(tensor_lists[l].size() == len0,
+                "Size mismatch among tensor lists");
+    for (int t = 0; t < tensor_lists[l].size(); t++) {
+      // TODO: Print which tensor fails.
+      bool contiguous_memory = tensor_lists[l][t].is_contiguous();
 #ifdef VERSION_GE_1_5
-            contiguous_memory = (contiguous_memory || tensor_lists[l][t].is_contiguous(at::MemoryFormat::ChannelsLast));
+      contiguous_memory =
+          (contiguous_memory ||
+           tensor_lists[l][t].is_contiguous(at::MemoryFormat::ChannelsLast));
 #endif
-            TORCH_CHECK(contiguous_memory, "A tensor was not contiguous.");
-            TORCH_CHECK(tensor_lists[l][t].device() == ref_device, "A tensor was not on the same device as the first tensor");
-            TORCH_CHECK(tensor_lists[l][t].numel() == tensor_lists[0][t].numel(), "Size mismatch");
-        }
+      TORCH_CHECK(contiguous_memory, "A tensor was not contiguous.");
+      TORCH_CHECK(tensor_lists[l][t].device() == ref_device,
+                  "A tensor was not on the same device as the first tensor");
+      TORCH_CHECK(tensor_lists[l][t].numel() == tensor_lists[0][t].numel(),
+                  "Size mismatch");
     }
+  }
 
-    int ntensors = tensor_lists[0].size();
+  int ntensors = tensor_lists[0].size();
 
-    TensorListMetadata tl;
+  TensorListMetadata tl;
 
-    const at::cuda::OptionalCUDAGuard device_guard(device_of(tensor_lists[0][0]));
-    auto stream = at::cuda::getCurrentCUDAStream();
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(tensor_lists[0][0]));
+  auto stream = at::cuda::getCurrentCUDAStream();
 
-    tl.start_tensor_this_launch = 0;
-    int loc_block_info = 0;
-    int loc_tensor_info = 0;
-    for (int t = 0; t < ntensors; t++)
-    {
-        tl.sizes[loc_tensor_info] = tensor_lists[0][t].numel();
-        for (int d = 0; d < depth; d++)
-            tl.addresses[d][loc_tensor_info] = tensor_lists[d][t].data_ptr();
-        loc_tensor_info++;
+  tl.start_tensor_this_launch = 0;
+  int loc_block_info = 0;
+  int loc_tensor_info = 0;
+  for (int t = 0; t < ntensors; t++) {
+    tl.sizes[loc_tensor_info] = tensor_lists[0][t].numel();
+    for (int d = 0; d < depth; d++)
+      tl.addresses[d][loc_tensor_info] = tensor_lists[d][t].data_ptr();
+    loc_tensor_info++;
 
-        int chunks_this_tensor = (tensor_lists[0][t].numel() + chunk_size - 1) / chunk_size;
+    int chunks_this_tensor =
+        (tensor_lists[0][t].numel() + chunk_size - 1) / chunk_size;
 
-        for (int chunk = 0; chunk < chunks_this_tensor; chunk++)
-        {
-            // std::cout << chunks_this_tensor << std::endl;
-            tl.block_to_tensor[loc_block_info] = loc_tensor_info - 1;
-            tl.block_to_chunk[loc_block_info] = chunk;
-            loc_block_info++;
+    for (int chunk = 0; chunk < chunks_this_tensor; chunk++) {
+      // std::cout << chunks_this_tensor << std::endl;
+      tl.block_to_tensor[loc_block_info] = loc_tensor_info - 1;
+      tl.block_to_chunk[loc_block_info] = chunk;
+      loc_block_info++;
 
-            bool tensors_full = (loc_tensor_info == depth_to_max_tensors[depth - 1] &&
-                                 chunk == chunks_this_tensor - 1);
-            bool blocks_full = (loc_block_info == depth_to_max_blocks[depth - 1]);
-            bool last_chunk = (t == ntensors - 1 && chunk == chunks_this_tensor - 1);
-            if (tensors_full || blocks_full || last_chunk)
-            {
-                // using accscalar_t = acc_type;
-                multi_tensor_apply_kernel<<>>(
-                    chunk_size,
-                    noop_flag.DATA_PTR(),
-                    tl,
-                    callable,
-                    args...);
+      bool tensors_full = (loc_tensor_info == depth_to_max_tensors[depth - 1] &&
+                           chunk == chunks_this_tensor - 1);
+      bool blocks_full = (loc_block_info == depth_to_max_blocks[depth - 1]);
+      bool last_chunk = (t == ntensors - 1 && chunk == chunks_this_tensor - 1);
+      if (tensors_full || blocks_full || last_chunk) {
+        // using accscalar_t = acc_type;
+        multi_tensor_apply_kernel<<>>(
+            chunk_size, noop_flag.DATA_PTR(), tl, callable, args...);
 
-                AT_CUDA_CHECK(cudaGetLastError());
+        AT_CUDA_CHECK(cudaGetLastError());
 
-                // Reset. The control flow possibilities here make my brain hurt.
-                loc_block_info = 0;
-                if (chunk == chunks_this_tensor - 1)
-                {
-                    // std::cout << "Hit case 1 " << cond1 << " " << cond2 << " " << cond3 << std::endl;
-                    loc_tensor_info = 0;
-                    tl.start_tensor_this_launch = t + 1;
-                }
-                else
-                {
-                    // std::cout << "Hit case 2 " << cond1 << " " << cond2 << " " << cond3 << std::endl;
-                    tl.sizes[0] = tl.sizes[loc_tensor_info - 1];
-                    for (int d = 0; d < depth; d++)
-                        tl.addresses[d][0] = tl.addresses[d][loc_tensor_info - 1];
-                    loc_tensor_info = 1;
-                    tl.start_tensor_this_launch = t;
-                }
-            }
+        // Reset. The control flow possibilities here make my brain hurt.
+        loc_block_info = 0;
+        if (chunk == chunks_this_tensor - 1) {
+          // std::cout << "Hit case 1 " << cond1 << " " << cond2 << " " << cond3
+          // << std::endl;
+          loc_tensor_info = 0;
+          tl.start_tensor_this_launch = t + 1;
+        } else {
+          // std::cout << "Hit case 2 " << cond1 << " " << cond2 << " " << cond3
+          // << std::endl;
+          tl.sizes[0] = tl.sizes[loc_tensor_info - 1];
+          for (int d = 0; d < depth; d++)
+            tl.addresses[d][0] = tl.addresses[d][loc_tensor_info - 1];
+          loc_tensor_info = 1;
+          tl.start_tensor_this_launch = t;
         }
+      }
     }
-}
\ No newline at end of file
+  }
+}
diff --git a/colossalai/kernel/cuda_native/csrc/type_shim.h b/colossalai/kernel/cuda_native/csrc/type_shim.h
index b4011c5ba..2f180a778 100644
--- a/colossalai/kernel/cuda_native/csrc/type_shim.h
+++ b/colossalai/kernel/cuda_native/csrc/type_shim.h
@@ -1,4 +1,9 @@
 /* Taken from NVIDIA/apex commit 855808f3fc268e9715d613f3c2e56469d8c986d8 */
+/* Copyright 2020 The Microsoft DeepSpeed Team
+   Copyright NVIDIA/apex
+   This file is adapted from fused adam in NVIDIA/apex, commit a109f85
+   Licensed under the MIT License.
+*/
 #include
 
 #include "compat.h"
 
diff --git a/colossalai/nn/optimizer/fused_adam.py b/colossalai/nn/optimizer/fused_adam.py
index 941866d55..987af8a96 100644
--- a/colossalai/nn/optimizer/fused_adam.py
+++ b/colossalai/nn/optimizer/fused_adam.py
@@ -1,4 +1,11 @@
 # modified from https://github.com/NVIDIA/apex/blob/master/apex/optimizers/fused_adam.py
+'''
+Copyright 2020 The Microsoft DeepSpeed Team
+
+Copyright NVIDIA/apex
+This file is adapted from fused adam in NVIDIA/apex, commit a109f85
+Licensed under the MIT License.
+'''
 import torch
 
 from colossalai.registry import OPTIMIZERS
diff --git a/colossalai/nn/optimizer/zero_optimizer.py b/colossalai/nn/optimizer/zero_optimizer.py
index 712daed06..422ebb7a3 100644
--- a/colossalai/nn/optimizer/zero_optimizer.py
+++ b/colossalai/nn/optimizer/zero_optimizer.py
@@ -1,3 +1,4 @@
+# this code is inspired by the DeepSpeed library and implemented with our own design from scratch
 import math
 import warnings
 from enum import Enum
diff --git a/colossalai/utils/model/utils.py b/colossalai/utils/model/utils.py
index 75bb18df6..f49607376 100644
--- a/colossalai/utils/model/utils.py
+++ b/colossalai/utils/model/utils.py
@@ -1,7 +1,12 @@
-import torch
+# This code has been adapted from the DeepSpeed library.
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
 import functools
 from typing import Optional
 
+import torch
+
 
 def substitute_init_recursively(cls, func, visited: set):
     for subcls in cls.__subclasses__():
diff --git a/colossalai/zero/sharded_model/sharded_model_v2.py b/colossalai/zero/sharded_model/sharded_model_v2.py
index ae3a61998..094f7d76a 100644
--- a/colossalai/zero/sharded_model/sharded_model_v2.py
+++ b/colossalai/zero/sharded_model/sharded_model_v2.py
@@ -1,3 +1,4 @@
+# this code is inspired by the DeepSpeed library and implemented with our own design from scratch
 import functools
 import itertools
 from collections import OrderedDict
diff --git a/colossalai/zero/sharded_optim/low_level_optim.py b/colossalai/zero/sharded_optim/low_level_optim.py
index 502b1c4d9..49fb8b54b 100644
--- a/colossalai/zero/sharded_optim/low_level_optim.py
+++ b/colossalai/zero/sharded_optim/low_level_optim.py
@@ -1,3 +1,4 @@
+# this code is inspired by the DeepSpeed library and implemented with our own design from scratch
 from functools import partial
 from typing import Optional
 
diff --git a/colossalai/zero/sharded_optim/sharded_optim_v2.py b/colossalai/zero/sharded_optim/sharded_optim_v2.py
index 401ff988d..43a0b7d76 100644
--- a/colossalai/zero/sharded_optim/sharded_optim_v2.py
+++ b/colossalai/zero/sharded_optim/sharded_optim_v2.py
@@ -1,3 +1,4 @@
+# this code is inspired by the DeepSpeed library and implemented with our own design from scratch
 from enum import Enum
 from os import stat
 from typing import Dict, Optional, Tuple
@@ -5,20 +6,21 @@ from typing import Dict, Optional, Tuple
 import torch
 import torch.distributed as dist
 import torch.nn as nn
-from colossalai.amp.naive_amp.grad_scaler import DynamicGradScaler
-from colossalai.context.parallel_mode import ParallelMode
-from colossalai.core import global_context as gpc
-from colossalai.logging import get_dist_logger
-from colossalai.nn.optimizer import ColossalaiOptimizer
-from colossalai.gemini.tensor_utils import (colo_model_data_tensor_move_inline, colo_tensor_mem_usage)
-from colossalai.zero.sharded_model import ShardedModelV2
-from colossalai.zero.sharded_model._utils import cast_tensor_to_fp32
 from torch import Tensor
 from torch.distributed import ProcessGroup
 from torch.nn.parameter import Parameter
 from torch.optim import Optimizer
-from colossalai.gemini.stateful_tensor import (StatefulTensor, TensorState)
+
+from colossalai.amp.naive_amp.grad_scaler import DynamicGradScaler
+from colossalai.context.parallel_mode import ParallelMode
+from colossalai.core import global_context as gpc
+from colossalai.gemini.stateful_tensor import StatefulTensor, TensorState
 from colossalai.gemini.tensor_placement_policy import AutoTensorPlacementPolicy
+from colossalai.gemini.tensor_utils import colo_model_data_tensor_move_inline, colo_tensor_mem_usage
+from colossalai.logging import get_dist_logger
+from colossalai.nn.optimizer import ColossalaiOptimizer
+from colossalai.zero.sharded_model import ShardedModelV2
+from colossalai.zero.sharded_model._utils import cast_tensor_to_fp32
 
 
 class OptimState(Enum):
@@ -36,9 +38,9 @@ class ShardedOptimizerV2(ColossalaiOptimizer):
     `PatrickStar: Parallel Training of Pre-trained Models via Chunk-based Memory Management`_
 
     GPU margin space is the remaining space after removing peak non-model data from the overall GPU memory,
-    which is detected by a runtime memory tracer. 
+    which is detected by a runtime memory tracer.
 
-    We place as many OS chunks in the margin space as possible. 
+    We place as many OS chunks in the margin space as possible.
 
     The size of margin space can be controlled by ``gpu_margin_mem_ratio``.
     If it is set as ``0.0``, it is the same as classical ZeRO optimizer.
@@ -54,8 +56,8 @@ class ShardedOptimizerV2(ColossalaiOptimizer):
         sharded_model (ShardedModelV2): A sharded model initialized by class ShardedModelV2. The optimizer will use the
             shard strategy provided by sharded model to shard param fp32 tensors.
         optimizer (Optimizer): An Optimizer instance.
-        gpu_margin_mem_ratio (float, optional): The ratio of GPU remaining memory (after the first forward-backward) 
-            which will be used when using hybrid CPU optimizer. 
+        gpu_margin_mem_ratio (float, optional): The ratio of GPU remaining memory (after the first forward-backward)
+            which will be used when using hybrid CPU optimizer.
             This argument is meaningless when `tensor_placement_policy` of `ShardedModelV2` is not "auto".
             Defaults to 0.0.
         initial_scale (float, optional): Initial scale used by DynamicGradScaler. Defaults to 2**32.
diff --git a/op_builder/builder.py b/op_builder/builder.py
index 140a10c09..b9f44decc 100644
--- a/op_builder/builder.py
+++ b/op_builder/builder.py
@@ -1,3 +1,7 @@
+# This code has been adapted from the DeepSpeed library.
+# Copyright (c) Microsoft Corporation.
+
+# Licensed under the MIT License.
 import importlib
 import os
 import time