From 355ffb386e36c59cdf93f82a73e94077e6b7c774 Mon Sep 17 00:00:00 2001
From: Jiarui Fang
Date: Fri, 23 Dec 2022 20:57:41 +0800
Subject: [PATCH] [builder] unified cpu_optim fused_optim interface (#2190)

---
 colossalai/amp/naive_amp/_fp16_optimizer.py        |  9 +--------
 colossalai/kernel/__init__.py                      | 16 ++++++++++++++--
 colossalai/nn/optimizer/fused_adam.py              |  7 ++-----
 colossalai/nn/optimizer/fused_lamb.py              |  6 +-----
 colossalai/nn/optimizer/fused_sgd.py               |  7 ++-----
 colossalai/nn/optimizer/hybrid_adam.py             |  7 +------
 colossalai/utils/common.py                         | 15 ++++-----------
 .../multi_tensor_apply/multi_tensor_apply.py       |  2 +-
 tests/test_optimizer/test_fused_adam_kernel.py     |  9 ++-------
 9 files changed, 28 insertions(+), 50 deletions(-)

diff --git a/colossalai/amp/naive_amp/_fp16_optimizer.py b/colossalai/amp/naive_amp/_fp16_optimizer.py
index e7571460f..8eecacb77 100644
--- a/colossalai/amp/naive_amp/_fp16_optimizer.py
+++ b/colossalai/amp/naive_amp/_fp16_optimizer.py
@@ -3,19 +3,12 @@
 
 import torch
 import torch.distributed as dist
-
-try:
-    from colossalai._C import fused_optim
-except:
-    print('Colossalai should be built with cuda extension to use the FP16 optimizer')
-    from colossalai.kernel.op_builder.fused_optim import FusedOptimBuilder
-    fused_optim = FusedOptimBuilder().load()
-
 from torch.distributed import ProcessGroup
 from torch.optim import Optimizer
 
 from colossalai.context import ParallelMode
 from colossalai.core import global_context as gpc
+from colossalai.kernel import fused_optim
 from colossalai.logging import get_dist_logger
 from colossalai.utils import clip_grad_norm_fp32, copy_tensor_parallel_attributes, multi_tensor_applier
 
diff --git a/colossalai/kernel/__init__.py b/colossalai/kernel/__init__.py
index 42c95729a..113ec79da 100644
--- a/colossalai/kernel/__init__.py
+++ b/colossalai/kernel/__init__.py
@@ -1,3 +1,15 @@
-from .cuda_native import LayerNorm, FusedScaleMaskSoftmax, MultiHeadAttention
+from .cuda_native import FusedScaleMaskSoftmax, LayerNorm, MultiHeadAttention
 
-__all__ = ["LayerNorm", "FusedScaleMaskSoftmax", "MultiHeadAttention"]
+try:
+    from colossalai._C import fused_optim
+except:
+    from colossalai.kernel.op_builder.fused_optim import FusedOptimBuilder
+    fused_optim = FusedOptimBuilder().load()
+
+try:
+    from colossalai._C import cpu_optim
+except ImportError:
+    from colossalai.kernel.op_builder import CPUAdamBuilder
+    cpu_optim = CPUAdamBuilder().load()
+
+__all__ = ["fused_optim", "cpu_optim", "LayerNorm", "FusedScaleMaskSoftmax", "MultiHeadAttention"]
diff --git a/colossalai/nn/optimizer/fused_adam.py b/colossalai/nn/optimizer/fused_adam.py
index adc65d654..c81d122d4 100644
--- a/colossalai/nn/optimizer/fused_adam.py
+++ b/colossalai/nn/optimizer/fused_adam.py
@@ -65,11 +65,8 @@ class FusedAdam(torch.optim.Optimizer):
         self.adamw_mode = 1 if adamw_mode else 0
         self.set_grad_none = set_grad_none
         if multi_tensor_applier.available:
-            try:
-                from colossalai._C import fused_optim
-            except:
-                from colossalai.kernel.op_builder.fused_optim import FusedOptimBuilder
-                fused_optim = FusedOptimBuilder().load()
+            from colossalai.kernel import fused_optim
+
             # Skip buffer
             self._dummy_overflow_buf = torch.cuda.IntTensor([0])
             self.multi_tensor_adam = fused_optim.multi_tensor_adam
diff --git a/colossalai/nn/optimizer/fused_lamb.py b/colossalai/nn/optimizer/fused_lamb.py
index b480b8cd5..a78b351fc 100644
--- a/colossalai/nn/optimizer/fused_lamb.py
+++ b/colossalai/nn/optimizer/fused_lamb.py
@@ -76,11 +76,7 @@ class FusedLAMB(torch.optim.Optimizer):
                         max_grad_norm=max_grad_norm)
         super(FusedLAMB, self).__init__(params, defaults)
         if multi_tensor_applier.available:
-            try:
-                from colossalai._C import fused_optim
-            except:
-                from colossalai.kernel.op_builder.fused_optim import FusedOptimBuilder
-                fused_optim = FusedOptimBuilder().load()
+            from colossalai.kernel import fused_optim
 
             self.multi_tensor_l2norm = fused_optim.multi_tensor_l2norm
             # Skip buffer
diff --git a/colossalai/nn/optimizer/fused_sgd.py b/colossalai/nn/optimizer/fused_sgd.py
index a0141473b..2596c0bcd 100644
--- a/colossalai/nn/optimizer/fused_sgd.py
+++ b/colossalai/nn/optimizer/fused_sgd.py
@@ -80,11 +80,8 @@ class FusedSGD(Optimizer):
         self.wd_after_momentum = wd_after_momentum
 
         if multi_tensor_applier.available:
-            try:
-                from colossalai._C import fused_optim
-            except:
-                from colossalai.kernel.op_builder import FusedOptimBuilder
-                fused_optim = FusedOptimBuilder().load()
+            from colossalai.kernel import fused_optim
+
             # Skip buffer
             self._dummy_overflow_buf = torch.tensor([0],
                                                     dtype=torch.int,
diff --git a/colossalai/nn/optimizer/hybrid_adam.py b/colossalai/nn/optimizer/hybrid_adam.py
index 8ff543d34..5504411aa 100644
--- a/colossalai/nn/optimizer/hybrid_adam.py
+++ b/colossalai/nn/optimizer/hybrid_adam.py
@@ -76,13 +76,8 @@ class HybridAdam(NVMeOptimizer):
         default_args = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, bias_correction=bias_correction)
         super(HybridAdam, self).__init__(model_params, default_args, nvme_offload_fraction, nvme_offload_dir)
         self.adamw_mode = adamw_mode
-        try:
-            from colossalai._C import cpu_optim, fused_optim
-        except ImportError:
-            from colossalai.kernel.op_builder import CPUAdamBuilder, FusedOptimBuilder
-            fused_optim = FusedOptimBuilder().load()
-            cpu_optim = CPUAdamBuilder().load()
 
+        from colossalai.kernel import cpu_optim, fused_optim
         self.cpu_adam_op = cpu_optim.CPUAdamOptimizer(lr, betas[0], betas[1], eps, weight_decay, adamw_mode)
         self.gpu_adam_op = fused_optim.multi_tensor_adam
diff --git a/colossalai/utils/common.py b/colossalai/utils/common.py
index 496ac136a..3ff72d037 100644
--- a/colossalai/utils/common.py
+++ b/colossalai/utils/common.py
@@ -4,28 +4,21 @@ import functools
 import os
 import random
 import socket
+from collections import defaultdict
+from contextlib import contextmanager
 from pathlib import Path
 from typing import Callable, Dict, List, Optional, Union
 
 import torch
+import torch.distributed as dist
 from torch._six import inf
 from torch.nn.parameter import Parameter
 
-try:
-    from colossalai._C import fused_optim
-except:
-    from colossalai.kernel.op_builder import FusedOptimBuilder
-    fused_optim = FusedOptimBuilder().load()
-
-from collections import defaultdict
-from contextlib import contextmanager
-
-import torch.distributed as dist
-
 from colossalai.constants import IS_TENSOR_PARALLEL, NUM_PARTITIONS, TENSOR_PARALLEL_ATTRIBUTES
 from colossalai.context.parallel_mode import ParallelMode
 from colossalai.core import global_context as gpc
 from colossalai.global_variables import tensor_parallel_env as env
+from colossalai.kernel import fused_optim
 from colossalai.tensor import ColoParameter, ProcessGroup
 
 from .multi_tensor_apply import multi_tensor_applier
diff --git a/colossalai/utils/multi_tensor_apply/multi_tensor_apply.py b/colossalai/utils/multi_tensor_apply/multi_tensor_apply.py
index 6eda9834b..b9d98d019 100644
--- a/colossalai/utils/multi_tensor_apply/multi_tensor_apply.py
+++ b/colossalai/utils/multi_tensor_apply/multi_tensor_apply.py
@@ -14,7 +14,7 @@ class MultiTensorApply(object):
 
     def __init__(self, chunk_size):
         try:
-            import colossalai._C.fused_optim
+            from colossalai.kernel import fused_optim
             MultiTensorApply.available = True
             self.chunk_size = chunk_size
         except ImportError as err:
diff --git a/tests/test_optimizer/test_fused_adam_kernel.py b/tests/test_optimizer/test_fused_adam_kernel.py
index 0668e7a46..f0188e9fa 100644
--- a/tests/test_optimizer/test_fused_adam_kernel.py
+++ b/tests/test_optimizer/test_fused_adam_kernel.py
@@ -46,13 +46,8 @@ def torch_adam_update(
 @parameterize('p_dtype', [torch.float, torch.half])
 @parameterize('g_dtype', [torch.float, torch.half])
 def test_adam(adamw, step, p_dtype, g_dtype):
-    try:
-        import colossalai._C.fused_optim
-        fused_adam = colossalai._C.fused_optim.multi_tensor_adam
-    except:
-        from colossalai.kernel.op_builder import FusedOptimBuilder
-        fused_optim = FusedOptimBuilder().load()
-        fused_adam = fused_optim.multi_tensor_adam
+    from colossalai.kernel import fused_optim
+    fused_adam = fused_optim.multi_tensor_adam
 
     dummy_overflow_buf = torch.cuda.IntTensor([0])
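
The pattern this patch converges on is the one now kept in colossalai/kernel/__init__.py: the
prebuilt colossalai._C extension is tried first, and the JIT op builders are used as a fallback,
so every call site reduces to a plain import. Below is a minimal call-site sketch of that unified
interface, assuming a CUDA-capable build; the hyperparameter values passed to CPUAdamOptimizer
are illustrative only and are not taken from this patch.

    import torch

    # Unified entry point introduced by this patch: colossalai.kernel resolves
    # the backend once (prebuilt colossalai._C first, JIT op builders otherwise).
    from colossalai.kernel import cpu_optim, fused_optim

    # Handles mirrored from the hunks above (hybrid_adam.py, test_fused_adam_kernel.py).
    fused_adam = fused_optim.multi_tensor_adam      # fused CUDA multi-tensor Adam kernel
    dummy_overflow_buf = torch.cuda.IntTensor([0])  # overflow flag buffer for the applier

    # CPU Adam kernel; argument order follows hybrid_adam.py:
    # (lr, beta1, beta2, eps, weight_decay, adamw_mode). Values are illustrative.
    cpu_adam = cpu_optim.CPUAdamOptimizer(1e-3, 0.9, 0.999, 1e-8, 0.0, True)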