Browse Source

[elixir] moved simulator build to op_builder (#3939)

feature/elixir
Frank Lee 1 year ago committed by GitHub
parent
commit
86ff5c152b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
  1. 6
      colossalai/elixir/search/simulator.py
  2. 4
      op_builder/__init__.py
  3. 39
      op_builder/elixir_simulator.py
  4. 8
      setup.py
  5. 5
      tests/test_elixir/test_src/test_move.py

6
colossalai/elixir/search/simulator.py

@ -1,10 +1,12 @@
import math
from colossalai.kernel.op_builder import ElixirSimulatorBuilder
from .utils import to_divide
def calc_move_times(param_per_step: list, param_to_chunk: dict, n_blocks: int):
from colossalai.elixir.simulator import move_count
simulator = ElixirSimulatorBuilder().load()
chunk_per_step = list()
for param_set in param_per_step:
@ -17,7 +19,7 @@ def calc_move_times(param_per_step: list, param_to_chunk: dict, n_blocks: int):
if len(id_set) > 0:
chunk_per_step.append(list(id_set))
return move_count(chunk_per_step, n_blocks)
return simulator.move_count(chunk_per_step, n_blocks)
def find_optimal_chunk_size(

4
op_builder/__init__.py

@ -1,4 +1,5 @@
from .cpu_adam import CPUAdamBuilder
from .elixir_simulator import ElixirSimulatorBuilder
from .fused_optim import FusedOptimBuilder
from .layernorm import LayerNormBuilder
from .moe import MOEBuilder
@ -14,10 +15,11 @@ ALL_OPS = {
'scaled_masked_softmax': ScaledMaskedSoftmaxBuilder,
'scaled_upper_triangle_masked_softmax': ScaledUpperTrainglemaskedSoftmaxBuilder,
'layernorm': LayerNormBuilder,
'elixir_simulator': ElixirSimulatorBuilder
}
__all__ = [
'ALL_OPS', 'CPUAdamBuilder', 'FusedOptimBuilder', 'MultiHeadAttnBuilder', 'ScaledMaskedSoftmaxBuilder',
'ScaledUpperTrainglemaskedSoftmaxBuilder', 'MOEBuilder', 'MultiTensorSGDBuilder', 'MultiTensorAdamBuilder',
'MultiTensorLambBuilder', 'MultiTensorScaleBuilder', 'MultiTensorL2NormBuilder'
'MultiTensorLambBuilder', 'MultiTensorScaleBuilder', 'MultiTensorL2NormBuilder', 'ElixirSimulatorBuilder'
]

39
op_builder/elixir_simulator.py

@ -0,0 +1,39 @@
from .builder import Builder
class ElixirSimulatorBuilder(Builder):
    """Builder describing the ``elixir_simulator`` C++ extension.

    Declares the single source file, the C++ compiler flags and the prebuilt
    import path used to create a ``CppExtension``. No include directories and
    no nvcc flags are declared.
    """

    NAME = "elixir_simulator"
    PREBUILT_IMPORT_PATH = "colossalai._C.elixir_simulator"

    def __init__(self):
        # The constants are read through the class name explicitly, so the
        # values passed to the base class are always the ones defined here.
        op_name = ElixirSimulatorBuilder.NAME
        import_path = ElixirSimulatorBuilder.PREBUILT_IMPORT_PATH
        super().__init__(name=op_name, prebuilt_import_path=import_path)

        # Version macros appended to the C++ flags in ``cxx_flags``.
        self.version_dependent_macros = ['-DVERSION_GE_1_1', '-DVERSION_GE_1_3', '-DVERSION_GE_1_5']

    # The four hooks below declare the build inputs for this extension.
    def sources_files(self):
        """Return the list of source files; only ``elixir/simulator.cpp`` is compiled."""
        # NOTE(review): ``relative_to_abs_path`` is inherited; presumably it
        # resolves the path against the kernel source root -- confirm in Builder.
        return [self.relative_to_abs_path('elixir/simulator.cpp')]

    def include_dirs(self):
        """Return the extra include directories (none for this extension)."""
        return []

    def cxx_flags(self):
        """Return the C++ flags: ``-O3`` followed by the version macros."""
        optimisation = ['-O3']
        return optimisation + self.version_dependent_macros

    def nvcc_flags(self):
        """Return the nvcc flags (none for this extension)."""
        return []

    def builder(self) -> 'CppExtension':
        """Return a ``CppExtension`` object for the simulator.

        The extension is named after ``self.prebuilt_import_path`` and is
        given the filtered source list plus the filtered ``cxx`` flags.
        """
        # Local import: torch is only required once this method is called.
        from torch.utils.cpp_extension import CppExtension

        extension_name = self.prebuilt_import_path
        source_list = self.strip_empty_entries(self.sources_files())
        compile_args = {
            'cxx': self.strip_empty_entries(self.cxx_flags()),
        }
        return CppExtension(name=extension_name, sources=source_list, extra_compile_args=compile_args)

8
setup.py

@ -16,7 +16,7 @@ from op_builder.utils import (
try:
import torch
from torch.utils.cpp_extension import CUDA_HOME, BuildExtension, CppExtension
from torch.utils.cpp_extension import CUDA_HOME, BuildExtension
TORCH_AVAILABLE = True
except ImportError:
TORCH_AVAILABLE = False
@ -30,11 +30,7 @@ BUILD_CUDA_EXT = int(os.environ.get('CUDA_EXT', '0')) == 1
IS_NIGHTLY = int(os.environ.get('NIGHTLY', '0')) == 1
# a variable to store the op builder
ext_modules = [
CppExtension(name='colossalai.elixir.simulator',
sources=['colossalai/elixir/simulator.cpp'],
extra_compile_args=['-O3', '-DVERSION_GE_1_1', '-DVERSION_GE_1_3', '-DVERSION_GE_1_5'])
]
ext_modules = []
# we do not support windows currently
if sys.platform == 'win32':

5
tests/test_elixir/test_src/test_move.py

@ -1,4 +1,4 @@
from colossalai.elixir.simulator import move_count
from colossalai.kernel.op_builder import ElixirSimulatorBuilder
from colossalai.testing import run_on_environment_flag
@ -6,7 +6,8 @@ from colossalai.testing import run_on_environment_flag
def test_move_count():
steps = [[0], [1, 2], [3], [3], [1, 2], [0]]
size = 2
assert move_count(steps, size) == 12
simulator = ElixirSimulatorBuilder().load()
assert simulator.move_count(steps, size) == 12
if __name__ == '__main__':

Loading…
Cancel
Save