
[Fix] Fix & Update Inference Tests (compatibility w/ main)

Branch: pull/5685/head
Yuanheng Zhao · 7 months ago · commit 8754abae24
30 changed files (lines changed in parentheses):

  1. colossalai/inference/modeling/models/nopadding_llama.py (4)
  2. examples/inference/benchmark_ops/benchmark_context_attn_unpad.py (2)
  3. examples/inference/benchmark_ops/benchmark_decoding_attn.py (4)
  4. examples/inference/benchmark_ops/benchmark_flash_decoding_attention.py (2)
  5. examples/inference/benchmark_ops/benchmark_fused_rotary_embdding_unpad.py (2)
  6. examples/inference/benchmark_ops/benchmark_kv_cache_memcopy.py (4)
  7. examples/inference/benchmark_ops/benchmark_xine_copy.py (2)
  8. tests/test_infer/test_config_and_struct.py (2)
  9. tests/test_infer/test_cuda_graph.py (2)
  10. tests/test_infer/test_inference_engine.py (2)
  11. tests/test_infer/test_kernels/__init__.py (0)
  12. tests/test_infer/test_kernels/cuda/__init__.py (0)
  13. tests/test_infer/test_kernels/cuda/test_flash_decoding_attention.py (4)
  14. tests/test_infer/test_kernels/cuda/test_get_cos_and_sin.py (2)
  15. tests/test_infer/test_kernels/cuda/test_kv_cache_memcpy.py (5)
  16. tests/test_infer/test_kernels/cuda/test_rms_layernorm.py (0)
  17. tests/test_infer/test_kernels/cuda/test_rotary_embdding_unpad.py (4)
  18. tests/test_infer/test_kernels/cuda/test_silu_and_mul.py (0)
  19. tests/test_infer/test_kernels/triton/__init__.py (0)
  20. tests/test_infer/test_kernels/triton/kernel_utils.py (0)
  21. tests/test_infer/test_kernels/triton/test_context_attn_unpad.py (2)
  22. tests/test_infer/test_kernels/triton/test_decoding_attn.py (4)
  23. tests/test_infer/test_kernels/triton/test_fused_rotary_embedding.py (0)
  24. tests/test_infer/test_kernels/triton/test_kvcache_copy.py (2)
  25. tests/test_infer/test_kernels/triton/test_rmsnorm_triton.py (0)
  26. tests/test_infer/test_kernels/triton/test_rotary_embdding_unpad.py (2)
  27. tests/test_infer/test_kernels/triton/test_xine_copy.py (0)
  28. tests/test_infer/test_kvcache_manager.py (2)
  29. tests/test_infer/test_models/test_baichuan.py (7)
  30. tests/test_infer/test_request_handler.py (2)

colossalai/inference/modeling/models/nopadding_llama.py (4 lines changed)

@@ -270,7 +270,7 @@ def llama_rmsnorm_forward(
     return rms_layernorm(hidden_states, self.weight.data, self.variance_epsilon, norm_output, residual)
-class NopadLlamaMLP(ParallelModule, LlamaMLP):
+class NopadLlamaMLP(LlamaMLP, ParallelModule):
     def __init__(
         self,
         config: LlamaConfig,

@@ -392,7 +392,7 @@ class NopadLlamaMLP(ParallelModule, LlamaMLP):
         return f"gate_up_proj MergedLinear1D_Col: in_features={self.gate_up_weight.shape[1]}x2, out_features={self.gate_up_weight.shape[2]}, bias=False"
-class NopadLlamaAttention(ParallelModule, LlamaAttention):
+class NopadLlamaAttention(LlamaAttention, ParallelModule):
     def __init__(
         self,
         config: LlamaConfig,
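
Note: the only edit in this file is the order of the base classes. As a purely illustrative sketch of why that order matters (plain Python, not ColossalAI code; the classes A and B below are hypothetical), Python's method resolution order searches the bases left to right, so after the swap any name defined on both parents now resolves to the LlamaMLP / LlamaAttention version before the ParallelModule one:

# Minimal MRO sketch: with class C(A, B), names defined on both A and B
# resolve to A's version; reversing the bases reverses the winner.
class A:
    def who(self) -> str:
        return "A"

class B:
    def who(self) -> str:
        return "B"

class OldOrder(A, B):  # analogous to NopadLlamaMLP(ParallelModule, LlamaMLP)
    pass

class NewOrder(B, A):  # analogous to NopadLlamaMLP(LlamaMLP, ParallelModule)
    pass

assert OldOrder().who() == "A"
assert NewOrder().who() == "B"
print([c.__name__ for c in NewOrder.__mro__])  # ['NewOrder', 'B', 'A', 'object']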

examples/inference/benchmark_ops/benchmark_context_attn_unpad.py (2 lines changed)

@@ -4,7 +4,7 @@ from transformers.modeling_attn_mask_utils import AttentionMaskConverter
 from colossalai.inference.modeling.layers.attention import PagedAttention
 from colossalai.kernel.triton import context_attention_unpadded
 from colossalai.utils import get_current_device
-from tests.test_infer.test_ops.triton.kernel_utils import generate_caches_and_block_tables_v2, torch_attn_ref
+from tests.test_infer.test_kernels.triton.kernel_utils import generate_caches_and_block_tables_v2, torch_attn_ref
 try:
     import triton  # noqa

examples/inference/benchmark_ops/benchmark_decoding_attn.py (4 lines changed)

@@ -2,14 +2,14 @@ import torch
 from colossalai.kernel.triton import flash_decoding_attention
 from colossalai.utils import get_current_device
-from tests.test_infer.test_ops.triton.kernel_utils import (
+from tests.test_infer.test_kernels.triton.kernel_utils import (
     convert_kv_unpad_to_padded,
     create_attention_mask,
     generate_caches_and_block_tables_v2,
     generate_caches_and_block_tables_v3,
     torch_attn_ref,
 )
-from tests.test_infer.test_ops.triton.test_decoding_attn import prepare_data
+from tests.test_infer.test_kernels.triton.test_decoding_attn import prepare_data
 try:
     import triton  # noqa

examples/inference/benchmark_ops/benchmark_flash_decoding_attention.py (2 lines changed)

@@ -3,7 +3,7 @@ import torch
 from colossalai.kernel.kernel_loader import InferenceOpsLoader
 from colossalai.kernel.triton import flash_decoding_attention
 from colossalai.utils import get_current_device
-from tests.test_infer.test_ops.triton.kernel_utils import (
+from tests.test_infer.test_kernels.triton.kernel_utils import (
     generate_caches_and_block_tables_v2,
     generate_caches_and_block_tables_v3,
     generate_caches_and_block_tables_vllm,

examples/inference/benchmark_ops/benchmark_fused_rotary_embdding_unpad.py (2 lines changed)

@@ -2,7 +2,7 @@ import torch
 from colossalai.kernel.kernel_loader import InferenceOpsLoader
 from colossalai.kernel.triton import copy_kv_to_blocked_cache, decoding_fused_rotary_embedding, rotary_embedding
-from tests.test_infer.test_ops.triton.kernel_utils import (
+from tests.test_infer.test_kernels.triton.kernel_utils import (
     mock_alloc_block_table_and_kvcache_v2,
     mock_alloc_block_table_and_kvcache_v3,
     mock_alloc_single_token,

examples/inference/benchmark_ops/benchmark_kv_cache_memcopy.py (4 lines changed)

@@ -4,8 +4,8 @@ from colossalai.inference.modeling.layers.attention import copy_to_cache
 from colossalai.kernel.kernel_loader import InferenceOpsLoader
 from colossalai.kernel.triton import copy_kv_to_blocked_cache
 from colossalai.utils import get_current_device
-from tests.test_infer.test_ops.cuda.test_kv_cache_memcpy import prepare_data as prepare_data_new_kcache_layout
-from tests.test_infer.test_ops.triton.test_kvcache_copy import prepare_data
+from tests.test_infer.test_kernels.cuda.test_kv_cache_memcpy import prepare_data as prepare_data_new_kcache_layout
+from tests.test_infer.test_kernels.triton.test_kvcache_copy import prepare_data
 try:
     import triton  # noqa

examples/inference/benchmark_ops/benchmark_xine_copy.py (2 lines changed)

@@ -1,7 +1,7 @@
 import torch
 from colossalai.kernel.triton import get_xine_cache
-from tests.test_infer.test_ops.triton.test_xine_copy import get_cos_sin
+from tests.test_infer.test_kernels.triton.test_xine_copy import get_cos_sin
 try:
     import triton  # noqa

tests/test_infer/test_config_and_struct.py (2 lines changed)

@@ -80,7 +80,7 @@ def check_config_and_inference():
 def run_dist(rank, world_size, port):
-    colossalai.launch(config={}, rank=rank, world_size=world_size, port=port, host="localhost")
+    colossalai.launch(rank=rank, world_size=world_size, port=port, host="localhost")
     check_config_and_inference()

tests/test_infer/test_cuda_graph.py (2 lines changed)

@@ -80,7 +80,7 @@ def check_output_consistency(batch_size):
 def run_dist(rank, world_size, port):
-    colossalai.launch(config={}, rank=rank, world_size=world_size, port=port, host="localhost")
+    colossalai.launch(rank=rank, world_size=world_size, port=port, host="localhost")
     check_output_consistency(32)
     check_output_consistency(64)
     check_output_consistency(128)

tests/test_infer/test_inference_engine.py (2 lines changed)

@@ -157,7 +157,7 @@ def check_spec_dec(num_layers, max_length):
 def run_dist(rank, world_size, port, func_to_run, ret=None, **kwargs):
-    colossalai.launch(config={}, rank=rank, world_size=world_size, port=port, host="localhost")
+    colossalai.launch(rank=rank, world_size=world_size, port=port, host="localhost")
     if ret:
         ret[rank] = func_to_run(**kwargs)

tests/test_infer/test_ops/__init__.py → tests/test_infer/test_kernels/__init__.py (renamed, 0 lines changed)

tests/test_infer/test_ops/cuda/__init__.py → tests/test_infer/test_kernels/cuda/__init__.py (renamed, 0 lines changed)

tests/test_infer/test_ops/cuda/test_flash_decoding_attention.py → tests/test_infer/test_kernels/cuda/test_flash_decoding_attention.py (4 lines changed)

@@ -7,11 +7,11 @@ import torch
 from colossalai.inference.modeling.models.nopadding_baichuan import get_alibi_slopes
 from colossalai.kernel.kernel_loader import InferenceOpsLoader
 from colossalai.utils import get_current_device
-from tests.test_infer.test_ops.triton.test_context_attn_unpad import generate_alibi_mask
+from tests.test_infer.test_kernels.triton.test_context_attn_unpad import generate_alibi_mask
 inference_ops = InferenceOpsLoader().load()
-from tests.test_infer.test_ops.triton.kernel_utils import (
+from tests.test_infer.test_kernels.triton.kernel_utils import (
     convert_kv_unpad_to_padded,
     create_attention_mask,
     generate_caches_and_block_tables_v3,

tests/test_infer/test_ops/cuda/test_get_cos_and_sin.py → tests/test_infer/test_kernels/cuda/test_get_cos_and_sin.py (2 lines changed)

@@ -3,7 +3,7 @@ import pytest
 import torch
 from colossalai.kernel.kernel_loader import InferenceOpsLoader
-from tests.test_infer.test_ops.triton.test_xine_copy import get_cos_sin
+from tests.test_infer.test_kernels.triton.test_xine_copy import get_cos_sin
 inference_ops = InferenceOpsLoader().load()

tests/test_infer/test_ops/cuda/test_kv_cache_memcpy.py → tests/test_infer/test_kernels/cuda/test_kv_cache_memcpy.py (5 lines changed)

@@ -4,7 +4,10 @@ import torch.nn.functional as F
 from colossalai.kernel.kernel_loader import InferenceOpsLoader
 from colossalai.utils import get_current_device
-from tests.test_infer.test_ops.triton.kernel_utils import generate_caches_and_block_tables_v3, mock_alloc_single_token
+from tests.test_infer.test_kernels.triton.kernel_utils import (
+    generate_caches_and_block_tables_v3,
+    mock_alloc_single_token,
+)
 inference_ops = InferenceOpsLoader().load()

tests/test_infer/test_ops/cuda/test_rms_layernorm.py → tests/test_infer/test_kernels/cuda/test_rms_layernorm.py (renamed, 0 lines changed)

tests/test_infer/test_ops/cuda/test_rotary_embdding_unpad.py → tests/test_infer/test_kernels/cuda/test_rotary_embdding_unpad.py (4 lines changed)

@@ -7,8 +7,8 @@ from colossalai.kernel.kernel_loader import InferenceOpsLoader
 inference_ops = InferenceOpsLoader().load()
-from tests.test_infer.test_ops.triton.kernel_utils import mock_alloc_block_table_and_kvcache_v3
-from tests.test_infer.test_ops.triton.test_rotary_embdding_unpad import torch_rotary_emb
+from tests.test_infer.test_kernels.triton.kernel_utils import mock_alloc_block_table_and_kvcache_v3
+from tests.test_infer.test_kernels.triton.test_rotary_embdding_unpad import torch_rotary_emb
 def numpy_allclose(x, y, rtol, atol):

tests/test_infer/test_ops/cuda/test_silu_and_mul.py → tests/test_infer/test_kernels/cuda/test_silu_and_mul.py (renamed, 0 lines changed)

tests/test_infer/test_ops/triton/__init__.py → tests/test_infer/test_kernels/triton/__init__.py (renamed, 0 lines changed)

tests/test_infer/test_ops/triton/kernel_utils.py → tests/test_infer/test_kernels/triton/kernel_utils.py (renamed, 0 lines changed)

tests/test_infer/test_ops/triton/test_context_attn_unpad.py → tests/test_infer/test_kernels/triton/test_context_attn_unpad.py (2 lines changed)

@@ -5,7 +5,7 @@ from packaging import version
 from colossalai.inference.modeling.models.nopadding_baichuan import get_alibi_slopes
 from colossalai.kernel.triton import context_attention_unpadded
 from colossalai.utils import get_current_device
-from tests.test_infer.test_ops.triton.kernel_utils import (
+from tests.test_infer.test_kernels.triton.kernel_utils import (
     generate_caches_and_block_tables_v2,
     generate_caches_and_block_tables_v3,
     torch_attn_ref,

tests/test_infer/test_ops/triton/test_decoding_attn.py → tests/test_infer/test_kernels/triton/test_decoding_attn.py (4 lines changed)

@@ -6,14 +6,14 @@ from packaging import version
 from colossalai.inference.modeling.models.nopadding_baichuan import get_alibi_slopes
 from colossalai.kernel.triton import flash_decoding_attention
 from colossalai.utils import get_current_device
-from tests.test_infer.test_ops.triton.kernel_utils import (
+from tests.test_infer.test_kernels.triton.kernel_utils import (
     convert_kv_unpad_to_padded,
     create_attention_mask,
     generate_caches_and_block_tables_v2,
     generate_caches_and_block_tables_v3,
     torch_attn_ref,
 )
-from tests.test_infer.test_ops.triton.test_context_attn_unpad import generate_alibi_mask
+from tests.test_infer.test_kernels.triton.test_context_attn_unpad import generate_alibi_mask
 try:
     import triton  # noqa

tests/test_infer/test_ops/triton/test_fused_rotary_embedding.py → tests/test_infer/test_kernels/triton/test_fused_rotary_embedding.py (renamed, 0 lines changed)

tests/test_infer/test_ops/triton/test_kvcache_copy.py → tests/test_infer/test_kernels/triton/test_kvcache_copy.py (2 lines changed)

@@ -4,7 +4,7 @@ from packaging import version
 from colossalai.kernel.triton import copy_k_to_blocked_cache, copy_kv_to_blocked_cache
 from colossalai.utils import get_current_device
-from tests.test_infer.test_ops.triton.kernel_utils import (
+from tests.test_infer.test_kernels.triton.kernel_utils import (
     generate_caches_and_block_tables_v2,
     generate_caches_and_block_tables_v3,
     mock_alloc_single_token,

tests/test_infer/test_ops/triton/test_rmsnorm_triton.py → tests/test_infer/test_kernels/triton/test_rmsnorm_triton.py (renamed, 0 lines changed)

tests/test_infer/test_ops/triton/test_rotary_embdding_unpad.py → tests/test_infer/test_kernels/triton/test_rotary_embdding_unpad.py (2 lines changed)

@@ -4,7 +4,7 @@ from packaging import version
 from transformers.models.llama.modeling_llama import LlamaRotaryEmbedding, apply_rotary_pos_emb
 from colossalai.kernel.triton import decoding_fused_rotary_embedding
-from tests.test_infer.test_ops.triton.kernel_utils import (
+from tests.test_infer.test_kernels.triton.kernel_utils import (
     mock_alloc_block_table_and_kvcache_v2,
     mock_alloc_block_table_and_kvcache_v3,
 )

tests/test_infer/test_ops/triton/test_xine_copy.py → tests/test_infer/test_kernels/triton/test_xine_copy.py (renamed, 0 lines changed)

tests/test_infer/test_kvcache_manager.py (2 lines changed)

@@ -164,7 +164,7 @@ def check_cache_manager(test_config):
 def run_dist(rank, world_size, port):
-    colossalai.launch(config={}, rank=rank, world_size=world_size, port=port, host="localhost")
+    colossalai.launch(rank=rank, world_size=world_size, port=port, host="localhost")
     check_cache_manager()

tests/test_infer/test_models/test_baichuan.py (7 lines changed)

@@ -14,7 +14,6 @@ from colossalai.inference.core.engine import InferenceEngine
 from colossalai.inference.modeling.policy import NoPaddingBaichuanModelInferPolicy
 from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn
-# BAICHUAN_MODEL_NAME_OR_PATH = "baichuan-inc/Baichuan2-7B-Base"
 BAICHUAN_MODEL_NAME_OR_PATH = "baichuan-inc/Baichuan2-13B-Base"

@@ -87,7 +86,7 @@ def run_engine(world_size, **kwargs):
 def run_dist(rank, world_size, port, func_to_run, ret=None, **kwargs):
-    colossalai.launch(config={}, rank=rank, world_size=world_size, port=port, host="localhost")
+    colossalai.launch(rank=rank, world_size=world_size, port=port, host="localhost")
     if ret:
         ret[rank] = func_to_run(**kwargs)

@@ -99,7 +98,7 @@ def run_dist(rank, world_size, port, func_to_run, ret=None, **kwargs):
 @parameterize("prompt_template", [None, "baichuan"])
 @parameterize("do_sample", [False])
 @parameterize("use_cuda_kernel", [True])
-def test_tp_engine(prompt_template, do_sample, use_cuda_kernel):
+def check_tp_engine(prompt_template, do_sample, use_cuda_kernel):
     kwargs1 = {
         "use_engine": True,
         "prompt_template": prompt_template,

@@ -132,7 +131,7 @@ def test_tp_engine(prompt_template, do_sample, use_cuda_kernel):
 @pytest.mark.dist
 @rerun_if_address_is_in_use()
 def test_inference_engine():
-    test_tp_engine()
+    check_tp_engine()
 if __name__ == "__main__":
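
Note on the rename of test_tp_engine to check_tp_engine: this is a sketch of pytest's default collection behavior, not something stated in the commit. pytest collects top-level functions whose names match test_*, so the parameterized helper would previously have been collected and run on its own as well as through test_inference_engine; with the check_ prefix it runs only when called explicitly. The function names below are hypothetical stand-ins.

# Sketch: under pytest's default python_functions = "test_*" pattern, only the
# second function is collected; the check_* helper runs only when called by it.
def check_heavy_engine_run():  # not collected as a standalone test
    assert 1 + 1 == 2

def test_engine_entry_point():  # collected by pytest; drives the helper
    check_heavy_engine_run()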

tests/test_infer/test_request_handler.py (2 lines changed)

@@ -90,7 +90,7 @@ def check_request_handler():
 def run_dist(rank, world_size, port):
-    colossalai.launch(config={}, rank=rank, world_size=world_size, port=port, host="localhost")
+    colossalai.launch(rank=rank, world_size=world_size, port=port, host="localhost")
     check_running_list()
     check_request_handler()
