ColossalAI/tests/test_infer/test_request_handler.py

import pytest
from transformers.models.llama import LlamaConfig

import colossalai
from colossalai.inference.config import InferenceConfig
from colossalai.inference.core.request_handler import RequestHandler, RunningList
from colossalai.inference.struct import RequestStatus, Sequence
from colossalai.testing import rerun_if_address_is_in_use, spawn


def check_running_list():
    """
    Exercise the RunningList structure.

    Two sequences are appended, one is looked up by request id, the prefill
    entries are marked as running, both are moved to the decoding stage and
    finally both are removed again.
    """

    def _build_seq(request_id):
        # Every field except the request id is identical for both sequences.
        return Sequence(
            request_id=request_id,
            prompt="abc",
            input_token_id=[1, 2, 3],
            block_size=16,
            eos_token_id=0,
            pad_token_id=0,
            sample_params=None,
        )

    running_list = RunningList(prefill_ratio=1.2)
    first = _build_seq(1)
    second = _build_seq(2)
    for item in (first, second):
        running_list.append(item)

    # Freshly appended sequences sit in the prefill stage, in insertion order.
    assert running_list.ready_for_prefill()
    assert len(running_list.decoding) == 0
    assert len(running_list.prefill) > 0
    assert running_list.prefill[0] == first

    found = running_list.find_seq(first.request_id)
    assert found == first

    running_list.mark_prefill_running()
    assert all(entry.status == RequestStatus.RUNNING for entry in running_list.prefill)

    moved_ids = [first.request_id, second.request_id]
    running_list.move_prefill_to_decoding(moved_ids)
    assert len(running_list.prefill) == 0
    assert len(running_list.decoding) > 0
    assert running_list.decoding[0] == first

    for item in (first, second):
        running_list.remove(item)
    assert running_list.is_empty()


def check_request_handler():
    """
    Exercise the main entry points of RequestHandler.

    A sequence is added and checked to be waiting, then aborted, then put
    back into the WAITING state, re-added and passed through one `schedule()`
    call.
    """
    handler = RequestHandler(
        InferenceConfig(max_input_len=10, max_output_len=10, block_size=8),
        LlamaConfig(hidden_size=32, num_hidden_layers=2, num_attention_heads=4),
    )
    sequence = Sequence(
        request_id=1,
        prompt="abc",
        input_token_id=[1, 2, 3, 4, 5],
        block_size=16,
        eos_token_id=0,
        pad_token_id=0,
        sample_params=None,
    )

    handler.add_sequence(sequence)
    # The sequence is expected at the head of the priority-1 waiting queue.
    assert handler.waiting_list[1][0] == sequence
    assert handler._has_waiting()

    # Aborting the only queued request must leave nothing waiting.
    handler.abort_sequence(sequence.request_id)
    assert not handler._has_waiting()

    # Put the sequence back into the WAITING state before queueing it again.
    sequence.status = RequestStatus.WAITING
    handler.add_sequence(sequence)
    handler.schedule()


def run_dist(rank, world_size, port):
    """
    Launch colossalai for this process, then run both checks in order.

    Args:
        rank: forwarded to `colossalai.launch` as `rank`.
        world_size: forwarded to `colossalai.launch` as `world_size`.
        port: forwarded to `colossalai.launch` as `port`.
    """
    launch_kwargs = dict(config={}, rank=rank, world_size=world_size, port=port, host="localhost")
    colossalai.launch(**launch_kwargs)
    for check in (check_running_list, check_request_handler):
        check()


@pytest.mark.dist
@rerun_if_address_is_in_use()
def test_running_list_and_request_handler():
    """
    Pytest entry point: hand `run_dist` to `spawn`.
    """
    # Second positional argument to spawn; presumably the process count -- confirm.
    worker_count = 1
    spawn(run_dist, worker_count)


# Allow running this test module directly as a script, without invoking pytest.
if __name__ == "__main__":
    test_running_list_and_request_handler()
[Inference] add logit processor and request handler (#5166) * add logit processor and request handler * add * add * add * fix * add search tokens and update func * finish request handler * add running list test * fix test * fix some bug * add * add * fix bugs * fix some bugs * fix bug * fix * fix * add copy fun * del useless attn * fix request status --------- Co-authored-by: CjhHa1 <cjh18671720497outlook.com> 11 months ago			`import pytest`
			`from transformers.models.llama import LlamaConfig`

			`import colossalai`
			`from colossalai.inference.config import InferenceConfig`
			`from colossalai.inference.core.request_handler import RequestHandler, RunningList`
			`from colossalai.inference.struct import RequestStatus, Sequence`
[Hotfix] Fix accuracy and align attention method api with Triton kernel (#5229) * fix accuracy * alignment in attention * fix attention * fix * fix bugs * fix bugs * fix bugs 11 months ago			`from colossalai.testing import rerun_if_address_is_in_use, spawn`
[Inference] add logit processor and request handler (#5166) * add logit processor and request handler * add * add * add * fix * add search tokens and update func * finish request handler * add running list test * fix test * fix some bug * add * add * fix bugs * fix some bugs * fix bug * fix * fix * add copy fun * del useless attn * fix request status --------- Co-authored-by: CjhHa1 <cjh18671720497outlook.com> 11 months ago

			`def check_running_list():`
			`"""`
			`Test the RunningList Structure.`
			`"""`
			`running_list = RunningList(prefill_ratio=1.2)`
			`seq1 = Sequence(`
			`request_id=1,`
			`prompt="abc",`
			`input_token_id=[1, 2, 3],`
			`block_size=16,`
			`eos_token_id=0,`
[inference]Optimize the usage of the mid tensors space in flash attn (#5304) * opt flash attn * opt tmp tensor * fix benchmark_llama * fix code style * fix None logic for output tensor * fix adapted to get_xine_cache * add comment * fix ci bugs * fix some codes * rm duplicated codes * rm duplicated codes * fix code style * add _get_dtype in config.py 10 months ago			`pad_token_id=0,`
[Inference] add logit processor and request handler (#5166) * add logit processor and request handler * add * add * add * fix * add search tokens and update func * finish request handler * add running list test * fix test * fix some bug * add * add * fix bugs * fix some bugs * fix bug * fix * fix * add copy fun * del useless attn * fix request status --------- Co-authored-by: CjhHa1 <cjh18671720497outlook.com> 11 months ago			`sample_params=None,`
			`)`
[Inference] Optimize and Refactor Inference Batching/Scheduling (#5367) * add kvcache manager funcs for batching * add batch bucket for batching * revise RunningList struct in handler * add kvcache/batch funcs for compatibility * use new batching methods * fix indexing bugs * revise abort logic * use cpu seq lengths/block tables * rm unused attr in Sequence * fix type conversion/default arg * add and revise pytests * revise pytests, rm unused tests * rm unused statements * fix pop finished indexing issue * fix: use index in batch when retrieving inputs/update seqs * use dict instead of odict in batch struct * arg type hinting * fix make compress * refine comments * fix: pop_n_seqs to pop the first n seqs * add check in request handler * remove redundant conversion * fix test for request handler * fix pop method in batch bucket * fix prefill adding 9 months ago			`seq2 = Sequence(`
			`request_id=2,`
			`prompt="abc",`
			`input_token_id=[1, 2, 3],`
			`block_size=16,`
			`eos_token_id=0,`
			`pad_token_id=0,`
			`sample_params=None,`
			`)`
[Inference] add logit processor and request handler (#5166) * add logit processor and request handler * add * add * add * fix * add search tokens and update func * finish request handler * add running list test * fix test * fix some bug * add * add * fix bugs * fix some bugs * fix bug * fix * fix * add copy fun * del useless attn * fix request status --------- Co-authored-by: CjhHa1 <cjh18671720497outlook.com> 11 months ago			`running_list.append(seq1)`
[Inference] Optimize and Refactor Inference Batching/Scheduling (#5367) * add kvcache manager funcs for batching * add batch bucket for batching * revise RunningList struct in handler * add kvcache/batch funcs for compatibility * use new batching methods * fix indexing bugs * revise abort logic * use cpu seq lengths/block tables * rm unused attr in Sequence * fix type conversion/default arg * add and revise pytests * revise pytests, rm unused tests * rm unused statements * fix pop finished indexing issue * fix: use index in batch when retrieving inputs/update seqs * use dict instead of odict in batch struct * arg type hinting * fix make compress * refine comments * fix: pop_n_seqs to pop the first n seqs * add check in request handler * remove redundant conversion * fix test for request handler * fix pop method in batch bucket * fix prefill adding 9 months ago			`running_list.append(seq2)`
[Inference] add logit processor and request handler (#5166) * add logit processor and request handler * add * add * add * fix * add search tokens and update func * finish request handler * add running list test * fix test * fix some bug * add * add * fix bugs * fix some bugs * fix bug * fix * fix * add copy fun * del useless attn * fix request status --------- Co-authored-by: CjhHa1 <cjh18671720497outlook.com> 11 months ago			`assert running_list.ready_for_prefill()`
[Inference] Optimize and Refactor Inference Batching/Scheduling (#5367) * add kvcache manager funcs for batching * add batch bucket for batching * revise RunningList struct in handler * add kvcache/batch funcs for compatibility * use new batching methods * fix indexing bugs * revise abort logic * use cpu seq lengths/block tables * rm unused attr in Sequence * fix type conversion/default arg * add and revise pytests * revise pytests, rm unused tests * rm unused statements * fix pop finished indexing issue * fix: use index in batch when retrieving inputs/update seqs * use dict instead of odict in batch struct * arg type hinting * fix make compress * refine comments * fix: pop_n_seqs to pop the first n seqs * add check in request handler * remove redundant conversion * fix test for request handler * fix pop method in batch bucket * fix prefill adding 9 months ago			`assert len(running_list.decoding) == 0`
			`assert len(running_list.prefill) > 0 and running_list.prefill[0] == seq1`
[Inference] add logit processor and request handler (#5166) * add logit processor and request handler * add * add * add * fix * add search tokens and update func * finish request handler * add running list test * fix test * fix some bug * add * add * fix bugs * fix some bugs * fix bug * fix * fix * add copy fun * del useless attn * fix request status --------- Co-authored-by: CjhHa1 <cjh18671720497outlook.com> 11 months ago
			`seq = running_list.find_seq(seq1.request_id)`
			`assert seq == seq1`

[Inference] Optimize and Refactor Inference Batching/Scheduling (#5367) * add kvcache manager funcs for batching * add batch bucket for batching * revise RunningList struct in handler * add kvcache/batch funcs for compatibility * use new batching methods * fix indexing bugs * revise abort logic * use cpu seq lengths/block tables * rm unused attr in Sequence * fix type conversion/default arg * add and revise pytests * revise pytests, rm unused tests * rm unused statements * fix pop finished indexing issue * fix: use index in batch when retrieving inputs/update seqs * use dict instead of odict in batch struct * arg type hinting * fix make compress * refine comments * fix: pop_n_seqs to pop the first n seqs * add check in request handler * remove redundant conversion * fix test for request handler * fix pop method in batch bucket * fix prefill adding 9 months ago			`running_list.mark_prefill_running()`
			`for seq in running_list.prefill:`
			`assert seq.status == RequestStatus.RUNNING`

			`running_list.move_prefill_to_decoding([seq1.request_id, seq2.request_id])`
			`assert len(running_list.prefill) == 0`
			`assert len(running_list.decoding) > 0 and running_list.decoding[0] == seq1`

[Inference] add logit processor and request handler (#5166) * add logit processor and request handler * add * add * add * fix * add search tokens and update func * finish request handler * add running list test * fix test * fix some bug * add * add * fix bugs * fix some bugs * fix bug * fix * fix * add copy fun * del useless attn * fix request status --------- Co-authored-by: CjhHa1 <cjh18671720497outlook.com> 11 months ago			`running_list.remove(seq1)`
[Inference] Optimize and Refactor Inference Batching/Scheduling (#5367) * add kvcache manager funcs for batching * add batch bucket for batching * revise RunningList struct in handler * add kvcache/batch funcs for compatibility * use new batching methods * fix indexing bugs * revise abort logic * use cpu seq lengths/block tables * rm unused attr in Sequence * fix type conversion/default arg * add and revise pytests * revise pytests, rm unused tests * rm unused statements * fix pop finished indexing issue * fix: use index in batch when retrieving inputs/update seqs * use dict instead of odict in batch struct * arg type hinting * fix make compress * refine comments * fix: pop_n_seqs to pop the first n seqs * add check in request handler * remove redundant conversion * fix test for request handler * fix pop method in batch bucket * fix prefill adding 9 months ago			`running_list.remove(seq2)`
[Inference] add logit processor and request handler (#5166) * add logit processor and request handler * add * add * add * fix * add search tokens and update func * finish request handler * add running list test * fix test * fix some bug * add * add * fix bugs * fix some bugs * fix bug * fix * fix * add copy fun * del useless attn * fix request status --------- Co-authored-by: CjhHa1 <cjh18671720497outlook.com> 11 months ago			`assert running_list.is_empty()`


			`def check_request_handler():`
			`"""`
			`Test main function of RequestHandler`
			`"""`
			`inference_config = InferenceConfig(`
			`max_input_len=10,`
			`max_output_len=10,`
			`block_size=8,`
			`)`
			`model_config = LlamaConfig(`
			`hidden_size=32,`
			`num_hidden_layers=2,`
			`num_attention_heads=4,`
			`)`
			`request_handler = RequestHandler(inference_config, model_config)`
			`seq1 = Sequence(`
			`request_id=1,`
			`prompt="abc",`
			`input_token_id=[1, 2, 3, 4, 5],`
			`block_size=16,`
			`eos_token_id=0,`
[inference]Optimize the usage of the mid tensors space in flash attn (#5304) * opt flash attn * opt tmp tensor * fix benchmark_llama * fix code style * fix None logic for output tensor * fix adapted to get_xine_cache * add comment * fix ci bugs * fix some codes * rm duplicated codes * rm duplicated codes * fix code style * add _get_dtype in config.py 10 months ago			`pad_token_id=0,`
[Inference] add logit processor and request handler (#5166) * add logit processor and request handler * add * add * add * fix * add search tokens and update func * finish request handler * add running list test * fix test * fix some bug * add * add * fix bugs * fix some bugs * fix bug * fix * fix * add copy fun * del useless attn * fix request status --------- Co-authored-by: CjhHa1 <cjh18671720497outlook.com> 11 months ago			`sample_params=None,`
			`)`
			`request_handler.add_sequence(seq1)`
			`# the priority should be 1`
			`assert request_handler.waiting_list[1][0] == seq1`
			`assert request_handler._has_waiting()`

			`request_handler.abort_sequence(seq1.request_id)`
			`assert not request_handler._has_waiting()`
			`seq1.status = RequestStatus.WAITING`
			`request_handler.add_sequence(seq1)`
			`request_handler.schedule()`


			`def run_dist(rank, world_size, port):`
			`colossalai.launch(config={}, rank=rank, world_size=world_size, port=port, host="localhost")`
			`check_running_list()`
			`check_request_handler()`


			`@pytest.mark.dist`
[Hotfix] Fix accuracy and align attention method api with Triton kernel (#5229) * fix accuracy * alignment in attention * fix attention * fix * fix bugs * fix bugs * fix bugs 11 months ago			`@rerun_if_address_is_in_use()`
[Inference] add logit processor and request handler (#5166) * add logit processor and request handler * add * add * add * fix * add search tokens and update func * finish request handler * add running list test * fix test * fix some bug * add * add * fix bugs * fix some bugs * fix bug * fix * fix * add copy fun * del useless attn * fix request status --------- Co-authored-by: CjhHa1 <cjh18671720497outlook.com> 11 months ago			`def test_running_list_and_request_handler():`
			`spawn(run_dist, 1)`


			`if __name__ == "__main__":`
			`test_running_list_and_request_handler()`