from typing import List, Optional, Tuple

import torch
from transformers.generation import GenerationConfig

from colossalai.inference.logit_processors import logit_processor


def greedy_sample(
    generation_config,
    logprobs: torch.Tensor,
) -> torch.Tensor:
    """
    Sample tokens greedily (argmax over the log-probabilities).
    """
    results = torch.argmax(logprobs, dim=-1)
    return results


def multinomial_sample(
    generation_config,
    probs: torch.Tensor,
) -> torch.Tensor:
    """
    Sample tokens randomly from the probability distribution (multinomial sampling).
    """
    random_results = torch.multinomial(probs, num_samples=1).squeeze(1)
    return random_results


def beam_search_sample(
    generation_config,
    logprobs: torch.Tensor,
    is_prompt: bool = False,
) -> List[Tuple[List[int], List[int]]]:
    """
    Sample tokens with beam search.

    We sample 2 * beam_width candidates to make sure that, with high probability, we can still get
    `beam_width` candidates for the next iteration in addition to the finished sequences.

    Refs:
        https://github.com/tensorflow/tensor2tensor/blob/bafdc1b67730430d38d6ab802cbd51f9d053ba2e/tensor2tensor/utils/beam_search.py#L557-L563
        https://github.com/huggingface/transformers/blob/a4dd53d88e4852f023332d284ff07a01afcd5681/src/transformers/generation/utils.py#L3063-L3065
    """
    # NOTE: this beam search sample function is still incomplete: in the generation phase, the
    # cumulative log-probabilities of the live beams are not added to `logprobs` before ranking.
    beam_width = generation_config.num_beams
    results = []
    if is_prompt:
        # Prompt phase: all candidates descend from the single prompt sequence.
        parent_ids = [0] * (2 * beam_width)
        _, next_token_ids = torch.topk(logprobs[0], 2 * beam_width)
        next_token_ids = next_token_ids.tolist()
    else:
        # Generation phase: take the top 2 * beam_width (beam, token) pairs over the flattened scores.
        # TODO: fold in the cumulative log-probability of each live beam, e.g.
        # cumulative_logprobs = [seq_data[seq_id].cumulative_logprob for seq_id in seq_ids]
        vocab_size = logprobs.size(-1)
        _, topk_ids = torch.topk(logprobs.flatten(), 2 * beam_width)
        topk_ids = topk_ids.tolist()
        parent_ids = [idx // vocab_size for idx in topk_ids]
        next_token_ids = [idx % vocab_size for idx in topk_ids]

    results.append((next_token_ids, parent_ids))
    return results

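

# Illustrative sketch, not part of the original module: shows how the flattened top-k indices used in
# the generation phase of `beam_search_sample` map back to (parent beam, next token id). The helper
# name and the assumed `logprobs` shape (beam_width, vocab_size) are illustrative assumptions.
def _beam_candidates_sketch(logprobs: torch.Tensor, beam_width: int) -> List[Tuple[int, int]]:
    """Return (parent_beam, token_id) pairs for the top 2 * beam_width candidates."""
    vocab_size = logprobs.size(-1)
    _, topk_ids = torch.topk(logprobs.flatten(), 2 * beam_width)
    # A flattened index equals parent_beam * vocab_size + token_id, so divmod recovers both parts.
    return [divmod(int(idx), vocab_size) for idx in topk_ids]
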
def _sample(probs: torch.Tensor, logprobs: torch.Tensor, generation_config: GenerationConfig, is_prompt: bool = False):
    if generation_config.num_beams == 1:
        if generation_config.do_sample:
            sample_tokens = multinomial_sample(generation_config, probs)
        else:
            sample_tokens = greedy_sample(generation_config, logprobs)
    else:
        sample_tokens = beam_search_sample(generation_config, logprobs, is_prompt=is_prompt)

    return sample_tokens


def search_tokens(
    generation_config: GenerationConfig,
    logits: torch.Tensor,
    is_prompt: bool = False,
    batch_token_ids: Optional[List[List[int]]] = None,
):
    """
    Sample the next tokens for a batch of sequences according to the generation config.
    """
    # NOTE: need to decide the granularity to process logits (sequence or batch)
    config_dict = generation_config.to_dict()

    # Penalty-style processors (repetition_penalty, no_repeat_ngram_size) need the tokens generated so far.
    for processor_type in ["repetition_penalty", "no_repeat_ngram_size"]:
        if processor_type in config_dict and config_dict[processor_type] is not None:
            logits = logit_processor(processor_type, logits, config_dict[processor_type], batch_token_ids)

    # Sampling-related processors (temperature, top_k, top_p) only apply when sampling is enabled.
    if generation_config.do_sample:
        for processor_type in ["temperature", "top_k", "top_p"]:
            if processor_type in config_dict and config_dict[processor_type] is not None:
                logits = logit_processor(processor_type, logits, config_dict[processor_type])

    # Calculate probabilities and log-probabilities from the processed logits.
    probs = torch.softmax(logits, dim=-1, dtype=torch.float)
    logprobs = torch.log_softmax(logits, dim=-1, dtype=torch.float)

    # Sample the next tokens.
    sample_tokens = _sample(probs, logprobs, generation_config, is_prompt)
    return sample_tokens
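

# A minimal usage sketch, not part of the original module: it exercises `search_tokens` on random
# logits with both greedy decoding and multinomial sampling. The vocab size, batch size, token ids,
# and GenerationConfig values below are made-up assumptions for illustration only.
if __name__ == "__main__":
    batch_size, vocab_size = 2, 32000
    dummy_logits = torch.randn(batch_size, vocab_size)
    dummy_token_ids = [[1, 2, 3], [4, 5, 6]]  # tokens generated so far, one list per sequence

    # Greedy decoding: equivalent to argmax over the processed logits.
    greedy_config = GenerationConfig(do_sample=False, num_beams=1)
    print(search_tokens(greedy_config, dummy_logits, batch_token_ids=dummy_token_ids))

    # Multinomial sampling with temperature / top-k / top-p handled by the logit processors.
    sample_config = GenerationConfig(do_sample=True, num_beams=1, temperature=0.7, top_k=50, top_p=0.9)
    print(search_tokens(sample_config, dummy_logits, batch_token_ids=dummy_token_ids))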