2024-02-28 05:48:17 +00:00
|
|
|
import pytest
|
|
|
|
import torch
|
|
|
|
from transformers import AutoTokenizer, LlamaConfig, LlamaForCausalLM
|
|
|
|
|
2024-04-01 13:54:24 +00:00
|
|
|
from colossalai.inference.modeling.models.glide_llama import GlideLlamaConfig, GlideLlamaForCausalLM
|
2024-02-28 05:48:17 +00:00
|
|
|
from colossalai.inference.spec.drafter import Drafter
|
|
|
|
from colossalai.utils import get_current_device
|
|
|
|
|
2024-04-01 13:54:24 +00:00
|
|
|
# Number of hidden layers used for the toy models built in these tests
# (passed as ``num_hidden_layers`` to both LlamaConfig and GlideLlamaConfig).
NUM_LAYERS = 1
# NOTE(review): not referenced by the tests visible in this file — confirm
# whether it is still needed.
MAX_LEN = 100
# Number of tokens the drafter is asked to speculate per call.
SPEC_NUM = 5
|
2024-02-28 05:48:17 +00:00
|
|
|
|
|
|
|
|
2024-05-08 03:30:15 +00:00
|
|
|
@pytest.fixture(scope="module")
def tokenizer():
    """Provide the llama test tokenizer, created once per test module."""
    llama_tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/llama-tokenizer")
    return llama_tokenizer
|
|
|
|
|
|
|
|
|
2024-04-01 13:54:24 +00:00
|
|
|
@pytest.mark.parametrize("spec_num", [SPEC_NUM])
def test_drafter(tokenizer, spec_num: int):
    """Check the output shapes of ``Drafter.speculate`` and KV-cache trimming.

    A randomly initialised Llama model with ``NUM_LAYERS`` hidden layers is
    used as the drafter model.

    Args:
        tokenizer: Tokenizer supplied by the ``tokenizer`` fixture.
        spec_num: Number of tokens the drafter is asked to speculate.
    """
    torch.manual_seed(123)

    device = get_current_device()
    toy_config = LlamaConfig(num_hidden_layers=NUM_LAYERS)
    toy_config.pad_token_id = tokenizer.eos_token_id
    # Move the model to the same device that is handed to the Drafter and used
    # for the inputs, rather than hard-coding CUDA.
    drafter_model = LlamaForCausalLM(toy_config).eval().to(device)

    drafter = Drafter(drafter_model, tokenizer, device=device)

    input_ids = torch.randint(low=5, high=1000, size=(1, 6)).to(device)
    out = drafter.speculate(input_ids, spec_num)
    # Expected cache length: prompt length plus all but one speculated token.
    # NOTE(review): presumably the last speculated token has not been fed back
    # through the model yet — confirm against Drafter.speculate.
    past_kv_length = input_ids.size(1) + spec_num - 1

    assert out.speculated_length == spec_num
    assert out.next_tokens.shape == (spec_num,)
    assert out.logits.shape == (spec_num, len(tokenizer))
    assert out.past_key_values[0][0].size(2) == past_kv_length

    # Reject all but one speculated token and check the cache shrinks by
    # exactly that many positions along dimension 2.
    reject_num = max(0, spec_num - 1)
    trimmed_past_key_values = drafter.trim_kv_cache(out.past_key_values, reject_num)
    assert trimmed_past_key_values[0][0].size(2) == past_kv_length - reject_num
|
|
|
|
|
|
|
|
|
2024-05-08 03:30:15 +00:00
|
|
|
def test_spec_dec(tokenizer):
    """Smoke-test speculation with a Glide-Llama drafter model.

    Builds a ``GlideLlamaForCausalLM`` from a dummy config, checks that every
    layer exposes a ``cross_attn`` attribute, and runs one ``speculate`` call.

    Args:
        tokenizer: Tokenizer supplied by the ``tokenizer`` fixture. Its
            ``pad_token`` is set to its ``eos_token`` by this test.
    """
    spec_num = SPEC_NUM
    device = get_current_device()
    tokenizer.pad_token = tokenizer.eos_token

    # Dummy config for Glide Model
    glide_config = GlideLlamaConfig(
        intermediate_size=8192,
        large_hidden_size=4096,
        large_num_attention_heads=32,
        num_hidden_layers=NUM_LAYERS,
    )
    drafter_model = GlideLlamaForCausalLM(glide_config)

    assert hasattr(drafter_model, "model")
    assert hasattr(drafter_model.model, "layers")
    for layer in drafter_model.model.layers:
        assert hasattr(layer, "cross_attn")

    # Wrap the Glide drafter model in a Drafter, requesting float16.
    drafter = Drafter(drafter_model, tokenizer, device=device, dtype=torch.float16)

    input_ids = torch.randint(low=5, high=1000, size=(1, 6)).to(device)
    # Only checks that speculation runs without a prior KV cache; the result
    # is not inspected.
    drafter.speculate(input_ids, spec_num, past_key_values=None)
|
2024-02-28 05:48:17 +00:00
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # When run as a script pytest does not inject the fixture, so the
    # tokenizer is built by hand and passed to each test directly.
    tokenizer_name = "hf-internal-testing/llama-tokenizer"
    dummy_tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    test_drafter(dummy_tokenizer, spec_num=SPEC_NUM)
    test_spec_dec(dummy_tokenizer)
|