# ColossalAI/applications/Chat/tests/test_models.py

import copy
from typing import Any, Callable, Dict, Tuple

import pytest
import torch
import torch.nn as nn
from coati.models.base import Actor, Critic, RewardModel, get_base_model
from coati.models.bloom import BLOOMRM, BLOOMActor, BLOOMCritic
from coati.models.generation import generate
from coati.models.gpt import GPTRM, GPTActor, GPTCritic
from coati.models.llama import LlamaActor, LlamaCritic, LlamaRM
from coati.models.lora import LoraLinear, convert_to_lora_module
from coati.models.loss import GPTLMLoss, LogExpLoss, LogSigLoss, PolicyLoss, ValueLoss
from coati.models.opt import OPTRM, OPTActor, OPTCritic
from coati.models.utils import calc_action_log_probs, compute_reward, masked_mean


@pytest.mark.gpu
@pytest.mark.parametrize("batch_size", [4])
@pytest.mark.parametrize("seq_len", [32])
@pytest.mark.parametrize("actor_maker", [
lambda: BLOOMActor(),
lambda: GPTActor(),
# HACK: skip llama due to long execution time
# lambda: LlamaActor(),
lambda: OPTActor()
])
@pytest.mark.parametrize("generate_kwargs", [{
"max_length": 64,
"use_cache": True,
"do_sample": True,
"temperature": 1.0,
"top_k": 50,
}])
def test_generation(actor_maker: Callable[[], Actor],
batch_size: int,
seq_len: int,
generate_kwargs: Dict[str, Any]
):
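    # Generated sequences should have shape (batch_size, max_length) for every actor backbone.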
actor = actor_maker()
input_ids = torch.randint(0, 100, (batch_size, seq_len)).cuda()
sequences = generate(actor.cuda(), input_ids, **generate_kwargs)
    assert sequences.shape == (batch_size, generate_kwargs["max_length"])


@pytest.mark.cpu
def test_utils():
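    # masked_mean: averaging a tensor of ones over any non-empty random mask gives 1.0.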
fn_input = {
"tensor": torch.ones((10, )),
"mask": torch.randint(0, 2, (10, ))
}
fn_output = masked_mean(dim=0, **fn_input)
assert fn_output.dim() == 0
assert torch.allclose(fn_output, torch.tensor(1.0))
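    # compute_reward: combines per-sequence reward with policy/base log-probs and returns one scalar per sequence.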
batch_size = 4
num_labels = 10
fn_input = {
"r": torch.ones((batch_size, )),
"kl_coef": 1.0,
"log_probs": torch.randn((batch_size, num_labels)),
"log_probs_base": torch.randn((batch_size, num_labels)),
"action_mask": torch.randint(0, 2, (batch_size, num_labels))
}
fn_output = compute_reward(**fn_input)
assert fn_output.shape == (batch_size, )
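    # calc_action_log_probs: extracts per-action log-probs, shape (batch_size, num_actions).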
batch_size = 4
seq_len = 32
num_labels = 10
num_actions = 2
fn_input = {
"output": {
"logits": torch.randn((batch_size, seq_len, num_labels))
},
"sequences": torch.randint(0, num_labels, (batch_size, seq_len)),
"num_actions": num_actions,
}
fn_output = calc_action_log_probs(**fn_input)
    assert fn_output.shape == (batch_size, num_actions)


@pytest.mark.cpu
@pytest.mark.parametrize("lora_rank", [4])
@pytest.mark.parametrize("num_dim", [32])
@pytest.mark.parametrize("num_layers", [4])
def test_lora(lora_rank: int,
num_dim: int,
num_layers: int):
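    # convert_to_lora_module should wrap each Linear as a LoraLinear with rank-`lora_rank` A/B matrices.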
model = nn.ModuleList(
[nn.Linear(num_dim, num_dim)
for _ in range(num_layers)]
)
lora_model = convert_to_lora_module(model, lora_rank)
assert isinstance(lora_model, nn.ModuleList)
for i in range(num_layers):
assert isinstance(lora_model[i], LoraLinear)
assert lora_model[i].lora_A.shape == (lora_rank, num_dim)
assert lora_model[i].lora_B.shape == (num_dim, lora_rank)
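    # Snapshot the converted model; before any training step the copy must match exactly.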
old_model = copy.deepcopy(lora_model)
for i in range(num_layers):
assert isinstance(lora_model[i], LoraLinear)
assert torch.allclose(old_model[i].weight, lora_model[i].weight)
assert torch.allclose(old_model[i].bias, lora_model[i].bias)
assert torch.allclose(old_model[i].lora_B @ old_model[i].lora_A,
lora_model[i].lora_B @ lora_model[i].lora_A)
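    # Run one forward/backward/optimizer step through all layers.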
optimizer = torch.optim.Adam(lora_model.parameters())
x = torch.randn(8, num_dim)
for i in range(num_layers):
x = lora_model[i](x)
loss = x.sum()
loss.backward()
optimizer.step()
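    # The base weight/bias must stay frozen while the low-rank product lora_B @ lora_A has changed.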
for i in range(num_layers):
assert isinstance(lora_model[i], LoraLinear)
assert torch.allclose(old_model[i].weight, lora_model[i].weight)
assert torch.allclose(old_model[i].bias, lora_model[i].bias)
        assert not torch.allclose(old_model[i].lora_B @ old_model[i].lora_A,
                                  lora_model[i].lora_B @ lora_model[i].lora_A)


@pytest.mark.cpu
@pytest.mark.parametrize("batch_size", [8])
@pytest.mark.parametrize("seq_len", [128])
@pytest.mark.parametrize("models_maker", [
lambda: (BLOOMActor(), BLOOMCritic(), BLOOMRM()),
lambda: (GPTActor(), GPTCritic(), GPTRM()),
# HACK: skip llama due to long execution time
# lambda: (LlamaActor(), LlamaCritic(), LlamaRM()),
lambda: (OPTActor(), OPTCritic(), OPTRM()),
])
@torch.no_grad()
def test_models(models_maker: Callable[[], Tuple[Actor, Critic, RewardModel]],
batch_size: int,
seq_len: int):
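    # Instantiate actor/critic/reward model for one backbone and check output shapes on random inputs.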
actor_input = {
"input_ids": torch.randint(0, 100, (batch_size, seq_len)),
"attention_mask": torch.randint(0, 2, (batch_size, seq_len))
}
critic_input = {
"sequences": torch.randint(0, 100, (batch_size, seq_len)),
"action_mask": torch.randint(0, 2, (batch_size, seq_len)),
"attention_mask": torch.randint(0, 2, (batch_size, seq_len))
}
rm_input = {
"sequences": torch.randint(0, 100, (batch_size, seq_len)),
"attention_mask": torch.randint(0, 2, (batch_size, seq_len))
}
actor, critic, rm = models_maker()
assert isinstance(actor, Actor)
base_actor_model = get_base_model(actor)
assert isinstance(critic, Critic)
base_critic_model = get_base_model(critic)
assert isinstance(rm, RewardModel)
base_rm_model = get_base_model(rm)
actor_output = actor(**actor_input)
critic_output = critic(**critic_input)
rm_output = rm(**rm_input)
assert actor_output.logits.shape[:2] == (batch_size, seq_len)
assert critic_output.shape == (batch_size, )
    assert rm_output.shape == (batch_size, )


@pytest.mark.cpu
@pytest.mark.parametrize("batch_size", [16])
@pytest.mark.parametrize("seq_len", [128])
@pytest.mark.parametrize("num_labels", [100])
def test_loss(batch_size: int,
seq_len: int,
num_labels: int):
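    # Smoke test: each loss must run forward on random inputs; output values are not asserted.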
loss = GPTLMLoss()
loss_input = {
"logits": torch.randn(batch_size, seq_len, num_labels),
"labels": torch.randint(0, num_labels, (batch_size, seq_len))
}
loss_output = loss(**loss_input)
loss = PolicyLoss()
loss_input = {
"log_probs": torch.randn(batch_size, ),
"old_log_probs": torch.randn(batch_size, ),
"advantages": torch.randn(batch_size, )
}
loss_output = loss(**loss_input)
loss = ValueLoss()
loss_input = {
"values": torch.randn(batch_size, ),
"old_values": torch.randn(batch_size, ),
"reward": torch.randn(batch_size, )
}
loss_output = loss(**loss_input)
loss = LogSigLoss()
loss_input = {
"chosen_reward": torch.randn(batch_size, ),
"reject_reward": torch.randn(batch_size, ),
}
loss_output = loss(**loss_input)
loss = LogExpLoss()
loss_input = {
"chosen_reward": torch.randn(batch_size, ),
"reject_reward": torch.randn(batch_size, ),
}
    loss_output = loss(**loss_input)


if __name__ == "__main__":
    # Running this file directly exercises a small subset of the tests without pytest.
generate_kwargs = dict(max_length=40,
use_cache=True,
do_sample=True,
temperature=1.0,
top_k=50)
test_generation(lambda: LlamaActor(),
batch_size=4,
seq_len=32,
generate_kwargs=generate_kwargs)
test_utils()
test_lora(lora_rank=2, num_dim=8, num_layers=2)
test_models(models_maker=lambda: (BLOOMActor(),
BLOOMCritic(),
BLOOMRM()),
batch_size=8,
seq_len=128)
test_loss(batch_size=8, seq_len=128, num_labels=100)