diff --git a/tests/test_hf_model.py b/tests/test_hf_model.py
index 092cb59..dc9ed16 100644
--- a/tests/test_hf_model.py
+++ b/tests/test_hf_model.py
@@ -161,164 +161,6 @@ class TestMath:
         assert '2' in response
 
 
-class TestReward:
-    """Test cases for base model."""
-
-    @pytest.mark.parametrize(
-        'model_name',
-        [
-            'internlm/internlm-reward-1_8b', 'internlm/internlm-reward-7b',
-            'internlm/internlm-reward-20b'
-        ],
-    )
-    @pytest.mark.parametrize(
-        'usefast',
-        [
-            True,
-            False,
-        ],
-    )
-    def test_demo_default(self, model_name, usefast):
-        tokenizer = AutoTokenizer.from_pretrained(model_name,
-                                                  trust_remote_code=True,
-                                                  use_fast=usefast)
-        model = AutoModel.from_pretrained(
-            model_name,
-            device_map='cuda',
-            torch_dtype=torch.float16,
-            trust_remote_code=True,
-        )
-        tokenizer = AutoTokenizer.from_pretrained(model_name,
-                                                  trust_remote_code=True)
-
-        chat_1 = [{
-            'role': 'user',
-            'content': "Hello! What's your name?"
-        }, {
-            'role':
-            'assistant',
-            'content':
-            'I am InternLM2! A helpful AI assistant. What can I do for you?'
-        }]
-        chat_2 = [{
-            'role': 'user',
-            'content': "Hello! What's your name?"
-        }, {
-            'role': 'assistant',
-            'content': 'I have no idea.'
-        }]
-
-        # get reward score for a single chat
-        score1 = model.get_score(tokenizer, chat_1)
-        score2 = model.get_score(tokenizer, chat_2)
-        print('score1: ', score1)
-        print('score2: ', score2)
-        assert score1 > 0
-        assert score2 < 0
-
-        # batch inference, get multiple scores at once
-        scores = model.get_scores(tokenizer, [chat_1, chat_2])
-        print('scores: ', scores)
-        assert scores[0] > 0
-        assert scores[1] < 0
-
-        # compare whether chat_1 is better than chat_2
-        compare_res = model.compare(tokenizer, chat_1, chat_2)
-        print('compare_res: ', compare_res)
-        assert compare_res
-        # >>> compare_res: True
-
-        # rank multiple chats, it will return the ranking index of each chat
-        # the chat with the highest score will have ranking index as 0
-        rank_res = model.rank(tokenizer, [chat_1, chat_2])
-        print('rank_res: ', rank_res)  # lower index means higher score
-        # >>> rank_res: [0, 1]
-        assert rank_res[0] == 0
-        assert rank_res[1] == 1
-
-    @pytest.mark.parametrize(
-        'model_name',
-        [
-            'internlm/internlm-reward-1_8b', 'internlm/internlm-reward-7b',
-            'internlm/internlm-reward-20b'
-        ],
-    )
-    @pytest.mark.parametrize(
-        'usefast',
-        [
-            True,
-            False,
-        ],
-    )
-    def test_demo_topn(self, model_name, usefast):
-        # prepare the llm model and tokenizer
-        llm = AutoModel.from_pretrained(
-            'internlm/internlm2-chat-7b',
-            device_map='cuda',
-            torch_dtype=torch.float16,
-            trust_remote_code=True,
-        )
-        llm_tokenizer = AutoTokenizer.from_pretrained(
-            'internlm/internlm2-chat-7b', trust_remote_code=True)
-
-        # prepare the reward model and tokenizer
-        reward = AutoModel.from_pretrained(
-            model_name,
-            device_map='cuda',
-            torch_dtype=torch.float16,
-            trust_remote_code=True,
-        )
-        reward_tokenizer = AutoTokenizer.from_pretrained(
-            model_name, trust_remote_code=True)
-
-        # prepare the chat prompt
-        prompt = 'Write an short bedtime story.'
-        messages = [{
-            'role': 'system',
-            'content': 'You are a helpful assistant.'
-        }, {
-            'role': 'user',
-            'content': prompt
-        }]
-        text = llm_tokenizer.apply_chat_template(messages,
-                                                 tokenize=False,
-                                                 add_generation_prompt=True)
-        model_inputs = llm_tokenizer([text], return_tensors='pt').to('cuda')
-
-        # generate best of N candidates
-        num_candidates = 3  # N=3
-        candidates = []
-
-        outputs = llm.generate(
-            **model_inputs,
-            max_new_tokens=512,
-            num_return_sequences=num_candidates,
-            pad_token_id=llm_tokenizer.eos_token_id,
-            do_sample=True,
-            top_k=50,
-            top_p=0.95,
-            temperature=0.8,
-        )
-        outputs = outputs[:, model_inputs['input_ids'].shape[1]:]
-        for i in range(num_candidates):
-            candidate = llm_tokenizer.decode(outputs[i],
-                                             skip_special_tokens=True)
-            candidates.append(messages + [{
-                'role': 'assistant',
-                'content': candidate
-            }])
-
-        rank_indices = reward.rank(reward_tokenizer, candidates)
-        sorted_candidates = sorted(zip(rank_indices, candidates),
-                                   key=lambda x: x[0])
-
-        # print the best response
-        best_response = sorted_candidates[0][1][-1]['content']
-        print(sorted_candidates)
-        print(best_response)
-        assert len(sorted_candidates) == 3
-
-
 class TestMMModel:
     """Test cases for base model."""