[ci] add reward model into testcase (#769)

Co-authored-by: zhulin1 <zhulin1@pjlab.org.cn>
pull/802/head
zhulinJulia24 2024-08-14 10:20:00 +08:00 committed by GitHub
parent 6920fa080d
commit 4fbc98912c
2 changed files with 385 additions and 21 deletions


@@ -36,12 +36,20 @@ jobs:
          conda create -n internlm-model-latest --clone ${CONDA_BASE_ENV}
          source activate internlm-model-latest
          pip install transformers
      - name: install torch
        run: |
          source activate internlm-model-latest
          # pinned CUDA 11.8 wheels: prebuilt flash-attn, torch/torchvision 2.2.x, and the matching lmdeploy release
          pip install /mnt/petrelfs/qa-caif-cicd/resource/flash_attn-2.5.8+cu118torch2.2cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
          pip install torch==2.2.2 torchvision==0.17.2 --index-url https://download.pytorch.org/whl/cu118
          pip install sentencepiece auto-gptq==0.6.0 beautifulsoup4 decord lxml
          export LMDEPLOY_VERSION=0.5.0
          export PYTHON_VERSION=310
          pip install https://github.com/InternLM/lmdeploy/releases/download/v${LMDEPLOY_VERSION}/lmdeploy-${LMDEPLOY_VERSION}+cu118-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux2014_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu118
          python3 -m pip list
          conda deactivate
      - name: run_test
        run: |
          source activate internlm-model-latest
          pip install torch==2.2.2 torchvision==0.17.2 --index-url https://download.pytorch.org/whl/cu118
          pip install /mnt/petrelfs/qa-caif-cicd/resource/flash_attn-2.5.8+cu118torch2.2cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
          pip install sentencepiece auto-gptq==0.6.0 lmdeploy[all]
          srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} --gpus-per-task=2 pytest -s -v --color=yes ./tests/test_hf_model.py
          conda deactivate
      - name: remove_env


@@ -1,6 +1,7 @@
import pytest
import torch
from auto_gptq.modeling import BaseGPTQForCausalLM
from bs4 import BeautifulSoup
from lmdeploy import TurbomindEngineConfig, pipeline
from PIL import Image
from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer
@@ -22,6 +23,7 @@ class TestChat:
        'model_name',
        [
            'internlm/internlm2_5-7b-chat', 'internlm/internlm2_5-7b-chat-1m',
            'internlm/internlm2_5-20b-chat', 'internlm/internlm2_5-1_8b-chat',
            'internlm/internlm2-chat-7b', 'internlm/internlm2-chat-7b-sft',
            'internlm/internlm2-chat-20b', 'internlm/internlm2-chat-20b-sft',
            'internlm/internlm2-chat-1_8b', 'internlm/internlm2-chat-1_8b-sft'
@@ -61,23 +63,6 @@ class TestChat:
        assert_model(response)


class TestChatAwq:
    """Test cases for chat model."""

    @pytest.mark.parametrize(
        'model_name',
        ['internlm/internlm2-chat-20b-4bits'],
    )
    def test_demo_default(self, model_name):
        engine_config = TurbomindEngineConfig(model_format='awq')
        pipe = pipeline('internlm/internlm2-chat-20b-4bits',
                        backend_config=engine_config)
        responses = pipe(['Hi, pls intro yourself', 'Shanghai is'])
        print(responses)
        for response in responses:
            assert_model(response.text)


class TestBase:
    """Test cases for base model."""
@@ -86,7 +71,8 @@ class TestBase:
        [
            'internlm/internlm2_5-7b', 'internlm/internlm2-7b',
            'internlm/internlm2-base-7b', 'internlm/internlm2-20b',
            'internlm/internlm2-base-20b', 'internlm/internlm2-1_8b',
            'internlm/internlm2_5-20b',
        ],
    )
    @pytest.mark.parametrize(
@@ -279,3 +265,373 @@ class InternLMXComposer2QForCausalLM(BaseGPTQForCausalLM):
        ['feed_forward.w1.linear', 'feed_forward.w3.linear'],
        ['feed_forward.w2.linear'],
    ]


class TestReward:
    """Test cases for reward model."""

    @pytest.mark.parametrize(
        'model_name',
        [
            'internlm/internlm2-1_8b-reward', 'internlm/internlm2-7b-reward',
            'internlm/internlm2-20b-reward'
        ],
    )
    @pytest.mark.parametrize(
        'usefast',
        [
            True,
            False,
        ],
    )
    def test_demo_default(self, model_name, usefast):
        tokenizer = AutoTokenizer.from_pretrained(model_name,
                                                  trust_remote_code=True,
                                                  use_fast=usefast)
        model = AutoModel.from_pretrained(
            model_name,
            device_map='cuda',
            torch_dtype=torch.float16,
            trust_remote_code=True,
        )
        tokenizer = AutoTokenizer.from_pretrained(model_name,
                                                  trust_remote_code=True)
        chat_1 = [{
            'role': 'user',
            'content': "Hello! What's your name?"
        }, {
            'role':
            'assistant',
            'content':
            'I am InternLM2! A helpful AI assistant. What can I do for you?'
        }]
        chat_2 = [{
            'role': 'user',
            'content': "Hello! What's your name?"
        }, {
            'role': 'assistant',
            'content': 'I have no idea.'
        }]

        # get reward score for a single chat
        score1 = model.get_score(tokenizer, chat_1)
        score2 = model.get_score(tokenizer, chat_2)
        print('score1: ', score1)
        print('score2: ', score2)
        assert score1 > 0
        assert score2 < 0

        # batch inference, get multiple scores at once
        scores = model.get_scores(tokenizer, [chat_1, chat_2])
        print('scores: ', scores)
        assert scores[0] > 0
        assert scores[1] < 0

        # compare whether chat_1 is better than chat_2
        compare_res = model.compare(tokenizer, chat_1, chat_2)
        print('compare_res: ', compare_res)
        assert compare_res
        # >>> compare_res: True

        # rank multiple chats, it will return the ranking index of each chat
        # the chat with the highest score will have ranking index as 0
        rank_res = model.rank(tokenizer, [chat_1, chat_2])
        print('rank_res: ', rank_res)  # lower index means higher score
        # >>> rank_res: [0, 1]
        assert rank_res[0] == 0
        assert rank_res[1] == 1
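
# The InternLM-XComposer-2.5 cases below exercise the multimodal demos:
# video understanding, multi-image comparison, high-resolution image analysis,
# instruction-to-webpage, resume-to-webpage, screenshot-to-webpage and
# article writing, each checked with simple length/keyword assertions.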


class TestXcomposer2d5Model:
    """Test cases for InternLM-XComposer-2.5 model."""

    @pytest.mark.parametrize(
        'model_name',
        [
            'internlm/internlm-xcomposer2d5-7b',
        ],
    )
    def test_video_understanding(self, model_name):
        torch.set_grad_enabled(False)
        # init model and tokenizer
        model = AutoModel.from_pretrained(
            model_name, torch_dtype=torch.bfloat16,
            trust_remote_code=True).cuda().eval().half()
        tokenizer = AutoTokenizer.from_pretrained(model_name,
                                                  trust_remote_code=True)
        model.tokenizer = tokenizer
        query = 'Here are some frames of a video. Describe this video in detail'  # noqa: F401, E501
        image = [
            '/mnt/petrelfs/qa-caif-cicd/github_runner/examples/liuxiang.mp4',
        ]
        with torch.autocast(device_type='cuda', dtype=torch.float16):
            response, his = model.chat(tokenizer,
                                       query,
                                       image,
                                       do_sample=False,
                                       num_beams=3,
                                       use_meta=True)
        print(response)
        assert len(response) > 100
        assert 'athlete' in response.lower()

        query = 'tell me the athlete code of Liu Xiang'
        image = [
            '/mnt/petrelfs/qa-caif-cicd/github_runner/examples/liuxiang.mp4',
        ]
        with torch.autocast(device_type='cuda', dtype=torch.float16):
            response, _ = model.chat(tokenizer,
                                     query,
                                     image,
                                     history=his,
                                     do_sample=False,
                                     num_beams=3,
                                     use_meta=True)
        print(response)
        assert len(response) > 10
        assert '1363' in response.lower()

    @pytest.mark.parametrize(
        'model_name',
        [
            'internlm/internlm-xcomposer2d5-7b',
        ],
    )
    def test_multi_image_understanding(self, model_name):
        torch.set_grad_enabled(False)
        # init model and tokenizer
        model = AutoModel.from_pretrained(
            model_name, torch_dtype=torch.bfloat16,
            trust_remote_code=True).cuda().eval().half()
        tokenizer = AutoTokenizer.from_pretrained(model_name,
                                                  trust_remote_code=True)
        model.tokenizer = tokenizer
        query = 'Image1 <ImageHere>; Image2 <ImageHere>; Image3 <ImageHere>; I want to buy a car from the three given cars, analyze their advantages and weaknesses one by one'  # noqa: F401, E501
        image = [
            '/mnt/petrelfs/qa-caif-cicd/github_runner/examples/cars1.jpg',
            '/mnt/petrelfs/qa-caif-cicd/github_runner/examples/cars2.jpg',
            '/mnt/petrelfs/qa-caif-cicd/github_runner/examples/cars3.jpg',
        ]
        with torch.autocast(device_type='cuda', dtype=torch.float16):
            response, his = model.chat(tokenizer,
                                       query,
                                       image,
                                       do_sample=False,
                                       num_beams=3,
                                       use_meta=True)
        print(response)
        assert len(response) > 100
        assert 'car' in response.lower()

        query = 'Image4 <ImageHere>; How about the car in Image4'
        image.append(
            '/mnt/petrelfs/qa-caif-cicd/github_runner/examples/cars4.jpg')
        with torch.autocast(device_type='cuda', dtype=torch.float16):
            response, _ = model.chat(tokenizer,
                                     query,
                                     image,
                                     do_sample=False,
                                     num_beams=3,
                                     history=his,
                                     use_meta=True)
        print(response)
        assert len(response) > 10
        assert 'ferrari' in response.lower()

    @pytest.mark.parametrize(
        'model_name',
        [
            'internlm/internlm-xcomposer2d5-7b',
        ],
    )
    def test_high_resolution_default(self, model_name):
        torch.set_grad_enabled(False)
        # init model and tokenizer
        model = AutoModel.from_pretrained(
            model_name, torch_dtype=torch.bfloat16,
            trust_remote_code=True).cuda().eval().half()
        tokenizer = AutoTokenizer.from_pretrained(model_name,
                                                  trust_remote_code=True)
        model.tokenizer = tokenizer
        query = 'Analyze the given image in a detail manner'
        image = ['/mnt/petrelfs/qa-caif-cicd/github_runner/examples/dubai.png']
        with torch.autocast(device_type='cuda', dtype=torch.float16):
            response, _ = model.chat(tokenizer,
                                     query,
                                     image,
                                     do_sample=False,
                                     num_beams=3,
                                     use_meta=True)
        print(response)
        assert len(response) > 100
        assert 'dubai' in response.lower()

    @pytest.mark.parametrize(
        'model_name',
        [
            'internlm/internlm-xcomposer2d5-7b',
        ],
    )
    def test_introduce_web_default(self, model_name):
        torch.set_grad_enabled(False)
        # init model and tokenizer
        model = AutoModel.from_pretrained(
            model_name, torch_dtype=torch.bfloat16,
            trust_remote_code=True).cuda().eval()
        tokenizer = AutoTokenizer.from_pretrained(model_name,
                                                  trust_remote_code=True)
        model.tokenizer = tokenizer
        query = '''A website for Research institutions. The name is Shanghai
        AI lab. Top Navigation Bar is blue.Below left, an image shows the
        logo of the lab. In the right, there is a passage of text below that
        describes the mission of the laboratory.There are several images to
        show the research projects of Shanghai AI lab.'''
        with torch.autocast(device_type='cuda', dtype=torch.float16):
            response = model.write_webpage(
                query,
                seed=202,
                task='Instruction-aware Webpage Generation',
                repetition_penalty=3.0)
        print(response)
        assert len(response) > 100
        assert is_html_code(response)
        assert 'href' in response.lower()

    @pytest.mark.parametrize(
        'model_name',
        [
            'internlm/internlm-xcomposer2d5-7b',
        ],
    )
    def test_resume_to_webset_default(self, model_name):
        torch.set_grad_enabled(False)
        # init model and tokenizer
        model = AutoModel.from_pretrained(
            model_name, torch_dtype=torch.bfloat16,
            trust_remote_code=True).cuda().eval()
        tokenizer = AutoTokenizer.from_pretrained(model_name,
                                                  trust_remote_code=True)
        model.tokenizer = tokenizer
        # the input should be a resume in markdown format
        query = '/mnt/petrelfs/qa-caif-cicd/github_runner/examples/resume.md'
        with torch.autocast(device_type='cuda', dtype=torch.float16):
            response = model.resume_2_webpage(query,
                                              seed=202,
                                              repetition_penalty=3.0)
        print(response)
        assert len(response) > 100
        assert is_html_code(response)
        assert 'href' in response.lower()

    @pytest.mark.parametrize(
        'model_name',
        [
            'internlm/internlm-xcomposer2d5-7b',
        ],
    )
    def test_screen_to_webset_default(self, model_name):
        torch.set_grad_enabled(False)
        # init model and tokenizer
        model = AutoModel.from_pretrained(
            model_name, torch_dtype=torch.bfloat16,
            trust_remote_code=True).cuda().eval()
        tokenizer = AutoTokenizer.from_pretrained(model_name,
                                                  trust_remote_code=True)
        model.tokenizer = tokenizer
        query = 'Generate the HTML code of this web image with Tailwind CSS.'
        image = [
            '/mnt/petrelfs/qa-caif-cicd/github_runner/examples/screenshot.jpg'
        ]
        with torch.autocast(device_type='cuda', dtype=torch.float16):
            response = model.screen_2_webpage(query,
                                              image,
                                              seed=202,
                                              repetition_penalty=3.0)
        print(response)
        assert len(response) > 100
        assert is_html_code(response)
        assert 'href' in response.lower()

    @pytest.mark.parametrize(
        'model_name',
        [
            'internlm/internlm-xcomposer2d5-7b',
        ],
    )
    def test_write_artical_default(self, model_name):
        torch.set_grad_enabled(False)
        # init model and tokenizer
        model = AutoModel.from_pretrained(
            'internlm/internlm-xcomposer2d5-7b',
            torch_dtype=torch.bfloat16,
            trust_remote_code=True).cuda().eval()
        tokenizer = AutoTokenizer.from_pretrained(
            'internlm/internlm-xcomposer2d5-7b', trust_remote_code=True)
        model.tokenizer = tokenizer
        query = '''阅读下面的材料，根据要求写作。
        电影《长安三万里》的出现让人感慨，影片并未将重点全落在大唐风华上，
        也展现了恢弘气象的阴暗面，即旧门阀的资源垄断、朝政的日益衰败与青年才俊的壮志难酬。
        高适仕进无门，只能回乡沉潜修行；李白虽得玉真公主举荐，擢入翰林，
        但他只是成为唐玄宗的御用文人，不能真正实现有益于朝政的志意。
        然而，片中高潮部分《将进酒》一节，人至中年、挂着肚腩的李白，
        引众人乘仙鹤上天，一路从水面、瀑布飞升至银河进入仙宫，
        李白狂奔着与仙人们碰杯，最后大家纵身飞向漩涡般的九重天。
        肉身的微贱、世路的坎坷，拘不住精神的高蹈。"天生我材必有用，千金散尽还复来。"
        古往今来，身处闲顿、遭受挫折、被病痛折磨，很多人都曾经历了人生的"失意"，
        却反而成就了他们"诗意"的人生。对正在追求人生价值的当代青年来说，
        如何对待人生中的缺憾和困顿？诗意人生中又有怎样的自我坚守和自我认同？
        请结合"失意""诗意"这两个关键词写一篇文章。
        要求：选准角度，确定立意，明确文体，自拟标题；不要套作，不得抄袭；
        不得泄露个人信息；不少于 800 字。'''
        with torch.autocast(device_type='cuda', dtype=torch.float16):
            response = model.write_artical(query, seed=8192)
        print(response)
        assert len(response) > 100
        assert '。' in response and '，' in response

        query = '''Please write a blog based on the title: French Pastries:
        A Sweet Indulgence'''
        with torch.autocast(device_type='cuda', dtype=torch.float16):
            response = model.write_artical(query, seed=8192)
        print(response)
        assert len(response) > 100
        assert ' ' in response and 'a' in response


def is_html_code(html_code):
    """Return True if the text parses as HTML and contains an <html> tag."""
    try:
        soup = BeautifulSoup(html_code, 'lxml')
        if soup.find('html'):
            print('HTML appears to be well-formed.')
            return True
        else:
            print('There was an issue with the HTML structure.')
            return False
    except Exception as e:
        print('Error parsing HTML:', str(e))
        return False


class TestChatAwq:
    """Test cases for chat model."""

    @pytest.mark.parametrize(
        'model_name',
        ['internlm/internlm2-chat-20b-4bits'],
    )
    def test_demo_default(self, model_name):
        engine_config = TurbomindEngineConfig(model_format='awq')
        pipe = pipeline('internlm/internlm2-chat-20b-4bits',
                        backend_config=engine_config)
        responses = pipe(['Hi, pls intro yourself', 'Shanghai is'])
        print(responses)
        for response in responses:
            assert_model(response.text)
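
These cases run in CI through the srun + pytest command in the workflow above. As a rough local sketch (assuming the same conda environment and access to the model weights and example files), the newly added classes can be selected by name:

    pytest -s -v --color=yes ./tests/test_hf_model.py -k "TestReward or TestXcomposer2d5Model"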