mirror of https://github.com/InternLM/InternLM
[ci] add reward model into testcase (#769)
Co-authored-by: zhulin1 <zhulin1@pjlab.org.cn>
parent 6920fa080d
commit 4fbc98912c
@@ -36,12 +36,20 @@ jobs:
          conda create -n internlm-model-latest --clone ${CONDA_BASE_ENV}
          source activate internlm-model-latest
          pip install transformers
      - name: install torch
        run: |
          source activate internlm-model-latest
          pip install /mnt/petrelfs/qa-caif-cicd/resource/flash_attn-2.5.8+cu118torch2.2cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
          pip install torch==2.2.2 torchvision==0.17.2 --index-url https://download.pytorch.org/whl/cu118
          pip install sentencepiece auto-gptq==0.6.0 beautifulsoup4 decord lxml
          export LMDEPLOY_VERSION=0.5.0
          export PYTHON_VERSION=310
          pip install https://github.com/InternLM/lmdeploy/releases/download/v${LMDEPLOY_VERSION}/lmdeploy-${LMDEPLOY_VERSION}+cu118-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux2014_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu118
          python3 -m pip list
          conda deactivate
      - name: run_test
        run: |
          source activate internlm-model-latest
          pip install torch==2.2.2 torchvision==0.17.2 --index-url https://download.pytorch.org/whl/cu118
          pip install /mnt/petrelfs/qa-caif-cicd/resource/flash_attn-2.5.8+cu118torch2.2cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
          pip install sentencepiece auto-gptq==0.6.0 lmdeploy[all]
          srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} --gpus-per-task=2 pytest -s -v --color=yes ./tests/test_hf_model.py
          conda deactivate
      - name: remove_env
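A quick way to confirm the pinned versions after the install steps above is to query them from the activated environment; a minimal Python sketch (it assumes the wheels installed cleanly into internlm-model-latest):

    from importlib.metadata import version
    import torch

    # Should report 0.5.0 and a 2.2.2+cu118 build, matching the pins in the workflow.
    print(version('lmdeploy'))
    print(torch.__version__)
    print(torch.cuda.is_available())
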
@@ -1,6 +1,7 @@
import pytest
import torch
from auto_gptq.modeling import BaseGPTQForCausalLM
from bs4 import BeautifulSoup
from lmdeploy import TurbomindEngineConfig, pipeline
from PIL import Image
from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer
@@ -22,6 +23,7 @@ class TestChat:
        'model_name',
        [
            'internlm/internlm2_5-7b-chat', 'internlm/internlm2_5-7b-chat-1m',
            'internlm/internlm2_5-20b-chat', 'internlm/internlm2_5-1_8b-chat',
            'internlm/internlm2-chat-7b', 'internlm/internlm2-chat-7b-sft',
            'internlm/internlm2-chat-20b', 'internlm/internlm2-chat-20b-sft',
            'internlm/internlm2-chat-1_8b', 'internlm/internlm2-chat-1_8b-sft'
@@ -61,23 +63,6 @@ class TestChat:
        assert_model(response)


class TestChatAwq:
    """Test cases for chat model."""

    @pytest.mark.parametrize(
        'model_name',
        ['internlm/internlm2-chat-20b-4bits'],
    )
    def test_demo_default(self, model_name):
        engine_config = TurbomindEngineConfig(model_format='awq')
        pipe = pipeline('internlm/internlm2-chat-20b-4bits',
                        backend_config=engine_config)
        responses = pipe(['Hi, pls intro yourself', 'Shanghai is'])
        print(responses)
        for response in responses:
            assert_model(response.text)


class TestBase:
    """Test cases for base model."""

@@ -86,7 +71,8 @@ class TestBase:
        [
            'internlm/internlm2_5-7b', 'internlm/internlm2-7b',
            'internlm/internlm2-base-7b', 'internlm/internlm2-20b',
            'internlm/internlm2-base-20b', 'internlm/internlm2-1_8b'
            'internlm/internlm2-base-20b', 'internlm/internlm2-1_8b',
            'internlm/internlm2_5-20b',
        ],
    )
    @pytest.mark.parametrize(
@@ -279,3 +265,373 @@ class InternLMXComposer2QForCausalLM(BaseGPTQForCausalLM):
        ['feed_forward.w1.linear', 'feed_forward.w3.linear'],
        ['feed_forward.w2.linear'],
    ]


class TestReward:
    """Test cases for reward model."""

    @pytest.mark.parametrize(
        'model_name',
        [
            'internlm/internlm2-1_8b-reward', 'internlm/internlm2-7b-reward',
            'internlm/internlm2-20b-reward'
        ],
    )
    @pytest.mark.parametrize(
        'usefast',
        [
            True,
            False,
        ],
    )
    def test_demo_default(self, model_name, usefast):
        tokenizer = AutoTokenizer.from_pretrained(model_name,
                                                  trust_remote_code=True,
                                                  use_fast=usefast)
        model = AutoModel.from_pretrained(
            model_name,
            device_map='cuda',
            torch_dtype=torch.float16,
            trust_remote_code=True,
        )

        chat_1 = [{
            'role': 'user',
            'content': "Hello! What's your name?"
        }, {
            'role':
            'assistant',
            'content':
            'I am InternLM2! A helpful AI assistant. What can I do for you?'
        }]
        chat_2 = [{
            'role': 'user',
            'content': "Hello! What's your name?"
        }, {
            'role': 'assistant',
            'content': 'I have no idea.'
        }]

        # get reward score for a single chat
        score1 = model.get_score(tokenizer, chat_1)
        score2 = model.get_score(tokenizer, chat_2)
        print('score1: ', score1)
        print('score2: ', score2)
        assert score1 > 0
        assert score2 < 0

        # batch inference, get multiple scores at once
        scores = model.get_scores(tokenizer, [chat_1, chat_2])
        print('scores: ', scores)
        assert scores[0] > 0
        assert scores[1] < 0

        # compare whether chat_1 is better than chat_2
        compare_res = model.compare(tokenizer, chat_1, chat_2)
        print('compare_res: ', compare_res)
        assert compare_res
        # >>> compare_res: True

        # rank multiple chats; it returns the ranking index of each chat,
        # and the chat with the highest score gets ranking index 0
        rank_res = model.rank(tokenizer, [chat_1, chat_2])
        print('rank_res: ', rank_res)  # lower index means higher score
        # >>> rank_res: [0, 1]
        assert rank_res[0] == 0
        assert rank_res[1] == 1

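# A minimal sketch for running just the reward cases above from a local
# checkout, assuming the same environment as the CI job (equivalent to
# `pytest -s -v -k TestReward ./tests/test_hf_model.py` on the command line):
#
#   import pytest
#   pytest.main(['-s', '-v', '-k', 'TestReward', './tests/test_hf_model.py'])
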
class TestXcomposer2d5Model:
    """Test cases for the InternLM-XComposer-2.5 model."""

    @pytest.mark.parametrize(
        'model_name',
        [
            'internlm/internlm-xcomposer2d5-7b',
        ],
    )
    def test_video_understanding(self, model_name):
        torch.set_grad_enabled(False)

        # init model and tokenizer
        model = AutoModel.from_pretrained(
            model_name, torch_dtype=torch.bfloat16,
            trust_remote_code=True).cuda().eval().half()
        tokenizer = AutoTokenizer.from_pretrained(model_name,
                                                  trust_remote_code=True)
        model.tokenizer = tokenizer

        query = 'Here are some frames of a video. Describe this video in detail'  # noqa: F401, E501
        image = [
            '/mnt/petrelfs/qa-caif-cicd/github_runner/examples/liuxiang.mp4',
        ]

        with torch.autocast(device_type='cuda', dtype=torch.float16):
            response, his = model.chat(tokenizer,
                                       query,
                                       image,
                                       do_sample=False,
                                       num_beams=3,
                                       use_meta=True)
        print(response)
        assert len(response) > 100
        assert 'athlete' in response.lower()

        query = 'tell me the athlete code of Liu Xiang'
        image = [
            '/mnt/petrelfs/qa-caif-cicd/github_runner/examples/liuxiang.mp4',
        ]
        with torch.autocast(device_type='cuda', dtype=torch.float16):
            response, _ = model.chat(tokenizer,
                                     query,
                                     image,
                                     history=his,
                                     do_sample=False,
                                     num_beams=3,
                                     use_meta=True)
        print(response)
        assert len(response) > 10
        assert '1363' in response.lower()

    @pytest.mark.parametrize(
        'model_name',
        [
            'internlm/internlm-xcomposer2d5-7b',
        ],
    )
    def test_multi_image_understanding(self, model_name):
        torch.set_grad_enabled(False)

        # init model and tokenizer
        model = AutoModel.from_pretrained(
            model_name, torch_dtype=torch.bfloat16,
            trust_remote_code=True).cuda().eval().half()
        tokenizer = AutoTokenizer.from_pretrained(model_name,
                                                  trust_remote_code=True)
        model.tokenizer = tokenizer

        query = 'Image1 <ImageHere>; Image2 <ImageHere>; Image3 <ImageHere>; I want to buy a car from the three given cars, analyze their advantages and weaknesses one by one'  # noqa: F401, E501
        image = [
            '/mnt/petrelfs/qa-caif-cicd/github_runner/examples/cars1.jpg',
            '/mnt/petrelfs/qa-caif-cicd/github_runner/examples/cars2.jpg',
            '/mnt/petrelfs/qa-caif-cicd/github_runner/examples/cars3.jpg',
        ]
        with torch.autocast(device_type='cuda', dtype=torch.float16):
            response, his = model.chat(tokenizer,
                                       query,
                                       image,
                                       do_sample=False,
                                       num_beams=3,
                                       use_meta=True)
        print(response)
        assert len(response) > 100
        assert 'car' in response.lower()

        query = 'Image4 <ImageHere>; How about the car in Image4'
        image.append(
            '/mnt/petrelfs/qa-caif-cicd/github_runner/examples/cars4.jpg')
        with torch.autocast(device_type='cuda', dtype=torch.float16):
            response, _ = model.chat(tokenizer,
                                     query,
                                     image,
                                     do_sample=False,
                                     num_beams=3,
                                     history=his,
                                     use_meta=True)
        print(response)
        assert len(response) > 10
        assert 'ferrari' in response.lower()

    @pytest.mark.parametrize(
        'model_name',
        [
            'internlm/internlm-xcomposer2d5-7b',
        ],
    )
    def test_high_resolution_default(self, model_name):
        torch.set_grad_enabled(False)

        # init model and tokenizer
        model = AutoModel.from_pretrained(
            model_name, torch_dtype=torch.bfloat16,
            trust_remote_code=True).cuda().eval().half()
        tokenizer = AutoTokenizer.from_pretrained(model_name,
                                                  trust_remote_code=True)
        model.tokenizer = tokenizer

        query = 'Analyze the given image in a detail manner'
        image = ['/mnt/petrelfs/qa-caif-cicd/github_runner/examples/dubai.png']
        with torch.autocast(device_type='cuda', dtype=torch.float16):
            response, _ = model.chat(tokenizer,
                                     query,
                                     image,
                                     do_sample=False,
                                     num_beams=3,
                                     use_meta=True)
        print(response)
        assert len(response) > 100
        assert 'dubai' in response.lower()

    @pytest.mark.parametrize(
        'model_name',
        [
            'internlm/internlm-xcomposer2d5-7b',
        ],
    )
    def test_introduce_web_default(self, model_name):
        torch.set_grad_enabled(False)
        # init model and tokenizer
        model = AutoModel.from_pretrained(
            model_name, torch_dtype=torch.bfloat16,
            trust_remote_code=True).cuda().eval()
        tokenizer = AutoTokenizer.from_pretrained(model_name,
                                                  trust_remote_code=True)
        model.tokenizer = tokenizer

        query = '''A website for Research institutions. The name is Shanghai
        AI lab. Top Navigation Bar is blue.Below left, an image shows the
        logo of the lab. In the right, there is a passage of text below that
        describes the mission of the laboratory.There are several images to
        show the research projects of Shanghai AI lab.'''
        with torch.autocast(device_type='cuda', dtype=torch.float16):
            response = model.write_webpage(
                query,
                seed=202,
                task='Instruction-aware Webpage Generation',
                repetition_penalty=3.0)
        print(response)
        assert len(response) > 100
        assert is_html_code(response)
        assert 'href' in response.lower()

    @pytest.mark.parametrize(
        'model_name',
        [
            'internlm/internlm-xcomposer2d5-7b',
        ],
    )
    def test_resume_to_webset_default(self, model_name):
        torch.set_grad_enabled(False)

        # init model and tokenizer
        model = AutoModel.from_pretrained(
            model_name, torch_dtype=torch.bfloat16,
            trust_remote_code=True).cuda().eval()
        tokenizer = AutoTokenizer.from_pretrained(model_name,
                                                  trust_remote_code=True)
        model.tokenizer = tokenizer

        # the input should be a resume in markdown format
        query = '/mnt/petrelfs/qa-caif-cicd/github_runner/examples/resume.md'
        with torch.autocast(device_type='cuda', dtype=torch.float16):
            response = model.resume_2_webpage(query,
                                              seed=202,
                                              repetition_penalty=3.0)
        print(response)
        assert len(response) > 100
        assert is_html_code(response)
        assert 'href' in response.lower()

    @pytest.mark.parametrize(
        'model_name',
        [
            'internlm/internlm-xcomposer2d5-7b',
        ],
    )
    def test_screen_to_webset_default(self, model_name):
        torch.set_grad_enabled(False)

        # init model and tokenizer
        model = AutoModel.from_pretrained(
            model_name, torch_dtype=torch.bfloat16,
            trust_remote_code=True).cuda().eval()
        tokenizer = AutoTokenizer.from_pretrained(model_name,
                                                  trust_remote_code=True)
        model.tokenizer = tokenizer

        query = 'Generate the HTML code of this web image with Tailwind CSS.'
        image = [
            '/mnt/petrelfs/qa-caif-cicd/github_runner/examples/screenshot.jpg'
        ]
        with torch.autocast(device_type='cuda', dtype=torch.float16):
            response = model.screen_2_webpage(query,
                                              image,
                                              seed=202,
                                              repetition_penalty=3.0)
        print(response)
        assert len(response) > 100
        assert is_html_code(response)
        assert 'href' in response.lower()

    @pytest.mark.parametrize(
        'model_name',
        [
            'internlm/internlm-xcomposer2d5-7b',
        ],
    )
    def test_write_artical_default(self, model_name):
        torch.set_grad_enabled(False)

        # init model and tokenizer
        model = AutoModel.from_pretrained(
            model_name,
            torch_dtype=torch.bfloat16,
            trust_remote_code=True).cuda().eval()
        tokenizer = AutoTokenizer.from_pretrained(
            model_name, trust_remote_code=True)
        model.tokenizer = tokenizer

        query = '''阅读下面的材料,根据要求写作。 电影《长安三万里》的出现让人感慨,影片并未将重点全落在大唐风华上,
        也展现了恢弘气象的阴暗面,即旧门阀的资源垄断、朝政的日益衰败与青年才俊的壮志难酬。高适仕进无门,只能回乡沉潜修行。
        李白虽得玉真公主举荐,擢入翰林,但他只是成为唐玄宗的御用文人,不能真正实现有益于朝政的志意。然而,片中高潮部分《将进酒》一节,
        人至中年、挂着肚腩的李白引众人乘仙鹤上天,一路从水面、瀑布飞升至银河进入仙宫,李白狂奔着与仙人们碰杯,最后大家纵身飞向漩涡般的九重天。
        肉身的微贱、世路的坎坷,拘不住精神的高蹈。“天生我材必有用,千金散尽还复来。” 古往今来,身处闲顿、遭受挫折、被病痛折磨,
        很多人都曾经历了人生的“失意”,却反而成就了他们“诗意”的人生。对正在追求人生价值的当代青年来说,如何对待人生中的缺憾和困顿?诗意人生中又
        有怎样的自我坚守和自我认同?请结合“失意”与“诗意”这两个关键词写一篇文章。 要求:选准角度,确定立意,明确文体,自拟标题;不要套作,不得抄
        袭;不得泄露个人信息;不少于 800 字。'''
        with torch.autocast(device_type='cuda', dtype=torch.float16):
            response = model.write_artical(query, seed=8192)
        print(response)
        assert len(response) > 100
        assert '。' in response and '诗' in response

        query = '''Please write a blog based on the title: French Pastries:
        A Sweet Indulgence'''
        with torch.autocast(device_type='cuda', dtype=torch.float16):
            response = model.write_artical(query, seed=8192)
        print(response)
        assert len(response) > 100
        assert ' ' in response and 'a' in response


def is_html_code(html_code):
    try:
        soup = BeautifulSoup(html_code, 'lxml')
        if soup.find('html'):
            print('HTML appears to be well-formed.')
            return True
        else:
            print('There was an issue with the HTML structure.')
            return False
    except Exception as e:
        print('Error parsing HTML:', str(e))
        return False

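# Note on the helper above: the 'lxml' parser is lenient and wraps bare
# fragments in <html>...</html>, so is_html_code() mainly guards against
# parse failures rather than strictly validating markup. A quick sanity
# check (a sketch, not part of the committed tests):
#
#   assert is_html_code('<html><body><a href="#">x</a></body></html>')
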
class TestChatAwq:
    """Test cases for chat model."""

    @pytest.mark.parametrize(
        'model_name',
        ['internlm/internlm2-chat-20b-4bits'],
    )
    def test_demo_default(self, model_name):
        engine_config = TurbomindEngineConfig(model_format='awq')
        pipe = pipeline('internlm/internlm2-chat-20b-4bits',
                        backend_config=engine_config)
        responses = pipe(['Hi, pls intro yourself', 'Shanghai is'])
        print(responses)
        for response in responses:
            assert_model(response.text)