diff --git a/tests/test_hf_model.py b/tests/test_hf_model.py
index 092cb59..dc9ed16 100644
--- a/tests/test_hf_model.py
+++ b/tests/test_hf_model.py
@@ -161,164 +161,6 @@ class TestMath:
         assert '2' in response
 
 
-class TestReward:
-    """Test cases for base model."""
-
-    @pytest.mark.parametrize(
-        'model_name',
-        [
-            'internlm/internlm-reward-1_8b', 'internlm/internlm-reward-7b',
-            'internlm/internlm-reward-20b'
-        ],
-    )
-    @pytest.mark.parametrize(
-        'usefast',
-        [
-            True,
-            False,
-        ],
-    )
-    def test_demo_default(self, model_name, usefast):
-        tokenizer = AutoTokenizer.from_pretrained(model_name,
-                                                  trust_remote_code=True,
-                                                  use_fast=usefast)
-        model = AutoModel.from_pretrained(
-            model_name,
-            device_map='cuda',
-            torch_dtype=torch.float16,
-            trust_remote_code=True,
-        )
-        tokenizer = AutoTokenizer.from_pretrained(model_name,
-                                                  trust_remote_code=True)
-
-        chat_1 = [{
-            'role': 'user',
-            'content': "Hello! What's your name?"
-        }, {
-            'role':
-            'assistant',
-            'content':
-            'I am InternLM2! A helpful AI assistant. What can I do for you?'
-        }]
-        chat_2 = [{
-            'role': 'user',
-            'content': "Hello! What's your name?"
-        }, {
-            'role': 'assistant',
-            'content': 'I have no idea.'
-        }]
-
-        # get reward score for a single chat
-        score1 = model.get_score(tokenizer, chat_1)
-        score2 = model.get_score(tokenizer, chat_2)
-        print('score1: ', score1)
-        print('score2: ', score2)
-        assert score1 > 0
-        assert score2 < 0
-
-        # batch inference, get multiple scores at once
-        scores = model.get_scores(tokenizer, [chat_1, chat_2])
-        print('scores: ', scores)
-        assert scores[0] > 0
-        assert scores[1] < 0
-
-        # compare whether chat_1 is better than chat_2
-        compare_res = model.compare(tokenizer, chat_1, chat_2)
-        print('compare_res: ', compare_res)
-        assert compare_res
-        # >>> compare_res: True
-
-        # rank multiple chats, it will return the ranking index of each chat
-        # the chat with the highest score will have ranking index as 0
-        rank_res = model.rank(tokenizer, [chat_1, chat_2])
-        print('rank_res: ', rank_res)  # lower index means higher score
-        # >>> rank_res: [0, 1]
-        assert rank_res[0] == 0
-        assert rank_res[1] == 1
-
-    @pytest.mark.parametrize(
-        'model_name',
-        [
-            'internlm/internlm-reward-1_8b', 'internlm/internlm-reward-7b',
-            'internlm/internlm-reward-20b'
-        ],
-    )
-    @pytest.mark.parametrize(
-        'usefast',
-        [
-            True,
-            False,
-        ],
-    )
-    def test_demo_topn(self, model_name, usefast):
-        # prepare the llm model and tokenizer
-        llm = AutoModel.from_pretrained(
-            'internlm/internlm2-chat-7b',
-            device_map='cuda',
-            torch_dtype=torch.float16,
-            trust_remote_code=True,
-        )
-        llm_tokenizer = AutoTokenizer.from_pretrained(
-            'internlm/internlm2-chat-7b', trust_remote_code=True)
-
-        # prepare the reward model and tokenizer
-        reward = AutoModel.from_pretrained(
-            model_name,
-            device_map='cuda',
-            torch_dtype=torch.float16,
-            trust_remote_code=True,
-        )
-        reward_tokenizer = AutoTokenizer.from_pretrained(
-            model_name, trust_remote_code=True)
-
-        # prepare the chat prompt
-        prompt = 'Write an short bedtime story.'
-        messages = [{
-            'role': 'system',
-            'content': 'You are a helpful assistant.'
-        }, {
-            'role': 'user',
-            'content': prompt
-        }]
-        text = llm_tokenizer.apply_chat_template(messages,
-                                                 tokenize=False,
-                                                 add_generation_prompt=True)
-        model_inputs = llm_tokenizer([text], return_tensors='pt').to('cuda')
-
-        # generate best of N candidates
-        num_candidates = 3  # N=3
-        candidates = []
-
-        outputs = llm.generate(
-            **model_inputs,
-            max_new_tokens=512,
-            num_return_sequences=num_candidates,
-            pad_token_id=llm_tokenizer.eos_token_id,
-            do_sample=True,
-            top_k=50,
-            top_p=0.95,
-            temperature=0.8,
-        )
-        outputs = outputs[:, model_inputs['input_ids'].shape[1]:]
-        for i in range(num_candidates):
-            candidate = llm_tokenizer.decode(outputs[i],
-                                             skip_special_tokens=True)
-            candidates.append(messages + [{
-                'role': 'assistant',
-                'content': candidate
-            }])
-
-        rank_indices = reward.rank(reward_tokenizer, candidates)
-        sorted_candidates = sorted(zip(rank_indices, candidates),
-                                   key=lambda x: x[0])
-
-        # print the best response
-        best_response = sorted_candidates[0][1][-1]['content']
-        print(sorted_candidates)
-        print(best_response)
-        assert len(sorted_candidates) == 3
-
-
 class TestMMModel:
     """Test cases for base model."""