diff --git a/agent/README.md b/agent/README.md
index 1ed009c..d135b97 100644
--- a/agent/README.md
+++ b/agent/README.md
@@ -32,7 +32,8 @@ python streaming_inference.py \
   --backend=lmdeploy \  # For HuggingFace models: hf
   --model_path=internlm/internlm2-chat-20b \
   --tp=2 \
-  --temperature=0.0 \
+  --temperature=1.0 \
+  --top_k=1 \
   --dataset=math \
   --output_path=math_lmdeploy.jsonl \
   --do_eval
diff --git a/agent/README_zh-CN.md b/agent/README_zh-CN.md
index a5d41ee..10981e3 100644
--- a/agent/README_zh-CN.md
+++ b/agent/README_zh-CN.md
@@ -32,7 +32,8 @@ python streaming_inference.py \
   --backend=lmdeploy \  # For HuggingFace models: hf
   --model_path=internlm/internlm2-chat-20b \
   --tp=2 \
-  --temperature=0.0 \
+  --temperature=1.0 \
+  --top_k=1 \
   --dataset=math \
   --output_path=math_lmdeploy.jsonl \
   --do_eval
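Reviewer note (not part of the patch): both READMEs swap `--temperature=0.0` for `--temperature=1.0` plus `--top_k=1`. Restricting sampling to the single highest-probability token is greedy decoding, so the command stays deterministic while sidestepping backends that reject a temperature of exactly 0. A minimal numpy sketch (not lmdeploy's actual sampler; `sample` is illustrative) of why `top_k=1` makes the temperature irrelevant:

```python
import numpy as np


def sample(logits: np.ndarray, temperature: float, top_k: int) -> int:
    """Toy temperature + top-k sampling over one logit vector."""
    logits = logits / temperature              # temperature rescaling
    top = np.argsort(logits)[-top_k:]          # keep the k best token ids
    probs = np.exp(logits[top] - logits[top].max())
    probs /= probs.sum()
    return int(np.random.choice(top, p=probs))


logits = np.array([0.2, 3.1, -1.0, 0.9])
# With top_k=1 the candidate set has a single token, so any temperature
# (here 1.0) returns the argmax deterministically.
assert all(sample(logits, 1.0, 1) == 1 for _ in range(10))
```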
diff --git a/agent/streaming_inference.py b/agent/streaming_inference.py
index fc36b34..519ac91 100644
--- a/agent/streaming_inference.py
+++ b/agent/streaming_inference.py
@@ -30,7 +30,7 @@ import os
 import re
 import sys
 import traceback
-from math import isclose
+from math import isclose, ceil
 from typing import Union
 
 import jsonlines
@@ -38,28 +38,38 @@ import numpy as np
 from datasets import load_dataset
 from lagent import (INTERNLM2_META, ActionExecutor, HFTransformer,
                     Internlm2Agent, Internlm2Protocol, LMDeployPipeline,
-                    get_tool)
+                    IPythonInteractiveManager)
 from pebble import ProcessPool
 from sympy import N, simplify
 from sympy.parsing.latex import parse_latex
 from sympy.parsing.sympy_parser import parse_expr
 from tqdm import tqdm
 
+# --------------------- modify the system prompt as needed ---------------------
+# DEFAULT_PROMPT = (
+#     'Integrate step-by-step reasoning and Python code to solve math problems '
+#     'using the following guidelines:\n'
+#     '- Just write jupyter code to solve the problem without giving your thought;\n'
+#     r"- Present the final result in LaTeX using a '\boxed{{}}' without any "
+#     'units. \n')
+
 DEFAULT_PROMPT = (
     'Integrate step-by-step reasoning and Python code to solve math problems '
     'using the following guidelines:\n'
-    '- Just write jupyter code to solve the problem without giving your thought;\n'
+    '- Analyze the question and write jupyter code to solve the problem;\n'
     r"- Present the final result in LaTeX using a '\boxed{{}}' without any "
     'units. \n')
+# ------------------------------------------------------------------------------
 
 
 def parse_args():
     parser = argparse.ArgumentParser(description='Math Code Interpreter')
-    parser.add_argument('--backend',
-                        type=str,
-                        default='lmdeploy',
-                        help='Which inference framework to use.',
-                        choices=['lmdeploy', 'hf'])
+    parser.add_argument(
+        '--backend',
+        type=str,
+        default='lmdeploy',
+        help='Which inference framework to use.',
+        choices=['lmdeploy', 'hf'])
     parser.add_argument(
         '--model_path',
         type=str,
@@ -71,37 +81,52 @@ def parse_args():
         type=str,
         required=True,
         help='Path to save inference results to, should be a `jsonl` file')
-    parser.add_argument('--dataset',
-                        type=str,
-                        default='math',
-                        choices=['gsm8k', 'math'],
-                        help='Dataset for inference')
+    parser.add_argument(
+        '--dataset',
+        type=str,
+        default='math',
+        choices=['gsm8k', 'math'],
+        help='Dataset for inference')
+    parser.add_argument(
+        '--batch_size',
+        type=int,
+        default=100,
+        help='Agent inference batch size')
+    parser.add_argument(
+        '--max_turn',
+        type=int,
+        default=3,
+        help=
+        'Maximum number of interaction rounds between the agent and environment'
+    )
     parser.add_argument(
         '--tp',
         type=int,
         default=1,
         help='Number of tensor parallelism. It may be required in LMDelpoy.')
-    parser.add_argument('--temperature',
-                        type=float,
-                        default=0.1,
-                        help='Temperature in next token prediction')
-    parser.add_argument('--top_p',
-                        type=float,
-                        default=0.8,
-                        help='Parameter for Top-P Sampling.')
-    parser.add_argument('--top_k',
-                        type=int,
-                        default=None,
-                        help='Parameter for Top-K Sampling.')
-    parser.add_argument('--stop_words',
-                        type=str,
-                        default=['<|action_end|>', '<|im_end|>'],
-                        action='append',
-                        help='Stop words')
-    parser.add_argument('--max_new_tokens',
-                        type=int,
-                        default=512,
-                        help='Number of maximum generated tokens.')
+    parser.add_argument(
+        '--temperature',
+        type=float,
+        default=0.1,
+        help='Temperature in next token prediction')
+    parser.add_argument(
+        '--top_p',
+        type=float,
+        default=0.8,
+        help='Parameter for Top-P Sampling.')
+    parser.add_argument(
+        '--top_k', type=int, default=40, help='Parameter for Top-K Sampling.')
+    parser.add_argument(
+        '--stop_words',
+        type=str,
+        default=['<|action_end|>', '<|im_end|>'],
+        action='append',
+        help='Stop words')
+    parser.add_argument(
+        '--max_new_tokens',
+        type=int,
+        default=512,
+        help='Number of maximum generated tokens.')
     parser.add_argument(
         '--do_infer',
         default=True,
@@ -113,18 +138,21 @@ def parse_args():
     #     action='store_false',
     #     help='Disable the inference.'
     # )
-    parser.add_argument('--do_eval',
-                        default=False,
-                        action='store_true',
-                        help='Whether to evaluate the inference results.')
-    parser.add_argument('--overwrite',
-                        default=False,
-                        action='store_true',
-                        help='Whether to overwrite the existing result file')
-    parser.add_argument('--debug',
-                        default=False,
-                        action='store_true',
-                        help='Only infer the first 50 samples')
+    parser.add_argument(
+        '--do_eval',
+        default=False,
+        action='store_true',
+        help='Whether to evaluate the inference results.')
+    parser.add_argument(
+        '--overwrite',
+        default=False,
+        action='store_true',
+        help='Whether to overwrite the existing result file')
+    # parser.add_argument(
+    #     '--debug',
+    #     default=False,
+    #     action='store_true',
+    #     help='Only infer the first 50 samples')
     return parser.parse_args()
@@ -473,9 +501,8 @@ def symbolic_equal_process(a, b, output_queue):
 
 def call_with_timeout(func, *args, timeout=1, **kwargs):
     output_queue = multiprocessing.Queue()
     process_args = args + (output_queue, )
-    process = multiprocessing.Process(target=func,
-                                      args=process_args,
-                                      kwargs=kwargs)
+    process = multiprocessing.Process(
+        target=func, args=process_args, kwargs=kwargs)
     process.start()
     process.join(timeout)
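Reviewer note: the `call_with_timeout` hunk is purely cosmetic, but the pattern it touches is worth spelling out: sympy comparisons can hang, so each check runs in a child process whose result comes back through a queue, and the parent waits at most `timeout` seconds. A self-contained sketch of that pattern; the kill-on-timeout branch between `process.join(timeout)` and `return output_queue.get()` is my assumption, since the diff context does not show those lines:

```python
import multiprocessing


def hang_forever(a, b, output_queue):
    # Stand-in for a symbolic comparison that never terminates.
    while True:
        pass


def call_with_timeout(func, *args, timeout=1, **kwargs):
    output_queue = multiprocessing.Queue()
    process = multiprocessing.Process(
        target=func, args=args + (output_queue, ), kwargs=kwargs)
    process.start()
    process.join(timeout)          # wait at most `timeout` seconds
    if process.is_alive():         # assumed: kill the worker, report failure
        process.terminate()
        process.join()
        return False
    return output_queue.get()


if __name__ == '__main__':
    print(call_with_timeout(hang_forever, 1, 2))  # -> False after ~1s
```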
@@ -487,25 +514,33 @@ def call_with_timeout(func, *args, timeout=1, **kwargs):
     return output_queue.get()
 
 
-def init_agent(backend: str, model_path: str, tp: int, **kwargs):
+def init_agent(backend: str, max_turn: int, model_path: str, tp: int,
+               **kwargs):
     if backend == 'lmdeploy':
-        model = LMDeployPipeline(path=model_path,
-                                 meta_template=INTERNLM2_META,
-                                 tp=tp,
-                                 **kwargs)
+        from lmdeploy import TurbomindEngineConfig
+        model = LMDeployPipeline(
+            path=model_path,
+            model_name='internlm2-chat',
+            meta_template=INTERNLM2_META,
+            pipeline_cfg=dict(backend_config=TurbomindEngineConfig(tp=tp)),
+            **kwargs)
     elif backend == 'hf':
-        model = HFTransformer(path=model_path,
-                              meta_template=INTERNLM2_META,
-                              **kwargs)
+        model = HFTransformer(
+            path=model_path, meta_template=INTERNLM2_META, **kwargs)
     else:
         raise NotImplementedError
-    agent = Internlm2Agent(llm=model,
-                           protocol=Internlm2Protocol(
-                               meta_prompt=None,
-                               interpreter_prompt=DEFAULT_PROMPT),
-                           interpreter_executor=ActionExecutor(
-                               actions=[get_tool('IPythonInteractive')]))
+    agent = Internlm2Agent(
+        llm=model,
+        protocol=Internlm2Protocol(
+            meta_prompt=None, interpreter_prompt=DEFAULT_PROMPT),
+        interpreter_executor=ActionExecutor(actions=[
+            IPythonInteractiveManager(
+                max_workers=200,
+                ci_lock=os.path.join(
+                    os.path.dirname(__file__), '.ipython.lock'))
+        ]),
+        max_turn=max_turn)
     return agent
@@ -522,8 +557,8 @@ def predict(args):
             d['pred'], d['steps'], d['error'] = [], [], None
             return d
 
-        dataset = load_dataset('gsm8k', 'main',
-                               split='test').map(process, True)
+        dataset = load_dataset(
+            'gsm8k', 'main', split='test').map(process, True)
 
     elif args.dataset == 'math':
@@ -543,14 +578,15 @@ def predict(args):
             d['error'] = None
             return d
 
-        dataset = load_dataset('lighteval/MATH',
-                               split='test').map(process, True)
+        dataset = load_dataset(
+            'lighteval/MATH', split='test').map(process, True)
 
     else:
         raise NotImplementedError
 
     agent = init_agent(
         backend=args.backend,
+        max_turn=args.max_turn,
         model_path=args.model_path,
         tp=args.tp,
         temperature=args.temperature,
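Reviewer note: `init_agent` now builds a single `IPythonInteractiveManager` with `max_workers=200` and a `ci_lock` file next to the script. I am assuming the lock file serializes cross-process setup of the IPython workers; lagent's actual semantics may differ. A hypothetical, Unix-only sketch of that general file-lock pattern (`FileLock` is illustrative, not lagent API):

```python
import fcntl
import os


class FileLock:
    """Hypothetical cross-process mutex backed by a lock file."""

    def __init__(self, path: str):
        self.path = path

    def __enter__(self):
        self.fd = os.open(self.path, os.O_CREAT | os.O_RDWR)
        fcntl.flock(self.fd, fcntl.LOCK_EX)  # blocks until acquired
        return self

    def __exit__(self, *exc):
        fcntl.flock(self.fd, fcntl.LOCK_UN)
        os.close(self.fd)


if __name__ == '__main__':
    with FileLock('.ipython.lock'):
        pass  # only one process at a time runs its initialization here
```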
@@ -559,26 +595,35 @@ def predict(args):
         top_k=args.top_k,
         max_new_tokens=args.max_new_tokens,
     )
-    with jsonlines.open(args.output_path, 'w') as f:
-        for item in tqdm(
-                dataset if not args.debug else dataset.select(range(50))):
+    num_batches = ceil(len(dataset) / args.batch_size)
+    with jsonlines.open(args.output_path, 'w', flush=True) as f:
+        for i in tqdm(range(num_batches)):
+            batch = dataset.select(
+                range(i * args.batch_size,
+                      min((i + 1) * args.batch_size, len(dataset))))
+            # for item in tqdm(
+            #         dataset if not args.debug else dataset.select(range(50))):
             try:
-                ret = agent.chat(item['query'])
-                item['steps'] = ret.inner_steps
+                rets = agent.batch_chat(batch['query'])
+                for item, ret in zip(batch, rets):
+                    item['steps'] = ret.inner_steps
-                lang = [
-                    step for step in item['steps']
-                    if step['role'] == 'language'
-                ]
-                item['pred'].append('😭' if not lang else
-                                    extract_answer(lang[-1]['content']) or '😭')
-                agent._interpreter_executor.actions[
-                    'IPythonInteractive'].reset()
+                    lang = [
+                        step for step in item['steps']
+                        if step['role'] == 'language'
+                    ]
+                    item['pred'].append('😭' if not lang else extract_answer(
+                        lang[-1]['content']) or '😭')
+                    f.write(item)
             except Exception as e:
                 err = str(traceback.format_exc())
-                print(f'Error processing index {item["idx"]}: {e}\n{err}')
-                item['error'] = err
-                f.write(item)
+                print(f'Processing batch data error: {e}\n{err}')
+                for item in batch:
+                    item['error'] = err
+                    f.write(item)
+            finally:
+                agent._interpreter_executor.actions[
+                    'IPythonInteractiveManager'].reset()
 
 
 def evaluate(args):
@@ -606,7 +651,7 @@ def evaluate(args):
                 timeout_cnt += 1
             except Exception as error:
                 print(error.__traceback__)
-                sys.exit()
+                # sys.exit()
             progress_bar.update(1)
 
     idx = 0
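Reviewer note: the rewritten loop replaces per-item `agent.chat` with `agent.batch_chat` over `ceil`-sized windows, writes every finished record immediately (`flush=True` on `jsonlines.open`), and resets the interpreter pool in a `finally` block so a failed batch cannot poison the next one. A standalone sketch of just the batching arithmetic, checkable in isolation:

```python
from math import ceil


def batch_ranges(n: int, batch_size: int):
    """Yield index ranges covering 0..n-1 once; the last batch may be short."""
    for i in range(ceil(n / batch_size)):
        yield range(i * batch_size, min((i + 1) * batch_size, n))


assert [list(r) for r in batch_ranges(5, 2)] == [[0, 1], [2, 3], [4]]
assert [list(r) for r in batch_ranges(4, 2)] == [[0, 1], [2, 3]]
```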