From fb14f9b60a4cd9e59e0ad3d29ed64ba375fcdf0e Mon Sep 17 00:00:00 2001
From: RunningLeon
Date: Thu, 16 Jan 2025 18:47:32 +0800
Subject: [PATCH] [Doc]: Update doc for internlm3 (#824)

---
 README.md                 | 93 ++++++++++++++++++++++++++++++++++----
 README_zh-CN.md           | 93 +++++++++++++++++++++++++++++++++++----
 ecosystem/README.md       | 38 +++++++++-------
 ecosystem/README_zh-CN.md | 36 +++++++++-------
 4 files changed, 209 insertions(+), 51 deletions(-)

diff --git a/README.md b/README.md
index 5a6ada9..5cff0e0 100644
--- a/README.md
+++ b/README.md
@@ -290,15 +290,53 @@ print(response)
 
 #### Ollama inference
 
-TODO
+Install Ollama and pull the model:
+
+```bash
+# install ollama
+curl -fsSL https://ollama.com/install.sh | sh
+# pull the model
+ollama pull internlm/internlm3-8b-instruct
+# install ollama-python
+pip install ollama
+```
+
+Inference code:
+
+```python
+import ollama
+
+system_prompt = """You are an AI assistant whose name is InternLM (书生·浦语).
+- InternLM (书生·浦语) is a conversational language model that is developed by Shanghai AI Laboratory (上海人工智能实验室). It is designed to be helpful, honest, and harmless.
+- InternLM (书生·浦语) can understand and communicate fluently in the language chosen by the user such as English and 中文."""
+
+messages = [
+    {
+        "role": "system",
+        "content": system_prompt,
+    },
+    {
+        "role": "user",
+        "content": "Please tell me five scenic spots in Shanghai"
+    },
+]
+
+stream = ollama.chat(
+    model='internlm/internlm3-8b-instruct',
+    messages=messages,
+    stream=True,
+)
+
+for chunk in stream:
+    print(chunk['message']['content'], end='', flush=True)
+```
 
 #### vLLM inference
 
-We are still working on merging the PR(https://github.com/vllm-project/vllm/pull/12037) into vLLM. In the meantime, please use the following PR link to install it manually.
+Refer to [installation](https://docs.vllm.ai/en/latest/getting_started/installation/index.html) to install the latest vLLM code:
 
-```python
-git clone -b support-internlm3 https://github.com/RunningLeon/vllm.git
-pip install -e .
+```bash
+pip install vllm --pre --extra-index-url https://wheels.vllm.ai/nightly
 ```
 
 inference code:
@@ -447,15 +485,50 @@ For offline engine api usage, please refer to [Offline Engine API](https://docs.
 
 #### Ollama inference
 
-TODO
+Install Ollama and pull the model:
+
+```bash
+# install ollama
+curl -fsSL https://ollama.com/install.sh | sh
+# pull the model
+ollama pull internlm/internlm3-8b-instruct
+# install ollama-python
+pip install ollama
+```
+
+Inference code:
+
+```python
+import ollama
+
+messages = [
+    {
+        "role": "system",
+        "content": thinking_system_prompt,
+    },
+    {
+        "role": "user",
+        "content": "已知函数\(f(x)=\mathrm{e}^{x}-ax - a^{3}\)。\n(1)当\(a = 1\)时,求曲线\(y = f(x)\)在点\((1,f(1))\)处的切线方程;\n(2)若\(f(x)\)有极小值,且极小值小于\(0\),求\(a\)的取值范围。"
+    },
+]
+
+stream = ollama.chat(
+    model='internlm/internlm3-8b-instruct',
+    messages=messages,
+    stream=True,
+    options=dict(num_ctx=8192, num_predict=2048)
+)
+
+for chunk in stream:
+    print(chunk['message']['content'], end='', flush=True)
+```
 
 #### vLLM inference
 
-We are still working on merging the PR(https://github.com/vllm-project/vllm/pull/12037) into vLLM. In the meantime, please use the following PR link to install it manually.
+Refer to [installation](https://docs.vllm.ai/en/latest/getting_started/installation/index.html) to install the latest vLLM code:
 
-```python
-git clone https://github.com/RunningLeon/vllm.git
-pip install -e .
+```bash
+pip install vllm --pre --extra-index-url https://wheels.vllm.ai/nightly
 ```
 
 inference code
diff --git a/README_zh-CN.md b/README_zh-CN.md
index 18856f5..89f7960 100644
--- a/README_zh-CN.md
+++ b/README_zh-CN.md
@@ -257,15 +257,53 @@ curl http://localhost:23333/v1/chat/completions \
 
 #### Ollama 推理
 
-TODO
+安装 Ollama 并拉取模型
+
+```bash
+# 安装 ollama
+curl -fsSL https://ollama.com/install.sh | sh
+# 拉取模型
+ollama pull internlm/internlm3-8b-instruct
+# 安装python库
+pip install ollama
+```
+
+推理代码
+
+```python
+import ollama
+
+system_prompt = """You are an AI assistant whose name is InternLM (书生·浦语).
+- InternLM (书生·浦语) is a conversational language model that is developed by Shanghai AI Laboratory (上海人工智能实验室). It is designed to be helpful, honest, and harmless.
+- InternLM (书生·浦语) can understand and communicate fluently in the language chosen by the user such as English and 中文."""
+
+messages = [
+    {
+        "role": "system",
+        "content": system_prompt,
+    },
+    {
+        "role": "user",
+        "content": "Please tell me five scenic spots in Shanghai"
+    },
+]
+
+stream = ollama.chat(
+    model='internlm/internlm3-8b-instruct',
+    messages=messages,
+    stream=True,
+)
+
+for chunk in stream:
+    print(chunk['message']['content'], end='', flush=True)
+```
 
 #### vLLM 推理
 
-我们还在推动PR(https://github.com/vllm-project/vllm/pull/12037) 合入vllm,现在请使用以下PR链接手动安装
+参考[安装文档](https://docs.vllm.ai/en/latest/getting_started/installation/index.html) 安装 vllm 最新代码
 
-```python
-git clone https://github.com/RunningLeon/vllm.git
-pip install -e .
+```bash
+pip install vllm --pre --extra-index-url https://wheels.vllm.ai/nightly
 ```
 
 推理代码
@@ -404,15 +442,50 @@ print(response)
 
 #### Ollama 推理
 
-TODO
+安装 Ollama 并拉取模型
+
+```bash
+# 安装 ollama
+curl -fsSL https://ollama.com/install.sh | sh
+# 拉取模型
+ollama pull internlm/internlm3-8b-instruct
+# 安装python库
+pip install ollama
+```
+
+推理代码
+
+```python
+import ollama
+
+messages = [
+    {
+        "role": "system",
+        "content": thinking_system_prompt,
+    },
+    {
+        "role": "user",
+        "content": "已知函数\(f(x)=\mathrm{e}^{x}-ax - a^{3}\)。\n(1)当\(a = 1\)时,求曲线\(y = f(x)\)在点\((1,f(1))\)处的切线方程;\n(2)若\(f(x)\)有极小值,且极小值小于\(0\),求\(a\)的取值范围。"
+    },
+]
+
+stream = ollama.chat(
+    model='internlm/internlm3-8b-instruct',
+    messages=messages,
+    stream=True,
+    options=dict(num_ctx=8192, num_predict=2048)
+)
+
+for chunk in stream:
+    print(chunk['message']['content'], end='', flush=True)
+```
 
 #### vLLM 推理
 
-我们还在推动PR(https://github.com/vllm-project/vllm/pull/12037) 合入vllm,现在请使用以下PR链接手动安装
+参考[安装文档](https://docs.vllm.ai/en/latest/getting_started/installation/index.html) 安装 vllm 最新代码
 
-```python
-git clone https://github.com/RunningLeon/vllm.git
-pip install -e .
+```bash
+pip install vllm --pre --extra-index-url https://wheels.vllm.ai/nightly
 ```
 
 推理代码
diff --git a/ecosystem/README.md b/ecosystem/README.md
index 3b435d9..dc86acc 100644
--- a/ecosystem/README.md
+++ b/ecosystem/README.md
@@ -48,11 +48,11 @@ swift sft --model_type internlm2-1_8b-chat \
 
 LMDeploy is an efficient toolkit for compressing, deploying, and serving LLMs and VLMs.
 
-With only 4 lines of code, you can perform `internlm2_5-7b-chat` inference after `pip install lmdeploy`:
+With only 4 lines of code, you can perform `internlm3-8b-instruct` inference after `pip install lmdeploy`:
 
 ```python
 from lmdeploy import pipeline
-pipe = pipeline("internlm/internlm2_5-7b-chat")
+pipe = pipeline("internlm/internlm3-8b-instruct")
 response = pipe(["Hi, pls intro yourself", "Shanghai is"])
 print(response)
 ```
@@ -61,7 +61,13 @@ print(response)
 
 `vLLM` is a high-throughput and memory-efficient inference and serving engine for LLMs.
 
-After the installation via `pip install vllm`, you can conduct the `internlm2_5-7b-chat` model inference as follows:
+Refer to [installation](https://docs.vllm.ai/en/latest/getting_started/installation/index.html) to install the latest vLLM code:
+
+```bash
+pip install vllm --pre --extra-index-url https://wheels.vllm.ai/nightly
+```
+
+Then, you can conduct the `internlm3-8b-instruct` model inference as follows:
 
 ```python
 from vllm import LLM, SamplingParams
@@ -75,7 +81,7 @@ prompts = [
 ]
 sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
 # Create an LLM.
-llm = LLM(model="internlm/internlm2_5-7b-chat", trust_remote_code=True)
+llm = LLM(model="internlm/internlm3-8b-instruct", trust_remote_code=True)
 # Generate texts from the prompts. The output is a list of RequestOutput objects
 # that contain the prompt, generated text, and other information.
 outputs = llm.generate(prompts, sampling_params)
@@ -132,7 +138,7 @@ curl 127.0.0.1:8080/generate_stream \
 
 `llama.cpp` is a LLM inference framework developed in C/C++. Its goal is to enable LLM inference with minimal setup and state-of-the-art performance on a wide variety of hardware - locally and in the cloud.
 
-`InternLM2` and `InternLM2.5` can be deployed with `llama.cpp` by following the below instructions:
+`InternLM2`, `InternLM2.5` and `InternLM3` can be deployed with `llama.cpp` by following the instructions below:
 
 - Refer [this](https://github.com/ggerganov/llama.cpp?tab=readme-ov-file#build) guide to build llama.cpp from source
 - Convert the InternLM model to GGUF model and run it according to the [guide](https://github.com/ggerganov/llama.cpp?tab=readme-ov-file#prepare-and-quantize)
@@ -141,14 +147,14 @@
 
 Ollama bundles model weights, configuration, and data into a single package, defined by a Modelfile. It optimizes setup and configuration details, enabling users to easily set up and execute LLMs locally (in CPU and GPU modes).
 
-The following snippet presents the Modefile of InternLM2.5 with `internlm2_5-7b-chat` as an example. Note that the model has to be converted to GGUF model at first.
+The following snippet presents the Modelfile of InternLM3, taking `internlm3-8b-instruct` as an example. Note that the model has to be converted to a GGUF model first.
 
 ```shell
-echo 'FROM ./internlm2_5-7b-chat.gguf
+echo 'FROM ./internlm3-8b-instruct.gguf
 TEMPLATE """{{ if .System }}<|im_start|>system
 {{ .System }}<|im_end|>
 {{ end }}{{ if .Prompt }}<|im_start|>user
-{{ .Prompt }}
+{{ .Prompt }}<|im_end|>
 {{ end }}<|im_start|>assistant
 {{ .Response }}<|im_end|>"""
 
@@ -165,7 +171,7 @@ SYSTEM """You are an AI assistant whose name is InternLM (书生·浦语).
 Then, create an image from the above `Modelfile` like this:
 
 ```shell
-ollama create internlm2.5:7b-chat -f ./Modelfile
+ollama create internlm3:8b-instruct -f ./Modelfile
 ```
 
 Regarding the usage of `ollama`, please refer [here](https://github.com/ollama/ollama/tree/main/docs).
@@ -174,19 +180,19 @@ Regarding the usage of `ollama`, please refer [here](https://github.com/ollama/o llamafile lets you turn large language model (LLM) weights into executables. It combines [llama.cpp](https://github.com/ggerganov/llama.cpp) with [Cosmopolitan Libc](https://github.com/jart/cosmopolitan). -The best practice of deploying InternLM2 or InternLM2.5 using llamafile is shown as below: +The best practice of deploying InternLM2, InternLM2.5 or InternLM3 using llamafile is shown as below: -- Convert the model into GGUF model by `llama.cpp`. Suppose we get `internlm2_5-chat-7b.gguf` in this step +- Convert the model into GGUF model by `llama.cpp`. Suppose we get `internlm3-8b-instruct.gguf` in this step - Create the llamafile ```shell wget https://github.com/Mozilla-Ocho/llamafile/releases/download/0.8.6/llamafile-0.8.6.zip unzip llamafile-0.8.6.zip -cp llamafile-0.8.6/bin/llamafile internlm2_5.llamafile +cp llamafile-0.8.6/bin/llamafile internlm3.llamafile echo "-m -internlm2_5-chat-7b.gguf +internlm3-8b-instruct.gguf --host 0.0.0.0 -ngl @@ -194,8 +200,8 @@ internlm2_5-chat-7b.gguf ..." > .args llamafile-0.8.6/bin/zipalign -j0 \ - internlm2_5.llamafile \ - internlm2_5-chat-7b.gguf \ + internlm3.llamafile \ + internlm3-8b-instruct.gguf \ .args rm -rf .args @@ -204,7 +210,7 @@ rm -rf .args - Run the llamafile ```shell -./internlm2_5.llamafile +./internlm3.llamafile ``` Your browser should open automatically and display a chat interface. (If it doesn't, just open your browser and point it at http://localhost:8080) diff --git a/ecosystem/README_zh-CN.md b/ecosystem/README_zh-CN.md index e299c36..e824be5 100644 --- a/ecosystem/README_zh-CN.md +++ b/ecosystem/README_zh-CN.md @@ -48,11 +48,11 @@ SWIFT 支持 LLMs 和多模态大型模型(MLLMs)的训练、推理、评估 LMDeploy 是一个高效且友好的 LLMs 模型部署工具箱,功能涵盖了量化、推理和服务。 -通过 `pip install lmdeploy` 安装后,只用以下 4 行代码,即可使用 `internlm2_5-7b-chat` 模型完成 prompts 的批处理: +通过 `pip install lmdeploy` 安装后,只用以下 4 行代码,即可使用 `internlm3-8b-instruct` 模型完成 prompts 的批处理: ```python from lmdeploy import pipeline -pipe = pipeline("internlm/internlm2_5-7b-chat") +pipe = pipeline("internlm/internlm3-8b-instruct") response = pipe(["Hi, pls intro yourself", "Shanghai is"]) print(response) ``` @@ -61,7 +61,13 @@ print(response) vLLM 是一个用于 LLMs 的高吞吐量和内存效率的推理和服务引擎。 -通过 `pip install vllm` 安装后,你可以按照以下方式使用 `internlm2_5-chat-7b` 模型进行推理: +参考[安装文档](https://docs.vllm.ai/en/latest/getting_started/installation/index.html) 安装 vllm 最新代码 + +```bash +pip install vllm --pre --extra-index-url https://wheels.vllm.ai/nightly +``` + +然后,你可以按照以下方式使用 `internlm3-8b-instruct` 模型进行推理: ```python from vllm import LLM, SamplingParams @@ -75,7 +81,7 @@ prompts = [ sampling_params = SamplingParams(temperature=0.8, top_p=0.95) # Create an LLM. -llm = LLM(model="internlm/internlm2_5-chat-7b", trust_remote_code=True) +llm = LLM(model="internlm/internlm3-8b-instruct", trust_remote_code=True) # Generate texts from the prompts. The output is a list of RequestOutput objects # that contain the prompt, generated text, and other information. 
outputs = llm.generate(prompts, sampling_params) @@ -132,7 +138,7 @@ curl 127.0.0.1:8080/generate_stream \ llama.cpp 是一个用 C/C++ 开发的 LLMs 推理框架。其目标是在各种硬件上实现最小设置和最先进的性能的 LLM 推理——无论是在本地还是在云端。 -通过以下方式可以使用 llama.cpp 部署 InternLM2 和 InternLM2.5 模型: +通过以下方式可以使用 llama.cpp 部署 InternLM2, InternLM2.5 以及 InternLM3 模型: - 参考 [这里](https://github.com/ggerganov/llama.cpp?tab=readme-ov-file#build) 编译并安装 llama.cpp - 把 InternLM 模型转成 GGUF 格式,具体方法参考 [此处](https://github.com/ggerganov/llama.cpp?tab=readme-ov-file#prepare-and-quantize) @@ -141,14 +147,14 @@ llama.cpp 是一个用 C/C++ 开发的 LLMs 推理框架。其目标是在各种 Ollama 将模型权重、配置和数据打包到一个单一的包中,由 Modelfile 定义。它优化了安装和配置,使用户能够轻松地在本地(以 CPU 和 GPU 模式)设置和执行 LLMs。 -以下展示的是 `internlm2_5-7b-chat` 的 Modelfile。请注意,应首先把模型转换为 GGUF 模型。 +以下展示的是 `internlm3-8b-instruct` 的 Modelfile。请注意,应首先把模型转换为 GGUF 模型。 ```shell -echo 'FROM ./internlm2_5-7b-chat.gguf +echo 'FROM ./internlm3-8b-instruct.gguf TEMPLATE """{{ if .System }}<|im_start|>system {{ .System }}<|im_end|> {{ end }}{{ if .Prompt }}<|im_start|>user -{{ .Prompt }} +{{ .Prompt }}<|im_end|> {{ end }}<|im_start|>assistant {{ .Response }}<|im_end|>""" @@ -165,7 +171,7 @@ SYSTEM """You are an AI assistant whose name is InternLM (书生·浦语). 接着,使用上述 `Modelfile` 创建镜像: ```shell -ollama create internlm2.5:7b-chat -f ./Modelfile +ollama create internlm3:8b-instruct -f ./Modelfile ``` Ollama 的使用方法可以参考[这里](https://github.com/ollama/ollama/tree/main/docs)。 @@ -176,17 +182,17 @@ llamafile 可以把 LLMs 的权重转换为可执行文件。它结合了 llama. 使用 llamafile 部署 InternLM 系列模型的最佳实践如下: -- 通过 llama.cpp 将模型转换为 GGUF 模型。假设我们在这一步得到了 `internlm2_5-chat-7b.gguf` +- 通过 llama.cpp 将模型转换为 GGUF 模型。假设我们在这一步得到了 `internlm3-8b-instruct.gguf` - 创建 llamafile ```shell wget https://github.com/Mozilla-Ocho/llamafile/releases/download/0.8.6/llamafile-0.8.6.zip unzip llamafile-0.8.6.zip -cp llamafile-0.8.6/bin/llamafile internlm2_5.llamafile +cp llamafile-0.8.6/bin/llamafile internlm3.llamafile echo "-m -internlm2_5-7b-chat.gguf +internlm3-8b-instruct.gguf --host 0.0.0.0 -ngl @@ -194,8 +200,8 @@ internlm2_5-7b-chat.gguf ..." > .args llamafile-0.8.6/bin/zipalign -j0 \ - internlm2_5.llamafile \ - internlm2_5-7b-chat.gguf \ + internlm3.llamafile \ + internlm3-8b-instruct.gguf \ .args rm -rf .args @@ -204,7 +210,7 @@ rm -rf .args - Run the llamafile ```shell -./internlm2_5.llamafile +./internlm3.llamafile ``` 你的浏览器应该会自动打开并显示一个聊天界面。(如果没有,只需打开你的浏览器并访问 http://localhost:8080)
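As a companion to the offline `LLM` examples in the patch, the snippet below is a minimal sketch of querying `internlm3-8b-instruct` through vLLM's OpenAI-compatible server. It is not part of the patch: it assumes the server was started separately with `vllm serve internlm/internlm3-8b-instruct`, listens on the default port 8000, and is queried with the `openai` Python client using a placeholder API key.

```python
# Minimal sketch (assumption: a vLLM OpenAI-compatible server was started with
#   vllm serve internlm/internlm3-8b-instruct
# and is listening at the default http://localhost:8000).
from openai import OpenAI

# vLLM does not check the API key by default; any placeholder string works.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

response = client.chat.completions.create(
    model="internlm/internlm3-8b-instruct",
    messages=[
        {"role": "user", "content": "Please tell me five scenic spots in Shanghai"},
    ],
)
print(response.choices[0].message.content)
```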