[Docs] Update Chat Examples (#817)

pull/819/head
pppppM 2025-01-15 14:13:15 +08:00 committed by GitHub
parent 3f35636d02
commit a9ce00823f
4 changed files with 173 additions and 90 deletions


@@ -172,62 +172,79 @@ The chat models adopt [chatml format](./chat/chat_format.md) to support both cha
For the best experience, please make sure the installed transformers library meets the following version requirement before running inference with [Transformers](#import-from-transformers) or [ModelScope](#import-from-modelscope):
```
transformers >= 4.48
```
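If you are unsure which version is installed, a quick runtime check such as the sketch below can fail fast before any weights are downloaded (the `packaging` helper is an assumption; it ships with most Python environments):
```python
import transformers
from packaging import version

# Abort early if the installed transformers is older than the documented minimum.
assert version.parse(transformers.__version__) >= version.parse("4.48"), (
    f"transformers {transformers.__version__} found, but >= 4.48 is required"
)
```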
### Import from Transformers
To load the InternLM3-8B-Instruct model using Transformers, use the following code:
```python
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
tokenizer = AutoTokenizer.from_pretrained("internlm/internlm2_5-7b-chat", trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained("internlm/internlm3-8b-instruct", trust_remote_code=True)
# Set `torch_dtype=torch.float16` to load model in float16, otherwise it will be loaded as float32 and might cause OOM Error.
model = AutoModelForCausalLM.from_pretrained("internlm/internlm2_5-7b-chat", device_map="auto", trust_remote_code=True, torch_dtype=torch.float16)
model = AutoModelForCausalLM.from_pretrained("internlm/internlm3-8b-instruct", trust_remote_code=True, torch_dtype=torch.float16)
# (Optional) If on low resource devices, you can load model in 4-bit or 8-bit to further save GPU memory via bitsandbytes.
# InternLM3 8B in 4bit will cost nearly 8GB GPU memory.
# pip install -U bitsandbytes
# 8-bit: model = AutoModelForCausalLM.from_pretrained("internlm/internlm3-8b-instruct", device_map="auto", trust_remote_code=True, load_in_8bit=True)
# 4-bit: model = AutoModelForCausalLM.from_pretrained("internlm/internlm3-8b-instruct", device_map="auto", trust_remote_code=True, load_in_4bit=True)
model = model.eval()
response, history = model.chat(tokenizer, "hello", history=[])
print(response)
# Output: Hello? How can I help you today?
response, history = model.chat(tokenizer, "please provide three suggestions about time management", history=history)
print(response)
messages = [
{"role": "system", "content": "You are an AI assistant whose name is InternLM."},
{"role": "user", "content": "Please tell me five scenic spots in Shanghai"},
]
tokenized_chat = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt")
generated_ids = model.generate(tokenized_chat, max_new_tokens=512)
generated_ids = [
output_ids[len(input_ids):] for input_ids, output_ids in zip(tokenized_chat, generated_ids)
]
response = tokenizer.batch_decode(generated_ids)[0]
```
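Note that newer transformers releases prefer passing a `BitsAndBytesConfig` over the bare `load_in_8bit`/`load_in_4bit` flags shown in the comments above. A minimal 4-bit sketch, assuming a CUDA GPU and an installed bitsandbytes, might look like this:
```python
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# Illustrative 4-bit quantization config; adjust it to your hardware and memory budget.
quant_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16)
model = AutoModelForCausalLM.from_pretrained(
    "internlm/internlm3-8b-instruct",
    device_map="auto",
    trust_remote_code=True,
    quantization_config=quant_config,
)
```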
### Import from ModelScope
To load the InternLM3-8B-Instruct model using ModelScope, use the following code:
```python
import torch
from modelscope import snapshot_download, AutoTokenizer, AutoModelForCausalLM
model_dir = snapshot_download('Shanghai_AI_Laboratory/internlm3-8b-instruct')
tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
# Set `torch_dtype=torch.float16` to load model in float16, otherwise it will be loaded as float32 and might cause OOM Error.
model = AutoModelForCausalLM.from_pretrained(model_dir, trust_remote_code=True, torch_dtype=torch.float16)
# (Optional) If on low resource devices, you can load model in 4-bit or 8-bit to further save GPU memory via bitsandbytes.
# InternLM3 8B in 4bit will cost nearly 8GB GPU memory.
# pip install -U bitsandbytes
# 8-bit: model = AutoModelForCausalLM.from_pretrained(model_dir, device_map="auto", trust_remote_code=True, load_in_8bit=True)
# 4-bit: model = AutoModelForCausalLM.from_pretrained(model_dir, device_map="auto", trust_remote_code=True, load_in_4bit=True)
model = model.eval()
response, history = model.chat(tokenizer, "hello", history=[])
print(response)
response, history = model.chat(tokenizer, "please provide three suggestions about time management", history=history)
print(response)
messages = [
{"role": "system", "content": "You are an AI assistant whose name is InternLM."},
{"role": "user", "content": "Please tell me five scenic spots in Shanghai"},
]
tokenized_chat = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt")
generated_ids = model.generate(tokenized_chat, max_new_tokens=512)
generated_ids = [
output_ids[len(input_ids):] for input_ids, output_ids in zip(tokenized_chat, generated_ids)
]
response = tokenizer.batch_decode(generated_ids)[0]
```
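If you want to see the reply token by token instead of waiting for the full completion, a small sketch using transformers' `TextStreamer` (reusing `model`, `tokenizer`, and `tokenized_chat` from the block above) is one option:
```python
from transformers import TextStreamer

# Print tokens to stdout as they are generated; the prompt itself is skipped.
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
_ = model.generate(tokenized_chat, max_new_tokens=512, streamer=streamer)
```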
### Dialogue
You can interact with the InternLM3-8B-Instruct model through a frontend interface by running the following code:
```bash
pip install streamlit
pip install "transformers>=4.48"
streamlit run ./chat/web_demo.py
```


@@ -170,30 +170,38 @@ InternLM supports many well-known upstream and downstream projects, such as LLaMA-Factory, vLLM, llama
For the best experience, please make sure the installed transformers library meets the following version requirement before running inference with [Transformers](#import-from-transformers) or [ModelScope](#import-from-modelscope):
```
transformers >= 4.48
```
### Import from Transformers
To load the InternLM3-8B-Instruct model using Transformers, use the following code (you can change the model name to load a different model):
```python
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
tokenizer = AutoTokenizer.from_pretrained("internlm/internlm2_5-7b-chat", trust_remote_code=True)
# Set `torch_dtype=torch.float16` to load the model in float16; otherwise it will be loaded in float32 and may run out of GPU memory on your hardware.
model = AutoModelForCausalLM.from_pretrained("internlm/internlm2_5-7b-chat", device_map="auto",trust_remote_code=True, torch_dtype=torch.float16)
model = AutoModelForCausalLM.from_pretrained("internlm/internlm3-8b-instruct", trust_remote_code=True, torch_dtype=torch.float16)
# (Optional) On low-resource devices, you can load a 4-bit or 8-bit quantized model via bitsandbytes to further save GPU memory.
# InternLM3 8B in 4-bit will cost nearly 8GB of GPU memory.
# pip install -U bitsandbytes
# 8-bit: model = AutoModelForCausalLM.from_pretrained("internlm/internlm3-8b-instruct", device_map="auto", trust_remote_code=True, load_in_8bit=True)
# 4-bit: model = AutoModelForCausalLM.from_pretrained("internlm/internlm3-8b-instruct", device_map="auto", trust_remote_code=True, load_in_4bit=True)
model = model.eval()
response, history = model.chat(tokenizer, "你好", history=[])
print(response)
# 模型输出:你好!有什么我可以帮助你的吗?
response, history = model.chat(tokenizer, "请提供三个管理时间的建议。", history=history)
print(response)
messages = [
{"role": "system", "content": "You are an AI assistant whose name is InternLM."},
{"role": "user", "content": "Please tell me five scenic spots in Shanghai"},
]
tokenized_chat = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt")
generated_ids = model.generate(tokenized_chat, max_new_tokens=512)
generated_ids = [
output_ids[len(input_ids):] for input_ids, output_ids in zip(tokenized_chat, generated_ids)
]
response = tokenizer.batch_decode(generated_ids)[0]
```
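The example above relies on the model's default generation settings. If you want to control sampling explicitly, the usual `generate` arguments can be passed as well; the values below are illustrative rather than tuned recommendations for InternLM3:
```python
# Illustrative sampling hyperparameters; tune them for your own use case.
generated_ids = model.generate(
    tokenized_chat,
    max_new_tokens=512,
    do_sample=True,
    temperature=0.8,
    top_p=0.8,
    repetition_penalty=1.005,
)
```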
### Import from ModelScope
@@ -203,28 +211,36 @@ print(response)
```python
import torch
from modelscope import snapshot_download, AutoTokenizer, AutoModelForCausalLM
model_dir = snapshot_download('Shanghai_AI_Laboratory/internlm3-8b-instruct')
tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
# Set `torch_dtype=torch.float16` to load the model in float16; otherwise it will be loaded in float32 and may run out of GPU memory on your hardware.
model = AutoModelForCausalLM.from_pretrained(model_dir, device_map="auto", trust_remote_code=True, torch_dtype=torch.float16)
# (Optional) On low-resource devices, you can load a 4-bit or 8-bit quantized model via bitsandbytes to further save GPU memory.
# InternLM3 8B in 4-bit will cost nearly 8GB of GPU memory.
# pip install -U bitsandbytes
# 8-bit: model = AutoModelForCausalLM.from_pretrained(model_dir, device_map="auto", trust_remote_code=True, load_in_8bit=True)
# 4-bit: model = AutoModelForCausalLM.from_pretrained(model_dir, device_map="auto", trust_remote_code=True, load_in_4bit=True)
model = model.eval()
response, history = model.chat(tokenizer, "hello", history=[])
print(response)
response, history = model.chat(tokenizer, "please provide three suggestions about time management", history=history)
print(response)
messages = [
{"role": "system", "content": "You are an AI assistant whose name is InternLM."},
{"role": "user", "content": "Please tell me five scenic spots in Shanghai"},
]
tokenized_chat = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt")
generated_ids = model.generate(tokenized_chat, max_new_tokens=512)
generated_ids = [
output_ids[len(input_ids):] for input_ids, output_ids in zip(tokenized_chat, generated_ids)
]
response = tokenizer.batch_decode(generated_ids)[0]
```
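For debugging, it can help to look at the exact prompt string produced by the chat template before tokenization. A short sketch reusing `tokenizer` and `messages` from the block above:
```python
# Render the chat template to plain text to inspect what the model actually receives.
prompt_text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(prompt_text)
```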
### Dialogue
You can interact with the InternLM3-8B-Instruct model through a frontend interface by running the following code:
```bash
pip install streamlit
pip install "transformers>=4.48"
streamlit run ./chat/web_demo.py
```


@@ -8,51 +8,74 @@ You can also learn more about the [chatml format](./chat_format.md) and how to us
## Import from Transformers
To load the InternLM3-8B-Instruct model using Transformers, use the following code:
```python
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
tokenizer = AutoTokenizer.from_pretrained("internlm/internlm3-8b-instruct", trust_remote_code=True)
# Set `torch_dtype=torch.float16` to load model in float16, otherwise it will be loaded as float32 and might cause OOM Error.
model = AutoModelForCausalLM.from_pretrained("internlm/internlm3-8b-instruct", trust_remote_code=True, torch_dtype=torch.float16)
# (Optional) If on low resource devices, you can load model in 4-bit or 8-bit to further save GPU memory via bitsandbytes.
# InternLM3 8B in 4bit will cost nearly 8GB GPU memory.
# pip install -U bitsandbytes
# 8-bit: model = AutoModelForCausalLM.from_pretrained("internlm/internlm3-8b-instruct", device_map="auto", trust_remote_code=True, load_in_8bit=True)
# 4-bit: model = AutoModelForCausalLM.from_pretrained("internlm/internlm3-8b-instruct", device_map="auto", trust_remote_code=True, load_in_4bit=True)
model = model.eval()
messages = [
    {"role": "system", "content": "You are an AI assistant whose name is InternLM."},
    {"role": "user", "content": "Please tell me five scenic spots in Shanghai"},
]
tokenized_chat = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt")
generated_ids = model.generate(tokenized_chat, max_new_tokens=512)
generated_ids = [
    output_ids[len(input_ids):] for input_ids, output_ids in zip(tokenized_chat, generated_ids)
]
response = tokenizer.batch_decode(generated_ids)[0]
```
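To continue the conversation for another turn, append the previous reply and the next user message to `messages` and generate again. A sketch (the follow-up question is made up for illustration, and you may want to strip special tokens from `response` first):
```python
# Hypothetical second turn: feed the previous reply back in and ask a follow-up question.
messages.append({"role": "assistant", "content": response})
messages.append({"role": "user", "content": "Which of them is best for a one-day trip?"})
tokenized_chat = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt")
generated_ids = model.generate(tokenized_chat, max_new_tokens=512)
follow_up = tokenizer.batch_decode(
    [output_ids[len(input_ids):] for input_ids, output_ids in zip(tokenized_chat, generated_ids)]
)[0]
print(follow_up)
```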
## Import from ModelScope
To load the InternLM3-8B-Instruct model using ModelScope, use the following code:
```python
import torch
from modelscope import snapshot_download, AutoTokenizer, AutoModelForCausalLM
model_dir = snapshot_download('Shanghai_AI_Laboratory/internlm3-8b-instruct')
tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
# Set `torch_dtype=torch.float16` to load model in float16, otherwise it will be loaded as float32 and might cause OOM Error.
model = AutoModelForCausalLM.from_pretrained(model_dir, trust_remote_code=True, torch_dtype=torch.float16)
# (Optional) If on low resource devices, you can load model in 4-bit or 8-bit to further save GPU memory via bitsandbytes.
# InternLM3 8B in 4bit will cost nearly 8GB GPU memory.
# pip install -U bitsandbytes
# 8-bit: model = AutoModelForCausalLM.from_pretrained(model_dir, device_map="auto", trust_remote_code=True, load_in_8bit=True)
# 4-bit: model = AutoModelForCausalLM.from_pretrained(model_dir, device_map="auto", trust_remote_code=True, load_in_4bit=True)
model = model.eval()
response, history = model.chat(tokenizer, "hello", history=[])
print(response)
response, history = model.chat(tokenizer, "please provide three suggestions about time management", history=history)
print(response)
messages = [
{"role": "system", "content": "You are an AI assistant whose name is InternLM."},
{"role": "user", "content": "Please tell me five scenic spots in Shanghai"},
]
tokenized_chat = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt")
generated_ids = model.generate(tokenized_chat, max_new_tokens=512)
generated_ids = [
output_ids[len(input_ids):] for input_ids, output_ids in zip(tokenized_chat, generated_ids)
]
response = tokenizer.batch_decode(generated_ids)[0]
```
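The snippet above runs wherever `from_pretrained` placed the model (the CPU by default when no `device_map` is given). If a GPU is available, you can place the model and inputs explicitly; a sketch reusing the variables from the block above:
```python
# Move the model and the tokenized prompt onto the GPU before generating.
if torch.cuda.is_available():
    model = model.cuda()
tokenized_chat = tokenized_chat.to(model.device)
generated_ids = model.generate(tokenized_chat, max_new_tokens=512)
```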
## Dialogue
You can interact with the InternLM3-8B-Instruct model through a frontend interface by running the following code:
```bash
pip install streamlit
pip install "transformers>=4.48"
streamlit run ./chat/web_demo.py
```


@@ -12,15 +12,29 @@
To load the InternLM model using Transformers, use the following code (you can change the model name to load a different model):
```python
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
tokenizer = AutoTokenizer.from_pretrained("internlm/internlm3-8b-instruct", trust_remote_code=True)
# Set `torch_dtype=torch.float16` to load the model in float16; otherwise it will be loaded in float32 and may run out of GPU memory on your hardware.
model = AutoModelForCausalLM.from_pretrained("internlm/internlm3-8b-instruct", device_map="auto", trust_remote_code=True, torch_dtype=torch.float16)
# (Optional) On low-resource devices, you can load a 4-bit or 8-bit quantized model via bitsandbytes to further save GPU memory.
# InternLM3 8B in 4-bit will cost nearly 8GB of GPU memory.
# pip install -U bitsandbytes
# 8-bit: model = AutoModelForCausalLM.from_pretrained("internlm/internlm3-8b-instruct", device_map="auto", trust_remote_code=True, load_in_8bit=True)
# 4-bit: model = AutoModelForCausalLM.from_pretrained("internlm/internlm3-8b-instruct", device_map="auto", trust_remote_code=True, load_in_4bit=True)
messages = [
    {"role": "system", "content": "You are an AI assistant whose name is InternLM."},
    {"role": "user", "content": "Please tell me five scenic spots in Shanghai"},
]
tokenized_chat = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt")
generated_ids = model.generate(tokenized_chat, max_new_tokens=512)
generated_ids = [
    output_ids[len(input_ids):] for input_ids, output_ids in zip(tokenized_chat, generated_ids)
]
response = tokenizer.batch_decode(generated_ids)[0]
```
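The decoded `response` above may still contain chat-format control tokens. If you only want the plain text, decoding with `skip_special_tokens=True` is a simple option:
```python
# Drop special tokens such as end-of-turn markers from the decoded reply.
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(response)
```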
### Import from ModelScope
@@ -28,24 +42,37 @@
To load the InternLM3-8B-Instruct model using ModelScope, use the following code (you can change the model name to load a different model):
```python
import torch
from modelscope import snapshot_download, AutoTokenizer, AutoModelForCausalLM
model_dir = snapshot_download('Shanghai_AI_Laboratory/internlm3-8b-instruct')
tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
# Set `torch_dtype=torch.float16` to load the model in float16; otherwise it will be loaded in float32 and may run out of GPU memory on your hardware.
model = AutoModelForCausalLM.from_pretrained(model_dir, device_map="auto", trust_remote_code=True, torch_dtype=torch.float16)
# (Optional) On low-resource devices, you can load a 4-bit or 8-bit quantized model via bitsandbytes to further save GPU memory.
# InternLM3 8B in 4-bit will cost nearly 8GB of GPU memory.
# pip install -U bitsandbytes
# 8-bit: model = AutoModelForCausalLM.from_pretrained(model_dir, device_map="auto", trust_remote_code=True, load_in_8bit=True)
# 4-bit: model = AutoModelForCausalLM.from_pretrained(model_dir, device_map="auto", trust_remote_code=True, load_in_4bit=True)
messages = [
    {"role": "system", "content": "You are an AI assistant whose name is InternLM."},
    {"role": "user", "content": "Please tell me five scenic spots in Shanghai"},
]
tokenized_chat = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt")
generated_ids = model.generate(tokenized_chat, max_new_tokens=512)
generated_ids = [
    output_ids[len(input_ids):] for input_ids, output_ids in zip(tokenized_chat, generated_ids)
]
response = tokenizer.batch_decode(generated_ids)[0]
```
## Dialogue
You can interact with the InternLM3-8B-Instruct model through a frontend interface by running the following code:
```bash
pip install streamlit
pip install "transformers>=4.48"
streamlit run ./web_demo.py
```