diff --git a/README-ja-JP.md b/README-ja-JP.md
index ac1efba..756f560 100644
--- a/README-ja-JP.md
+++ b/README-ja-JP.md
@@ -85,22 +85,25 @@ InternLM 7B と InternLM 7B チャットは、InternLM を使って訓練され
 Transformers を使用して InternLM 7B チャットモデルをロードするには、以下のコードを使用します:
 
 ```python
->>> from transformers import AutoTokenizer, AutoModelForCausalLM
->>> tokenizer = AutoTokenizer.from_pretrained("internlm/internlm-chat-7b-v1_1", trust_remote_code=True)
->>> model = AutoModelForCausalLM.from_pretrained("internlm/internlm-chat-7b-v1_1", trust_remote_code=True).cuda()
->>> model = model.eval()
->>> response, history = model.chat(tokenizer, "こんにちは", history=[])
->>> print(response)
-こんにちは！どのようにお手伝いできますか？
->>> response, history = model.chat(tokenizer, "時間管理について3つの提案をお願いします", history=history)
->>> print(response)
-もちろんです！以下に簡潔な形で時間管理に関する3つの提案を示します。
+import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM
+tokenizer = AutoTokenizer.from_pretrained("internlm/internlm-chat-7b-v1_1", trust_remote_code=True)
+# `torch_dtype=torch.float16` を指定すると、モデルを float16 精度でロードできます。指定しない場合、transformers はモデルを float32 精度でロードするため、GPU メモリが不足する可能性があります。
+model = AutoModelForCausalLM.from_pretrained("internlm/internlm-chat-7b-v1_1", torch_dtype=torch.float16, trust_remote_code=True).cuda()
+model = model.eval()
+response, history = model.chat(tokenizer, "こんにちは", history=[])
+print(response)
+# こんにちは！どのようにお手伝いできますか？
 
-1. To-Doリストを作成し、優先順位を付ける: タスクを明確にリストアップし、それぞれの優先度を判断しましょう。重要で緊急なタスクから順に取り組むことで、効率的に作業を進めることができます。
-2. 時間のブロック化を実践する: 作業を特定の時間枠に集中させるため、時間をブロック化しましょう。例えば、朝の2時間をメール対応に割り当て、午後の3時間をプロジェクトに集中するなど、タスクごとに時間を確保することが効果的です。
-3. ディストラクションを排除する: 集中力を保つために、ディストラクションを最小限に抑えましょう。通知をオフにし、SNSやメールに気を取られないようにすることで、作業効率を向上させることができます。
-
-これらの提案を実践することで、時間管理のスキルを向上させ、効果的に日々のタスクをこなしていくことができます。
+response, history = model.chat(tokenizer, "時間管理について3つの提案をお願いします", history=history)
+print(response)
+# もちろんです！以下に簡潔な形で時間管理に関する3つの提案を示します。
+#
+# 1. To-Doリストを作成し、優先順位を付ける: タスクを明確にリストアップし、それぞれの優先度を判断しましょう。重要で緊急なタスクから順に取り組むことで、効率的に作業を進めることができます。
+# 2. 時間のブロック化を実践する: 作業を特定の時間枠に集中させるため、時間をブロック化しましょう。例えば、朝の2時間をメール対応に割り当て、午後の3時間をプロジェクトに集中するなど、タスクごとに時間を確保することが効果的です。
+# 3. ディストラクションを排除する: 集中力を保つために、ディストラクションを最小限に抑えましょう。通知をオフにし、SNSやメールに気を取られないようにすることで、作業効率を向上させることができます。
+#
+# これらの提案を実践することで、時間管理のスキルを向上させ、効果的に日々のタスクをこなしていくことができます。
 ```
 
 ストリーミング生成を行いたい場合は、「stream_chat」関数を使用できます。
@@ -109,7 +112,8 @@ Transformers を使用して InternLM 7B チャットモデルをロードする
 from transformers import AutoModelForCausalLM, AutoTokenizer
+import torch
 
 model_path = "internlm/internlm-chat-7b"
-model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True)
+model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.float16, trust_remote_code=True)
 tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
 
 model = model.eval()
diff --git a/README-zh-Hans.md b/README-zh-Hans.md
index 764eeca..035b57b 100644
--- a/README-zh-Hans.md
+++ b/README-zh-Hans.md
@@ -163,19 +163,22 @@ InternLM-7B 包含了一个拥有70亿参数的基础模型和一个为实际场
 通过以下的代码从 Transformers 加载 InternLM 模型 （可修改模型名称替换不同的模型）
 
 ```python
->>> from transformers import AutoTokenizer, AutoModelForCausalLM
->>> tokenizer = AutoTokenizer.from_pretrained("internlm/internlm-chat-7b", trust_remote_code=True)
->>> model = AutoModelForCausalLM.from_pretrained("internlm/internlm-chat-7b", trust_remote_code=True).cuda()
->>> model = model.eval()
->>> response, history = model.chat(tokenizer, "你好", history=[])
->>> print(response)
-你好！有什么我可以帮助你的吗？
->>> response, history = model.chat(tokenizer, "请提供三个管理时间的建议。", history=history)
->>> print(response)
-当然可以！以下是三个管理时间的建议：
-1. 制定计划：制定一个详细的计划，包括每天要完成的任务和活动。这将有助于您更好地组织时间，并确保您能够按时完成任务。
-2. 优先级：将任务按照优先级排序，先完成最重要的任务。这将确保您能够在最短的时间内完成最重要的任务，从而节省时间。
-3. 集中注意力：避免分心，集中注意力完成任务。关闭社交媒体和电子邮件通知，专注于任务，这将帮助您更快地完成任务，并减少错误的可能性。
+import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM
+tokenizer = AutoTokenizer.from_pretrained("internlm/internlm-chat-7b", trust_remote_code=True)
+# `torch_dtype=torch.float16` 可以令模型以 float16 精度加载，否则 transformers 会将模型加载为 float32 格式，有可能导致显存不足
+model = AutoModelForCausalLM.from_pretrained("internlm/internlm-chat-7b", torch_dtype=torch.float16, trust_remote_code=True).cuda()
+model = model.eval()
+response, history = model.chat(tokenizer, "你好", history=[])
+print(response)
+# 你好！有什么我可以帮助你的吗？
+
+response, history = model.chat(tokenizer, "请提供三个管理时间的建议。", history=history)
+print(response)
+# 当然可以！以下是三个管理时间的建议：
+# 1. 制定计划：制定一个详细的计划，包括每天要完成的任务和活动。这将有助于您更好地组织时间，并确保您能够按时完成任务。
+# 2. 优先级：将任务按照优先级排序，先完成最重要的任务。这将确保您能够在最短的时间内完成最重要的任务，从而节省时间。
+# 3. 集中注意力：避免分心，集中注意力完成任务。关闭社交媒体和电子邮件通知，专注于任务，这将帮助您更快地完成任务，并减少错误的可能性。
 ```
 
 如果想进行流式生成，则可以使用 `stream_chat` 接口：
@@ -184,7 +187,8 @@ InternLM-7B 包含了一个拥有70亿参数的基础模型和一个为实际场
 from transformers import AutoModelForCausalLM, AutoTokenizer
+import torch
 
 model_path = "internlm/internlm-chat-7b"
-model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True)
+model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.float16, trust_remote_code=True)
 tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
 
 model = model.eval()
@@ -201,9 +205,9 @@ for response, history in model.stream_chat(tokenizer, "你好", history=[]):
 ```python
 from modelscope import snapshot_download, AutoTokenizer, AutoModelForCausalLM
 import torch
-model_dir = snapshot_download('Shanghai_AI_Laboratory/internlm-chat-7b-v1_1', revision='v1.0.0')
-tokenizer = AutoTokenizer.from_pretrained(model_dir, device_map="auto", trust_remote_code=True,torch_dtype=torch.float16)
-model = AutoModelForCausalLM.from_pretrained(model_dir,device_map="auto",  trust_remote_code=True,torch_dtype=torch.float16)
+model_dir = snapshot_download('Shanghai_AI_Laboratory/internlm-chat-7b-v1_1', revision='v1.0.2')
+tokenizer = AutoTokenizer.from_pretrained(model_dir, device_map="auto", trust_remote_code=True)
+model = AutoModelForCausalLM.from_pretrained(model_dir,device_map="auto", trust_remote_code=True, torch_dtype=torch.float16)
 model = model.eval()
 response, history = model.chat(tokenizer, "hello", history=[])
 print(response)
diff --git a/README.md b/README.md
index eb21166..866e33c 100644
--- a/README.md
+++ b/README.md
@@ -157,22 +157,25 @@ We conducted a comprehensive evaluation of InternLM using the open-source evalua
 To load the InternLM 7B Chat model using Transformers, use the following code:
 
 ```python
->>> from transformers import AutoTokenizer, AutoModelForCausalLM
->>> tokenizer = AutoTokenizer.from_pretrained("internlm/internlm-chat-7b-v1_1", trust_remote_code=True)
->>> model = AutoModelForCausalLM.from_pretrained("internlm/internlm-chat-7b-v1_1", trust_remote_code=True).cuda()
->>> model = model.eval()
->>> response, history = model.chat(tokenizer, "hello", history=[])
->>> print(response)
-Hello! How can I help you today?
->>> response, history = model.chat(tokenizer, "please provide three suggestions about time management", history=history)
->>> print(response)
-Sure, here are three tips for effective time management:
+import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM
+tokenizer = AutoTokenizer.from_pretrained("internlm/internlm-chat-7b-v1_1", trust_remote_code=True)
+# Set `torch_dtype=torch.float16` to load the model in float16; otherwise it will be loaded in float32, which might cause an out-of-memory (OOM) error.
+model = AutoModelForCausalLM.from_pretrained("internlm/internlm-chat-7b-v1_1", torch_dtype=torch.float16, trust_remote_code=True).cuda()
+model = model.eval()
+response, history = model.chat(tokenizer, "hello", history=[])
+print(response)
+# Hello! How can I help you today?
 
-1. Prioritize tasks based on importance and urgency: Make a list of all your tasks and categorize them into "important and urgent," "important but not urgent," and "not important but urgent." Focus on completing the tasks in the first category before moving on to the others.
-2. Use a calendar or planner: Write down deadlines and appointments in a calendar or planner so you don't forget them. This will also help you schedule your time more effectively and avoid overbooking yourself.
-3. Minimize distractions: Try to eliminate any potential distractions when working on important tasks. Turn off notifications on your phone, close unnecessary tabs on your computer, and find a quiet place to work if possible.
-
-Remember, good time management skills take practice and patience. Start with small steps and gradually incorporate these habits into your daily routine.
+response, history = model.chat(tokenizer, "please provide three suggestions about time management", history=history)
+print(response)
+# Sure, here are three tips for effective time management:
+#
+# 1. Prioritize tasks based on importance and urgency: Make a list of all your tasks and categorize them into "important and urgent," "important but not urgent," and "not important but urgent." Focus on completing the tasks in the first category before moving on to the others.
+# 2. Use a calendar or planner: Write down deadlines and appointments in a calendar or planner so you don't forget them. This will also help you schedule your time more effectively and avoid overbooking yourself.
+# 3. Minimize distractions: Try to eliminate any potential distractions when working on important tasks. Turn off notifications on your phone, close unnecessary tabs on your computer, and find a quiet place to work if possible.
+#
+# Remember, good time management skills take practice and patience. Start with small steps and gradually incorporate these habits into your daily routine.
 ```
 
 The responses can be streamed using `stream_chat`:
@@ -181,7 +184,8 @@ The responses can be streamed using `stream_chat`:
 from transformers import AutoModelForCausalLM, AutoTokenizer
+import torch
 
 model_path = "internlm/internlm-chat-7b"
-model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True)
+model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.float16, trust_remote_code=True)
 tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
 
 model = model.eval()
@@ -198,9 +202,9 @@ To load the InternLM model using ModelScope, use the following code:
 ```python
 from modelscope import snapshot_download, AutoTokenizer, AutoModelForCausalLM
 import torch
-model_dir = snapshot_download('Shanghai_AI_Laboratory/internlm-chat-7b-v1_1', revision='v1.0.0')
-tokenizer = AutoTokenizer.from_pretrained(model_dir, device_map="auto", trust_remote_code=True,torch_dtype=torch.float16)
-model = AutoModelForCausalLM.from_pretrained(model_dir,device_map="auto",  trust_remote_code=True,torch_dtype=torch.float16)
+model_dir = snapshot_download('Shanghai_AI_Laboratory/internlm-chat-7b-v1_1', revision='v1.0.2')
+tokenizer = AutoTokenizer.from_pretrained(model_dir, device_map="auto", trust_remote_code=True)
+model = AutoModelForCausalLM.from_pretrained(model_dir,device_map="auto",  trust_remote_code=True, torch_dtype=torch.float16)
 model = model.eval()
 response, history = model.chat(tokenizer, "hello", history=[])
 print(response)