From ac7509389b85425ebc3900fa5c51637bdfa444c2 Mon Sep 17 00:00:00 2001
From: x54-729 <45304952+x54-729@users.noreply.github.com>
Date: Fri, 22 Dec 2023 21:57:14 +0800
Subject: [PATCH] fix(tools): set add_eos_token=True in tokenizer.py (#555)

---
 tools/tokenizer.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tools/tokenizer.py b/tools/tokenizer.py
index fc3800e..cf4ddec 100644
--- a/tools/tokenizer.py
+++ b/tools/tokenizer.py
@@ -10,7 +10,9 @@ model_path = os.path.join(current_dir, "V7_sft.model")
 sys.path.append(os.path.join(current_dir, "transformers"))
 from tokenization_internlm import InternLMTokenizer
 
-tokenizer = InternLMTokenizer(vocab_file=model_path)
+tokenizer = InternLMTokenizer(
+    vocab_file=model_path, add_bos_token=True, add_eos_token=True
+)
 
 
 def write_bin(context: str, bin_file) -> None: