From ac7509389b85425ebc3900fa5c51637bdfa444c2 Mon Sep 17 00:00:00 2001
From: x54-729 <45304952+x54-729@users.noreply.github.com>
Date: Fri, 22 Dec 2023 21:57:14 +0800
Subject: [PATCH] fix(tools): set add_eos_token=True in tokenizer.py (#555)

---
 tools/tokenizer.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tools/tokenizer.py b/tools/tokenizer.py
index fc3800e..cf4ddec 100644
--- a/tools/tokenizer.py
+++ b/tools/tokenizer.py
@@ -10,7 +10,9 @@ model_path = os.path.join(current_dir, "V7_sft.model")
 sys.path.append(os.path.join(current_dir, "transformers"))
 from tokenization_internlm import InternLMTokenizer
 
-tokenizer = InternLMTokenizer(vocab_file=model_path)
+tokenizer = InternLMTokenizer(
+    vocab_file=model_path, add_bos_token=True, add_eos_token=True
+)
 
 
 def write_bin(context: str, bin_file) -> None: