From 8c40539f6fd50a15c11d8ff1782d265a4eb2c0af Mon Sep 17 00:00:00 2001 From: x54-729 Date: Fri, 22 Dec 2023 21:08:26 +0800 Subject: [PATCH] add bos&eos in tools/tokenizer --- tools/tokenizer.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/tokenizer.py b/tools/tokenizer.py index fc3800e..cf4ddec 100644 --- a/tools/tokenizer.py +++ b/tools/tokenizer.py @@ -10,7 +10,9 @@ model_path = os.path.join(current_dir, "V7_sft.model") sys.path.append(os.path.join(current_dir, "transformers")) from tokenization_internlm import InternLMTokenizer -tokenizer = InternLMTokenizer(vocab_file=model_path) +tokenizer = InternLMTokenizer( + vocab_file=model_path, add_bos_token=True, add_eos_token=True +) def write_bin(context: str, bin_file) -> None: