From 2538a19927ccf9a4cbbff27305444ed03c48aa63 Mon Sep 17 00:00:00 2001
From: x54-729 <17307130121@fudan.edu.cn>
Date: Tue, 17 Oct 2023 16:54:51 +0800
Subject: [PATCH] fix InternLMTokenizer to fit transformers==4.34.0

---
 tools/transformers/tokenization_internlm.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/tools/transformers/tokenization_internlm.py b/tools/transformers/tokenization_internlm.py
index b6a3489..5ce1e66 100644
--- a/tools/transformers/tokenization_internlm.py
+++ b/tools/transformers/tokenization_internlm.py
@@ -65,6 +65,13 @@ class InternLMTokenizer(PreTrainedTokenizer):
         **kwargs,
     ):
         self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
+        self.vocab_file = vocab_file
+        self.add_bos_token = add_bos_token
+        self.add_eos_token = add_eos_token
+        self.decode_with_prefix_space = decode_with_prefix_space
+        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
+        self.sp_model.Load(vocab_file)
+        self._no_prefix_space_tokens = None
         super().__init__(
             bos_token=bos_token,
             eos_token=eos_token,
@@ -73,15 +80,8 @@ class InternLMTokenizer(PreTrainedTokenizer):
             clean_up_tokenization_spaces=clean_up_tokenization_spaces,
             **kwargs,
         )
-        self.vocab_file = vocab_file
-        self.add_bos_token = add_bos_token
-        self.add_eos_token = add_eos_token
-        self.decode_with_prefix_space = decode_with_prefix_space
-        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
-        self.sp_model.Load(vocab_file)
-        self._no_prefix_space_tokens = None
 
-        """ Initialisation"""
+        """ Initialization"""
 
     @property
     def no_prefix_space_tokens(self):