From 32cb74493a7a1554ab6b7475f57e4ec9e648ae84 Mon Sep 17 00:00:00 2001
From: Michelle <97082656+MichelleMa8@users.noreply.github.com>
Date: Thu, 18 Jan 2024 14:08:29 +0800
Subject: [PATCH] fix auto loading gpt2 tokenizer (#5279)

---
 applications/ColossalQA/colossalqa/local/llm.py | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/applications/ColossalQA/colossalqa/local/llm.py b/applications/ColossalQA/colossalqa/local/llm.py
index ff7346adc..0aa383e9d 100644
--- a/applications/ColossalQA/colossalqa/local/llm.py
+++ b/applications/ColossalQA/colossalqa/local/llm.py
@@ -136,6 +136,19 @@ class ColossalLLM(LLM):
         """Get the identifying parameters."""
         return {"n": self.n}
 
+    def get_token_ids(self, text: str) -> List[int]:
+        """Return the ordered ids of the tokens in a text.
+
+        Args:
+            text: The string input to tokenize.
+
+        Returns:
+            A list of ids corresponding to the tokens in the text, in order they occur
+                in the text.
+        """
+        # use the colossal llm's tokenizer instead of langchain's cached GPT2 tokenizer
+        return self.api.tokenizer.encode(text)
+
 
 class VllmLLM(LLM):
     """