fix auto loading gpt2 tokenizer (#5279)

pull/5284/head
Michelle 2024-01-18 14:08:29 +08:00 committed by GitHub
parent 5d9a0ae75b
commit 32cb74493a
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
1 changed file with 13 additions and 0 deletions

View File

@ -136,6 +136,19 @@ class ColossalLLM(LLM):
"""Get the identifying parameters."""
return {"n": self.n}
def get_token_ids(self, text: str) -> List[int]:
    """Tokenize ``text`` and return the resulting token ids.

    Args:
        text: The string input to tokenize.

    Returns:
        The value of ``self.api.tokenizer.encode(text)``: the ids of the
        tokens found in ``text``, in the order they occur.
    """
    # Go through the Colossal LLM's own tokenizer rather than langchain's
    # cached GPT2 tokenizer.
    tokenizer = self.api.tokenizer
    token_ids = tokenizer.encode(text)
    return token_ids
class VllmLLM(LLM):
"""