Browse Source

fix auto loading gpt2 tokenizer (#5279)

pull/5284/head
Michelle 10 months ago committed by GitHub
parent
commit
32cb74493a
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
  1. 13
      applications/ColossalQA/colossalqa/local/llm.py

13
applications/ColossalQA/colossalqa/local/llm.py

@ -136,6 +136,19 @@ class ColossalLLM(LLM):
"""Get the identifying parameters."""
return {"n": self.n}
def get_token_ids(self, text: str) -> List[int]:
    """Tokenize ``text`` and return the resulting token ids in order.

    The ids come from the Colossal LLM's own tokenizer
    (``self.api.tokenizer``) instead of langchain's cached GPT2 tokenizer.

    Args:
        text: The string input to tokenize.

    Returns:
        Whatever ``self.api.tokenizer.encode(text)`` produces: the ids of
        the tokens in ``text``, in the order they occur.
    """
    tokenizer = self.api.tokenizer
    token_ids = tokenizer.encode(text)
    return token_ids
class VllmLLM(LLM):
"""

Loading…
Cancel
Save