You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
ColossalAI/applications/ColossalQA/colossalqa/text_splitter/utils.py

20 lines
608 B

import re
def remove_format(text: str) -> str:
# if the accout of \t, \r, \v, \f is less than 3, replace \t, \r, \v, \f with space
if len(re.findall(r"\s", text.replace(" ", ""))) > 3:
# in case this is a line of a table
return text
return re.sub(r"\s", " ", text)
# remove newlines
def get_cleaned_paragraph(s: str) -> str:
text = str(s)
text = re.sub(r"\n{3,}", r"\n", text) # replace \n\n\n... with \n
text = re.sub("\n\n", "", text)
lines = text.split("\n")
lines_remove_format = [remove_format(line) for line in lines]
return lines_remove_format