update tokenizer

pull/51/head
gaoyang07 2023-07-10 19:24:21 +08:00
parent c18bec9361
commit dc8dd6ec4d
1 changed files with 50 additions and 100 deletions

View File

@ -1,24 +1,25 @@
import argparse import argparse
import json import json
import os import os
import sys
import warnings import warnings
import numpy as np import numpy as np
from sentencepiece import SentencePieceProcessor sys.path.append("tools/transformers")
from termcolor import colored from tokenization_internlm import InternLMTokenizer
current_dir = os.path.dirname(os.path.abspath(__file__)) current_dir = os.path.dirname(os.path.abspath(__file__))
model_path = os.path.join(current_dir, "V7.model") model_path = os.path.join(current_dir, "V7_sft.model")
tokenizer = SentencePieceProcessor(model_file=model_path) tokenizer = InternLMTokenizer(vocab_file=model_path)
def write_bin(context: str, path: str) -> None: def write_bin(context: str, bin_output_path: str) -> None:
""" """
Write bin file. Write bin file based on the context.
Args: Args:
context (str): the context of raw file. context (str): the context of raw file.
path (str): the path for output bin file. bin_output_path (str): the path for output bin file.
Example: Example:
>>> write_bin("今天天气晴朗适合出门散步", "out.bin") # the output file format is 'txt' >>> write_bin("今天天气晴朗适合出门散步", "out.bin") # the output file format is 'txt'
@ -34,20 +35,20 @@ def write_bin(context: str, path: str) -> None:
saved_bin = str.encode(json.dumps(data) + "\n") saved_bin = str.encode(json.dumps(data) + "\n")
# write bytes into bin path # write bytes into bin path
with open(path, "ab") as f: with open(bin_output_path, "ab") as f:
f.write(saved_bin) f.write(saved_bin)
def prepare_meta(bin_file_path: str): def prepare_meta(bin_output_path: str):
""" """
Prepare metadata for the given bin file. Prepare metadata for the given bin file.
Args: Args:
bin_file_path (str): the bin file path. bin_output_path (str): Output bin file path.
""" """
meta = [] meta = []
cur = 0 cur = 0
with open(bin_file_path, "rb") as f: with open(bin_output_path, "rb") as f:
while True: while True:
# read lines # read lines
line = f.readline() line = f.readline()
@ -62,109 +63,67 @@ def prepare_meta(bin_file_path: str):
meta.append((cur, length)) meta.append((cur, length))
# update the cur to generate the meta information of next line # update the cur to generate the meta information of next line
cur += len(line) cur += len(line)
print(meta)
# define path of the generated meta file # define path of the generated meta file
meta_fp = bin_file_path + ".meta" meta_fp = bin_output_path + ".meta"
# save the generated meta information # save the generated meta information
with open(meta_fp, "wb") as f: with open(meta_fp, "wb") as f:
meta = np.array(meta, dtype=np.int32) meta = np.array(meta, dtype=np.int32)
np.save(f, meta) np.save(f, meta)
def txt2bin(txt_file_path: str, bin_file_path: str): def text2bin(text_input_path: str, bin_output_path: str):
""" """
Read content from txt file and write to bin file Read content from the input file and write to bin file.
Currently support 3 input formats: 'txt', 'json' and 'jsonl'.
Args: Args:
txt_file_path (str): txt file path. text_input_path (str): txt file path.
bin_file_path (str): output bin file path. bin_output_path (str): output bin file path.
""" """
# Check if the txt file exists # Check if the txt file exists
if not os.path.isfile(txt_file_path): if not os.path.isfile(text_input_path):
warnings.warn(colored(f"{txt_file_path} does not exist.", "red")) warnings.warn(f"{text_input_path} does not exist.")
return return
try: file_format = text_input_path.split(".")[-1]
# Open the text file assert file_format in ['txt', 'json', 'jsonl'], \
with open(txt_file_path, "r") as txt_file: print("Invalid input file type. Currently support `txt`, `json` and `jsonl`.")
for line in txt_file:
with open(text_input_path, "r") as text_file:
if file_format == 'txt':
for line in text_file:
# Strip any leading/trailing whitespace # Strip any leading/trailing whitespace
stripped_line = line.strip() stripped_line = line.strip()
if stripped_line: if stripped_line:
# Pass each line to the write_bin function # Pass each line to the write_bin function
write_bin(stripped_line, bin_file_path) write_bin(stripped_line, bin_output_path)
print(colored(f"Successfully converted {txt_file_path} to {bin_file_path}", "green")) elif file_format == 'json':
data = json.load(text_file)
except Exception as e: # assuming data is a list of dictionaries
print(colored(f"Error while converting {txt_file_path} to {bin_file_path}: {str(e)}", "red")) for record in data:
# the type of record is dict, transfer the dict into str
context = json.dumps(record)
def json2bin(json_file_path: str, bin_file_path: str):
"""
Read content from json file and write to bin file
Args:
json_file_path (str): json file path.
bin_file_path (str): output bin file path.
"""
if not os.path.isfile(json_file_path):
warnings.warn(colored(f"{json_file_path} does not exist.", "red"))
return
try:
# load json file
with open(json_file_path, "r") as json_file:
data = json.load(json_file)
# assuming data is a list of dictionaries
for record in data:
# the type of record is dict, transfer the dict into str
context = json.dumps(record)
# encode the str and write into bin
write_bin(context, bin_file_path)
print(colored(f"Successfully converted {json_file_path} to {bin_file_path}", "green"))
except Exception as e:
print(colored(f"Error while converting {json_file_path} to {bin_file_path}: {str(e)}", "red"))
def jsonl2bin(jsonl_file_path: str, bin_file_path: str):
"""
Read content from jsonl file and write to bin file
Args:
jsonl_file_path: jsonl file path.
bin_file_path: bin file path.
"""
if not os.path.isfile(jsonl_file_path):
warnings.warn(colored(f"{jsonl_file_path} does not exist.", "red"))
return
try:
with open(jsonl_file_path, "r") as jsonl_file:
for line in jsonl_file:
# encode the str and write into bin # encode the str and write into bin
write_bin(line, bin_file_path) write_bin(context, bin_output_path)
print(colored(f"Successfully converted {jsonl_file_path} to {bin_file_path}", "green")) elif file_format == 'jsonl':
for line in text_file:
except Exception as e: # encode the str and write into bin
print(colored(f"Error while converting {jsonl_file_path} to {bin_file_path}: {str(e)}", "red")) write_bin(line, bin_output_path)
def parse_args(): def parse_args():
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.add_argument("--raw_data_name", required=True, help="Input file name")
parser.add_argument( parser.add_argument(
"--input_file_type", "--text_input_path",
choices=["txt", "json", "jsonl"], type=str,
required=True, required=True,
help="Input file format (either txt, json or jsonl)", help="Path to the input text file.",
) )
parser.add_argument("--bin", required=True, help="Path to the output bin file") parser.add_argument(
"--bin_output_path", type=str, required=True, help="Path to the output bin file.")
return parser.parse_args() return parser.parse_args()
@ -173,21 +132,12 @@ def main():
# parse arguments # parse arguments
args = parse_args() args = parse_args()
# obtain the raw data path text2bin(args.text_input_path, args.bin_output_path)
input_file_path = f"{args.raw_data_name}.{args.input_file_type}" print(f"Successfully converted {args.text_input_path} to {args.bin_output_path}")
# different methods for different raw data type, we only support "txt", "json" and "jsonl" data type.
if args.input_file_type == "txt":
txt2bin(input_file_path, args.bin)
elif args.input_file_type == "json":
json2bin(input_file_path, args.bin)
elif args.input_file_type == "jsonl":
jsonl2bin(input_file_path, args.bin)
else:
print(colored("Invalid input file type. Use --help for more information.", "red"))
# To avoid potential read/write errors, the metadata preparation follows after creating the .bin file. # To avoid potential read/write errors, the metadata preparation follows after creating the .bin file.
prepare_meta(args.bin) prepare_meta(args.bin_output_path)
print(f"Successfully generated {args.bin_output_path}.meta")
if __name__ == "__main__": if __name__ == "__main__":