From dc8dd6ec4d6f00e081de2329f495360ba511bd3e Mon Sep 17 00:00:00 2001 From: gaoyang07 Date: Mon, 10 Jul 2023 19:24:21 +0800 Subject: [PATCH] update tokenizer --- tools/tokenizer.py | 150 +++++++++++++++------------------------------ 1 file changed, 50 insertions(+), 100 deletions(-) diff --git a/tools/tokenizer.py b/tools/tokenizer.py index 44211a1..bc92ea4 100644 --- a/tools/tokenizer.py +++ b/tools/tokenizer.py @@ -1,24 +1,25 @@ import argparse import json import os +import sys import warnings import numpy as np -from sentencepiece import SentencePieceProcessor -from termcolor import colored +sys.path.append("tools/transformers") +from tokenization_internlm import InternLMTokenizer current_dir = os.path.dirname(os.path.abspath(__file__)) -model_path = os.path.join(current_dir, "V7.model") -tokenizer = SentencePieceProcessor(model_file=model_path) +model_path = os.path.join(current_dir, "V7_sft.model") +tokenizer = InternLMTokenizer(vocab_file=model_path) -def write_bin(context: str, path: str) -> None: +def write_bin(context: str, bin_output_path: str) -> None: """ - Write bin file. + Write bin file based on the context. Args: context (str): the context of raw file. - path (str): the path for output bin file. + bin_output_path (str): the path for output bin file. Example: >>> write_bin("今天天气晴朗适合出门散步", "out.bin") # the output file format is 'txt' @@ -34,20 +35,20 @@ def write_bin(context: str, path: str) -> None: saved_bin = str.encode(json.dumps(data) + "\n") # write bytes into bin path - with open(path, "ab") as f: + with open(bin_output_path, "ab") as f: f.write(saved_bin) -def prepare_meta(bin_file_path: str): +def prepare_meta(bin_output_path: str): """ Prepare metadata for the given bin file. Args: - bin_file_path (str): the bin file path. + bin_output_path (str): Output bin file path. """ meta = [] cur = 0 - with open(bin_file_path, "rb") as f: + with open(bin_output_path, "rb") as f: while True: # read lines line = f.readline() @@ -62,109 +63,67 @@ def prepare_meta(bin_file_path: str): meta.append((cur, length)) # update the cur to generate the meta information of next line cur += len(line) - print(meta) + # define path of the generated meta file - meta_fp = bin_file_path + ".meta" + meta_fp = bin_output_path + ".meta" # save the generated meta information with open(meta_fp, "wb") as f: meta = np.array(meta, dtype=np.int32) np.save(f, meta) -def txt2bin(txt_file_path: str, bin_file_path: str): +def text2bin(text_input_path: str, bin_output_path: str): """ - Read content from txt file and write to bin file + Read content from the input file and write to bin file. + Currently support 3 input formats: 'txt', 'json' and 'jsonl'. Args: - txt_file_path (str): txt file path. - bin_file_path (str): output bin file path. + text_input_path (str): txt file path. + bin_output_path (str): output bin file path. """ # Check if the txt file exists - if not os.path.isfile(txt_file_path): - warnings.warn(colored(f"{txt_file_path} does not exist.", "red")) + if not os.path.isfile(text_input_path): + warnings.warn(f"{text_input_path} does not exist.") return - try: - # Open the text file - with open(txt_file_path, "r") as txt_file: - for line in txt_file: + file_format = text_input_path.split(".")[-1] + assert file_format in ['txt', 'json', 'jsonl'], \ + print("Invalid input file type. Currently support `txt`, `json` and `jsonl`.") + + with open(text_input_path, "r") as text_file: + if file_format == 'txt': + for line in text_file: # Strip any leading/trailing whitespace stripped_line = line.strip() if stripped_line: # Pass each line to the write_bin function - write_bin(stripped_line, bin_file_path) + write_bin(stripped_line, bin_output_path) - print(colored(f"Successfully converted {txt_file_path} to {bin_file_path}", "green")) - - except Exception as e: - print(colored(f"Error while converting {txt_file_path} to {bin_file_path}: {str(e)}", "red")) - - -def json2bin(json_file_path: str, bin_file_path: str): - """ - Read content from json file and write to bin file - - Args: - json_file_path (str): json file path. - bin_file_path (str): output bin file path. - """ - - if not os.path.isfile(json_file_path): - warnings.warn(colored(f"{json_file_path} does not exist.", "red")) - return - - try: - # load json file - with open(json_file_path, "r") as json_file: - data = json.load(json_file) - # assuming data is a list of dictionaries - for record in data: - # the type of record is dict, transfer the dict into str - context = json.dumps(record) - # encode the str and write into bin - write_bin(context, bin_file_path) - - print(colored(f"Successfully converted {json_file_path} to {bin_file_path}", "green")) - - except Exception as e: - print(colored(f"Error while converting {json_file_path} to {bin_file_path}: {str(e)}", "red")) - - -def jsonl2bin(jsonl_file_path: str, bin_file_path: str): - """ - Read content from jsonl file and write to bin file - - Args: - jsonl_file_path: jsonl file path. - bin_file_path: bin file path. - """ - - if not os.path.isfile(jsonl_file_path): - warnings.warn(colored(f"{jsonl_file_path} does not exist.", "red")) - return - - try: - with open(jsonl_file_path, "r") as jsonl_file: - for line in jsonl_file: + elif file_format == 'json': + data = json.load(text_file) + # assuming data is a list of dictionaries + for record in data: + # the type of record is dict, transfer the dict into str + context = json.dumps(record) # encode the str and write into bin - write_bin(line, bin_file_path) - - print(colored(f"Successfully converted {jsonl_file_path} to {bin_file_path}", "green")) - - except Exception as e: - print(colored(f"Error while converting {jsonl_file_path} to {bin_file_path}: {str(e)}", "red")) + write_bin(context, bin_output_path) + + elif file_format == 'jsonl': + for line in text_file: + # encode the str and write into bin + write_bin(line, bin_output_path) def parse_args(): parser = argparse.ArgumentParser() - parser.add_argument("--raw_data_name", required=True, help="Input file name") parser.add_argument( - "--input_file_type", - choices=["txt", "json", "jsonl"], + "--text_input_path", + type=str, required=True, - help="Input file format (either txt, json or jsonl)", + help="Path to the input text file.", ) - parser.add_argument("--bin", required=True, help="Path to the output bin file") + parser.add_argument( + "--bin_output_path", type=str, required=True, help="Path to the output bin file.") return parser.parse_args() @@ -173,21 +132,12 @@ def main(): # parse arguments args = parse_args() - # obtain the raw data path - input_file_path = f"{args.raw_data_name}.{args.input_file_type}" - - # different methods for different raw data type, we only support "txt", "json" and "jsonl" data type. - if args.input_file_type == "txt": - txt2bin(input_file_path, args.bin) - elif args.input_file_type == "json": - json2bin(input_file_path, args.bin) - elif args.input_file_type == "jsonl": - jsonl2bin(input_file_path, args.bin) - else: - print(colored("Invalid input file type. Use --help for more information.", "red")) + text2bin(args.text_input_path, args.bin_output_path) + print(f"Successfully converted {args.text_input_path} to {args.bin_output_path}") # To avoid potential read/write errors, the metadata preparation follows after creating the .bin file. - prepare_meta(args.bin) + prepare_meta(args.bin_output_path) + print(f"Successfully generated {args.bin_output_path}.meta") if __name__ == "__main__":