From dc8dd6ec4d6f00e081de2329f495360ba511bd3e Mon Sep 17 00:00:00 2001
From: gaoyang07 <Gary1546308416AL@gmail.com>
Date: Mon, 10 Jul 2023 19:24:21 +0800
Subject: [PATCH] update tokenizer

---
 tools/tokenizer.py | 150 +++++++++++++++------------------------------
 1 file changed, 50 insertions(+), 100 deletions(-)

diff --git a/tools/tokenizer.py b/tools/tokenizer.py
index 44211a1..bc92ea4 100644
--- a/tools/tokenizer.py
+++ b/tools/tokenizer.py
@@ -1,24 +1,25 @@
 import argparse
 import json
 import os
+import sys
 import warnings
 
 import numpy as np
-from sentencepiece import SentencePieceProcessor
-from termcolor import colored
+sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), "transformers"))  # not CWD-relative
+from tokenization_internlm import InternLMTokenizer
 
 current_dir = os.path.dirname(os.path.abspath(__file__))
-model_path = os.path.join(current_dir, "V7.model")
-tokenizer = SentencePieceProcessor(model_file=model_path)
+model_path = os.path.join(current_dir, "V7_sft.model")
+tokenizer = InternLMTokenizer(vocab_file=model_path)
 
 
-def write_bin(context: str, path: str) -> None:
+def write_bin(context: str, bin_output_path: str) -> None:
     """
-    Write bin file.
+    Write bin file based on the context.
 
     Args:
         context (str): the context of raw file.
-        path (str): the path for output bin file.
+        bin_output_path (str): the path for output bin file.
 
     Example:
     >>> write_bin("今天天气晴朗适合出门散步", "out.bin") # the output file format is 'txt'
@@ -34,20 +35,20 @@ def write_bin(context: str, path: str) -> None:
     saved_bin = str.encode(json.dumps(data) + "\n")
 
     # write bytes into bin path
-    with open(path, "ab") as f:
+    with open(bin_output_path, "ab") as f:
         f.write(saved_bin)
 
 
-def prepare_meta(bin_file_path: str):
+def prepare_meta(bin_output_path: str):
     """
     Prepare metadata for the given bin file.
 
     Args:
-        bin_file_path (str): the bin file path.
+        bin_output_path (str): Output bin file path.
     """
     meta = []
     cur = 0
-    with open(bin_file_path, "rb") as f:
+    with open(bin_output_path, "rb") as f:
         while True:
             # read lines
             line = f.readline()
@@ -62,109 +63,67 @@ def prepare_meta(bin_file_path: str):
             meta.append((cur, length))
             # update the cur to generate the meta information of next line
             cur += len(line)
-    print(meta)
+
     # define path of the generated meta file
-    meta_fp = bin_file_path + ".meta"
+    meta_fp = bin_output_path + ".meta"
     # save the generated meta information
     with open(meta_fp, "wb") as f:
         meta = np.array(meta, dtype=np.int32)
         np.save(f, meta)
 
 
-def txt2bin(txt_file_path: str, bin_file_path: str):
+def text2bin(text_input_path: str, bin_output_path: str):
     """
-    Read content from txt file and write to bin file
+    Read content from the input file and write it to the bin file via write_bin.
+    Format is taken from the file extension ('txt', 'json', 'jsonl'); others raise ValueError.
 
     Args:
-        txt_file_path (str): txt file path.
-        bin_file_path (str): output bin file path.
+        text_input_path (str): input file path; its extension selects how the file is parsed.
+        bin_output_path (str): output bin file path.
     """
     # Check if the txt file exists
-    if not os.path.isfile(txt_file_path):
-        warnings.warn(colored(f"{txt_file_path} does not exist.", "red"))
+    if not os.path.isfile(text_input_path):
+        warnings.warn(f"{text_input_path} does not exist.")
         return
 
-    try:
-        # Open the text file
-        with open(txt_file_path, "r") as txt_file:
-            for line in txt_file:
+    file_format = text_input_path.split(".")[-1]
+    if file_format not in ['txt', 'json', 'jsonl']:
+        raise ValueError("Invalid input file type. Currently support `txt`, `json` and `jsonl`.")
+
+    with open(text_input_path, "r", encoding="utf-8") as text_file:
+        if file_format == 'txt':
+            for line in text_file:
                 # Strip any leading/trailing whitespace
                 stripped_line = line.strip()
                 if stripped_line:
                     # Pass each line to the write_bin function
-                    write_bin(stripped_line, bin_file_path)
+                    write_bin(stripped_line, bin_output_path)
 
-        print(colored(f"Successfully converted {txt_file_path} to {bin_file_path}", "green"))
-
-    except Exception as e:
-        print(colored(f"Error while converting {txt_file_path} to {bin_file_path}: {str(e)}", "red"))
-
-
-def json2bin(json_file_path: str, bin_file_path: str):
-    """
-    Read content from json file and write to bin file
-
-    Args:
-        json_file_path (str): json file path.
-        bin_file_path (str): output bin file path.
-    """
-
-    if not os.path.isfile(json_file_path):
-        warnings.warn(colored(f"{json_file_path} does not exist.", "red"))
-        return
-
-    try:
-        # load json file
-        with open(json_file_path, "r") as json_file:
-            data = json.load(json_file)
-        # assuming data is a list of dictionaries
-        for record in data:
-            # the type of record is dict, transfer the dict into str
-            context = json.dumps(record)
-            # encode the str and write into bin
-            write_bin(context, bin_file_path)
-
-        print(colored(f"Successfully converted {json_file_path} to {bin_file_path}", "green"))
-
-    except Exception as e:
-        print(colored(f"Error while converting {json_file_path} to {bin_file_path}: {str(e)}", "red"))
-
-
-def jsonl2bin(jsonl_file_path: str, bin_file_path: str):
-    """
-    Read content from jsonl file and write to bin file
-
-    Args:
-        jsonl_file_path: jsonl file path.
-        bin_file_path: bin file path.
-    """
-
-    if not os.path.isfile(jsonl_file_path):
-        warnings.warn(colored(f"{jsonl_file_path} does not exist.", "red"))
-        return
-
-    try:
-        with open(jsonl_file_path, "r") as jsonl_file:
-            for line in jsonl_file:
+        elif file_format == 'json':
+            data = json.load(text_file)
+            # NOTE: the top-level JSON value is iterated, so it is expected to be a list of records
+            for record in data:
+                # re-serialize each record so that it is handed to write_bin as a JSON string
+                context = json.dumps(record)
                 # encode the str and write into bin
-                write_bin(line, bin_file_path)
-
-        print(colored(f"Successfully converted {jsonl_file_path} to {bin_file_path}", "green"))
-
-    except Exception as e:
-        print(colored(f"Error while converting {jsonl_file_path} to {bin_file_path}: {str(e)}", "red"))
+                write_bin(context, bin_output_path)
+
+        elif file_format == 'jsonl':
+            for line in text_file:
+                # every line is passed through unchanged (trailing newline included, blanks not skipped)
+                write_bin(line, bin_output_path)
 
 
 def parse_args():
     parser = argparse.ArgumentParser()
-    parser.add_argument("--raw_data_name", required=True, help="Input file name")
     parser.add_argument(
-        "--input_file_type",
-        choices=["txt", "json", "jsonl"],
+        "--text_input_path",  # full path including the extension; text2bin derives the format from it
+        type=str,
         required=True,
-        help="Input file format (either txt, json or jsonl)",
+        help="Path to the input text file.",
     )
-    parser.add_argument("--bin", required=True, help="Path to the output bin file")
+    parser.add_argument(  # write_bin opens this path in append mode ("ab"), so an existing file is extended
+        "--bin_output_path", type=str, required=True, help="Path to the output bin file.")
 
     return parser.parse_args()
 
@@ -173,21 +132,12 @@ def main():
     # parse arguments
     args = parse_args()
 
-    # obtain the raw data path
-    input_file_path = f"{args.raw_data_name}.{args.input_file_type}"
-
-    # different methods for different raw data type, we only support "txt", "json" and "jsonl" data type.
-    if args.input_file_type == "txt":
-        txt2bin(input_file_path, args.bin)
-    elif args.input_file_type == "json":
-        json2bin(input_file_path, args.bin)
-    elif args.input_file_type == "jsonl":
-        jsonl2bin(input_file_path, args.bin)
-    else:
-        print(colored("Invalid input file type. Use --help for more information.", "red"))
+    text2bin(args.text_input_path, args.bin_output_path)
+    print(f"Successfully converted {args.text_input_path} to {args.bin_output_path}")
 
     # To avoid potential read/write errors, the metadata preparation follows after creating the .bin file.
-    prepare_meta(args.bin)
+    prepare_meta(args.bin_output_path)
+    print(f"Successfully generated {args.bin_output_path}.meta")
 
 
 if __name__ == "__main__":