update the import and fix lints

2023-07-13 16:52:17 +08:00 · 2023-07-13 16:52:17 +08:00 · 2969032439
parent f0d9e56a1a
commit 2969032439
1 changed files with 12 additions and 13 deletions
--- a/tools/tokenizer.py
+++ b/tools/tokenizer.py
@ -2,14 +2,14 @@ import argparse
 import json
 import os
 import sys
 import warnings
 import numpy as np
 sys.path.append("tools/transformers")
 from tokenization_internlm import InternLMTokenizer
 current_dir = os.path.dirname(os.path.abspath(__file__))
 model_path = os.path.join(current_dir, "V7_sft.model")
 sys.path.append(os.path.join(current_dir, "transformers"))
 from tokenization_internlm import InternLMTokenizer
 tokenizer = InternLMTokenizer(vocab_file=model_path)
@ -82,15 +82,15 @@ def text2bin(text_input_path: str, bin_output_path: str):
    """
    # Check if the txt file exists
    if not os.path.isfile(text_input_path):
-        warnings.warn(f"{text_input_path} does not exist.")
+        raise FileNotFoundError(f"{text_input_path} does not exist.")
        return
    file_format = text_input_path.split(".")[-1]
-    assert file_format in ['txt', 'json', 'jsonl'], \
+    assert file_format in ["txt", "json", "jsonl"], print(
-        print("Invalid input file type. Currently support `txt`, `json` and `jsonl`.")
+        "Invalid input file type. Currently support `txt`, `json` and `jsonl`."
    )
    with open(text_input_path, "r") as text_file, open(bin_output_path, "ab") as bin_file:
-        if file_format == 'txt':
+        if file_format == "txt":
            for line in text_file:
                # Strip any leading/trailing whitespace
                stripped_line = line.strip()
@ -98,7 +98,7 @@ def text2bin(text_input_path: str, bin_output_path: str):
                    # Pass each line to the write_bin function
                    write_bin(stripped_line, bin_file)
-        elif file_format == 'json':
+        elif file_format == "json":
            data = json.load(text_file)
            # assuming data is a list of dictionaries
            for record in data:
@ -106,8 +106,8 @@ def text2bin(text_input_path: str, bin_output_path: str):
                context = json.dumps(record)
                # encode the str and write into bin
                write_bin(context, bin_file)
-        
+
-        elif file_format == 'jsonl':
+        elif file_format == "jsonl":
            for line in text_file:
                # encode the str and write into bin
                write_bin(line, bin_file)
@ -121,8 +121,7 @@ def parse_args():
        required=True,
        help="Path to the input text file.",
    )
-    parser.add_argument(
+    parser.add_argument("--bin_output_path", type=str, required=True, help="Path to the output bin file.")
        "--bin_output_path", type=str, required=True, help="Path to the output bin file.")
    return parser.parse_args()