From 296903243968dd5e6cd7577d3eb47bef111d05e3 Mon Sep 17 00:00:00 2001 From: gaoyang07 Date: Thu, 13 Jul 2023 16:52:17 +0800 Subject: [PATCH] update the import and fix lints --- tools/tokenizer.py | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/tools/tokenizer.py b/tools/tokenizer.py index 9a969ad..fc3800e 100644 --- a/tools/tokenizer.py +++ b/tools/tokenizer.py @@ -2,14 +2,14 @@ import argparse import json import os import sys -import warnings import numpy as np -sys.path.append("tools/transformers") -from tokenization_internlm import InternLMTokenizer current_dir = os.path.dirname(os.path.abspath(__file__)) model_path = os.path.join(current_dir, "V7_sft.model") +sys.path.append(os.path.join(current_dir, "transformers")) +from tokenization_internlm import InternLMTokenizer + tokenizer = InternLMTokenizer(vocab_file=model_path) @@ -82,15 +82,15 @@ def text2bin(text_input_path: str, bin_output_path: str): """ # Check if the txt file exists if not os.path.isfile(text_input_path): - warnings.warn(f"{text_input_path} does not exist.") - return + raise FileNotFoundError(f"{text_input_path} does not exist.") file_format = text_input_path.split(".")[-1] - assert file_format in ['txt', 'json', 'jsonl'], \ - print("Invalid input file type. Currently support `txt`, `json` and `jsonl`.") + assert file_format in ["txt", "json", "jsonl"], print( + "Invalid input file type. Currently support `txt`, `json` and `jsonl`." + ) with open(text_input_path, "r") as text_file, open(bin_output_path, "ab") as bin_file: - if file_format == 'txt': + if file_format == "txt": for line in text_file: # Strip any leading/trailing whitespace stripped_line = line.strip() @@ -98,7 +98,7 @@ def text2bin(text_input_path: str, bin_output_path: str): # Pass each line to the write_bin function write_bin(stripped_line, bin_file) - elif file_format == 'json': + elif file_format == "json": data = json.load(text_file) # assuming data is a list of dictionaries for record in data: @@ -106,8 +106,8 @@ def text2bin(text_input_path: str, bin_output_path: str): context = json.dumps(record) # encode the str and write into bin write_bin(context, bin_file) - - elif file_format == 'jsonl': + + elif file_format == "jsonl": for line in text_file: # encode the str and write into bin write_bin(line, bin_file) @@ -121,8 +121,7 @@ def parse_args(): required=True, help="Path to the input text file.", ) - parser.add_argument( - "--bin_output_path", type=str, required=True, help="Path to the output bin file.") + parser.add_argument("--bin_output_path", type=str, required=True, help="Path to the output bin file.") return parser.parse_args()