mirror of https://github.com/InternLM/InternLM

update tokenizer

parent c18bec9361
commit dc8dd6ec4d
@@ -1,24 +1,25 @@
 import argparse
 import json
 import os
+import sys
 import warnings
 
 import numpy as np
-from sentencepiece import SentencePieceProcessor
-from termcolor import colored
+sys.path.append("tools/transformers")
+from tokenization_internlm import InternLMTokenizer
 
 current_dir = os.path.dirname(os.path.abspath(__file__))
-model_path = os.path.join(current_dir, "V7.model")
-tokenizer = SentencePieceProcessor(model_file=model_path)
+model_path = os.path.join(current_dir, "V7_sft.model")
+tokenizer = InternLMTokenizer(vocab_file=model_path)
 
 
-def write_bin(context: str, path: str) -> None:
+def write_bin(context: str, bin_output_path: str) -> None:
     """
-    Write bin file.
+    Write bin file based on the context.
 
     Args:
         context (str): the context of raw file.
-        path (str): the path for output bin file.
+        bin_output_path (str): the path for output bin file.
 
     Example:
     >>> write_bin("今天天气晴朗适合出门散步", "out.bin") # the output file format is 'txt'
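Note on this hunk: the module-level tokenizer switches from a raw sentencepiece handle over V7.model to the InternLMTokenizer wrapper shipped under tools/transformers, loaded from V7_sft.model. Below is a minimal sketch of the two loading paths, assuming the wrapper exposes a Hugging Face-style encode() and that the script runs from the repository root so the sys.path entry resolves; the .model files themselves are not part of this diff.

import os
import sys

from sentencepiece import SentencePieceProcessor

current_dir = os.path.dirname(os.path.abspath(__file__))

# Before: raw sentencepiece processor over the base model; encode() returns piece ids.
sp = SentencePieceProcessor(model_file=os.path.join(current_dir, "V7.model"))
ids_before = sp.encode("今天天气晴朗适合出门散步")

# After: the wrapper class from tools/transformers over the SFT model.
sys.path.append("tools/transformers")
from tokenization_internlm import InternLMTokenizer  # noqa: E402

tok = InternLMTokenizer(vocab_file=os.path.join(current_dir, "V7_sft.model"))
ids_after = tok.encode("今天天气晴朗适合出门散步")  # assumed HF-style encode()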
@@ -34,20 +35,20 @@ def write_bin(context: str, path: str) -> None:
     saved_bin = str.encode(json.dumps(data) + "\n")
 
     # write bytes into bin path
-    with open(path, "ab") as f:
+    with open(bin_output_path, "ab") as f:
         f.write(saved_bin)
 
 
-def prepare_meta(bin_file_path: str):
+def prepare_meta(bin_output_path: str):
     """
     Prepare metadata for the given bin file.
 
     Args:
-        bin_file_path (str): the bin file path.
+        bin_output_path (str): Output bin file path.
     """
     meta = []
     cur = 0
-    with open(bin_file_path, "rb") as f:
+    with open(bin_output_path, "rb") as f:
         while True:
             # read lines
             line = f.readline()
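Note on this hunk: write_bin appends one JSON-encoded line per sample to the .bin file, and prepare_meta walks that file recording one (byte offset, length) pair per line, saved with np.save as a .meta array. A reader sketch follows, not part of the commit: it fetches the i-th sample back, assuming the offset column points at the start of the corresponding line; how the data dict and the length value are built lives in parts of the file elided from these hunks, and read_sample is a hypothetical helper name.

import json

import numpy as np


def read_sample(bin_path: str, idx: int) -> dict:
    # load the (offset, length) table written by prepare_meta
    meta = np.load(bin_path + ".meta")
    offset = int(meta[idx][0])
    with open(bin_path, "rb") as f:
        f.seek(offset)       # jump to the recorded byte offset
        line = f.readline()  # one JSON document per line
    return json.loads(line)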
@@ -62,109 +63,67 @@ def prepare_meta(bin_file_path: str):
             meta.append((cur, length))
             # update the cur to generate the meta information of next line
             cur += len(line)
-    print(meta)
     # define path of the generated meta file
-    meta_fp = bin_file_path + ".meta"
+    meta_fp = bin_output_path + ".meta"
     # save the generated meta information
     with open(meta_fp, "wb") as f:
         meta = np.array(meta, dtype=np.int32)
         np.save(f, meta)
 
 
-def txt2bin(txt_file_path: str, bin_file_path: str):
+def text2bin(text_input_path: str, bin_output_path: str):
     """
-    Read content from txt file and write to bin file
+    Read content from the input file and write to bin file.
+    Currently support 3 input formats: 'txt', 'json' and 'jsonl'.
 
     Args:
-        txt_file_path (str): txt file path.
-        bin_file_path (str): output bin file path.
+        text_input_path (str): txt file path.
+        bin_output_path (str): output bin file path.
     """
     # Check if the txt file exists
-    if not os.path.isfile(txt_file_path):
-        warnings.warn(colored(f"{txt_file_path} does not exist.", "red"))
+    if not os.path.isfile(text_input_path):
+        warnings.warn(f"{text_input_path} does not exist.")
         return
 
-    try:
-        # Open the text file
-        with open(txt_file_path, "r") as txt_file:
-            for line in txt_file:
+    file_format = text_input_path.split(".")[-1]
+    assert file_format in ['txt', 'json', 'jsonl'], \
+        print("Invalid input file type. Currently support `txt`, `json` and `jsonl`.")
+
+    with open(text_input_path, "r") as text_file:
+        if file_format == 'txt':
+            for line in text_file:
                 # Strip any leading/trailing whitespace
                 stripped_line = line.strip()
                 if stripped_line:
                     # Pass each line to the write_bin function
-                    write_bin(stripped_line, bin_file_path)
+                    write_bin(stripped_line, bin_output_path)
 
-        print(colored(f"Successfully converted {txt_file_path} to {bin_file_path}", "green"))
-
-    except Exception as e:
-        print(colored(f"Error while converting {txt_file_path} to {bin_file_path}: {str(e)}", "red"))
-
-
-def json2bin(json_file_path: str, bin_file_path: str):
-    """
-    Read content from json file and write to bin file
-
-    Args:
-        json_file_path (str): json file path.
-        bin_file_path (str): output bin file path.
-    """
-
-    if not os.path.isfile(json_file_path):
-        warnings.warn(colored(f"{json_file_path} does not exist.", "red"))
-        return
-
-    try:
-        # load json file
-        with open(json_file_path, "r") as json_file:
-            data = json.load(json_file)
+        elif file_format == 'json':
+            data = json.load(text_file)
             # assuming data is a list of dictionaries
             for record in data:
                 # the type of record is dict, transfer the dict into str
                 context = json.dumps(record)
                 # encode the str and write into bin
-                write_bin(context, bin_file_path)
+                write_bin(context, bin_output_path)
 
-        print(colored(f"Successfully converted {json_file_path} to {bin_file_path}", "green"))
-
-    except Exception as e:
-        print(colored(f"Error while converting {json_file_path} to {bin_file_path}: {str(e)}", "red"))
-
-
-def jsonl2bin(jsonl_file_path: str, bin_file_path: str):
-    """
-    Read content from jsonl file and write to bin file
-
-    Args:
-        jsonl_file_path: jsonl file path.
-        bin_file_path: bin file path.
-    """
-
-    if not os.path.isfile(jsonl_file_path):
-        warnings.warn(colored(f"{jsonl_file_path} does not exist.", "red"))
-        return
-
-    try:
-        with open(jsonl_file_path, "r") as jsonl_file:
-            for line in jsonl_file:
+        elif file_format == 'jsonl':
+            for line in text_file:
                 # encode the str and write into bin
-                write_bin(line, bin_file_path)
+                write_bin(line, bin_output_path)
 
-        print(colored(f"Successfully converted {jsonl_file_path} to {bin_file_path}", "green"))
-
-    except Exception as e:
-        print(colored(f"Error while converting {jsonl_file_path} to {bin_file_path}: {str(e)}", "red"))
 
 
 def parse_args():
     parser = argparse.ArgumentParser()
-    parser.add_argument("--raw_data_name", required=True, help="Input file name")
     parser.add_argument(
-        "--input_file_type",
-        choices=["txt", "json", "jsonl"],
+        "--text_input_path",
+        type=str,
         required=True,
-        help="Input file format (either txt, json or jsonl)",
+        help="Path to the input text file.",
     )
-    parser.add_argument("--bin", required=True, help="Path to the output bin file")
+    parser.add_argument(
+        "--bin_output_path", type=str, required=True, help="Path to the output bin file.")
 
     return parser.parse_args()
 
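Note on this hunk: the three per-format converters (txt2bin, json2bin, jsonl2bin) collapse into a single text2bin that dispatches on the input file's extension, and the termcolor/try-except error handling is dropped in favour of an existence check plus an assert on the extension. A usage sketch under assumed file names (corpus.*), calling this module's own functions directly; because write_bin opens the .bin file in append mode, repeated calls accumulate samples in the same output file.

# hypothetical inputs; any of .txt, .json, .jsonl works with the same call
for demo in ("corpus.txt", "corpus.json", "corpus.jsonl"):
    text2bin(demo, "corpus.bin")   # appends one JSON line per sample
prepare_meta("corpus.bin")         # writes corpus.bin.meta next to the bin file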
@@ -173,21 +132,12 @@ def main():
     # parse arguments
     args = parse_args()
 
-    # obtain the raw data path
-    input_file_path = f"{args.raw_data_name}.{args.input_file_type}"
+    text2bin(args.text_input_path, args.bin_output_path)
+    print(f"Successfully converted {args.text_input_path} to {args.bin_output_path}")
 
-    # different methods for different raw data type, we only support "txt", "json" and "jsonl" data type.
-    if args.input_file_type == "txt":
-        txt2bin(input_file_path, args.bin)
-    elif args.input_file_type == "json":
-        json2bin(input_file_path, args.bin)
-    elif args.input_file_type == "jsonl":
-        jsonl2bin(input_file_path, args.bin)
-    else:
-        print(colored("Invalid input file type. Use --help for more information.", "red"))
-
     # To avoid potential read/write errors, the metadata preparation follows after creating the .bin file.
-    prepare_meta(args.bin)
+    prepare_meta(args.bin_output_path)
+    print(f"Successfully generated {args.bin_output_path}.meta")
 
 
 if __name__ == "__main__":
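Follow-up on the CLI: the old entry point took --raw_data_name, --input_file_type and --bin, whereas the updated main() needs only --text_input_path and --bin_output_path and infers the format from the input file's extension. An invocation such as python tokenizer.py --text_input_path raw_data.txt --bin_output_path result.bin (the script name is assumed here, it is not stated on this page) now produces both result.bin and its result.bin.meta index in one run.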