update pretrain data script

llama3
Tong Li 7 months ago
parent d83c633ca6
commit e5b4a58543

@@ -16,7 +16,7 @@ from colossal_llama2.dataset.spliced_and_tokenized_dataset import (
     supervised_tokenize_pretrain,
 )
 from datasets import dataset_dict, load_dataset
-from transformers.models.llama.tokenization_llama import LlamaTokenizer
+from transformers import AutoTokenizer
 from colossalai.logging import get_dist_logger
@@ -55,15 +55,12 @@ def main():
     if args.num_spliced_dataset_bins >= 100000:
         raise ValueError("Too many spliced divisions, must be smaller than 100000")
-    assert not os.path.exists(args.data_cache_dir), f"Find existed data cache dir {args.data_cache_dir}"
-    assert not os.path.exists(
-        args.data_jsonl_output_dir
-    ), f"Find existed jsonl data output dir {args.data_jsonl_output_dir}"
-    assert not os.path.exists(
-        args.data_arrow_output_dir
-    ), f"Find existed arrow data output dir {args.data_arrow_output_dir}"
-    os.makedirs(args.data_jsonl_output_dir)
-    os.makedirs(args.data_arrow_output_dir)
+    if not os.path.exists(args.data_cache_dir):
+        os.makedirs(args.data_cache_dir)
+    if not os.path.exists(args.data_jsonl_output_dir):
+        os.makedirs(args.data_jsonl_output_dir)
+    if not os.path.exists(args.data_arrow_output_dir):
+        os.makedirs(args.data_arrow_output_dir)

     # Prepare to all input datasets
     input_data_paths = []
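For reference, a minimal sketch of the directory-preparation behavior introduced in the hunk above, assuming the same argument names from the script's CLI parser; `os.makedirs(..., exist_ok=True)` would collapse each explicit existence check into a single call.

```python
import os

# Sketch only: mirrors the check-then-create pattern from the hunk above,
# using the (assumed) argument names data_cache_dir, data_jsonl_output_dir
# and data_arrow_output_dir.
def prepare_output_dirs(args) -> None:
    for path in (args.data_cache_dir, args.data_jsonl_output_dir, args.data_arrow_output_dir):
        # exist_ok=True is equivalent to the explicit "if not os.path.exists" guard
        # and keeps the script safe to re-run over an existing cache/output layout.
        os.makedirs(path, exist_ok=True)
```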
@@ -86,7 +83,7 @@ def main():
         train_splits.append(f"train[{start}%:{end}%]")

     # Prepare to the tokenizer.
-    tokenizer = LlamaTokenizer.from_pretrained(args.tokenizer_dir)
+    tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_dir)
     tokenizer.add_bos_token = False
     tokenizer.add_eos_token = False
     if tokenizer.pad_token is None:
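Below is a hedged sketch of the tokenizer setup after the switch to `AutoTokenizer`; the tokenizer path is a placeholder, and the pad-token fallback is illustrative rather than taken from the script.

```python
from transformers import AutoTokenizer

# AutoTokenizer resolves the concrete tokenizer class (e.g. the Llama 3 fast
# tokenizer) from the files in the given directory; the path is a placeholder.
tokenizer = AutoTokenizer.from_pretrained("path/to/tokenizer_dir")
tokenizer.add_bos_token = False  # disable automatic BOS insertion during tokenization
tokenizer.add_eos_token = False  # disable automatic EOS insertion during tokenization
if tokenizer.pad_token is None:
    # Illustrative fallback only; the script's actual choice of pad token may differ.
    tokenizer.pad_token = tokenizer.eos_token
```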
