diff --git a/applications/Colossal-LLaMA-2/prepare_pretrain_dataset.py b/applications/Colossal-LLaMA-2/prepare_pretrain_dataset.py
index cb578b5f6..ebf56f889 100644
--- a/applications/Colossal-LLaMA-2/prepare_pretrain_dataset.py
+++ b/applications/Colossal-LLaMA-2/prepare_pretrain_dataset.py
@@ -16,7 +16,7 @@ from colossal_llama2.dataset.spliced_and_tokenized_dataset import (
     supervised_tokenize_pretrain,
 )
 from datasets import dataset_dict, load_dataset
-from transformers.models.llama.tokenization_llama import LlamaTokenizer
+from transformers import AutoTokenizer
 
 from colossalai.logging import get_dist_logger
 
@@ -55,15 +55,12 @@ def main():
     if args.num_spliced_dataset_bins >= 100000:
         raise ValueError("Too many spliced divisions, must be smaller than 100000")
 
-    assert not os.path.exists(args.data_cache_dir), f"Find existed data cache dir {args.data_cache_dir}"
-    assert not os.path.exists(
-        args.data_jsonl_output_dir
-    ), f"Find existed jsonl data output dir {args.data_jsonl_output_dir}"
-    assert not os.path.exists(
-        args.data_arrow_output_dir
-    ), f"Find existed arrow data output dir {args.data_arrow_output_dir}"
-    os.makedirs(args.data_jsonl_output_dir)
-    os.makedirs(args.data_arrow_output_dir)
+    if not os.path.exists(args.data_cache_dir):
+        os.makedirs(args.data_cache_dir)
+    if not os.path.exists(args.data_jsonl_output_dir):
+        os.makedirs(args.data_jsonl_output_dir)
+    if not os.path.exists(args.data_arrow_output_dir):
+        os.makedirs(args.data_arrow_output_dir)
 
     # Prepare to all input datasets
     input_data_paths = []
@@ -86,7 +83,7 @@ def main():
         train_splits.append(f"train[{start}%:{end}%]")
 
     # Prepare to the tokenizer.
-    tokenizer = LlamaTokenizer.from_pretrained(args.tokenizer_dir)
+    tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_dir)
     tokenizer.add_bos_token = False
     tokenizer.add_eos_token = False
     if tokenizer.pad_token is None: