import os
from typing import Dict

from torch.utils.data import ConcatDataset

from internlm.data.single_dataset import JsonlDataset


def get_dataset_dict(folder, split="valid") -> Dict:
    """
    Build a mapping from sub-folder name to a validation Dataset.

    Walks `folder` (following symlinks) and, in every directory that holds
    ``.bin`` files whose name contains `split`, concatenates those files
    (loaded as ``JsonlDataset``) into a single ``ConcatDataset`` keyed by
    the directory's base name.

    Args:
        folder (str): Root directory to scan for data files.
        split (str): Substring that selects the split files, default "valid".

    Returns:
        Dict: ``{sub_folder_name: ConcatDataset}`` for each directory that
        contains at least one matching ``.bin`` file.

    Raises:
        AssertionError: If `folder` does not exist.

    Example:
        Given the layout,
            - data
                - zhihu
                    - xxx.bin
                    - valid.bin
                - baike
                    - xxx.bin
                    - valid.bin
        the result is ``{'zhihu': Dataset, 'baike': Dataset}``.
    """
    assert os.path.exists(folder), f"folder `{folder}` not exists"

    data_dict = {}
    for root, subdirs, filenames in os.walk(folder, followlinks=True):
        # In-place sort so os.walk visits children in name order; keeps the
        # traversal deterministic and ranks newly added z-prefixed data last.
        subdirs.sort()
        # Deterministic file order, then keep only the requested split's .bin files.
        matched = [
            JsonlDataset(os.path.join(root, name))
            for name in sorted(filenames)
            if name.endswith(".bin") and split in name
        ]
        if matched:
            data_dict[os.path.basename(root)] = ConcatDataset(datasets=matched)
    return data_dict