InternLM/internlm/data/dataset.py


import os
from typing import Dict

from torch.utils.data import ConcatDataset

from internlm.data.single_dataset import JsonlDataset


def get_dataset_dict(folder, split="valid") -> Dict:
"""
Return a dictionary of Datasets from a folder containing data files for validation.
Args:
folder (str): The path to the folder containing data files.
split (str): The split of the data files to be used, default is "valid".
Returns:
A dictionary containing Datasets for each folder in the given path
that contains data files with the specified split.
Raises:
AssertionError: If the given folder does not exist.
Example:
If the given folder is as follows,
- data
- zhihu
- xxx.bin
- valid.bin
- baike
- xxx.bin
- valid.bin
The returned dictionary will be,
{
'zhihu': Dataset,
'baike': Dataset
}
"""
    assert os.path.exists(folder), f"folder `{folder}` does not exist"
    data_dict = {}

    for root, dirs, files in os.walk(folder, followlinks=True):
        dirs.sort()  # Sort in place so the traversal order is deterministic; newly added folders starting with "z" are visited last
        datasets = []
        for fn in sorted(files):  # Sort so that files are concatenated in a consistent order
            if fn.endswith(".bin") and split in fn:
                fp = os.path.join(root, fn)
                ds = JsonlDataset(fp)
                datasets.append(ds)
        if datasets:
            ds = ConcatDataset(datasets=datasets)
            data_dict[os.path.basename(root)] = ds

    return data_dict
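

# A minimal usage sketch (not part of the original module): it assumes a local
# `data/` directory laid out as in the docstring example above, with per-source
# subfolders that each contain a `valid.bin` file. Each value in the returned
# dict is a ConcatDataset covering all matching `.bin` files in that subfolder.
if __name__ == "__main__":
    val_datasets = get_dataset_dict("data", split="valid")
    for name, dataset in val_datasets.items():
        print(f"{name}: {len(dataset)} samples")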