fix(data): broadcast list when walking folders (#475)

pull/477/head
Yang Gao 2023-11-07 13:12:35 +08:00 committed by GitHub
parent 095ebfff9d
commit 4d1b1cd5f1
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed file with 9 additions and 1 deletion

View File

@ -9,6 +9,7 @@ from typing import Dict
import numpy as np import numpy as np
import torch import torch
import torch.distributed as dist
from torch.utils.data import ConcatDataset from torch.utils.data import ConcatDataset
from tqdm import tqdm from tqdm import tqdm
@ -372,7 +373,14 @@ def get_packed_dataset_without_short_length(
datasets = [] datasets = []
delete_samples = 0 delete_samples = 0
for root, dirs, files in os.walk(folder, followlinks=True): if gpc.get_global_rank() == 0:
triples = [list(os.walk(folder, followlinks=True))]
else:
triples = [None]
dist.broadcast_object_list(triples, src=0)
triples = triples[0]
for root, dirs, files in triples:
dirs.sort() # Sort the folders so that they are returned in a fixed order dirs.sort() # Sort the folders so that they are returned in a fixed order
if gpc.is_rank_for_log(): if gpc.is_rank_for_log():
logger.info(f"Reading {root}...") logger.info(f"Reading {root}...")