broadcast list when walking folders

pull/475/head
gaoyang07 2023-11-06 23:22:04 +08:00
parent 095ebfff9d
commit 535b0f795e
1 changed file with 9 additions and 1 deletion

View File

@ -9,6 +9,7 @@ from typing import Dict
import numpy as np import numpy as np
import torch import torch
import torch.distributed as dist
from torch.utils.data import ConcatDataset from torch.utils.data import ConcatDataset
from tqdm import tqdm from tqdm import tqdm
@ -372,7 +373,14 @@ def get_packed_dataset_without_short_length(
datasets = [] datasets = []
delete_samples = 0 delete_samples = 0
for root, dirs, files in os.walk(folder, followlinks=True): if gpc.get_global_rank() == 0:
triples = [list(os.walk(folder, followlinks=True))]
else:
triples = [None]
dist.broadcast_object_list(triples, src=0)
triples = triples[0]
for root, dirs, files in triples:
dirs.sort() # Sort so that the folders are returned in a fixed order dirs.sort() # Sort so that the folders are returned in a fixed order
if gpc.is_rank_for_log(): if gpc.is_rank_for_log():
logger.info(f"Reading {root}...") logger.info(f"Reading {root}...")