From 4d1b1cd5f1b1ff3e5ff16ff574ad6c64f7cfb2e3 Mon Sep 17 00:00:00 2001 From: Yang Gao Date: Tue, 7 Nov 2023 13:12:35 +0800 Subject: [PATCH] fix(data): broadcast list when walking folders (#475) --- internlm/data/packed_dataset.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/internlm/data/packed_dataset.py b/internlm/data/packed_dataset.py index c0d689f..576862e 100644 --- a/internlm/data/packed_dataset.py +++ b/internlm/data/packed_dataset.py @@ -9,6 +9,7 @@ from typing import Dict import numpy as np import torch +import torch.distributed as dist from torch.utils.data import ConcatDataset from tqdm import tqdm @@ -372,7 +373,14 @@ def get_packed_dataset_without_short_length( datasets = [] delete_samples = 0 - for root, dirs, files in os.walk(folder, followlinks=True): + if gpc.get_global_rank() == 0: + triples = [list(os.walk(folder, followlinks=True))] + else: + triples = [None] + dist.broadcast_object_list(triples, src=0) + triples = triples[0] + + for root, dirs, files in triples: dirs.sort() # Let the folder need to be returned in a fixed order if gpc.is_rank_for_log(): logger.info(f"Reading {root}...")