mirror of https://github.com/InternLM/InternLM
fix(data): broadcast list when walking folders (#475)
parent
095ebfff9d
commit
4d1b1cd5f1
|
@ -9,6 +9,7 @@ from typing import Dict
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import torch
|
import torch
|
||||||
|
import torch.distributed as dist
|
||||||
from torch.utils.data import ConcatDataset
|
from torch.utils.data import ConcatDataset
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
|
||||||
|
@ -372,7 +373,14 @@ def get_packed_dataset_without_short_length(
|
||||||
datasets = []
|
datasets = []
|
||||||
delete_samples = 0
|
delete_samples = 0
|
||||||
|
|
||||||
for root, dirs, files in os.walk(folder, followlinks=True):
|
if gpc.get_global_rank() == 0:
|
||||||
|
triples = [list(os.walk(folder, followlinks=True))]
|
||||||
|
else:
|
||||||
|
triples = [None]
|
||||||
|
dist.broadcast_object_list(triples, src=0)
|
||||||
|
triples = triples[0]
|
||||||
|
|
||||||
|
for root, dirs, files in triples:
|
||||||
dirs.sort() # Sort in place so that os.walk descends into the subfolders in a fixed order
|
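dirs.sort() # Keep the subfolder names in a fixed order. NOTE(review): the walk is already materialized by list(os.walk(...)) above, so this sort no longer influences traversal order - confirm that is intended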
|
||||||
if gpc.is_rank_for_log():
|
if gpc.is_rank_for_log():
|
||||||
logger.info(f"Reading {root}...")
|
logger.info(f"Reading {root}...")
|
||||||
|
|
Loading…
Reference in New Issue