mirror of https://github.com/hpcaitech/ColossalAI
# Code taken in large part from https://github.com/jcpeterson/openwebtext

import collections.abc
import os
import os.path as op
import re
import tarfile


def extract_month(url_file_name):
    month_re = r"(RS_.*2\d{3}-\d{2})"
    month = op.split(url_file_name)[-1]
    month = re.match(month_re, month).group()
    return month
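
# Example usage (illustrative sketch; the filename below is hypothetical):
#   extract_month("urls/RS_2017-09.bz2.deduped.txt")  ->  "RS_2017-09"
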
def chunks(l, n, s=0):
    """Yield successive n-sized chunks from l, skipping the first s elements."""
    if isinstance(l, collections.abc.Iterable):
        chnk = []
        for i, elem in enumerate(l):
            if i < s:
                continue
            chnk.append(elem)
            if len(chnk) == n:
                yield chnk
                chnk = []
        if len(chnk) != 0:
            yield chnk
    else:
        for i in range(s, len(l), n):
            yield l[i:i + n]
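
# Example usage (illustrative sketch):
#   list(chunks([1, 2, 3, 4, 5], 2))      ->  [[1, 2], [3, 4], [5]]
#   list(chunks([1, 2, 3, 4, 5], 2, s=1)) ->  [[2, 3], [4, 5]]
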
def extract_archive(archive_fp, outdir="."):
    with tarfile.open(archive_fp, "r") as tar:
        tar.extractall(outdir)
    return outdir
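
# Example usage (illustrative sketch; the paths below are hypothetical):
#   extract_archive("scraped/RS_2017-09.tar", outdir="scraped/RS_2017-09")
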
def mkdir(fp):
    try:
        os.makedirs(fp)
    except FileExistsError:
        pass
    return fp
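
# Example usage (illustrative sketch; the path below is hypothetical):
#   out_dir = mkdir("scraped/RS_2017-09")  # no error if the directory already exists
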
def linecount(filename):
    lines = 0
    buf_size = 1024 * 1024
    with open(filename, 'rb') as f:
        read_f = f.raw.read
        buf = read_f(buf_size)
        while buf:
            lines += buf.count(b'\n')
            buf = read_f(buf_size)
    return lines
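

# Minimal smoke test for the pure-Python helpers above (illustrative sketch;
# the filename passed to extract_month is hypothetical). Run this file
# directly to execute it.
if __name__ == "__main__":
    assert extract_month("RS_2017-09.bz2.deduped.txt") == "RS_2017-09"
    assert list(chunks([1, 2, 3, 4, 5], 2)) == [[1, 2], [3, 4], [5]]
    print("ok")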