# ColossalAI/examples/language/gpt/tools/download/utils.py
#
# 63 lines
# 1.3 KiB
# Python

# Code taken in large part from https://github.com/jcpeterson/openwebtext
import collections
import os
import os.path as op
import re
import tarfile
def extract_month(url_file_name):
    """Return the ``RS_...YYYY-MM`` month tag at the start of a file name.

    Only the final path component of ``url_file_name`` is examined, and the
    pattern must match at the very beginning of that component.

    Args:
        url_file_name: Path (or bare file name) of a URL dump file.

    Returns:
        The matched prefix, e.g. ``"RS_2017-04"``.

    Raises:
        AttributeError: If the file name does not start with the pattern
            (``re.match`` returns ``None``).
    """
    file_name = op.basename(url_file_name)
    found = re.match(r"(RS_.*2\d{3}-\d{2})", file_name)
    return found.group()
def chunks(l, n, s=0):
    """Yield successive n-sized chunks from l, skipping the first s elements.

    Args:
        l: Any iterable, or an object that only supports ``len()`` and
            slicing.
        n: Maximum number of elements per chunk.
        s: Number of leading elements (not chunks) to skip before chunking.

    Yields:
        For iterables, lists of up to ``n`` elements; the final list may be
        shorter. For non-iterable sliceable objects, slices ``l[i:i + n]``.
    """
    # ``collections.Iterable`` was removed in Python 3.10; the ABC lives in
    # ``collections.abc``. Imported locally so the block is self-contained.
    from collections.abc import Iterable

    if isinstance(l, Iterable):
        chnk = []
        for i, elem in enumerate(l):
            if i < s:
                continue
            chnk.append(elem)
            if len(chnk) == n:
                yield chnk
                chnk = []
        # Emit the trailing partial chunk, if any.
        if chnk:
            yield chnk
    else:
        # Fallback for objects without ``__iter__`` that still support
        # ``len()`` and slicing.
        for i in range(s, len(l), n):
            yield l[i:i + n]
def extract_archive(archive_fp, outdir="."):
    """Extract a tar archive into ``outdir`` and return ``outdir``.

    Args:
        archive_fp: Path to the tar archive; compression is auto-detected
            by ``tarfile.open(..., "r")``.
        outdir: Directory to extract into. Defaults to the current
            directory.

    Returns:
        ``outdir``, unchanged.
    """
    with tarfile.open(archive_fp, "r") as tar:
        if hasattr(tarfile, "data_filter"):
            # Archives handled here are downloaded, so reject members that
            # would escape ``outdir`` (absolute paths, ``..``, unsafe links).
            tar.extractall(outdir, filter="data")
        else:
            # NOTE(security): this Python has no extraction filters, so
            # members are extracted as-is. Only use on trusted archives.
            tar.extractall(outdir)
    return outdir
def mkdir(fp):
    """Create directory ``fp`` (including parents) if needed and return it.

    Args:
        fp: Path of the directory to create.

    Returns:
        ``fp``, unchanged.

    Raises:
        FileExistsError: If ``fp`` already exists but is not a directory.
    """
    # ``exist_ok=True`` tolerates an existing directory but still raises
    # when a regular file occupies the path, instead of silently returning
    # a path that is not a directory.
    os.makedirs(fp, exist_ok=True)
    return fp
def linecount(filename):
    """Count the newline bytes (``b"\\n"``) in a file.

    The file is read in 1 MiB blocks through the unbuffered raw stream, so
    it is never loaded into memory whole. A final line with no trailing
    newline is not counted.

    Args:
        filename: Path of the file to scan.

    Returns:
        Number of ``b"\\n"`` bytes in the file.
    """
    lines = 0
    buf_size = 1024 * 1024
    # The context manager closes the handle; the original leaked it.
    with open(filename, 'rb') as f:
        read_f = f.raw.read
        buf = read_f(buf_size)
        while buf:
            lines += buf.count(b'\n')
            buf = read_f(buf_size)
    return lines