ColossalAI/examples/language/gpt/tools/download/get_urls.py

33 lines
930 B
Python

import datetime
import praw
import psaw
import tqdm
# Pushshift client used to search Reddit submissions.
api = psaw.PushshiftAPI()

# All posts until the end of 2017.
# The cutoff is pinned to UTC: a naive datetime would be interpreted in the
# local timezone of the machine running the script, so the resulting epoch
# value (and therefore the set of submissions returned) would vary from host
# to host.
end_time = int(
    datetime.datetime(2018, 1, 1, tzinfo=datetime.timezone.utc).timestamp()
)

# Search submissions created before the cutoff, requesting only the `url` and
# `score` fields.  The remaining keyword arguments are passed through as
# search filters (score above 2, is_self=False, over_18=False).
query = api.search_submissions(
    before=end_time,
    filter=['url', 'score'],
    sort='desc',
    score='>2',
    is_self=False,
    over_18=False,
)
# Download links from submissions: write one URL per line to urls.txt.
# The progress bar is created without a total and is advanced once per URL
# actually written.
# The output file is opened as UTF-8 explicitly so that a URL containing
# non-ASCII characters cannot abort the run on a platform whose default
# text encoding is narrower.
with tqdm.tqdm() as pbar, open('urls.txt', 'w', encoding='utf-8') as fh:
    for subm in query:
        # weird issue with psaw/pushshift that breaks score=">2",
        # so the score threshold is re-checked here on the client side
        if subm.score < 3:
            continue
        pbar.update(1)
        fh.write(subm.url + '\n')
        # Flush after every URL -- presumably so that already-collected
        # links are on disk if the run is interrupted.
        fh.flush()