mirror of https://github.com/hpcaitech/ColossalAI
33 lines
930 B
Python
33 lines
930 B
Python
import datetime
|
|
|
|
import praw
|
|
import psaw
|
|
import tqdm
|
|
|
|
# Collect the outbound URLs of Reddit link submissions via the Pushshift API
# (through the psaw client) and write them, one per line, to 'urls.txt'.
api = psaw.PushshiftAPI()

# All posts until the end of 2017.
# The cutoff is built as a timezone-aware UTC datetime: a naive datetime's
# .timestamp() is interpreted in the machine's local timezone, which would
# shift the cutoff by the host's UTC offset.
end_time = int(
    datetime.datetime(2018, 1, 1, tzinfo=datetime.timezone.utc).timestamp()
)

# Request only the fields the loop below reads ('url' and 'score'); restrict
# to non-self, non-NSFW submissions with a score above 2.
query = api.search_submissions(before=end_time,
                               filter=['url', 'score'],
                               sort='desc',
                               score='>2',
                               is_self=False,
                               over_18=False)

with tqdm.tqdm() as pbar:
    # Download links from submissions.
    # UTF-8 is pinned explicitly so URLs containing non-ASCII characters do
    # not fail on platforms whose default locale encoding is narrower.
    with open('urls.txt', 'w', encoding='utf-8') as fh:
        for subm in query:
            # Weird issue with psaw/pushshift that breaks score=">2", so the
            # score threshold is re-applied on the client side.
            if subm.score < 3:
                continue

            pbar.update(1)
            fh.write(subm.url + '\n')
            # NOTE(review): flushing after every URL is kept from the
            # original; presumably so an interrupted run leaves a usable
            # partial file - confirm before batching writes.
            fh.flush()
|