# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import glob
import re
import sys
import time

import tldextract

# List of the domains to blacklist.
domain_blacklist = set([
    '500px',
    'aapks',
    'akamaihd',
    'amazon',
    'apple',
    'artifactfire',
    'artstation',
    'awwni',
    'bandcamp',
    'battleforthenet',
    'coinscalendar',
    'dailymotion',
    'deviantart',
    'discord',
    'discordapp',
    'dlapkandroid',
    'dropbox',
    'e621',
    'ebay',
    'edealinfo',
    'erome',
    'eroshare',
    'explosm',
    'facebook',
    'fbcdn',
    'flickr',
    'furaffinity',
    'futhead',
    'gatopardo',
    'gfycat',
    'gifsound',
    'gifsoup',
    'giphy',
    'github',
    'google',
    'gunprime',
    'gyazo',
    'hotdealstar',
    'imagefap',
    'imageshack',
    'imgflip',
    'imgur',
    'instagram',
    'karmadecay',
    'kryptocal',
    'kym-cdn',
    'liveleak',
    'livememe',
    'lmgtfy',
    'magaimg',
    'memegenerator',
    'minorplanetcenter',
    'minus',
    'mobafire',
    'morejpeg',
    'nocookie',
    'pcpartpicker',
    'photobucket',
    'pinimg',
    'pinterest',
    'pixiv',
    'pornhub',
    'prntscr',
    'puu',
    'qkme',
    'quickmeme',
    'radd',
    'redd',
    'reddit',
    'reddit-stream',
    'redditlog',
    'redditmedia',
    'reddituploads',
    'redtube',
    'reupp',
    'reverb',
    'roanoke',
    'rollingstone',
    'sli',
    'soundcloud',
    'soundgasm',
    'spankbang',
    'spotify',
    'strawpoll',
    'streamable',
    'timeanddate',
    'tinypic',
    'touhouradio',
    'tumblr',
    'twimg',
    'twitch',
    'twitter',
    'vid',
    'vimeo',
    'vine',
    'vkaao',
    'vocaroo',
    'voyagefusion',
    'walmart',
    'wciu',
    'wikimedia',
    'wikipedia',
    'xhamster',
    'xkcd',
    'xvideos',
    'youtu',
    'youtube',
    'youtubedoubler',
    'ytimg',
    'zillexplorer',
])
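
# Note: the entries above are bare registered-domain names (no suffix); they
# are compared against tldextract's `domain` field in domain_is_in_blacklist()
# below.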


def domain_is_in_blacklist(url):
    domain = tldextract.extract(url).domain
    return domain in domain_blacklist
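
# Illustrative example (relying on tldextract's usual splitting): for
# 'https://www.youtube.com/watch?v=abc', tldextract.extract() gives
# subdomain='www', domain='youtube', suffix='com'; 'youtube' is in
# domain_blacklist, so the URL is dropped. Because only the registered
# domain is compared, 'youtube.co.uk' would be caught as well.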


# List of extensions to blacklist.
extensions_blacklist = (
    '.3gp',
    '.7z',
    '.ai',
    '.aif',
    '.apk',
    '.app',
    '.avi',
    '.bin',
    '.bmp',
    '.bz2',
    '.css',
    '.csv',
    '.dat',
    '.deb',
    '.dmg',
    '.doc',
    '.docx',
    '.exe',
    '.gif',
    '.gifv',
    '.gz',
    '.iso',
    '.jar',
    '.jpeg',
    '.jpg',
    '.js',
    '.log',
    '.mid',
    '.midi',
    '.mkv',
    '.mov',
    '.mp3',
    '.mp4',
    '.mpeg',
    '.mpg',
    '.ogg',
    '.ogv',
    '.otf',
    '.pdf',
    '.pkg',
    '.png',
    '.pps',
    '.ppt',
    '.pptx',
    '.psd',
    '.py',
    '.qt',
    '.ram',
    '.rar',
    '.sql',
    '.svg',
    '.swf',
    '.tar.gz',
    '.tar',
    '.tgz',
    '.tiff',
    '.ttf',
    '.txt',
    '.wav',
    '.webm',
    '.wma',
    '.wmv',
    '.xls',
    '.xlsx',
    '.xml',
    '.xz',
    '.zip',
)
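
# Note: str.endswith() accepts a tuple of suffixes and tests each in turn,
# which is why extensions_blacklist is kept as a tuple rather than a set.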


def extension_is_in_blacklist(url):
    # Match against the path only: strip any query string, lower-case the
    # result, and compare the suffix against the blacklist tuple.
    return url.split('?')[0].lower().endswith(extensions_blacklist)
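
# Illustrative example: for 'https://example.com/photo.JPG?dl=1' the split
# and lower() yield 'https://example.com/photo.jpg', which ends with '.jpg',
# so the URL is rejected.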


# Malformed urls.
# This regex is adapted from:
# https://stackoverflow.com/questions/7160737/python-how-to-validate-a-url-in-python-malformed-or-not
url_regex = re.compile(
    r'^(?:http)s?://'  # http:// or https://
    r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'  # domain...
    r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # ...or ip
    r'(?::\d+)?'  # optional port
    r'(?:/?|[/?]\S+)$',
    re.IGNORECASE)


def url_is_malformed(url):
    return re.match(url_regex, url) is None
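
# Illustrative example: 'https://example.com/page?x=1' matches the pattern
# and is kept, while 'htp://example' (bad scheme, no top-level domain) does
# not match and is counted as malformed.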


def print_progress(prefix, start_time, urls_counter, domain_blacklist_counter,
                   extension_blacklist_counter, short_url_counter,
                   malformed_url_counter, duplicate_url_counter):
    string = prefix + ' | '
    string += 'time elapsed (s): {:.2f} | '.format(time.time() - start_time)
    string += 'number of urls: {} | '.format(urls_counter)
    string += 'domain blacklisted: {} | '.format(domain_blacklist_counter)
    string += 'extension blacklisted: {} | '.format(extension_blacklist_counter)
    string += 'short urls (<=8): {} | '.format(short_url_counter)
    string += 'malformed urls: {} | '.format(malformed_url_counter)
    string += 'duplicate urls: {}'.format(duplicate_url_counter)
    print(string, flush=True)
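
# Example output (the format is fixed by the code above; the numbers are
# illustrative):
#   PROGRESS | time elapsed (s): 12.34 | number of urls: 100000 | ...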


if __name__ == '__main__':

    print('remove blacklisted urls ..')

    # Path to the directory containing the url files.
    path = sys.argv[1]
    # Output url file.
    output = sys.argv[2]
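
    # Usage sketch (the script name here is an assumption):
    #   python blacklist_urls.py <dir containing url .txt files> <output file>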

    # Get the list of url files.
    files = glob.glob(path + '/*.txt')
    print('> found {} files'.format(len(files)))

    urls = set()
    urls_counter = 0
    domain_blacklist_counter = 0
    extension_blacklist_counter = 0
    short_url_counter = 0
    malformed_url_counter = 0
    duplicate_url_counter = 0
    start_time = time.time()
    for filename in files:
        with open(filename, 'r') as f:
            for line in f:
                url = line.strip()
                urls_counter += 1
                if domain_is_in_blacklist(url):
                    print('[DOMAIN BLACKLIST]: {}'.format(url), flush=True)
                    domain_blacklist_counter += 1
                elif extension_is_in_blacklist(url):
                    print('[EXTENSION BLACKLIST]: {}'.format(url), flush=True)
                    extension_blacklist_counter += 1
                elif len(url) <= 8:
                    print('[SHORT URL]: {}'.format(url), flush=True)
                    short_url_counter += 1
                elif url_is_malformed(url):
                    print('[MALFORMED URL]: {}'.format(url), flush=True)
                    malformed_url_counter += 1
                elif url in urls:
                    print('[DUPLICATE URL]: {}'.format(url), flush=True)
                    duplicate_url_counter += 1
                else:
                    urls.add(url)
                if urls_counter % 100000 == 0:
                    print_progress('PROGRESS', start_time, urls_counter,
                                   domain_blacklist_counter,
                                   extension_blacklist_counter,
                                   short_url_counter, malformed_url_counter,
                                   duplicate_url_counter)

    print_progress('FINAL', start_time, urls_counter, domain_blacklist_counter,
                   extension_blacklist_counter, short_url_counter,
                   malformed_url_counter, duplicate_url_counter)

    # Write the final set of urls.
    print('> writing cleaned up url list to {}'.format(output))
    with open(output, 'w') as f:
        for url in urls:
            f.write(url + '\n')

    print('done :-)')