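"""Fetch user agent string lists and convert them to csv/json.

A sketch of CLI usage based on the argparse flags defined at the bottom of
this module (assuming the file is saved as useragents.py)::

    python useragents.py            # default: fetch both the xml and whatismybrowser lists
    python useragents.py -x         # fetch only the two xml lists
    python useragents.py -w -p 10   # fetch only the whatismybrowser lists, pages 1-10
"""
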
__author__ = 'John Berlin (n0tan3rd@gmail.com)'
__version__ = '1.0.0'
__copyright__ = 'Copyright (c) 2018-Present John Berlin'
__license__ = 'MIT'

import argparse
import csv
import re
import time
from glob import glob
from os import path, makedirs

import requests
from bs4 import BeautifulSoup
import ujson as json

RAW_LISTS = 'rawUALists'
"""str: Default raw user agent dump path"""

CSV_DUMP = 'csv'
"""str: Default csv user agent list dump path"""

JSON_DUMP = 'json'
"""str: Default json user agent list dump path"""

WIMB_ORDER_RE = re.compile(r'page(\d+)\.html')
"""Pattern: regular expression used to sort the paginated ua html files"""

UA_LIST = [
    'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:44.0) Gecko/20100101 Firefox/44.01',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/54.0.2840.71 Safari/537.36',
    'Mozilla/5.0 (Linux; Ubuntu 14.04) AppleWebKit/537.36 Chromium/35.0.1870.2 '
    'Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/41.0.2228.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like '
    'Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/601.3.9 (KHTML, '
    'like Gecko) Version/9.0.2 Safari/601.3.9',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/47.0.2526.111 Safari/537.36',
    'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:54.0) Gecko/20100101 Firefox/54.0',
]
"""list[str]: user agent strings used when fetching the lists"""


def get_xml_lists(save_path):
    """
    Fetches the xml user agent lists and saves them at save_path
    :param str save_path: Path to the directory to dump the raw user agent xml lists in
    """
    # Fetch first and only open the output file once we know the request
    # succeeded, so a failed download does not leave an empty file behind
    request = requests.get('http://www.user-agents.org/allagents.xml')
    if request.ok:
        with open(path.join(save_path, 'ua_org_allagents.xml'), 'w') as out:
            out.write(request.text)
    else:
        print('Could not get http://www.user-agents.org/allagents.xml')

    request = requests.get(
        'https://techpatterns.com/downloads/firefox/useragentswitcher.xml')
    if request.ok:
        with open(path.join(save_path, 'techpatterns_com_useragentswitcher.xml'), 'w') as out:
            out.write(request.text)
    else:
        print(
            'Could not get https://techpatterns.com/downloads/firefox/useragentswitcher.xml')


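# Expected shape of the two xml lists, inferred from the parsing code in
# gen_from_xml below (illustrative, not an official schema):
#
#   techpatterns_com_useragentswitcher.xml
#     <folder description="Browsers - Windows">
#       <useragent description="..." useragent="Mozilla/5.0 ..."/>
#     </folder>
#
#   ua_org_allagents.xml
#     <user-agent>
#       <Description>...</Description>
#       <String>Mozilla/5.0 ...</String>
#     </user-agent>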
def gen_from_xml(xml_dir, csv_dir=CSV_DUMP, json_dir=JSON_DUMP):
    """
    Generates csv and json versions of techpatterns_com_useragentswitcher.xml
    and ua_org_allagents.xml
    :param str xml_dir: Path to the directory containing the two user agent lists in xml
    :param str csv_dir: Path to directory to dump the csv files in. Defaults to <cwd>/csv
    :param str json_dir: Path to directory to dump the json files in. Defaults to <cwd>/json
    """
    ua_list = []
    print('Generating user agent list for techpatterns_com_useragentswitcher.xml')
    with open(path.join(xml_dir, 'techpatterns_com_useragentswitcher.xml'), 'r') as iin:
        soup = BeautifulSoup(iin, 'lxml')
        for search_folder in ['Browsers - Windows', 'Browsers - Mac',
                              'Browsers - Linux', 'Browsers - Unix',
                              'Mobile Devices', 'Spiders - Search', 'Miscellaneous']:
            print(search_folder)
            for folder in soup.find_all(
                    'folder', attrs={"description": search_folder}):
                for user_agent in folder.find_all('useragent'):
                    ua_list.append(
                        dict(kind=search_folder, description=user_agent['description'],
                             ua=user_agent['useragent']))
    # newline='' keeps the csv module from writing blank rows on Windows
    with open(path.join(csv_dir, 'techpatterns_com_useragentswitcher.csv'), 'w',
              newline='') as csv_out:
        csv_writer = csv.DictWriter(
            csv_out, fieldnames=['kind', 'description', 'ua'])
        csv_writer.writeheader()
        csv_writer.writerows(ua_list)
    with open(path.join(json_dir, 'techpatterns_com_useragentswitcher.json'), 'w') as json_out:
        json_out.write(json.dumps(ua_list))

    ua_list = []
    print('Generating user agent list for ua_org_allagents.xml')
    with open(path.join(xml_dir, 'ua_org_allagents.xml'), 'r') as iin:
        soup = BeautifulSoup(iin, 'xml')
        for user_agent in soup.find_all('user-agent'):
            ua_list.append(dict(description=user_agent.find(
                'Description').text, ua=user_agent.find('String').text))
    with open(path.join(csv_dir, 'ua_org_allagents.csv'), 'w', newline='') as csv_out:
        csv_writer = csv.DictWriter(csv_out, fieldnames=['description', 'ua'])
        csv_writer.writeheader()
        csv_writer.writerows(ua_list)
    with open(path.join(json_dir, 'ua_org_allagents.json'), 'w') as json_out:
        json_out.write(json.dumps(ua_list))


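# Output shape of gen_from_xml, for reference (illustrative values):
#   csv/techpatterns_com_useragentswitcher.csv -> columns: kind, description, ua
#   csv/ua_org_allagents.csv                   -> columns: description, ua
#   json/*.json                                -> [{"description": "...", "ua": "..."}, ...]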
def xml_lists(raw_lists_path, csv_dir=CSV_DUMP, json_dir=JSON_DUMP):
    """
    Fetches the xml user agent lists and transforms them into csv and json
    :param str raw_lists_path: Path to directory to dump the raw lists. Defaults to <cwd>/rawUALists
    :param str csv_dir: Path to directory to dump the csv files in. Defaults to <cwd>/csv
    :param str json_dir: Path to directory to dump the json files in. Defaults to <cwd>/json
    """
    get_xml_lists(raw_lists_path)
    gen_from_xml(raw_lists_path, csv_dir=csv_dir, json_dir=json_dir)


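# The whatismybrowser explore pages are paginated as
#   https://developers.whatismybrowser.com/useragents/explore/software_name/<browser>/<n>
# (pattern taken from base_url/pag_url below); each request rotates through
# UA_LIST and sleeps 2s between pages to be polite to the server.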
def mine_dev_whatismybrowser(browser, save_path=RAW_LISTS, to_page=30):
    """
    Retrieves the user agent strings for a browser listed on
    developers.whatismybrowser.com up to to_page
    :param str browser: The browser to get the paginated list of user agent strings for
    :param str save_path: Path to a directory to dump the raw html pages in.
        Defaults to <cwd>/rawUALists
    :param int to_page: How many pages to extract. Defaults to 30
    """
    browser = browser.lower()
    base_url = "https://developers.whatismybrowser.com/useragents/explore/software_name/%s" \
               % browser
    pag_url = base_url + "/%d"
    save_dir = path.join(save_path, '%sUAHTML' % browser)
    save_html = path.join(save_dir, 'page%d.html')
    makedirs(save_dir, exist_ok=True)
    count = 0
    with requests.Session() as session:
        for i in range(1, to_page + 1):
            # Rotate through UA_LIST so consecutive requests do not all
            # present the same user agent
            request = session.get(pag_url % i,
                                  headers={'User-Agent': UA_LIST[count]}, timeout=5.0)
            count = (count + 1) % len(UA_LIST)
            if request.ok:
                print('Got %s user agents on page %d' % (browser, i))
                with open(save_html % i, 'w') as out:
                    out.write(request.text)
            else:
                print('Could not get %s user agents on page %d' % (browser, i))
            time.sleep(2)


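# Example of the sort key, assuming the page naming used by
# mine_dev_whatismybrowser above:
#   wimb_page_order('rawUALists/chromeUAHTML/page12.html') -> 12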
def wimb_page_order(ua_page):
    """
    Helper for collect_ua_whatismybrowser that sorts the pages in the correct order
    :param str ua_page: Path to a user agent html file
    :return: user agent pagination index
    :rtype: int
    """
    return int(WIMB_ORDER_RE.match(path.basename(ua_page)).group(1))


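# Expected row shape of the whatismybrowser tables, inferred from the
# selectors used below (the site's real markup may differ):
#   <tr>
#     <td class="useragent">Mozilla/5.0 ...</td>
#     <td>version</td> ... <td>commonality</td>
#   </tr>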
def collect_ua_whatismybrowser(
        browser, raw_dir=RAW_LISTS, csv_dir=CSV_DUMP, json_dir=JSON_DUMP):
    """
    Parses all pages associated with a browser, generating browser.csv and browser.json
    :param str browser: The browser to retrieve user agent strings for
    :param str raw_dir: Path to the directory containing the browser html file directory.
        Defaults to <cwd>/rawUALists
    :param str csv_dir: Path to directory to dump the csv files in. Defaults to <cwd>/csv
    :param str json_dir: Path to directory to dump the json files in. Defaults to <cwd>/json
    """
    ua_list = []
    for page in sorted(glob(path.join(raw_dir, '%sUAHTML' % browser, '*.html')),
                       key=wimb_page_order):
        with open(page, 'r') as iin:
            soup = BeautifulSoup(iin, 'lxml')
            for tr in soup.find_all('tr'):
                ua_tds = tr.select('td.useragent')
                if ua_tds:
                    tds = tr.find_all('td')
                    ua_list.append(
                        dict(ua=ua_tds[0].text, version=tds[1].text, commonality=tds[-1].text))
    with open(path.join(csv_dir, '%s.csv' % browser), 'w', newline='') as csv_out:
        csv_writer = csv.DictWriter(
            csv_out, fieldnames=['ua', 'version', 'commonality'])
        csv_writer.writeheader()
        csv_writer.writerows(ua_list)
    with open(path.join(json_dir, '%s.json' % browser), 'w') as json_out:
        json_out.write(json.dumps(ua_list))


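# A minimal sketch of the per-browser pipeline, assuming the default
# rawUALists/, csv/ and json/ directories exist:
#   mine_dev_whatismybrowser('chrome', to_page=5)   # writes rawUALists/chromeUAHTML/page*.html
#   collect_ua_whatismybrowser('chrome')            # writes csv/chrome.csv and json/chrome.json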
def whatismybrowser(raw_list_dir, to_page=30,
                    csv_dir=CSV_DUMP, json_dir=JSON_DUMP):
    """
    Fetches user agent strings for Chrome, Firefox, Opera, Safari, IE, Android browser and
    generates csv and json lists of the user agents per browser
    :param str raw_list_dir: Path to the directory to dump the raw html pages in
    :param int to_page: How many pages to extract per browser. Defaults to 30
    :param str csv_dir: Path to directory to dump the csv files in. Defaults to <cwd>/csv
    :param str json_dir: Path to directory to dump the json files in. Defaults to <cwd>/json
    """
    browser_list = ['chrome', 'firefox', 'opera',
                    'safari', 'internet-explorer', 'android-browser']
    for browser in browser_list:
        print('Fetching user agent strings for %s' % browser)
        mine_dev_whatismybrowser(
            browser, save_path=raw_list_dir, to_page=to_page)
        print('Collecting user agent strings for %s' % browser)
        collect_ua_whatismybrowser(browser, raw_dir=raw_list_dir,
                                   csv_dir=csv_dir, json_dir=json_dir)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(prog='useragents',
                                     description='Get some user agent string lists')
    parser.add_argument('-d', '--dump', help='Directory to dump the raw lists in. '
                                             'Defaults to <cwd>/rawUALists',
                        default=RAW_LISTS, type=str)
    parser.add_argument('-c', '--csv', help='Directory to dump the csv lists in. '
                                            'Defaults to <cwd>/csv',
                        default=CSV_DUMP, type=str)
    parser.add_argument('-j', '--json', help='Directory to dump the json lists in. '
                                             'Defaults to <cwd>/json',
                        default=JSON_DUMP, type=str)
    parser.add_argument('-p', '--pages',
                        help='Number of pages that should be retrieved for '
                             'whatismybrowser user agents. Defaults to 30',
                        default=30, type=int)
    fetch_group = parser.add_mutually_exclusive_group()
    fetch_group.add_argument('-a', '--all',
                             help='Get both xml and whatismybrowser lists '
                                  '(the default)',
                             action='store_true')
    fetch_group.add_argument('-w', '--wimb',
                             help='Get whatismybrowser lists',
                             action='store_true')
    fetch_group.add_argument('-x', '--xml',
                             help='Get xml lists',
                             action='store_true')
    args = parser.parse_args()
    makedirs(args.dump, exist_ok=True)
    makedirs(args.csv, exist_ok=True)
    makedirs(args.json, exist_ok=True)
    # With no selection flag (or -a/--all) both kinds of lists are fetched;
    # -x and -w restrict the run to one kind
    if args.xml:
        xml_lists(args.dump, csv_dir=args.csv, json_dir=args.json)
    elif args.wimb:
        whatismybrowser(args.dump, to_page=args.pages, csv_dir=args.csv, json_dir=args.json)
    else:
        xml_lists(args.dump, csv_dir=args.csv, json_dir=args.json)
        whatismybrowser(args.dump, to_page=args.pages, csv_dir=args.csv, json_dir=args.json)