__author__ = 'John Berlin (n0tan3rd@gmail.com)'
__version__ = '1.0.0'
__copyright__ = 'Copyright (c) 2018-Present John Berlin'
__license__ = 'MIT'

import time
import csv
import re
from os import path, makedirs
from glob import glob
import argparse
import requests
from bs4 import BeautifulSoup
import ujson as json

RAW_LISTS = 'rawUALists'
"""str: Default raw user agent dump path"""

CSV_DUMP = 'csv'
"""str: Default csv user agent list dump path"""

JSON_DUMP = 'json'
"""str: Default json user agent list dump path"""

WIMB_ORDER_RE = re.compile(r'page(\d+)\.html')
"""Pattern: Regular expression helper for sorting paginated ua html files"""

UA_LIST = [
    'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:44.0) Gecko/20100101 Firefox/44.01',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/54.0.2840.71 Safari/537.36',
    'Mozilla/5.0 (Linux; Ubuntu 14.04) AppleWebKit/537.36 Chromium/35.0.1870.2 '
    'Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/41.0.2228.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like '
    'Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/601.3.9 (KHTML, '
    'like Gecko) Version/9.0.2 Safari/601.3.9',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/47.0.2526.111 Safari/537.36',
    'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:54.0) Gecko/20100101 Firefox/54.0',
]
"""list[str]: User agent strings rotated through when fetching the lists"""


def get_xml_lists(save_path):
    """
    Fetches the xml user agent lists and saves them at save_path

    :param str save_path: Path to the directory to dump the raw user agent xml lists in
    """
    # Fetch first and only write on success so a failed request
    # does not leave behind an empty file
    request = requests.get('http://www.user-agents.org/allagents.xml')
    if request.ok:
        with open(path.join(save_path, 'ua_org_allagents.xml'), 'w') as out:
            out.write(request.text)
    else:
        print('Could not get http://www.user-agents.org/allagents.xml')
    request = requests.get(
        'https://techpatterns.com/downloads/firefox/useragentswitcher.xml')
    if request.ok:
        with open(path.join(save_path,
                            'techpatterns_com_useragentswitcher.xml'), 'w') as out:
            out.write(request.text)
    else:
        print('Could not get '
              'https://techpatterns.com/downloads/firefox/useragentswitcher.xml')
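# A minimal usage sketch for the fetcher above (network access assumed; it
# simply uses the RAW_LISTS default defined earlier, any writable directory works):
#
#     makedirs(RAW_LISTS, exist_ok=True)
#     get_xml_lists(RAW_LISTS)
#     # -> writes ua_org_allagents.xml and techpatterns_com_useragentswitcher.xml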
def gen_from_xml(xml_dir, csv_dir=CSV_DUMP, json_dir=JSON_DUMP):
    """
    Generates csv and json versions of techpatterns_com_useragentswitcher.xml
    and ua_org_allagents.xml

    :param str xml_dir: Path to the directory containing the two user agent lists in xml
    :param str csv_dir: Path to directory to dump the csv files in. Defaults to /csv
    :param str json_dir: Path to directory to dump the json files in. Defaults to /json
    """
    ua_list = []
    print('Generating user agent list for techpatterns_com_useragentswitcher.xml')
    with open(path.join(xml_dir, 'techpatterns_com_useragentswitcher.xml'), 'r') as iin:
        soup = BeautifulSoup(iin, 'lxml')
        for search_folder in ['Browsers - Windows', 'Browsers - Mac',
                              'Browsers - Linux', 'Browsers - Unix',
                              'Mobile Devices', 'Spiders - Search', 'Miscellaneous']:
            print(search_folder)
            for folder in soup.find_all('folder',
                                        attrs={"description": search_folder}):
                for user_agent in folder.find_all('useragent'):
                    ua_list.append(
                        dict(kind=search_folder,
                             description=user_agent['description'],
                             ua=user_agent['useragent']))
    with open(path.join(csv_dir, 'techpatterns_com_useragentswitcher.csv'),
              'w') as csv_out:
        csv_writer = csv.DictWriter(csv_out,
                                    fieldnames=['kind', 'description', 'ua'])
        csv_writer.writeheader()
        csv_writer.writerows(ua_list)
    with open(path.join(json_dir, 'techpatterns_com_useragentswitcher.json'),
              'w') as json_out:
        json_out.write(json.dumps(ua_list))
    ua_list = []
    print('Generating user agent list for ua_org_allagents.xml')
    with open(path.join(xml_dir, 'ua_org_allagents.xml'), 'r') as iin:
        soup = BeautifulSoup(iin, 'xml')
        for user_agent in soup.find_all('user-agent'):
            ua_list.append(dict(description=user_agent.find('Description').text,
                                ua=user_agent.find('String').text))
    with open(path.join(csv_dir, 'ua_org_allagents.csv'), 'w') as csv_out:
        csv_writer = csv.DictWriter(csv_out, fieldnames=['description', 'ua'])
        csv_writer.writeheader()
        csv_writer.writerows(ua_list)
    with open(path.join(json_dir, 'ua_org_allagents.json'), 'w') as json_out:
        json_out.write(json.dumps(ua_list))


def xml_lists(raw_lists_path, csv_dir=CSV_DUMP, json_dir=JSON_DUMP):
    """
    Fetches the xml user agent lists and transforms them into csv and json

    :param str raw_lists_path: Path to directory to dump the raw lists.
        Defaults to /rawUALists
    :param str csv_dir: Path to directory to dump the csv files in. Defaults to /csv
    :param str json_dir: Path to directory to dump the json files in. Defaults to /json
    """
    get_xml_lists(raw_lists_path)
    gen_from_xml(raw_lists_path, csv_dir=csv_dir, json_dir=json_dir)


def mine_dev_whatismybrowser(browser, save_path=RAW_LISTS, to_page=30):
    """
    Retrieves the user agent strings for a browser listed on
    developers.whatismybrowser.com, up to to_page pages

    :param str browser: The browser to get the paginated list of user agent strings for
    :param str save_path: Path to a directory to dump the raw html pages in.
        Defaults to /rawUALists
    :param int to_page: How many pages to extract. Defaults to 30
    """
    browser = browser.lower()
    base_url = 'https://developers.whatismybrowser.com/useragents/explore/software_name/%s' \
               % browser
    pag_url = base_url + '/%d'
    save_dir = path.join(save_path, '%sUAHTML' % browser)
    save_html = path.join(save_dir, 'page%d.html')
    if not path.exists(save_dir):
        makedirs(save_dir, exist_ok=True)
    count = 0
    with requests.Session() as session:
        for i in range(1, to_page + 1):
            request = session.get(pag_url % i,
                                  headers={'User-Agent': UA_LIST[count]},
                                  timeout=5.0)
            # Rotate through UA_LIST so consecutive requests present
            # different user agents
            count = (count + 1) % len(UA_LIST)
            if request.ok:
                print('Got %s user agents on page %d' % (browser, i))
                with open(save_html % i, 'w') as out:
                    out.write(request.text)
            else:
                print('Could not get %s user agents on page %d' % (browser, i))
            time.sleep(2)  # be polite to the server between page fetches
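# A quick sketch of fetching a single browser's pages (network access assumed;
# 'firefox' is just an example value, any software_name the site lists works):
#
#     mine_dev_whatismybrowser('firefox', save_path=RAW_LISTS, to_page=3)
#     # -> rawUALists/firefoxUAHTML/page1.html ... page3.html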
def wimb_page_order(ua_page):
    """
    Helper for collect_ua_whatismybrowser that sorts the pages in correct order

    :param str ua_page: Path to user agent html file
    :return int: user agent pagination index
    """
    return int(WIMB_ORDER_RE.match(path.basename(ua_page)).group(1))


def collect_ua_whatismybrowser(browser, raw_dir=RAW_LISTS, csv_dir=CSV_DUMP,
                               json_dir=JSON_DUMP):
    """
    Parses all pages associated with a browser, generating browser.csv
    and browser.json

    :param str browser: The browser to retrieve user agent strings for
    :param str raw_dir: Path to the directory containing the browser html file
        directory. Defaults to /rawUALists
    :param str csv_dir: Path to directory to dump the csv files in. Defaults to /csv
    :param str json_dir: Path to directory to dump the json files in. Defaults to /json
    """
    ua_list = []
    for page in sorted(glob(path.join(raw_dir, '%sUAHTML' % browser, '*.html')),
                       key=wimb_page_order):
        with open(page, 'r') as iin:
            soup = BeautifulSoup(iin, 'lxml')
            for tr in soup.find_all('tr'):
                ua_tds = tr.select('td.useragent')
                if ua_tds:
                    tds = tr.find_all('td')
                    ua_list.append(dict(ua=ua_tds[0].text,
                                        version=tds[1].text,
                                        commonality=tds[-1].text))
    with open(path.join(csv_dir, '%s.csv' % browser), 'w') as csv_out:
        csv_writer = csv.DictWriter(csv_out,
                                    fieldnames=['ua', 'version', 'commonality'])
        csv_writer.writeheader()
        csv_writer.writerows(ua_list)
    with open(path.join(json_dir, '%s.json' % browser), 'w') as json_out:
        json_out.write(json.dumps(ua_list))


def whatismybrowser(raw_list_dir, to_page=30, csv_dir=CSV_DUMP, json_dir=JSON_DUMP):
    """
    Fetches user agent strings for Chrome, Firefox, Opera, Safari, IE, and the
    Android browser and generates csv and json lists of the user agents per browser

    :param str raw_list_dir: Path to directory to dump the raw lists.
        Defaults to /rawUALists
    :param int to_page: How many pages to extract. Defaults to 30
    :param str csv_dir: Path to directory to dump the csv files in. Defaults to /csv
    :param str json_dir: Path to directory to dump the json files in. Defaults to /json
    """
    browser_list = ['chrome', 'firefox', 'opera', 'safari', 'internet-explorer',
                    'android-browser']
    for browser in browser_list:
        print('Fetching user agent strings for %s' % browser)
        mine_dev_whatismybrowser(browser, save_path=raw_list_dir, to_page=to_page)
        print('Collecting user agent strings for %s' % browser)
        collect_ua_whatismybrowser(browser, raw_dir=raw_list_dir,
                                   csv_dir=csv_dir, json_dir=json_dir)
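# A small sketch of why wimb_page_order is used as the sort key above: it orders
# the saved pages numerically rather than lexicographically (hypothetical names):
#
#     sorted(['page10.html', 'page2.html', 'page1.html'], key=wimb_page_order)
#     # -> ['page1.html', 'page2.html', 'page10.html']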
if __name__ == '__main__':
    parser = argparse.ArgumentParser(prog='useragents',
                                     description='Get some user agent string lists')
    parser.add_argument('-d', '--dump',
                        help='Directory to dump the raw lists. '
                             'Defaults to /rawUALists',
                        default=RAW_LISTS, type=str)
    parser.add_argument('-c', '--csv',
                        help='Directory to dump the csv lists in. Defaults to /csv',
                        default=CSV_DUMP, type=str)
    parser.add_argument('-j', '--json',
                        help='Directory to dump the json lists in. Defaults to /json',
                        default=JSON_DUMP, type=str)
    parser.add_argument('-p', '--pages',
                        help='Number of pages that should be retrieved for '
                             'whatismybrowser user agents. Defaults to 30',
                        default=30, type=int)
    fetch_group = parser.add_mutually_exclusive_group()
    fetch_group.add_argument('-a', '--all',
                             help='Get both xml and whatismybrowser lists '
                                  '(the default)',
                             action='store_true')
    fetch_group.add_argument('-w', '--wimb', help='Get whatismybrowser lists',
                             action='store_true')
    fetch_group.add_argument('-x', '--xml', help='Get xml lists',
                             action='store_true')
    args = parser.parse_args()
    if not path.exists(args.dump):
        makedirs(args.dump, exist_ok=True)
    if not path.exists(args.csv):
        makedirs(args.csv, exist_ok=True)
    if not path.exists(args.json):
        makedirs(args.json, exist_ok=True)
    # -x and -w each select a single source; with neither flag (or with -a)
    # fetch both sources
    if args.xml:
        xml_lists(args.dump, csv_dir=args.csv, json_dir=args.json)
    elif args.wimb:
        whatismybrowser(args.dump, to_page=args.pages,
                        csv_dir=args.csv, json_dir=args.json)
    else:
        xml_lists(args.dump, csv_dir=args.csv, json_dir=args.json)
        whatismybrowser(args.dump, to_page=args.pages,
                        csv_dir=args.csv, json_dir=args.json)
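# Example invocations, assuming this file is saved as useragents.py (the
# actual file name is not fixed by the script itself):
#
#     python useragents.py --xml          # only the two xml-based lists
#     python useragents.py --wimb -p 10   # whatismybrowser lists, 10 pages each
#     python useragents.py                # both sources, all defaults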