diff --git a/.codespellrc b/.codespellrc index 86ce487f..8de44712 100644 --- a/.codespellrc +++ b/.codespellrc @@ -6,7 +6,7 @@ check-hidden = true # Try to identify incomplete words which are part of a regex, hence having [] at the beginning # Ignore all urls as something with :// in it # Ignore all lines with codespell-ignore in them for pragma annotation -ignore-regex = (\b([A-Z][A-Z][A-Z]+|gir\.st)\b)|\[[a-zA-Z]+\][a-z]+\b|[a-z]+://\S+|.*codespell-ignore.* +ignore-regex = (\b([A-Z][A-Z][A-Z]+|gir\.st)\b)|\[[a-zA-Z]+\][a-z]+\b|[a-z]+://\S+|^\w*bots\w*\s*=.*|.*codespell-ignore.* # some oddly named variables, some names, etc # wee -- comes in regex etc for weeks ignore-words-list = assertIn,theis,timere,alls,wee,wight,ans,re-use,pre-emptive diff --git a/config/filter.d/apache-badbots.conf b/config/filter.d/apache-badbots.conf index 12d4105b..564fbf06 100644 --- a/config/filter.d/apache-badbots.conf +++ b/config/filter.d/apache-badbots.conf @@ -8,9 +8,11 @@ [Definition] badbotscustom = EmailCollector|WebEMailExtrac|TrackBack/1\.02|sogou music spider|(?:Mozilla/\d+\.\d+ )?Jorgee -badbots = Atomic_Email_Hunter/4\.0|atSpider/1\.0|autoemailspider|bwh3_user_agent|China Local Browse 2\.6|ContactBot/0\.2|ContentSmartz|DataCha0s/2\.0|DBrowse 1\.4b|DBrowse 1\.4d|Demo Bot DOT 16b|Demo Bot Z 16b|DSurf15a 01|DSurf15a 71|DSurf15a 81|DSurf15a VA|EBrowse 1\.4b|Educate Search VxB|EmailSiphon|EmailSpider|EmailWolf 1\.00|ESurf15a 15|ExtractorPro|Franklin Locator 1\.8|FSurf15a 01|Full Web Bot 0416B|Full Web Bot 0516B|Full Web Bot 2816B|Guestbook Auto Submitter|Industry Program 1\.0\.x|ISC Systems iRc Search 2\.1|IUPUI Research Bot v 1\.9a|LARBIN-EXPERIMENTAL \(efp@gmx\.net\)|LetsCrawl\.com/1\.0 \+http\://letscrawl\.com/|Lincoln State Web Browser|LMQueueBot/0\.2|LWP\:\:Simple/5\.803|Mac Finder 1\.0\.xx|MFC Foundation Class Library 4\.0|Microsoft URL Control - 6\.00\.8xxx|Missauga Locate 1\.0\.0|Missigua Locator 1\.9|Missouri College Browse|Mizzu Labs 2\.2|Mo College 1\.9|MVAClient|Mozilla/2\.0 \(compatible; NEWT ActiveX; Win32\)|Mozilla/3\.0 \(compatible; Indy Library\)|Mozilla/3\.0 \(compatible; scan4mail \(advanced version\) http\://www\.peterspages\.net/?scan4mail\)|Mozilla/4\.0 \(compatible; Advanced Email Extractor v2\.xx\)|Mozilla/4\.0 \(compatible; Iplexx Spider/1\.0 http\://www\.iplexx\.at\)|Mozilla/4\.0 \(compatible; MSIE 5\.0; Windows NT; DigExt; DTS Agent|Mozilla/4\.0 efp@gmx\.net|Mozilla/5\.0 \(Version\: xxxx Type\:xx\)|NameOfAgent \(CMS Spider\)|NASA Search 1\.0|Nsauditor/1\.x|PBrowse 1\.4b|PEval 1\.4b|Poirot|Port Huron Labs|Production Bot 0116B|Production Bot 2016B|Production Bot DOT 3016B|Program Shareware 1\.0\.2|PSurf15a 11|PSurf15a 51|PSurf15a VA|psycheclone|RSurf15a 41|RSurf15a 51|RSurf15a 81|searchbot admin@google\.com|ShablastBot 1\.0|snap\.com beta crawler v0|Snapbot/1\.0|Snapbot/1\.0 \(Snap Shots, \+http\://www\.snap\.com\)|sogou develop spider|Sogou Orion spider/3\.0\(\+http\://www\.sogou\.com/docs/help/webmasters\.htm#07\)|sogou spider|Sogou web spider/3\.0\(\+http\://www\.sogou\.com/docs/help/webmasters\.htm#07\)|sohu agent|SSurf15a 11 |TSurf15a 11|Under the Rainbow 2\.2|User-Agent\: Mozilla/4\.0 \(compatible; MSIE 6\.0; Windows NT 5\.1\)|VadixBot|WebVulnCrawl\.unknown/1\.0 libwww-perl/5\.803|Wells Search II|WEP Search 00 +badbots = 01h4x\.com|360Spider|404checker|404enemy|80legs|ADmantX|AIBOT|ALittle Client|ASPSeek|Abonti|Aboundex|Aboundexbot|Acunetix|AdsTxtCrawlerTP|AfD-Verbotsverfahren|AhrefsBot|AiHitBot|Aipbot|Alexibot|AllSubmitter|Alligator|AlphaBot|Anarchie|Anarchy|Anarchy99|Ankit|Anthill|Apexoo|Aspiegel|Asterias|Atomseobot|Attach|AwarioRssBot|AwarioSmartBot|BBBike|BDCbot|BDFetch|BLEXBot|BackDoorBot|BackStreet|BackWeb|Backlink-Ceck|BacklinkCrawler|Badass|Bandit|Barkrowler|BatchFTP|Battleztar Bazinga|BetaBot|Bigfoot|Bitacle|BlackWidow|Black Hole|Blackboard|Blow|BlowFish|Boardreader|Bolt|BotALot|Brandprotect|Brandwatch|Buck|Buddy|BuiltBotTough|BuiltWith|Bullseye|BunnySlippers|BuzzSumo|Bytespider|CATExplorador|CCBot|CODE87|CSHttp|Calculon|CazoodleBot|Cegbfeieh|CensysInspect|ChatGPT-User|CheTeam|CheeseBot|CherryPicker|ChinaClaw|Chlooe|Citoid|Claritybot|Cliqzbot|Cloud mapping|Cocolyzebot|Cogentbot|Collector|Copier|CopyRightCheck|Copyscape|Cosmos|Craftbot|Crawling at Home Project|CrazyWebCrawler|Crescent|CrunchBot|Curious|Custo|CyotekWebCopy|DBLBot|DIIbot|DSearch|DTS Agent|DataCha0s|DatabaseDriverMysqli|Demon|Deusu|Devil|Digincore|DigitalPebble|Dirbuster|Disco|Discobot|Discoverybot|Dispatch|DittoSpyder|DnBCrawler-Analytics|DnyzBot|DomCopBot|DomainAppender|DomainCrawler|DomainSigmaCrawler|DomainStatsBot|Domains Project|Dotbot|Download Wonder|Dragonfly|Drip|ECCP/1\.0|EMail Siphon|EMail Wolf|EasyDL|Ebingbong|Ecxi|EirGrabber|EroCrawler|Evil|Exabot|Express WebPictures|ExtLinksBot|Extractor|ExtractorPro|Extreme Picture Finder|EyeNetIE|Ezooms|FDM|FHscan|FacebookBot|FemtosearchBot|Fimap|Firefox/7\.0|FlashGet|Flunky|Foobot|Freeuploader|FrontPage|Fuzz|FyberSpider|Fyrebot|G-i-g-a-b-o-t|GPTBot|GT\:\:WWW|GalaxyBot|Genieo|GermCrawler|GetRight|GetWeb|Getintent|Gigabot|Go!Zilla|Go-Ahead-Got-It|GoZilla|Google-Extended|Gotit|GrabNet|Grabber|Grafula|GrapeFX|GrapeshotCrawler|GridBot|HEADMasterSEO|HMView|HTMLparser|HTTP\:\:Lite|HTTrack|Haansoft|HaosouSpider|Harvest|Havij|Heritrix|Hloader|HonoluluBot|Humanlinks|HybridBot|IDBTE4M|IDBot|IRLbot|Iblog|Id-search|IlseBot|Image Fetch|Image Sucker|ImagesiftBot|IndeedBot|Indy Library|InfoNaviRobot|InfoTekies|Intelliseek|InterGET|InternetSeer|Internet Ninja|Iria|Iskanie|IstellaBot|JOC Web Spider|JamesBOT|Jbrofuzz|JennyBot|JetCar|Jetty|JikeSpider|Joomla|Jorgee|JustView|Jyxobot|Kenjin Spider|Keybot Translation-Search-Machine|Keyword Density|Kinza|Kozmosbot|LNSpiderguy|LWP\:\:Simple|Lanshanbot|Larbin|Leap|LeechFTP|LeechGet|LexiBot|Lftp|LibWeb|Libwhisker|LieBaoFast|Lightspeedsystems|Likse|LinkScan|LinkWalker|Linkbot|LinkextractorPro|LinkpadBot|LinksManager|LinqiaMetadataDownloaderBot|LinqiaRSSBot|LinqiaScrapeBot|Lipperhey|Lipperhey Spider|Litemage_walker|Lmspider|Ltx71|MFC_Tear_Sample|MIDown tool|MIIxpc|MJ12bot|MQQBrowser|MSFrontPage|MSIECrawler|MTRobot|Mag-Net|Magnet|Mail\.RU_Bot|Majestic-SEO|Majestic12|Majestic SEO|MarkMonitor|MarkWatch|Mass Downloader|Masscan|Mata Hari|MauiBot|Mb2345Browser|MeanPath Bot|Meanpathbot|Mediatoolkitbot|MegaIndex\.ru|Metauri|MicroMessenger|Microsoft Data Access|Microsoft URL Control|Minefield|Mister PiX|Moblie Safari|Mojeek|Mojolicious|MolokaiBot|Morfeus Fucking Scanner|Mozlila|Mr\.4x3|Msrabot|Musobot|NICErsPRO|NPbot|Name Intelligence|Nameprotect|Navroad|NearSite|Needle|Nessus|NetAnts|NetLyzer|NetMechanic|NetSpider|NetZIP|Net Vampire|Netcraft|Nettrack|Netvibes|NextGenSearchBot|Nibbler|Niki-bot|Nikto|NimbleCrawler|Nimbostratus|Ninja|Nmap|Nuclei|Nutch|Octopus|Offline Explorer|Offline Navigator|OnCrawl|OpenLinkProfiler|OpenVAS|Openfind|Openvas|OrangeBot|OrangeSpider|OutclicksBot|OutfoxBot|PECL\:\:HTTP|PHPCrawl|POE-Component-Client-HTTP|PageAnalyzer|PageGrabber|PageScorer|PageThing\.com|Page Analyzer|Pandalytics|Panscient|Papa Foto|Pavuk|PeoplePal|Petalbot|Pi-Monster|Picscout|Picsearch|PictureFinder|Piepmatz|Pimonster|Pixray|PleaseCrawl|Pockey|ProPowerBot|ProWebWalker|Probethenet|Proximic|Psbot|Pu_iN|Pump|PxBroker|PyCurl|QueryN Metasearch|Quick-Crawler|RSSingBot|Rainbot|RankActive|RankActiveLinkBot|RankFlex|RankingBot|RankingBot2|Rankivabot|RankurBot|Re-re|ReGet|RealDownload|Reaper|RebelMouse|Recorder|RedesScrapy|RepoMonkey|Ripper|RocketCrawler|Rogerbot|SBIder|SEOkicks|SEOkicks-Robot|SEOlyticsCrawler|SEOprofiler|SEOstats|SISTRIX|SMTBot|SalesIntelligent|ScanAlert|Scanbot|ScoutJet|Scrapy|Screaming|ScreenerBot|ScrepyBot|Searchestate|SearchmetricsBot|Seekport|SeekportBot|SemanticJuice|Semrush|SemrushBot|SentiBot|SenutoBot|SeoSiteCheckup|SeobilityBot|Seomoz|Shodan|Siphon|SiteCheckerBotCrawler|SiteExplorer|SiteLockSpider|SiteSnagger|SiteSucker|Site Sucker|Sitebeam|Siteimprove|Sitevigil|SlySearch|SmartDownload|Snake|Snapbot|Snoopy|SocialRankIOBot|Sociscraper|Sogou web spider|Sosospider|Sottopop|SpaceBison|Spammen|SpankBot|Spanner|Spbot|Spinn3r|SputnikBot|Sqlmap|Sqlworm|Sqworm|Steeler|Stripper|Sucker|Sucuri|SuperBot|SuperHTTP|Surfbot|SurveyBot|Suzuran|Swiftbot|Szukacz|T0PHackTeam|T8Abot|Teleport|TeleportPro|Telesoft|Telesphoreo|Telesphorep|TheNomad|The Intraformant|Thumbor|TightTwatBot|TinyTestBot|Titan|Toata|Toweyabot|Tracemyfile|Trendiction|Trendictionbot|True_Robot|Turingos|Turnitin|TurnitinBot|TwengaBot|Twice|Typhoeus|URLy\.Warning|URLy Warning|UnisterBot|Upflow|V-BOT|VB Project|VCI|Vacuum|Vagabondo|VelenPublicWebCrawler|VeriCiteCrawler|VidibleScraper|Virusdie|VoidEYE|Voil|Voltron|WASALive-Bot|WBSearchBot|WEBDAV|WISENutbot|WPScan|WWW-Collector-E|WWW-Mechanize|WWW\:\:Mechanize|WWWOFFLE|Wallpapers|Wallpapers/3\.0|WallpapersHD|WeSEE|WebAuto|WebBandit|WebCollage|WebCopier|WebEnhancer|WebFetch|WebFuck|WebGo IS|WebImageCollector|WebLeacher|WebPix|WebReaper|WebSauger|WebStripper|WebSucker|WebWhacker|WebZIP|Web Auto|Web Collage|Web Enhancer|Web Fetch|Web Fuck|Web Pix|Web Sauger|Web Sucker|Webalta|WebmasterWorldForumBot|Webshag|WebsiteExtractor|WebsiteQuester|Website Quester|Webster|Whack|Whacker|Whatweb|Who\.is Bot|Widow|WinHTTrack|WiseGuys Robot|Wonderbot|Woobot|Wotbox|Wprecon|Xaldon WebSpider|Xaldon_WebSpider|Xenu|YoudaoBot|Zade|Zauba|Zermelo|Zeus|Zitebot|ZmEu|ZoomBot|ZoominfoBot|ZumBot|ZyBorg|adscanner|anthropic-ai|archive\.org_bot|arquivo-web-crawler|arquivo\.pt|autoemailspider|backlink-check|cah\.io\.community|check1\.exe|clark-crawler|coccocbot|cognitiveseo|cohere-ai|com\.plumanalytics|crawl\.sogou\.com|crawler\.feedback|crawler4j|dataforseo\.com|dataforseobot|demandbase-bot|domainsproject\.org|eCatch|evc-batch|facebookscraper|gopher|heritrix|imagesift\.com|instabid|internetVista monitor|ips-agent|isitwp\.com|iubenda-radar|linkdexbot|lwp-request|lwp-trivial|magpie-crawler|meanpathbot|mediawords|muhstik-scan|netEstate NE Crawler|oBot|omgili|openai|openai\.com|page scorer|pcBrowser|plumanalytics|polaris version|probe-image-size|ripz|s1z\.ru|satoristudio\.net|scalaj-http|scan\.lol|seobility|seocompany\.store|seoscanners|seostar|serpstatbot|sexsearcher|sitechecker\.pro|siteripz|sogouspider|sp_auditbot|spyfu|sysscan|tAkeOut|trendiction\.com|trendiction\.de|ubermetrics-technologies\.com|voyagerx\.com|webgains-bot|webmeup-crawler|webpros\.com|webprosbot|x09Mozilla|x22Mozilla|xpymep1\.exe|zauba\.io|zgrab -failregex = ^ -.*"(GET|POST|HEAD).*HTTP.*"(?:%(badbots)s|%(badbotscustom)s)"$ +requri = /\S* +rescode = \d+ +failregex = ^ [^"]*"[A-Z]+\s+%(requri)s\s+[^"]*" %(rescode)s \d+ "[^"]*" "(?:%(badbots)s|%(badbotscustom)s)"$ ignoreregex = @@ -19,6 +21,6 @@ datepattern = ^[^\[]*\[({DATE}) # DEV Notes: # List of bad bots fetched from http://www.user-agents.org -# Generated on Thu Nov 7 14:23:35 PST 2013 by files/gen_badbots. +# Generated on Tue Feb 27 01:57:26 PM EET 2024 by ./files/gen_badbots. # # Author: Yaroslav Halchenko diff --git a/files/gen_badbots b/files/gen_badbots index 9c450bca..c6da0ef8 100755 --- a/files/gen_badbots +++ b/files/gen_badbots @@ -34,16 +34,12 @@ # #-----------------\____________________________________/------------------ -url=http://www.user-agents.org/index.shtml +url=https://raw.githubusercontent.com/mitchellkrogza/nginx-ultimate-bad-bot-blocker/master/_generator_lists/bad-user-agents.list badbots=$( -for f in "" "?g_m" "?moz" "?n_s" "?t_z"; do - wget -q -O- $url$f; -done \ -| grep -h -B4 'S '\ -| sed -e 's/ //g' \ -| awk '/^--/{getline; gsub(" ",""); print $0}' \ -| sed -e 's/\([.\:|()+]\)/\\\1/g' \ +curl -sS $url \ | uniq \ +| sed -e 's/\\ / /g' \ +| sed -e 's/\([.\:|()+]\)/\\\1/g' \ | tr '\n' '|' \ | sed -e 's/|$//g' ) @@ -60,13 +56,18 @@ cat >| config/filter.d/apache-badbots.conf < -.*"(GET|POST).*HTTP.*"(?:%(badbots)s|%(badbotscustom)s)"$ +requri = /\S* +rescode = \d+ +failregex = ^ [^"]*"[A-Z]+\s+%(requri)s\s+[^"]*" %(rescode)s \d+ "[^"]*" "(?:%(badbots)s|%(badbotscustom)s)"$ ignoreregex = +datepattern = ^[^\[]*\[({DATE}) + {^LN-BEG} + # DEV Notes: # List of bad bots fetched from http://www.user-agents.org # Generated on `date` by $0.