Add files via upload
parent 1491253d06
commit 2afab9ed57

@@ -0,0 +1,211 @@
__all__ = [
    'AbstractBasicAuthHandler',
    'AbstractDigestAuthHandler',
    'BaseHandler',
    'Browser',
    'BrowserStateError',
    'CacheFTPHandler',
    'ContentTooShortError',
    'Cookie',
    'CookieJar',
    'CookiePolicy',
    'DefaultCookiePolicy',
    'DefaultFactory',
    'FTPHandler',
    'Factory',
    'FileCookieJar',
    'FileHandler',
    'FormNotFoundError',
    'FormsFactory',
    'HTTPBasicAuthHandler',
    'HTTPCookieProcessor',
    'HTTPDefaultErrorHandler',
    'HTTPDigestAuthHandler',
    'HTTPEquivProcessor',
    'HTTPError',
    'HTTPErrorProcessor',
    'HTTPHandler',
    'HTTPPasswordMgr',
    'HTTPPasswordMgrWithDefaultRealm',
    'HTTPProxyPasswordMgr',
    'HTTPRedirectDebugProcessor',
    'HTTPRedirectHandler',
    'HTTPRefererProcessor',
    'HTTPRefreshProcessor',
    'HTTPResponseDebugProcessor',
    'HTTPRobotRulesProcessor',
    'HTTPSClientCertMgr',
    'HeadParser',
    'History',
    'LWPCookieJar',
    'Link',
    'LinkNotFoundError',
    'LinksFactory',
    'LoadError',
    'MSIECookieJar',
    'MozillaCookieJar',
    'OpenerDirector',
    'OpenerFactory',
    'ParseError',
    'ProxyBasicAuthHandler',
    'ProxyDigestAuthHandler',
    'ProxyHandler',
    'Request',
    'RobotExclusionError',
    'RobustFactory',
    'RobustFormsFactory',
    'RobustLinksFactory',
    'RobustTitleFactory',
    'SeekableResponseOpener',
    'TitleFactory',
    'URLError',
    'USE_BARE_EXCEPT',
    'UnknownHandler',
    'UserAgent',
    'UserAgentBase',
    'XHTMLCompatibleHeadParser',
    '__version__',
    'build_opener',
    'install_opener',
    'lwp_cookie_str',
    'make_response',
    'request_host',
    'response_seek_wrapper',  # XXX deprecate in public interface?
    'seek_wrapped_response',  # XXX should probably use this internally in
                              # place of response_seek_wrapper()
    'str2time',
    'urlopen',
    'urlretrieve',
    'urljoin',

    # ClientForm API
    'AmbiguityError',
    'ControlNotFoundError',
    'FormParser',
    'ItemCountError',
    'ItemNotFoundError',
    'LocateError',
    'Missing',
    'ParseFile',
    'ParseFileEx',
    'ParseResponse',
    'ParseResponseEx',
    'ParseString',
    'XHTMLCompatibleFormParser',
    # deprecated
    'CheckboxControl',
    'Control',
    'FileControl',
    'HTMLForm',
    'HiddenControl',
    'IgnoreControl',
    'ImageControl',
    'IsindexControl',
    'Item',
    'Label',
    'ListControl',
    'PasswordControl',
    'RadioControl',
    'ScalarControl',
    'SelectControl',
    'SubmitButtonControl',
    'SubmitControl',
    'TextControl',
    'TextareaControl',
    ]

import logging
import sys

from _version import __version__

# high-level stateful browser-style interface
from _mechanize import \
     Browser, History, \
     BrowserStateError, LinkNotFoundError, FormNotFoundError

# configurable URL-opener interface
from _useragent import UserAgentBase, UserAgent
from _html import \
     Link, \
     Factory, DefaultFactory, RobustFactory, \
     FormsFactory, LinksFactory, TitleFactory, \
     RobustFormsFactory, RobustLinksFactory, RobustTitleFactory

# urllib2 work-alike interface.  This is a superset of the urllib2 interface.
from _urllib2 import *
import _urllib2
if hasattr(_urllib2, "HTTPSHandler"):
    __all__.append("HTTPSHandler")
del _urllib2

# misc
from _http import HeadParser
from _http import XHTMLCompatibleHeadParser
from _opener import ContentTooShortError, OpenerFactory, urlretrieve
from _response import \
     response_seek_wrapper, seek_wrapped_response, make_response
from _rfc3986 import urljoin
from _util import http2time as str2time

# cookies
from _clientcookie import Cookie, CookiePolicy, DefaultCookiePolicy, \
     CookieJar, FileCookieJar, LoadError, request_host_lc as request_host, \
     effective_request_host
from _lwpcookiejar import LWPCookieJar, lwp_cookie_str
# 2.4 raises SyntaxError due to generator / try/finally use
if sys.version_info[:2] > (2, 4):
    try:
        import sqlite3
    except ImportError:
        pass
    else:
        from _firefox3cookiejar import Firefox3CookieJar
from _mozillacookiejar import MozillaCookieJar
from _msiecookiejar import MSIECookieJar

# forms
from _form import (
    AmbiguityError,
    ControlNotFoundError,
    FormParser,
    ItemCountError,
    ItemNotFoundError,
    LocateError,
    Missing,
    ParseError,
    ParseFile,
    ParseFileEx,
    ParseResponse,
    ParseResponseEx,
    ParseString,
    XHTMLCompatibleFormParser,
    # deprecated
    CheckboxControl,
    Control,
    FileControl,
    HTMLForm,
    HiddenControl,
    IgnoreControl,
    ImageControl,
    IsindexControl,
    Item,
    Label,
    ListControl,
    PasswordControl,
    RadioControl,
    ScalarControl,
    SelectControl,
    SubmitButtonControl,
    SubmitControl,
    TextControl,
    TextareaControl,
    )

# If you hate the idea of turning bugs into warnings, do:
# import mechanize; mechanize.USE_BARE_EXCEPT = False
USE_BARE_EXCEPT = True

logger = logging.getLogger("mechanize")
if logger.level is logging.NOTSET:
    logger.setLevel(logging.CRITICAL)
del logger
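For orientation, a minimal usage sketch of the package-level API exported above; the URL is hypothetical, and robots.txt handling is switched off only to keep the sketch short:

import mechanize

br = mechanize.Browser()
br.set_handle_robots(False)
response = br.open("http://example.com/")
print br.title()
for link in br.links():
    print link.absolute_url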
@@ -0,0 +1,68 @@
"""HTTP Authentication and Proxy support.


Copyright 2006 John J. Lee <jjl@pobox.com>

This code is free software; you can redistribute it and/or modify it under
the terms of the BSD or ZPL 2.1 licenses (see the file COPYING.txt
included with the distribution).

"""

from _urllib2_fork import HTTPPasswordMgr


# TODO: stop deriving from HTTPPasswordMgr
class HTTPProxyPasswordMgr(HTTPPasswordMgr):
    # has default realm and host/port
    def add_password(self, realm, uri, user, passwd):
        # uri could be a single URI or a sequence
        if uri is None or isinstance(uri, basestring):
            uris = [uri]
        else:
            uris = uri
        passwd_by_domain = self.passwd.setdefault(realm, {})
        for uri in uris:
            for default_port in True, False:
                reduced_uri = self.reduce_uri(uri, default_port)
                passwd_by_domain[reduced_uri] = (user, passwd)

    def find_user_password(self, realm, authuri):
        attempts = [(realm, authuri), (None, authuri)]
        # bleh, want default realm to take precedence over default
        # URI/authority, hence this outer loop
        for default_uri in False, True:
            for realm, authuri in attempts:
                authinfo_by_domain = self.passwd.get(realm, {})
                for default_port in True, False:
                    reduced_authuri = self.reduce_uri(authuri, default_port)
                    for uri, authinfo in authinfo_by_domain.iteritems():
                        if uri is None and not default_uri:
                            continue
                        if self.is_suburi(uri, reduced_authuri):
                            return authinfo
                user, password = None, None

                if user is not None:
                    break
        return user, password

    def reduce_uri(self, uri, default_port=True):
        if uri is None:
            return None
        return HTTPPasswordMgr.reduce_uri(self, uri, default_port)

    def is_suburi(self, base, test):
        if base is None:
            # default to the proxy's host/port
            hostport, path = test
            base = (hostport, "/")
        return HTTPPasswordMgr.is_suburi(self, base, test)


class HTTPSClientCertMgr(HTTPPasswordMgr):
    # implementation inheritance: this is not a proper subclass
    def add_key_cert(self, uri, key_file, cert_file):
        self.add_password(None, uri, key_file, cert_file)
    def find_key_cert(self, authuri):
        return HTTPPasswordMgr.find_user_password(self, None, authuri)
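A sketch of how these managers are meant to slot into an opener; the proxy host, port and credentials are made up. Passing None as the realm registers a default that matches whatever realm the proxy advertises:

import mechanize

password_mgr = mechanize.HTTPProxyPasswordMgr()
password_mgr.add_password(None, "proxy.example.com:3128", "user", "secret")
opener = mechanize.build_opener(
    mechanize.ProxyHandler({"http": "proxy.example.com:3128"}),
    mechanize.ProxyBasicAuthHandler(password_mgr))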
@@ -0,0 +1,28 @@
import logging

from _response import response_seek_wrapper
from _urllib2_fork import BaseHandler


class HTTPResponseDebugProcessor(BaseHandler):
    handler_order = 900  # before redirections, after everything else

    def http_response(self, request, response):
        if not hasattr(response, "seek"):
            response = response_seek_wrapper(response)
        info = logging.getLogger("mechanize.http_responses").info
        try:
            info(response.read())
        finally:
            response.seek(0)
        info("*****************************************************")
        return response

    https_response = http_response

class HTTPRedirectDebugProcessor(BaseHandler):
    def http_request(self, request):
        if hasattr(request, "redirect_dict"):
            info = logging.getLogger("mechanize.http_redirects").info
            info("redirecting to %s", request.get_full_url())
        return request
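A sketch of wiring the response debug processor into an opener. The mechanize loggers are silenced at import time (see __init__.py above), so a handler and level must be set explicitly before anything is printed; the URL is hypothetical:

import logging
import mechanize

logger = logging.getLogger("mechanize.http_responses")
logger.addHandler(logging.StreamHandler())
logger.setLevel(logging.INFO)

opener = mechanize.build_opener(mechanize.HTTPResponseDebugProcessor())
response = opener.open("http://example.com/")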
@@ -0,0 +1,248 @@
"""Firefox 3 "cookies.sqlite" cookie persistence.

Copyright 2008 John J Lee <jjl@pobox.com>

This code is free software; you can redistribute it and/or modify it
under the terms of the BSD or ZPL 2.1 licenses (see the file
COPYING.txt included with the distribution).

"""

import logging
import time

from _clientcookie import CookieJar, Cookie, MappingIterator
from _util import isstringlike, experimental
debug = logging.getLogger("mechanize.cookies").debug


class Firefox3CookieJar(CookieJar):

    """Firefox 3 cookie jar.

    The cookies are stored in Firefox 3's "cookies.sqlite" format.

    Constructor arguments:

    filename: filename of cookies.sqlite (typically found at the top level
     of a firefox profile directory)
    autoconnect: as a convenience, connect to the SQLite cookies database at
     Firefox3CookieJar construction time (default True)
    policy: an object satisfying the mechanize.CookiePolicy interface

    Note that this is NOT a FileCookieJar, and there are no .load(),
    .save() or .restore() methods.  The database is in sync with the
    cookiejar object's state after each public method call.

    Following Firefox's own behaviour, session cookies are never saved to
    the database.

    The file is created, and an sqlite database written to it, if it does
    not already exist.  The moz_cookies database table is created if it does
    not already exist.
    """

    # XXX
    # handle DatabaseError exceptions
    # add a FileCookieJar (explicit .save() / .revert() / .load() methods)

    def __init__(self, filename, autoconnect=True, policy=None):
        experimental("Firefox3CookieJar is experimental code")
        CookieJar.__init__(self, policy)
        if filename is not None and not isstringlike(filename):
            raise ValueError("filename must be string-like")
        self.filename = filename
        self._conn = None
        if autoconnect:
            self.connect()

    def connect(self):
        import sqlite3  # not available in Python 2.4 stdlib
        self._conn = sqlite3.connect(self.filename)
        self._conn.isolation_level = "DEFERRED"
        self._create_table_if_necessary()

    def close(self):
        self._conn.close()

    def _transaction(self, func):
        try:
            cur = self._conn.cursor()
            try:
                result = func(cur)
            finally:
                cur.close()
        except:
            self._conn.rollback()
            raise
        else:
            self._conn.commit()
        return result

    def _execute(self, query, params=()):
        return self._transaction(lambda cur: cur.execute(query, params))

    def _query(self, query, params=()):
        # XXX should we bother with a transaction?
        cur = self._conn.cursor()
        try:
            cur.execute(query, params)
            return cur.fetchall()
        finally:
            cur.close()

    def _create_table_if_necessary(self):
        self._execute("""\
CREATE TABLE IF NOT EXISTS moz_cookies (id INTEGER PRIMARY KEY, name TEXT,
    value TEXT, host TEXT, path TEXT, expiry INTEGER,
    lastAccessed INTEGER, isSecure INTEGER, isHttpOnly INTEGER)""")

    def _cookie_from_row(self, row):
        (pk, name, value, domain, path, expires,
         last_accessed, secure, http_only) = row

        version = 0
        domain = domain.encode("ascii", "ignore")
        path = path.encode("ascii", "ignore")
        name = name.encode("ascii", "ignore")
        value = value.encode("ascii", "ignore")
        secure = bool(secure)

        # last_accessed isn't a cookie attribute, so isn't added to rest
        rest = {}
        if http_only:
            rest["HttpOnly"] = None

        if name == "":
            name = value
            value = None

        initial_dot = domain.startswith(".")
        domain_specified = initial_dot

        discard = False
        if expires == "":
            expires = None
            discard = True

        return Cookie(version, name, value,
                      None, False,
                      domain, domain_specified, initial_dot,
                      path, False,
                      secure,
                      expires,
                      discard,
                      None,
                      None,
                      rest)

    def clear(self, domain=None, path=None, name=None):
        CookieJar.clear(self, domain, path, name)
        where_parts = []
        sql_params = []
        if domain is not None:
            where_parts.append("host = ?")
            sql_params.append(domain)
            if path is not None:
                where_parts.append("path = ?")
                sql_params.append(path)
                if name is not None:
                    where_parts.append("name = ?")
                    sql_params.append(name)
        where = " AND ".join(where_parts)
        if where:
            where = " WHERE " + where
        def clear(cur):
            cur.execute("DELETE FROM moz_cookies%s" % where,
                        tuple(sql_params))
        self._transaction(clear)

    def _row_from_cookie(self, cookie, cur):
        expires = cookie.expires
        if cookie.discard:
            expires = ""

        domain = unicode(cookie.domain)
        path = unicode(cookie.path)
        name = unicode(cookie.name)
        value = unicode(cookie.value)
        secure = bool(int(cookie.secure))

        if value is None:
            value = name
            name = ""

        last_accessed = int(time.time())
        http_only = cookie.has_nonstandard_attr("HttpOnly")

        query = cur.execute("""SELECT MAX(id) + 1 from moz_cookies""")
        pk = query.fetchone()[0]
        if pk is None:
            pk = 1

        return (pk, name, value, domain, path, expires,
                last_accessed, secure, http_only)

    def set_cookie(self, cookie):
        if cookie.discard:
            CookieJar.set_cookie(self, cookie)
            return

        def set_cookie(cur):
            # XXX
            # is this RFC 2965-correct?
            # could this do an UPDATE instead?
            row = self._row_from_cookie(cookie, cur)
            name, unused, domain, path = row[1:5]
            cur.execute("""\
DELETE FROM moz_cookies WHERE host = ? AND path = ? AND name = ?""",
                        (domain, path, name))
            cur.execute("""\
INSERT INTO moz_cookies VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
""", row)
        self._transaction(set_cookie)

    def __iter__(self):
        # session (non-persistent) cookies
        for cookie in MappingIterator(self._cookies):
            yield cookie
        # persistent cookies
        for row in self._query("""\
SELECT * FROM moz_cookies ORDER BY name, path, host"""):
            yield self._cookie_from_row(row)

    def _cookies_for_request(self, request):
        session_cookies = CookieJar._cookies_for_request(self, request)
        def get_cookies(cur):
            query = cur.execute("SELECT host from moz_cookies")
            domains = [row[0] for row in query.fetchall()]
            cookies = []
            for domain in domains:
                cookies += self._persistent_cookies_for_domain(domain,
                                                               request, cur)
            return cookies
        persistent_cookies = self._transaction(get_cookies)
        return session_cookies + persistent_cookies

    def _persistent_cookies_for_domain(self, domain, request, cur):
        cookies = []
        if not self._policy.domain_return_ok(domain, request):
            return []
        debug("Checking %s for cookies to return", domain)
        query = cur.execute("""\
SELECT * from moz_cookies WHERE host = ? ORDER BY path""",
                            (domain,))
        cookies = [self._cookie_from_row(row) for row in query.fetchall()]
        last_path = None
        r = []
        for cookie in cookies:
            if (cookie.path != last_path and
                not self._policy.path_return_ok(cookie.path, request)):
                last_path = cookie.path
                continue
            if not self._policy.return_ok(cookie, request):
                debug("   not returning cookie")
                continue
            debug("   it's a match")
            r.append(cookie)
        return r
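A minimal usage sketch, assuming Python >= 2.5 with sqlite3 available (otherwise __init__.py above does not export the class); the profile path is hypothetical:

import mechanize

cj = mechanize.Firefox3CookieJar("/path/to/profile/cookies.sqlite")
br = mechanize.Browser()
br.set_cookiejar(cj)
br.open("http://example.com/")  # persistent cookies go straight to the database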
@@ -0,0 +1,105 @@
from cStringIO import StringIO

import _response
import _urllib2_fork


# GzipConsumer was taken from Fredrik Lundh's effbot.org-0.1-20041009 library
class GzipConsumer:

    def __init__(self, consumer):
        self.__consumer = consumer
        self.__decoder = None
        self.__data = ""

    def __getattr__(self, key):
        return getattr(self.__consumer, key)

    def feed(self, data):
        if self.__decoder is None:
            # check if we have a full gzip header
            data = self.__data + data
            try:
                i = 10
                flag = ord(data[3])
                if flag & 4:  # extra
                    x = ord(data[i]) + 256*ord(data[i+1])
                    i = i + 2 + x
                if flag & 8:  # filename
                    while ord(data[i]):
                        i = i + 1
                    i = i + 1
                if flag & 16:  # comment
                    while ord(data[i]):
                        i = i + 1
                    i = i + 1
                if flag & 2:  # crc
                    i = i + 2
                if len(data) < i:
                    raise IndexError("not enough data")
                if data[:3] != "\x1f\x8b\x08":
                    raise IOError("invalid gzip data")
                data = data[i:]
            except IndexError:
                self.__data = data
                return  # need more data
            import zlib
            self.__data = ""
            self.__decoder = zlib.decompressobj(-zlib.MAX_WBITS)
        data = self.__decoder.decompress(data)
        if data:
            self.__consumer.feed(data)

    def close(self):
        if self.__decoder:
            data = self.__decoder.flush()
            if data:
                self.__consumer.feed(data)
        self.__consumer.close()


# --------------------------------------------------------------------

# the rest of this module is John Lee's stupid code, not
# Fredrik's nice code :-)

class stupid_gzip_consumer:
    def __init__(self): self.data = []
    def feed(self, data): self.data.append(data)

class stupid_gzip_wrapper(_response.closeable_response):
    def __init__(self, response):
        self._response = response

        c = stupid_gzip_consumer()
        gzc = GzipConsumer(c)
        gzc.feed(response.read())
        self.__data = StringIO("".join(c.data))

    def read(self, size=-1):
        return self.__data.read(size)
    def readline(self, size=-1):
        return self.__data.readline(size)
    def readlines(self, sizehint=-1):
        return self.__data.readlines(sizehint)

    def __getattr__(self, name):
        # delegate unknown methods/attributes
        return getattr(self._response, name)

class HTTPGzipProcessor(_urllib2_fork.BaseHandler):
    handler_order = 200  # response processing before HTTPEquivProcessor

    def http_request(self, request):
        request.add_header("Accept-Encoding", "gzip")
        return request

    def http_response(self, request, response):
        # post-process response
        enc_hdrs = response.info().getheaders("Content-encoding")
        for enc_hdr in enc_hdrs:
            if ("gzip" in enc_hdr) or ("compress" in enc_hdr):
                return stupid_gzip_wrapper(response)
        return response

    https_response = http_response
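A sketch of turning this handler on through the browser interface, assuming the UserAgentBase wiring that exposes it as the (experimental) set_handle_gzip switch; the URL is hypothetical:

import mechanize

br = mechanize.Browser()
br.set_handle_gzip(True)  # adds Accept-Encoding: gzip and unwraps gzipped bodies
br.open("http://example.com/")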
@@ -0,0 +1,241 @@
"""Utility functions for HTTP header value parsing and construction.

Copyright 1997-1998, Gisle Aas
Copyright 2002-2006, John J. Lee

This code is free software; you can redistribute it and/or modify it
under the terms of the BSD or ZPL 2.1 licenses (see the file
COPYING.txt included with the distribution).

"""

import os, re
from types import StringType
from types import UnicodeType
STRING_TYPES = StringType, UnicodeType

from _util import http2time
import _rfc3986


def is_html_file_extension(url, allow_xhtml):
    ext = os.path.splitext(_rfc3986.urlsplit(url)[2])[1]
    html_exts = [".htm", ".html"]
    if allow_xhtml:
        html_exts += [".xhtml"]
    return ext in html_exts


def is_html(ct_headers, url, allow_xhtml=False):
    """
    ct_headers: Sequence of Content-Type headers
    url: Response URL

    """
    if not ct_headers:
        return is_html_file_extension(url, allow_xhtml)
    headers = split_header_words(ct_headers)
    if len(headers) < 1:
        return is_html_file_extension(url, allow_xhtml)
    first_header = headers[0]
    first_parameter = first_header[0]
    ct = first_parameter[0]
    html_types = ["text/html"]
    if allow_xhtml:
        html_types += [
            "text/xhtml", "text/xml",
            "application/xml", "application/xhtml+xml",
            ]
    return ct in html_types


def unmatched(match):
    """Return unmatched part of re.Match object."""
    start, end = match.span(0)
    return match.string[:start]+match.string[end:]

token_re = re.compile(r"^\s*([^=\s;,]+)")
quoted_value_re = re.compile(r"^\s*=\s*\"([^\"\\]*(?:\\.[^\"\\]*)*)\"")
value_re = re.compile(r"^\s*=\s*([^\s;,]*)")
escape_re = re.compile(r"\\(.)")
def split_header_words(header_values):
    r"""Parse header values into a list of lists containing key,value pairs.

    The function knows how to deal with ",", ";" and "=" as well as quoted
    values after "=".  A list of space separated tokens is parsed as if it
    were separated by ";".

    If the header_values passed as argument contains multiple values, then
    they are treated as if they were a single value separated by comma ",".

    This means that this function is useful for parsing header fields that
    follow this syntax (BNF as from the HTTP/1.1 specification, but we relax
    the requirement for tokens).

      headers       = #header
      header        = (token | parameter) *( [";"] (token | parameter))

      token         = 1*<any CHAR except CTLs or separators>
      separators    = "(" | ")" | "<" | ">" | "@"
                    | "," | ";" | ":" | "\" | <">
                    | "/" | "[" | "]" | "?" | "="
                    | "{" | "}" | SP | HT

      quoted-string = ( <"> *(qdtext | quoted-pair ) <"> )
      qdtext        = <any TEXT except <">>
      quoted-pair   = "\" CHAR

      parameter     = attribute "=" value
      attribute     = token
      value         = token | quoted-string

    Each header is represented by a list of key/value pairs.  The value for a
    simple token (not part of a parameter) is None.  Syntactically incorrect
    headers will not necessarily be parsed as you would want.

    This is easier to describe with some examples:

    >>> split_header_words(['foo="bar"; port="80,81"; discard, bar=baz'])
    [[('foo', 'bar'), ('port', '80,81'), ('discard', None)], [('bar', 'baz')]]
    >>> split_header_words(['text/html; charset="iso-8859-1"'])
    [[('text/html', None), ('charset', 'iso-8859-1')]]
    >>> split_header_words([r'Basic realm="\"foo\bar\""'])
    [[('Basic', None), ('realm', '"foobar"')]]

    """
    assert type(header_values) not in STRING_TYPES
    result = []
    for text in header_values:
        orig_text = text
        pairs = []
        while text:
            m = token_re.search(text)
            if m:
                text = unmatched(m)
                name = m.group(1)
                m = quoted_value_re.search(text)
                if m:  # quoted value
                    text = unmatched(m)
                    value = m.group(1)
                    value = escape_re.sub(r"\1", value)
                else:
                    m = value_re.search(text)
                    if m:  # unquoted value
                        text = unmatched(m)
                        value = m.group(1)
                        value = value.rstrip()
                    else:
                        # no value, a lone token
                        value = None
                pairs.append((name, value))
            elif text.lstrip().startswith(","):
                # concatenated headers, as per RFC 2616 section 4.2
                text = text.lstrip()[1:]
                if pairs: result.append(pairs)
                pairs = []
            else:
                # skip junk
                non_junk, nr_junk_chars = re.subn("^[=\s;]*", "", text)
                assert nr_junk_chars > 0, (
                    "split_header_words bug: '%s', '%s', %s" %
                    (orig_text, text, pairs))
                text = non_junk
        if pairs: result.append(pairs)
    return result

join_escape_re = re.compile(r"([\"\\])")
def join_header_words(lists):
    """Do the inverse of the conversion done by split_header_words.

    Takes a list of lists of (key, value) pairs and produces a single header
    value.  Attribute values are quoted if needed.

    >>> join_header_words([[("text/plain", None), ("charset", "iso-8859/1")]])
    'text/plain; charset="iso-8859/1"'
    >>> join_header_words([[("text/plain", None)], [("charset", "iso-8859/1")]])
    'text/plain, charset="iso-8859/1"'

    """
    headers = []
    for pairs in lists:
        attr = []
        for k, v in pairs:
            if v is not None:
                if not re.search(r"^\w+$", v):
                    v = join_escape_re.sub(r"\\\1", v)  # escape " and \
                    v = '"%s"' % v
                if k is None:  # Netscape cookies may have no name
                    k = v
                else:
                    k = "%s=%s" % (k, v)
            attr.append(k)
        if attr: headers.append("; ".join(attr))
    return ", ".join(headers)

def strip_quotes(text):
    if text.startswith('"'):
        text = text[1:]
    if text.endswith('"'):
        text = text[:-1]
    return text

def parse_ns_headers(ns_headers):
    """Ad-hoc parser for Netscape protocol cookie-attributes.

    The old Netscape cookie format for Set-Cookie can for instance contain
    an unquoted "," in the expires field, so we have to use this ad-hoc
    parser instead of split_header_words.

    XXX This may not make the best possible effort to parse all the crap
    that Netscape Cookie headers contain.  Ronald Tschalar's HTTPClient
    parser is probably better, so could do worse than following that if
    this ever gives any trouble.

    Currently, this is also used for parsing RFC 2109 cookies.

    """
    known_attrs = ("expires", "domain", "path", "secure",
                   # RFC 2109 attrs (may turn up in Netscape cookies, too)
                   "version", "port", "max-age")

    result = []
    for ns_header in ns_headers:
        pairs = []
        version_set = False
        params = re.split(r";\s*", ns_header)
        for ii in range(len(params)):
            param = params[ii]
            param = param.rstrip()
            if param == "": continue
            if "=" not in param:
                k, v = param, None
            else:
                k, v = re.split(r"\s*=\s*", param, 1)
                k = k.lstrip()
            if ii != 0:
                lc = k.lower()
                if lc in known_attrs:
                    k = lc
                if k == "version":
                    # This is an RFC 2109 cookie.
                    v = strip_quotes(v)
                    version_set = True
                if k == "expires":
                    # convert expires date to seconds since epoch
                    v = http2time(strip_quotes(v))  # None if invalid
            pairs.append((k, v))

        if pairs:
            if not version_set:
                pairs.append(("version", "0"))
            result.append(pairs)

    return result


def _test():
    import doctest, _headersutil
    return doctest.testmod(_headersutil)

if __name__ == "__main__":
    _test()
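The docstrings above carry doctests for split_header_words and join_header_words; for parse_ns_headers, the expected shape, worked through by hand from the logic above (not a doctest present in the source):

>>> parse_ns_headers(['foo=bar; path=/; version=1'])
[[('foo', 'bar'), ('path', '/'), ('version', '1')]]
>>> parse_ns_headers(['foo=bar'])
[[('foo', 'bar'), ('version', '0')]]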
@@ -0,0 +1,629 @@
"""HTML handling.

Copyright 2003-2006 John J. Lee <jjl@pobox.com>

This code is free software; you can redistribute it and/or modify it under
the terms of the BSD or ZPL 2.1 licenses (see the file COPYING.txt
included with the distribution).

"""

import codecs
import copy
import htmlentitydefs
import re

import _sgmllib_copy as sgmllib

import _beautifulsoup
import _form
from _headersutil import split_header_words, is_html as _is_html
import _request
import _rfc3986

DEFAULT_ENCODING = "latin-1"

COMPRESS_RE = re.compile(r"\s+")


class CachingGeneratorFunction(object):
    """Caching wrapper around a no-arguments iterable."""

    def __init__(self, iterable):
        self._cache = []
        # wrap iterable to make it non-restartable (otherwise, repeated
        # __call__ would give incorrect results)
        self._iterator = iter(iterable)

    def __call__(self):
        cache = self._cache
        for item in cache:
            yield item
        for item in self._iterator:
            cache.append(item)
            yield item


class EncodingFinder:
    def __init__(self, default_encoding):
        self._default_encoding = default_encoding
    def encoding(self, response):
        # HTTPEquivProcessor may be in use, so both HTTP and HTTP-EQUIV
        # headers may be in the response.  HTTP-EQUIV headers come last,
        # so try in order from first to last.
        for ct in response.info().getheaders("content-type"):
            for k, v in split_header_words([ct])[0]:
                if k == "charset":
                    encoding = v
                    try:
                        codecs.lookup(v)
                    except LookupError:
                        continue
                    else:
                        return encoding
        return self._default_encoding


class ResponseTypeFinder:
    def __init__(self, allow_xhtml):
        self._allow_xhtml = allow_xhtml
    def is_html(self, response, encoding):
        ct_hdrs = response.info().getheaders("content-type")
        url = response.geturl()
        # XXX encoding
        return _is_html(ct_hdrs, url, self._allow_xhtml)


class Args(object):

    # idea for this argument-processing trick is from Peter Otten

    def __init__(self, args_map):
        self.__dict__["dictionary"] = dict(args_map)

    def __getattr__(self, key):
        try:
            return self.dictionary[key]
        except KeyError:
            return getattr(self.__class__, key)

    def __setattr__(self, key, value):
        if key == "dictionary":
            raise AttributeError()
        self.dictionary[key] = value


def form_parser_args(
    select_default=False,
    form_parser_class=None,
    request_class=None,
    backwards_compat=False,
    ):
    return Args(locals())


class Link:
    def __init__(self, base_url, url, text, tag, attrs):
        assert None not in [url, tag, attrs]
        self.base_url = base_url
        self.absolute_url = _rfc3986.urljoin(base_url, url)
        self.url, self.text, self.tag, self.attrs = url, text, tag, attrs
    def __cmp__(self, other):
        try:
            for name in "url", "text", "tag", "attrs":
                if getattr(self, name) != getattr(other, name):
                    return -1
        except AttributeError:
            return -1
        return 0
    def __repr__(self):
        return "Link(base_url=%r, url=%r, text=%r, tag=%r, attrs=%r)" % (
            self.base_url, self.url, self.text, self.tag, self.attrs)


class LinksFactory:

    def __init__(self,
                 link_parser_class=None,
                 link_class=Link,
                 urltags=None,
                 ):
        import _pullparser
        if link_parser_class is None:
            link_parser_class = _pullparser.TolerantPullParser
        self.link_parser_class = link_parser_class
        self.link_class = link_class
        if urltags is None:
            urltags = {
                "a": "href",
                "area": "href",
                "frame": "src",
                "iframe": "src",
                }
        self.urltags = urltags
        self._response = None
        self._encoding = None

    def set_response(self, response, base_url, encoding):
        self._response = response
        self._encoding = encoding
        self._base_url = base_url

    def links(self):
        """Return an iterator that provides links of the document."""
        response = self._response
        encoding = self._encoding
        base_url = self._base_url
        p = self.link_parser_class(response, encoding=encoding)

        try:
            for token in p.tags(*(self.urltags.keys()+["base"])):
                if token.type == "endtag":
                    continue
                if token.data == "base":
                    base_href = dict(token.attrs).get("href")
                    if base_href is not None:
                        base_url = base_href
                    continue
                attrs = dict(token.attrs)
                tag = token.data
                text = None
                # XXX use attr_encoding for ref'd doc if that doc does not
                # provide one by other means
                #attr_encoding = attrs.get("charset")
                url = attrs.get(self.urltags[tag])  # XXX is "" a valid URL?
                if not url:
                    # Probably an <A NAME="blah"> link or <AREA NOHREF...>.
                    # For our purposes a link is something with a URL, so
                    # ignore this.
                    continue

                url = _rfc3986.clean_url(url, encoding)
                if tag == "a":
                    if token.type != "startendtag":
                        # hmm, this'd break if end tag is missing
                        text = p.get_compressed_text(("endtag", tag))
                    # but this doesn't work for e.g.
                    # <a href="blah"><b>Andy</b></a>
                    #text = p.get_compressed_text()

                yield Link(base_url, url, text, tag, token.attrs)
        except sgmllib.SGMLParseError, exc:
            raise _form.ParseError(exc)

class FormsFactory:

    """Makes a sequence of objects satisfying HTMLForm interface.

    After calling .forms(), the .global_form attribute is a form object
    containing all controls not a descendant of any FORM element.

    For constructor argument docs, see ParseResponse argument docs.
    """

    def __init__(self,
                 select_default=False,
                 form_parser_class=None,
                 request_class=None,
                 backwards_compat=False,
                 ):
        self.select_default = select_default
        if form_parser_class is None:
            form_parser_class = _form.FormParser
        self.form_parser_class = form_parser_class
        if request_class is None:
            request_class = _request.Request
        self.request_class = request_class
        self.backwards_compat = backwards_compat
        self._response = None
        self.encoding = None
        self.global_form = None

    def set_response(self, response, encoding):
        self._response = response
        self.encoding = encoding
        self.global_form = None

    def forms(self):
        encoding = self.encoding
        forms = _form.ParseResponseEx(
            self._response,
            select_default=self.select_default,
            form_parser_class=self.form_parser_class,
            request_class=self.request_class,
            encoding=encoding,
            _urljoin=_rfc3986.urljoin,
            _urlparse=_rfc3986.urlsplit,
            _urlunparse=_rfc3986.urlunsplit,
            )
        self.global_form = forms[0]
        return forms[1:]

class TitleFactory:
    def __init__(self):
        self._response = self._encoding = None

    def set_response(self, response, encoding):
        self._response = response
        self._encoding = encoding

    def _get_title_text(self, parser):
        import _pullparser
        text = []
        tok = None
        while 1:
            try:
                tok = parser.get_token()
            except _pullparser.NoMoreTokensError:
                break
            if tok.type == "data":
                text.append(str(tok))
            elif tok.type == "entityref":
                t = unescape("&%s;" % tok.data,
                             parser._entitydefs, parser.encoding)
                text.append(t)
            elif tok.type == "charref":
                t = unescape_charref(tok.data, parser.encoding)
                text.append(t)
            elif tok.type in ["starttag", "endtag", "startendtag"]:
                tag_name = tok.data
                if tok.type == "endtag" and tag_name == "title":
                    break
                text.append(str(tok))
        return COMPRESS_RE.sub(" ", "".join(text).strip())

    def title(self):
        import _pullparser
        p = _pullparser.TolerantPullParser(
            self._response, encoding=self._encoding)
        try:
            try:
                p.get_tag("title")
            except _pullparser.NoMoreTokensError:
                return None
            else:
                return self._get_title_text(p)
        except sgmllib.SGMLParseError, exc:
            raise _form.ParseError(exc)


def unescape(data, entities, encoding):
    if data is None or "&" not in data:
        return data

    def replace_entities(match):
        ent = match.group()
        if ent[1] == "#":
            return unescape_charref(ent[2:-1], encoding)

        repl = entities.get(ent[1:-1])
        if repl is not None:
            repl = unichr(repl)
            if type(repl) != type(""):
                try:
                    repl = repl.encode(encoding)
                except UnicodeError:
                    repl = ent
        else:
            repl = ent
        return repl

    return re.sub(r"&#?[A-Za-z0-9]+?;", replace_entities, data)

def unescape_charref(data, encoding):
    name, base = data, 10
    if name.startswith("x"):
        name, base = name[1:], 16
    uc = unichr(int(name, base))
    if encoding is None:
        return uc
    else:
        try:
            repl = uc.encode(encoding)
        except UnicodeError:
            repl = "&#%s;" % data
        return repl


class MechanizeBs(_beautifulsoup.BeautifulSoup):
    _entitydefs = htmlentitydefs.name2codepoint
    # don't want the magic Microsoft-char workaround
    PARSER_MASSAGE = [(re.compile('(<[^<>]*)/>'),
                       lambda(x): x.group(1) + ' />'),
                      (re.compile('<!\s+([^<>]*)>'),
                       lambda(x): '<!' + x.group(1) + '>')
                      ]

    def __init__(self, encoding, text=None, avoidParserProblems=True,
                 initialTextIsEverything=True):
        self._encoding = encoding
        _beautifulsoup.BeautifulSoup.__init__(
            self, text, avoidParserProblems, initialTextIsEverything)

    def handle_charref(self, ref):
        t = unescape("&#%s;" % ref, self._entitydefs, self._encoding)
        self.handle_data(t)
    def handle_entityref(self, ref):
        t = unescape("&%s;" % ref, self._entitydefs, self._encoding)
        self.handle_data(t)
    def unescape_attrs(self, attrs):
        escaped_attrs = []
        for key, val in attrs:
            val = unescape(val, self._entitydefs, self._encoding)
            escaped_attrs.append((key, val))
        return escaped_attrs

class RobustLinksFactory:

    compress_re = COMPRESS_RE

    def __init__(self,
                 link_parser_class=None,
                 link_class=Link,
                 urltags=None,
                 ):
        if link_parser_class is None:
            link_parser_class = MechanizeBs
        self.link_parser_class = link_parser_class
        self.link_class = link_class
        if urltags is None:
            urltags = {
                "a": "href",
                "area": "href",
                "frame": "src",
                "iframe": "src",
                }
        self.urltags = urltags
        self._bs = None
        self._encoding = None
        self._base_url = None

    def set_soup(self, soup, base_url, encoding):
        self._bs = soup
        self._base_url = base_url
        self._encoding = encoding

    def links(self):
        bs = self._bs
        base_url = self._base_url
        encoding = self._encoding
        for ch in bs.recursiveChildGenerator():
            if (isinstance(ch, _beautifulsoup.Tag) and
                ch.name in self.urltags.keys()+["base"]):
                link = ch
                attrs = bs.unescape_attrs(link.attrs)
                attrs_dict = dict(attrs)
                if link.name == "base":
                    base_href = attrs_dict.get("href")
                    if base_href is not None:
                        base_url = base_href
                    continue
                url_attr = self.urltags[link.name]
                url = attrs_dict.get(url_attr)
                if not url:
                    continue
                url = _rfc3986.clean_url(url, encoding)
                text = link.fetchText(lambda t: True)
                if not text:
                    # follow _pullparser's weird behaviour rigidly
                    if link.name == "a":
                        text = ""
                    else:
                        text = None
                else:
                    text = self.compress_re.sub(" ", " ".join(text).strip())
                yield Link(base_url, url, text, link.name, attrs)


class RobustFormsFactory(FormsFactory):
    def __init__(self, *args, **kwds):
        args = form_parser_args(*args, **kwds)
        if args.form_parser_class is None:
            args.form_parser_class = _form.RobustFormParser
        FormsFactory.__init__(self, **args.dictionary)

    def set_response(self, response, encoding):
        self._response = response
        self.encoding = encoding


class RobustTitleFactory:
    def __init__(self):
        self._bs = self._encoding = None

    def set_soup(self, soup, encoding):
        self._bs = soup
        self._encoding = encoding

    def title(self):
        title = self._bs.first("title")
        if title == _beautifulsoup.Null:
            return None
        else:
            inner_html = "".join([str(node) for node in title.contents])
            return COMPRESS_RE.sub(" ", inner_html.strip())


class Factory:
    """Factory for forms, links, etc.

    This interface may expand in future.

    Public methods:

    set_request_class(request_class)
    set_response(response)
    forms()
    links()

    Public attributes:

    Note that accessing these attributes may raise ParseError.

    encoding: string specifying the encoding of response if it contains a
     text document (this value is left unspecified for documents that do
     not have an encoding, e.g. an image file)
    is_html: true if response contains an HTML document (XHTML may be
     regarded as HTML too)
    title: page title, or None if no title or not HTML
    global_form: form object containing all controls that are not
     descendants of any FORM element, or None if the forms_factory does not
     support supplying a global form

    """

    LAZY_ATTRS = ["encoding", "is_html", "title", "global_form"]

    def __init__(self, forms_factory, links_factory, title_factory,
                 encoding_finder=EncodingFinder(DEFAULT_ENCODING),
                 response_type_finder=ResponseTypeFinder(allow_xhtml=False),
                 ):
        """

        Pass keyword arguments only.

        default_encoding: character encoding to use if encoding cannot be
         determined (or guessed) from the response.  You should turn on
         HTTP-EQUIV handling if you want the best chance of getting this
         right without resorting to this default.  The default value of
         this parameter (currently latin-1) may change in future.

        """
        self._forms_factory = forms_factory
        self._links_factory = links_factory
        self._title_factory = title_factory
        self._encoding_finder = encoding_finder
        self._response_type_finder = response_type_finder

        self.set_response(None)

    def set_request_class(self, request_class):
        """Set request class (mechanize.Request by default).

        HTMLForm instances returned by .forms() will return instances of
        this class when .click()ed.

        """
        self._forms_factory.request_class = request_class

    def set_response(self, response):
        """Set response.

        The response must either be None or implement the same interface as
        objects returned by mechanize.urlopen().

        """
        self._response = response
        self._forms_genf = self._links_genf = None
        self._get_title = None
        for name in self.LAZY_ATTRS:
            try:
                delattr(self, name)
            except AttributeError:
                pass

    def __getattr__(self, name):
        if name not in self.LAZY_ATTRS:
            return getattr(self.__class__, name)

        if name == "encoding":
            self.encoding = self._encoding_finder.encoding(
                copy.copy(self._response))
            return self.encoding
        elif name == "is_html":
            self.is_html = self._response_type_finder.is_html(
                copy.copy(self._response), self.encoding)
            return self.is_html
        elif name == "title":
            if self.is_html:
                self.title = self._title_factory.title()
            else:
                self.title = None
            return self.title
        elif name == "global_form":
            self.forms()
            return self.global_form

    def forms(self):
        """Return iterable over HTMLForm-like objects.

        Raises mechanize.ParseError on failure.
        """
        # this implementation sets .global_form as a side-effect, for
        # benefit of __getattr__ impl
        if self._forms_genf is None:
            try:
                self._forms_genf = CachingGeneratorFunction(
                    self._forms_factory.forms())
            except:  # XXXX define exception!
                self.set_response(self._response)
                raise
            self.global_form = getattr(
                self._forms_factory, "global_form", None)
        return self._forms_genf()

    def links(self):
        """Return iterable over mechanize.Link-like objects.

        Raises mechanize.ParseError on failure.
        """
        if self._links_genf is None:
            try:
                self._links_genf = CachingGeneratorFunction(
                    self._links_factory.links())
            except:  # XXXX define exception!
                self.set_response(self._response)
                raise
        return self._links_genf()

class DefaultFactory(Factory):
    """Based on sgmllib."""
    def __init__(self, i_want_broken_xhtml_support=False):
        Factory.__init__(
            self,
            forms_factory=FormsFactory(),
            links_factory=LinksFactory(),
            title_factory=TitleFactory(),
            response_type_finder=ResponseTypeFinder(
                allow_xhtml=i_want_broken_xhtml_support),
            )

    def set_response(self, response):
        Factory.set_response(self, response)
        if response is not None:
            self._forms_factory.set_response(
                copy.copy(response), self.encoding)
            self._links_factory.set_response(
                copy.copy(response), response.geturl(), self.encoding)
            self._title_factory.set_response(
                copy.copy(response), self.encoding)

class RobustFactory(Factory):
    """Based on BeautifulSoup, hopefully a bit more robust to bad HTML
    than is DefaultFactory.

    """
    def __init__(self, i_want_broken_xhtml_support=False,
                 soup_class=None):
        Factory.__init__(
            self,
            forms_factory=RobustFormsFactory(),
            links_factory=RobustLinksFactory(),
            title_factory=RobustTitleFactory(),
            response_type_finder=ResponseTypeFinder(
                allow_xhtml=i_want_broken_xhtml_support),
            )
        if soup_class is None:
            soup_class = MechanizeBs
        self._soup_class = soup_class

    def set_response(self, response):
        Factory.set_response(self, response)
        if response is not None:
            data = response.read()
            soup = self._soup_class(self.encoding, data)
            self._forms_factory.set_response(
                copy.copy(response), self.encoding)
            self._links_factory.set_soup(
                soup, response.geturl(), self.encoding)
            self._title_factory.set_soup(soup, self.encoding)
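A sketch of driving a factory directly rather than through Browser (which normally does this wiring for you); the URL is hypothetical and the direct-use pattern is an assumption, not documented API:

import mechanize

factory = mechanize.DefaultFactory()
factory.set_response(mechanize.urlopen("http://example.com/"))
if factory.is_html:
    print factory.title
    for link in factory.links():
        print link.absolute_url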
@@ -0,0 +1,447 @@
|||
"""HTTP related handlers.
|
||||
|
||||
Note that some other HTTP handlers live in more specific modules: _auth.py,
|
||||
_gzip.py, etc.
|
||||
|
||||
|
||||
Copyright 2002-2006 John J Lee <jjl@pobox.com>
|
||||
|
||||
This code is free software; you can redistribute it and/or modify it
|
||||
under the terms of the BSD or ZPL 2.1 licenses (see the file
|
||||
COPYING.txt included with the distribution).
|
||||
|
||||
"""
|
||||
|
||||
import HTMLParser
|
||||
from cStringIO import StringIO
|
||||
import htmlentitydefs
|
||||
import logging
|
||||
import robotparser
|
||||
import socket
|
||||
import time
|
||||
|
||||
import _sgmllib_copy as sgmllib
|
||||
from _urllib2_fork import HTTPError, BaseHandler
|
||||
|
||||
from _headersutil import is_html
|
||||
from _html import unescape, unescape_charref
|
||||
from _request import Request
|
||||
from _response import response_seek_wrapper
|
||||
import _rfc3986
|
||||
import _sockettimeout
|
||||
|
||||
debug = logging.getLogger("mechanize").debug
|
||||
debug_robots = logging.getLogger("mechanize.robots").debug
|
||||
|
||||
# monkeypatch urllib2.HTTPError to show URL
|
||||
## import urllib2
|
||||
## def urllib2_str(self):
|
||||
## return 'HTTP Error %s: %s (%s)' % (
|
||||
## self.code, self.msg, self.geturl())
|
||||
## urllib2.HTTPError.__str__ = urllib2_str
|
||||
|
||||
|
||||
CHUNK = 1024 # size of chunks fed to HTML HEAD parser, in bytes
|
||||
DEFAULT_ENCODING = 'latin-1'
|
||||
|
||||
# XXX would self.reset() work, instead of raising this exception?
|
||||
class EndOfHeadError(Exception): pass
|
||||
class AbstractHeadParser:
|
||||
# only these elements are allowed in or before HEAD of document
|
||||
head_elems = ("html", "head",
|
||||
"title", "base",
|
||||
"script", "style", "meta", "link", "object")
|
||||
_entitydefs = htmlentitydefs.name2codepoint
|
||||
_encoding = DEFAULT_ENCODING
|
||||
|
||||
def __init__(self):
|
||||
self.http_equiv = []
|
||||
|
||||
def start_meta(self, attrs):
|
||||
http_equiv = content = None
|
||||
for key, value in attrs:
|
||||
if key == "http-equiv":
|
||||
http_equiv = self.unescape_attr_if_required(value)
|
||||
elif key == "content":
|
||||
content = self.unescape_attr_if_required(value)
|
||||
if http_equiv is not None and content is not None:
|
||||
self.http_equiv.append((http_equiv, content))
|
||||
|
||||
def end_head(self):
|
||||
raise EndOfHeadError()
|
||||
|
||||
def handle_entityref(self, name):
|
||||
#debug("%s", name)
|
||||
self.handle_data(unescape(
|
||||
'&%s;' % name, self._entitydefs, self._encoding))
|
||||
|
||||
def handle_charref(self, name):
|
||||
#debug("%s", name)
|
||||
self.handle_data(unescape_charref(name, self._encoding))
|
||||
|
||||
def unescape_attr(self, name):
|
||||
#debug("%s", name)
|
||||
return unescape(name, self._entitydefs, self._encoding)
|
||||
|
||||
def unescape_attrs(self, attrs):
|
||||
#debug("%s", attrs)
|
||||
escaped_attrs = {}
|
||||
for key, val in attrs.items():
|
||||
escaped_attrs[key] = self.unescape_attr(val)
|
||||
return escaped_attrs
|
||||
|
||||
def unknown_entityref(self, ref):
|
||||
self.handle_data("&%s;" % ref)
|
||||
|
||||
def unknown_charref(self, ref):
|
||||
self.handle_data("&#%s;" % ref)
|
||||
|
||||
|
||||
class XHTMLCompatibleHeadParser(AbstractHeadParser,
|
||||
HTMLParser.HTMLParser):
|
||||
def __init__(self):
|
||||
HTMLParser.HTMLParser.__init__(self)
|
||||
AbstractHeadParser.__init__(self)
|
||||
|
||||
def handle_starttag(self, tag, attrs):
|
||||
if tag not in self.head_elems:
|
||||
raise EndOfHeadError()
|
||||
try:
|
||||
method = getattr(self, 'start_' + tag)
|
||||
except AttributeError:
|
||||
try:
|
||||
method = getattr(self, 'do_' + tag)
|
||||
except AttributeError:
|
||||
pass # unknown tag
|
||||
else:
|
||||
method(attrs)
|
||||
else:
|
||||
method(attrs)
|
||||
|
||||
def handle_endtag(self, tag):
|
||||
if tag not in self.head_elems:
|
||||
raise EndOfHeadError()
|
||||
try:
|
||||
method = getattr(self, 'end_' + tag)
|
||||
except AttributeError:
|
||||
pass # unknown tag
|
||||
else:
|
||||
method()
|
||||
|
||||
def unescape(self, name):
|
||||
# Use the entitydefs passed into constructor, not
|
||||
# HTMLParser.HTMLParser's entitydefs.
|
||||
return self.unescape_attr(name)
|
||||
|
||||
def unescape_attr_if_required(self, name):
|
||||
return name # HTMLParser.HTMLParser already did it
|
||||
|
||||
class HeadParser(AbstractHeadParser, sgmllib.SGMLParser):
|
||||
|
||||
def _not_called(self):
|
||||
assert False
|
||||
|
||||
def __init__(self):
|
||||
sgmllib.SGMLParser.__init__(self)
|
||||
AbstractHeadParser.__init__(self)
|
||||
|
||||
def handle_starttag(self, tag, method, attrs):
|
||||
if tag not in self.head_elems:
|
||||
raise EndOfHeadError()
|
||||
if tag == "meta":
|
||||
method(attrs)
|
||||
|
||||
def unknown_starttag(self, tag, attrs):
|
||||
self.handle_starttag(tag, self._not_called, attrs)
|
||||
|
||||
def handle_endtag(self, tag, method):
|
||||
if tag in self.head_elems:
|
||||
method()
|
||||
else:
|
||||
raise EndOfHeadError()
|
||||
|
||||
def unescape_attr_if_required(self, name):
|
||||
return self.unescape_attr(name)
|
||||
|
||||
def parse_head(fileobj, parser):
|
||||
"""Return a list of key, value pairs."""
|
||||
while 1:
|
||||
data = fileobj.read(CHUNK)
|
||||
try:
|
||||
parser.feed(data)
|
||||
except EndOfHeadError:
|
||||
break
|
||||
if len(data) != CHUNK:
|
||||
# this should only happen if there is no HTML body, or if
|
||||
# CHUNK is big
|
||||
break
|
||||
return parser.http_equiv
|
||||
|
||||
class HTTPEquivProcessor(BaseHandler):
|
||||
"""Append META HTTP-EQUIV headers to regular HTTP headers."""
|
||||
|
||||
handler_order = 300 # before handlers that look at HTTP headers
|
||||
|
||||
def __init__(self, head_parser_class=HeadParser,
|
||||
i_want_broken_xhtml_support=False,
|
||||
):
|
||||
self.head_parser_class = head_parser_class
|
||||
self._allow_xhtml = i_want_broken_xhtml_support
|
||||
|
||||
def http_response(self, request, response):
|
||||
if not hasattr(response, "seek"):
|
||||
response = response_seek_wrapper(response)
|
||||
http_message = response.info()
|
||||
url = response.geturl()
|
||||
ct_hdrs = http_message.getheaders("content-type")
|
||||
if is_html(ct_hdrs, url, self._allow_xhtml):
|
||||
try:
|
||||
try:
|
||||
html_headers = parse_head(response,
|
||||
self.head_parser_class())
|
||||
finally:
|
||||
response.seek(0)
|
||||
except (HTMLParser.HTMLParseError,
|
||||
sgmllib.SGMLParseError):
|
||||
pass
|
||||
else:
|
||||
for hdr, val in html_headers:
|
||||
# add a header
|
||||
http_message.dict[hdr.lower()] = val
|
||||
text = hdr + ": " + val
|
||||
for line in text.split("\n"):
|
||||
http_message.headers.append(line + "\n")
|
||||
return response
|
||||
|
||||
https_response = http_response


class MechanizeRobotFileParser(robotparser.RobotFileParser):

    def __init__(self, url='', opener=None):
        robotparser.RobotFileParser.__init__(self, url)
        self._opener = opener
        self._timeout = _sockettimeout._GLOBAL_DEFAULT_TIMEOUT

    def set_opener(self, opener=None):
        import _opener
        if opener is None:
            opener = _opener.OpenerDirector()
        self._opener = opener

    def set_timeout(self, timeout):
        self._timeout = timeout

    def read(self):
        """Reads the robots.txt URL and feeds it to the parser."""
        if self._opener is None:
            self.set_opener()
        req = Request(self.url, unverifiable=True, visit=False,
                      timeout=self._timeout)
        try:
            f = self._opener.open(req)
        except HTTPError, f:
            pass
        except (IOError, socket.error, OSError), exc:
            debug_robots("ignoring error opening %r: %s" %
                         (self.url, exc))
            return
        lines = []
        line = f.readline()
        while line:
            lines.append(line.strip())
            line = f.readline()
        status = f.code
        if status == 401 or status == 403:
            self.disallow_all = True
            debug_robots("disallow all")
        elif status >= 400:
            self.allow_all = True
            debug_robots("allow all")
        elif status == 200 and lines:
            debug_robots("parse lines")
            self.parse(lines)

class RobotExclusionError(HTTPError):
    def __init__(self, request, *args):
        # apply() is deprecated; call the base initialiser directly
        HTTPError.__init__(self, *args)
        self.request = request

class HTTPRobotRulesProcessor(BaseHandler):
    # before redirections, after everything else
    handler_order = 800

    try:
        from httplib import HTTPMessage
    except ImportError:
        from mimetools import Message
        http_response_class = Message
    else:
        http_response_class = HTTPMessage

    def __init__(self, rfp_class=MechanizeRobotFileParser):
        self.rfp_class = rfp_class
        self.rfp = None
        self._host = None

    def http_request(self, request):
        scheme = request.get_type()
        if scheme not in ["http", "https"]:
            # robots exclusion only applies to HTTP
            return request

        if request.get_selector() == "/robots.txt":
            # /robots.txt is always OK to fetch
            return request

        host = request.get_host()

        # robots.txt requests don't need to be allowed by robots.txt :-)
        origin_req = getattr(request, "_origin_req", None)
        if (origin_req is not None and
            origin_req.get_selector() == "/robots.txt" and
            origin_req.get_host() == host
            ):
            return request

        if host != self._host:
            self.rfp = self.rfp_class()
            try:
                self.rfp.set_opener(self.parent)
            except AttributeError:
                debug("%r instance does not support set_opener" %
                      self.rfp.__class__)
            self.rfp.set_url(scheme+"://"+host+"/robots.txt")
            self.rfp.set_timeout(request.timeout)
            self.rfp.read()
            self._host = host

        ua = request.get_header("User-agent", "")
        if self.rfp.can_fetch(ua, request.get_full_url()):
            return request
        else:
            # XXX This should really have raised URLError.  Too late now...
            msg = "request disallowed by robots.txt"
            raise RobotExclusionError(
                request,
                request.get_full_url(),
                403, msg,
                self.http_response_class(StringIO()), StringIO(msg))

    https_request = http_request
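

# Usage sketch (not part of the original module): how a caller might handle
# robots.txt refusals, assuming the public mechanize API; the URL is
# illustrative only.
#
#     import mechanize
#     browser = mechanize.Browser()
#     try:
#         browser.open("http://example.com/private/")
#     except mechanize.RobotExclusionError:
#         pass  # the site's robots.txt disallows this fetch
#     # or opt out of robots.txt handling entirely:
#     browser.set_handle_robots(False)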


class HTTPRefererProcessor(BaseHandler):
    """Add Referer header to requests.

    This only makes sense if you use each RefererProcessor for a single
    chain of requests only (so, for example, if you use a single
    HTTPRefererProcessor to fetch a series of URLs extracted from a single
    page, this will break).

    There's a proper implementation of this in mechanize.Browser.

    """
    def __init__(self):
        self.referer = None

    def http_request(self, request):
        if ((self.referer is not None) and
            not request.has_header("Referer")):
            request.add_unredirected_header("Referer", self.referer)
        return request

    def http_response(self, request, response):
        self.referer = response.geturl()
        return response

    https_request = http_request
    https_response = http_response


def clean_refresh_url(url):
    # e.g. Firefox 1.5 does (something like) this
    if ((url.startswith('"') and url.endswith('"')) or
        (url.startswith("'") and url.endswith("'"))):
        url = url[1:-1]
    return _rfc3986.clean_url(url, "latin-1")  # XXX encoding

def parse_refresh_header(refresh):
    """
    >>> parse_refresh_header("1; url=http://example.com/")
    (1.0, 'http://example.com/')
    >>> parse_refresh_header("1; url='http://example.com/'")
    (1.0, 'http://example.com/')
    >>> parse_refresh_header("1")
    (1.0, None)
    >>> parse_refresh_header("blah")  # doctest: +IGNORE_EXCEPTION_DETAIL
    Traceback (most recent call last):
    ValueError: invalid literal for float(): blah

    """

    ii = refresh.find(";")
    if ii != -1:
        pause, newurl_spec = float(refresh[:ii]), refresh[ii+1:]
        jj = newurl_spec.find("=")
        key = None
        if jj != -1:
            key, newurl = newurl_spec[:jj], newurl_spec[jj+1:]
            newurl = clean_refresh_url(newurl)
        if key is None or key.strip().lower() != "url":
            raise ValueError()
    else:
        pause, newurl = float(refresh), None
    return pause, newurl


class HTTPRefreshProcessor(BaseHandler):
    """Perform HTTP Refresh redirections.

    Note that if a non-200 HTTP code has occurred (for example, a 30x
    redirect), this processor will do nothing.

    By default, only zero-time Refresh headers are redirected.  Use the
    max_time attribute / constructor argument to allow Refresh with longer
    pauses.  Use the honor_time attribute / constructor argument to control
    whether the requested pause is honoured (with a time.sleep()) or
    skipped in favour of immediate redirection.

    Public attributes:

    max_time: see above
    honor_time: see above

    """
    handler_order = 1000

    def __init__(self, max_time=0, honor_time=True):
        self.max_time = max_time
        self.honor_time = honor_time
        self._sleep = time.sleep

    def http_response(self, request, response):
        code, msg, hdrs = response.code, response.msg, response.info()

        if code == 200 and hdrs.has_key("refresh"):
            refresh = hdrs.getheaders("refresh")[0]
            try:
                pause, newurl = parse_refresh_header(refresh)
            except ValueError:
                debug("bad Refresh header: %r" % refresh)
                return response

            if newurl is None:
                newurl = response.geturl()
            if (self.max_time is None) or (pause <= self.max_time):
                if pause > 1E-3 and self.honor_time:
                    self._sleep(pause)
                hdrs["location"] = newurl
                # hardcoded http is NOT a bug
                response = self.parent.error(
                    "http", request, response,
                    "refresh", msg, hdrs)
            else:
                debug("Refresh header ignored: %r" % refresh)

        return response

    https_response = http_response
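

# Usage sketch (not part of the original module): allowing longer Refresh
# pauses without actually sleeping, assuming the public mechanize.UserAgent
# API and its set_handle_refresh() signature.
#
#     import mechanize
#     ua = mechanize.UserAgent()
#     # follow Refresh headers with pauses up to 30s, but don't wait them out
#     ua.set_handle_refresh(True, max_time=30, honor_time=False)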
@ -0,0 +1,185 @@
"""Load / save to libwww-perl (LWP) format files.

Actually, the format is slightly extended from that used by LWP's
(libwww-perl's) HTTP::Cookies, to avoid losing some RFC 2965 information
not recorded by LWP.

It uses the version string "2.0", though really there isn't an LWP Cookies
2.0 format.  This indicates that there is extra information in here
(domain_dot and port_spec) while still being compatible with libwww-perl,
I hope.

Copyright 2002-2006 John J Lee <jjl@pobox.com>
Copyright 1997-1999 Gisle Aas (original libwww-perl code)

This code is free software; you can redistribute it and/or modify it
under the terms of the BSD or ZPL 2.1 licenses (see the file
COPYING.txt included with the distribution).

"""

import time, re, logging

from _clientcookie import reraise_unmasked_exceptions, FileCookieJar, Cookie, \
     MISSING_FILENAME_TEXT, LoadError
from _headersutil import join_header_words, split_header_words
from _util import iso2time, time2isoz

debug = logging.getLogger("mechanize").debug


def lwp_cookie_str(cookie):
    """Return string representation of Cookie in the LWP cookie file format.

    Actually, the format is extended a bit -- see module docstring.

    """
    h = [(cookie.name, cookie.value),
         ("path", cookie.path),
         ("domain", cookie.domain)]
    if cookie.port is not None: h.append(("port", cookie.port))
    if cookie.path_specified: h.append(("path_spec", None))
    if cookie.port_specified: h.append(("port_spec", None))
    if cookie.domain_initial_dot: h.append(("domain_dot", None))
    if cookie.secure: h.append(("secure", None))
    if cookie.expires: h.append(("expires",
                                 time2isoz(float(cookie.expires))))
    if cookie.discard: h.append(("discard", None))
    if cookie.comment: h.append(("comment", cookie.comment))
    if cookie.comment_url: h.append(("commenturl", cookie.comment_url))
    if cookie.rfc2109: h.append(("rfc2109", None))

    keys = cookie.nonstandard_attr_keys()
    keys.sort()
    for k in keys:
        h.append((k, str(cookie.get_nonstandard_attr(k))))

    h.append(("version", str(cookie.version)))

    return join_header_words([h])

class LWPCookieJar(FileCookieJar):
    """
    The LWPCookieJar saves a sequence of "Set-Cookie3" lines.
    "Set-Cookie3" is the format used by the libwww-perl library, not known
    to be compatible with any browser, but which is easy to read and
    doesn't lose information about RFC 2965 cookies.

    Additional methods

    as_lwp_str(ignore_discard=True, ignore_expires=True)

    """

    magic_re = r"^\#LWP-Cookies-(\d+\.\d+)"

    def as_lwp_str(self, ignore_discard=True, ignore_expires=True):
        """Return cookies as a string of "\n"-separated "Set-Cookie3" headers.

        ignore_discard and ignore_expires: see docstring for FileCookieJar.save

        """
        now = time.time()
        r = []
        for cookie in self:
            if not ignore_discard and cookie.discard:
                debug("   Not saving %s: marked for discard", cookie.name)
                continue
            if not ignore_expires and cookie.is_expired(now):
                debug("   Not saving %s: expired", cookie.name)
                continue
            r.append("Set-Cookie3: %s" % lwp_cookie_str(cookie))
        return "\n".join(r+[""])

    def save(self, filename=None, ignore_discard=False, ignore_expires=False):
        if filename is None:
            if self.filename is not None: filename = self.filename
            else: raise ValueError(MISSING_FILENAME_TEXT)

        f = open(filename, "w")
        try:
            debug("Saving LWP cookies file")
            # There really isn't an LWP Cookies 2.0 format, but this indicates
            # that there is extra information in here (domain_dot and
            # port_spec) while still being compatible with libwww-perl, I hope.
            f.write("#LWP-Cookies-2.0\n")
            f.write(self.as_lwp_str(ignore_discard, ignore_expires))
        finally:
            f.close()

    def _really_load(self, f, filename, ignore_discard, ignore_expires):
        magic = f.readline()
        if not re.search(self.magic_re, magic):
            msg = "%s does not seem to contain cookies" % filename
            raise LoadError(msg)

        now = time.time()

        header = "Set-Cookie3:"
        boolean_attrs = ("port_spec", "path_spec", "domain_dot",
                         "secure", "discard", "rfc2109")
        value_attrs = ("version",
                       "port", "path", "domain",
                       "expires",
                       "comment", "commenturl")

        try:
            while 1:
                line = f.readline()
                if line == "": break
                if not line.startswith(header):
                    continue
                line = line[len(header):].strip()

                for data in split_header_words([line]):
                    name, value = data[0]
                    standard = {}
                    rest = {}
                    for k in boolean_attrs:
                        standard[k] = False
                    for k, v in data[1:]:
                        if k is not None:
                            lc = k.lower()
                        else:
                            lc = None
                        # don't lose case distinction for unknown fields
                        if (lc in value_attrs) or (lc in boolean_attrs):
                            k = lc
                        if k in boolean_attrs:
                            if v is None: v = True
                            standard[k] = v
                        elif k in value_attrs:
                            standard[k] = v
                        else:
                            rest[k] = v

                    h = standard.get
                    expires = h("expires")
                    discard = h("discard")
                    if expires is not None:
                        expires = iso2time(expires)
                    if expires is None:
                        discard = True
                    domain = h("domain")
                    domain_specified = domain.startswith(".")
                    c = Cookie(h("version"), name, value,
                               h("port"), h("port_spec"),
                               domain, domain_specified, h("domain_dot"),
                               h("path"), h("path_spec"),
                               h("secure"),
                               expires,
                               discard,
                               h("comment"),
                               h("commenturl"),
                               rest,
                               h("rfc2109"),
                               )
                    if not ignore_discard and c.discard:
                        continue
                    if not ignore_expires and c.is_expired(now):
                        continue
                    self.set_cookie(c)
        except:
            reraise_unmasked_exceptions((IOError,))
            raise LoadError("invalid Set-Cookie3 format file %s" % filename)
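

# Usage sketch (not part of the original module): saving and restoring a
# session with LWPCookieJar, assuming the public mechanize API; the filename
# is illustrative only.
#
#     import mechanize
#     cj = mechanize.LWPCookieJar()
#     opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(cj))
#     opener.open("http://example.com/")
#     cj.save("cookies.lwp", ignore_discard=True, ignore_expires=True)
#     # later, in another process:
#     cj = mechanize.LWPCookieJar()
#     cj.load("cookies.lwp", ignore_discard=True, ignore_expires=True)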
@ -0,0 +1,393 @@
# Taken from Python 2.6.4 for use by _sgmllib.py
"""Shared support for scanning document type declarations in HTML and XHTML.

This module is used as a foundation for the HTMLParser and sgmllib
modules (indirectly, for htmllib as well).  It has no documented
public API and should not be used directly.

"""

import re

_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]*\s*').match
_declstringlit_match = re.compile(r'(\'[^\']*\'|"[^"]*")\s*').match
_commentclose = re.compile(r'--\s*>')
_markedsectionclose = re.compile(r']\s*]\s*>')

# An analysis of the MS-Word extensions is available at
# http://www.planetpublish.com/xmlarena/xap/Thursday/WordtoXML.pdf

_msmarkedsectionclose = re.compile(r']\s*>')

del re


class ParserBase:
    """Parser base class which provides some common support methods used
    by the SGML/HTML and XHTML parsers."""

    def __init__(self):
        if self.__class__ is ParserBase:
            raise RuntimeError(
                "markupbase.ParserBase must be subclassed")

    def error(self, message):
        raise NotImplementedError(
            "subclasses of ParserBase must override error()")

    def reset(self):
        self.lineno = 1
        self.offset = 0

    def getpos(self):
        """Return current line number and offset."""
        return self.lineno, self.offset

    # Internal -- update line number and offset.  This should be
    # called for each piece of data exactly once, in order -- in other
    # words the concatenation of all the input strings to this
    # function should be exactly the entire input.
    def updatepos(self, i, j):
        if i >= j:
            return j
        rawdata = self.rawdata
        nlines = rawdata.count("\n", i, j)
        if nlines:
            self.lineno = self.lineno + nlines
            pos = rawdata.rindex("\n", i, j)  # Should not fail
            self.offset = j-(pos+1)
        else:
            self.offset = self.offset + j-i
        return j

    _decl_otherchars = ''

    # Internal -- parse declaration (for use by subclasses).
    def parse_declaration(self, i):
        # This is some sort of declaration; in "HTML as
        # deployed," this should only be the document type
        # declaration ("<!DOCTYPE html...>").
        # ISO 8879:1986, however, has more complex
        # declaration syntax for elements in <!...>, including:
        # --comment--
        # [marked section]
        # name in the following list: ENTITY, DOCTYPE, ELEMENT,
        # ATTLIST, NOTATION, SHORTREF, USEMAP,
        # LINKTYPE, LINK, IDLINK, USELINK, SYSTEM
        rawdata = self.rawdata
        j = i + 2
        assert rawdata[i:j] == "<!", "unexpected call to parse_declaration"
        if rawdata[j:j+1] == ">":
            # the empty comment <!>
            return j + 1
        if rawdata[j:j+1] in ("-", ""):
            # Start of comment followed by buffer boundary,
            # or just a buffer boundary.
            return -1
        # A simple, practical version could look like: ((name|stringlit) S*) + '>'
        n = len(rawdata)
        if rawdata[j:j+2] == '--':  # comment
            # Locate --.*-- as the body of the comment
            return self.parse_comment(i)
        elif rawdata[j] == '[':  # marked section
            # Locate [statusWord [...arbitrary SGML...]] as the body of the marked section
            # Where statusWord is one of TEMP, CDATA, IGNORE, INCLUDE, RCDATA
            # Note that this is extended by Microsoft Office "Save as Web" function
            # to include [if...] and [endif].
            return self.parse_marked_section(i)
        else:  # all other declaration elements
            decltype, j = self._scan_name(j, i)
        if j < 0:
            return j
        if decltype == "doctype":
            self._decl_otherchars = ''
        while j < n:
            c = rawdata[j]
            if c == ">":
                # end of declaration syntax
                data = rawdata[i+2:j]
                if decltype == "doctype":
                    self.handle_decl(data)
                else:
                    self.unknown_decl(data)
                return j + 1
            if c in "\"'":
                m = _declstringlit_match(rawdata, j)
                if not m:
                    return -1  # incomplete
                j = m.end()
            elif c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ":
                name, j = self._scan_name(j, i)
            elif c in self._decl_otherchars:
                j = j + 1
            elif c == "[":
                # this could be handled in a separate doctype parser
                if decltype == "doctype":
                    j = self._parse_doctype_subset(j + 1, i)
                elif decltype in ("attlist", "linktype", "link", "element"):
                    # must tolerate []'d groups in a content model in an element declaration
                    # also in data attribute specifications of attlist declaration
                    # also link type declaration subsets in linktype declarations
                    # also link attribute specification lists in link declarations
                    self.error("unsupported '[' char in %s declaration" % decltype)
                else:
                    self.error("unexpected '[' char in declaration")
            else:
                self.error(
                    "unexpected %r char in declaration" % rawdata[j])
            if j < 0:
                return j
        return -1  # incomplete

    # Internal -- parse a marked section
    # Override this to handle MS-word extension syntax <![if word]>content<![endif]>
    def parse_marked_section(self, i, report=1):
        rawdata = self.rawdata
        assert rawdata[i:i+3] == '<![', "unexpected call to parse_marked_section()"
        sectName, j = self._scan_name(i+3, i)
        if j < 0:
            return j
        if sectName in ("temp", "cdata", "ignore", "include", "rcdata"):
            # look for standard ]]> ending
            match = _markedsectionclose.search(rawdata, i+3)
        elif sectName in ("if", "else", "endif"):
            # look for MS Office ]> ending
            match = _msmarkedsectionclose.search(rawdata, i+3)
        else:
            self.error('unknown status keyword %r in marked section' % rawdata[i+3:j])
        if not match:
            return -1
        if report:
            j = match.start(0)
            self.unknown_decl(rawdata[i+3: j])
        return match.end(0)

    # Internal -- parse comment, return length or -1 if not terminated
    def parse_comment(self, i, report=1):
        rawdata = self.rawdata
        if rawdata[i:i+4] != '<!--':
            self.error('unexpected call to parse_comment()')
        match = _commentclose.search(rawdata, i+4)
        if not match:
            return -1
        if report:
            j = match.start(0)
            self.handle_comment(rawdata[i+4: j])
        return match.end(0)

    # Internal -- scan past the internal subset in a <!DOCTYPE declaration,
    # returning the index just past any whitespace following the trailing ']'.
    def _parse_doctype_subset(self, i, declstartpos):
        rawdata = self.rawdata
        n = len(rawdata)
        j = i
        while j < n:
            c = rawdata[j]
            if c == "<":
                s = rawdata[j:j+2]
                if s == "<":
                    # end of buffer; incomplete
                    return -1
                if s != "<!":
                    self.updatepos(declstartpos, j + 1)
                    self.error("unexpected char in internal subset (in %r)" % s)
                if (j + 2) == n:
                    # end of buffer; incomplete
                    return -1
                if (j + 4) > n:
                    # end of buffer; incomplete
                    return -1
                if rawdata[j:j+4] == "<!--":
                    j = self.parse_comment(j, report=0)
                    if j < 0:
                        return j
                    continue
                name, j = self._scan_name(j + 2, declstartpos)
                if j == -1:
                    return -1
                if name not in ("attlist", "element", "entity", "notation"):
                    self.updatepos(declstartpos, j + 2)
                    self.error(
                        "unknown declaration %r in internal subset" % name)
                # handle the individual names
                meth = getattr(self, "_parse_doctype_" + name)
                j = meth(j, declstartpos)
                if j < 0:
                    return j
            elif c == "%":
                # parameter entity reference
                if (j + 1) == n:
                    # end of buffer; incomplete
                    return -1
                s, j = self._scan_name(j + 1, declstartpos)
                if j < 0:
                    return j
                if rawdata[j] == ";":
                    j = j + 1
            elif c == "]":
                j = j + 1
                while j < n and rawdata[j].isspace():
                    j = j + 1
                if j < n:
                    if rawdata[j] == ">":
                        return j
                    self.updatepos(declstartpos, j)
                    self.error("unexpected char after internal subset")
                else:
                    return -1
            elif c.isspace():
                j = j + 1
            else:
                self.updatepos(declstartpos, j)
                self.error("unexpected char %r in internal subset" % c)
        # end of buffer reached
        return -1

    # Internal -- scan past <!ELEMENT declarations
    def _parse_doctype_element(self, i, declstartpos):
        name, j = self._scan_name(i, declstartpos)
        if j == -1:
            return -1
        # style content model; just skip until '>'
        rawdata = self.rawdata
        if '>' in rawdata[j:]:
            return rawdata.find(">", j) + 1
        return -1

    # Internal -- scan past <!ATTLIST declarations
    def _parse_doctype_attlist(self, i, declstartpos):
        rawdata = self.rawdata
        name, j = self._scan_name(i, declstartpos)
        c = rawdata[j:j+1]
        if c == "":
            return -1
        if c == ">":
            return j + 1
        while 1:
            # scan a series of attribute descriptions; simplified:
            #   name type [value] [#constraint]
            name, j = self._scan_name(j, declstartpos)
            if j < 0:
                return j
            c = rawdata[j:j+1]
            if c == "":
                return -1
            if c == "(":
                # an enumerated type; look for ')'
                if ")" in rawdata[j:]:
                    j = rawdata.find(")", j) + 1
                else:
                    return -1
                while rawdata[j:j+1].isspace():
                    j = j + 1
                if not rawdata[j:]:
                    # end of buffer, incomplete
                    return -1
            else:
                name, j = self._scan_name(j, declstartpos)
            c = rawdata[j:j+1]
            if not c:
                return -1
            if c in "'\"":
                m = _declstringlit_match(rawdata, j)
                if m:
                    j = m.end()
                else:
                    return -1
                c = rawdata[j:j+1]
                if not c:
                    return -1
            if c == "#":
                if rawdata[j:] == "#":
                    # end of buffer
                    return -1
                name, j = self._scan_name(j + 1, declstartpos)
                if j < 0:
                    return j
                c = rawdata[j:j+1]
                if not c:
                    return -1
            if c == '>':
                # all done
                return j + 1

    # Internal -- scan past <!NOTATION declarations
    def _parse_doctype_notation(self, i, declstartpos):
        name, j = self._scan_name(i, declstartpos)
        if j < 0:
            return j
        rawdata = self.rawdata
        while 1:
            c = rawdata[j:j+1]
            if not c:
                # end of buffer; incomplete
                return -1
            if c == '>':
                return j + 1
            if c in "'\"":
                m = _declstringlit_match(rawdata, j)
                if not m:
                    return -1
                j = m.end()
            else:
                name, j = self._scan_name(j, declstartpos)
                if j < 0:
                    return j

    # Internal -- scan past <!ENTITY declarations
    def _parse_doctype_entity(self, i, declstartpos):
        rawdata = self.rawdata
        if rawdata[i:i+1] == "%":
            j = i + 1
            while 1:
                c = rawdata[j:j+1]
                if not c:
                    return -1
                if c.isspace():
                    j = j + 1
                else:
                    break
        else:
            j = i
        name, j = self._scan_name(j, declstartpos)
        if j < 0:
            return j
        while 1:
            c = self.rawdata[j:j+1]
            if not c:
                return -1
            if c in "'\"":
                m = _declstringlit_match(rawdata, j)
                if m:
                    j = m.end()
                else:
                    return -1  # incomplete
            elif c == ">":
                return j + 1
            else:
                name, j = self._scan_name(j, declstartpos)
                if j < 0:
                    return j

    # Internal -- scan a name token; return the new position and the token,
    # or (None, -1) if we've reached the end of the buffer.
    def _scan_name(self, i, declstartpos):
        rawdata = self.rawdata
        n = len(rawdata)
        if i == n:
            return None, -1
        m = _declname_match(rawdata, i)
        if m:
            s = m.group()
            name = s.strip()
            if (i + len(s)) == n:
                return None, -1  # end of buffer
            return name.lower(), m.end()
        else:
            self.updatepos(declstartpos, i)
            self.error("expected name token at %r"
                       % rawdata[declstartpos:declstartpos+20])

    # To be overridden -- handlers for unknown objects
    def unknown_decl(self, data):
        pass
@ -0,0 +1,669 @@
"""Stateful programmatic WWW navigation, after Perl's WWW::Mechanize.

Copyright 2003-2006 John J. Lee <jjl@pobox.com>
Copyright 2003 Andy Lester (original Perl code)

This code is free software; you can redistribute it and/or modify it
under the terms of the BSD or ZPL 2.1 licenses (see the file COPYING.txt
included with the distribution).

"""

import copy, re, os, urllib, urllib2

from _html import DefaultFactory
import _response
import _request
import _rfc3986
import _sockettimeout
import _urllib2_fork
from _useragent import UserAgentBase


class BrowserStateError(Exception): pass
class LinkNotFoundError(Exception): pass
class FormNotFoundError(Exception): pass


def sanepathname2url(path):
    urlpath = urllib.pathname2url(path)
    if os.name == "nt" and urlpath.startswith("///"):
        urlpath = urlpath[2:]
    # XXX don't ask me about the mac...
    return urlpath


class History:
    """

    Though this will become public, the implied interface is not yet stable.

    """
    def __init__(self):
        self._history = []  # LIFO
    def add(self, request, response):
        self._history.append((request, response))
    def back(self, n, _response):
        response = _response  # XXX move Browser._response into this class?
        while n > 0 or response is None:
            try:
                request, response = self._history.pop()
            except IndexError:
                raise BrowserStateError("already at start of history")
            n -= 1
        return request, response
    def clear(self):
        del self._history[:]
    def close(self):
        for request, response in self._history:
            if response is not None:
                response.close()
        del self._history[:]


class HTTPRefererProcessor(_urllib2_fork.BaseHandler):
    def http_request(self, request):
        # See RFC 2616 14.36.  The only times we know the source of the
        # request URI has a URI associated with it are redirect, and
        # Browser.click() / Browser.submit() / Browser.follow_link().
        # Otherwise, it's the user's job to add any Referer header before
        # .open()ing.
        if hasattr(request, "redirect_dict"):
            request = self.parent._add_referer_header(
                request, origin_request=False)
        return request

    https_request = http_request

class Browser(UserAgentBase):
    """Browser-like class with support for history, forms and links.

    BrowserStateError is raised whenever the browser is in the wrong state to
    complete the requested operation - e.g., when .back() is called when the
    browser history is empty, or when .follow_link() is called when the current
    response does not contain HTML data.

    Public attributes:

    request: current request (mechanize.Request)
    form: currently selected form (see .select_form())

    """

    handler_classes = copy.copy(UserAgentBase.handler_classes)
    handler_classes["_referer"] = HTTPRefererProcessor
    default_features = copy.copy(UserAgentBase.default_features)
    default_features.append("_referer")

    def __init__(self,
                 factory=None,
                 history=None,
                 request_class=None,
                 ):
        """

        Only named arguments should be passed to this constructor.

        factory: object implementing the mechanize.Factory interface.
        history: object implementing the mechanize.History interface.  Note
         this interface is still experimental and may change in future.
        request_class: Request class to use.  Defaults to mechanize.Request

        The Factory and History objects passed in are 'owned' by the Browser,
        so they should not be shared across Browsers.  In particular,
        factory.set_response() should not be called except by the owning
        Browser itself.

        Note that the supplied factory's request_class is overridden by this
        constructor, to ensure only one Request class is used.

        """
        self._handle_referer = True

        if history is None:
            history = History()
        self._history = history

        if request_class is None:
            request_class = _request.Request

        if factory is None:
            factory = DefaultFactory()
        factory.set_request_class(request_class)
        self._factory = factory
        self.request_class = request_class

        self.request = None
        self._set_response(None, False)

        # do this last to avoid __getattr__ problems
        UserAgentBase.__init__(self)
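
    # Usage sketch (not part of the original class): the typical stateful
    # browsing flow, assuming the public Browser API; the URL, link text and
    # form/field names are illustrative only.
    #
    #     import re, mechanize
    #     br = mechanize.Browser()
    #     br.open("http://example.com/")
    #     br.follow_link(text_regex=re.compile("log ?in"))
    #     br.select_form(name="login")
    #     br["user"] = "me"          # HTMLForm access passed through __getattr__
    #     response = br.submit()
    #     print response.geturl()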

    def close(self):
        UserAgentBase.close(self)
        if self._response is not None:
            self._response.close()
        if self._history is not None:
            self._history.close()
            self._history = None

        # make use after .close easy to spot
        self.form = None
        self.request = self._response = None
        self.request = self.response = self.set_response = None
        self.geturl = self.reload = self.back = None
        self.clear_history = self.set_cookie = self.links = self.forms = None
        self.viewing_html = self.encoding = self.title = None
        self.select_form = self.click = self.submit = self.click_link = None
        self.follow_link = self.find_link = None

    def set_handle_referer(self, handle):
        """Set whether to add Referer header to each request."""
        self._set_handler("_referer", handle)
        self._handle_referer = bool(handle)

    def _add_referer_header(self, request, origin_request=True):
        if self.request is None:
            return request
        scheme = request.get_type()
        original_scheme = self.request.get_type()
        if scheme not in ["http", "https"]:
            return request
        if not origin_request and not self.request.has_header("Referer"):
            return request

        if (self._handle_referer and
            original_scheme in ["http", "https"] and
            not (original_scheme == "https" and scheme != "https")):
            # strip URL fragment (RFC 2616 14.36)
            parts = _rfc3986.urlsplit(self.request.get_full_url())
            parts = parts[:-1]+(None,)
            referer = _rfc3986.urlunsplit(parts)
            request.add_unredirected_header("Referer", referer)
        return request

    def open_novisit(self, url, data=None,
                     timeout=_sockettimeout._GLOBAL_DEFAULT_TIMEOUT):
        """Open a URL without visiting it.

        Browser state (including request, response, history, forms and links)
        is left unchanged by calling this function.

        The interface is the same as for .open().

        This is useful for things like fetching images.

        See also .retrieve().

        """
        return self._mech_open(url, data, visit=False, timeout=timeout)

    def open(self, url, data=None,
             timeout=_sockettimeout._GLOBAL_DEFAULT_TIMEOUT):
        return self._mech_open(url, data, timeout=timeout)

    def _mech_open(self, url, data=None, update_history=True, visit=None,
                   timeout=_sockettimeout._GLOBAL_DEFAULT_TIMEOUT):
        try:
            url.get_full_url
        except AttributeError:
            # string URL -- convert to absolute URL if required
            scheme, authority = _rfc3986.urlsplit(url)[:2]
            if scheme is None:
                # relative URL
                if self._response is None:
                    raise BrowserStateError(
                        "can't fetch relative reference: "
                        "not viewing any document")
                url = _rfc3986.urljoin(self._response.geturl(), url)

        request = self._request(url, data, visit, timeout)
        visit = request.visit
        if visit is None:
            visit = True

        if visit:
            self._visit_request(request, update_history)

        success = True
        try:
            response = UserAgentBase.open(self, request, data)
        except urllib2.HTTPError, error:
            success = False
            if error.fp is None:  # not a response
                raise
            response = error
##         except (IOError, socket.error, OSError), error:
##             # Yes, urllib2 really does raise all these :-((
##             # See test_urllib2.py for examples of socket.gaierror and OSError,
##             # plus note that FTPHandler raises IOError.
##             # XXX I don't seem to have an example of exactly socket.error being
##             #  raised, only socket.gaierror...
##             # I don't want to start fixing these here, though, since this is a
##             # subclass of OpenerDirector, and it would break old code.  Even in
##             # Python core, a fix would need some backwards-compat. hack to be
##             # acceptable.
##             raise

        if visit:
            self._set_response(response, False)
            response = copy.copy(self._response)
        elif response is not None:
            response = _response.upgrade_response(response)

        if not success:
            raise response
        return response

    def __str__(self):
        text = []
        text.append("<%s " % self.__class__.__name__)
        if self._response:
            text.append("visiting %s" % self._response.geturl())
        else:
            text.append("(not visiting a URL)")
        if self.form:
            text.append("\n selected form:\n %s\n" % str(self.form))
        text.append(">")
        return "".join(text)

    def response(self):
        """Return a copy of the current response.

        The returned object has the same interface as the object returned by
        .open() (or mechanize.urlopen()).

        """
        return copy.copy(self._response)

    def open_local_file(self, filename):
        path = sanepathname2url(os.path.abspath(filename))
        url = 'file://'+path
        return self.open(url)

    def set_response(self, response):
        """Replace current response with (a copy of) response.

        response may be None.

        This is intended mostly for HTML-preprocessing.
        """
        self._set_response(response, True)

    def _set_response(self, response, close_current):
        # sanity check, necessary but far from sufficient
        if not (response is None or
                (hasattr(response, "info") and hasattr(response, "geturl") and
                 hasattr(response, "read")
                 )
                ):
            raise ValueError("not a response object")

        self.form = None
        if response is not None:
            response = _response.upgrade_response(response)
        if close_current and self._response is not None:
            self._response.close()
        self._response = response
        self._factory.set_response(response)

    def visit_response(self, response, request=None):
        """Visit the response, as if it had been .open()ed.

        Unlike .set_response(), this updates history rather than replacing the
        current response.
        """
        if request is None:
            request = _request.Request(response.geturl())
        self._visit_request(request, True)
        self._set_response(response, False)

    def _visit_request(self, request, update_history):
        if self._response is not None:
            self._response.close()
        if self.request is not None and update_history:
            self._history.add(self.request, self._response)
        self._response = None
        # we want self.request to be assigned even if UserAgentBase.open
        # fails
        self.request = request

    def geturl(self):
        """Get URL of current document."""
        if self._response is None:
            raise BrowserStateError("not viewing any document")
        return self._response.geturl()

    def reload(self):
        """Reload current document, and return response object."""
        if self.request is None:
            raise BrowserStateError("no URL has yet been .open()ed")
        if self._response is not None:
            self._response.close()
        return self._mech_open(self.request, update_history=False)

    def back(self, n=1):
        """Go back n steps in history, and return response object.

        n: go back this number of steps (default 1 step)

        """
        if self._response is not None:
            self._response.close()
        self.request, response = self._history.back(n, self._response)
        self.set_response(response)
        if not response.read_complete:
            return self.reload()
        return copy.copy(response)

    def clear_history(self):
        self._history.clear()
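
    # Usage sketch (not part of the original class): history navigation,
    # assuming a Browser instance br and illustrative URLs.
    #
    #     br.open("http://example.com/")
    #     br.open("http://example.com/next")
    #     br.back()                  # now viewing http://example.com/ again
    #     assert br.geturl() == "http://example.com/"
    #     br.reload()                # re-fetch without adding to history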

    def set_cookie(self, cookie_string):
        """Request to set a cookie.

        Note that it is NOT necessary to call this method under ordinary
        circumstances: cookie handling is normally entirely automatic.  The
        intended use case is rather to simulate the setting of a cookie by
        client script in a web page (e.g. JavaScript).  In that case, use of
        this method is necessary because mechanize currently does not support
        JavaScript, VBScript, etc.

        The cookie is added in the same way as if it had arrived with the
        current response, as a result of the current request.  This means that,
        for example, if it is not appropriate to set the cookie based on the
        current request, no cookie will be set.

        The cookie will be returned automatically with subsequent responses
        made by the Browser instance whenever that's appropriate.

        cookie_string should be a valid value of the Set-Cookie header.

        For example:

        browser.set_cookie(
            "sid=abcdef; expires=Wednesday, 09-Nov-06 23:12:40 GMT")

        Currently, this method does not allow for adding RFC 2965 cookies.
        This limitation will be lifted if anybody requests it.

        """
        if self._response is None:
            raise BrowserStateError("not viewing any document")
        if self.request.get_type() not in ["http", "https"]:
            raise BrowserStateError("can't set cookie for non-HTTP/HTTPS "
                                    "transactions")
        cookiejar = self._ua_handlers["_cookies"].cookiejar
        response = self.response()  # copy
        headers = response.info()
        headers["Set-cookie"] = cookie_string
        cookiejar.extract_cookies(response, self.request)

    def links(self, **kwds):
        """Return iterable over links (mechanize.Link objects)."""
        if not self.viewing_html():
            raise BrowserStateError("not viewing HTML")
        links = self._factory.links()
        if kwds:
            return self._filter_links(links, **kwds)
        else:
            return links

    def forms(self):
        """Return iterable over forms.

        The returned form objects implement the mechanize.HTMLForm interface.

        """
        if not self.viewing_html():
            raise BrowserStateError("not viewing HTML")
        return self._factory.forms()

    def global_form(self):
        """Return the global form object, or None if the factory implementation
        did not supply one.

        The "global" form object contains all controls that are not descendants
        of any FORM element.

        The returned form object implements the mechanize.HTMLForm interface.

        This is a separate method since the global form is not regarded as part
        of the sequence of forms in the document -- mostly for
        backwards-compatibility.

        """
        if not self.viewing_html():
            raise BrowserStateError("not viewing HTML")
        return self._factory.global_form
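
    # Usage sketch (not part of the original class): iterating links and
    # forms, assuming a Browser instance br viewing an HTML page.
    #
    #     import re
    #     for link in br.links(url_regex=re.compile(r"\.pdf$")):
    #         print link.text, link.absolute_url
    #     for form in br.forms():
    #         print form.name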

    def viewing_html(self):
        """Return whether the current response contains HTML data."""
        if self._response is None:
            raise BrowserStateError("not viewing any document")
        return self._factory.is_html

    def encoding(self):
        if self._response is None:
            raise BrowserStateError("not viewing any document")
        return self._factory.encoding

    def title(self):
        r"""Return title, or None if there is no title element in the document.

        Treatment of any tag children of the title element attempts to follow
        Firefox and IE (currently, tags are preserved).

        """
        if not self.viewing_html():
            raise BrowserStateError("not viewing HTML")
        return self._factory.title

    def select_form(self, name=None, predicate=None, nr=None):
        """Select an HTML form for input.

        This is a bit like giving a form the "input focus" in a browser.

        If a form is selected, the Browser object supports the HTMLForm
        interface, so you can call methods like .set_value(), .set(), and
        .click().

        Another way to select a form is to assign to the .form attribute.  The
        form assigned should be one of the objects returned by the .forms()
        method.

        At least one of the name, predicate and nr arguments must be supplied.
        If no matching form is found, mechanize.FormNotFoundError is raised.

        If name is specified, then the form must have the indicated name.

        If predicate is specified, then the form must match that function.  The
        predicate function is passed the HTMLForm as its single argument, and
        should return a boolean value indicating whether the form matched.

        nr, if supplied, is the sequence number of the form (where 0 is the
        first).  Note that form 0 is the first form matching all the other
        arguments (if supplied); it is not necessarily the first form in the
        document.  The "global form" (consisting of all form controls not
        contained in any FORM element) is considered not to be part of this
        sequence and to have no name, so will not be matched unless both name
        and nr are None.

        """
        if not self.viewing_html():
            raise BrowserStateError("not viewing HTML")
        if (name is None) and (predicate is None) and (nr is None):
            raise ValueError(
                "at least one argument must be supplied to specify form")

        global_form = self._factory.global_form
        if nr is None and name is None and \
               predicate is not None and predicate(global_form):
            self.form = global_form
            return

        orig_nr = nr
        for form in self.forms():
            if name is not None and name != form.name:
                continue
            if predicate is not None and not predicate(form):
                continue
            if nr:
                nr -= 1
                continue
            self.form = form
            break  # success
        else:
            # failure
            description = []
            if name is not None: description.append("name '%s'" % name)
            if predicate is not None:
                description.append("predicate %s" % predicate)
            if orig_nr is not None: description.append("nr %d" % orig_nr)
            description = ", ".join(description)
            raise FormNotFoundError("no form matching "+description)
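
    # Usage sketch (not part of the original class): selecting forms by name,
    # by position, or by predicate, assuming a Browser instance br; the names
    # and indices are illustrative only.
    #
    #     br.select_form(name="search")
    #     br.select_form(nr=0)       # first form in the page
    #     br.select_form(predicate=lambda f: f.attrs.get("id") == "login")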

    def click(self, *args, **kwds):
        """See mechanize.HTMLForm.click for documentation."""
        if not self.viewing_html():
            raise BrowserStateError("not viewing HTML")
        request = self.form.click(*args, **kwds)
        return self._add_referer_header(request)

    def submit(self, *args, **kwds):
        """Submit current form.

        Arguments are as for mechanize.HTMLForm.click().

        Return value is same as for Browser.open().

        """
        return self.open(self.click(*args, **kwds))

    def click_link(self, link=None, **kwds):
        """Find a link and return a Request object for it.

        Arguments are as for .find_link(), except that a link may be supplied
        as the first argument.

        """
        if not self.viewing_html():
            raise BrowserStateError("not viewing HTML")
        if not link:
            link = self.find_link(**kwds)
        else:
            if kwds:
                raise ValueError(
                    "either pass a Link, or keyword arguments, not both")
        request = self.request_class(link.absolute_url)
        return self._add_referer_header(request)

    def follow_link(self, link=None, **kwds):
        """Find a link and .open() it.

        Arguments are as for .click_link().

        Return value is same as for Browser.open().

        """
        return self.open(self.click_link(link, **kwds))

    def find_link(self, **kwds):
        """Find a link in current page.

        Links are returned as mechanize.Link objects.

        # Return third link that .search()-matches the regexp "python"
        # (by ".search()-matches", I mean that the regular expression method
        # .search() is used, rather than .match()).
        find_link(text_regex=re.compile("python"), nr=2)

        # Return first http link in the current page that points to somewhere
        # on python.org whose link text (after tags have been removed) is
        # exactly "monty python".
        find_link(text="monty python",
                  url_regex=re.compile("http.*python.org"))

        # Return first link with exactly three HTML attributes.
        find_link(predicate=lambda link: len(link.attrs) == 3)

        Links include anchors (<a>), image maps (<area>), and frames (<frame>,
        <iframe>).

        All arguments must be passed by keyword, not position.  Zero or more
        arguments may be supplied.  In order to find a link, all arguments
        supplied must match.

        If a matching link is not found, mechanize.LinkNotFoundError is raised.

        text: link text between link tags: e.g. <a href="blah">this bit</a> (as
         returned by pullparser.get_compressed_text(), ie. without tags but
         with opening tags "textified" as per the pullparser docs) must compare
         equal to this argument, if supplied
        text_regex: link text between tag (as defined above) must match the
         regular expression object or regular expression string passed as this
         argument, if supplied
        name, name_regex: as for text and text_regex, but matched against the
         name HTML attribute of the link tag
        url, url_regex: as for text and text_regex, but matched against the
         URL of the link tag (note this matches against Link.url, which is a
         relative or absolute URL according to how it was written in the HTML)
        tag: element name of opening tag, e.g. "a"
        predicate: a function taking a Link object as its single argument,
         returning a boolean result, indicating whether the link matches
        nr: matches the nth link that matches all other criteria (default 0)

        """
        try:
            return self._filter_links(self._factory.links(), **kwds).next()
        except StopIteration:
            raise LinkNotFoundError()

    def __getattr__(self, name):
        # pass through _form.HTMLForm methods and attributes
        form = self.__dict__.get("form")
        if form is None:
            raise AttributeError(
                "%s instance has no attribute %s (perhaps you forgot to "
                ".select_form()?)" % (self.__class__, name))
        return getattr(form, name)

    def _filter_links(self, links,
                      text=None, text_regex=None,
                      name=None, name_regex=None,
                      url=None, url_regex=None,
                      tag=None,
                      predicate=None,
                      nr=0
                      ):
        if not self.viewing_html():
            raise BrowserStateError("not viewing HTML")

        orig_nr = nr

        for link in links:
            if url is not None and url != link.url:
                continue
            if url_regex is not None and not re.search(url_regex, link.url):
                continue
            if (text is not None and
                (link.text is None or text != link.text)):
                continue
            if (text_regex is not None and
                (link.text is None or not re.search(text_regex, link.text))):
                continue
            if name is not None and name != dict(link.attrs).get("name"):
                continue
            if name_regex is not None:
                link_name = dict(link.attrs).get("name")
                if link_name is None or not re.search(name_regex, link_name):
                    continue
            if tag is not None and tag != link.tag:
                continue
            if predicate is not None and not predicate(link):
                continue
            if nr:
                nr -= 1
                continue
            yield link
            nr = orig_nr
@ -0,0 +1,161 @@
"""Mozilla / Netscape cookie loading / saving.

Copyright 2002-2006 John J Lee <jjl@pobox.com>
Copyright 1997-1999 Gisle Aas (original libwww-perl code)

This code is free software; you can redistribute it and/or modify it
under the terms of the BSD or ZPL 2.1 licenses (see the file
COPYING.txt included with the distribution).

"""

import re, time, logging

from _clientcookie import reraise_unmasked_exceptions, FileCookieJar, Cookie, \
     MISSING_FILENAME_TEXT, LoadError
debug = logging.getLogger("ClientCookie").debug


class MozillaCookieJar(FileCookieJar):
    """

    WARNING: you may want to backup your browser's cookies file if you use
    this class to save cookies.  I *think* it works, but there have been
    bugs in the past!

    This class differs from CookieJar only in the format it uses to save and
    load cookies to and from a file.  This class uses the Mozilla/Netscape
    `cookies.txt' format.  lynx uses this file format, too.

    Don't expect cookies saved while the browser is running to be noticed by
    the browser (in fact, Mozilla on unix will overwrite your saved cookies if
    you change them on disk while it's running; on Windows, you probably can't
    save at all while the browser is running).

    Note that the Mozilla/Netscape format will downgrade RFC2965 cookies to
    Netscape cookies on saving.

    In particular, the cookie version and port number information is lost,
    together with information about whether or not Path, Port and Discard were
    specified by the Set-Cookie2 (or Set-Cookie) header, and whether or not the
    domain as set in the HTTP header started with a dot (yes, I'm aware some
    domains in Netscape files start with a dot and some don't -- trust me, you
    really don't want to know any more about this).

    Note that though Mozilla and Netscape use the same format, they use
    slightly different headers.  The class saves cookies using the Netscape
    header by default (Mozilla can cope with that).

    """
    magic_re = "#( Netscape)? HTTP Cookie File"
    header = """\
# Netscape HTTP Cookie File
# http://www.netscape.com/newsref/std/cookie_spec.html
# This is a generated file!  Do not edit.

"""

    def _really_load(self, f, filename, ignore_discard, ignore_expires):
        now = time.time()

        magic = f.readline()
        if not re.search(self.magic_re, magic):
            f.close()
            raise LoadError(
                "%s does not look like a Netscape format cookies file" %
                filename)

        try:
            while 1:
                line = f.readline()
                if line == "": break

                # last field may be absent, so keep any trailing tab
                if line.endswith("\n"): line = line[:-1]

                # skip comments and blank lines XXX what is $ for?
                if (line.strip().startswith("#") or
                    line.strip().startswith("$") or
                    line.strip() == ""):
                    continue

                domain, domain_specified, path, secure, expires, name, value = \
                        line.split("\t", 6)
                secure = (secure == "TRUE")
                domain_specified = (domain_specified == "TRUE")
                if name == "":
                    name = value
                    value = None

                initial_dot = domain.startswith(".")
                if domain_specified != initial_dot:
                    raise LoadError("domain and domain specified flag don't "
                                    "match in %s: %s" % (filename, line))

                discard = False
                if expires == "":
                    expires = None
                    discard = True

                # assume path_specified is false
                c = Cookie(0, name, value,
                           None, False,
                           domain, domain_specified, initial_dot,
                           path, False,
                           secure,
                           expires,
                           discard,
                           None,
                           None,
                           {})
                if not ignore_discard and c.discard:
                    continue
                if not ignore_expires and c.is_expired(now):
                    continue
                self.set_cookie(c)

        except:
            reraise_unmasked_exceptions((IOError, LoadError))
            raise LoadError("invalid Netscape format file %s: %s" %
                            (filename, line))

    def save(self, filename=None, ignore_discard=False, ignore_expires=False):
        if filename is None:
            if self.filename is not None: filename = self.filename
            else: raise ValueError(MISSING_FILENAME_TEXT)

        f = open(filename, "w")
        try:
            debug("Saving Netscape cookies.txt file")
            f.write(self.header)
            now = time.time()
            for cookie in self:
                if not ignore_discard and cookie.discard:
                    debug("   Not saving %s: marked for discard", cookie.name)
                    continue
                if not ignore_expires and cookie.is_expired(now):
                    debug("   Not saving %s: expired", cookie.name)
                    continue
                if cookie.secure: secure = "TRUE"
                else: secure = "FALSE"
                if cookie.domain.startswith("."): initial_dot = "TRUE"
                else: initial_dot = "FALSE"
                if cookie.expires is not None:
                    expires = str(cookie.expires)
                else:
                    expires = ""
                if cookie.value is None:
                    # cookies.txt regards 'Set-Cookie: foo' as a cookie
                    # with no name, whereas cookielib regards it as a
                    # cookie with no value.
                    name = ""
                    value = cookie.name
                else:
                    name = cookie.name
                    value = cookie.value
                f.write(
                    "\t".join([cookie.domain, initial_dot, cookie.path,
                               secure, expires, name, value])+
                    "\n")
        finally:
            f.close()
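

# Usage sketch (not part of the original module): reusing an existing
# Netscape/Mozilla cookies.txt with a Browser, assuming the public mechanize
# API; the path and URL are illustrative only.
#
#     import mechanize
#     cj = mechanize.MozillaCookieJar()
#     cj.load("cookies.txt", ignore_discard=True, ignore_expires=True)
#     br = mechanize.Browser()
#     br.set_cookiejar(cj)
#     br.open("http://example.com/")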
@ -0,0 +1,388 @@
"""Microsoft Internet Explorer cookie loading on Windows.

Copyright 2002-2003 Johnny Lee <typo_pl@hotmail.com> (MSIE Perl code)
Copyright 2002-2006 John J Lee <jjl@pobox.com> (The Python port)

This code is free software; you can redistribute it and/or modify it
under the terms of the BSD or ZPL 2.1 licenses (see the file
COPYING.txt included with the distribution).

"""

# XXX names and comments are not great here

import os, re, time, struct, logging
if os.name == "nt":
    import _winreg

from _clientcookie import FileCookieJar, CookieJar, Cookie, \
     MISSING_FILENAME_TEXT, LoadError

debug = logging.getLogger("mechanize").debug


def regload(path, leaf):
    key = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER, path, 0,
                          _winreg.KEY_ALL_ACCESS)
    try:
        value = _winreg.QueryValueEx(key, leaf)[0]
    except WindowsError:
        value = None
    return value

WIN32_EPOCH = 0x019db1ded53e8000L  # 1970 Jan 01 00:00:00 in Win32 FILETIME

def epoch_time_offset_from_win32_filetime(filetime):
    """Convert from win32 filetime to seconds-since-epoch value.

    MSIE stores create and expire times as Win32 FILETIME, which is 64
    bits of 100 nanosecond intervals since Jan 01 1601.

    mechanize expects time as a 32-bit value expressed in seconds since
    the epoch (Jan 01 1970).

    """
    if filetime < WIN32_EPOCH:
        raise ValueError("filetime (%d) is before epoch (%d)" %
                         (filetime, WIN32_EPOCH))

    return divmod((filetime - WIN32_EPOCH), 10000000L)[0]
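# Quick sanity checks for the conversion above (illustrative additions,
# not part of the original module): the Win32 epoch itself maps to zero
# seconds, and ten million 100ns ticks map to exactly one second.
assert epoch_time_offset_from_win32_filetime(WIN32_EPOCH) == 0
assert epoch_time_offset_from_win32_filetime(WIN32_EPOCH + 10000000L) == 1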

def binary_to_char(c): return "%02X" % ord(c)
def binary_to_str(d): return "".join(map(binary_to_char, list(d)))

class MSIEBase:
    magic_re = re.compile(r"Client UrlCache MMF Ver \d\.\d.*")
    padding = "\x0d\xf0\xad\x0b"

    msie_domain_re = re.compile(r"^([^/]+)(/.*)$")
    cookie_re = re.compile("Cookie\:.+\@([\x21-\xFF]+).*?"
                           "(.+\@[\x21-\xFF]+\.txt)")

    # path under HKEY_CURRENT_USER from which to get location of index.dat
    reg_path = r"software\microsoft\windows" \
               r"\currentversion\explorer\shell folders"
    reg_key = "Cookies"

    def __init__(self):
        self._delayload_domains = {}

    def _delayload_domain(self, domain):
        # if necessary, lazily load cookies for this domain
        delayload_info = self._delayload_domains.get(domain)
        if delayload_info is not None:
            cookie_file, ignore_discard, ignore_expires = delayload_info
            try:
                self.load_cookie_data(cookie_file,
                                      ignore_discard, ignore_expires)
            except (LoadError, IOError):
                debug("error reading cookie file, skipping: %s", cookie_file)
            else:
                del self._delayload_domains[domain]

    def _load_cookies_from_file(self, filename):
        debug("Loading MSIE cookies file: %s", filename)
        cookies = []

        cookies_fh = open(filename)

        try:
            while 1:
                key = cookies_fh.readline()
                if key == "": break

                rl = cookies_fh.readline
                def getlong(rl=rl): return long(rl().rstrip())
                def getstr(rl=rl): return rl().rstrip()

                key = key.rstrip()
                value = getstr()
                domain_path = getstr()
                flags = getlong()  # 0x2000 bit is for secure I think
                lo_expire = getlong()
                hi_expire = getlong()
                lo_create = getlong()
                hi_create = getlong()
                sep = getstr()

                if "" in (key, value, domain_path, flags, hi_expire, lo_expire,
                          hi_create, lo_create, sep) or (sep != "*"):
                    break

                m = self.msie_domain_re.search(domain_path)
                if m:
                    domain = m.group(1)
                    path = m.group(2)

                    cookies.append({"KEY": key, "VALUE": value,
                                    "DOMAIN": domain, "PATH": path,
                                    "FLAGS": flags, "HIXP": hi_expire,
                                    "LOXP": lo_expire, "HICREATE": hi_create,
                                    "LOCREATE": lo_create})
        finally:
            cookies_fh.close()

        return cookies
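    # The loop above expects each record as nine newline-terminated
    # fields -- key, value, domain/path, flags, expire time (low then
    # high FILETIME DWORD), create time (low then high), and a literal
    # "*" separator line -- e.g. (synthetic values, for illustration):
    #
    #   SESSIONID
    #   abc123
    #   example.com/
    #   2089
    #   3254142976
    #   30085250
    #   0
    #   30085249
    #   *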
    def load_cookie_data(self, filename,
                         ignore_discard=False, ignore_expires=False):
        """Load cookies from file containing actual cookie data.

        Old cookies are kept unless overwritten by newly loaded ones.

        You should not call this method if the delayload attribute is set.

        I think each of these files contains all cookies for one user,
        domain, and path.

        filename: file containing cookies -- usually found in a file like
         C:\WINNT\Profiles\joe\Cookies\joe@blah[1].txt

        """
        now = int(time.time())

        cookie_data = self._load_cookies_from_file(filename)

        for cookie in cookie_data:
            flags = cookie["FLAGS"]
            secure = ((flags & 0x2000) != 0)
            filetime = (cookie["HIXP"] << 32) + cookie["LOXP"]
            expires = epoch_time_offset_from_win32_filetime(filetime)
            if expires < now:
                discard = True
            else:
                discard = False
            domain = cookie["DOMAIN"]
            initial_dot = domain.startswith(".")
            if initial_dot:
                domain_specified = True
            else:
                # MSIE 5 does not record whether the domain cookie-attribute
                # was specified.
                # Assuming it wasn't is conservative, because with strict
                # domain matching this will match less frequently; with
                # regular Netscape tail-matching, this will match at exactly
                # the same times that domain_specified = True would.  It also
                # means we don't have to prepend a dot to achieve consistency
                # with our own & Mozilla's domain-munging scheme.
                domain_specified = False

            # assume path_specified is false
            # XXX is there other stuff in here? -- e.g. comment, commentURL?
            c = Cookie(0,
                       cookie["KEY"], cookie["VALUE"],
                       None, False,
                       domain, domain_specified, initial_dot,
                       cookie["PATH"], False,
                       secure,
                       expires,
                       discard,
                       None,
                       None,
                       {"flags": flags})
            if not ignore_discard and c.discard:
                continue
            if not ignore_expires and c.is_expired(now):
                continue
            CookieJar.set_cookie(self, c)

    def load_from_registry(self, ignore_discard=False, ignore_expires=False,
                           username=None):
        """
        username: only required on win9x

        """
        cookies_dir = regload(self.reg_path, self.reg_key)
        filename = os.path.normpath(os.path.join(cookies_dir, "INDEX.DAT"))
        self.load(filename, ignore_discard, ignore_expires, username)

    def _really_load(self, index, filename, ignore_discard, ignore_expires,
                     username):
        now = int(time.time())

        if username is None:
            username = os.environ['USERNAME'].lower()

        cookie_dir = os.path.dirname(filename)

        data = index.read(256)
        if len(data) != 256:
            raise LoadError("%s file is too short" % filename)

        # Cookies' index.dat file starts with 32 bytes of signature
        # followed by an offset to the first record, stored as a little-
        # endian DWORD.
        sig, size, data = data[:32], data[32:36], data[36:]
        size = struct.unpack("<L", size)[0]

        # check that sig is valid
        if not self.magic_re.match(sig) or size != 0x4000:
            raise LoadError("%s ['%s' %s] does not seem to contain cookies" %
                            (str(filename), sig, size))

        # skip to start of first record
        index.seek(size, 0)

        sector = 128  # size of sector in bytes

        while 1:
            data = ""

            # Cookies are usually in two contiguous sectors, so read in two
            # sectors and adjust if not a Cookie.
            to_read = 2 * sector
            d = index.read(to_read)
            if len(d) != to_read:
                break
            data = data + d

            # Each record starts with a 4-byte signature and a count
            # (little-endian DWORD) of sectors for the record.
            sig, size, data = data[:4], data[4:8], data[8:]
            size = struct.unpack("<L", size)[0]

            to_read = (size - 2) * sector

##             from urllib import quote
##             print "data", quote(data)
##             print "sig", quote(sig)
##             print "size in sectors", size
##             print "size in bytes", size*sector
##             print "size in units of 16 bytes", (size*sector) / 16
##             print "size to read in bytes", to_read
##             print

            if sig != "URL ":
                assert sig in ("HASH", "LEAK",
                               self.padding, "\x00\x00\x00\x00"), \
                       "unrecognized MSIE index.dat record: %s" % \
                       binary_to_str(sig)
                if sig == "\x00\x00\x00\x00":
                    # assume we've got all the cookies, and stop
                    break
                if sig == self.padding:
                    continue
                # skip the rest of this record
                assert to_read >= 0
                if size != 2:
                    assert to_read != 0
                    index.seek(to_read, 1)
                continue

            # read in rest of record if necessary
            if size > 2:
                more_data = index.read(to_read)
                if len(more_data) != to_read: break
                data = data + more_data

            cookie_re = ("Cookie\:%s\@([\x21-\xFF]+).*?" % username +
                         "(%s\@[\x21-\xFF]+\.txt)" % username)
            m = re.search(cookie_re, data, re.I)
            if m:
                cookie_file = os.path.join(cookie_dir, m.group(2))
                if not self.delayload:
                    try:
                        self.load_cookie_data(cookie_file,
                                              ignore_discard, ignore_expires)
                    except (LoadError, IOError):
                        debug("error reading cookie file, skipping: %s",
                              cookie_file)
                else:
                    domain = m.group(1)
                    i = domain.find("/")
                    if i != -1:
                        domain = domain[:i]

                    self._delayload_domains[domain] = (
                        cookie_file, ignore_discard, ignore_expires)


class MSIECookieJar(MSIEBase, FileCookieJar):
    """FileCookieJar that reads from the Windows MSIE cookies database.

    MSIECookieJar can read the cookie files of Microsoft Internet Explorer
    (MSIE) for Windows version 5 on Windows NT and version 6 on Windows XP
    and Windows 98.  Other configurations may also work, but are untested.
    Saving cookies in MSIE format is NOT supported.  If you save cookies,
    they'll be in the usual Set-Cookie3 format, which you can read back in
    using an instance of the plain old CookieJar class.  Don't save using
    the same filename that you loaded cookies from, because you may succeed
    in clobbering your MSIE cookies index file!

    You should be able to have mechanize share Internet Explorer's cookies
    like this (note you need to supply a username to load_from_registry if
    you're on Windows 9x or Windows ME):

    cj = MSIECookieJar(delayload=1)
    # find cookies index file in registry and load cookies from it
    cj.load_from_registry()
    opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(cj))
    response = opener.open("http://example.com/")

    Iterating over a delayloaded MSIECookieJar instance will not cause any
    cookies to be read from disk.  To force reading of all cookies from
    disk, call read_all_cookies.  Note that the following methods iterate
    over self: clear_temporary_cookies, clear_expired_cookies, __len__,
    __repr__, __str__ and as_string.

    Additional methods:

    load_from_registry(ignore_discard=False, ignore_expires=False,
                       username=None)
    load_cookie_data(filename, ignore_discard=False, ignore_expires=False)
    read_all_cookies()

    """
    def __init__(self, filename=None, delayload=False, policy=None):
        MSIEBase.__init__(self)
        FileCookieJar.__init__(self, filename, delayload, policy)

    def set_cookie(self, cookie):
        if self.delayload:
            self._delayload_domain(cookie.domain)
        CookieJar.set_cookie(self, cookie)

    def _cookies_for_request(self, request):
        """Return a list of cookies to be returned to server."""
        domains = self._cookies.copy()
        domains.update(self._delayload_domains)
        domains = domains.keys()

        cookies = []
        for domain in domains:
            cookies.extend(self._cookies_for_domain(domain, request))
        return cookies

    def _cookies_for_domain(self, domain, request):
        if not self._policy.domain_return_ok(domain, request):
            return []
        debug("Checking %s for cookies to return", domain)
        if self.delayload:
            self._delayload_domain(domain)
        return CookieJar._cookies_for_domain(self, domain, request)

    def read_all_cookies(self):
        """Eagerly read in all cookies."""
        if self.delayload:
            for domain in self._delayload_domains.keys():
                self._delayload_domain(domain)

    def load(self, filename, ignore_discard=False, ignore_expires=False,
             username=None):
        """Load cookies from an MSIE 'index.dat' cookies index file.

        filename: full path to cookie index file
        username: only required on win9x

        """
        if filename is None:
            if self.filename is not None: filename = self.filename
            else: raise ValueError(MISSING_FILENAME_TEXT)

        index = open(filename, "rb")

        try:
            self._really_load(index, filename, ignore_discard, ignore_expires,
                              username)
        finally:
            index.close()
Binary file not shown.
@ -0,0 +1,442 @@
"""URL opener.

Copyright 2004-2006 John J Lee <jjl@pobox.com>

This code is free software; you can redistribute it and/or modify it
under the terms of the BSD or ZPL 2.1 licenses (see the file
COPYING.txt included with the distribution).

"""

import os, urllib2, bisect, httplib, types, tempfile
try:
    import threading as _threading
except ImportError:
    import dummy_threading as _threading
try:
    set
except NameError:
    import sets
    set = sets.Set

from _request import Request
import _response
import _rfc3986
import _sockettimeout
import _urllib2_fork
from _util import isstringlike

open_file = open


class ContentTooShortError(urllib2.URLError):
    def __init__(self, reason, result):
        urllib2.URLError.__init__(self, reason)
        self.result = result


def set_request_attr(req, name, value, default):
    try:
        getattr(req, name)
    except AttributeError:
        setattr(req, name, default)
    if value is not default:
        setattr(req, name, value)
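# A small illustration of set_request_attr (hypothetical values, and the
# helper name _set_request_attr_example is made up): it installs `default`
# when the request lacks the attribute, then overrides only when the
# caller passed something other than the default -- so plain
# urllib2.Request objects work alongside mechanize's Request.
def _set_request_attr_example():
    req = Request("http://example.com/")
    set_request_attr(req, "visit", None, None)   # just installs the default
    set_request_attr(req, "visit", True, None)   # caller's value wins
    assert req.visit is True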

class OpenerDirector(_urllib2_fork.OpenerDirector):
    def __init__(self):
        _urllib2_fork.OpenerDirector.__init__(self)
        # really none of these are (sanely) public -- the lack of initial
        # underscore on some is just due to following urllib2
        self.process_response = {}
        self.process_request = {}
        self._any_request = {}
        self._any_response = {}
        self._handler_index_valid = True
        self._tempfiles = []

    def add_handler(self, handler):
        if not hasattr(handler, "add_parent"):
            raise TypeError("expected BaseHandler instance, got %r" %
                            type(handler))

        if handler in self.handlers:
            return
        # XXX why does self.handlers need to be sorted?
        bisect.insort(self.handlers, handler)
        handler.add_parent(self)
        self._handler_index_valid = False

    def _maybe_reindex_handlers(self):
        if self._handler_index_valid:
            return

        handle_error = {}
        handle_open = {}
        process_request = {}
        process_response = {}
        any_request = set()
        any_response = set()
        unwanted = []

        for handler in self.handlers:
            added = False
            for meth in dir(handler):
                if meth in ["redirect_request", "do_open", "proxy_open"]:
                    # oops, coincidental match
                    continue

                if meth == "any_request":
                    any_request.add(handler)
                    added = True
                    continue
                elif meth == "any_response":
                    any_response.add(handler)
                    added = True
                    continue

                ii = meth.find("_")
                scheme = meth[:ii]
                condition = meth[ii+1:]

                if condition.startswith("error"):
                    jj = meth[ii+1:].find("_") + ii + 1
                    kind = meth[jj+1:]
                    try:
                        kind = int(kind)
                    except ValueError:
                        pass
                    lookup = handle_error.setdefault(scheme, {})
                elif condition == "open":
                    kind = scheme
                    lookup = handle_open
                elif condition == "request":
                    kind = scheme
                    lookup = process_request
                elif condition == "response":
                    kind = scheme
                    lookup = process_response
                else:
                    continue

                lookup.setdefault(kind, set()).add(handler)
                added = True

            if not added:
                unwanted.append(handler)

        for handler in unwanted:
            self.handlers.remove(handler)

        # sort indexed methods
        # XXX could be cleaned up
        for lookup in [process_request, process_response]:
            for scheme, handlers in lookup.iteritems():
                lookup[scheme] = handlers
        for scheme, lookup in handle_error.iteritems():
            for code, handlers in lookup.iteritems():
                handlers = list(handlers)
                handlers.sort()
                lookup[code] = handlers
        for scheme, handlers in handle_open.iteritems():
            handlers = list(handlers)
            handlers.sort()
            handle_open[scheme] = handlers

        # cache the indexes
        self.handle_error = handle_error
        self.handle_open = handle_open
        self.process_request = process_request
        self.process_response = process_response
        self._any_request = any_request
        self._any_response = any_response
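    # How handler method names map into the indexes built above, for
    # reference (derived from the parsing logic in this method):
    #
    #   http_open      -> handle_open["http"]
    #   http_request   -> process_request["http"]
    #   http_response  -> process_response["http"]
    #   http_error_302 -> handle_error["http"][302]
    #   any_request    -> _any_request (consulted for every scheme)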
    def _request(self, url_or_req, data, visit,
                 timeout=_sockettimeout._GLOBAL_DEFAULT_TIMEOUT):
        if isstringlike(url_or_req):
            req = Request(url_or_req, data, visit=visit, timeout=timeout)
        else:
            # already a mechanize.Request instance
            req = url_or_req
            if data is not None:
                req.add_data(data)
            # XXX yuck
            set_request_attr(req, "visit", visit, None)
            set_request_attr(req, "timeout", timeout,
                             _sockettimeout._GLOBAL_DEFAULT_TIMEOUT)
        return req

    def open(self, fullurl, data=None,
             timeout=_sockettimeout._GLOBAL_DEFAULT_TIMEOUT):
        req = self._request(fullurl, data, None, timeout)
        req_scheme = req.get_type()

        self._maybe_reindex_handlers()

        # pre-process request
        # XXX should we allow a Processor to change the URL scheme
        #   of the request?
        request_processors = set(self.process_request.get(req_scheme, []))
        request_processors.update(self._any_request)
        request_processors = list(request_processors)
        request_processors.sort()
        for processor in request_processors:
            for meth_name in ["any_request", req_scheme+"_request"]:
                meth = getattr(processor, meth_name, None)
                if meth:
                    req = meth(req)

        # In Python >= 2.4, .open() supports processors already, so we must
        # call ._open() instead.
        urlopen = _urllib2_fork.OpenerDirector._open
        response = urlopen(self, req, data)

        # post-process response
        response_processors = set(self.process_response.get(req_scheme, []))
        response_processors.update(self._any_response)
        response_processors = list(response_processors)
        response_processors.sort()
        for processor in response_processors:
            for meth_name in ["any_response", req_scheme+"_response"]:
                meth = getattr(processor, meth_name, None)
                if meth:
                    response = meth(req, response)

        return response

    def error(self, proto, *args):
        if proto in ['http', 'https']:
            # XXX http[s] protocols are special-cased
            dict = self.handle_error['http']  # https is not different than http
            proto = args[2]  # YUCK!
            meth_name = 'http_error_%s' % proto
            http_err = 1
            orig_args = args
        else:
            dict = self.handle_error
            meth_name = proto + '_error'
            http_err = 0
        args = (dict, proto, meth_name) + args
        result = apply(self._call_chain, args)
        if result:
            return result

        if http_err:
            args = (dict, 'default', 'http_error_default') + orig_args
            return apply(self._call_chain, args)

    BLOCK_SIZE = 1024*8
    def retrieve(self, fullurl, filename=None, reporthook=None, data=None,
                 timeout=_sockettimeout._GLOBAL_DEFAULT_TIMEOUT,
                 open=open_file):
        """Returns (filename, headers).

        For remote objects, the default filename will refer to a temporary
        file.  Temporary files are removed when the OpenerDirector.close()
        method is called.

        For file: URLs, at present the returned filename is None.  This may
        change in future.

        If the actual number of bytes read is less than indicated by the
        Content-Length header, raises ContentTooShortError (a URLError
        subclass).  The exception's .result attribute contains the (filename,
        headers) that would have been returned.

        """
        req = self._request(fullurl, data, False, timeout)
        scheme = req.get_type()
        fp = self.open(req)
        try:
            headers = fp.info()
            if filename is None and scheme == 'file':
                # XXX req.get_selector() seems broken here, return None,
                #   pending sanity :-/
                return None, headers
                #return urllib.url2pathname(req.get_selector()), headers
            if filename:
                tfp = open(filename, 'wb')
            else:
                path = _rfc3986.urlsplit(req.get_full_url())[2]
                suffix = os.path.splitext(path)[1]
                fd, filename = tempfile.mkstemp(suffix)
                self._tempfiles.append(filename)
                tfp = os.fdopen(fd, 'wb')
            try:
                result = filename, headers
                bs = self.BLOCK_SIZE
                size = -1
                read = 0
                blocknum = 0
                if reporthook:
                    if "content-length" in headers:
                        size = int(headers["Content-Length"])
                    reporthook(blocknum, bs, size)
                while 1:
                    block = fp.read(bs)
                    if block == "":
                        break
                    read += len(block)
                    tfp.write(block)
                    blocknum += 1
                    if reporthook:
                        reporthook(blocknum, bs, size)
            finally:
                tfp.close()
        finally:
            fp.close()

        # raise exception if actual size does not match content-length header
        if size >= 0 and read < size:
            raise ContentTooShortError(
                "retrieval incomplete: "
                "got only %i out of %i bytes" % (read, size),
                result
                )

        return result
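    # A reporthook sketch for retrieve() (illustrative; the names below
    # are made up): the hook is called with the block number, block size
    # and total size, where total size is -1 when the server sent no
    # Content-Length header.
    #
    #   def print_progress(blocknum, block_size, total_size):
    #       if total_size > 0:
    #           done = min(100, 100 * blocknum * block_size // total_size)
    #           print "fetched %d%%" % done
    #
    #   filename, headers = opener.retrieve("http://example.com/big.bin",
    #                                       reporthook=print_progress)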
    def close(self):
        _urllib2_fork.OpenerDirector.close(self)

        # make it very obvious this object is no longer supposed to be used
        self.open = self.error = self.retrieve = self.add_handler = None

        if self._tempfiles:
            for filename in self._tempfiles:
                try:
                    os.unlink(filename)
                except OSError:
                    pass
            del self._tempfiles[:]


def wrapped_open(urlopen, process_response_object, fullurl, data=None,
                 timeout=_sockettimeout._GLOBAL_DEFAULT_TIMEOUT):
    success = True
    try:
        response = urlopen(fullurl, data, timeout)
    except urllib2.HTTPError, error:
        success = False
        if error.fp is None:  # not a response
            raise
        response = error

    if response is not None:
        response = process_response_object(response)

    if not success:
        raise response
    return response

class ResponseProcessingOpener(OpenerDirector):

    def open(self, fullurl, data=None,
             timeout=_sockettimeout._GLOBAL_DEFAULT_TIMEOUT):
        def bound_open(fullurl, data=None,
                       timeout=_sockettimeout._GLOBAL_DEFAULT_TIMEOUT):
            return OpenerDirector.open(self, fullurl, data, timeout)
        return wrapped_open(
            bound_open, self.process_response_object, fullurl, data, timeout)

    def process_response_object(self, response):
        return response


class SeekableResponseOpener(ResponseProcessingOpener):
    def process_response_object(self, response):
        return _response.seek_wrapped_response(response)


def isclass(obj):
    return isinstance(obj, (types.ClassType, type))


class OpenerFactory:
    """This class's interface is quite likely to change."""

    default_classes = [
        # handlers
        _urllib2_fork.ProxyHandler,
        _urllib2_fork.UnknownHandler,
        _urllib2_fork.HTTPHandler,
        _urllib2_fork.HTTPDefaultErrorHandler,
        _urllib2_fork.HTTPRedirectHandler,
        _urllib2_fork.FTPHandler,
        _urllib2_fork.FileHandler,
        # processors
        _urllib2_fork.HTTPCookieProcessor,
        _urllib2_fork.HTTPErrorProcessor,
        ]
    if hasattr(httplib, 'HTTPS'):
        default_classes.append(_urllib2_fork.HTTPSHandler)
    handlers = []
    replacement_handlers = []

    def __init__(self, klass=OpenerDirector):
        self.klass = klass

    def build_opener(self, *handlers):
        """Create an opener object from a list of handlers and processors.

        The opener will use several default handlers and processors, including
        support for HTTP and FTP.

        If any of the handlers passed as arguments are subclasses of the
        default handlers, the default handlers will not be used.

        """
        opener = self.klass()
        default_classes = list(self.default_classes)
        skip = set()
        for klass in default_classes:
            for check in handlers:
                if isclass(check):
                    if issubclass(check, klass):
                        skip.add(klass)
                elif isinstance(check, klass):
                    skip.add(klass)
        for klass in skip:
            default_classes.remove(klass)

        for klass in default_classes:
            opener.add_handler(klass())
        for h in handlers:
            if isclass(h):
                h = h()
            opener.add_handler(h)

        return opener


build_opener = OpenerFactory().build_opener

_opener = None
urlopen_lock = _threading.Lock()
def urlopen(url, data=None, timeout=_sockettimeout._GLOBAL_DEFAULT_TIMEOUT):
    global _opener
    if _opener is None:
        urlopen_lock.acquire()
        try:
            if _opener is None:
                _opener = build_opener()
        finally:
            urlopen_lock.release()
    return _opener.open(url, data, timeout)

def urlretrieve(url, filename=None, reporthook=None, data=None,
                timeout=_sockettimeout._GLOBAL_DEFAULT_TIMEOUT):
    global _opener
    if _opener is None:
        urlopen_lock.acquire()
        try:
            if _opener is None:
                _opener = build_opener()
        finally:
            urlopen_lock.release()
    return _opener.retrieve(url, filename, reporthook, data, timeout)

def install_opener(opener):
    global _opener
    _opener = opener
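# Module-level usage sketch (illustrative; example.com is a stand-in and
# the helper name _opener_example is made up): build_opener() wires up
# the default handlers, and OpenerFactory can be pointed at
# SeekableResponseOpener when .seek()able responses are wanted.
def _opener_example():
    opener = OpenerFactory(SeekableResponseOpener).build_opener()
    response = opener.open("http://example.com/")
    data = response.read()
    response.seek(0)
    assert response.read() == data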
Binary file not shown.
@ -0,0 +1,391 @@
"""A simple "pull API" for HTML parsing, after Perl's HTML::TokeParser.

Examples

This program extracts all links from a document.  It will print one
line for each link, containing the URL and the textual description
between the <A>...</A> tags:

import pullparser, sys
f = file(sys.argv[1])
p = pullparser.PullParser(f)
for token in p.tags("a"):
    if token.type == "endtag": continue
    url = dict(token.attrs).get("href", "-")
    text = p.get_compressed_text(endat=("endtag", "a"))
    print "%s\t%s" % (url, text)

This program extracts the <TITLE> from the document:

import pullparser, sys
f = file(sys.argv[1])
p = pullparser.PullParser(f)
if p.get_tag("title"):
    title = p.get_compressed_text()
    print "Title: %s" % title


Copyright 2003-2006 John J. Lee <jjl@pobox.com>
Copyright 1998-2001 Gisle Aas (original libwww-perl code)

This code is free software; you can redistribute it and/or modify it
under the terms of the BSD or ZPL 2.1 licenses.

"""

import re, htmlentitydefs
import _sgmllib_copy as sgmllib
import HTMLParser
from xml.sax import saxutils

from _html import unescape, unescape_charref


class NoMoreTokensError(Exception): pass

class Token:
    """Represents an HTML tag, declaration, processing instruction etc.

    Behaves as both a tuple-like object (ie. iterable) and has attributes
    .type, .data and .attrs.

    >>> t = Token("starttag", "a", [("href", "http://www.python.org/")])
    >>> t == ("starttag", "a", [("href", "http://www.python.org/")])
    True
    >>> (t.type, t.data) == ("starttag", "a")
    True
    >>> t.attrs == [("href", "http://www.python.org/")]
    True

    Public attributes

    type: one of "starttag", "endtag", "startendtag", "charref", "entityref",
     "data", "comment", "decl", "pi", after the corresponding methods of
     HTMLParser.HTMLParser
    data: For a tag, the tag name; otherwise, the relevant data carried by
     the tag, as a string
    attrs: list of (name, value) pairs representing HTML attributes
     (or None if token does not represent an opening tag)

    """
    def __init__(self, type, data, attrs=None):
        self.type = type
        self.data = data
        self.attrs = attrs
    def __iter__(self):
        return iter((self.type, self.data, self.attrs))
    def __eq__(self, other):
        type, data, attrs = other
        if (self.type == type and
            self.data == data and
            self.attrs == attrs):
            return True
        else:
            return False
    def __ne__(self, other): return not self.__eq__(other)
    def __repr__(self):
        args = ", ".join(map(repr, [self.type, self.data, self.attrs]))
        return self.__class__.__name__+"(%s)" % args

    def __str__(self):
        """
        >>> print Token("starttag", "br")
        <br>
        >>> print Token("starttag", "a",
        ... [("href", "http://www.python.org/"), ("alt", '"foo"')])
        <a href="http://www.python.org/" alt='"foo"'>
        >>> print Token("startendtag", "br")
        <br />
        >>> print Token("startendtag", "br", [("spam", "eggs")])
        <br spam="eggs" />
        >>> print Token("endtag", "p")
        </p>
        >>> print Token("charref", "38")
        &#38;
        >>> print Token("entityref", "amp")
        &amp;
        >>> print Token("data", "foo\\nbar")
        foo
        bar
        >>> print Token("comment", "Life is a bowl\\nof cherries.")
        <!--Life is a bowl
        of cherries.-->
        >>> print Token("decl", "decl")
        <!decl>
        >>> print Token("pi", "pi")
        <?pi>
        """
        if self.attrs is not None:
            attrs = "".join([" %s=%s" % (k, saxutils.quoteattr(v)) for
                             k, v in self.attrs])
        else:
            attrs = ""
        if self.type == "starttag":
            return "<%s%s>" % (self.data, attrs)
        elif self.type == "startendtag":
            return "<%s%s />" % (self.data, attrs)
        elif self.type == "endtag":
            return "</%s>" % self.data
        elif self.type == "charref":
            return "&#%s;" % self.data
        elif self.type == "entityref":
            return "&%s;" % self.data
        elif self.type == "data":
            return self.data
        elif self.type == "comment":
            return "<!--%s-->" % self.data
        elif self.type == "decl":
            return "<!%s>" % self.data
        elif self.type == "pi":
            return "<?%s>" % self.data
        assert False


def iter_until_exception(fn, exception, *args, **kwds):
    while 1:
        try:
            yield fn(*args, **kwds)
        except exception:
            raise StopIteration


class _AbstractParser:
    chunk = 1024
    compress_re = re.compile(r"\s+")
    def __init__(self, fh, textify={"img": "alt", "applet": "alt"},
                 encoding="ascii", entitydefs=None):
        """
        fh: file-like object (only a .read() method is required) from which
         to read HTML to be parsed
        textify: mapping used by .get_text() and .get_compressed_text()
         methods to represent opening tags as text
        encoding: encoding used to encode numeric character references by
         .get_text() and .get_compressed_text() ("ascii" by default)

        entitydefs: mapping like {"amp": "&", ...} containing HTML entity
         definitions (a sensible default is used).  This is used to unescape
         entities in .get_text() (and .get_compressed_text()) and attribute
         values.  If the encoding can not represent the character, the entity
         reference is left unescaped.  Note that entity references (both
         numeric - e.g. &#123; or &#xabc; - and non-numeric - e.g. &amp;) are
         unescaped in attribute values and the return value of .get_text(),
         but not in data outside of tags.  Instead, entity references outside
         of tags are represented as tokens.  This is a bit odd, it's true :-/

        If the element name of an opening tag matches a key in the textify
        mapping then that tag is converted to text.  The corresponding value
        is used to specify which tag attribute to obtain the text from.
        textify maps from element names to either:

          - an HTML attribute name, in which case the HTML attribute value is
            used as its text value along with the element name in square
            brackets (e.g. "alt text goes here[IMG]", or, if the alt
            attribute were missing, just "[IMG]")
          - a callable object (e.g. a function) which takes a Token and
            returns the string to be used as its text value

        If textify has no key for an element name, nothing is substituted
        for the opening tag.

        Public attributes:

        encoding and textify: see above

        """
        self._fh = fh
        self._tokenstack = []  # FIFO
        self.textify = textify
        self.encoding = encoding
        if entitydefs is None:
            entitydefs = htmlentitydefs.name2codepoint
        self._entitydefs = entitydefs

    def __iter__(self): return self

    def tags(self, *names):
        return iter_until_exception(self.get_tag, NoMoreTokensError, *names)

    def tokens(self, *tokentypes):
        return iter_until_exception(self.get_token, NoMoreTokensError,
                                    *tokentypes)

    def next(self):
        try:
            return self.get_token()
        except NoMoreTokensError:
            raise StopIteration()

    def get_token(self, *tokentypes):
        """Pop the next Token object from the stack of parsed tokens.

        If arguments are given, they are taken to be token types in which the
        caller is interested: tokens representing other elements will be
        skipped.  Element names must be given in lower case.

        Raises NoMoreTokensError.

        """
        while 1:
            while self._tokenstack:
                token = self._tokenstack.pop(0)
                if tokentypes:
                    if token.type in tokentypes:
                        return token
                else:
                    return token
            data = self._fh.read(self.chunk)
            if not data:
                raise NoMoreTokensError()
            self.feed(data)

    def unget_token(self, token):
        """Push a Token back onto the stack."""
        self._tokenstack.insert(0, token)

    def get_tag(self, *names):
        """Return the next Token that represents an opening or closing tag.

        If arguments are given, they are taken to be element names in which
        the caller is interested: tags representing other elements will be
        skipped.  Element names must be given in lower case.

        Raises NoMoreTokensError.

        """
        while 1:
            tok = self.get_token()
            if tok.type not in ["starttag", "endtag", "startendtag"]:
                continue
            if names:
                if tok.data in names:
                    return tok
            else:
                return tok

    def get_text(self, endat=None):
        """Get some text.

        endat: stop reading text at this tag (the tag is included in the
         returned text); endat is a tuple (type, name) where type is
         "starttag", "endtag" or "startendtag", and name is the element name
         of the tag (element names must be given in lower case)

        If endat is not given, .get_text() will stop at the next opening or
        closing tag, or when there are no more tokens (no exception is
        raised).  Note that .get_text() includes the text representation (if
        any) of the opening tag, but pushes the opening tag back onto the
        stack.  As a result, if you want to call .get_text() again, you need
        to call .get_tag() first (unless you want an empty string returned
        when you next call .get_text()).

        Entity references are translated using the value of the entitydefs
        constructor argument (a mapping from names to characters like that
        provided by the standard module htmlentitydefs).  Named entity
        references that are not in this mapping are left unchanged.

        The textify attribute is used to translate opening tags into text:
        see the class docstring.

        """
        text = []
        tok = None
        while 1:
            try:
                tok = self.get_token()
            except NoMoreTokensError:
                # unget last token (not the one we just failed to get)
                if tok: self.unget_token(tok)
                break
            if tok.type == "data":
                text.append(tok.data)
            elif tok.type == "entityref":
                t = unescape("&%s;" % tok.data, self._entitydefs,
                             self.encoding)
                text.append(t)
            elif tok.type == "charref":
                t = unescape_charref(tok.data, self.encoding)
                text.append(t)
            elif tok.type in ["starttag", "endtag", "startendtag"]:
                tag_name = tok.data
                if tok.type in ["starttag", "startendtag"]:
                    alt = self.textify.get(tag_name)
                    if alt is not None:
                        if callable(alt):
                            text.append(alt(tok))
                        elif tok.attrs is not None:
                            for k, v in tok.attrs:
                                if k == alt:
                                    text.append(v)
                            text.append("[%s]" % tag_name.upper())
                if endat is None or endat == (tok.type, tag_name):
                    self.unget_token(tok)
                    break
        return "".join(text)

    def get_compressed_text(self, *args, **kwds):
        """
        As .get_text(), but collapses each group of contiguous whitespace to
        a single space character, and removes all initial and trailing
        whitespace.

        """
        text = self.get_text(*args, **kwds)
        text = text.strip()
        return self.compress_re.sub(" ", text)
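    # For example, where .get_text() would return "  Hello,\n   world  ",
    # .get_compressed_text() returns "Hello, world".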
    def handle_startendtag(self, tag, attrs):
        self._tokenstack.append(Token("startendtag", tag, attrs))
    def handle_starttag(self, tag, attrs):
        self._tokenstack.append(Token("starttag", tag, attrs))
    def handle_endtag(self, tag):
        self._tokenstack.append(Token("endtag", tag))
    def handle_charref(self, name):
        self._tokenstack.append(Token("charref", name))
    def handle_entityref(self, name):
        self._tokenstack.append(Token("entityref", name))
    def handle_data(self, data):
        self._tokenstack.append(Token("data", data))
    def handle_comment(self, data):
        self._tokenstack.append(Token("comment", data))
    def handle_decl(self, decl):
        self._tokenstack.append(Token("decl", decl))
    def unknown_decl(self, data):
        # XXX should this call self.error instead?
        #self.error("unknown declaration: " + `data`)
        self._tokenstack.append(Token("decl", data))
    def handle_pi(self, data):
        self._tokenstack.append(Token("pi", data))

    def unescape_attr(self, name):
        return unescape(name, self._entitydefs, self.encoding)
    def unescape_attrs(self, attrs):
        escaped_attrs = []
        for key, val in attrs:
            escaped_attrs.append((key, self.unescape_attr(val)))
        return escaped_attrs

class PullParser(_AbstractParser, HTMLParser.HTMLParser):
    def __init__(self, *args, **kwds):
        HTMLParser.HTMLParser.__init__(self)
        _AbstractParser.__init__(self, *args, **kwds)
    def unescape(self, name):
        # Use the entitydefs passed into constructor, not
        # HTMLParser.HTMLParser's entitydefs.
        return self.unescape_attr(name)

class TolerantPullParser(_AbstractParser, sgmllib.SGMLParser):
    def __init__(self, *args, **kwds):
        sgmllib.SGMLParser.__init__(self)
        _AbstractParser.__init__(self, *args, **kwds)
    def unknown_starttag(self, tag, attrs):
        attrs = self.unescape_attrs(attrs)
        self._tokenstack.append(Token("starttag", tag, attrs))
    def unknown_endtag(self, tag):
        self._tokenstack.append(Token("endtag", tag))
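# A sketch of the tolerant parser on sloppy real-world HTML (the file
# name and helper name are made up): token.type distinguishes opening
# from closing tags, and token.attrs holds the (name, value) pairs.
def _tolerant_parser_example():
    p = TolerantPullParser(open("page.html"))
    for token in p.tags("img"):
        if token.type in ["starttag", "startendtag"]:
            print dict(token.attrs).get("src")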

def _test():
    import doctest, _pullparser
    return doctest.testmod(_pullparser)

if __name__ == "__main__":
    _test()
Binary file not shown.
@ -0,0 +1,40 @@
"""Integration with Python standard library module urllib2: Request class.

Copyright 2004-2006 John J Lee <jjl@pobox.com>

This code is free software; you can redistribute it and/or modify it
under the terms of the BSD or ZPL 2.1 licenses (see the file
COPYING.txt included with the distribution).

"""

import logging

import _rfc3986
import _sockettimeout
import _urllib2_fork

warn = logging.getLogger("mechanize").warning


class Request(_urllib2_fork.Request):
    def __init__(self, url, data=None, headers={},
                 origin_req_host=None, unverifiable=False, visit=None,
                 timeout=_sockettimeout._GLOBAL_DEFAULT_TIMEOUT):
        # In mechanize 0.2, the interpretation of a unicode url argument will
        # change: A unicode url argument will be interpreted as an IRI, and a
        # bytestring as a URI.  For now, we accept unicode or bytestring.  We
        # don't insist that the value is always a URI (specifically, must
        # only contain characters which are legal), because that might break
        # working code (who knows what bytes some servers want to see,
        # especially with browser plugins for internationalised URIs).
        if not _rfc3986.is_clean_uri(url):
            warn("url argument is not a URI "
                 "(contains illegal characters) %r" % url)
        _urllib2_fork.Request.__init__(self, url, data, headers)
        self.selector = None
        self.visit = visit
        self.timeout = timeout

    def __str__(self):
        return "<Request for %s>" % self.get_full_url()
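# Construction sketch (illustrative; the URL and helper name are made
# up): visit marks whether opening the request should count as
# "visiting" it at the Browser level, and timeout falls back to the
# module-wide default when not given.
def _request_example():
    req = Request("http://example.com/search", data="q=42",
                  headers={"User-agent": "my-robot/0.1"}, visit=False)
    print req   # <Request for http://example.com/search>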
Binary file not shown.
@ -0,0 +1,525 @@
"""Response classes.

The seek_wrapper code is not used if you're using UserAgent with
.set_seekable_responses(False), or if you're using the urllib2-level
interface HTTPEquivProcessor.  Class closeable_response is instantiated by
some handlers (AbstractHTTPHandler), but the closeable_response interface is
only depended upon by Browser-level code.  Function upgrade_response is only
used if you're using Browser.


Copyright 2006 John J. Lee <jjl@pobox.com>

This code is free software; you can redistribute it and/or modify it
under the terms of the BSD or ZPL 2.1 licenses (see the file COPYING.txt
included with the distribution).

"""

import copy, mimetools, urllib2
from cStringIO import StringIO


def len_of_seekable(file_):
    # this function exists because evaluation of len(file_.getvalue()) on
    # every .read() from seek_wrapper would be O(N**2) in number of .read()s
    pos = file_.tell()
    file_.seek(0, 2)  # to end
    try:
        return file_.tell()
    finally:
        file_.seek(pos)


# XXX Andrew Dalke kindly sent me a similar class in response to my request
# on comp.lang.python, which I then proceeded to lose.  I wrote this class
# instead, but I think he's released his code publicly since, could pinch
# the tests from it, at least...

# For testing seek_wrapper invariant (note that
# test_urllib2.HandlerTest.test_seekable is expected to fail when this
# invariant checking is turned on).  The invariant checking is done by
# module ipdc, which is available here:
# http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/436834
## from ipdbc import ContractBase
## class seek_wrapper(ContractBase):
class seek_wrapper:
    """Adds a seek method to a file object.

    This is only designed for seeking on readonly file-like objects.

    Wrapped file-like object must have a read method.  The readline method
    is only supported if that method is present on the wrapped object.  The
    readlines method is always supported.  xreadlines and iteration are
    supported only for Python 2.2 and above.

    Public attributes:

    wrapped: the wrapped file object
    is_closed: true iff .close() has been called

    WARNING: All other attributes of the wrapped object (ie. those that are
    not one of wrapped, read, readline, readlines, xreadlines, __iter__ and
    next) are passed through unaltered, which may or may not make sense for
    your particular file object.

    """
    # General strategy is to check that cache is full enough, then delegate
    # to the cache (self.__cache, which is a cStringIO.StringIO instance).
    # A seek position (self.__pos) is maintained independently of the cache,
    # in order that a single cache may be shared between multiple
    # seek_wrapper objects.  Copying using module copy shares the cache in
    # this way.

    def __init__(self, wrapped):
        self.wrapped = wrapped
        self.__read_complete_state = [False]
        self.__is_closed_state = [False]
        self.__have_readline = hasattr(self.wrapped, "readline")
        self.__cache = StringIO()
        self.__pos = 0  # seek position

    def invariant(self):
        # The end of the cache is always at the same place as the end of the
        # wrapped file (though the .tell() method is not required to be
        # present on wrapped file).
        return self.wrapped.tell() == len(self.__cache.getvalue())

    def close(self):
        self.wrapped.close()
        self.is_closed = True

    def __getattr__(self, name):
        if name == "is_closed":
            return self.__is_closed_state[0]
        elif name == "read_complete":
            return self.__read_complete_state[0]

        wrapped = self.__dict__.get("wrapped")
        if wrapped:
            return getattr(wrapped, name)

        return getattr(self.__class__, name)

    def __setattr__(self, name, value):
        if name == "is_closed":
            self.__is_closed_state[0] = bool(value)
        elif name == "read_complete":
            if not self.is_closed:
                self.__read_complete_state[0] = bool(value)
        else:
            self.__dict__[name] = value

    def seek(self, offset, whence=0):
        assert whence in [0, 1, 2]

        # how much data, if any, do we need to read?
        if whence == 2:  # 2: relative to end of *wrapped* file
            if offset < 0: raise ValueError("negative seek offset")
            # since we don't know yet where the end of that file is, we must
            # read everything
            to_read = None
        else:
            if whence == 0:  # 0: absolute
                if offset < 0: raise ValueError("negative seek offset")
                dest = offset
            else:  # 1: relative to current position
                pos = self.__pos
                if pos + offset < 0:
                    raise ValueError("seek to before start of file")
                dest = pos + offset
            end = len_of_seekable(self.__cache)
            to_read = dest - end
            if to_read < 0:
                to_read = 0

        if to_read != 0:
            self.__cache.seek(0, 2)
            if to_read is None:
                assert whence == 2
                self.__cache.write(self.wrapped.read())
                self.read_complete = True
                self.__pos = self.__cache.tell() - offset
            else:
                data = self.wrapped.read(to_read)
                if not data:
                    self.read_complete = True
                else:
                    self.__cache.write(data)
                # Don't raise an exception even if we've seek()ed past the
                # end of .wrapped, since fseek() doesn't complain in that
                # case.  Also like fseek(), pretend we have seek()ed past
                # the end, i.e. not:
                #self.__pos = self.__cache.tell()
                # but rather:
                self.__pos = dest
        else:
            self.__pos = dest

    def tell(self):
        return self.__pos

    def __copy__(self):
        cpy = self.__class__(self.wrapped)
        cpy.__cache = self.__cache
        cpy.__read_complete_state = self.__read_complete_state
        cpy.__is_closed_state = self.__is_closed_state
        return cpy

    def get_data(self):
        pos = self.__pos
        try:
            self.seek(0)
            return self.read(-1)
        finally:
            self.__pos = pos
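    # E.g. wrapping a StringIO (illustrative):
    #
    #   w = seek_wrapper(StringIO("abcdef"))
    #   w.read(2)      # -> "ab"
    #   w.get_data()   # -> "abcdef"; the read position is preserved
    #   w.read(2)      # -> "cd"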
|
||||
|
||||
def read(self, size=-1):
|
||||
pos = self.__pos
|
||||
end = len_of_seekable(self.__cache)
|
||||
available = end - pos
|
||||
|
||||
# enough data already cached?
|
||||
if size <= available and size != -1:
|
||||
self.__cache.seek(pos)
|
||||
self.__pos = pos+size
|
||||
return self.__cache.read(size)
|
||||
|
||||
# no, so read sufficient data from wrapped file and cache it
|
||||
self.__cache.seek(0, 2)
|
||||
if size == -1:
|
||||
self.__cache.write(self.wrapped.read())
|
||||
self.read_complete = True
|
||||
else:
|
||||
to_read = size - available
|
||||
assert to_read > 0
|
||||
data = self.wrapped.read(to_read)
|
||||
if not data:
|
||||
self.read_complete = True
|
||||
else:
|
||||
self.__cache.write(data)
|
||||
self.__cache.seek(pos)
|
||||
|
||||
data = self.__cache.read(size)
|
||||
self.__pos = self.__cache.tell()
|
||||
assert self.__pos == pos + len(data)
|
||||
return data
|
||||
|
||||
def readline(self, size=-1):
|
||||
if not self.__have_readline:
|
||||
raise NotImplementedError("no readline method on wrapped object")
|
||||
|
||||
# line we're about to read might not be complete in the cache, so
|
||||
# read another line first
|
||||
pos = self.__pos
|
||||
self.__cache.seek(0, 2)
|
||||
data = self.wrapped.readline()
|
||||
if not data:
|
||||
self.read_complete = True
|
||||
else:
|
||||
self.__cache.write(data)
|
||||
self.__cache.seek(pos)
|
||||
|
||||
data = self.__cache.readline()
|
||||
if size != -1:
|
||||
r = data[:size]
|
||||
self.__pos = pos+size
|
||||
else:
|
||||
r = data
|
||||
self.__pos = pos+len(data)
|
||||
return r
|
||||
|
||||
def readlines(self, sizehint=-1):
|
||||
pos = self.__pos
|
||||
self.__cache.seek(0, 2)
|
||||
self.__cache.write(self.wrapped.read())
|
||||
self.read_complete = True
|
||||
self.__cache.seek(pos)
|
||||
data = self.__cache.readlines(sizehint)
|
||||
self.__pos = self.__cache.tell()
|
||||
return data
|
||||
|
||||
def __iter__(self): return self
|
||||
def next(self):
|
||||
line = self.readline()
|
||||
if line == "": raise StopIteration
|
||||
return line
|
||||
|
||||
xreadlines = __iter__
|
||||
|
||||
def __repr__(self):
|
||||
return ("<%s at %s whose wrapped object = %r>" %
|
||||
(self.__class__.__name__, hex(abs(id(self))), self.wrapped))
|
||||
|
||||
|
||||
class response_seek_wrapper(seek_wrapper):
|
||||
|
||||
"""
|
||||
Supports copying response objects and setting response body data.
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, wrapped):
|
||||
seek_wrapper.__init__(self, wrapped)
|
||||
self._headers = self.wrapped.info()
|
||||
|
||||
def __copy__(self):
|
||||
cpy = seek_wrapper.__copy__(self)
|
||||
# copy headers from delegate
|
||||
cpy._headers = copy.copy(self.info())
|
||||
return cpy
|
||||
|
||||
# Note that .info() and .geturl() (the only two urllib2 response methods
|
||||
# that are not implemented by seek_wrapper) must be here explicitly rather
|
||||
# than by seek_wrapper's __getattr__ delegation) so that the nasty
|
||||
# dynamically-created HTTPError classes in get_seek_wrapper_class() get the
|
||||
# wrapped object's implementation, and not HTTPError's.
|
||||
|
||||
def info(self):
|
||||
return self._headers
|
||||
|
||||
def geturl(self):
|
||||
return self.wrapped.geturl()
|
||||
|
||||
def set_data(self, data):
|
||||
self.seek(0)
|
||||
self.read()
|
||||
self.close()
|
||||
cache = self._seek_wrapper__cache = StringIO()
|
||||
cache.write(data)
|
||||
self.seek(0)
|
||||
|
||||
|
||||
class eoffile:
|
||||
# file-like object that always claims to be at end-of-file...
|
||||
def read(self, size=-1): return ""
|
||||
def readline(self, size=-1): return ""
|
||||
def __iter__(self): return self
|
||||
def next(self): return ""
|
||||
def close(self): pass
|
||||
|
||||
class eofresponse(eoffile):
|
||||
def __init__(self, url, headers, code, msg):
|
||||
self._url = url
|
||||
self._headers = headers
|
||||
self.code = code
|
||||
self.msg = msg
|
||||
def geturl(self): return self._url
|
||||
def info(self): return self._headers
|
||||
|
||||
|
||||
class closeable_response:
|
||||
"""Avoids unnecessarily clobbering urllib.addinfourl methods on .close().
|
||||
|
||||
Only supports responses returned by mechanize.HTTPHandler.
|
||||
|
||||
After .close(), the following methods are supported:
|
||||
|
||||
.read()
|
||||
.readline()
|
||||
.info()
|
||||
.geturl()
|
||||
.__iter__()
|
||||
.next()
|
||||
.close()
|
||||
|
||||
and the following attributes are supported:
|
||||
|
||||
.code
|
||||
.msg
|
||||
|
||||
Also supports pickling (but the stdlib currently does something to prevent
|
||||
it: http://python.org/sf/1144636).
|
||||
|
||||
"""
|
||||
# presence of this attr indicates is useable after .close()
|
||||
closeable_response = None
|
||||
|
||||
def __init__(self, fp, headers, url, code, msg):
|
||||
self._set_fp(fp)
|
||||
self._headers = headers
|
||||
self._url = url
|
||||
self.code = code
|
||||
self.msg = msg
|
||||
|
||||
def _set_fp(self, fp):
|
||||
self.fp = fp
|
||||
self.read = self.fp.read
|
||||
self.readline = self.fp.readline
|
||||
if hasattr(self.fp, "readlines"): self.readlines = self.fp.readlines
|
||||
if hasattr(self.fp, "fileno"):
|
||||
self.fileno = self.fp.fileno
|
||||
else:
|
||||
self.fileno = lambda: None
|
||||
self.__iter__ = self.fp.__iter__
|
||||
self.next = self.fp.next
|
||||
|
||||
def __repr__(self):
|
||||
return '<%s at %s whose fp = %r>' % (
|
||||
self.__class__.__name__, hex(abs(id(self))), self.fp)
|
||||
|
||||
def info(self):
|
||||
return self._headers
|
||||
|
||||
def geturl(self):
|
||||
return self._url
|
||||
|
||||
def close(self):
|
||||
wrapped = self.fp
|
||||
wrapped.close()
|
||||
new_wrapped = eofresponse(
|
||||
self._url, self._headers, self.code, self.msg)
|
||||
self._set_fp(new_wrapped)
|
||||
|
||||
def __getstate__(self):
|
||||
# There are three obvious options here:
|
||||
# 1. truncate
|
||||
# 2. read to end
|
||||
# 3. close socket, pickle state including read position, then open
|
||||
# again on unpickle and use Range header
|
||||
# XXXX um, 4. refuse to pickle unless .close()d. This is better,
|
||||
# actually ("errors should never pass silently"). Pickling doesn't
|
||||
# work anyway ATM, because of http://python.org/sf/1144636 so fix
|
||||
# this later
|
||||
|
||||
# 2 breaks pickle protocol, because one expects the original object
|
||||
# to be left unscathed by pickling. 3 is too complicated and
|
||||
# surprising (and too much work ;-) to happen in a sane __getstate__.
|
||||
# So we do 1.
|
||||
|
||||
state = self.__dict__.copy()
|
||||
new_wrapped = eofresponse(
|
||||
self._url, self._headers, self.code, self.msg)
|
||||
state["wrapped"] = new_wrapped
|
||||
return state
|
||||
|
||||
def test_response(data='test data', headers=[],
|
||||
url="http://example.com/", code=200, msg="OK"):
|
||||
return make_response(data, headers, url, code, msg)
|
||||
|
def test_html_response(data='test data', headers=[],
                       url="http://example.com/", code=200, msg="OK"):
    # take a copy: augmented assignment would mutate the (shared) default
    # headers list across calls
    headers = headers + [("Content-type", "text/html")]
    return make_response(data, headers, url, code, msg)

def make_response(data, headers, url, code, msg):
    """Convenient factory for objects implementing response interface.

    data: string containing response body data
    headers: sequence of (name, value) pairs
    url: URL of response
    code: integer response code (e.g. 200)
    msg: string response code message (e.g. "OK")

    """
    mime_headers = make_headers(headers)
    r = closeable_response(StringIO(data), mime_headers, url, code, msg)
    return response_seek_wrapper(r)
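
# Usage sketch for make_response (added for illustration; assumes the
# StringIO/mimetools imports and response_seek_wrapper available elsewhere
# in this module):
#
#     r = make_response("hello", [("Content-type", "text/plain")],
#                       "http://example.com/", 200, "OK")
#     assert r.read() == "hello"
#     r.seek(0)                        # the seek wrapper caches the body
#     assert r.read() == "hello"
#     assert r.info()["Content-type"] == "text/plain"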


def make_headers(headers):
    """
    headers: sequence of (name, value) pairs
    """
    hdr_text = []
    for name_value in headers:
        hdr_text.append("%s: %s" % name_value)
    return mimetools.Message(StringIO("\n".join(hdr_text)))


# The rest of this module is especially horrible, but needed, at least until
# we fork urllib2.  Even then, we may want to preserve urllib2 compatibility.

def get_seek_wrapper_class(response):
    # in order to wrap response objects that are also exceptions, we must
    # dynamically subclass the exception :-(((
    if (isinstance(response, urllib2.HTTPError) and
        not hasattr(response, "seek")):
        if response.__class__.__module__ == "__builtin__":
            exc_class_name = response.__class__.__name__
        else:
            exc_class_name = "%s.%s" % (
                response.__class__.__module__, response.__class__.__name__)

        class httperror_seek_wrapper(response_seek_wrapper, response.__class__):
            # this only derives from HTTPError in order to be a subclass --
            # the HTTPError behaviour comes from delegation

            _exc_class_name = exc_class_name

            def __init__(self, wrapped):
                response_seek_wrapper.__init__(self, wrapped)
                # be compatible with undocumented HTTPError attributes :-(
                self.hdrs = wrapped.info()
                self.filename = wrapped.geturl()

            def __repr__(self):
                return (
                    "<%s (%s instance) at %s "
                    "whose wrapped object = %r>" % (
                        self.__class__.__name__, self._exc_class_name,
                        hex(abs(id(self))), self.wrapped)
                    )
        wrapper_class = httperror_seek_wrapper
    else:
        wrapper_class = response_seek_wrapper
    return wrapper_class
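
# Illustration (added, not part of the original module): the dynamic subclass
# means a wrapped HTTPError can still be raised and caught as
# urllib2.HTTPError while also being seekable.  Sketch, assuming hdrs and fp
# are a mimetools.Message and a file-like object:
#
#     err = urllib2.HTTPError("http://example.com/", 404, "Not Found",
#                             hdrs, fp)
#     wrapper_class = get_seek_wrapper_class(err)
#     wrapped = wrapper_class(err)
#     assert isinstance(wrapped, urllib2.HTTPError)
#     wrapped.seek(0)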

def seek_wrapped_response(response):
    """Return a copy of response that supports seekable response interface.

    Accepts responses from both mechanize and urllib2 handlers.

    Copes with both ordinary response instances and HTTPError instances (which
    can't be simply wrapped due to the requirement of preserving the exception
    base class).
    """
    if not hasattr(response, "seek"):
        wrapper_class = get_seek_wrapper_class(response)
        response = wrapper_class(response)
    assert hasattr(response, "get_data")
    return response

def upgrade_response(response):
    """Return a copy of response that supports Browser response interface.

    Browser response interface is that of "seekable responses"
    (response_seek_wrapper), plus the requirement that responses must be
    useable after .close() (closeable_response).

    Accepts responses from both mechanize and urllib2 handlers.

    Copes with both ordinary response instances and HTTPError instances (which
    can't be simply wrapped due to the requirement of preserving the exception
    base class).
    """
    wrapper_class = get_seek_wrapper_class(response)
    if hasattr(response, "closeable_response"):
        if not hasattr(response, "seek"):
            response = wrapper_class(response)
            assert hasattr(response, "get_data")
        return copy.copy(response)

    # a urllib2 handler constructed the response, i.e. the response is an
    # urllib.addinfourl or a urllib2.HTTPError, instead of a
    # _Util.closeable_response as returned by e.g. mechanize.HTTPHandler
    try:
        code = response.code
    except AttributeError:
        code = None
    try:
        msg = response.msg
    except AttributeError:
        msg = None

    # may have already-.read() data from .seek() cache
    data = None
    get_data = getattr(response, "get_data", None)
    if get_data:
        data = get_data()

    response = closeable_response(
        response.fp, response.info(), response.geturl(), code, msg)
    response = wrapper_class(response)
    if data:
        response.set_data(data)
    return response
@ -0,0 +1,245 @@
"""RFC 3986 URI parsing and relative reference resolution / absolutization.

(aka splitting and joining)

Copyright 2006 John J. Lee <jjl@pobox.com>

This code is free software; you can redistribute it and/or modify it under
the terms of the BSD or ZPL 2.1 licenses (see the file COPYING.txt
included with the distribution).

"""

# XXX Wow, this is ugly.  Overly-direct translation of the RFC ATM.

import re, urllib

## def chr_range(a, b):
##     return "".join(map(chr, range(ord(a), ord(b)+1)))

## UNRESERVED_URI_CHARS = ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"
##                         "abcdefghijklmnopqrstuvwxyz"
##                         "0123456789"
##                         "-_.~")
## RESERVED_URI_CHARS = "!*'();:@&=+$,/?#[]"
## URI_CHARS = RESERVED_URI_CHARS+UNRESERVED_URI_CHARS+'%'
# this re matches any character that's not in URI_CHARS
BAD_URI_CHARS_RE = re.compile("[^A-Za-z0-9\-_.~!*'();:@&=+$,/?%#[\]]")


def clean_url(url, encoding):
    # percent-encode illegal URI characters
    # Trying to come up with test cases for this gave me a headache; revisit
    # when we do switch to unicode.
    # Somebody else's comments (lost the attribution):
    ##  - IE will return you the url in the encoding you send it
    ##  - Mozilla/Firefox will send you latin-1 if there's no non latin-1
    ##    characters in your link.  It will send you utf-8 however if there
    ##    are...
    if type(url) == type(""):
        url = url.decode(encoding, "replace")
    url = url.strip()
    # for second param to urllib.quote(), we want URI_CHARS, minus the
    # 'always_safe' characters that urllib.quote() never percent-encodes
    return urllib.quote(url.encode(encoding), "!*'();:@&=+$,/?%#[]~")

def is_clean_uri(uri):
    """
    >>> is_clean_uri("ABC!")
    True
    >>> is_clean_uri(u"ABC!")
    True
    >>> is_clean_uri("ABC|")
    False
    >>> is_clean_uri(u"ABC|")
    False
    >>> is_clean_uri("http://example.com/0")
    True
    >>> is_clean_uri(u"http://example.com/0")
    True
    """
    # note module re treats bytestrings as though they were decoded as latin-1
    # so this function accepts both unicode and bytestrings
    return not bool(BAD_URI_CHARS_RE.search(uri))


SPLIT_MATCH = re.compile(
    r"^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?").match
def urlsplit(absolute_uri):
    """Return scheme, authority, path, query, fragment."""
    match = SPLIT_MATCH(absolute_uri)
    if match:
        g = match.groups()
        return g[1], g[3], g[4], g[6], g[8]

def urlunsplit(parts):
    scheme, authority, path, query, fragment = parts
    r = []
    append = r.append
    if scheme is not None:
        append(scheme)
        append(":")
    if authority is not None:
        append("//")
        append(authority)
    append(path)
    if query is not None:
        append("?")
        append(query)
    if fragment is not None:
        append("#")
        append(fragment)
    return "".join(r)

def urljoin(base_uri, uri_reference):
    """Join a base URI with a URI reference and return the resulting URI.

    See RFC 3986.
    """
    return urlunsplit(urljoin_parts(urlsplit(base_uri),
                                    urlsplit(uri_reference)))
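
# A few resolution examples from RFC 3986 section 5.4 that urljoin is meant
# to satisfy (added for illustration):
#
#     urljoin("http://a/b/c/d;p?q", "g")    -> "http://a/b/c/g"
#     urljoin("http://a/b/c/d;p?q", "../g") -> "http://a/b/g"
#     urljoin("http://a/b/c/d;p?q", "?y")   -> "http://a/b/c/d;p?y"
#     urljoin("http://a/b/c/d;p?q", "#s")   -> "http://a/b/c/d;p?q#s"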

# oops, this doesn't do the same thing as the literal translation
# from the RFC below
## import posixpath
## def urljoin_parts(base_parts, reference_parts):
##     scheme, authority, path, query, fragment = base_parts
##     rscheme, rauthority, rpath, rquery, rfragment = reference_parts

##     # compute target URI path
##     if rpath == "":
##         tpath = path
##     else:
##         tpath = rpath
##         if not tpath.startswith("/"):
##             tpath = merge(authority, path, tpath)
##             tpath = posixpath.normpath(tpath)

##     if rscheme is not None:
##         return (rscheme, rauthority, tpath, rquery, rfragment)
##     elif rauthority is not None:
##         return (scheme, rauthority, tpath, rquery, rfragment)
##     elif rpath == "":
##         if rquery is not None:
##             tquery = rquery
##         else:
##             tquery = query
##         return (scheme, authority, tpath, tquery, rfragment)
##     else:
##         return (scheme, authority, tpath, rquery, rfragment)

def urljoin_parts(base_parts, reference_parts):
    scheme, authority, path, query, fragment = base_parts
    rscheme, rauthority, rpath, rquery, rfragment = reference_parts

    if rscheme == scheme:
        rscheme = None

    if rscheme is not None:
        tscheme, tauthority, tpath, tquery = (
            rscheme, rauthority, remove_dot_segments(rpath), rquery)
    else:
        if rauthority is not None:
            tauthority, tpath, tquery = (
                rauthority, remove_dot_segments(rpath), rquery)
        else:
            if rpath == "":
                tpath = path
                if rquery is not None:
                    tquery = rquery
                else:
                    tquery = query
            else:
                if rpath.startswith("/"):
                    tpath = remove_dot_segments(rpath)
                else:
                    tpath = merge(authority, path, rpath)
                    tpath = remove_dot_segments(tpath)
                tquery = rquery
            tauthority = authority
        tscheme = scheme
    tfragment = rfragment
    return (tscheme, tauthority, tpath, tquery, tfragment)

# um, something *vaguely* like this is what I want, but I have to generate
# lots of test cases first, if only to understand what it is that
# remove_dot_segments really does...
## def remove_dot_segments(path):
##     if path == '':
##         return ''
##     comps = path.split('/')
##     new_comps = []
##     for comp in comps:
##         if comp in ['.', '']:
##             if not new_comps or new_comps[-1]:
##                 new_comps.append('')
##             continue
##         if comp != '..':
##             new_comps.append(comp)
##         elif new_comps:
##             new_comps.pop()
##     return '/'.join(new_comps)


def remove_dot_segments(path):
    r = []
    while path:
        # A
        if path.startswith("../"):
            path = path[3:]
            continue
        if path.startswith("./"):
            path = path[2:]
            continue
        # B
        if path.startswith("/./"):
            path = path[2:]
            continue
        if path == "/.":
            path = "/"
            continue
        # C
        if path.startswith("/../"):
            path = path[3:]
            if r:
                r.pop()
            continue
        if path == "/..":
            path = "/"
            if r:
                r.pop()
            continue
        # D
        if path == ".":
            path = path[1:]
            continue
        if path == "..":
            path = path[2:]
            continue
        # E
        start = 0
        if path.startswith("/"):
            start = 1
        ii = path.find("/", start)
        if ii < 0:
            ii = None
        r.append(path[:ii])
        if ii is None:
            break
        path = path[ii:]
    return "".join(r)

def merge(base_authority, base_path, ref_path):
    # XXXX Oddly, the sample Perl implementation of this by Roy Fielding
    # doesn't even take base_authority as a parameter, despite the wording in
    # the RFC suggesting otherwise.  Perhaps I'm missing some obvious identity.
    #if base_authority is not None and base_path == "":
    if base_path == "":
        return "/" + ref_path
    ii = base_path.rfind("/")
    if ii >= 0:
        return base_path[:ii+1] + ref_path
    return ref_path

if __name__ == "__main__":
    import doctest
    doctest.testmod()
@ -0,0 +1,559 @@
# Taken from Python 2.6.4 and regexp module constants modified
"""A parser for SGML, using the derived class as a static DTD."""

# XXX This only supports those SGML features used by HTML.

# XXX There should be a way to distinguish between PCDATA (parsed
# character data -- the normal case), RCDATA (replaceable character
# data -- only char and entity references and end tags are special)
# and CDATA (character data -- only end tags are special).  RCDATA is
# not supported at all.


# from warnings import warnpy3k
# warnpy3k("the sgmllib module has been removed in Python 3.0",
#          stacklevel=2)
# del warnpy3k

import markupbase
import re

__all__ = ["SGMLParser", "SGMLParseError"]

# Regular expressions used for parsing

interesting = re.compile('[&<]')
incomplete = re.compile('&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|'
                        '<([a-zA-Z][^<>]*|'
                        '/([a-zA-Z][^<>]*)?|'
                        '![^<>]*)?')

entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
# hack to fix http://bugs.python.org/issue803422
# charref = re.compile('&#([0-9]+)[^0-9]')
charref = re.compile("&#(x?[0-9a-fA-F]+)[^0-9a-fA-F]")

starttagopen = re.compile('<[>a-zA-Z]')
shorttagopen = re.compile('<[a-zA-Z][-.a-zA-Z0-9]*/')
shorttag = re.compile('<([a-zA-Z][-.a-zA-Z0-9]*)/([^/]*)/')
piclose = re.compile('>')
endbracket = re.compile('[<>]')
# hack moved from _beautifulsoup.py (bundled BeautifulSoup version 2)
# This code makes Beautiful Soup able to parse XML with namespaces
# tagfind = re.compile('[a-zA-Z][-_.a-zA-Z0-9]*')
tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
attrfind = re.compile(
    r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'
    r'(\'[^\']*\'|"[^"]*"|[][\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?')


class SGMLParseError(RuntimeError):
    """Exception raised for all parse errors."""
    pass


# SGML parser base class -- find tags and call handler functions.
# Usage: p = SGMLParser(); p.feed(data); ...; p.close().
# The dtd is defined by deriving a class which defines methods
# with special names to handle tags: start_foo and end_foo to handle
# <foo> and </foo>, respectively, or do_foo to handle <foo> by itself.
# (Tags are converted to lower case for this purpose.)  The data
# between tags is passed to the parser by calling self.handle_data()
# with some data as argument (the data may be split up in arbitrary
# chunks).  Entity references are passed by calling
# self.handle_entityref() with the entity reference as argument.
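#
# A minimal subclass sketch (added for illustration; the handlers shown are
# made up for the example):
#
#     class TitleExtractor(SGMLParser):
#         def reset(self):
#             SGMLParser.reset(self)
#             self.in_title = False
#             self.title = ""
#         def start_title(self, attrs):
#             self.in_title = True
#         def end_title(self):
#             self.in_title = False
#         def handle_data(self, data):
#             if self.in_title:
#                 self.title = self.title + data
#
#     p = TitleExtractor()
#     p.feed("<html><head><title>Hello</title></head></html>")
#     p.close()
#     assert p.title == "Hello"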

class SGMLParser(markupbase.ParserBase):
    # Definition of entities -- derived classes may override
    entity_or_charref = re.compile('&(?:'
                                   '([a-zA-Z][-.a-zA-Z0-9]*)|#([0-9]+)'
                                   ')(;?)')

    def __init__(self, verbose=0):
        """Initialize and reset this instance."""
        self.verbose = verbose
        self.reset()

    def reset(self):
        """Reset this instance. Loses all unprocessed data."""
        self.__starttag_text = None
        self.rawdata = ''
        self.stack = []
        self.lasttag = '???'
        self.nomoretags = 0
        self.literal = 0
        markupbase.ParserBase.reset(self)

    def setnomoretags(self):
        """Enter literal mode (CDATA) till EOF.

        Intended for derived classes only.
        """
        self.nomoretags = self.literal = 1

    def setliteral(self, *args):
        """Enter literal mode (CDATA).

        Intended for derived classes only.
        """
        self.literal = 1

    def feed(self, data):
        """Feed some data to the parser.

        Call this as often as you want, with as little or as much text
        as you want (may include '\n').  (This just saves the text,
        all the processing is done by goahead().)
        """

        self.rawdata = self.rawdata + data
        self.goahead(0)

    def close(self):
        """Handle the remaining data."""
        self.goahead(1)

    def error(self, message):
        raise SGMLParseError(message)

    # Internal -- handle data as far as reasonable.  May leave state
    # and data to be processed by a subsequent call.  If 'end' is
    # true, force handling all data as if followed by EOF marker.
    def goahead(self, end):
        rawdata = self.rawdata
        i = 0
        n = len(rawdata)
        while i < n:
            if self.nomoretags:
                self.handle_data(rawdata[i:n])
                i = n
                break
            match = interesting.search(rawdata, i)
            if match: j = match.start()
            else: j = n
            if i < j:
                self.handle_data(rawdata[i:j])
            i = j
            if i == n: break
            if rawdata[i] == '<':
                if starttagopen.match(rawdata, i):
                    if self.literal:
                        self.handle_data(rawdata[i])
                        i = i+1
                        continue
                    k = self.parse_starttag(i)
                    if k < 0: break
                    i = k
                    continue
                if rawdata.startswith("</", i):
                    k = self.parse_endtag(i)
                    if k < 0: break
                    i = k
                    self.literal = 0
                    continue
                if self.literal:
                    if n > (i + 1):
                        self.handle_data("<")
                        i = i+1
                    else:
                        # incomplete
                        break
                    continue
                if rawdata.startswith("<!--", i):
                    # Strictly speaking, a comment is --.*--
                    # within a declaration tag <!...>.
                    # This should be removed,
                    # and comments handled only in parse_declaration.
                    k = self.parse_comment(i)
                    if k < 0: break
                    i = k
                    continue
                if rawdata.startswith("<?", i):
                    k = self.parse_pi(i)
                    if k < 0: break
                    i = i+k
                    continue
                if rawdata.startswith("<!", i):
                    # This is some sort of declaration; in "HTML as
                    # deployed," this should only be the document type
                    # declaration ("<!DOCTYPE html...>").
                    k = self.parse_declaration(i)
                    if k < 0: break
                    i = k
                    continue
            elif rawdata[i] == '&':
                if self.literal:
                    self.handle_data(rawdata[i])
                    i = i+1
                    continue
                match = charref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_charref(name)
                    i = match.end(0)
                    if rawdata[i-1] != ';': i = i-1
                    continue
                match = entityref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_entityref(name)
                    i = match.end(0)
                    if rawdata[i-1] != ';': i = i-1
                    continue
            else:
                self.error('neither < nor & ??')
            # We get here only if incomplete matches but
            # nothing else
            match = incomplete.match(rawdata, i)
            if not match:
                self.handle_data(rawdata[i])
                i = i+1
                continue
            j = match.end(0)
            if j == n:
                break # Really incomplete
            self.handle_data(rawdata[i:j])
            i = j
        # end while
        if end and i < n:
            self.handle_data(rawdata[i:n])
            i = n
        self.rawdata = rawdata[i:]
        # XXX if end: check for empty stack

    # Extensions for the DOCTYPE scanner:
    _decl_otherchars = '='

    # Internal -- parse processing instr, return length or -1 if not terminated
    def parse_pi(self, i):
        rawdata = self.rawdata
        if rawdata[i:i+2] != '<?':
            self.error('unexpected call to parse_pi()')
        match = piclose.search(rawdata, i+2)
        if not match:
            return -1
        j = match.start(0)
        self.handle_pi(rawdata[i+2: j])
        j = match.end(0)
        return j-i

    def get_starttag_text(self):
        return self.__starttag_text

    # Internal -- handle starttag, return length or -1 if not terminated
    def parse_starttag(self, i):
        self.__starttag_text = None
        start_pos = i
        rawdata = self.rawdata
        if shorttagopen.match(rawdata, i):
            # SGML shorthand: <tag/data/ == <tag>data</tag>
            # XXX Can data contain &... (entity or char refs)?
            # XXX Can data contain < or > (tag characters)?
            # XXX Can there be whitespace before the first /?
            match = shorttag.match(rawdata, i)
            if not match:
                return -1
            tag, data = match.group(1, 2)
            self.__starttag_text = '<%s/' % tag
            tag = tag.lower()
            k = match.end(0)
            self.finish_shorttag(tag, data)
            self.__starttag_text = rawdata[start_pos:match.end(1) + 1]
            return k
        # XXX The following should skip matching quotes (' or ")
        # As a shortcut way to exit, this isn't so bad, but shouldn't
        # be used to locate the actual end of the start tag since the
        # < or > characters may be embedded in an attribute value.
        match = endbracket.search(rawdata, i+1)
        if not match:
            return -1
        j = match.start(0)
        # Now parse the data between i+1 and j into a tag and attrs
        attrs = []
        if rawdata[i:i+2] == '<>':
            # SGML shorthand: <> == <last open tag seen>
            k = j
            tag = self.lasttag
        else:
            match = tagfind.match(rawdata, i+1)
            if not match:
                self.error('unexpected call to parse_starttag')
            k = match.end(0)
            tag = rawdata[i+1:k].lower()
            self.lasttag = tag
        while k < j:
            match = attrfind.match(rawdata, k)
            if not match: break
            attrname, rest, attrvalue = match.group(1, 2, 3)
            if not rest:
                attrvalue = attrname
            else:
                if (attrvalue[:1] == "'" == attrvalue[-1:] or
                    attrvalue[:1] == '"' == attrvalue[-1:]):
                    # strip quotes
                    attrvalue = attrvalue[1:-1]
                attrvalue = self.entity_or_charref.sub(
                    self._convert_ref, attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = match.end(0)
        if rawdata[j] == '>':
            j = j+1
        self.__starttag_text = rawdata[start_pos:j]
        self.finish_starttag(tag, attrs)
        return j

    # Internal -- convert entity or character reference
    def _convert_ref(self, match):
        if match.group(2):
            return self.convert_charref(match.group(2)) or \
                '&#%s%s' % match.groups()[1:]
        elif match.group(3):
            return self.convert_entityref(match.group(1)) or \
                '&%s;' % match.group(1)
        else:
            return '&%s' % match.group(1)

    # Internal -- parse endtag
    def parse_endtag(self, i):
        rawdata = self.rawdata
        match = endbracket.search(rawdata, i+1)
        if not match:
            return -1
        j = match.start(0)
        tag = rawdata[i+2:j].strip().lower()
        if rawdata[j] == '>':
            j = j+1
        self.finish_endtag(tag)
        return j

    # Internal -- finish parsing of <tag/data/ (same as <tag>data</tag>)
    def finish_shorttag(self, tag, data):
        self.finish_starttag(tag, [])
        self.handle_data(data)
        self.finish_endtag(tag)

    # Internal -- finish processing of start tag
    # Return -1 for unknown tag, 0 for open-only tag, 1 for balanced tag
    def finish_starttag(self, tag, attrs):
        try:
            method = getattr(self, 'start_' + tag)
        except AttributeError:
            try:
                method = getattr(self, 'do_' + tag)
            except AttributeError:
                self.unknown_starttag(tag, attrs)
                return -1
            else:
                self.handle_starttag(tag, method, attrs)
                return 0
        else:
            self.stack.append(tag)
            self.handle_starttag(tag, method, attrs)
            return 1

    # Internal -- finish processing of end tag
    def finish_endtag(self, tag):
        if not tag:
            found = len(self.stack) - 1
            if found < 0:
                self.unknown_endtag(tag)
                return
        else:
            if tag not in self.stack:
                try:
                    method = getattr(self, 'end_' + tag)
                except AttributeError:
                    self.unknown_endtag(tag)
                else:
                    self.report_unbalanced(tag)
                return
            found = len(self.stack)
            for i in range(found):
                if self.stack[i] == tag: found = i
        while len(self.stack) > found:
            tag = self.stack[-1]
            try:
                method = getattr(self, 'end_' + tag)
            except AttributeError:
                method = None
            if method:
                self.handle_endtag(tag, method)
            else:
                self.unknown_endtag(tag)
            del self.stack[-1]

    # Overridable -- handle start tag
    def handle_starttag(self, tag, method, attrs):
        method(attrs)

    # Overridable -- handle end tag
    def handle_endtag(self, tag, method):
        method()

    # Example -- report an unbalanced </...> tag.
    def report_unbalanced(self, tag):
        if self.verbose:
            print '*** Unbalanced </' + tag + '>'
            print '*** Stack:', self.stack

    def convert_charref(self, name):
        """Convert character reference, may be overridden."""
        try:
            n = int(name)
        except ValueError:
            return
        if not 0 <= n <= 127:
            return
        return self.convert_codepoint(n)

    def convert_codepoint(self, codepoint):
        return chr(codepoint)

    def handle_charref(self, name):
        """Handle character reference, no need to override."""
        replacement = self.convert_charref(name)
        if replacement is None:
            self.unknown_charref(name)
        else:
            self.handle_data(replacement)

    # Definition of entities -- derived classes may override
    entitydefs = \
            {'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': '\''}

    def convert_entityref(self, name):
        """Convert entity references.

        As an alternative to overriding this method, one can tailor the
        results by setting up the self.entitydefs mapping appropriately.
        """
        table = self.entitydefs
        if name in table:
            return table[name]
        else:
            return

    def handle_entityref(self, name):
        """Handle entity references, no need to override."""
        replacement = self.convert_entityref(name)
        if replacement is None:
            self.unknown_entityref(name)
        else:
            self.handle_data(replacement)

    # Example -- handle data, should be overridden
    def handle_data(self, data):
        pass

    # Example -- handle comment, could be overridden
    def handle_comment(self, data):
        pass

    # Example -- handle declaration, could be overridden
    def handle_decl(self, decl):
        pass

    # Example -- handle processing instruction, could be overridden
    def handle_pi(self, data):
        pass

    # To be overridden -- handlers for unknown objects
    def unknown_starttag(self, tag, attrs): pass
    def unknown_endtag(self, tag): pass
    def unknown_charref(self, ref): pass
    def unknown_entityref(self, ref): pass


class TestSGMLParser(SGMLParser):

    def __init__(self, verbose=0):
        self.testdata = ""
        SGMLParser.__init__(self, verbose)

    def handle_data(self, data):
        self.testdata = self.testdata + data
        if len(repr(self.testdata)) >= 70:
            self.flush()

    def flush(self):
        data = self.testdata
        if data:
            self.testdata = ""
            print 'data:', repr(data)

    def handle_comment(self, data):
        self.flush()
        r = repr(data)
        if len(r) > 68:
            r = r[:32] + '...' + r[-32:]
        print 'comment:', r

    def unknown_starttag(self, tag, attrs):
        self.flush()
        if not attrs:
            print 'start tag: <' + tag + '>'
        else:
            print 'start tag: <' + tag,
            for name, value in attrs:
                print name + '=' + '"' + value + '"',
            print '>'

    def unknown_endtag(self, tag):
        self.flush()
        print 'end tag: </' + tag + '>'

    def unknown_entityref(self, ref):
        self.flush()
        print '*** unknown entity ref: &' + ref + ';'

    def unknown_charref(self, ref):
        self.flush()
        print '*** unknown char ref: &#' + ref + ';'

    def unknown_decl(self, data):
        self.flush()
        print '*** unknown decl: [' + data + ']'

    def close(self):
        SGMLParser.close(self)
        self.flush()


def test(args = None):
    import sys

    if args is None:
        args = sys.argv[1:]

    if args and args[0] == '-s':
        args = args[1:]
        klass = SGMLParser
    else:
        klass = TestSGMLParser

    if args:
        file = args[0]
    else:
        file = 'test.html'

    if file == '-':
        f = sys.stdin
    else:
        try:
            f = open(file, 'r')
        except IOError, msg:
            print file, ":", msg
            sys.exit(1)

    data = f.read()
    if f is not sys.stdin:
        f.close()

    x = klass()
    for c in data:
        x.feed(c)
    x.close()


if __name__ == '__main__':
    test()
@ -0,0 +1,6 @@
import socket

try:
    _GLOBAL_DEFAULT_TIMEOUT = socket._GLOBAL_DEFAULT_TIMEOUT
except AttributeError:
    _GLOBAL_DEFAULT_TIMEOUT = object()
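
# The sentinel lets callers tell "no timeout argument supplied" apart from an
# explicit timeout of None.  Sketch of the intended idiom (illustrative):
#
#     def open(url, timeout=_GLOBAL_DEFAULT_TIMEOUT):
#         if timeout is not _GLOBAL_DEFAULT_TIMEOUT:
#             pass  # caller supplied a timeout (possibly None) explicitly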
@ -0,0 +1,162 @@
import os
import shutil
import subprocess
import tempfile
import unittest


class SetupStack(object):

    def __init__(self):
        self._on_teardown = []

    def add_teardown(self, teardown):
        self._on_teardown.append(teardown)

    def tear_down(self):
        for func in reversed(self._on_teardown):
            func()


class TearDownConvenience(object):

    def __init__(self, setup_stack=None):
        self._own_setup_stack = setup_stack is None
        if setup_stack is None:
            setup_stack = SetupStack()
        self._setup_stack = setup_stack

    # only call this convenience method if no setup_stack was supplied to c'tor
    def tear_down(self):
        assert self._own_setup_stack
        self._setup_stack.tear_down()


class TempDirMaker(TearDownConvenience):

    def make_temp_dir(self, dir_=None):
        temp_dir = tempfile.mkdtemp(prefix="tmp-%s-" % self.__class__.__name__,
                                    dir=dir_)
        def tear_down():
            shutil.rmtree(temp_dir)
        self._setup_stack.add_teardown(tear_down)
        return temp_dir


class MonkeyPatcher(TearDownConvenience):

    Unset = object()

    def monkey_patch(self, obj, name, value):
        orig_value = getattr(obj, name)
        setattr(obj, name, value)
        def reverse_patch():
            setattr(obj, name, orig_value)
        self._setup_stack.add_teardown(reverse_patch)

    def _set_environ(self, env, name, value):
        if value is self.Unset:
            try:
                del env[name]
            except KeyError:
                pass
        else:
            env[name] = value

    def monkey_patch_environ(self, name, value, env=os.environ):
        orig_value = env.get(name, self.Unset)
        self._set_environ(env, name, value)
        def reverse_patch():
            self._set_environ(env, name, orig_value)
        self._setup_stack.add_teardown(reverse_patch)
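
    # Usage sketch (illustrative): teardowns registered on the shared
    # SetupStack run in reverse (LIFO) order, so later patches are undone
    # first.
    #
    #     patcher = MonkeyPatcher()
    #     patcher.monkey_patch_environ("HTTP_PROXY", "http://localhost:3128/")
    #     try:
    #         pass  # code under test sees the patched environment
    #     finally:
    #         patcher.tear_down()  # restores (or unsets) HTTP_PROXY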


class FixtureFactory(object):

    def __init__(self):
        self._setup_stack = SetupStack()
        self._context_managers = {}
        self._fixtures = {}

    def register_context_manager(self, name, context_manager):
        self._context_managers[name] = context_manager

    def get_fixture(self, name, add_teardown):
        context_manager = self._context_managers[name]
        fixture = context_manager.__enter__()
        add_teardown(lambda: context_manager.__exit__(None, None, None))
        return fixture

    def get_cached_fixture(self, name):
        fixture = self._fixtures.get(name)
        if fixture is None:
            fixture = self.get_fixture(name, self._setup_stack.add_teardown)
            self._fixtures[name] = fixture
        return fixture

    def tear_down(self):
        self._setup_stack.tear_down()


class TestCase(unittest.TestCase):

    def setUp(self):
        self._setup_stack = SetupStack()
        self._monkey_patcher = MonkeyPatcher(self._setup_stack)

    def tearDown(self):
        self._setup_stack.tear_down()

    def register_context_manager(self, name, context_manager):
        return self.fixture_factory.register_context_manager(
            name, context_manager)

    def get_fixture(self, name):
        return self.fixture_factory.get_fixture(name, self.add_teardown)

    def get_cached_fixture(self, name):
        return self.fixture_factory.get_cached_fixture(name)

    def add_teardown(self, *args, **kwds):
        self._setup_stack.add_teardown(*args, **kwds)

    def make_temp_dir(self, *args, **kwds):
        return TempDirMaker(self._setup_stack).make_temp_dir(*args, **kwds)

    def monkey_patch(self, *args, **kwds):
        return self._monkey_patcher.monkey_patch(*args, **kwds)

    def monkey_patch_environ(self, *args, **kwds):
        return self._monkey_patcher.monkey_patch_environ(*args, **kwds)

    def assert_contains(self, container, containee):
        self.assertTrue(containee in container, "%r not in %r" %
                        (containee, container))

    def assert_less_than(self, got, expected):
        self.assertTrue(got < expected, "%r >= %r" %
                        (got, expected))


# http://lackingrhoticity.blogspot.com/2009/01/testing-using-golden-files-in-python.html

class GoldenTestCase(TestCase):

    run_meld = False

    def assert_golden(self, dir_got, dir_expect):
        assert os.path.exists(dir_expect), dir_expect
        proc = subprocess.Popen(["diff", "--recursive", "-u", "-N",
                                 "--exclude=.*", dir_expect, dir_got],
                                stdout=subprocess.PIPE)
        stdout, stderr = proc.communicate()
        if len(stdout) > 0:
            if self.run_meld:
                # Put expected output on the right because that is the
                # side we usually edit.
                subprocess.call(["meld", dir_got, dir_expect])
            raise AssertionError(
                "Differences from golden files found.\n"
                "Try running with --meld to update golden files.\n"
                "%s" % stdout)
        self.assertEquals(proc.wait(), 0)
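
    # Usage sketch (illustrative; the directory names and helper are made up):
    #
    #     class ExampleGoldenTest(GoldenTestCase):
    #         def test_output(self):
    #             got_dir = self.make_temp_dir()
    #             # ... write the files under test into got_dir ...
    #             self.assert_golden(got_dir, "testdata/golden")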
@ -0,0 +1,50 @@
# urllib2 work-alike interface
# ...from urllib2...
from urllib2 import \
     URLError, \
     HTTPError
# ...and from mechanize
from _auth import \
     HTTPProxyPasswordMgr, \
     HTTPSClientCertMgr
from _debug import \
     HTTPResponseDebugProcessor, \
     HTTPRedirectDebugProcessor
# crap ATM
## from _gzip import \
##     HTTPGzipProcessor
from _urllib2_fork import \
     AbstractBasicAuthHandler, \
     AbstractDigestAuthHandler, \
     BaseHandler, \
     CacheFTPHandler, \
     FileHandler, \
     FTPHandler, \
     HTTPBasicAuthHandler, \
     HTTPCookieProcessor, \
     HTTPDefaultErrorHandler, \
     HTTPDigestAuthHandler, \
     HTTPErrorProcessor, \
     HTTPHandler, \
     HTTPPasswordMgr, \
     HTTPPasswordMgrWithDefaultRealm, \
     HTTPRedirectHandler, \
     ProxyBasicAuthHandler, \
     ProxyDigestAuthHandler, \
     ProxyHandler, \
     UnknownHandler
from _http import \
     HTTPEquivProcessor, \
     HTTPRefererProcessor, \
     HTTPRefreshProcessor, \
     HTTPRobotRulesProcessor, \
     RobotExclusionError
import httplib
if hasattr(httplib, 'HTTPS'):
    from _urllib2_fork import HTTPSHandler
del httplib
from _opener import OpenerDirector, \
     SeekableResponseOpener, \
     build_opener, install_opener, urlopen
from _request import \
     Request
@ -0,0 +1,367 @@
"""Convenient HTTP UserAgent class.

This is a subclass of urllib2.OpenerDirector.


Copyright 2003-2006 John J. Lee <jjl@pobox.com>

This code is free software; you can redistribute it and/or modify it under
the terms of the BSD or ZPL 2.1 licenses (see the file COPYING.txt
included with the distribution).

"""

import warnings

import _auth
import _gzip
import _opener
import _response
import _sockettimeout
import _urllib2


class UserAgentBase(_opener.OpenerDirector):
    """Convenient user-agent class.

    Do not use .add_handler() to add a handler for something already dealt with
    by this code.

    The only reason at present for the distinction between UserAgent and
    UserAgentBase is so that classes that depend on .seek()able responses
    (e.g. mechanize.Browser) can inherit from UserAgentBase.  The subclass
    UserAgent exposes a .set_seekable_responses() method that allows switching
    off the adding of a .seek() method to responses.

    Public attributes:

    addheaders: list of (name, value) pairs specifying headers to send with
    every request, unless they are overridden in the Request instance.

    >>> ua = UserAgentBase()
    >>> ua.addheaders = [
    ...     ("User-agent", "Mozilla/5.0 (compatible)"),
    ...     ("From", "responsible.person@example.com")]

    """

    handler_classes = {
        # scheme handlers
        "http": _urllib2.HTTPHandler,
        # CacheFTPHandler is buggy, at least in 2.3, so we don't use it
        "ftp": _urllib2.FTPHandler,
        "file": _urllib2.FileHandler,

        # other handlers
        "_unknown": _urllib2.UnknownHandler,
        # HTTP{S,}Handler depend on HTTPErrorProcessor too
        "_http_error": _urllib2.HTTPErrorProcessor,
        "_http_default_error": _urllib2.HTTPDefaultErrorHandler,

        # feature handlers
        "_basicauth": _urllib2.HTTPBasicAuthHandler,
        "_digestauth": _urllib2.HTTPDigestAuthHandler,
        "_redirect": _urllib2.HTTPRedirectHandler,
        "_cookies": _urllib2.HTTPCookieProcessor,
        "_refresh": _urllib2.HTTPRefreshProcessor,
        "_equiv": _urllib2.HTTPEquivProcessor,
        "_proxy": _urllib2.ProxyHandler,
        "_proxy_basicauth": _urllib2.ProxyBasicAuthHandler,
        "_proxy_digestauth": _urllib2.ProxyDigestAuthHandler,
        "_robots": _urllib2.HTTPRobotRulesProcessor,
        "_gzip": _gzip.HTTPGzipProcessor,  # experimental!

        # debug handlers
        "_debug_redirect": _urllib2.HTTPRedirectDebugProcessor,
        "_debug_response_body": _urllib2.HTTPResponseDebugProcessor,
        }

    default_schemes = ["http", "ftp", "file"]
    default_others = ["_unknown", "_http_error", "_http_default_error"]
    default_features = ["_redirect", "_cookies",
                        "_refresh", "_equiv",
                        "_basicauth", "_digestauth",
                        "_proxy", "_proxy_basicauth", "_proxy_digestauth",
                        "_robots",
                        ]
    if hasattr(_urllib2, 'HTTPSHandler'):
        handler_classes["https"] = _urllib2.HTTPSHandler
        default_schemes.append("https")

    def __init__(self):
        _opener.OpenerDirector.__init__(self)

        ua_handlers = self._ua_handlers = {}
        for scheme in (self.default_schemes+
                       self.default_others+
                       self.default_features):
            klass = self.handler_classes[scheme]
            ua_handlers[scheme] = klass()
        for handler in ua_handlers.itervalues():
            self.add_handler(handler)

        # Yuck.
        # Ensure correct default constructor args were passed to
        # HTTPRefreshProcessor and HTTPEquivProcessor.
        if "_refresh" in ua_handlers:
            self.set_handle_refresh(True)
        if "_equiv" in ua_handlers:
            self.set_handle_equiv(True)
        # Ensure default password managers are installed.
        pm = ppm = None
        if "_basicauth" in ua_handlers or "_digestauth" in ua_handlers:
            pm = _urllib2.HTTPPasswordMgrWithDefaultRealm()
        if ("_proxy_basicauth" in ua_handlers or
            "_proxy_digestauth" in ua_handlers):
            ppm = _auth.HTTPProxyPasswordMgr()
        self.set_password_manager(pm)
        self.set_proxy_password_manager(ppm)
        # set default certificate manager
        if "https" in ua_handlers:
            cm = _urllib2.HTTPSClientCertMgr()
            self.set_client_cert_manager(cm)

    def close(self):
        _opener.OpenerDirector.close(self)
        self._ua_handlers = None

    # XXX
##     def set_timeout(self, timeout):
##         self._timeout = timeout
##     def set_http_connection_cache(self, conn_cache):
##         self._http_conn_cache = conn_cache
##     def set_ftp_connection_cache(self, conn_cache):
##         # XXX ATM, FTP has cache as part of handler; should it be separate?
##         self._ftp_conn_cache = conn_cache

    def set_handled_schemes(self, schemes):
        """Set sequence of URL scheme (protocol) strings.

        For example: ua.set_handled_schemes(["http", "ftp"])

        If this fails (with ValueError) because you've passed an unknown
        scheme, the set of handled schemes will not be changed.

        """
        want = {}
        for scheme in schemes:
            if scheme.startswith("_"):
                raise ValueError("not a scheme '%s'" % scheme)
            if scheme not in self.handler_classes:
                # include the offending scheme in the message (the format
                # argument was previously never applied)
                raise ValueError("unknown scheme '%s'" % scheme)
            want[scheme] = None

        # get rid of scheme handlers we don't want
        for scheme, oldhandler in self._ua_handlers.items():
            if scheme.startswith("_"): continue  # not a scheme handler
            if scheme not in want:
                self._replace_handler(scheme, None)
            else:
                del want[scheme]  # already got it
        # add the scheme handlers that are missing
        for scheme in want.keys():
            self._set_handler(scheme, True)

    def set_cookiejar(self, cookiejar):
        """Set a mechanize.CookieJar, or None."""
        self._set_handler("_cookies", obj=cookiejar)

    # XXX could use Greg Stein's httpx for some of this instead?
    # or httplib2??
    def set_proxies(self, proxies=None, proxy_bypass=None):
        """Configure proxy settings.

        proxies: dictionary mapping URL scheme to proxy specification.  None
          means use the default system-specific settings.
        proxy_bypass: function taking hostname, returning whether proxy should
          be used.  None means use the default system-specific settings.

        The default is to try to obtain proxy settings from the system (see the
        documentation for urllib.urlopen for information about the
        system-specific methods used -- note that's urllib, not urllib2).

        To avoid all use of proxies, pass an empty proxies dict.

        >>> ua = UserAgentBase()
        >>> def proxy_bypass(hostname):
        ...     return hostname == "noproxy.com"
        >>> ua.set_proxies(
        ...     {"http": "joe:password@myproxy.example.com:3128",
        ...      "ftp": "proxy.example.com"},
        ...     proxy_bypass)

        """
        self._set_handler("_proxy", True,
                          constructor_kwds=dict(proxies=proxies,
                                                proxy_bypass=proxy_bypass))

    def add_password(self, url, user, password, realm=None):
        self._password_manager.add_password(realm, url, user, password)
    def add_proxy_password(self, user, password, hostport=None, realm=None):
        self._proxy_password_manager.add_password(
            realm, hostport, user, password)

    def add_client_certificate(self, url, key_file, cert_file):
        """Add an SSL client certificate, for HTTPS client auth.

        key_file and cert_file must be filenames of the key and certificate
        files, in PEM format.  You can use e.g. OpenSSL to convert a p12 (PKCS
        12) file to PEM format:

        openssl pkcs12 -clcerts -nokeys -in cert.p12 -out cert.pem
        openssl pkcs12 -nocerts -in cert.p12 -out key.pem


        Note that client certificate password input is very inflexible ATM.  At
        the moment this seems to be console only, which is presumably the
        default behaviour of libopenssl.  In future mechanize may support
        third-party libraries that (I assume) allow more options here.

        """
        self._client_cert_manager.add_key_cert(url, key_file, cert_file)

    # the following are rarely useful -- use add_password / add_proxy_password
    # instead
    def set_password_manager(self, password_manager):
        """Set a mechanize.HTTPPasswordMgrWithDefaultRealm, or None."""
        self._password_manager = password_manager
        self._set_handler("_basicauth", obj=password_manager)
        self._set_handler("_digestauth", obj=password_manager)
    def set_proxy_password_manager(self, password_manager):
        """Set a mechanize.HTTPProxyPasswordMgr, or None."""
        self._proxy_password_manager = password_manager
        self._set_handler("_proxy_basicauth", obj=password_manager)
        self._set_handler("_proxy_digestauth", obj=password_manager)
    def set_client_cert_manager(self, cert_manager):
        """Set a mechanize.HTTPClientCertMgr, or None."""
        self._client_cert_manager = cert_manager
        handler = self._ua_handlers["https"]
        handler.client_cert_manager = cert_manager

    # these methods all take a boolean parameter
    def set_handle_robots(self, handle):
        """Set whether to observe rules from robots.txt."""
        self._set_handler("_robots", handle)
    def set_handle_redirect(self, handle):
        """Set whether to handle HTTP 30x redirections."""
        self._set_handler("_redirect", handle)
    def set_handle_refresh(self, handle, max_time=None, honor_time=True):
        """Set whether to handle HTTP Refresh headers."""
        self._set_handler("_refresh", handle, constructor_kwds=
                          {"max_time": max_time, "honor_time": honor_time})
    def set_handle_equiv(self, handle, head_parser_class=None):
        """Set whether to treat HTML http-equiv headers like HTTP headers.

        Response objects may be .seek()able if this is set (currently returned
        responses are, raised HTTPError exception responses are not).

        """
        if head_parser_class is not None:
            constructor_kwds = {"head_parser_class": head_parser_class}
        else:
            constructor_kwds = {}
        self._set_handler("_equiv", handle, constructor_kwds=constructor_kwds)
    def set_handle_gzip(self, handle):
        """Handle gzip transfer encoding.

        """
        if handle:
            warnings.warn(
                "gzip transfer encoding is experimental!", stacklevel=2)
        self._set_handler("_gzip", handle)
    def set_debug_redirects(self, handle):
        """Log information about HTTP redirects (including refreshes).

        Logging is performed using module logging.  The logger name is
        "mechanize.http_redirects".  To actually print some debug output,
        e.g.:

        import sys, logging
        logger = logging.getLogger("mechanize.http_redirects")
        logger.addHandler(logging.StreamHandler(sys.stdout))
        logger.setLevel(logging.INFO)

        Other logger names relevant to this module:

        "mechanize.http_responses"
        "mechanize.cookies"

        To turn on everything:

        import sys, logging
        logger = logging.getLogger("mechanize")
        logger.addHandler(logging.StreamHandler(sys.stdout))
        logger.setLevel(logging.INFO)

        """
        self._set_handler("_debug_redirect", handle)
    def set_debug_responses(self, handle):
        """Log HTTP response bodies.

        See docstring for .set_debug_redirects() for details of logging.

        Response objects may be .seek()able if this is set (currently returned
        responses are, raised HTTPError exception responses are not).

        """
        self._set_handler("_debug_response_body", handle)
    def set_debug_http(self, handle):
        """Print HTTP headers to sys.stdout."""
        level = int(bool(handle))
        for scheme in "http", "https":
            h = self._ua_handlers.get(scheme)
            if h is not None:
                h.set_http_debuglevel(level)

    def _set_handler(self, name, handle=None, obj=None,
                     constructor_args=(), constructor_kwds={}):
        if handle is None:
            handle = obj is not None
        if handle:
            handler_class = self.handler_classes[name]
            if obj is not None:
                newhandler = handler_class(obj)
            else:
                newhandler = handler_class(
                    *constructor_args, **constructor_kwds)
        else:
            newhandler = None
        self._replace_handler(name, newhandler)

    def _replace_handler(self, name, newhandler=None):
        # first, if handler was previously added, remove it
        if name is not None:
            handler = self._ua_handlers.get(name)
            if handler:
                try:
                    self.handlers.remove(handler)
                except ValueError:
                    pass
        # then add the replacement, if any
        if newhandler is not None:
            self.add_handler(newhandler)
            self._ua_handlers[name] = newhandler


class UserAgent(UserAgentBase):

    def __init__(self):
        UserAgentBase.__init__(self)
        self._seekable = False

    def set_seekable_responses(self, handle):
        """Make response objects .seek()able."""
        self._seekable = bool(handle)

    def open(self, fullurl, data=None,
             timeout=_sockettimeout._GLOBAL_DEFAULT_TIMEOUT):
        if self._seekable:
            def bound_open(fullurl, data=None,
                           timeout=_sockettimeout._GLOBAL_DEFAULT_TIMEOUT):
                return UserAgentBase.open(self, fullurl, data, timeout)
            response = _opener.wrapped_open(
                bound_open, _response.seek_wrapped_response, fullurl, data,
                timeout)
        else:
            # pass timeout through so it is not silently dropped in the
            # non-seekable case
            response = UserAgentBase.open(self, fullurl, data, timeout)
        return response
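
    # Usage sketch (illustrative):
    #
    #     ua = UserAgent()
    #     ua.set_seekable_responses(True)
    #     r = ua.open("http://example.com/")
    #     body = r.read()
    #     r.seek(0)  # allowed because responses are wrapped to be seekable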
@ -0,0 +1,305 @@
"""Utility functions and date/time routines.

Copyright 2002-2006 John J Lee <jjl@pobox.com>

This code is free software; you can redistribute it and/or modify it
under the terms of the BSD or ZPL 2.1 licenses (see the file
COPYING.txt included with the distribution).
"""

import re
import time
import warnings


class ExperimentalWarning(UserWarning):
    pass

def experimental(message):
    warnings.warn(message, ExperimentalWarning, stacklevel=3)
def hide_experimental_warnings():
    warnings.filterwarnings("ignore", category=ExperimentalWarning)
def reset_experimental_warnings():
    warnings.filterwarnings("default", category=ExperimentalWarning)

def deprecation(message):
    warnings.warn(message, DeprecationWarning, stacklevel=3)
def hide_deprecations():
    warnings.filterwarnings("ignore", category=DeprecationWarning)
def reset_deprecations():
    warnings.filterwarnings("default", category=DeprecationWarning)


def write_file(filename, data):
    f = open(filename, "wb")
    try:
        f.write(data)
    finally:
        f.close()


def get1(sequence):
    assert len(sequence) == 1
    return sequence[0]


def isstringlike(x):
    try: x+""
    except: return False
    else: return True

## def caller():
##     try:
##         raise SyntaxError
##     except:
##         import sys
##         return sys.exc_traceback.tb_frame.f_back.f_back.f_code.co_name


from calendar import timegm

# Date/time conversion routines for formats used by the HTTP protocol.

EPOCH = 1970
def my_timegm(tt):
    year, month, mday, hour, min, sec = tt[:6]
    if ((year >= EPOCH) and (1 <= month <= 12) and (1 <= mday <= 31) and
        (0 <= hour <= 24) and (0 <= min <= 59) and (0 <= sec <= 61)):
        return timegm(tt)
    else:
        return None

days = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
months = ["Jan", "Feb", "Mar", "Apr", "May", "Jun",
          "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
months_lower = []
for month in months: months_lower.append(month.lower())


def time2isoz(t=None):
    """Return a string representing time in seconds since epoch, t.

    If the function is called without an argument, it will use the current
    time.

    The format of the returned string is like "YYYY-MM-DD hh:mm:ssZ",
    representing Universal Time (UTC, aka GMT).  An example of this format is:

    1994-11-24 08:49:37Z

    """
    if t is None: t = time.time()
    year, mon, mday, hour, min, sec = time.gmtime(t)[:6]
    return "%04d-%02d-%02d %02d:%02d:%02dZ" % (
        year, mon, mday, hour, min, sec)

def time2netscape(t=None):
    """Return a string representing time in seconds since epoch, t.

    If the function is called without an argument, it will use the current
    time.

    The format of the returned string is like this:

    Wed, DD-Mon-YYYY HH:MM:SS GMT

    """
    if t is None: t = time.time()
    year, mon, mday, hour, min, sec, wday = time.gmtime(t)[:7]
    # comma after the weekday, to match the documented format above
    return "%s, %02d-%s-%04d %02d:%02d:%02d GMT" % (
        days[wday], mday, months[mon-1], year, hour, min, sec)


UTC_ZONES = {"GMT": None, "UTC": None, "UT": None, "Z": None}

timezone_re = re.compile(r"^([-+])?(\d\d?):?(\d\d)?$")
def offset_from_tz_string(tz):
    offset = None
    if UTC_ZONES.has_key(tz):
        offset = 0
    else:
        m = timezone_re.search(tz)
        if m:
            offset = 3600 * int(m.group(2))
            if m.group(3):
                offset = offset + 60 * int(m.group(3))
            if m.group(1) == '-':
                offset = -offset
    return offset
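
# Examples (added for illustration):
#
#     offset_from_tz_string("UTC")    -> 0
#     offset_from_tz_string("-0800")  -> -28800  (seconds)
#     offset_from_tz_string("+01:00") -> 3600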

def _str2time(day, mon, yr, hr, min, sec, tz):
    # translate month name to number
    # month numbers start with 1 (January)
    try:
        mon = months_lower.index(mon.lower())+1
    except ValueError:
        # maybe it's already a number
        try:
            imon = int(mon)
        except ValueError:
            return None
        if 1 <= imon <= 12:
            mon = imon
        else:
            return None

    # make sure clock elements are defined
    if hr is None: hr = 0
    if min is None: min = 0
    if sec is None: sec = 0

    yr = int(yr)
    day = int(day)
    hr = int(hr)
    min = int(min)
    sec = int(sec)

    if yr < 1000:
        # find "obvious" year
        cur_yr = time.localtime(time.time())[0]
        m = cur_yr % 100
        tmp = yr
        yr = yr + cur_yr - m
        m = m - tmp
        if abs(m) > 50:
            if m > 0: yr = yr + 100
            else: yr = yr - 100

    # convert UTC time tuple to seconds since epoch (not timezone-adjusted)
    t = my_timegm((yr, mon, day, hr, min, sec, tz))

    if t is not None:
        # adjust time using timezone string, to get absolute time since epoch
        if tz is None:
            tz = "UTC"
        tz = tz.upper()
        offset = offset_from_tz_string(tz)
        if offset is None:
            return None
        t = t - offset

    return t


strict_re = re.compile(r"^[SMTWF][a-z][a-z], (\d\d) ([JFMASOND][a-z][a-z]) "
                       r"(\d\d\d\d) (\d\d):(\d\d):(\d\d) GMT$")
wkday_re = re.compile(
    r"^(?:Sun|Mon|Tue|Wed|Thu|Fri|Sat)[a-z]*,?\s*", re.I)
loose_http_re = re.compile(
    r"""^
    (\d\d?)                # day
       (?:\s+|[-\/])
    (\w+)                  # month
        (?:\s+|[-\/])
    (\d+)                  # year
    (?:
          (?:\s+|:)        # separator before clock
       (\d\d?):(\d\d)      # hour:min
       (?::(\d\d))?        # optional seconds
    )?                     # optional clock
       \s*
    ([-+]?\d{2,4}|(?![APap][Mm]\b)[A-Za-z]+)? # timezone
       \s*
    (?:\(\w+\))?           # ASCII representation of timezone in parens.
       \s*$""", re.X)
|
||||
def http2time(text):
|
||||
"""Returns time in seconds since epoch of time represented by a string.
|
||||
|
||||
Return value is an integer.
|
||||
|
||||
None is returned if the format of str is unrecognized, the time is outside
|
||||
the representable range, or the timezone string is not recognized. If the
|
||||
string contains no timezone, UTC is assumed.
|
||||
|
||||
The timezone in the string may be numerical (like "-0800" or "+0100") or a
|
||||
string timezone (like "UTC", "GMT", "BST" or "EST"). Currently, only the
|
||||
timezone strings equivalent to UTC (zero offset) are known to the function.
|
||||
|
||||
The function loosely parses the following formats:
|
||||
|
||||
Wed, 09 Feb 1994 22:23:32 GMT -- HTTP format
|
||||
Tuesday, 08-Feb-94 14:15:29 GMT -- old rfc850 HTTP format
|
||||
Tuesday, 08-Feb-1994 14:15:29 GMT -- broken rfc850 HTTP format
|
||||
09 Feb 1994 22:23:32 GMT -- HTTP format (no weekday)
|
||||
08-Feb-94 14:15:29 GMT -- rfc850 format (no weekday)
|
||||
08-Feb-1994 14:15:29 GMT -- broken rfc850 format (no weekday)
|
||||
|
||||
The parser ignores leading and trailing whitespace. The time may be
|
||||
absent.
|
||||
|
||||
If the year is given with only 2 digits, the function will select the
|
||||
century that makes the year closest to the current date.
|
||||
|
||||
"""
|
||||
# fast exit for strictly conforming string
|
||||
m = strict_re.search(text)
|
||||
if m:
|
||||
g = m.groups()
|
||||
mon = months_lower.index(g[1].lower()) + 1
|
||||
tt = (int(g[2]), mon, int(g[0]),
|
||||
int(g[3]), int(g[4]), float(g[5]))
|
||||
return my_timegm(tt)
|
||||
|
||||
# No, we need some messy parsing...
|
||||
|
||||
# clean up
|
||||
text = text.lstrip()
|
||||
text = wkday_re.sub("", text, 1) # Useless weekday
|
||||
|
||||
# tz is time zone specifier string
|
||||
day, mon, yr, hr, min, sec, tz = [None]*7
|
||||
|
||||
# loose regexp parse
|
||||
m = loose_http_re.search(text)
|
||||
if m is not None:
|
||||
day, mon, yr, hr, min, sec, tz = m.groups()
|
||||
else:
|
||||
return None # bad format
|
||||
|
||||
return _str2time(day, mon, yr, hr, min, sec, tz)
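
# Editor's illustration (not part of the original module): the strict fast
# path and the loose parse agree on the classic RFC 1123 timestamp (the
# strict path converts seconds via float(), hence the int() below).
#
#   >>> int(http2time("Wed, 09 Feb 1994 22:23:32 GMT"))   # strict path
#   760832612
#   >>> http2time("09 Feb 1994 22:23:32 GMT")             # loose, no weekday
#   760832612
#   >>> print http2time("not a date")
#   None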


iso_re = re.compile(
    r"""^
    (\d{4})              # year
       [-\/]?
    (\d\d?)              # numerical month
       [-\/]?
    (\d\d?)              # day
    (?:
          (?:\s+|[-:Tt])  # separator before clock
       (\d\d?):?(\d\d)    # hour:min
       (?::?(\d\d(?:\.\d*)?))?  # optional seconds (and fractional)
    )?                    # optional clock
       \s*
    ([-+]?\d\d?:?(:?\d\d)?
     |Z|z)?               # timezone  (Z is "zero meridian", i.e. GMT)
       \s*$""", re.X)
def iso2time(text):
    """
    As for http2time, but parses the ISO 8601 formats:

    1994-02-03 14:15:29 -0100    -- ISO 8601 format
    1994-02-03 14:15:29          -- zone is optional
    1994-02-03                   -- only date
    1994-02-03T14:15:29          -- Use T as separator
    19940203T141529Z             -- ISO 8601 compact format
    19940203                     -- only date

    """
    # clean up
    text = text.lstrip()

    # tz is time zone specifier string
    day, mon, yr, hr, min, sec, tz = [None]*7

    # loose regexp parse
    m = iso_re.search(text)
    if m is not None:
        # XXX there's an extra bit of the timezone I'm ignoring here: is
        #   this the right thing to do?
        yr, mon, day, hr, min, sec, tz, _ = m.groups()
    else:
        return None  # bad format

    return _str2time(day, mon, yr, hr, min, sec, tz)
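
# Editor's illustration (not part of the original module): numeric offsets
# are applied, "Z" means UTC, and a bare date parses as midnight UTC.
#
#   >>> iso2time("1994-02-03 14:15:29 -0100")
#   760288529
#   >>> iso2time("19940203T141529Z")
#   760284929
#   >>> iso2time("1994-02-03")
#   760233600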
@ -0,0 +1,2 @@
"0.2.5"
__version__ = (0, 2, 5, None, None)