"""HTML form handling for web clients.
HTML form handling for web clients: useful for parsing HTML forms, filling them
in and returning the completed forms to the server. This code developed from a
port of Gisle Aas' Perl module HTML::Form, from the libwww-perl library, but
the interface is not the same.
The most useful docstring is the one for HTMLForm.
RFC 1866: HTML 2.0
RFC 1867: Form-based File Upload in HTML
RFC 2388: Returning Values from Forms: multipart/form-data
HTML 3.2 Specification, W3C Recommendation 14 January 1997 (for ISINDEX)
HTML 4.01 Specification, W3C Recommendation 24 December 1999
Copyright 2002-2007 John J. Lee
Copyright 2005 Gary Poster
Copyright 2005 Zope Corporation
Copyright 1998-2000 Gisle Aas.
This code is free software; you can redistribute it and/or modify it
under the terms of the BSD or ZPL 2.1 licenses (see the file
COPYING.txt included with the distribution).
"""
# TODO:
# Clean up post the merge into mechanize
# * Remove code that was duplicated in ClientForm and mechanize
# * Remove weird import stuff
# * Remove pre-Python 2.4 compatibility cruft
# * Clean up tests
# * Later release: Remove the ClientForm 0.1 backwards-compatibility switch
# Remove parser testing hack
# Clean action URI
# Switch to unicode throughout
# See Wichert Akkerman's 2004-01-22 message to c.l.py.
# Apply recommendations from google code project CURLIES
# Apply recommendations from HTML 5 spec
# Add charset parameter to Content-type headers? How to find value??
# Functional tests to add:
# Single and multiple file upload
# File upload with missing name (check standards)
# mailto: submission & enctype text/plain??
# Replace by_label etc. with moniker / selector concept. Allows, e.g., a
# choice between selection by value / id / label / element contents. Or
# choice between matching labels exactly or by substring. etc.
__all__ = ['AmbiguityError', 'CheckboxControl', 'Control',
'ControlNotFoundError', 'FileControl', 'FormParser', 'HTMLForm',
'HiddenControl', 'IgnoreControl', 'ImageControl', 'IsindexControl',
'Item', 'ItemCountError', 'ItemNotFoundError', 'Label',
'ListControl', 'LocateError', 'Missing', 'ParseError', 'ParseFile',
'ParseFileEx', 'ParseResponse', 'ParseResponseEx','PasswordControl',
'RadioControl', 'ScalarControl', 'SelectControl',
'SubmitButtonControl', 'SubmitControl', 'TextControl',
'TextareaControl', 'XHTMLCompatibleFormParser']
import HTMLParser
from cStringIO import StringIO
import inspect
import logging
import random
import re
import sys
import urllib
import urlparse
import warnings
import _beautifulsoup
import _request
# from Python itself, for backwards compatibility of raised exceptions
import sgmllib
# bundled copy of sgmllib
import _sgmllib_copy
VERSION = "0.2.11"
CHUNK = 1024 # size of chunks fed to parser, in bytes
DEFAULT_ENCODING = "latin-1"
_logger = logging.getLogger("mechanize.forms")
OPTIMIZATION_HACK = True
def debug(msg, *args, **kwds):
if OPTIMIZATION_HACK:
return
caller_name = inspect.stack()[1][3]
extended_msg = '%%s %s' % msg
extended_args = (caller_name,)+args
_logger.debug(extended_msg, *extended_args, **kwds)
def _show_debug_messages():
global OPTIMIZATION_HACK
OPTIMIZATION_HACK = False
_logger.setLevel(logging.DEBUG)
handler = logging.StreamHandler(sys.stdout)
handler.setLevel(logging.DEBUG)
_logger.addHandler(handler)
def deprecation(message, stack_offset=0):
warnings.warn(message, DeprecationWarning, stacklevel=3+stack_offset)
class Missing: pass
_compress_re = re.compile(r"\s+")
def compress_text(text): return _compress_re.sub(" ", text.strip())
def normalize_line_endings(text):
return re.sub(r"(?:(?
w = MimeWriter(f)
...call w.addheader(key, value) 0 or more times...
followed by either:
f = w.startbody(content_type)
...call f.write(data) for body data...
or:
w.startmultipartbody(subtype)
for each part:
subwriter = w.nextpart()
...use the subwriter's methods to create the subpart...
w.lastpart()
The subwriter is another MimeWriter instance, and should be
treated in the same way as the toplevel MimeWriter. This way,
writing recursive body parts is easy.
Warning: don't forget to call lastpart()!
XXX There should be more state so calls made in the wrong order
are detected.
Some special cases:
- startbody() just returns the file passed to the constructor;
but don't use this knowledge, as it may be changed.
- startmultipartbody() actually returns a file as well;
this can be used to write the initial 'if you can read this your
mailer is not MIME-aware' message.
- If you call flushheaders(), the headers accumulated so far are
written out (and forgotten); this is useful if you don't need a
body part at all, e.g. for a subpart of type message/rfc822
that's (mis)used to store some header-like information.
- Passing a keyword argument 'prefix=' to addheader(),
start*body() affects where the header is inserted; 0 means
append at the end, 1 means insert at the start; default is
append for addheader(), but insert for start*body(), which use
it to determine where the Content-type header goes.
"""
def __init__(self, fp, http_hdrs=None):
self._http_hdrs = http_hdrs
self._fp = fp
self._headers = []
self._boundary = []
self._first_part = True
def addheader(self, key, value, prefix=0,
add_to_http_hdrs=0):
"""
prefix is ignored if add_to_http_hdrs is true.
"""
lines = value.split("\r\n")
while lines and not lines[-1]: del lines[-1]
while lines and not lines[0]: del lines[0]
if add_to_http_hdrs:
value = "".join(lines)
# 2.2 urllib2 doesn't normalize header case
self._http_hdrs.append((key.capitalize(), value))
else:
for i in range(1, len(lines)):
lines[i] = " " + lines[i].strip()
value = "\r\n".join(lines) + "\r\n"
line = key.title() + ": " + value
if prefix:
self._headers.insert(0, line)
else:
self._headers.append(line)
def flushheaders(self):
self._fp.writelines(self._headers)
self._headers = []
def startbody(self, ctype=None, plist=[], prefix=1,
add_to_http_hdrs=0, content_type=1):
"""
prefix is ignored if add_to_http_hdrs is true.
"""
if content_type and ctype:
for name, value in plist:
ctype = ctype + ';\r\n %s=%s' % (name, value)
self.addheader("Content-Type", ctype, prefix=prefix,
add_to_http_hdrs=add_to_http_hdrs)
self.flushheaders()
if not add_to_http_hdrs: self._fp.write("\r\n")
self._first_part = True
return self._fp
def startmultipartbody(self, subtype, boundary=None, plist=[], prefix=1,
add_to_http_hdrs=0, content_type=1):
boundary = boundary or choose_boundary()
self._boundary.append(boundary)
return self.startbody("multipart/" + subtype,
[("boundary", boundary)] + plist,
prefix=prefix,
add_to_http_hdrs=add_to_http_hdrs,
content_type=content_type)
def nextpart(self):
boundary = self._boundary[-1]
if self._first_part:
self._first_part = False
else:
self._fp.write("\r\n")
self._fp.write("--" + boundary + "\r\n")
return self.__class__(self._fp)
def lastpart(self):
if self._first_part:
self.nextpart()
boundary = self._boundary.pop()
self._fp.write("\r\n--" + boundary + "--\r\n")
class LocateError(ValueError): pass
class AmbiguityError(LocateError): pass
class ControlNotFoundError(LocateError): pass
class ItemNotFoundError(LocateError): pass
class ItemCountError(ValueError): pass
# for backwards compatibility, ParseError derives from exceptions that were
# raised by versions of ClientForm <= 0.2.5
# TODO: move to _html
class ParseError(sgmllib.SGMLParseError,
HTMLParser.HTMLParseError):
def __init__(self, *args, **kwds):
Exception.__init__(self, *args, **kwds)
def __str__(self):
return Exception.__str__(self)
class _AbstractFormParser:
"""forms attribute contains HTMLForm instances on completion."""
# thanks to Moshe Zadka for an example of sgmllib/htmllib usage
def __init__(self, entitydefs=None, encoding=DEFAULT_ENCODING):
if entitydefs is None:
entitydefs = get_entitydefs()
self._entitydefs = entitydefs
self._encoding = encoding
self.base = None
self.forms = []
self.labels = []
self._current_label = None
self._current_form = None
self._select = None
self._optgroup = None
self._option = None
self._textarea = None
# forms[0] will contain all controls that are outside of any form
# self._global_form is an alias for self.forms[0]
self._global_form = None
self.start_form([])
self.end_form()
self._current_form = self._global_form = self.forms[0]
def do_base(self, attrs):
debug("%s", attrs)
for key, value in attrs:
if key == "href":
self.base = self.unescape_attr_if_required(value)
def end_body(self):
debug("")
if self._current_label is not None:
self.end_label()
if self._current_form is not self._global_form:
self.end_form()
def start_form(self, attrs):
debug("%s", attrs)
if self._current_form is not self._global_form:
raise ParseError("nested FORMs")
name = None
action = None
enctype = "application/x-www-form-urlencoded"
method = "GET"
d = {}
for key, value in attrs:
if key == "name":
name = self.unescape_attr_if_required(value)
elif key == "action":
action = self.unescape_attr_if_required(value)
elif key == "method":
method = self.unescape_attr_if_required(value.upper())
elif key == "enctype":
enctype = self.unescape_attr_if_required(value.lower())
d[key] = self.unescape_attr_if_required(value)
controls = []
self._current_form = (name, action, method, enctype), d, controls
def end_form(self):
debug("")
if self._current_label is not None:
self.end_label()
if self._current_form is self._global_form:
raise ParseError("end of FORM before start")
self.forms.append(self._current_form)
self._current_form = self._global_form
def start_select(self, attrs):
debug("%s", attrs)
if self._select is not None:
raise ParseError("nested SELECTs")
if self._textarea is not None:
raise ParseError("SELECT inside TEXTAREA")
d = {}
for key, val in attrs:
d[key] = self.unescape_attr_if_required(val)
self._select = d
self._add_label(d)
self._append_select_control({"__select": d})
def end_select(self):
debug("")
if self._select is None:
raise ParseError("end of SELECT before start")
if self._option is not None:
self._end_option()
self._select = None
def start_optgroup(self, attrs):
debug("%s", attrs)
if self._select is None:
raise ParseError("OPTGROUP outside of SELECT")
d = {}
for key, val in attrs:
d[key] = self.unescape_attr_if_required(val)
self._optgroup = d
def end_optgroup(self):
debug("")
if self._optgroup is None:
raise ParseError("end of OPTGROUP before start")
self._optgroup = None
def _start_option(self, attrs):
debug("%s", attrs)
if self._select is None:
raise ParseError("OPTION outside of SELECT")
if self._option is not None:
self._end_option()
d = {}
for key, val in attrs:
d[key] = self.unescape_attr_if_required(val)
self._option = {}
self._option.update(d)
if (self._optgroup and self._optgroup.has_key("disabled") and
not self._option.has_key("disabled")):
self._option["disabled"] = None
def _end_option(self):
debug("")
if self._option is None:
raise ParseError("end of OPTION before start")
contents = self._option.get("contents", "").strip()
self._option["contents"] = contents
if not self._option.has_key("value"):
self._option["value"] = contents
if not self._option.has_key("label"):
self._option["label"] = contents
# stuff dict of SELECT HTML attrs into a special private key
# (gets deleted again later)
self._option["__select"] = self._select
self._append_select_control(self._option)
self._option = None
def _append_select_control(self, attrs):
debug("%s", attrs)
controls = self._current_form[2]
name = self._select.get("name")
controls.append(("select", name, attrs))
def start_textarea(self, attrs):
debug("%s", attrs)
if self._textarea is not None:
raise ParseError("nested TEXTAREAs")
if self._select is not None:
raise ParseError("TEXTAREA inside SELECT")
d = {}
for key, val in attrs:
d[key] = self.unescape_attr_if_required(val)
self._add_label(d)
self._textarea = d
def end_textarea(self):
debug("")
if self._textarea is None:
raise ParseError("end of TEXTAREA before start")
controls = self._current_form[2]
name = self._textarea.get("name")
controls.append(("textarea", name, self._textarea))
self._textarea = None
def start_label(self, attrs):
debug("%s", attrs)
if self._current_label:
self.end_label()
d = {}
for key, val in attrs:
d[key] = self.unescape_attr_if_required(val)
taken = bool(d.get("for")) # empty id is invalid
d["__text"] = ""
d["__taken"] = taken
if taken:
self.labels.append(d)
self._current_label = d
def end_label(self):
debug("")
label = self._current_label
if label is None:
# something is ugly in the HTML, but we're ignoring it
return
self._current_label = None
# if it is staying around, it is True in all cases
del label["__taken"]
def _add_label(self, d):
#debug("%s", d)
if self._current_label is not None:
if not self._current_label["__taken"]:
self._current_label["__taken"] = True
d["__label"] = self._current_label
def handle_data(self, data):
debug("%s", data)
if self._option is not None:
# self._option is a dictionary of the OPTION element's HTML
# attributes, but it has two special keys, one of which is the
# special "contents" key contains text between OPTION tags (the
# other is the "__select" key: see the end_option method)
map = self._option
key = "contents"
elif self._textarea is not None:
map = self._textarea
key = "value"
data = normalize_line_endings(data)
# not if within option or textarea
elif self._current_label is not None:
map = self._current_label
key = "__text"
else:
return
if data and not map.has_key(key):
# according to
# http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.1 line break
# immediately after start tags or immediately before end tags must
# be ignored, but real browsers only ignore a line break after a
# start tag, so we'll do that.
if data[0:2] == "\r\n":
data = data[2:]
elif data[0:1] in ["\n", "\r"]:
data = data[1:]
map[key] = data
else:
map[key] = map[key] + data
def do_button(self, attrs):
debug("%s", attrs)
d = {}
d["type"] = "submit" # default
for key, val in attrs:
d[key] = self.unescape_attr_if_required(val)
controls = self._current_form[2]
type = d["type"]
name = d.get("name")
# we don't want to lose information, so use a type string that
# doesn't clash with INPUT TYPE={SUBMIT,RESET,BUTTON}
# e.g. type for BUTTON/RESET is "resetbutton"
# (type for INPUT/RESET is "reset")
type = type+"button"
self._add_label(d)
controls.append((type, name, d))
def do_input(self, attrs):
debug("%s", attrs)
d = {}
d["type"] = "text" # default
for key, val in attrs:
d[key] = self.unescape_attr_if_required(val)
controls = self._current_form[2]
type = d["type"]
name = d.get("name")
self._add_label(d)
controls.append((type, name, d))
def do_isindex(self, attrs):
debug("%s", attrs)
d = {}
for key, val in attrs:
d[key] = self.unescape_attr_if_required(val)
controls = self._current_form[2]
self._add_label(d)
# isindex doesn't have type or name HTML attributes
controls.append(("isindex", None, d))
def handle_entityref(self, name):
#debug("%s", name)
self.handle_data(unescape(
'&%s;' % name, self._entitydefs, self._encoding))
def handle_charref(self, name):
#debug("%s", name)
self.handle_data(unescape_charref(name, self._encoding))
def unescape_attr(self, name):
#debug("%s", name)
return unescape(name, self._entitydefs, self._encoding)
def unescape_attrs(self, attrs):
#debug("%s", attrs)
escaped_attrs = {}
for key, val in attrs.items():
try:
val.items
except AttributeError:
escaped_attrs[key] = self.unescape_attr(val)
else:
# e.g. "__select" -- yuck!
escaped_attrs[key] = self.unescape_attrs(val)
return escaped_attrs
def unknown_entityref(self, ref): self.handle_data("&%s;" % ref)
def unknown_charref(self, ref): self.handle_data("%s;" % ref)
class XHTMLCompatibleFormParser(_AbstractFormParser, HTMLParser.HTMLParser):
"""Good for XHTML, bad for tolerance of incorrect HTML."""
# thanks to Michael Howitz for this!
def __init__(self, entitydefs=None, encoding=DEFAULT_ENCODING):
HTMLParser.HTMLParser.__init__(self)
_AbstractFormParser.__init__(self, entitydefs, encoding)
def feed(self, data):
try:
HTMLParser.HTMLParser.feed(self, data)
except HTMLParser.HTMLParseError, exc:
raise ParseError(exc)
def start_option(self, attrs):
_AbstractFormParser._start_option(self, attrs)
def end_option(self):
_AbstractFormParser._end_option(self)
def handle_starttag(self, tag, attrs):
try:
method = getattr(self, "start_" + tag)
except AttributeError:
try:
method = getattr(self, "do_" + tag)
except AttributeError:
pass # unknown tag
else:
method(attrs)
else:
method(attrs)
def handle_endtag(self, tag):
try:
method = getattr(self, "end_" + tag)
except AttributeError:
pass # unknown tag
else:
method()
def unescape(self, name):
# Use the entitydefs passed into constructor, not
# HTMLParser.HTMLParser's entitydefs.
return self.unescape_attr(name)
def unescape_attr_if_required(self, name):
return name # HTMLParser.HTMLParser already did it
def unescape_attrs_if_required(self, attrs):
return attrs # ditto
def close(self):
HTMLParser.HTMLParser.close(self)
self.end_body()
class _AbstractSgmllibParser(_AbstractFormParser):
def do_option(self, attrs):
_AbstractFormParser._start_option(self, attrs)
# we override this attr to decode hex charrefs
entity_or_charref = re.compile(
'&(?:([a-zA-Z][-.a-zA-Z0-9]*)|#(x?[0-9a-fA-F]+))(;?)')
def convert_entityref(self, name):
return unescape("&%s;" % name, self._entitydefs, self._encoding)
def convert_charref(self, name):
return unescape_charref("%s" % name, self._encoding)
def unescape_attr_if_required(self, name):
return name # sgmllib already did it
def unescape_attrs_if_required(self, attrs):
return attrs # ditto
class FormParser(_AbstractSgmllibParser, _sgmllib_copy.SGMLParser):
"""Good for tolerance of incorrect HTML, bad for XHTML."""
def __init__(self, entitydefs=None, encoding=DEFAULT_ENCODING):
_sgmllib_copy.SGMLParser.__init__(self)
_AbstractFormParser.__init__(self, entitydefs, encoding)
def feed(self, data):
try:
_sgmllib_copy.SGMLParser.feed(self, data)
except _sgmllib_copy.SGMLParseError, exc:
raise ParseError(exc)
def close(self):
_sgmllib_copy.SGMLParser.close(self)
self.end_body()
class _AbstractBSFormParser(_AbstractSgmllibParser):
bs_base_class = None
def __init__(self, entitydefs=None, encoding=DEFAULT_ENCODING):
_AbstractFormParser.__init__(self, entitydefs, encoding)
self.bs_base_class.__init__(self)
def handle_data(self, data):
_AbstractFormParser.handle_data(self, data)
self.bs_base_class.handle_data(self, data)
def feed(self, data):
try:
self.bs_base_class.feed(self, data)
except _sgmllib_copy.SGMLParseError, exc:
raise ParseError(exc)
def close(self):
self.bs_base_class.close(self)
self.end_body()
class RobustFormParser(_AbstractBSFormParser, _beautifulsoup.BeautifulSoup):
"""Tries to be highly tolerant of incorrect HTML."""
bs_base_class = _beautifulsoup.BeautifulSoup
class NestingRobustFormParser(_AbstractBSFormParser,
_beautifulsoup.ICantBelieveItsBeautifulSoup):
"""Tries to be highly tolerant of incorrect HTML.
Different from RobustFormParser in that it more often guesses nesting
above missing end tags (see BeautifulSoup docs).
"""
bs_base_class = _beautifulsoup.ICantBelieveItsBeautifulSoup
#FormParser = XHTMLCompatibleFormParser # testing hack
#FormParser = RobustFormParser # testing hack
def ParseResponseEx(response,
select_default=False,
form_parser_class=FormParser,
request_class=_request.Request,
entitydefs=None,
encoding=DEFAULT_ENCODING,
# private
_urljoin=urlparse.urljoin,
_urlparse=urlparse.urlparse,
_urlunparse=urlparse.urlunparse,
):
"""Identical to ParseResponse, except that:
1. The returned list contains an extra item. The first form in the list
contains all controls not contained in any FORM element.
2. The arguments ignore_errors and backwards_compat have been removed.
3. Backwards-compatibility mode (backwards_compat=True) is not available.
"""
return _ParseFileEx(response, response.geturl(),
select_default,
False,
form_parser_class,
request_class,
entitydefs,
False,
encoding,
_urljoin=_urljoin,
_urlparse=_urlparse,
_urlunparse=_urlunparse,
)
def ParseFileEx(file, base_uri,
select_default=False,
form_parser_class=FormParser,
request_class=_request.Request,
entitydefs=None,
encoding=DEFAULT_ENCODING,
# private
_urljoin=urlparse.urljoin,
_urlparse=urlparse.urlparse,
_urlunparse=urlparse.urlunparse,
):
"""Identical to ParseFile, except that:
1. The returned list contains an extra item. The first form in the list
contains all controls not contained in any FORM element.
2. The arguments ignore_errors and backwards_compat have been removed.
3. Backwards-compatibility mode (backwards_compat=True) is not available.
"""
return _ParseFileEx(file, base_uri,
select_default,
False,
form_parser_class,
request_class,
entitydefs,
False,
encoding,
_urljoin=_urljoin,
_urlparse=_urlparse,
_urlunparse=_urlunparse,
)
def ParseString(text, base_uri, *args, **kwds):
fh = StringIO(text)
return ParseFileEx(fh, base_uri, *args, **kwds)
def ParseResponse(response, *args, **kwds):
"""Parse HTTP response and return a list of HTMLForm instances.
The return value of mechanize.urlopen can be conveniently passed to this
function as the response parameter.
mechanize.ParseError is raised on parse errors.
response: file-like object (supporting read() method) with a method
geturl(), returning the URI of the HTTP response
select_default: for multiple-selection SELECT controls and RADIO controls,
pick the first item as the default if none are selected in the HTML
form_parser_class: class to instantiate and use to pass
request_class: class to return from .click() method (default is
mechanize.Request)
entitydefs: mapping like {"&": "&", ...} containing HTML entity
definitions (a sensible default is used)
encoding: character encoding used for encoding numeric character references
when matching link text. mechanize does not attempt to find the encoding
in a META HTTP-EQUIV attribute in the document itself (mechanize, for
example, does do that and will pass the correct value to mechanize using
this parameter).
backwards_compat: boolean that determines whether the returned HTMLForm
objects are backwards-compatible with old code. If backwards_compat is
true:
- ClientForm 0.1 code will continue to work as before.
- Label searches that do not specify a nr (number or count) will always
get the first match, even if other controls match. If
backwards_compat is False, label searches that have ambiguous results
will raise an AmbiguityError.
- Item label matching is done by strict string comparison rather than
substring matching.
- De-selecting individual list items is allowed even if the Item is
disabled.
The backwards_compat argument will be removed in a future release.
Pass a true value for select_default if you want the behaviour specified by
RFC 1866 (the HTML 2.0 standard), which is to select the first item in a
RADIO or multiple-selection SELECT control if none were selected in the
HTML. Most browsers (including Microsoft Internet Explorer (IE) and
Netscape Navigator) instead leave all items unselected in these cases. The
W3C HTML 4.0 standard leaves this behaviour undefined in the case of
multiple-selection SELECT controls, but insists that at least one RADIO
button should be checked at all times, in contradiction to browser
behaviour.
There is a choice of parsers. mechanize.XHTMLCompatibleFormParser (uses
HTMLParser.HTMLParser) works best for XHTML, mechanize.FormParser (uses
bundled copy of sgmllib.SGMLParser) (the default) works better for ordinary
grubby HTML. Note that HTMLParser is only available in Python 2.2 and
later. You can pass your own class in here as a hack to work around bad
HTML, but at your own risk: there is no well-defined interface.
"""
return _ParseFileEx(response, response.geturl(), *args, **kwds)[1:]
def ParseFile(file, base_uri, *args, **kwds):
"""Parse HTML and return a list of HTMLForm instances.
mechanize.ParseError is raised on parse errors.
file: file-like object (supporting read() method) containing HTML with zero
or more forms to be parsed
base_uri: the URI of the document (note that the base URI used to submit
the form will be that given in the BASE element if present, not that of
the document)
For the other arguments and further details, see ParseResponse.__doc__.
"""
return _ParseFileEx(file, base_uri, *args, **kwds)[1:]
def _ParseFileEx(file, base_uri,
select_default=False,
ignore_errors=False,
form_parser_class=FormParser,
request_class=_request.Request,
entitydefs=None,
backwards_compat=True,
encoding=DEFAULT_ENCODING,
_urljoin=urlparse.urljoin,
_urlparse=urlparse.urlparse,
_urlunparse=urlparse.urlunparse,
):
if backwards_compat:
deprecation("operating in backwards-compatibility mode", 1)
fp = form_parser_class(entitydefs, encoding)
while 1:
data = file.read(CHUNK)
try:
fp.feed(data)
except ParseError, e:
e.base_uri = base_uri
raise
if len(data) != CHUNK: break
fp.close()
if fp.base is not None:
# HTML BASE element takes precedence over document URI
base_uri = fp.base
labels = [] # Label(label) for label in fp.labels]
id_to_labels = {}
for l in fp.labels:
label = Label(l)
labels.append(label)
for_id = l["for"]
coll = id_to_labels.get(for_id)
if coll is None:
id_to_labels[for_id] = [label]
else:
coll.append(label)
forms = []
for (name, action, method, enctype), attrs, controls in fp.forms:
if action is None:
action = base_uri
else:
action = _urljoin(base_uri, action)
# would be nice to make HTMLForm class (form builder) pluggable
form = HTMLForm(
action, method, enctype, name, attrs, request_class,
forms, labels, id_to_labels, backwards_compat)
form._urlparse = _urlparse
form._urlunparse = _urlunparse
for ii in range(len(controls)):
type, name, attrs = controls[ii]
# index=ii*10 allows ImageControl to return multiple ordered pairs
form.new_control(
type, name, attrs, select_default=select_default, index=ii*10)
forms.append(form)
for form in forms:
form.fixup()
return forms
class Label:
def __init__(self, attrs):
self.id = attrs.get("for")
self._text = attrs.get("__text").strip()
self._ctext = compress_text(self._text)
self.attrs = attrs
self._backwards_compat = False # maintained by HTMLForm
def __getattr__(self, name):
if name == "text":
if self._backwards_compat:
return self._text
else:
return self._ctext
return getattr(Label, name)
def __setattr__(self, name, value):
if name == "text":
# don't see any need for this, so make it read-only
raise AttributeError("text attribute is read-only")
self.__dict__[name] = value
def __str__(self):
return "