392 lines
		
	
	
		
			14 KiB
		
	
	
	
		
			Python
		
	
	
			
		
		
	
	
			392 lines
		
	
	
		
			14 KiB
		
	
	
	
		
			Python
		
	
	
| """A simple "pull API" for HTML parsing, after Perl's HTML::TokeParser.
 | |
| 
 | |
| Examples
 | |
| 
 | |
| This program extracts all links from a document.  It will print one
 | |
| line for each link, containing the URL and the textual description
 | |
| between the <A>...</A> tags:
 | |
| 
 | |
| import pullparser, sys
 | |
| f = file(sys.argv[1])
 | |
| p = pullparser.PullParser(f)
 | |
| for token in p.tags("a"):
 | |
|     if token.type == "endtag": continue
 | |
|     url = dict(token.attrs).get("href", "-")
 | |
|     text = p.get_compressed_text(endat=("endtag", "a"))
 | |
|     print "%s\t%s" % (url, text)
 | |
| 
 | |
| This program extracts the <TITLE> from the document:
 | |
| 
 | |
| import pullparser, sys
 | |
| f = file(sys.argv[1])
 | |
| p = pullparser.PullParser(f)
 | |
| if p.get_tag("title"):
 | |
|     title = p.get_compressed_text()
 | |
|     print "Title: %s" % title
 | |
| 
 | |
| 
 | |
| Copyright 2003-2006 John J. Lee <jjl@pobox.com>
 | |
| Copyright 1998-2001 Gisle Aas (original libwww-perl code)
 | |
| 
 | |
| This code is free software; you can redistribute it and/or modify it
 | |
| under the terms of the BSD or ZPL 2.1 licenses.
 | |
| 
 | |
| """
 | |
| 
 | |
| import re, htmlentitydefs
 | |
| import _sgmllib_copy as sgmllib
 | |
| import HTMLParser
 | |
| from xml.sax import saxutils
 | |
| 
 | |
| from _html import unescape, unescape_charref
 | |
| 
 | |
| 
 | |
| class NoMoreTokensError(Exception): pass
 | |
| 
 | |
| class Token:
 | |
|     """Represents an HTML tag, declaration, processing instruction etc.
 | |
| 
 | |
|     Behaves as both a tuple-like object (ie. iterable) and has attributes
 | |
|     .type, .data and .attrs.
 | |
| 
 | |
|     >>> t = Token("starttag", "a", [("href", "http://www.python.org/")])
 | |
|     >>> t == ("starttag", "a", [("href", "http://www.python.org/")])
 | |
|     True
 | |
|     >>> (t.type, t.data) == ("starttag", "a")
 | |
|     True
 | |
|     >>> t.attrs == [("href", "http://www.python.org/")]
 | |
|     True
 | |
| 
 | |
|     Public attributes
 | |
| 
 | |
|     type: one of "starttag", "endtag", "startendtag", "charref", "entityref",
 | |
|      "data", "comment", "decl", "pi", after the corresponding methods of
 | |
|      HTMLParser.HTMLParser
 | |
|     data: For a tag, the tag name; otherwise, the relevant data carried by the
 | |
|      tag, as a string
 | |
|     attrs: list of (name, value) pairs representing HTML attributes
 | |
|      (or None if token does not represent an opening tag)
 | |
| 
 | |
|     """
 | |
|     def __init__(self, type, data, attrs=None):
 | |
|         self.type = type
 | |
|         self.data = data
 | |
|         self.attrs = attrs
 | |
|     def __iter__(self):
 | |
|         return iter((self.type, self.data, self.attrs))
 | |
|     def __eq__(self, other):
 | |
|         type, data, attrs = other
 | |
|         if (self.type == type and
 | |
|             self.data == data and
 | |
|             self.attrs == attrs):
 | |
|             return True
 | |
|         else:
 | |
|             return False
 | |
|     def __ne__(self, other): return not self.__eq__(other)
 | |
|     def __repr__(self):
 | |
|         args = ", ".join(map(repr, [self.type, self.data, self.attrs]))
 | |
|         return self.__class__.__name__+"(%s)" % args
 | |
| 
 | |
|     def __str__(self):
 | |
|         """
 | |
|         >>> print Token("starttag", "br")
 | |
|         <br>
 | |
|         >>> print Token("starttag", "a",
 | |
|         ...     [("href", "http://www.python.org/"), ("alt", '"foo"')])
 | |
|         <a href="http://www.python.org/" alt='"foo"'>
 | |
|         >>> print Token("startendtag", "br")
 | |
|         <br />
 | |
|         >>> print Token("startendtag", "br", [("spam", "eggs")])
 | |
|         <br spam="eggs" />
 | |
|         >>> print Token("endtag", "p")
 | |
|         </p>
 | |
|         >>> print Token("charref", "38")
 | |
|         &
 | |
|         >>> print Token("entityref", "amp")
 | |
|         &
 | |
|         >>> print Token("data", "foo\\nbar")
 | |
|         foo
 | |
|         bar
 | |
|         >>> print Token("comment", "Life is a bowl\\nof cherries.")
 | |
|         <!--Life is a bowl
 | |
|         of cherries.-->
 | |
|         >>> print Token("decl", "decl")
 | |
|         <!decl>
 | |
|         >>> print Token("pi", "pi")
 | |
|         <?pi>
 | |
|         """
 | |
|         if self.attrs is not None:
 | |
|             attrs = "".join([" %s=%s" % (k, saxutils.quoteattr(v)) for
 | |
|                              k, v in self.attrs])
 | |
|         else:
 | |
|             attrs = ""
 | |
|         if self.type == "starttag":
 | |
|             return "<%s%s>" % (self.data, attrs)
 | |
|         elif self.type == "startendtag":
 | |
|             return "<%s%s />" % (self.data, attrs)
 | |
|         elif self.type == "endtag":
 | |
|             return "</%s>" % self.data
 | |
|         elif self.type == "charref":
 | |
|             return "&#%s;" % self.data
 | |
|         elif self.type == "entityref":
 | |
|             return "&%s;" % self.data
 | |
|         elif self.type == "data":
 | |
|             return self.data
 | |
|         elif self.type == "comment":
 | |
|             return "<!--%s-->" % self.data
 | |
|         elif self.type == "decl":
 | |
|             return "<!%s>" % self.data
 | |
|         elif self.type == "pi":
 | |
|             return "<?%s>" % self.data
 | |
|         assert False
 | |
| 
 | |
| 
 | |
| def iter_until_exception(fn, exception, *args, **kwds):
 | |
|     while 1:
 | |
|         try:
 | |
|             yield fn(*args, **kwds)
 | |
|         except exception:
 | |
|             raise StopIteration
 | |
| 
 | |
| 
 | |
| class _AbstractParser:
 | |
|     chunk = 1024
 | |
|     compress_re = re.compile(r"\s+")
 | |
|     def __init__(self, fh, textify={"img": "alt", "applet": "alt"},
 | |
|                  encoding="ascii", entitydefs=None):
 | |
|         """
 | |
|         fh: file-like object (only a .read() method is required) from which to
 | |
|          read HTML to be parsed
 | |
|         textify: mapping used by .get_text() and .get_compressed_text() methods
 | |
|          to represent opening tags as text
 | |
|         encoding: encoding used to encode numeric character references by
 | |
|          .get_text() and .get_compressed_text() ("ascii" by default)
 | |
| 
 | |
|         entitydefs: mapping like {"amp": "&", ...} containing HTML entity
 | |
|          definitions (a sensible default is used).  This is used to unescape
 | |
|          entities in .get_text() (and .get_compressed_text()) and attribute
 | |
|          values.  If the encoding can not represent the character, the entity
 | |
|          reference is left unescaped.  Note that entity references (both
 | |
|          numeric - e.g. { or ઼ - and non-numeric - e.g. &) are
 | |
|          unescaped in attribute values and the return value of .get_text(), but
 | |
|          not in data outside of tags.  Instead, entity references outside of
 | |
|          tags are represented as tokens.  This is a bit odd, it's true :-/
 | |
| 
 | |
|         If the element name of an opening tag matches a key in the textify
 | |
|         mapping then that tag is converted to text.  The corresponding value is
 | |
|         used to specify which tag attribute to obtain the text from.  textify
 | |
|         maps from element names to either:
 | |
| 
 | |
|           - an HTML attribute name, in which case the HTML attribute value is
 | |
|             used as its text value along with the element name in square
 | |
|             brackets (e.g. "alt text goes here[IMG]", or, if the alt attribute
 | |
|             were missing, just "[IMG]")
 | |
|           - a callable object (e.g. a function) which takes a Token and returns
 | |
|             the string to be used as its text value
 | |
| 
 | |
|         If textify has no key for an element name, nothing is substituted for
 | |
|         the opening tag.
 | |
| 
 | |
|         Public attributes:
 | |
| 
 | |
|         encoding and textify: see above
 | |
| 
 | |
|         """
 | |
|         self._fh = fh
 | |
|         self._tokenstack = []  # FIFO
 | |
|         self.textify = textify
 | |
|         self.encoding = encoding
 | |
|         if entitydefs is None:
 | |
|             entitydefs = htmlentitydefs.name2codepoint
 | |
|         self._entitydefs = entitydefs
 | |
| 
 | |
|     def __iter__(self): return self
 | |
| 
 | |
|     def tags(self, *names):
 | |
|         return iter_until_exception(self.get_tag, NoMoreTokensError, *names)
 | |
| 
 | |
|     def tokens(self, *tokentypes):
 | |
|         return iter_until_exception(self.get_token, NoMoreTokensError,
 | |
|                                     *tokentypes)
 | |
| 
 | |
|     def next(self):
 | |
|         try:
 | |
|             return self.get_token()
 | |
|         except NoMoreTokensError:
 | |
|             raise StopIteration()
 | |
| 
 | |
|     def get_token(self, *tokentypes):
 | |
|         """Pop the next Token object from the stack of parsed tokens.
 | |
| 
 | |
|         If arguments are given, they are taken to be token types in which the
 | |
|         caller is interested: tokens representing other elements will be
 | |
|         skipped.  Element names must be given in lower case.
 | |
| 
 | |
|         Raises NoMoreTokensError.
 | |
| 
 | |
|         """
 | |
|         while 1:
 | |
|             while self._tokenstack:
 | |
|                 token = self._tokenstack.pop(0)
 | |
|                 if tokentypes:
 | |
|                     if token.type in tokentypes:
 | |
|                         return token
 | |
|                 else:
 | |
|                     return token
 | |
|             data = self._fh.read(self.chunk)
 | |
|             if not data:
 | |
|                 raise NoMoreTokensError()
 | |
|             self.feed(data)
 | |
| 
 | |
|     def unget_token(self, token):
 | |
|         """Push a Token back onto the stack."""
 | |
|         self._tokenstack.insert(0, token)
 | |
| 
 | |
|     def get_tag(self, *names):
 | |
|         """Return the next Token that represents an opening or closing tag.
 | |
| 
 | |
|         If arguments are given, they are taken to be element names in which the
 | |
|         caller is interested: tags representing other elements will be skipped.
 | |
|         Element names must be given in lower case.
 | |
| 
 | |
|         Raises NoMoreTokensError.
 | |
| 
 | |
|         """
 | |
|         while 1:
 | |
|             tok = self.get_token()
 | |
|             if tok.type not in ["starttag", "endtag", "startendtag"]:
 | |
|                 continue
 | |
|             if names:
 | |
|                 if tok.data in names:
 | |
|                     return tok
 | |
|             else:
 | |
|                 return tok
 | |
| 
 | |
|     def get_text(self, endat=None):
 | |
|         """Get some text.
 | |
| 
 | |
|         endat: stop reading text at this tag (the tag is included in the
 | |
|          returned text); endtag is a tuple (type, name) where type is
 | |
|          "starttag", "endtag" or "startendtag", and name is the element name of
 | |
|          the tag (element names must be given in lower case)
 | |
| 
 | |
|         If endat is not given, .get_text() will stop at the next opening or
 | |
|         closing tag, or when there are no more tokens (no exception is raised).
 | |
|         Note that .get_text() includes the text representation (if any) of the
 | |
|         opening tag, but pushes the opening tag back onto the stack.  As a
 | |
|         result, if you want to call .get_text() again, you need to call
 | |
|         .get_tag() first (unless you want an empty string returned when you
 | |
|         next call .get_text()).
 | |
| 
 | |
|         Entity references are translated using the value of the entitydefs
 | |
|         constructor argument (a mapping from names to characters like that
 | |
|         provided by the standard module htmlentitydefs).  Named entity
 | |
|         references that are not in this mapping are left unchanged.
 | |
| 
 | |
|         The textify attribute is used to translate opening tags into text: see
 | |
|         the class docstring.
 | |
| 
 | |
|         """
 | |
|         text = []
 | |
|         tok = None
 | |
|         while 1:
 | |
|             try:
 | |
|                 tok = self.get_token()
 | |
|             except NoMoreTokensError:
 | |
|                 # unget last token (not the one we just failed to get)
 | |
|                 if tok: self.unget_token(tok)
 | |
|                 break
 | |
|             if tok.type == "data":
 | |
|                 text.append(tok.data)
 | |
|             elif tok.type == "entityref":
 | |
|                 t = unescape("&%s;"%tok.data, self._entitydefs, self.encoding)
 | |
|                 text.append(t)
 | |
|             elif tok.type == "charref":
 | |
|                 t = unescape_charref(tok.data, self.encoding)
 | |
|                 text.append(t)
 | |
|             elif tok.type in ["starttag", "endtag", "startendtag"]:
 | |
|                 tag_name = tok.data
 | |
|                 if tok.type in ["starttag", "startendtag"]:
 | |
|                     alt = self.textify.get(tag_name)
 | |
|                     if alt is not None:
 | |
|                         if callable(alt):
 | |
|                             text.append(alt(tok))
 | |
|                         elif tok.attrs is not None:
 | |
|                             for k, v in tok.attrs:
 | |
|                                 if k == alt:
 | |
|                                     text.append(v)
 | |
|                             text.append("[%s]" % tag_name.upper())
 | |
|                 if endat is None or endat == (tok.type, tag_name):
 | |
|                     self.unget_token(tok)
 | |
|                     break
 | |
|         return "".join(text)
 | |
| 
 | |
|     def get_compressed_text(self, *args, **kwds):
 | |
|         """
 | |
|         As .get_text(), but collapses each group of contiguous whitespace to a
 | |
|         single space character, and removes all initial and trailing
 | |
|         whitespace.
 | |
| 
 | |
|         """
 | |
|         text = self.get_text(*args, **kwds)
 | |
|         text = text.strip()
 | |
|         return self.compress_re.sub(" ", text)
 | |
| 
 | |
|     def handle_startendtag(self, tag, attrs):
 | |
|         self._tokenstack.append(Token("startendtag", tag, attrs))
 | |
|     def handle_starttag(self, tag, attrs):
 | |
|         self._tokenstack.append(Token("starttag", tag, attrs))
 | |
|     def handle_endtag(self, tag):
 | |
|         self._tokenstack.append(Token("endtag", tag))
 | |
|     def handle_charref(self, name):
 | |
|         self._tokenstack.append(Token("charref", name))
 | |
|     def handle_entityref(self, name):
 | |
|         self._tokenstack.append(Token("entityref", name))
 | |
|     def handle_data(self, data):
 | |
|         self._tokenstack.append(Token("data", data))
 | |
|     def handle_comment(self, data):
 | |
|         self._tokenstack.append(Token("comment", data))
 | |
|     def handle_decl(self, decl):
 | |
|         self._tokenstack.append(Token("decl", decl))
 | |
|     def unknown_decl(self, data):
 | |
|         # XXX should this call self.error instead?
 | |
|         #self.error("unknown declaration: " + `data`)
 | |
|         self._tokenstack.append(Token("decl", data))
 | |
|     def handle_pi(self, data):
 | |
|         self._tokenstack.append(Token("pi", data))
 | |
| 
 | |
|     def unescape_attr(self, name):
 | |
|         return unescape(name, self._entitydefs, self.encoding)
 | |
|     def unescape_attrs(self, attrs):
 | |
|         escaped_attrs = []
 | |
|         for key, val in attrs:
 | |
|             escaped_attrs.append((key, self.unescape_attr(val)))
 | |
|         return escaped_attrs
 | |
| 
 | |
| class PullParser(_AbstractParser, HTMLParser.HTMLParser):
 | |
|     def __init__(self, *args, **kwds):
 | |
|         HTMLParser.HTMLParser.__init__(self)
 | |
|         _AbstractParser.__init__(self, *args, **kwds)
 | |
|     def unescape(self, name):
 | |
|         # Use the entitydefs passed into constructor, not
 | |
|         # HTMLParser.HTMLParser's entitydefs.
 | |
|         return self.unescape_attr(name)
 | |
| 
 | |
| class TolerantPullParser(_AbstractParser, sgmllib.SGMLParser):
 | |
|     def __init__(self, *args, **kwds):
 | |
|         sgmllib.SGMLParser.__init__(self)
 | |
|         _AbstractParser.__init__(self, *args, **kwds)
 | |
|     def unknown_starttag(self, tag, attrs):
 | |
|         attrs = self.unescape_attrs(attrs)
 | |
|         self._tokenstack.append(Token("starttag", tag, attrs))
 | |
|     def unknown_endtag(self, tag):
 | |
|         self._tokenstack.append(Token("endtag", tag))
 | |
| 
 | |
| 
 | |
| def _test():
 | |
|    import doctest, _pullparser
 | |
|    return doctest.testmod(_pullparser)
 | |
| 
 | |
| if __name__ == "__main__":
 | |
|    _test()
 |