246 lines
7.5 KiB
Python
246 lines
7.5 KiB
Python
"""RFC 3986 URI parsing and relative reference resolution / absolutization.
|
|
|
|
(aka splitting and joining)
|
|
|
|
Copyright 2006 John J. Lee <jjl@pobox.com>
|
|
|
|
This code is free software; you can redistribute it and/or modify it under
|
|
the terms of the BSD or ZPL 2.1 licenses (see the file COPYING.txt
|
|
included with the distribution).
|
|
|
|
"""
|
|
|
|
# XXX Wow, this is ugly. Overly-direct translation of the RFC ATM.
|
|
|
|
import re, urllib
|
|
|
|
## def chr_range(a, b):
|
|
## return "".join(map(chr, range(ord(a), ord(b)+1)))
|
|
|
|
## UNRESERVED_URI_CHARS = ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"
|
|
## "abcdefghijklmnopqrstuvwxyz"
|
|
## "0123456789"
|
|
## "-_.~")
|
|
## RESERVED_URI_CHARS = "!*'();:@&=+$,/?#[]"
|
|
## URI_CHARS = RESERVED_URI_CHARS+UNRESERVED_URI_CHARS+'%'
|
|
# this re matches any character that's not in URI_CHARS
|
|
BAD_URI_CHARS_RE = re.compile("[^A-Za-z0-9\-_.~!*'();:@&=+$,/?%#[\]]")
|
|
|
|
|
|
def clean_url(url, encoding):
|
|
# percent-encode illegal URI characters
|
|
# Trying to come up with test cases for this gave me a headache, revisit
|
|
# when do switch to unicode.
|
|
# Somebody else's comments (lost the attribution):
|
|
## - IE will return you the url in the encoding you send it
|
|
## - Mozilla/Firefox will send you latin-1 if there's no non latin-1
|
|
## characters in your link. It will send you utf-8 however if there are...
|
|
if type(url) == type(""):
|
|
url = url.decode(encoding, "replace")
|
|
url = url.strip()
|
|
# for second param to urllib.quote(), we want URI_CHARS, minus the
|
|
# 'always_safe' characters that urllib.quote() never percent-encodes
|
|
return urllib.quote(url.encode(encoding), "!*'();:@&=+$,/?%#[]~")
|
|
|
|
def is_clean_uri(uri):
|
|
"""
|
|
>>> is_clean_uri("ABC!")
|
|
True
|
|
>>> is_clean_uri(u"ABC!")
|
|
True
|
|
>>> is_clean_uri("ABC|")
|
|
False
|
|
>>> is_clean_uri(u"ABC|")
|
|
False
|
|
>>> is_clean_uri("http://example.com/0")
|
|
True
|
|
>>> is_clean_uri(u"http://example.com/0")
|
|
True
|
|
"""
|
|
# note module re treats bytestrings as through they were decoded as latin-1
|
|
# so this function accepts both unicode and bytestrings
|
|
return not bool(BAD_URI_CHARS_RE.search(uri))
|
|
|
|
|
|
SPLIT_MATCH = re.compile(
|
|
r"^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?").match
|
|
def urlsplit(absolute_uri):
|
|
"""Return scheme, authority, path, query, fragment."""
|
|
match = SPLIT_MATCH(absolute_uri)
|
|
if match:
|
|
g = match.groups()
|
|
return g[1], g[3], g[4], g[6], g[8]
|
|
|
|
def urlunsplit(parts):
|
|
scheme, authority, path, query, fragment = parts
|
|
r = []
|
|
append = r.append
|
|
if scheme is not None:
|
|
append(scheme)
|
|
append(":")
|
|
if authority is not None:
|
|
append("//")
|
|
append(authority)
|
|
append(path)
|
|
if query is not None:
|
|
append("?")
|
|
append(query)
|
|
if fragment is not None:
|
|
append("#")
|
|
append(fragment)
|
|
return "".join(r)
|
|
|
|
def urljoin(base_uri, uri_reference):
|
|
"""Join a base URI with a URI reference and return the resulting URI.
|
|
|
|
See RFC 3986.
|
|
"""
|
|
return urlunsplit(urljoin_parts(urlsplit(base_uri),
|
|
urlsplit(uri_reference)))
|
|
|
|
# oops, this doesn't do the same thing as the literal translation
|
|
# from the RFC below
|
|
## import posixpath
|
|
## def urljoin_parts(base_parts, reference_parts):
|
|
## scheme, authority, path, query, fragment = base_parts
|
|
## rscheme, rauthority, rpath, rquery, rfragment = reference_parts
|
|
|
|
## # compute target URI path
|
|
## if rpath == "":
|
|
## tpath = path
|
|
## else:
|
|
## tpath = rpath
|
|
## if not tpath.startswith("/"):
|
|
## tpath = merge(authority, path, tpath)
|
|
## tpath = posixpath.normpath(tpath)
|
|
|
|
## if rscheme is not None:
|
|
## return (rscheme, rauthority, tpath, rquery, rfragment)
|
|
## elif rauthority is not None:
|
|
## return (scheme, rauthority, tpath, rquery, rfragment)
|
|
## elif rpath == "":
|
|
## if rquery is not None:
|
|
## tquery = rquery
|
|
## else:
|
|
## tquery = query
|
|
## return (scheme, authority, tpath, tquery, rfragment)
|
|
## else:
|
|
## return (scheme, authority, tpath, rquery, rfragment)
|
|
|
|
def urljoin_parts(base_parts, reference_parts):
|
|
scheme, authority, path, query, fragment = base_parts
|
|
rscheme, rauthority, rpath, rquery, rfragment = reference_parts
|
|
|
|
if rscheme == scheme:
|
|
rscheme = None
|
|
|
|
if rscheme is not None:
|
|
tscheme, tauthority, tpath, tquery = (
|
|
rscheme, rauthority, remove_dot_segments(rpath), rquery)
|
|
else:
|
|
if rauthority is not None:
|
|
tauthority, tpath, tquery = (
|
|
rauthority, remove_dot_segments(rpath), rquery)
|
|
else:
|
|
if rpath == "":
|
|
tpath = path
|
|
if rquery is not None:
|
|
tquery = rquery
|
|
else:
|
|
tquery = query
|
|
else:
|
|
if rpath.startswith("/"):
|
|
tpath = remove_dot_segments(rpath)
|
|
else:
|
|
tpath = merge(authority, path, rpath)
|
|
tpath = remove_dot_segments(tpath)
|
|
tquery = rquery
|
|
tauthority = authority
|
|
tscheme = scheme
|
|
tfragment = rfragment
|
|
return (tscheme, tauthority, tpath, tquery, tfragment)
|
|
|
|
# um, something *vaguely* like this is what I want, but I have to generate
|
|
# lots of test cases first, if only to understand what it is that
|
|
# remove_dot_segments really does...
|
|
## def remove_dot_segments(path):
|
|
## if path == '':
|
|
## return ''
|
|
## comps = path.split('/')
|
|
## new_comps = []
|
|
## for comp in comps:
|
|
## if comp in ['.', '']:
|
|
## if not new_comps or new_comps[-1]:
|
|
## new_comps.append('')
|
|
## continue
|
|
## if comp != '..':
|
|
## new_comps.append(comp)
|
|
## elif new_comps:
|
|
## new_comps.pop()
|
|
## return '/'.join(new_comps)
|
|
|
|
|
|
def remove_dot_segments(path):
|
|
r = []
|
|
while path:
|
|
# A
|
|
if path.startswith("../"):
|
|
path = path[3:]
|
|
continue
|
|
if path.startswith("./"):
|
|
path = path[2:]
|
|
continue
|
|
# B
|
|
if path.startswith("/./"):
|
|
path = path[2:]
|
|
continue
|
|
if path == "/.":
|
|
path = "/"
|
|
continue
|
|
# C
|
|
if path.startswith("/../"):
|
|
path = path[3:]
|
|
if r:
|
|
r.pop()
|
|
continue
|
|
if path == "/..":
|
|
path = "/"
|
|
if r:
|
|
r.pop()
|
|
continue
|
|
# D
|
|
if path == ".":
|
|
path = path[1:]
|
|
continue
|
|
if path == "..":
|
|
path = path[2:]
|
|
continue
|
|
# E
|
|
start = 0
|
|
if path.startswith("/"):
|
|
start = 1
|
|
ii = path.find("/", start)
|
|
if ii < 0:
|
|
ii = None
|
|
r.append(path[:ii])
|
|
if ii is None:
|
|
break
|
|
path = path[ii:]
|
|
return "".join(r)
|
|
|
|
def merge(base_authority, base_path, ref_path):
|
|
# XXXX Oddly, the sample Perl implementation of this by Roy Fielding
|
|
# doesn't even take base_authority as a parameter, despite the wording in
|
|
# the RFC suggesting otherwise. Perhaps I'm missing some obvious identity.
|
|
#if base_authority is not None and base_path == "":
|
|
if base_path == "":
|
|
return "/" + ref_path
|
|
ii = base_path.rfind("/")
|
|
if ii >= 0:
|
|
return base_path[:ii+1] + ref_path
|
|
return ref_path
|
|
|
|
if __name__ == "__main__":
|
|
import doctest
|
|
doctest.testmod()
|