urls.py 6.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203
  1. from __future__ import annotations
  2. import codecs
  3. import re
  4. import typing as t
  5. import urllib.parse
  6. from urllib.parse import quote
  7. from urllib.parse import unquote
  8. from urllib.parse import urlencode
  9. from urllib.parse import urlsplit
  10. from urllib.parse import urlunsplit
  11. from .datastructures import iter_multi_items
  12. def _codec_error_url_quote(e: UnicodeError) -> tuple[str, int]:
  13. """Used in :func:`uri_to_iri` after unquoting to re-quote any
  14. invalid bytes.
  15. """
  16. # the docs state that UnicodeError does have these attributes,
  17. # but mypy isn't picking them up
  18. out = quote(e.object[e.start : e.end], safe="") # type: ignore
  19. return out, e.end # type: ignore
  20. codecs.register_error("werkzeug.url_quote", _codec_error_url_quote)
  21. def _make_unquote_part(name: str, chars: str) -> t.Callable[[str], str]:
  22. """Create a function that unquotes all percent encoded characters except those
  23. given. This allows working with unquoted characters if possible while not changing
  24. the meaning of a given part of a URL.
  25. """
  26. choices = "|".join(f"{ord(c):02X}" for c in sorted(chars))
  27. pattern = re.compile(f"((?:%(?:{choices}))+)", re.I)
  28. def _unquote_partial(value: str) -> str:
  29. parts = iter(pattern.split(value))
  30. out = []
  31. for part in parts:
  32. out.append(unquote(part, "utf-8", "werkzeug.url_quote"))
  33. out.append(next(parts, ""))
  34. return "".join(out)
  35. _unquote_partial.__name__ = f"_unquote_{name}"
  36. return _unquote_partial
  37. # characters that should remain quoted in URL parts
  38. # based on https://url.spec.whatwg.org/#percent-encoded-bytes
  39. # always keep all controls, space, and % quoted
  40. _always_unsafe = bytes((*range(0x21), 0x25, 0x7F)).decode()
  41. _unquote_fragment = _make_unquote_part("fragment", _always_unsafe)
  42. _unquote_query = _make_unquote_part("query", _always_unsafe + "&=+#")
  43. _unquote_path = _make_unquote_part("path", _always_unsafe + "/?#")
  44. _unquote_user = _make_unquote_part("user", _always_unsafe + ":@/?#")
  45. def uri_to_iri(uri: str) -> str:
  46. """Convert a URI to an IRI. All valid UTF-8 characters are unquoted,
  47. leaving all reserved and invalid characters quoted. If the URL has
  48. a domain, it is decoded from Punycode.
  49. >>> uri_to_iri("http://xn--n3h.net/p%C3%A5th?q=%C3%A8ry%DF")
  50. 'http://\\u2603.net/p\\xe5th?q=\\xe8ry%DF'
  51. :param uri: The URI to convert.
  52. .. versionchanged:: 3.0
  53. Passing a tuple or bytes, and the ``charset`` and ``errors`` parameters,
  54. are removed.
  55. .. versionchanged:: 2.3
  56. Which characters remain quoted is specific to each part of the URL.
  57. .. versionchanged:: 0.15
  58. All reserved and invalid characters remain quoted. Previously,
  59. only some reserved characters were preserved, and invalid bytes
  60. were replaced instead of left quoted.
  61. .. versionadded:: 0.6
  62. """
  63. parts = urlsplit(uri)
  64. path = _unquote_path(parts.path)
  65. query = _unquote_query(parts.query)
  66. fragment = _unquote_fragment(parts.fragment)
  67. if parts.hostname:
  68. netloc = _decode_idna(parts.hostname)
  69. else:
  70. netloc = ""
  71. if ":" in netloc:
  72. netloc = f"[{netloc}]"
  73. if parts.port:
  74. netloc = f"{netloc}:{parts.port}"
  75. if parts.username:
  76. auth = _unquote_user(parts.username)
  77. if parts.password:
  78. password = _unquote_user(parts.password)
  79. auth = f"{auth}:{password}"
  80. netloc = f"{auth}@{netloc}"
  81. return urlunsplit((parts.scheme, netloc, path, query, fragment))
  82. def iri_to_uri(iri: str) -> str:
  83. """Convert an IRI to a URI. All non-ASCII and unsafe characters are
  84. quoted. If the URL has a domain, it is encoded to Punycode.
  85. >>> iri_to_uri('http://\\u2603.net/p\\xe5th?q=\\xe8ry%DF')
  86. 'http://xn--n3h.net/p%C3%A5th?q=%C3%A8ry%DF'
  87. :param iri: The IRI to convert.
  88. .. versionchanged:: 3.0
  89. Passing a tuple or bytes, the ``charset`` and ``errors`` parameters,
  90. and the ``safe_conversion`` parameter, are removed.
  91. .. versionchanged:: 2.3
  92. Which characters remain unquoted is specific to each part of the URL.
  93. .. versionchanged:: 0.15
  94. All reserved characters remain unquoted. Previously, only some reserved
  95. characters were left unquoted.
  96. .. versionchanged:: 0.9.6
  97. The ``safe_conversion`` parameter was added.
  98. .. versionadded:: 0.6
  99. """
  100. parts = urlsplit(iri)
  101. # safe = https://url.spec.whatwg.org/#url-path-segment-string
  102. # as well as percent for things that are already quoted
  103. path = quote(parts.path, safe="%!$&'()*+,/:;=@")
  104. query = quote(parts.query, safe="%!$&'()*+,/:;=?@")
  105. fragment = quote(parts.fragment, safe="%!#$&'()*+,/:;=?@")
  106. if parts.hostname:
  107. netloc = parts.hostname.encode("idna").decode("ascii")
  108. else:
  109. netloc = ""
  110. if ":" in netloc:
  111. netloc = f"[{netloc}]"
  112. if parts.port:
  113. netloc = f"{netloc}:{parts.port}"
  114. if parts.username:
  115. auth = quote(parts.username, safe="%!$&'()*+,;=")
  116. if parts.password:
  117. password = quote(parts.password, safe="%!$&'()*+,;=")
  118. auth = f"{auth}:{password}"
  119. netloc = f"{auth}@{netloc}"
  120. return urlunsplit((parts.scheme, netloc, path, query, fragment))
  121. # Python < 3.12
  122. # itms-services was worked around in previous iri_to_uri implementations, but
  123. # we can tell Python directly that it needs to preserve the //.
  124. if "itms-services" not in urllib.parse.uses_netloc:
  125. urllib.parse.uses_netloc.append("itms-services")
  126. def _decode_idna(domain: str) -> str:
  127. try:
  128. data = domain.encode("ascii")
  129. except UnicodeEncodeError:
  130. # If the domain is not ASCII, it's decoded already.
  131. return domain
  132. try:
  133. # Try decoding in one shot.
  134. return data.decode("idna")
  135. except UnicodeDecodeError:
  136. pass
  137. # Decode each part separately, leaving invalid parts as punycode.
  138. parts = []
  139. for part in data.split(b"."):
  140. try:
  141. parts.append(part.decode("idna"))
  142. except UnicodeDecodeError:
  143. parts.append(part.decode("ascii"))
  144. return ".".join(parts)
  145. def _urlencode(query: t.Mapping[str, str] | t.Iterable[tuple[str, str]]) -> str:
  146. items = [x for x in iter_multi_items(query) if x[1] is not None]
  147. # safe = https://url.spec.whatwg.org/#percent-encoded-bytes
  148. return urlencode(items, safe="!$'()*,/:;?@")