# -*- coding: utf-8 -*-
import pytest

from boltons import urlutils
from boltons.urlutils import URL, _URL_RE, find_all_links

try:
    unicode
except NameError:
    unicode = str


# fully quoted urls that should round trip
TEST_URLS = [
    'http://googlewebsite.com/e-shops.aspx',
    'http://example.com:8080/search?q=123&business=Nothing%20Special',
    'http://hatnote.com:9000?arg=1&arg=2&arg=3',
    'https://xn--bcher-kva.ch',
    'http://xn--ggbla1c4e.xn--ngbc5azd/',
    'http://tools.ietf.org/html/rfc3986#section-3.4',
    'http://wiki:pedia@hatnote.com',
    'ftp://ftp.rfc-editor.org/in-notes/tar/RFCs0001-0500.tar.gz',
    'http://[1080:0:0:0:8:800:200C:417A]/index.html',
    'ssh://192.0.2.16:2222/',
    'https://[::101.45.75.219]:80/?hi=bye',
    'ldap://[::192.9.5.5]/dc=example,dc=com??sub?(sn=Jensen)',
    'mailto:me@example.com?to=me@example.com&body=hi%20http://wikipedia.org',
    'news:alt.rec.motorcycle',
    'tel:+1-800-867-5309',
    'urn:oasis:member:A00024:x',
    ('magnet:?xt=urn:btih:1a42b9e04e122b97a5254e3df77ab3c4b7da725f&dn=Puppy%'
     '20Linux%20precise-5.7.1.iso&tr=udp://tracker.openbittorrent.com:80&'
     'tr=udp://tracker.publicbt.com:80&tr=udp://tracker.istole.it:6969&'
     'tr=udp://tracker.ccc.de:80&tr=udp://open.demonii.com:1337'),

    # from twisted:
    "http://localhost",
    "http://localhost/",
    "http://localhost/foo",
    "http://localhost/foo/",
    "http://localhost/foo!!bar/",
    "http://localhost/foo%20bar/",
    "http://localhost/foo%2Fbar/",
    "http://localhost/foo?n",
    "http://localhost/foo?n=v",
    "http://localhost/foo?n=/a/b",
    "http://example.com/foo!@$bar?b!@z=123",
    "http://localhost/asd?a=asd%20sdf/345",
    "http://(%2525)/(%2525)?(%2525)&(%2525)=(%2525)#(%2525)",
    "http://(%C3%A9)/(%C3%A9)?(%C3%A9)&(%C3%A9)=(%C3%A9)#(%C3%A9)"
]


@pytest.fixture(scope="module", params=TEST_URLS)
def test_url(request):
    param = request.param
    return param


def test_regex(test_url):
    match = _URL_RE.match(test_url)
    assert match.groupdict()


def test_roundtrip(test_url):
    result = URL(test_url).to_text(full_quote=True)
    assert test_url == result


def test_basic():
    u1 = URL('http://googlewebsite.com/e-shops.aspx')
    assert isinstance(u1.to_text(), unicode)
    assert u1.host == 'googlewebsite.com'


def test_utf8_url():
    url_bytes = (b'http://\xd9\x85\xd8\xab\xd8\xa7\xd9\x84'
                 b'.\xd8\xa2\xd8\xb2\xd9\x85\xd8\xa7'
                 b'\xdb\x8c\xd8\xb4\xdb\x8c')
    url = URL(url_bytes)
    assert url.scheme == 'http'
    assert url.host == u'مثال.آزمایشی'


def test_idna():
    u1 = URL(u'http://bücher.ch')
    assert u1.host == u'bücher.ch'
    assert u1.to_text(full_quote=True) == 'http://xn--bcher-kva.ch'
    assert u1.to_text(full_quote=False) == u'http://bücher.ch'

    u2 = URL('https://xn--bcher-kva.ch')
    assert u2.host == u'bücher.ch'
    assert u2.to_text(full_quote=True) == 'https://xn--bcher-kva.ch'
    assert u2.to_text(full_quote=False) == u'https://bücher.ch'
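

# Supplementary sketch, not part of the original suite: the IDNA assertions
# above rely on standard punycode encoding, which Python's built-in 'idna'
# codec also performs, label by label.
def test_idna_codec_sketch():
    assert u'bücher.ch'.encode('idna') == b'xn--bcher-kva.ch'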


def test_query_params(test_url):
    url_obj = URL(test_url)
    if not url_obj.query_params or url_obj.fragment:
        return
    qp_text = url_obj.query_params.to_text(full_quote=True)
    assert test_url.endswith(qp_text)


def test_iri_query():
    url = URL(u'http://minerals.mountain.ore/?rock=\N{SHAMROCK}')
    assert url.query_params['rock'] == u'\N{SHAMROCK}'
    assert url.query_params.to_text(full_quote=True).endswith(u'%E2%98%98')


def test_iri_path():
    url = URL(u'http://minerals.mountain.ore/rock/\N{SHAMROCK}/')
    assert url.path == u'/rock/\N{SHAMROCK}/'
    assert url.to_text(full_quote=True).endswith('%E2%98%98/')


def test_url_copy():
    url = URL('http://example.com/foo?bar=baz')
    url_copy = URL(url)
    assert url == url_copy


def test_invalid_port():
    with pytest.raises(ValueError):
        URL('http://reader.googlewebsite.com:neverforget')


def test_invalid_ipv6():
    invalid_ipv6_ips = ['2001::0234:C1ab::A0:aabc:003F',
                        '2001::1::3F']
    for ip in invalid_ipv6_ips:
        with pytest.raises(ValueError):
            URL('http://[' + ip + ']')


def test_parse_url():
    expected = {'family': 2, 'password': None, 'fragment': None,
                'authority': u'127.0.0.1:3000', 'port': 3000, 'query': u'a=1',
                '_netloc_sep': u'//', 'path': u'/', 'scheme': u'http',
                'host': u'127.0.0.1', 'username': None}
    res = urlutils.parse_url('http://127.0.0.1:3000/?a=1')
    assert res == expected


def test_parse_equals_in_qp_value():
    u = URL('http://localhost/?=x=x=x')
    assert u.qp[''] == 'x=x=x'
    assert u.to_text() == 'http://localhost/?=x%3Dx%3Dx'

    u = URL('http://localhost/?foo=x=x=x&bar=y')
    assert u.qp['foo'] == 'x=x=x'
    assert u.qp['bar'] == 'y'
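

# Supplementary sketch, not part of the original suite: query_params (aliased
# as .qp) is documented as an ordered multidict, so repeated keys should be
# retrievable with getlist(); the method name is taken from boltons'
# OrderedMultiDict and is an assumption here.
def test_qp_multivalue_sketch():
    u = URL('http://hatnote.com:9000?arg=1&arg=2&arg=3')
    assert u.qp.getlist('arg') == ['1', '2', '3']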


def test_identical_equal():
    u = URL('http://example.com/path?query=param#frag')
    assert u == u


def test_equal():
    u = URL('http://example.com/path?query=param#frag')
    bono = URL('http://example.com/path?query=param#frag')
    assert bono == u


def test_not_equal():
    u = URL('http://example.com/path?query=param1#frag')
    bono = URL('http://example.com/path?query=param2#frag')
    assert bono != u


def _test_bad_utf8():  # not part of the API
    bad_bin_url = 'http://xn--9ca.com/%00%FF/%C3%A9'
    url = URL(bad_bin_url)
    expected = ('http://\N{LATIN SMALL LETTER E WITH ACUTE}.com/'
                '%00%FF/'
                '\N{LATIN SMALL LETTER E WITH ACUTE}')
    actual = url.to_text()
    assert expected == actual


def test_userinfo():
    url = URL('http://someuser:somepassword@example.com/some-segment@ignore')
    assert url.username == 'someuser'
    assert url.password == 'somepassword'
    assert url.to_text() == 'http://someuser:somepassword@example.com/some-segment@ignore'


def test_quoted_userinfo():
    url = URL('http://wikipedia.org')
    url.username = u'user'
    url.password = u'p@ss'
    assert url.to_text(full_quote=True) == 'http://user:p%40ss@wikipedia.org'

    url = URL(u'http://beyonc\xe9:b\xe9b@tmp.com')
    # assert url.to_text(full_quote=False) == u'http://beyoncé:b%C3%A9b@tmp.com'
    assert url.to_text(full_quote=True) == u'http://beyonc%C3%A9:b%C3%A9b@tmp.com'


def test_mailto():
    mt = 'mailto:mahmoud@hatnote.com'
    url = URL(mt)
    assert url.scheme == 'mailto'
    assert url.to_text() == mt


# Examples from RFC 3986 section 5.4, Reference Resolution Examples
# painstakingly copied from the lovingly transcribed version in
# twisted's test_url, with inapplicable cases removed
REL_URL_BASE = 'http://a/b/c/d;p?q'
REL_URL_TEST_CASES = [
    # "Normal"
    # ('g:h', 'g:h'),  # Not supported: scheme with relative path
    ('g', 'http://a/b/c/g'),
    ('./g', 'http://a/b/c/g'),
    ('g/', 'http://a/b/c/g/'),
    ('/g', 'http://a/g'),
    (';x', 'http://a/b/c/;x'),
    ('g;x', 'http://a/b/c/g;x'),
    ('', 'http://a/b/c/d;p?q'),
    ('.', 'http://a/b/c/'),
    ('./', 'http://a/b/c/'),
    ('..', 'http://a/b/'),
    ('../', 'http://a/b/'),
    ('../g', 'http://a/b/g'),
    ('../..', 'http://a/'),
    ('../../', 'http://a/'),
    ('../../g', 'http://a/g'),

    # Abnormal examples
    # ".." cannot be used to change the authority component of a URI.
    ('../../../g', 'http://a/g'),  # TODO (rooted?)
    ('../../../../g', 'http://a/g'),  # TODO (rooted?)

    # Only include "." and ".." when they are only part of a larger segment,
    # not by themselves.
    ('/./g', 'http://a/g'),
    ('/../g', 'http://a/g'),
    ('g.', 'http://a/b/c/g.'),
    ('.g', 'http://a/b/c/.g'),
    ('g..', 'http://a/b/c/g..'),
    ('..g', 'http://a/b/c/..g'),

    # Unnecessary or nonsensical forms of "." and "..".
    ('./../g', 'http://a/b/g'),
    ('./g/.', 'http://a/b/c/g/'),
    ('g/./h', 'http://a/b/c/g/h'),
    ('g/../h', 'http://a/b/c/h'),
    ('g;x=1/./y', 'http://a/b/c/g;x=1/y'),
    ('g;x=1/../y', 'http://a/b/c/y'),
]


def test_rel_navigate():
    for suffix, expected in REL_URL_TEST_CASES:
        url = URL(REL_URL_BASE)
        new_url = url.navigate(suffix)
        assert new_url.to_text() == expected

        new_url = url.navigate(URL(suffix))
        assert new_url.to_text() == expected

    return


def test_navigate():
    orig_text = u'http://a.b/c/d?e#f'
    orig = URL(orig_text)
    navd = orig.navigate('')

    # fragment removed on empty navigate
    assert navd.to_text() == u'http://a.b/c/d?e'

    # query also removed on non-empty navigate (interp'd as rel path)
    navd = orig.navigate('dd')
    assert navd.to_text() == u'http://a.b/c/dd'

    # check trailing slash
    navd = orig.navigate('dd/')
    assert navd.to_text() == u'http://a.b/c/dd/'

    # path removed on absolute path navigate
    navd = orig.navigate('/C')
    assert navd.to_text() == u'http://a.b/C'

    # only query string
    navd = orig.navigate('?e=E&ee=EE')
    assert navd.to_text() == u'http://a.b/c/d?e=E&ee=EE'

    # only fragment
    navd = orig.navigate('#FFF')
    assert navd.to_text() == u'http://a.b/c/d?e#FFF'

    # an odd case, bears more consideration perhaps
    navd = orig.navigate('https:')
    assert navd.to_text() == u'https://a.b/c/d?e'

    # another odd one, host only
    navd = orig.navigate('//newhost')
    assert navd.to_text() == u'http://newhost/c/d?e'

    # absolute URLs (with scheme + host) replace everything
    _dest_text = u'http://hatnote.com'
    _dest = URL(_dest_text)
    navd = orig.navigate(_dest)
    assert _dest is not navd  # make sure copies are made
    assert navd.to_text() == _dest_text
    navd = orig.navigate(_dest_text)
    assert navd.to_text() == _dest_text


@pytest.mark.parametrize(
    ('expected', 'base', 'paths'), [
        ('https://host/b', 'https://host', ('a', '/b', )),
        ('https://host/b', 'https://host', ('a', 'b', )),
        ('https://host/a/b', 'https://host', ('a/', 'b', )),
        ('https://host/b', 'https://host', ('/a', 'b', )),
        ('https://host/a/b', 'https://host/a/', (None, 'b', )),
        ('https://host/b', 'https://host/a', (None, 'b', )),
    ])
def test_chained_navigate(expected, base, paths):
    """Chained :meth:`navigate` calls produce correct results."""
    url = URL(base)
    for path in paths:
        url = url.navigate(path)
    assert expected == url.to_text()


# TODO: RFC 3986 6.2.3 (not just for query add, either)
# def test_add_query():
#     url = URL('http://www.example.com')
#     url.qp['key'] = 'value'
#     assert url.to_text() == 'http://www.example.com/?key=value'


def test_self_normalize():
    url = URL('http://hatnote.com/a/../../b?k=v#hashtags')
    url.normalize()
    assert url.to_text() == 'http://hatnote.com/b?k=v#hashtags'


def test_normalize_with_case():
    # from RFC 3986 Section 6.2.2
    url1 = URL('example://a/b/c/%7Bfoo%7D')
    url2 = URL('eXAMPLE://a/./b/../b/%63/%7bfoo%7d')

    assert url1 != url2

    url1.normalize()
    url2.normalize()

    assert url1 == url2


def test_netloc_slashes():
    # basic sanity checks
    url = URL('mailto:mahmoud@hatnote.com')
    assert url.scheme == 'mailto'
    assert url.to_text() == 'mailto:mahmoud@hatnote.com'

    url = URL('http://hatnote.com')
    assert url.scheme == 'http'
    assert url.to_text() == 'http://hatnote.com'

    # test that unrecognized schemes stay consistent with '//'
    url = URL('newscheme:a:b:c')
    assert url.scheme == 'newscheme'
    assert url.to_text() == 'newscheme:a:b:c'

    url = URL('newerscheme://a/b/c')
    assert url.scheme == 'newerscheme'
    assert url.to_text() == 'newerscheme://a/b/c'

    # test that reasonable guesses are made
    url = URL('git+ftp://gitstub.biz/glyph/lefkowitz')
    assert url.scheme == 'git+ftp'
    assert url.to_text() == 'git+ftp://gitstub.biz/glyph/lefkowitz'

    url = URL('what+mailto:freerealestate@enotuniq.org')
    assert url.scheme == 'what+mailto'
    assert url.to_text() == 'what+mailto:freerealestate@enotuniq.org'

    url = URL()
    url.scheme = 'ztp'
    url.path = '/x/y/z'
    assert url.to_text() == 'ztp:/x/y/z'

    # also works when the input doesn't include '//'
    url = URL()
    url.scheme = 'git+ftp'
    url.path = '/x/y/z/'
    assert url.to_text() == 'git+ftp:///x/y/z/'

    # really why would this ever come up but ok
    url = URL('file:///path/to/heck')
    url.scheme = 'mailto'
    assert url.to_text() == 'mailto:/path/to/heck'

    return


# (link_text, expected_urls)
# adapted from tornado test suite
FAL_TESTS = [("hello http://world.com/!", ["http://world.com/"]),
             ("hello http://world.com/with?param=true&stuff=yes", ["http://world.com/with?param=true&stuff=yes"]),
             ("http://url.com/w(aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", ["http://url.com/w"]),
             ("http://url.com/withmany.......................................", ["http://url.com/withmany"]),
             ("http://url.com/withmany((((((((((((((((((((((((((((((((((a)", ["http://url.com/withmany"]),
             # some examples from http://daringfireball.net/2009/11/liberal_regex_for_matching_urls
             ("http://foo.com/blah_blah", ["http://foo.com/blah_blah"]),
             ("http://foo.com/blah_blah/", ["http://foo.com/blah_blah/"]),
             ("(Something like http://foo.com/blah_blah)", ["http://foo.com/blah_blah"]),
             ("http://foo.com/blah_blah_(wikipedia)", ["http://foo.com/blah_blah_(wikipedia)"]),
             ("http://foo.com/blah_(blah)_(wikipedia)_blah", ["http://foo.com/blah_(blah)_(wikipedia)_blah"]),
             ("http://foo.com/blah_blah.", ["http://foo.com/blah_blah"]),
             ("http://foo.com/blah_blah/.", ["http://foo.com/blah_blah/"]),
             ("<http://foo.com/blah_blah>", ["http://foo.com/blah_blah"]),
             ("<http://foo.com/blah_blah/>", ["http://foo.com/blah_blah/"]),
             ("http://foo.com/blah_blah,", ["http://foo.com/blah_blah"]),
             ("http://www.example.com/wpstyle/?p=364.", ["http://www.example.com/wpstyle/?p=364"]),
             ("rdar://1234", ["rdar://1234"]),
             ("rdar:/1234", ["rdar:/1234"]),
             ("http://userid:password@example.com:8080", ["http://userid:password@example.com:8080"]),
             ("http://userid@example.com", ["http://userid@example.com"]),
             ("message://%3c330e7f8409726r6a4ba78dkf1fd71420c1bf6ff@mail.gmail.com%3e", ["message://%3C330e7f8409726r6a4ba78dkf1fd71420c1bf6ff@mail.gmail.com%3e"]),
             (u"http://\u27a1.ws/\u4a39", [u"http://\u27a1.ws/\u4a39"]),
             ("<tag>http://example.com</tag>", ["http://example.com"]),
             ("Just a www.example.com link.", ["https://www.example.com"]),
             ("www.a-link.com", ["https://www.a-link.com"]),
             ("www.a-link.com and www.b-link.com/blogs extra", ["https://www.a-link.com", "https://www.b-link.com/blogs"])
             ]


def test_find_all_links_basic():
    target = """hi my name is prince nigeria, please visit my website
    http://richprince.biz or if that's blocked try
    https://getprince.ly! Thanks for your attention.bye!
    PS if those ports are blocked, how about trying
    https://crownbux.afamilycompany:broken/affiliate
    PPS if all else fails you can always mailto:thePrince@machovelli.an
    """

    urls = find_all_links(target)
    assert len(urls) == 2


def test_find_all_links():
    prefix = "a little something before, "
    suffix = " a bit of another after."

    for content, expected_links in FAL_TESTS:
        text = prefix + content + suffix
        links = find_all_links(text)
        assert len(links) == len(expected_links)

        for link, expected in zip(links, expected_links):
            assert link.to_text(full_quote=False) == expected

        link_tokens = find_all_links(text, with_text=True)
        assert link_tokens[0].startswith(prefix)
        assert link_tokens[-1].endswith(suffix)
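

# Supplementary sketch, not part of the original suite: per the documented
# with_text=True behavior, find_all_links() interleaves the surrounding text
# (plain strings) with the URL objects it finds, in order.
def test_find_all_links_with_text_sketch():
    tokens = find_all_links(u"before http://example.com after", with_text=True)
    assert any(isinstance(t, URL) for t in tokens)
    assert tokens[0].startswith(u"before")
    assert tokens[-1].endswith(u"after")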


def test_unicodey():
    unicodey = (u'http://\N{LATIN SMALL LETTER E WITH ACUTE}.com/'
                u'\N{LATIN SMALL LETTER E}\N{COMBINING ACUTE ACCENT}'
                u'?\N{LATIN SMALL LETTER A}\N{COMBINING ACUTE ACCENT}='
                u'\N{LATIN SMALL LETTER I}\N{COMBINING ACUTE ACCENT}'
                u'#\N{LATIN SMALL LETTER U}\N{COMBINING ACUTE ACCENT}')
    url = URL(unicodey)
    assert url.host == u'é.com'
    assert url.path_parts[1] == u'\N{LATIN SMALL LETTER E}\N{COMBINING ACUTE ACCENT}'
    assert url.to_text(full_quote=False) == unicodey
    fully_quoted = 'http://xn--9ca.com/%C3%A9?%C3%A1=%C3%AD#%C3%BA'
    assert url.to_text(full_quote=True) == fully_quoted


def test_str_repr():
    assert str(URL("http://googlewebsite.com/e-shops.aspx")) == "http://googlewebsite.com/e-shops.aspx"