# -*- coding: utf-8 -*-
# These tests require a working internet connection.

from __future__ import unicode_literals
from __future__ import print_function
from __future__ import division

from builtins import str, bytes, dict, int
from builtins import map, zip, filter
from builtins import object, range, next

import os
import sys
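# Make the local "pattern" package importable when the tests are run from a
# source checkout (this test folder is assumed to sit one level below the package root).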
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
import unittest
import time
import warnings

from pattern import web
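
# PATH is the folder of this script, used to locate the test corpora;
# __file__ can be undefined (e.g., in embedded interpreters), hence the fallback.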
try:
    PATH = os.path.dirname(os.path.realpath(__file__))
except:
    PATH = ""

#---------------------------------------------------------------------------------------------------


class TestCache(unittest.TestCase):

    def setUp(self):
        pass

    def test_cache(self):
        # Assert cache unicode.
        k, v = "test", "ünîcødé"
        web.cache[k] = v
        self.assertTrue(isinstance(web.cache[k], str))
        self.assertEqual(web.cache[k], v)
        self.assertEqual(web.cache.age(k), 0)
        del web.cache[k]
        print("pattern.web.Cache")

#---------------------------------------------------------------------------------------------------


class TestUnicode(unittest.TestCase):

    def setUp(self):
        # Test data with different (or wrong) encodings.
        self.strings = (
            "ünîcøde",
            "ünîcøde".encode("utf-16"),
            "ünîcøde".encode("latin-1"),
            "ünîcøde".encode("windows-1252"),
            "ünîcøde",
            "אוניקאָד"
        )

    def test_decode_utf8(self):
        # Assert unicode.
        for s in self.strings:
            self.assertTrue(isinstance(web.decode_utf8(s), str))
        print("pattern.web.decode_utf8()")

    def test_encode_utf8(self):
        # Assert Python bytestring.
        for s in self.strings:
            self.assertTrue(isinstance(web.encode_utf8(s), bytes))
        print("pattern.web.encode_utf8()")

    def test_fix(self):
        # Assert fix for common Unicode mistakes (Latin-1/Windows-1252 mojibake).
        self.assertEqual(web.fix("cliché"), "cliché")
        self.assertEqual(web.fix("clichÃ©"), "cliché")
        self.assertEqual(web.fix("â€“"), "–")

#---------------------------------------------------------------------------------------------------


class TestURL(unittest.TestCase):

    def setUp(self):
        # Test a live URL that has a fast response time.
        self.live = "http://www.google.com/"
        # Test a fake URL with the URL parser.
        self.url = "https://username:password@www.domain.com:8080/path/path/page.html?q=1#anchor"
        self.parts = {
            "protocol": "https",
            "username": "username",
            "password": "password",
            "domain": "www.domain.com",
            "port": 8080,
            "path": ["path", "path"],
            "page": "page.html",
            "query": {"q": 1},
            "anchor": "anchor"
        }

    def test_asynchronous(self):
        # Assert asynchronous function call (returns 1).
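        # time.sleep() returns None, so "time.sleep(t) or 1" evaluates to 1
        # once the worker thread has slept for t seconds.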
        v = web.asynchronous(lambda t: time.sleep(t) or 1, 0.2)
        while not v.done:
            time.sleep(0.1)
        self.assertEqual(v.value, 1)
        print("pattern.web.asynchronous()")

    def test_extension(self):
        # Assert filename extension.
        v = web.extension(os.path.join("pattern", "test", "test-web.py.zip"))
        self.assertEqual(v, ".zip")
        print("pattern.web.extension()")

    def test_urldecode(self):
        # Assert URL decode (inverse of urllib.urlencode).
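        # Values are cast where possible ("1" => 1); keys without a value map to None.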
        v = web.urldecode("?user=me&page=1&q=&")
        self.assertEqual(v, {"user": "me", "page": 1, "q": None})
        print("pattern.web.urldecode()")

    def test_proxy(self):
        # Assert URL proxy.
        v = web.proxy("www.proxy.com", "https")
        self.assertEqual(v, ("www.proxy.com", "https"))
        print("pattern.web.proxy()")

    def test_url_parts(self):
        # Assert URL._parse and URL.parts{}.
        v = web.URL(self.url)
        for a, b in (
                (web.PROTOCOL, self.parts["protocol"]),
                (web.USERNAME, self.parts["username"]),
                (web.PASSWORD, self.parts["password"]),
                (web.DOMAIN, self.parts["domain"]),
                (web.PORT, self.parts["port"]),
                (web.PATH, self.parts["path"]),
                (web.PAGE, self.parts["page"]),
                (web.QUERY, self.parts["query"]),
                (web.ANCHOR, self.parts["anchor"])):
            self.assertEqual(v.parts[a], b)
        print("pattern.web.URL.parts")

    def test_url_query(self):
        # Assert URL.query and URL.querystring.
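        # A query value of None is serialized with an empty value ("user=").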
        v = web.URL(self.url)
        v.query["page"] = 10
        v.query["user"] = None
        self.assertEqual(v.query, {"q": 1, "page": 10, "user": None})
        self.assertEqual(v.querystring, "q=1&page=10&user=")
        # Assert URL.querystring encodes unicode arguments.
        q = ({"ünîcødé": 1.5}, "%C3%BCn%C3%AEc%C3%B8d%C3%A9=1.5")
        v.query = q[0]
        self.assertEqual(v.querystring, q[1])
        # Assert URL.query decodes unicode arguments.
        v = web.URL("http://domain.com?" + q[1])
        self.assertEqual(v.query, q[0])
        print("pattern.web.URL.query")
        print("pattern.web.URL.querystring")

    def test_url_string(self):
        # Assert URL._set_string().
        v = web.URL("")
        v.string = "https://domain.com"
        self.assertEqual(v.parts[web.PROTOCOL], "https")
        self.assertEqual(v.parts[web.DOMAIN], "domain.com")
        self.assertEqual(v.parts[web.PATH], [])
        print("pattern.web.URL.string")

    def test_url(self):
        # Assert URL.copy().
        v = web.URL(self.url)
        v = v.copy()
        # Assert URL.__setattr__().
        v.username = "new-username"
        v.password = "new-password"
        # Assert URL.__getattr__().
        self.assertEqual(v.method, web.GET)
        self.assertEqual(v.protocol, self.parts["protocol"])
        self.assertEqual(v.username, "new-username")
        self.assertEqual(v.password, "new-password")
        self.assertEqual(v.domain, self.parts["domain"])
        self.assertEqual(v.port, self.parts["port"])
        self.assertEqual(v.path, self.parts["path"])
        self.assertEqual(v.page, self.parts["page"])
        self.assertEqual(v.query, self.parts["query"])
        self.assertEqual(v.anchor, self.parts["anchor"])
        print("pattern.web.URL")

    def test_url_open(self):
        # Assert URLError.
        v = web.URL(self.live.replace("http://", "htp://"))
        self.assertRaises(web.URLError, v.open)
        self.assertEqual(v.exists, False)
        # Assert HTTPError.
        v = web.URL(self.live + "iphone/android.html")
        self.assertRaises(web.HTTPError, v.open)
        self.assertRaises(web.HTTP404NotFound, v.open)
        self.assertEqual(v.exists, False)
        # Assert socket connection.
        v = web.URL(self.live)
        self.assertTrue(v.open() is not None)
        self.assertEqual(v.exists, True)
        # Assert user-agent and referer.
        self.assertTrue(v.open(user_agent=web.MOZILLA, referrer=web.REFERRER) is not None)
        print("pattern.web.URL.exists")
        print("pattern.web.URL.open()")

    def test_url_download(self):
        t = time.time()
        v = web.URL(self.live).download(cached=False, throttle=0.25, unicode=True)
        t = time.time() - t
        # Assert unicode content.
        self.assertTrue(isinstance(v, str))
        # Assert download rate limiting.
        self.assertTrue(t >= 0.25)
        print("pattern.web.URL.download()")

    def test_url_mimetype(self):
        # Assert URL MIME-type.
        v = web.URL(self.live).mimetype
        self.assertTrue(v in web.MIMETYPE_WEBPAGE)
        print("pattern.web.URL.mimetype")

    def test_url_headers(self):
        # Assert URL headers.
        v = web.URL(self.live).headers["content-type"].split(";")[0]
        self.assertEqual(v, "text/html")
        print("pattern.web.URL.headers")

    def test_url_redirect(self):
        # Assert URL redirection (the target depends on where you are).
        # In Belgium, it yields "http://www.google.be/".
        v = web.URL(self.live).redirect
        print("pattern.web.URL.redirect: " + self.live + " => " + str(v))

    def test_abs(self):
        # Assert absolute URL (special attention for anchors).
        for a, b in (
                ("../page.html", "http://domain.com/path/"),
                ("page.html", "http://domain.com/home.html")):
            v = web.abs(a, base=b)
            self.assertEqual(v, "http://domain.com/page.html")
        for a, b, c in (
                ("#anchor", "http://domain.com", "/"),
                ("#anchor", "http://domain.com/", ""),
                ("#anchor", "http://domain.com/page", "")):
            v = web.abs(a, base=b)
            self.assertEqual(v, b + c + a) # http://domain.com/#anchor
        print("pattern.web.abs()")

    def test_base(self):
        # Assert base URL domain name.
        self.assertEqual(web.base("http://domain.com/home.html"), "domain.com")
        print("pattern.web.base()")

    def test_oauth(self):
        # Assert OAuth algorithm.
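        # With a fixed nonce and timestamp, the HMAC-SHA1 signature is
        # deterministic, so it can be compared to a precomputed value.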
        data = {
            "q": '"cåts, døgs & chîckéns = fün+"',
            "oauth_version": "1.0",
            "oauth_nonce": "0",
            "oauth_timestamp": 0,
            "oauth_consumer_key": "key",
            "oauth_signature_method": "HMAC-SHA1"
        }
        v = web.oauth.sign("http://yboss.yahooapis.com/ysearch/web", data, secret="secret")
        self.assertEqual(v, "RtTu8dxSp3uBzSbsuLAXIWOKfyI=")
        print("pattern.web.oauth.sign()")

#---------------------------------------------------------------------------------------------------


class TestPlaintext(unittest.TestCase):

    def setUp(self):
        pass

    def test_find_urls(self):
        # Assert URL finder with common URL notations.
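        # Each URL is wrapped in "(" and "." below to check that surrounding
        # punctuation is not included in the match.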
        for url in (
                "http://domain.co.uk",
                "https://domain.co.uk",
                "www.domain.cu.uk",
                "domain.com",
                "domain.org",
                "domain.net"):
            self.assertEqual(web.find_urls("(" + url + ".")[0], url)
        # Assert case-insensitive, punctuation and <a href="">.
        # Assert several matches in string.
        self.assertEqual(web.find_urls("HTTP://domain.net")[0], "HTTP://domain.net")
        self.assertEqual(web.find_urls("http://domain.net),};")[0], "http://domain.net")
        self.assertEqual(web.find_urls("http://domain.net\">domain")[0], "http://domain.net")
        self.assertEqual(web.find_urls("domain.com, domain.net"), ["domain.com", "domain.net"])
        print("pattern.web.find_urls()")

    def test_find_email(self):
        # Assert e-mail finder with common e-mail notations.
        s = "firstname.last+name@domain.ac.co.uk"
        v = web.find_email("(" + s + ".")
        self.assertEqual(v[0], s)
        # Assert several matches in string.
        s = ["me@site1.com", "me@site2.com"]
        v = web.find_email("(" + ",".join(s) + ")")
        self.assertEqual(v, s)
        print("pattern.web.find_email()")

    def test_find_between(self):
        # Assert search between open tag and close tag.
        s = "<script type='text/javascript'>alert(0);</script>"
        v = web.find_between("<script", "</script>", s)
        self.assertEqual(v[0], " type='text/javascript'>alert(0);")
        # Assert several matches in string.
        s = "a0ba1b"
        v = web.find_between("a", "b", s)
        self.assertEqual(v, ["0", "1"])
        print("pattern.web.find_between()")

    def test_strip_tags(self):
        # Assert HTML parser and tag stripper.
        for html, plain in (
                ("<b>ünîcøde</b>", "ünîcøde"),
                ("<img src=\"\" />", ""),
                ("<p>text</p>", "text\n\n"),
                ("<li>text</li>", "* text\n"),
                ("<td>text</td>", "text\t"),
                ("<br>", "\n"),
                ("<br/>", "\n\n"),
                ("<br /><br/><br>", "\n\n\n\n\n")):
            self.assertEqual(web.strip_tags(html), plain)
        # Assert exclude tags and attributes.
        v = web.strip_tags("<a href=\"\" onclick=\"\">text</a>", exclude={"a": ["href"]})
        self.assertEqual(v, "<a href=\"\">text</a>")
        print("pattern.web.strip_tags()")

    def test_strip_element(self):
        # Assert strip <p> elements.
        v = web.strip_element(" <p><p></p>text</p> <b><P></P></b>", "p")
        self.assertEqual(v, " <b></b>")
        print("pattern.web.strip_element()")

    def test_strip_between(self):
        # Assert strip between "<p" and "</p>" (non-greedy, case-insensitive).
        v = web.strip_between("<p", "</p>", " <p><p></p>text</p> <b><P></P></b>")
        self.assertEqual(v, " text</p> <b></b>")
        print("pattern.web.strip_between()")

    def test_strip_javascript(self):
        # Assert strip <script> elements.
        v = web.strip_javascript(" <script type=\"text/javascript\">text</script> ")
        self.assertEqual(v, "  ")
        print("pattern.web.strip_javascript()")

    def test_strip_inline_css(self):
        # Assert strip <style> elements.
        v = web.strip_inline_css(" <style type=\"text/css\">text</style> ")
        self.assertEqual(v, "  ")
        print("pattern.web.strip_inline_css()")

    def test_strip_comments(self):
        # Assert strip <!-- --> elements.
        v = web.strip_comments(" <!-- text --> ")
        self.assertEqual(v, "  ")
        print("pattern.web.strip_comments()")

    def test_strip_forms(self):
        # Assert strip <form> elements.
        v = web.strip_forms(" <form method=\"get\">text</form> ")
        self.assertEqual(v, "  ")
        print("pattern.web.strip_forms()")

    def test_encode_entities(self):
        # Assert HTML entity encoder (e.g., "&" => "&amp;").
        for a, b in (
                ("É", "É"),
                ("&", "&amp;"),
                ("<", "&lt;"),
                (">", "&gt;"),
                ('"', "&quot;"),
                ("'", "&#39;")):
            self.assertEqual(web.encode_entities(a), b)
        print("pattern.web.encode_entities()")

    def test_decode_entities(self):
        # Assert HTML entity decoder (e.g., "&amp;" => "&").
        for a, b in (
                ("&amp;", "&"),
                ("&#38;", "&"),
                ("&#x26;", "&"),
                ("&nbsp;", "\xa0"),
                ("&foo;", "&foo;")):
            self.assertEqual(web.decode_entities(a), b)
        print("pattern.web.decode_entities()")

    def test_collapse_spaces(self):
        # Assert collapse multiple spaces.
        for a, b in (
                ("  ", ""),
                ("  ..  ", ".."),
                (".  .", ". ."),
                (". \n", "."),
                ("\xa0", "")):
            self.assertEqual(web.collapse_spaces(a), b)
        # Assert preserved indentation.
        self.assertEqual(web.collapse_spaces(" . \n", indentation=True), " .")
        print("pattern.web.collapse_spaces()")

    def test_collapse_tabs(self):
        # Assert collapse multiple tabs to 1 space.
        for a, b in (
                ("\t\t\t", ""),
                ("\t..\t", ".."),
                (".\t\t.", ". ."),
                (".\t\n", ".")):
            self.assertEqual(web.collapse_tabs(a), b)
        # Assert preserved indentation.
        self.assertEqual(web.collapse_tabs("\t\t .\t\n", indentation=True), "\t\t .")
        print("pattern.web.collapse_tabs()")

    def test_collapse_linebreaks(self):
        # Assert collapse multiple linebreaks.
        for a, b in (
                ("\n\n\n", "\n"),
                (".\n\n.", ".\n."),
                (".\r\n.", ".\n."),
                (".\n .", ".\n ."),
                (" \n .", "\n .")):
            self.assertEqual(web.collapse_linebreaks(a), b)
        print("pattern.web.collapse_linebreaks()")

    def test_plaintext(self):
        # Assert plaintext:
        # - strip <script>, <style>, <form>, <!-- --> elements,
        # - strip tags,
        # - decode entities,
        # - collapse whitespace.
        html = """
            <html>
            <head>
                <title>tags &amp; things</title>
            </head>
            <body>
                <div id="content"> \n\n\n\
                    <!-- main content -->
                    <script type="text/javascript>"alert(0);</script>
                    <h1>title1</h1>
                    <h2>title2</h2>
                    <p>paragraph1</p>
                    <p>paragraph2 <a href="http://www.domain.com" onclick="alert(0);">link</a></p>
                    <ul>
                        <li>item1 xxx</li>
                        <li>item2</li>
                    <ul>
                </div>
                <br />
                <br />
            </body>
            </html>
        """
        self.assertEqual(web.plaintext(html, keep={"a": "href"}),
            "tags & things\n\ntitle1\n\ntitle2\n\nparagraph1\n\nparagraph2 " + \
            "<a href=\"http://www.domain.com\">link</a>\n\n* item1 xxx\n* item2")
        print("pattern.web.plaintext()")

#---------------------------------------------------------------------------------------------------


class TestSearchEngine(unittest.TestCase):

    def setUp(self):
        # Test data for all search engines:
        # {api: (source, license, Engine)}.
        self.api = {
            "Google": (web.GOOGLE, web.GOOGLE_LICENSE, web.Google),
            "Yahoo": (web.YAHOO, web.YAHOO_LICENSE, web.Yahoo),
            "Bing": (web.BING, web.BING_LICENSE, web.Bing),
            "Twitter": (web.TWITTER, web.TWITTER_LICENSE, web.Twitter),
            "Wikipedia": (web.MEDIAWIKI, web.WIKIPEDIA_LICENSE, web.Wikipedia),
            "Wikia": (web.MEDIAWIKI, web.MEDIAWIKI_LICENSE, web.Wikia),
            "Flickr": (web.FLICKR, web.FLICKR_LICENSE, web.Flickr),
            "Facebook": (web.FACEBOOK, web.FACEBOOK_LICENSE, web.Facebook),
            "ProductWiki": (web.PRODUCTWIKI, web.PRODUCTWIKI_LICENSE, web.ProductWiki)
        }

    def _test_search_engine(self, api, source, license, Engine, query="today", type=web.SEARCH):
        # Assert SearchEngine standard interface for any api:
        # Google, Yahoo, Bing, Twitter, Wikipedia, Flickr, Facebook, ProductWiki, Newsfeed.
        # SearchEngine.search() returns a list of Result objects with unicode fields,
        # except Wikipedia which returns a WikipediaArticle (MediaWikiArticle subclass).
        if api == "Yahoo" and license == ("", ""):
            return
        t = time.time()
        e = Engine(license=license, throttle=0.25, language="en")
        v = e.search(query, type, start=1, count=1, cached=False)
        t = time.time() - t
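        # The engine was created with throttle=0.25, so a single uncached
        # request must take at least 0.25 seconds.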
        self.assertTrue(t >= 0.25)
        self.assertEqual(e.license, license)
        self.assertEqual(e.throttle, 0.25)
        self.assertEqual(e.language, "en")
        self.assertEqual(v.query, query)
        if source != web.MEDIAWIKI:
            self.assertEqual(v.source, source)
            self.assertEqual(v.type, type)
            self.assertEqual(len(v), 1)
            self.assertTrue(isinstance(v[0], web.Result))
            self.assertTrue(isinstance(v[0].url, str))
            self.assertTrue(isinstance(v[0].title, str))
            self.assertTrue(isinstance(v[0].description, str))
            self.assertTrue(isinstance(v[0].language, str))
            self.assertTrue(isinstance(v[0].author, (str, tuple)))
            self.assertTrue(isinstance(v[0].date, str))
        else:
            self.assertTrue(isinstance(v, web.MediaWikiArticle))
        # Assert zero results for start < 1 and count < 1.
        v1 = e.search(query, start=0)
        v2 = e.search(query, count=0)
        if source != web.MEDIAWIKI:
            self.assertEqual(len(v1), 0)
            self.assertEqual(len(v2), 0)
        else:
            self.assertTrue(isinstance(v1, web.MediaWikiArticle))
            self.assertEqual(v2, None)
        # Assert SearchEngineTypeError for unknown type.
        self.assertRaises(web.SearchEngineTypeError, e.search, query, type="crystall-ball")
        print("pattern.web.%s.search()" % api)

    def test_search_google(self):
        self._test_search_engine("Google", *self.api["Google"])

    def test_search_yahoo(self):
        self._test_search_engine("Yahoo", *self.api["Yahoo"])

    @unittest.skip('Bing Search API has no free quota')
    def test_search_bing(self):
        self._test_search_engine("Bing", *self.api["Bing"])

    def test_search_twitter(self):
        self._test_search_engine("Twitter", *self.api["Twitter"])

    @unittest.skip('Mediawiki/Wikipedia API or appearance changed')
    def test_search_wikipedia(self):
        self._test_search_engine("Wikipedia", *self.api["Wikipedia"])

    @unittest.skip('Mediawiki API or appearance changed')
    def test_search_wikia(self):
        self._test_search_engine("Wikia", *self.api["Wikia"], **{"query": "games"})

    def test_search_flickr(self):
        self._test_search_engine("Flickr", *self.api["Flickr"], **{"type": web.IMAGE})

    @unittest.skip('Facebook API changed')
    def test_search_facebook(self):
        self._test_search_engine("Facebook", *self.api["Facebook"])

    @unittest.skip('ProductWiki is deprecated')
    def test_search_productwiki(self):
        self._test_search_engine("ProductWiki", *self.api["ProductWiki"], **{"query": "computer"})

    def test_search_newsfeed(self):
        for feed, url in web.feeds.items():
            self._test_search_engine("Newsfeed", url, None, web.Newsfeed, query=url, type=web.NEWS)

    def _test_results(self, api, source, license, Engine, type=web.SEARCH, query="today", baseline=[6, 6, 6, 0]):
        # Assert SearchEngine result content.
        # We expect to find http:// URL's and descriptions containing the search query.
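        # baseline = minimum counts (out of 10 results) for, in order: URL's
        # starting with "http", query matches in URL/title/description,
        # language "en", and URL's with an image file extension.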
        if api == "Yahoo" and license == ("", ""):
            return
        i1 = 0
        i2 = 0
        i3 = 0
        i4 = 0
        e = Engine(license=license, language="en", throttle=0.25)
        for result in e.search(query, type, count=10, cached=False):
            i1 += int(result.url.startswith("http"))
            i2 += int(query in result.url.lower())
            i2 += int(query in result.title.lower())
            i2 += int(query in result.description.lower())
            i3 += int(result.language == "en")
            i4 += int(result.url.endswith(("jpg", "png", "gif")))
            #print(result.url)
            #print(result.title)
            #print(result.description)
            #print(i1, i2, i3, i4)
        self.assertTrue(i1 >= baseline[0]) # url's starting with "http"
        self.assertTrue(i2 >= baseline[1]) # query in url + title + description
        self.assertTrue(i3 >= baseline[2]) # language "en"
        self.assertTrue(i4 >= baseline[3]) # url's ending with "jpg", "png" or "gif"
        print("pattern.web.%s.Result(type=%s)" % (api, type.upper()))

    def test_results_google(self):
        self._test_results("Google", *self.api["Google"])

    def test_results_yahoo(self):
        self._test_results("Yahoo", *self.api["Yahoo"])

    def test_results_yahoo_images(self):
        self._test_results("Yahoo", *self.api["Yahoo"], **{"type": web.IMAGE, "baseline": [6, 6, 0, 6]})

    def test_results_yahoo_news(self):
        self._test_results("Yahoo", *self.api["Yahoo"], **{"type": web.NEWS})

    @unittest.skip('Bing API changed')
    def test_results_bing(self):
        self._test_results("Bing", *self.api["Bing"])

    @unittest.skip('Bing API changed')
    def test_results_bing_images(self):
        self._test_results("Bing", *self.api["Bing"], **{"type": web.IMAGE, "baseline": [6, 6, 0, 6]})

    @unittest.skip('Bing API changed')
    def test_results_bing_news(self):
        self._test_results("Bing", *self.api["Bing"], **{"type": web.NEWS})

    def test_results_twitter(self):
        self._test_results("Twitter", *self.api["Twitter"])

    def test_results_flickr(self):
        self._test_results("Flickr", *self.api["Flickr"], **{"baseline": [6, 6, 0, 6]})

    @unittest.skip('Facebook API changed')
    def test_results_facebook(self):
        self._test_results("Facebook", *self.api["Facebook"], **{"baseline": [0, 1, 0, 0]})

    def test_google_translate(self):
        try:
            # Assert Google Translate API.
            # Requires license with billing enabled.
            source, license, Engine = self.api["Google"]
            v = Engine(license, throttle=0.25).translate("thé", input="fr", output="en", cached=False)
            self.assertEqual(v, "tea")
            print("pattern.web.Google.translate()")
        except web.HTTP401Authentication:
            pass

    def test_google_identify(self):
        try:
            # Assert Google Translate API (language detection).
            # Requires license with billing enabled.
            source, license, Engine = self.api["Google"]
            v = Engine(license, throttle=0.25).identify("L'essence des mathématiques, c'est la liberté!", cached=False)
            self.assertEqual(v[0], "fr")
            print("pattern.web.Google.identify()")
        except web.HTTP401Authentication:
            pass

    def test_twitter_author(self):
        self.assertEqual(web.author("me"), "from:me")
        print("pattern.web.author()")

    def test_twitter_hashtags(self):
        self.assertEqual(web.hashtags("#cat #dog"), ["#cat", "#dog"])
        print("pattern.web.hashtags()")

    def test_twitter_retweets(self):
        self.assertEqual(web.retweets("RT @me: blah"), ["@me"])
        print("pattern.web.retweets()")

    def _test_search_image_size(self, api, source, license, Engine):
        # Assert image URL's for different sizes actually exist.
        if api == "Yahoo" and license == ("", ""):
            return
        e = Engine(license, throttle=0.25)
        for size in (web.TINY, web.SMALL, web.MEDIUM, web.LARGE):
            v = e.search("cats", type=web.IMAGE, count=1, size=size, cached=False)
            self.assertEqual(web.URL(v[0].url).exists, True)
            print("pattern.web.%s.search(type=IMAGE, size=%s)" % (api, size.upper()))

    def test_yahoo_image_size(self):
        self._test_search_image_size("Yahoo", *self.api["Yahoo"])

    @unittest.skip('Bing Search API has no free quota')
    def test_bing_image_size(self):
        self._test_search_image_size("Bing", *self.api["Bing"])

    def test_flickr_image_size(self):
        self._test_search_image_size("Flickr", *self.api["Flickr"])

    @unittest.skip('Mediawiki/Wikipedia API or appearance changed')
    def test_wikipedia_list(self):
        # Assert WikipediaArticle.list(), an iterator over all article titles.
        source, license, Engine = self.api["Wikipedia"]
        v = Engine(license).list(start="a", count=1)
        v = [next(v) for i in range(2)]
        self.assertTrue(len(v) == 2)
        self.assertTrue(v[0].lower().startswith("a"))
        self.assertTrue(v[1].lower().startswith("a"))
        print("pattern.web.Wikipedia.list()")

    def test_wikipedia_all(self):
        # Assert WikipediaArticle.all(), an iterator over WikipediaArticle objects.
        source, license, Engine = self.api["Wikipedia"]
        v = Engine(license).all(start="a", count=1)
        v = [next(v) for i in range(1)]
        self.assertTrue(len(v) == 1)
        self.assertTrue(isinstance(v[0], web.WikipediaArticle))
        self.assertTrue(v[0].title.lower().startswith("a"))
        print("pattern.web.Wikipedia.all()")

    @unittest.skip('Mediawiki/Wikipedia API or appearance changed')
    def test_wikipedia_article(self):
        source, license, Engine = self.api["Wikipedia"]
        v = Engine(license).search("cat", cached=False)
        # Assert WikipediaArticle properties.
        self.assertTrue(isinstance(v.title, str))
        self.assertTrue(isinstance(v.string, str))
        self.assertTrue(isinstance(v.links, list))
        self.assertTrue(isinstance(v.categories, list))
        self.assertTrue(isinstance(v.external, list))
        self.assertTrue(isinstance(v.media, list))
        self.assertTrue(isinstance(v.languages, dict))
        # Assert WikipediaArticle properties content.
        self.assertTrue(v.string == v.plaintext())
        self.assertTrue(v.html == v.source)
        self.assertTrue("</div>" in v.source)
        self.assertTrue("cat" in v.title.lower())
        self.assertTrue("Felis" in v.links)
        self.assertTrue("Felines" in v.categories)
        self.assertTrue("en" == v.language)
        self.assertTrue("fr" in v.languages)
        self.assertTrue("chat" in v.languages["fr"].lower())
        self.assertTrue(v.external[0].startswith("http"))
        self.assertTrue(v.media[0].endswith(("jpg", "png", "gif", "svg")))
        print("pattern.web.WikipediaArticle")

    @unittest.skip('Mediawiki/Wikipedia API or appearance changed')
    def test_wikipedia_article_sections(self):
        # Assert WikipediaArticle.sections structure.
        # The test may need to be modified if the Wikipedia "Cat" article changes.
        source, license, Engine = self.api["Wikipedia"]
        v = Engine(license).search("cat", cached=False)
        s1 = s2 = s3 = None
        for section in v.sections:
            if section.title == "Behavior":
                s1 = section
            if section.title == "Grooming":
                s2 = section
            if section.title == "Play":
                s3 = section
            self.assertTrue(section.article == v)
            self.assertTrue(section.level == 0 or section.string.startswith(section.title))
        # Test section depth.
        self.assertTrue(s1.level == 1)
        self.assertTrue(s2.level == 2)
        self.assertTrue(s3.level == 2)
        # Test section parent-child structure.
        self.assertTrue(s2 in s1.children) # Behavior => Grooming
        self.assertTrue(s3 in s1.children) # Behavior => Play
        self.assertTrue(s2.parent == s1)
        self.assertTrue(s3.parent == s1)
        # Test section content.
        self.assertTrue("hairballs" in s2.content)
        self.assertTrue("laser pointer" in s3.content)
        # Test section tables.
        # XXX should test <td colspan="x"> more thoroughly.
        self.assertTrue(len(v.sections[1].tables) > 0)
        print("pattern.web.WikipediaSection")

    @unittest.skip('ProductWiki is deprecated')
    def test_productwiki(self):
        # Assert product reviews and score.
        source, license, Engine = self.api["ProductWiki"]
        v = Engine(license).search("computer", cached=False)
        self.assertTrue(isinstance(v[0].reviews, list))
        self.assertTrue(isinstance(v[0].score, int))
        print("pattern.web.ProductWiki.Result.reviews")
        print("pattern.web.ProductWiki.Result.score")

#---------------------------------------------------------------------------------------------------


class TestDOM(unittest.TestCase):

    def setUp(self):
        # Test HTML document.
        self.html = """
            <!doctype html>
            <html lang="en">
            <head>
                <title>title</title>
                <meta charset="utf-8" />
            </head>
            <body id="front" class="comments">
                <script type="text/javascript">alert(0);</script>
                <div id="navigation">
                    <a href="nav1.html">nav1</a> |
                    <a href="nav2.html">nav2</a> |
                    <a href="nav3.html">nav3</a>
                </div>
                <div id="content">
                    <P class="comment">
                        <span class="date">today</span>
                        <span class="author">me</span>
                        Blah blah
                    </P>
                    <P class="class1 class2">
                        Blah blah
                    </P>
                    <p>Read more</p>
                </div>
            </body>
            </html>
        """

    def test_node_document(self):
        # Assert Node properties.
        v1 = web.Document(self.html)
        self.assertEqual(v1.type, web.DOCUMENT)
        self.assertEqual(v1.source[:10], "<!DOCTYPE ") # Note: BeautifulSoup strips whitespace.
        self.assertEqual(v1.parent, None)
        # Assert Node traversal.
        v2 = v1.children[0].next
        self.assertEqual(v2.type, web.ELEMENT)
        self.assertEqual(v2.previous, v1.children[0])
        # Assert Document properties.
        v3 = v1.declaration
        self.assertEqual(v3, v1.children[0])
        self.assertEqual(v3.parent, v1)
        self.assertEqual(v3.source, "html")
        self.assertEqual(v1.head.type, web.ELEMENT)
        self.assertEqual(v1.body.type, web.ELEMENT)
        self.assertTrue(v1.head.source.startswith("<head"))
        self.assertTrue(v1.body.source.startswith("<body"))
        print("pattern.web.Node")
        print("pattern.web.DOM")

    def test_node_traverse(self):
        # Assert Node.traverse() (must visit all child nodes recursively).
        self.b = False

        def visit(node):
            if node.type == web.ELEMENT and node.tag == "span":
                self.b = True
        v = web.DOM(self.html)
        v.traverse(visit)
        self.assertEqual(self.b, True)
        print("pattern.web.Node.traverse()")

    def test_element(self):
        # Assert Element properties (test <body>).
        v = web.DOM(self.html).body
        self.assertEqual(v.tag, "body")
        self.assertEqual(v.attributes["id"], "front")
        self.assertEqual(v.attributes["class"], ["comments"])
        self.assertTrue(v.content.startswith("\n<script"))
        # Assert Element.getElementsByTagname() (test navigation links).
        a = v.by_tag("a")
        self.assertEqual(len(a), 3)
        self.assertEqual(a[0].content, "nav1")
        self.assertEqual(a[1].content, "nav2")
        self.assertEqual(a[2].content, "nav3")
        # Assert Element.getElementsByClassname() (test <p class="comment">).
        a = v.by_class("comment")
        self.assertEqual(a[0].tag, "p")
        self.assertEqual(a[0].by_tag("span")[0].attributes["class"], ["date"])
        self.assertEqual(a[0].by_tag("span")[1].attributes["class"], ["author"])
        for selector in (".comment", "p.comment", "*.comment"):
            self.assertEqual(v.by_tag(selector)[0], a[0])
        # Assert Element.getElementById() (test <div id="content">).
        e = v.by_id("content")
        self.assertEqual(e.tag, "div")
        self.assertEqual(e, a[0].parent)
        for selector in ("#content", "div#content", "*#content"):
            self.assertEqual(v.by_tag(selector)[0], e)
        # Assert Element.getElementByAttribute() (test on <a href="">).
        a = v.by_attribute(href="nav1.html")
        self.assertEqual(a[0].content, "nav1")
        print("pattern.web.Element")
        print("pattern.web.Element.by_tag()")
        print("pattern.web.Element.by_class()")
        print("pattern.web.Element.by_id()")
        print("pattern.web.Element.by_attribute()")

    def test_selector(self):
        # Assert DOM CSS selectors with multiple classes.
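        # Also covers the attribute operators = (exact), ^= (prefix),
        # $= (suffix), *= (substring), and the :contains() pseudo-class.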
        v = web.DOM(self.html).body
        p = v("p.class1")
        self.assertEqual(len(p), 1)
        self.assertTrue("class1" in p[0].attributes["class"])
        p = v("p.class2")
        self.assertEqual(len(p), 1)
        self.assertTrue("class2" in p[0].attributes["class"])
        p = v("p.class1.class2")
        self.assertEqual(len(p), 1)
        self.assertTrue("class1" in p[0].attributes["class"])
        self.assertTrue("class2" in p[0].attributes["class"])
        e = p[0]
        self.assertEqual(e, v("p[class='class1 class2']")[0])
        self.assertEqual(e, v("p[class^='class1']")[0])
        self.assertEqual(e, v("p[class$='class2']")[0])
        self.assertEqual(e, v("p[class*='class']")[0])
        self.assertEqual(e, v("p:contains('blah')")[1])
        self.assertTrue(web.Selector("p[class='class1 class2']").match(e))
        print("pattern.web.Selector()")

#---------------------------------------------------------------------------------------------------


class TestDocumentParser(unittest.TestCase):

    def setUp(self):
        pass

    def test_pdf(self):
        # Assert PDF to string parser.
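        # Note: the generic parsedoc() entry point is used here and in
        # test_docx() below; the input format differs per test file.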
        s = web.parsedoc(os.path.join(PATH, "corpora", "carroll-wonderland.pdf"))
        self.assertTrue("Curiouser and curiouser!" in s)
        self.assertTrue(isinstance(s, str))
        print("pattern.web.parsepdf()")

    def test_docx(self):
        # Assert DOCX to string parser.
        s = web.parsedoc(os.path.join(PATH, "corpora", "carroll-lookingglass.docx"))
        self.assertTrue("'Twas brillig, and the slithy toves" in s)
        self.assertTrue(isinstance(s, str))
        print("pattern.web.parsedocx()")

#---------------------------------------------------------------------------------------------------


class TestLocale(unittest.TestCase):

    def setUp(self):
        pass

    def test_encode_language(self):
        # Assert "Dutch" => "nl".
        self.assertEqual(web.locale.encode_language("dutch"), "nl")
        self.assertEqual(web.locale.encode_language("?????"), None)
        print("pattern.web.locale.encode_language()")

    def test_decode_language(self):
        # Assert "nl" => "Dutch".
        self.assertEqual(web.locale.decode_language("nl"), "Dutch")
        self.assertEqual(web.locale.decode_language("NL"), "Dutch")
        self.assertEqual(web.locale.decode_language("??"), None)
        print("pattern.web.locale.decode_language()")

    def test_encode_region(self):
        # Assert "Belgium" => "BE".
        self.assertEqual(web.locale.encode_region("belgium"), "BE")
        self.assertEqual(web.locale.encode_region("???????"), None)
        print("pattern.web.locale.encode_region()")

    def test_decode_region(self):
        # Assert "BE" => "Belgium".
        self.assertEqual(web.locale.decode_region("be"), "Belgium")
        self.assertEqual(web.locale.decode_region("BE"), "Belgium")
        self.assertEqual(web.locale.decode_region("??"), None)
        print("pattern.web.locale.decode_region()")

    def test_languages(self):
        # Assert "BE" => "fr" + "nl".
        self.assertEqual(web.locale.languages("be"), ["fr", "nl"])
        print("pattern.web.locale.languages()")

    def test_regions(self):
        # Assert "nl" => "NL" + "BE".
        self.assertEqual(web.locale.regions("nl"), ["NL", "BE"])
        print("pattern.web.locale.regions()")

    def test_regionalize(self):
        # Assert "nl" => "nl-NL" + "nl-BE".
        self.assertEqual(web.locale.regionalize("nl"), ["nl-NL", "nl-BE"])
        print("pattern.web.locale.regionalize()")

    def test_geocode(self):
        # Assert region geocode.
        v = web.locale.geocode("brussels")
        self.assertAlmostEqual(v[0], 50.83, places=2)
        self.assertAlmostEqual(v[1], 4.33, places=2)
        self.assertEqual(v[2], "nl")
        self.assertEqual(v[3], "Belgium")
        print("pattern.web.locale.geocode()")

    def test_correlation(self):
        # Test the correlation between locale.LANGUAGE_REGION and locale.GEOCODE.
        # It should increase as new languages and locations are added.
        i = 0
        n = len(web.locale.GEOCODE)
        for city, (latitude, longitude, language, region) in web.locale.GEOCODE.items():
            if web.locale.encode_region(region) is not None:
                i += 1
        self.assertTrue(float(i) / n > 0.60)

#---------------------------------------------------------------------------------------------------
# You need to define a username, password and mailbox to test on.


class TestMail(unittest.TestCase):

    def setUp(self):
        self.username = ""
        self.password = ""
        self.service = web.GMAIL
        self.port = 993
        self.SSL = True
        self.query1 = "google" # FROM-field query in Inbox.
        self.query2 = "viagra" # SUBJECT-field query in Spam.

    def test_mail(self):
        if not self.username or not self.password:
            return
        # Assert web.imap.Mail.
        m = web.Mail(self.username, self.password, service=self.service, port=self.port, secure=self.SSL)
        # Assert web.imap.MailFolder (assuming GMail folders).
        print(m.folders)
        self.assertTrue(len(m.folders) > 0)
        self.assertTrue(len(m.inbox) > 0)
        print("pattern.web.Mail")

    def test_mail_message1(self):
        if not self.username or not self.password or not self.query1:
            return
        # Assert web.imap.Mailfolder.search().
        m = web.Mail(self.username, self.password, service=self.service, port=self.port, secure=self.SSL)
        a = m.inbox.search(self.query1, field=web.FROM)
        self.assertTrue(isinstance(a[0], int))
        # Assert web.imap.Mailfolder.read().
        e = m.inbox.read(a[0], attachments=False, cached=False)
        # Assert web.imap.Message.
        self.assertTrue(isinstance(e, web.imap.Message))
        self.assertTrue(isinstance(e.author, str))
        self.assertTrue(isinstance(e.email_address, str))
        self.assertTrue(isinstance(e.date, str))
        self.assertTrue(isinstance(e.subject, str))
        self.assertTrue(isinstance(e.body, str))
        self.assertTrue(self.query1 in e.author.lower())
        self.assertTrue("@" in e.email_address)
        print("pattern.web.Mail.search(field=FROM)")
        print("pattern.web.Mail.read()")

    def test_mail_message2(self):
        if not self.username or not self.password or not self.query2:
            return
        # Test if we can download some mail attachments.
        # Set query2 to a mail subject of a spam e-mail you know contains an attachment.
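        # Each entry in Message.attachments is a (MIME-type, data) tuple.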
        m = web.Mail(self.username, self.password, service=self.service, port=self.port, secure=self.SSL)
        if "spam" in m.folders:
            for id in m.spam.search(self.query2, field=web.SUBJECT):
                e = m.spam.read(id, attachments=True, cached=False)
                if len(e.attachments) > 0:
                    self.assertTrue(isinstance(e.attachments[0][1], bytes))
                    self.assertTrue(len(e.attachments[0][1]) > 0)
                    print("pattern.web.Message.attachments (MIME-type: %s)" % e.attachments[0][0])
        print("pattern.web.Mail.search(field=SUBJECT)")
        print("pattern.web.Mail.read()")

#---------------------------------------------------------------------------------------------------


class TestCrawler(unittest.TestCase):

    def setUp(self):
        pass

    def test_link(self):
        # Assert web.Link parser and properties.
        # Note: assertEqual is used (the two-argument form of assertTrue
        # treats its second argument as a message, not an expected value).
        v = web.HTMLLinkParser().parse("""
            <html>
            <head>
                <title>title</title>
            </head>
            <body>
                <div id="navigation">
                    <a href="http://www.domain1.com/?p=1" title="1" rel="a">nav1</a>
                    <a href="http://www.domain2.com/?p=2" title="2" rel="b">nav1</a>
                </div>
            </body>
            </html>
        """, "http://www.domain.com/")
        self.assertEqual(v[0].url, "http://www.domain1.com/?p=1")
        self.assertEqual(v[1].url, "http://www.domain2.com/?p=2")
        self.assertEqual(v[0].description, "1")
        self.assertEqual(v[1].description, "2")
        self.assertEqual(v[0].relation, "a")
        self.assertEqual(v[1].relation, "b")
        self.assertEqual(v[0].referrer, "http://www.domain.com/")
        self.assertEqual(v[1].referrer, "http://www.domain.com/")
        self.assertTrue(v[0] < v[1])
        print("pattern.web.HTMLLinkParser")

    def test_crawler_crawl(self):
        # Assert domain filter.
        v = web.Crawler(links=["http://nodebox.net/"], domains=["nodebox.net"], delay=0.5)
        while len(v.visited) < 4:
            v.crawl(throttle=0.1, cached=False)
        for url in v.visited:
            self.assertTrue("nodebox.net" in url)
        self.assertTrue(len(v.history) == 2)
        print("pattern.web.Crawler.crawl()")

    def test_crawler_delay(self):
        # Assert delay for several crawls to a single domain.
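        # With delay=1.2, the same domain is not revisited until 1.2 seconds
        # have passed; crawl() returns a falsy value while the visit is
        # postponed, so the busy loop below must run for more than a second.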
        v = web.Crawler(links=["http://nodebox.net/"], domains=["nodebox.net"], delay=1.2)
        v.crawl()
        t = time.time()
        while not v.crawl(throttle=0.1, cached=False):
            pass
        t = time.time() - t
        self.assertTrue(t > 1.0)
        print("pattern.web.Crawler.delay")

    def test_crawler_breadth(self):
        # Assert BREADTH cross-domain preference.
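        # With method=BREADTH the crawler spreads across domains, so the first
        # three entries in Crawler.history (one per visited domain) must differ.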
        v = web.Crawler(links=["http://nodebox.net/"], delay=10)
        while len(v.visited) < 4:
            v.crawl(throttle=0.1, cached=False, method=web.BREADTH)
        self.assertTrue(list(v.history.keys())[0] != list(v.history.keys())[1])
        self.assertTrue(list(v.history.keys())[0] != list(v.history.keys())[2])
        self.assertTrue(list(v.history.keys())[1] != list(v.history.keys())[2])
        print("pattern.web.Crawler.crawl(method=BREADTH)")

#---------------------------------------------------------------------------------------------------


def suite():
    suite = unittest.TestSuite()
    suite.addTest(unittest.TestLoader().loadTestsFromTestCase(TestCache))
    suite.addTest(unittest.TestLoader().loadTestsFromTestCase(TestUnicode))
    suite.addTest(unittest.TestLoader().loadTestsFromTestCase(TestURL))
    suite.addTest(unittest.TestLoader().loadTestsFromTestCase(TestPlaintext))
    suite.addTest(unittest.TestLoader().loadTestsFromTestCase(TestSearchEngine))
    suite.addTest(unittest.TestLoader().loadTestsFromTestCase(TestDOM))
    suite.addTest(unittest.TestLoader().loadTestsFromTestCase(TestDocumentParser))
    suite.addTest(unittest.TestLoader().loadTestsFromTestCase(TestLocale))
    suite.addTest(unittest.TestLoader().loadTestsFromTestCase(TestMail))
    suite.addTest(unittest.TestLoader().loadTestsFromTestCase(TestCrawler))
    return suite

if __name__ == "__main__":
    result = unittest.TextTestRunner(verbosity=1).run(suite())
    sys.exit(not result.wasSuccessful())