#### PATTERN | WEB #################################################################################
# -*- coding: utf-8 -*-
# Copyright (c) 2010 University of Antwerp, Belgium
# Author: Tom De Smedt
# License: BSD (see LICENSE.txt for details).
# http://www.clips.ua.ac.be/pages/pattern
####################################################################################################
# Python API interface for various web services (Google, Twitter, Wikipedia, ...)

from __future__ import absolute_import
from __future__ import unicode_literals
from __future__ import print_function
from __future__ import division

from builtins import str, bytes, dict, int, chr
from builtins import map, filter, zip
from builtins import object, range, next

from .utils import get_url_query, get_form_action, stringify_values, json_iter_parse

import os
import sys
import threading
import time
import socket
import requests
import datetime
import ssl

from io import open

try:
    # Python 3
    from urllib.parse import urlparse, urljoin, urlsplit, urlencode, quote_plus, unquote_plus
    from urllib.request import urlopen, Request, HTTPHandler, HTTPRedirectHandler, ProxyHandler, HTTPCookieProcessor, install_opener, build_opener
    from urllib.error import HTTPError as UrllibHTTPError
    from urllib.error import URLError as UrllibURLError
except ImportError:
    # Python 2
    from urlparse import urlparse, urljoin, urlsplit
    from urllib import urlencode, quote_plus, unquote_plus
    from urllib2 import urlopen, Request, HTTPHandler, HTTPRedirectHandler, ProxyHandler, HTTPCookieProcessor, install_opener, build_opener
    from urllib2 import HTTPError as UrllibHTTPError
    from urllib2 import URLError as UrllibURLError

import base64

from html.entities import name2codepoint

try:
    # Python 2
    import httplib
except ImportError:
    # Python 3
    import http.client as httplib

from html.parser import HTMLParser as _HTMLParser

# FIX: the version comments were swapped here — "cookielib" is the
# Python 2 module; "http.cookiejar" is its Python 3 replacement.
try:
    # Python 2
    import cookielib
except ImportError:
    # Python 3
    import http.cookiejar as cookielib

import re
import xml.dom.minidom
import unicodedata
import string

try:
    # Python 2
    from cStringIO import StringIO
except ImportError:
    # Python 3
    from io import StringIO

import bisect
import itertools

try:
    # Python 2
    import new
except ImportError:
    # Python 3: We don't actually need it (in this case)
    new = None

import feedparser
import json

from . import api
from . import oauth
from . import locale

import bs4 as BeautifulSoup

try:
    # Import persistent Cache.
    # If this module is used separately,
    # a dict is used (i.e. this Python session only).
    from .cache import Cache, cache, TMP
except:
    cache = {}

try:
    from .imap import Mail, MailFolder, Message, GMAIL
    from .imap import MailError, MailServiceError, MailLoginError, MailNotLoggedIn
    from .imap import FROM, SUBJECT, DATE, BODY, ATTACHMENTS
except:
    pass

try:
    MODULE = os.path.dirname(os.path.realpath(__file__))
except:
    MODULE = ""

#### UNICODE #######################################################################################
# Latin-1 (ISO-8859-1) encoding is identical to Windows-1252 except for the code points 128-159:
# Latin-1 assigns control codes in this range, Windows-1252 has characters, punctuation, symbols
# assigned to these code points.

from pattern.helpers import encode_string, decode_string

u = decode_utf8 = decode_string
s = encode_utf8 = encode_string

# Code points that Windows-1252 maps into 128-159 (curly quotes, dashes,
# ellipsis, euro sign, ...) — the usual suspects in mojibake text.
GREMLINS = set([
    0x0152, 0x0153, 0x0160, 0x0161, 0x0178, 0x017E, 0x017D, 0x0192, 0x02C6,
    0x02DC, 0x2013, 0x2014, 0x201A, 0x201C, 0x201D, 0x201E, 0x2018, 0x2019,
    0x2020, 0x2021, 0x2022, 0x2026, 0x2030, 0x2039, 0x203A, 0x20AC, 0x2122
])


def fix(s, ignore=""):
    """ Returns a Unicode string that fixes common encoding problems (Latin-1, Windows-1252).
        For example: fix("clich\xc3\xa9") => "cliché".
        Characters listed in ignore are left untouched.
    """
    # http://blog.luminoso.com/2012/08/20/fix-unicode-mistakes-with-python/
    if not isinstance(s, str):
        s = s.decode("utf-8")
        # If this doesn't work,
        # copy & paste string in a Unicode .txt,
        # and then pass open(f).read() to fix().
    u = []
    i = 0
    for j, ch in enumerate(s):
        if ch in ignore:
            continue
        if ord(ch) < 128: # ASCII
            continue
        if ord(ch) in GREMLINS:
            ch = ch.encode("windows-1252")
        else:
            try:
                ch = ch.encode("latin-1")
            except:
                ch = ch.encode("utf-8")
        u.append(s[i:j].encode("utf-8"))
        u.append(ch)
        i = j + 1
    u.append(s[i:].encode("utf-8"))
    u = b"".join(u)
    u = u.decode("utf-8", "replace")
    u = u.replace("\n", "\n ")
    u = u.split(" ")
    # Revert words that have the replacement character,
    # i.e., fix("clich\xc3\xa9") should not return "clich\ufffd".
    for i, (w1, w2) in enumerate(zip(s.split(" "), u)):
        if "\ufffd" in w2: # �
            u[i] = w1
    u = " ".join(u)
    u = u.replace("\n ", "\n")
    return u


def latin(s):
    """ Returns True if the string contains only Latin-1 characters
        (no Chinese, Japanese, Arabic, Cyrillic, Hebrew, Greek, ...).
    """
    if not isinstance(s, str):
        s = s.decode("utf-8")
    return all(unicodedata.name(ch).startswith("LATIN") for ch in s if ch.isalpha())

# For clearer source code:
bytestring = b = s

#### ASYNCHRONOUS REQUEST ##########################################################################


class AsynchronousRequest(object):

    def __init__(self, function, *args, **kwargs):
        """ Executes the function in the background.
            AsynchronousRequest.done is False as long as it is busy,
            but the program will not halt in the meantime.
            AsynchronousRequest.value contains the function's return value once done.
            AsynchronousRequest.error contains the Exception raised by an erroneous function.
            For example, this is useful for running live web requests
            while keeping an animation running.
            For good reasons, there is no way to interrupt a background process (i.e. Python thread).
            You are responsible for ensuring that the given function doesn't hang.
        """
        self._response = None # The return value of the given function.
        self._error = None    # The exception (if any) raised by the function.
        self._time = time.time()
        self._function = function
        self._thread = threading.Thread(target=self._fetch, args=(function,) + args, kwargs=kwargs)
        self._thread.start()

    def _fetch(self, function, *args, **kwargs):
        """ Executes the function and sets AsynchronousRequest.response. """
        try:
            self._response = function(*args, **kwargs)
        except Exception as e:
            self._error = e

    def now(self):
        """ Waits for the function to finish and yields its return value. """
        self._thread.join()
        return self._response

    @property
    def elapsed(self):
        return time.time() - self._time

    @property
    def done(self):
        # FIX: Thread.isAlive() was removed in Python 3.9;
        # is_alive() exists on both Python 2.6+ and Python 3.
        return not self._thread.is_alive()

    @property
    def value(self):
        return self._response

    @property
    def error(self):
        return self._error

    def __repr__(self):
        return "AsynchronousRequest(function='%s')" % self._function.__name__


def asynchronous(function, *args, **kwargs):
    """ Returns an AsynchronousRequest object for the given function. """
    return AsynchronousRequest(function, *args, **kwargs)

send = asynchronous

#### URL ###########################################################################################

# User agent and referrer.
# Used to identify the application accessing the web.
USER_AGENT = "Pattern/2.6 +http://www.clips.ua.ac.be/pattern"
REFERRER = "http://www.clips.ua.ac.be/pattern"

# Mozilla user agent.
# Websites can include code to block out any application except browsers.
MOZILLA = "Mozilla/5.0"

# HTTP request method.
GET = "get"   # Data is encoded in the URL.
POST = "post" # Data is encoded in the message body.

# URL parts.
# protocol://username:password@domain:port/path/page?query_string#anchor
PROTOCOL, USERNAME, PASSWORD, DOMAIN, PORT, PATH, PAGE, QUERY, ANCHOR = \
    "protocol", "username", "password", "domain", "port", "path", "page", "query", "anchor"

# MIME type.
MIMETYPE_WEBPAGE = ["text/html"]
MIMETYPE_STYLESHEET = ["text/css"]
MIMETYPE_PLAINTEXT = ["text/plain"]
MIMETYPE_PDF = ["application/pdf"]
MIMETYPE_NEWSFEED = ["application/rss+xml", "application/atom+xml"]
MIMETYPE_IMAGE = ["image/gif", "image/jpeg", "image/png", "image/tiff"]
MIMETYPE_AUDIO = ["audio/mpeg", "audio/mp4", "audio/x-aiff", "audio/x-wav"]
MIMETYPE_VIDEO = ["video/mpeg", "video/mp4", "video/avi", "video/quicktime", "video/x-flv"]
MIMETYPE_ARCHIVE = ["application/x-stuffit", "application/x-tar", "application/zip"]
MIMETYPE_SCRIPT = ["application/javascript", "application/ecmascript"]


def extension(filename):
    """ Returns the extension in the given filename: "cat.jpg" => ".jpg". """
    return os.path.splitext(filename)[1]


def urldecode(query):
    """ Inverse operation of urlencode.
        Returns a dictionary of (name, value)-items from a URL query string.
        Values are coerced: "" and "None" => None, digits => int, floats => float.
        A parameter without a value (e.g. "?q") maps to None.
    """
    def _format(s):
        if s == "" or s == "None":
            return None
        if s.lstrip("-").isdigit():
            return int(s)
        try:
            return float(s)
        except:
            return s
    if query:
        query = query.lstrip("?").split("&")
        # "k=v" => (k, v); a bare "k" => (k, None).
        query = ((kv.split("=") + [None])[:2] for kv in query)
        if sys.version > "3":
            # FIX: a parameter without a value yields v=None, and unquote_plus(None)
            # raises TypeError on Python 3 — map it to None explicitly instead
            # (the Python 2 branch already produced None via bytestring(None) => "None").
            query = ((u(unquote_plus(k)), _format(u(unquote_plus(v))) if v is not None else None)
                     for k, v in query if k != "")
        else:
            query = ((u(unquote_plus(bytestring(k))), _format(u(unquote_plus(bytestring(v)))))
                     for k, v in query if k != "")
        return dict(query)
    return {}

url_decode = urldecode


def proxy(host, protocol="https"):
    """ Returns the value for the URL.open() proxy parameter.
        - host: host address of the proxy server.
    """
    return (host, protocol)


class Error(Exception):
    """ Base class for pattern.web errors.
        - src: the original exception (e.g. urllib's HTTPError), if any.
        - url: the URL that caused the error, if any.
    """

    def __init__(self, *args, **kwargs):
        Exception.__init__(self, *args)
        self.src = kwargs.pop("src", None)
        self.url = kwargs.pop("url", None)

    @property
    def headers(self):
        # Response headers of the wrapped source exception.
        return dict(list(self.src.headers.items()))


class URLError(Error):
    pass # URL contains errors (e.g. a missing t in htp://).


class URLTimeout(URLError):
    pass # URL takes too long to load.
class HTTPError(URLError):
    pass # URL causes an error on the contacted server.


class HTTP301Redirect(HTTPError):
    pass # Too many redirects.
         # The site may be trying to set a cookie and waiting for you to return it,
         # or taking other measures to discern a browser from a script.
         # For specific purposes you should build your own urllib2.HTTPRedirectHandler
         # and pass it to urllib2.build_opener() in URL.open()


class HTTP400BadRequest(HTTPError):
    pass # URL contains an invalid request.


class HTTP401Authentication(HTTPError):
    pass # URL requires a login and password.


class HTTP403Forbidden(HTTPError):
    pass # URL is not accessible (user-agent?)


class HTTP404NotFound(HTTPError):
    pass # URL doesn't exist on the internet.


class HTTP414RequestURITooLong(HTTPError):
    pass # URL is too long.


class HTTP420Error(HTTPError):
    pass # Used by Twitter for rate limiting.


class HTTP429TooMayRequests(HTTPError):
    pass # Used by Twitter for rate limiting.
         # NOTE(review): name misspells "TooMany", but it is part of the public
         # API — renaming would break callers that catch this exception.


class HTTP500InternalServerError(HTTPError):
    pass # Generic server error.


class HTTP503ServiceUnavailable(HTTPError):
    pass # Used by Bing for rate limiting.


class URL(object):

    def __init__(self, string="", method=GET, query={}, **kwargs):
        """ URL object with the individual parts available as attributes:
            For protocol://username:password@domain:port/path/page?query_string#anchor:
            - URL.protocol: http, https, ftp, ...
            - URL.username: username for restricted domains.
            - URL.password: password for restricted domains.
            - URL.domain  : the domain name, e.g. nodebox.net.
            - URL.port    : the server port to connect to.
            - URL.path    : the server path of folders, as a list, e.g. ['news', '2010']
            - URL.page    : the page name, e.g. page.html.
            - URL.query   : the query string as a dictionary of (name, value)-items.
            - URL.anchor  : the page anchor.
            If method is POST, the query string is sent with HTTP POST.
        """
        # Use __dict__ directly since __setattr__ is overridden.
        self.__dict__["method"] = method
        self.__dict__["_string"] = u(string)
        self.__dict__["_parts"] = None    # Cached parts dict, filled lazily by _parse().
        self.__dict__["_headers"] = None  # Cached response headers.
        self.__dict__["_redirect"] = None # Cached redirect target.
        if isinstance(string, URL):
            self.__dict__["method"] = string.method
            self.query.update(string.query)
        if len(query) > 0:
            # Requires that we parse the string first (see URL.__setattr__).
            self.query.update(query)
        if len(kwargs) > 0:
            # Requires that we parse the string first (see URL.__setattr__).
            self.parts.update(kwargs)

    def _parse(self):
        """ Parses all the parts of the URL string to a dictionary.
            URL format: protocol://username:password@domain:port/path/page?querystring#anchor
            For example: http://user:pass@example.com:992/animal/bird?species=seagull&q#wings
            This is a cached method that is only invoked when necessary, and only once.
        """
        p = urlsplit(self._string)
        P = {PROTOCOL: p[0],            # http
             USERNAME: "",              # user
             PASSWORD: "",              # pass
               DOMAIN: p[1],            # example.com
                 PORT: "",              # 992
                 PATH: p[2],            # [animal]
                 PAGE: "",              # bird
                QUERY: urldecode(p[3]), # {"species": "seagull", "q": None}
               ANCHOR: p[4]             # wings
        }
        # Split the username and password from the domain.
        if "@" in P[DOMAIN]:
            P[USERNAME], \
            P[PASSWORD] = (p[1].split("@")[0].split(":") + [""])[:2]
            P[DOMAIN] = p[1].split("@")[1]
        # Split the port number from the domain.
        if ":" in P[DOMAIN]:
            P[DOMAIN], \
            P[PORT] = P[DOMAIN].split(":")
            P[PORT] = P[PORT].isdigit() and int(P[PORT]) or P[PORT]
        # Split the base page from the path.
        if "/" in P[PATH]:
            P[PAGE] = p[2].split("/")[-1]
            P[PATH] = p[2][:len(p[2]) - len(P[PAGE])].strip("/").split("/")
            P[PATH] = list(filter(lambda v: v != "", P[PATH]))
        else:
            P[PAGE] = p[2].strip("/")
            P[PATH] = []
        self.__dict__["_parts"] = P

    # URL.string yields unicode(URL) by joining the different parts,
    # if the URL parts have been modified.
    def _get_string(self):
        return str(self)

    def _set_string(self, v):
        self.__dict__["_string"] = u(v)
        self.__dict__["_parts"] = None

    string = property(_get_string, _set_string)

    @property
    def parts(self):
        """ Yields a dictionary with the URL parts. """
        if not self._parts:
            self._parse()
        return self._parts

    @property
    def querystring(self):
        """ Yields the URL querystring: "www.example.com?page=1" => "page=1" """
        s = self.parts[QUERY].items()
        # None values are serialized as empty strings.
        s = dict((bytestring(k), v if v is not None else "") for k, v in s)
        if sys.version > "3":
            # Python 3
            s = urlencode(s)
        else:
            # Python 2: urlencode() expects byte strings
            t = {key: value.encode("utf-8") if isinstance(value, str) else value for key, value in s.items()}
            s = urlencode(t).decode("utf-8")
        return s

    def __getattr__(self, k):
        # Unknown attributes fall through to the parsed URL parts,
        # so url.domain, url.page, etc. work.
        if k in self.__dict__:
            return self.__dict__[k]
        if k in self.parts:
            return self.__dict__["_parts"][k]
        raise AttributeError("'URL' object has no attribute '%s'" % k)

    def __setattr__(self, k, v):
        if k in self.__dict__:
            self.__dict__[k] = u(v)
            return
        if k == "string":
            self._set_string(v)
            return
        if k == "query":
            self.parts[k] = v
            return
        if k in self.parts:
            # Setting a part invalidates nothing else; URL.string is rebuilt in __str__.
            self.__dict__["_parts"][k] = u(v)
            return
        raise AttributeError("'URL' object has no attribute '%s'" % k)

    def open(self, timeout=10, proxy=None, user_agent=USER_AGENT, referrer=REFERRER, authentication=None):
        """ Returns a connection to the url from which data can be retrieved with connection.read().
            When the timeout amount of seconds is exceeded, raises a URLTimeout.
            When an error occurs, raises a URLError (e.g. HTTP404NotFound).
            - proxy          : a (host, protocol)-tuple, see proxy().
            - authentication : a (username, password)-tuple for HTTP basic auth.
        """
        url = self.string
        # Handle local files directly
        if os.path.exists(url):
            return urlopen(url)
        # Handle method=POST with query string as a separate parameter.
        post = self.method == POST and self.querystring or None
        socket.setdefaulttimeout(timeout)
        # Handle proxies and cookies.
        handlers = []
        if proxy:
            handlers.append(ProxyHandler({proxy[1]: proxy[0]}))
        handlers.append(HTTPCookieProcessor(cookielib.CookieJar()))
        handlers.append(HTTPHandler)
        install_opener(build_opener(*handlers))
        # Send request.
        try:
            request = Request(url, post, {
                "User-Agent": user_agent,
                   "Referer": referrer
            })
            # Basic authentication is established with authentication=(username, password).
            if authentication is not None:
                authentication = tuple(encode_utf8(x) for x in authentication)
                request.add_header("Authorization", "Basic %s" %
                    decode_utf8(base64.b64encode(b'%s:%s' % authentication)))
            return urlopen(request)
        except UrllibHTTPError as e:
            # Map HTTP status codes to the module's exception hierarchy.
            if e.code == 301:
                raise HTTP301Redirect(src=e, url=url)
            if e.code == 400:
                raise HTTP400BadRequest(src=e, url=url)
            if e.code == 401:
                raise HTTP401Authentication(src=e, url=url)
            if e.code == 403:
                raise HTTP403Forbidden(src=e, url=url)
            if e.code == 404:
                raise HTTP404NotFound(src=e, url=url)
            if e.code == 414:
                raise HTTP414RequestURITooLong(src=e, url=url)
            if e.code == 420:
                raise HTTP420Error(src=e, url=url)
            if e.code == 429:
                raise HTTP429TooMayRequests(src=e, url=url)
            if e.code == 500:
                raise HTTP500InternalServerError(src=e, url=url)
            if e.code == 503:
                raise HTTP503ServiceUnavailable(src=e, url=url)
            raise HTTPError(str(e), src=e, url=url)
        except httplib.BadStatusLine as e:
            raise HTTPError(str(e), src=e, url=url)
        except socket.timeout as e:
            raise URLTimeout(src=e, url=url)
        except socket.error as e:
            # socket.error args may be (errno, message) or just (message,);
            # pad with empty strings so both positions can be inspected.
            if "timed out" in str((e.args + ("", ""))[0]) \
            or "timed out" in str((e.args + ("", ""))[1]):
                raise URLTimeout(src=e, url=url)
            raise URLError(str(e), src=e, url=url)
        except UrllibURLError as e:
            if "timed out" in str(e.reason):
                raise URLTimeout(src=e, url=url)
            raise URLError(str(e), src=e, url=url)
        except ValueError as e:
            raise URLError(str(e), src=e, url=url)

    def download(self, timeout=10, cached=True, throttle=0, proxy=None, user_agent=USER_AGENT, referrer=REFERRER, authentication=None, unicode=False, **kwargs):
        """ Downloads the content at the given URL (by default it will be cached locally).
            Unless unicode=False, the content is returned as a unicode string.
            - throttle : number of seconds to wait (total) before returning, for rate limiting.
        """
        # Filter OAuth parameters from cache id (they will be unique for each request).
        if self._parts is None and self.method == GET and "oauth_" not in self._string:
            id = self._string
        else:
            id = repr(self.parts)
            id = re.sub("u{0,1}'oauth_.*?': u{0,1}'.*?', ", "", id)
        # Keep a separate cache of unicode and raw download for same URL.
        if unicode is True:
            id = "u" + id
        if cached and id in cache:
            if isinstance(cache, dict): # Not a Cache object.
                return cache[id]
            if unicode is True:
                return cache[id]
            if unicode is False:
                return cache.get(id, unicode=False)
        t = time.time()
        # Open a connection with the given settings, read it and (by default) cache the data.
        try:
            data = self.open(timeout, proxy, user_agent, referrer, authentication).read()
        except socket.timeout as e:
            raise URLTimeout(src=e, url=self.string)
        if unicode is True:
            data = u(data)
        if cached:
            cache[id] = data
        if throttle:
            time.sleep(max(throttle - (time.time() - t), 0))
        return data

    def read(self, *args, **kwargs):
        """ Opens the URL and reads (up to) the given number of bytes. """
        return self.open(**kwargs).read(*args)

    @property
    def exists(self, timeout=10):
        """ Yields False if the URL generates a HTTP404NotFound error. """
        # NOTE(review): as a property, the timeout parameter can never be
        # passed by callers; the default of 10 is always used.
        try:
            self.open(timeout)
        except HTTP404NotFound:
            return False
        except HTTPError:
            return True
        except URLTimeout:
            return True
        except URLError:
            return False
        except:
            # Any other failure is treated as "probably exists".
            return True
        return True

    @property
    def mimetype(self, timeout=10):
        """ Yields the MIME-type of the document at the URL, or None.
            MIME is more reliable than simply checking the document extension.
            You can then do: URL.mimetype in MIMETYPE_IMAGE.
        """
        try:
            return self.headers["content-type"].split(";")[0]
        except KeyError:
            return None

    @property
    def headers(self, timeout=10):
        """ Yields a dictionary with the HTTP response headers.
            The result is cached after the first request.
        """
        if self.__dict__["_headers"] is None:
            try:
                h = dict(self.open(timeout).info())
            except URLError:
                h = {}
            self.__dict__["_headers"] = h
        # Backward compatibility (Python 2)
        if "Content-Type" in self.__dict__["_headers"]:
            self.__dict__["_headers"]["content-type"] = self.__dict__["_headers"]["Content-Type"]
        return self.__dict__["_headers"]

    @property
    def redirect(self, timeout=10):
        """ Yields the redirected URL, or None.
            The result is cached after the first request ("" = no redirect).
        """
        if self.__dict__["_redirect"] is None:
            try:
                r = u(self.open(timeout).geturl())
            except URLError:
                r = None
            self.__dict__["_redirect"] = r != self.string and r or ""
        return self.__dict__["_redirect"] or None

    def __str__(self):
        # The string representation includes the query attributes with HTTP GET.
        # Note: the local name u shadows the module-level u() decoder here;
        # it is just an accumulator of URL fragments.
        P = self.parts
        u = []
        if P[PROTOCOL]:
            u.append("%s://" % P[PROTOCOL])
        if P[USERNAME]:
            u.append("%s:%s@" % (P[USERNAME], P[PASSWORD]))
        if P[DOMAIN]:
            u.append(P[DOMAIN])
        if P[PORT]:
            u.append(":%s" % P[PORT])
        if P[PORT] or P[DOMAIN] and not P[PATH] and not P[PAGE]:
            u.append("/")
        if P[PATH]:
            u.append("/%s/" % "/".join(P[PATH]))
        if P[PAGE] and len(u) > 0:
            u[-1] = u[-1].rstrip("/")
        if P[PAGE]:
            u.append("/%s" % P[PAGE])
        if P[QUERY] and self.method == GET:
            u.append("?%s" % self.querystring)
        if P[ANCHOR]:
            u.append("#%s" % P[ANCHOR])
        u = "".join(u)
        u = u.lstrip("/")
        return u

    def __repr__(self):
        return "URL(%s, method=%s)" % (repr(self.string), repr(self.method))

    def copy(self):
        return URL(self.string, self.method, self.query)


def download(url="", method=GET, query={}, timeout=10, cached=True, throttle=0, proxy=None, user_agent=USER_AGENT, referrer=REFERRER, authentication=None, unicode=False):
    """ Downloads the content at the given URL (by default it will be cached locally).
        Unless unicode=False, the content is returned as a unicode string.
    """
    return URL(url, method, query).download(timeout, cached, throttle, proxy, user_agent, referrer, authentication, unicode)

#url = URL("http://user:pass@example.com:992/animal/bird?species#wings")
#print(url.parts)
#print(url.query)
#print(url.string)

#--- STREAMING URL BUFFER --------------------------------------------------------------------------


def bind(object, method, function):
    """ Attaches the function as a method with the given name to the given object. """
    if new:
        # Python 2
        setattr(object, method, new.instancemethod(function, object))
    else:
        # Python 3: There is no good reason to use this function in Python 3.
        setattr(object, method, function)


class Stream(list):

    def __init__(self, url, delimiter="\n", **kwargs):
        """ Buffered stream of data from a given URL.
            Extra keyword arguments are passed on to URL.open().
        """
        self.socket = URL(url).open(**kwargs)
        self.buffer = ""
        self.delimiter = delimiter

    def update(self, bytes=1024):
        """ Reads a number of bytes from the stream.
            If a delimiter is encountered, calls Stream.parse() on the packet.
            Parsed packets are appended to the list and also returned.
        """
        packets = []
        self.buffer += self.socket.read(bytes).decode("utf-8")
        self.buffer = self.buffer.split(self.delimiter, 1)
        while len(self.buffer) > 1:
            data = self.buffer[0]
            data = self.parse(data)
            if data is not None:
                packets.append(data)
            self.buffer = self.buffer[-1]
            self.buffer = self.buffer.split(self.delimiter, 1)
        self.buffer = self.buffer[-1]
        self.extend(packets)
        return packets

    def parse(self, data):
        """ Must be overridden in a subclass. """
        return data

    def clear(self):
        list.__init__(self, [])


def stream(url, delimiter="\n", parse=lambda data: data, **kwargs):
    """ Returns a new Stream with the given parse method. """
    stream = Stream(url, delimiter, **kwargs)
    bind(stream, "parse", lambda stream, data: parse(data))
    return stream

#--- FIND URLs -------------------------------------------------------------------------------------
# Functions for parsing URL's and e-mail addresses from strings.
# Punctuation that may precede (open) or follow (close) a URL in running text.
RE_URL_PUNCTUATION = ("\"'{(>", "\"'.,;)}")
RE_URL_HEAD = r"[%s|\[|\s]" % "|".join(RE_URL_PUNCTUATION[0]) # Preceded by space, parenthesis or HTML tag.
RE_URL_TAIL = r"[%s|\]]*[\s|\<]" % "|".join(RE_URL_PUNCTUATION[1]) # Followed by space, punctuation or HTML tag.
RE_URL1 = r"(https?://.*?)" + RE_URL_TAIL               # Starts with http:// or https://
RE_URL2 = RE_URL_HEAD + r"(www\..*?\..*?)" + RE_URL_TAIL # Starts with www.
RE_URL3 = RE_URL_HEAD + r"([\w|-]*?\.(com|net|org|edu|de|uk))" + RE_URL_TAIL

RE_URL1 = re.compile(RE_URL1, re.I)
RE_URL2 = re.compile(RE_URL2, re.I)
RE_URL3 = re.compile(RE_URL3, re.I)


def find_urls(string, unique=True):
    """ Returns a list of URLs parsed from the string.
        Works on http://, https://, www. links or domain names ending in .com, .org, .net.
        Links can be preceded by leading punctuation (open parens)
        and followed by trailing punctuation (period, comma, close parens).
    """
    string = u(string)
    string = string.replace("\u2024", ".")
    string = string.replace(" ", " ")
    found = []
    for pattern in (RE_URL1, RE_URL2, RE_URL3):
        # Pad with spaces so head/tail context also matches at string edges.
        for match in pattern.finditer(" %s " % string):
            candidate = match.group(1)
            candidate = candidate.split("\">")[0].split("'>")[0] # google.com">Google => google.com
            if unique and candidate in found:
                continue
            found.append(candidate)
    return found

links = find_urls

RE_EMAIL = re.compile(r"[\w\-\.\+]+@(\w[\w\-]+\.)+[\w\-]+") # tom.de+smedt@clips.ua.ac.be


def find_email(string, unique=True):
    """ Returns a list of e-mail addresses parsed from the string. """
    string = u(string).replace("\u2024", ".")
    found = []
    for match in RE_EMAIL.finditer(string):
        address = match.group(0)
        if unique and address in found:
            continue
        found.append(address)
    return found


def find_between(a, b, string):
    """ Returns a list of substrings between a and b in the given string. """
    pattern = re.compile("%s(.*?)%s" % (a, b), re.DOTALL | re.I)
    return list(pattern.findall(string))

#### PLAIN TEXT ####################################################################################
# Functions for stripping HTML tags from strings.
BLOCK = [ "title", "h1", "h2", "h3", "h4", "h5", "h6", "p", "center", "blockquote", "div", "table", "ul", "ol", "dl", "pre", "code", "form" ] SELF_CLOSING = ["br", "hr", "img"] # Element tag replacements for a stripped version of HTML source with strip_tags(). # Block-level elements are followed by linebreaks, # list items are preceded by an asterisk ("*"). LIST_ITEM = "*" blocks = dict.fromkeys(BLOCK + ["br", "tr", "td"], ("", "\n\n")) blocks.update({ "li": ("%s " % LIST_ITEM, "\n"), "img": ("", ""), "br": ("", "\n"), "th": ("", "\n"), "tr": ("", "\n"), "td": ("", "\t"), }) class HTMLParser(_HTMLParser): def clean(self, html): html = decode_utf8(html) html = html.replace("/>", " />") html = html.replace(" />", " />") html = html.replace(") removed. - exclude : a list of tags to keep. Element attributes are stripped. To preserve attributes a dict of (tag name, [attribute])-items can be given. - replace : a dictionary of (tag name, (replace_before, replace_after))-items. By default, block-level elements are separated with linebreaks. """ if html is None: return None self._exclude = isinstance(exclude, dict) and exclude or dict.fromkeys(exclude, []) self._replace = replace self._data = [] self.feed(self.clean(html)) self.close() self.reset() return "".join(self._data) def handle_starttag(self, tag, attributes): if tag in BLOCK and self._data and self._data[-1][-1:] != "\n": # Block-level elements always break to a new line. self._data.append("\n") if tag in self._exclude: # Create the tag attribute string, # including attributes defined in the HTMLTagStripper._exclude dict. 
a = len(self._exclude[tag]) > 0 and attributes or [] a = ["%s=\"%s\"" % (k, v) for k, v in a if k in self._exclude[tag]] a = (" " + " ".join(a)).rstrip() self._data.append("<%s%s>" % (tag, a)) if tag in self._replace: self._data.append(self._replace[tag][0]) if tag in self._replace and tag in SELF_CLOSING: self._data.append(self._replace[tag][1]) def handle_endtag(self, tag): if tag in self._exclude and self._data and self._data[-1].startswith("<" + tag): # Never keep empty elements (e.g. ). self._data.pop(-1) return if tag in self._exclude: self._data.append("" % tag) if tag in self._replace: self._data.append(self._replace[tag][1]) def handle_data(self, data): self._data.append(data.strip("\n\t")) def handle_comment(self, comment): if "comment" in self._exclude or \ "!--" in self._exclude: self._data.append("" % comment) # As a function: strip_tags = HTMLTagstripper().strip def strip_element(string, tag, attributes=""): """ Removes all elements with the given tagname and attributes from the string. Open and close tags are kept in balance. No HTML parser is used: strip_element(s, "a", 'class="x"') matches '' or '' but not "". """ s = string.lower() # Case-insensitive. t = tag.strip("") a = (" " + attributes.lower().strip()).rstrip() i = 0 j = 0 while j >= 0: #i = s.find("<%s%s" % (t, a), i) m = re.search(r"<%s[^\>]*?%s" % (t, a), s[i:]) i = i + m.start() if m else -1 j = s.find("" % t, i + 1) opened, closed = s[i:j].count("<%s" % t), 1 while opened > closed and j >= 0: k = s.find("" % t, j + 1) opened += s[j:k].count("<%s" % t) closed += 1 j = k if i < 0: return string if j < 0: return string[:i] string = string[:i] + string[j + len(t) + 3:]; s = string.lower() return string def strip_between(a, b, string): """ Removes anything between (and including) string a and b inside the given string. 
""" p = "%s.*?%s" % (a, b) p = re.compile(p, re.DOTALL | re.I) return re.sub(p, "", string) def strip_javascript(html): return strip_between("", "", html) def strip_inline_css(html): return strip_between("", "", html) def strip_comments(html): return strip_between("", html) def strip_forms(html): return strip_between("", "", html) RE_AMPERSAND = re.compile("\&(?!\#)") # & not followed by # RE_UNICODE = re.compile(r'&(#?)(x|X?)(\w+);') # É def encode_entities(string): """ Encodes HTML entities in the given string ("<" => "<"). For example, to display "hello" in a browser, we need to pass "<em>hello</em>" (otherwise "hello" in italic is displayed). """ if isinstance(string, str): string = RE_AMPERSAND.sub("&", string) string = string.replace("<", "<") string = string.replace(">", ">") string = string.replace('"', """) string = string.replace("'", "'") return string def decode_entities(string): """ Decodes HTML entities in the given string ("<" => "<"). """ # http://snippets.dzone.com/posts/show/4569 def replace_entity(match): hash, hex, name = match.group(1), match.group(2), match.group(3) if hash == "#" or name.isdigit(): if hex == "": return chr(int(name)) # "&" => "&" if hex.lower() == "x": return chr(int("0x" + name, 16)) # "&" = > "&" else: cp = name2codepoint.get(name) # "&" => "&" return chr(cp) if cp else match.group() # "&foo;" => "&foo;" if isinstance(string, str): return RE_UNICODE.subn(replace_entity, string)[0] return string def encode_url(string): return quote_plus(bytestring(string)) # "black/white" => "black%2Fwhite". def decode_url(string): return u(unquote_plus(bytestring(string))) RE_SPACES = re.compile("( |\xa0)+", re.M) # Matches one or more spaces. RE_TABS = re.compile(r"\t+", re.M) # Matches one or more tabs. def collapse_spaces(string, indentation=False, replace=" "): """ Returns a string with consecutive spaces collapsed to a single space. Whitespace on empty lines and at the end of each line is removed. 
With indentation=True, retains leading whitespace on each line. """ p = [] for x in string.splitlines(): n = indentation and len(x) - len(x.lstrip()) or 0 p.append(x[:n] + RE_SPACES.sub(replace, x[n:]).strip()) return "\n".join(p) def collapse_tabs(string, indentation=False, replace=" "): """ Returns a string with (consecutive) tabs replaced by a single space. Whitespace on empty lines and at the end of each line is removed. With indentation=True, retains leading whitespace on each line. """ p = [] for x in string.splitlines(): n = indentation and len(x) - len(x.lstrip()) or 0 p.append(x[:n] + RE_TABS.sub(replace, x[n:]).strip()) return "\n".join(p) def collapse_linebreaks(string, threshold=1): """ Returns a string with consecutive linebreaks collapsed to at most the given threshold. Whitespace on empty lines and at the end of each line is removed. """ n = "\n" * threshold p = [s.rstrip() for s in string.splitlines()] string = "\n".join(p) string = re.sub(n + r"+", n, string) return string def plaintext(html, keep=[], replace=blocks, linebreaks=2, indentation=False): """ Returns a string with all HTML tags removed. Content inside HTML comments, the