You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
356 lines
12 KiB
Python
356 lines
12 KiB
Python
# publications office for reading and writing
|
|
from base64 import b64encode
|
|
from datetime import datetime
|
|
import io
|
|
import json
|
|
import multiprocessing.pool
|
|
import os
|
|
import random
|
|
import string
|
|
import tempfile
|
|
import threading
|
|
import urllib.request, urllib.parse, urllib.error
|
|
|
|
import bleach
|
|
import feedparser
|
|
import kode256
|
|
import lxml.html
|
|
import PIL
|
|
from readability import readability
|
|
import requests
|
|
|
|
from bureau import Bureau, add_command, add_api
|
|
|
|
|
|
class Publications(Bureau):
    """
    The Publications Office serves as a kind of screenless content management
    system. Create, update and organize your sites while doing most of the work
    on paper or anything you can photograph.
    """

    # NOTE(review): the docstrings of the @add_command methods below may be
    # shown to users by the Bureau command machinery -- confirm before
    # rewording them. Implementation notes are therefore kept in comments.

    name = "Publications Office"
    prefix = "PB"
    version = 0

    # Keys of the reverse url db are capped at this many bytes; LMDB rejects
    # keys longer than its compiled-in maximum (511 bytes by default).
    _MAX_KEY_BYTES = 500
    # 62 symbols ** 5 positions gives a little under a billion short-codes.
    _SHORTCODE_CHARS = string.ascii_letters + string.digits
    _SHORTCODE_LEN = 5
    # Maximum number of characters of a feed summary that gets printed.
    _SUMMARY_LIMIT = 500
    # Shared settings for all outgoing HTTP requests.
    _HTTP_TIMEOUT = 20.0
    _HTTP_HEADERS = {'User-Agent': 'Mozilla/5.0'}

    def __init__(self):
        """Set up the site directory and the two short-code databases."""
        super().__init__()

        # set up db for published sites
        # TODO: rename this to something less ambiguous
        self.db = os.path.expanduser("~/.screenless/PB.data")
        # makedirs also creates a missing parent and tolerates an existing dir
        os.makedirs(self.db, exist_ok=True)

        # set up urldb for short-codes (code -> url) and its reverse (url -> code)
        self.urldb = self.dbenv.open_db(b"urldb")
        self.rev_urldb = self.dbenv.open_db(b"rev_urldb")

    @classmethod
    def _url_key(cls, url):
        """Return the reverse-db key for ``url``.

        The key is the UTF-8 encoding of the url cut to ``_MAX_KEY_BYTES``.
        Cutting through a multi-byte character is harmless because the key is
        only ever compared as raw bytes and never decoded again. Writers and
        readers of ``rev_urldb`` must both go through this helper so that very
        long urls map to the same key.
        """
        return url.encode()[:cls._MAX_KEY_BYTES]

    @staticmethod
    def _barcode_img(code, css_class=None):
        """Return an ``<img>`` tag embedding the barcode for ``code``.

        The SVG produced by ``kode256.svg`` is inlined as a base64 data URI.
        ``css_class`` is added as the tag's ``class`` attribute when given.
        """
        encoded_svg = b64encode(kode256.svg(code).encode()).decode()
        encoded_data = "data:image/svg+xml;charset=utf-8;base64," + encoded_svg
        if css_class:
            return '<img class="%s" src="%s"/>' % (css_class, encoded_data)
        return '<img src="%s"/>' % encoded_data

    def _make_shorturl(self, url):
        """Store ``url`` under a fresh random short-code and return the code.

        Both directions (code -> url and url -> code) are written in a single
        write transaction so the two databases cannot get out of step.
        """
        with self.dbenv.begin(write=True) as txn:
            # we only have about a billion codes so make sure we don't
            # collide with a key that is already taken
            while True:
                tmpcode = ''.join(random.choice(self._SHORTCODE_CHARS)
                                  for _ in range(self._SHORTCODE_LEN))
                code_key = tmpcode.encode()
                if txn.get(code_key, db=self.urldb) is None:
                    break
            txn.put(code_key, url.encode(), db=self.urldb)
            txn.put(self._url_key(url), code_key, db=self.rev_urldb)

        return tmpcode

    def _get_url(self, shortcode):
        """look up a URL from a shortcode

        returns the full unicode url, or None when the code is unknown
        """
        with self.dbenv.begin(db=self.urldb) as txn:
            raw_url = txn.get(shortcode.encode())
        if raw_url is None:
            return None
        return raw_url.decode()

    @add_command("new", "Create a new Publication/Site")
    def new_site(self):
        """
        Create a new Publication/Site, set up config and take a picture from
        the document camera as the index page. Finally, it will print out
        the main page with commands for working with the site.
        """
        # `import PIL` on its own does not load the Image submodule
        from PIL import Image

        # pick the first unused numeric directory name as the site id
        site_id = 1
        site_dir = os.path.join(self.db, str(site_id))
        while os.path.exists(site_dir):
            site_id += 1
            site_dir = os.path.join(self.db, str(site_id))
        os.mkdir(site_dir)

        root_d = {"template": "default", "id": site_id}
        with open(os.path.join(site_dir, "root.json"), "w") as root_json:
            json.dump(root_d, root_json)

        response = self.send("PX", "photo")
        if not response or "photo" not in response:
            self.log.warning("no photo received for new site %s", site_id)
            return
        photo = response["photo"]

        # TODO: come up with a generic set of img form operations for Bureau
        # should map regions defined with percentages to names
        form_img = Image.open(photo)
        fx, fy = form_img.size
        # title: top-left box, half the width and one eighth of the height
        title_region = (0, 0, 0.5 * fx, 0.125 * fy)
        title_img = form_img.crop(title_region)
        # content: everything below the title strip, full width
        content_region = (0, 0.125 * fy, fx, fy)
        content_img = form_img.crop(content_region)
        # TODO: title_img / content_img are not stored or printed yet

    def _update_page(self, site, page):
        """Placeholder for updating a page of a site; not implemented yet."""
        pass

    @add_command("news", "Print a personalized daily newspaper")
    def daily_news(self):
        """
        Print out a selection of up-to-the-minute news items culled from various
        sources on the internet. Current unread Post, weather and finance
        information can also be shown.
        """
        news = self._get_news()
        # TODO: get weather
        # TODO: get finance
        inbox = self.send("PO", "unread")
        if inbox is None:
            inbox = []  # if IMAP times out just move on...

        # build the day number by hand: the "%e" strftime code is not
        # available on every platform
        today = datetime.today()
        date = "%s %d, %s" % (today.strftime("%A %B"), today.day,
                              today.strftime("%Y"))
        self.print_full("news.html", news=news, inbox=inbox, date=date)

    @add_command("r", "Print a web page for reading")
    def print_url(self, data):
        """
        Print out a web page for reading. The command requires a short-code,
        typically referenced via barcode. Short-codes refer to full resource
        URIs recorded in the Publications office 'urldb' database.
        """
        # the short-code is everything before the first dot
        shortcode = data.split(".", 1)[0]
        self.log.debug("looking up short-code:" + shortcode)
        url = self._get_url(shortcode)
        if not url:
            self.log.warning("no valid URL in db for short code: " + shortcode)
            return

        # download page with requests
        try:
            resp = requests.get(url, timeout=self._HTTP_TIMEOUT,
                                headers=self._HTTP_HEADERS)
        except requests.exceptions.Timeout:
            self.log.warning("Timeout reading url %s", url)
            self.print_small("Error: timed out reading " + url)
            return
        except requests.exceptions.ConnectionError:
            self.log.warning("Error reading url %s", url)
            self.print_small("Error: connect error on " + url)
            return
        except requests.exceptions.RequestException as err:
            # bad scheme, invalid url, too many redirects, ...
            self.log.warning("Error fetching url %s: %s", url, err)
            self.print_small("Error: could not fetch " + url)
            return

        # re-render with readability
        timestamp = datetime.now().strftime("Sourced %d %B, %Y at %H:%M")
        try:
            doc = readability.Document(resp.text, url=url)
            html = lxml.html.document_fromstring(doc.summary())
        except (ValueError, lxml.etree.ParserError) as err:
            self.log.warning("Could not parse document %s: %s", url, err)
            self.print_small("Error: could not parse " + url)
            return

        notecount = 0
        # store links then make corresponding svg barcodes
        for link in html.findall(".//a"):
            href = link.get("href")
            if not href:
                continue  # skip bogus links (missing or empty href)
            notecount += 1
            tmpcode = self._make_shorturl(href)

            # numbered marker inside the link itself
            footlink = html.makeelement("span")
            footlink.attrib["class"] = "footlink"
            footlink.text = str(notecount)
            link.append(footlink)

            # matching footnote: number, target url and a scannable barcode
            footnote = html.makeelement("div")
            footnote.attrib["class"] = "footnote"
            notetext = html.makeelement("div")
            notetext.text = str(notecount) + ". " + href
            footnote.append(notetext)
            barcode = self._barcode_img("PBr." + tmpcode, "endnotebc")
            footnote.append(lxml.html.fromstring(barcode))
            html.append(footnote)

        self.print_full("article.html", title=doc.title(),
                        article=lxml.html.tostring(html).decode("utf-8"),
                        url=url, date=timestamp)

    def _get_ogdata(self, url):
        """
        returns a dict with OpenGraph metadata if available

        Keys are the property names without the "og:" prefix. Any download
        or parse problem is logged and yields an empty dict instead of an
        exception.
        """
        ogdata = {}

        # download page with requests
        try:
            resp = requests.get(url, timeout=self._HTTP_TIMEOUT,
                                headers=self._HTTP_HEADERS)
        except requests.exceptions.Timeout:
            self.log.warning("Timeout fetching OpenGraph data from document %s",
                             url)
            return ogdata
        except requests.exceptions.ConnectionError:
            self.log.warning("Connection errors fetching OpenGraph from %s",
                             url)
            return ogdata
        except requests.exceptions.MissingSchema:
            self.log.warning("Can't get OpenGraph data from bogus URL %s", url)
            return ogdata
        except requests.exceptions.RequestException as err:
            self.log.warning("Error fetching OpenGraph data from %s: %s",
                             url, err)
            return ogdata

        try:
            html = lxml.html.document_fromstring(resp.text)
        except (ValueError, lxml.etree.ParserError) as err:
            # empty body, or a unicode string carrying an encoding declaration
            self.log.warning("Can't parse document %s for OpenGraph data: %s",
                             url, err)
            return ogdata

        # find all elements with property="og:<something>"
        for element in html.findall(".//*[@property]"):
            prop = element.get("property")
            if prop.startswith("og:"):
                ogdata[prop[3:]] = element.get("content")

        return ogdata

    def _fetch_feeds(self, feeds):
        """Download every feed in ``feeds`` concurrently.

        Returns a dict mapping each source url to a ``BytesIO`` holding the
        response body, or to None when the download failed.
        """
        feed_data = {}
        feedlock = threading.Lock()  # one lock shared by all fetch threads

        def fetch_feed(url):
            """get feed data with requests using a timeout"""
            try:
                resp = requests.get(url, timeout=self._HTTP_TIMEOUT)
                payload = io.BytesIO(resp.content)
            except requests.exceptions.Timeout:
                self.log.warning("Timeout reading RSS feed %s", url)
                payload = None
            except requests.exceptions.RequestException as err:
                self.log.warning("Error reading RSS feed %s: %s", url, err)
                payload = None
            with feedlock:
                feed_data[url] = payload

        threads = [threading.Thread(target=fetch_feed, args=(source["url"],))
                   for source in feeds]
        for thread in threads:
            thread.start()
        for thread in threads:
            thread.join()

        return feed_data

    def _attach_image(self, entry):
        """
        get OpenGraph data for entry
        and download image
        TODO: add microdata support here to get author

        Sets ``entry.img`` to a ``file://`` url of the downloaded picture, or
        to a single space when no picture could be obtained. Returns entry.
        """
        import http.client  # only needed for the exception type below

        entry.img = " "  # single space marks "no image"

        # prefer the thumbnail shipped with the feed; only hit the network
        # for OpenGraph data when the feed has none
        thumbnails = getattr(entry, "media_thumbnail", None)
        img_url = thumbnails[-1].get("url") if thumbnails else None
        if not img_url:
            img_url = self._get_ogdata(entry.link).get("image")
        if not img_url:
            return entry

        # resolve relative image references against the article url
        img_url = urllib.parse.urljoin(entry.link, img_url)
        parsed = urllib.parse.urlparse(img_url)
        if parsed.scheme not in ("http", "https"):
            # the url comes from untrusted feed/page data: never let it
            # point urlopen at local files or other schemes
            self.log.warning("Ignoring image with unsupported URL %s", img_url)
            return entry

        fileext = os.path.splitext(parsed.path)[1]
        if not fileext or len(fileext) > 5:
            fileext = ".jpg"

        self.log.debug("fetching %s", img_url)
        try:
            with urllib.request.urlopen(img_url,
                                        timeout=self._HTTP_TIMEOUT) as resp:
                img_bytes = resp.read()
        except (OSError, ValueError, http.client.HTTPException) as err:
            # a broken picture must not take the whole newspaper down
            self.log.warning("Could not fetch image %s: %s", img_url, err)
            return entry

        # the file has to outlive this call because it is read at print time
        with tempfile.NamedTemporaryFile(suffix=fileext,
                                         delete=False) as img_file:
            img_file.write(img_bytes)
        entry.img = "file://" + img_file.name

        return entry

    def _get_news(self):
        """fetch a set of latest news entries from sources specified in config

        Each source in ``self.config["newsfeeds"]`` supplies a "url" and a
        "count". Entries whose link is already recorded in the reverse url db
        count as old news and are skipped. Returns the list of selected
        entries, each with source, dbhash, svg, summary and img set.
        """
        feeds = self.config["newsfeeds"]
        feed_data = self._fetch_feeds(feeds)
        entries = []

        for source in feeds:
            raw_feed = feed_data.get(source["url"])
            if raw_feed is None:
                continue  # download failed, already logged
            feed = feedparser.parse(raw_feed)
            source_title = feed.feed.get("title", source["url"])

            # work around if we don't have enough news
            num_entries = min(source["count"], len(feed.entries))

            en_count = 0
            for entry in feed.entries:
                if en_count >= num_entries:
                    break

                # skip bogus entries with no link or no text
                link = getattr(entry, "link", None)
                if not link or not hasattr(entry, "summary"):
                    continue

                # ignore the old news we've already seen
                with self.dbenv.begin(db=self.rev_urldb) as txn:
                    seen = txn.get(self._url_key(link)) is not None
                if seen:
                    continue

                entry.source = source_title
                entry.dbhash = self._make_shorturl(link)
                entry.svg = self._barcode_img("PBr." + entry.dbhash)

                # limit summary to the last space below the character limit;
                # cut before cleaning so bleach can repair a tag split by
                # the cut
                summary = entry.summary
                if len(summary) > self._SUMMARY_LIMIT:
                    end = summary.rfind(" ", 0, self._SUMMARY_LIMIT - 1)
                    if end == -1:
                        end = self._SUMMARY_LIMIT - 1  # no space: hard cut
                    summary = summary[:end] + "…"
                entry.summary = bleach.clean(summary, strip=True)

                entries.append(entry)
                en_count += 1

        # do this multi-threaded cuz downloads can be slow
        # NOTE: this could be further optimized with 2 threads per host (chunks)
        with multiprocessing.pool.ThreadPool(processes=2) as fetcher:
            entries_fetched = fetcher.map(self._attach_image, entries)

        return entries_fetched
|
|
|
|
|
|
def main():
    """Create a Publications instance and call its run() method."""
    Publications().run()


# Only start the office when this file is executed as a script.
if __name__ == "__main__":
    main()
|