You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

408 lines
14 KiB
Python

# publications office for reading and writing
from base64 import b64encode
from datetime import datetime
import io
import json
import logging
import multiprocessing.pool
import os
import random
import string
import tempfile
import threading
import urllib.request, urllib.parse, urllib.error
import bleach
import feedparser
import kode256
import lxml.html
import PIL
import readabilipy
import requests
from bureau import Bureau, add_command, add_api
from . import weather
class Publications(Bureau):
    """
    The Publications Office serves as a kind of screenless content management
    system. Create, update and organize your sites while doing most of the work
    on paper or anything you can photograph.
    """

    name = "Publications Office"
    # barcodes are built as prefix + command + "." + data, e.g. "PBr.<code>"
    prefix = "PB"
    version = 0

    # Keys in rev_urldb are capped at this many bytes; LMDB's default maximum
    # key size is 511 bytes, so longer URLs must be truncated before lookup.
    _MAX_URL_KEY_BYTES = 500

    def __init__(self):
        """
        Set up the on-disk directory for published sites and open the two
        short-code databases (code -> url and url -> code).
        """
        Bureau.__init__(self)

        # set up db for published sites
        # TODO: rename this to something less ambiguous
        self.db = os.path.expanduser("~/.screenless/PB.data")
        # makedirs also creates missing parents and tolerates an existing dir
        os.makedirs(self.db, exist_ok=True)

        # set up urldb for short-codes
        # NOTE(review): self.dbenv is presumably an LMDB environment created
        # by Bureau.__init__ -- confirm against the Bureau base class.
        self.urldb = self.dbenv.open_db(b"urldb")
        self.rev_urldb = self.dbenv.open_db(b"rev_urldb")

    @staticmethod
    def _url_key(url):
        """
        Return the byte key used for `url` in the reverse url database.

        Very long urls are chopped to `_MAX_URL_KEY_BYTES` bytes so they fit
        the key-size limit. The key is only ever compared, never decoded, so
        cutting through a multi-byte character is harmless here.
        """
        return url.encode()[:Publications._MAX_URL_KEY_BYTES]

    def _make_shorturl(self, url):
        """
        Store `url` under a fresh random 5-character short-code.

        The code -> url mapping goes into `urldb`; the (possibly truncated)
        url -> code mapping goes into `rev_urldb` so that already-seen urls
        can be recognised later.

        returns the new short-code as a unicode string
        """
        alphabet = string.ascii_letters + string.digits

        # we only have about a billion so make sure we don't collide keys
        with self.dbenv.begin(write=True, db=self.urldb) as txn:
            while True:
                tmpcode = ''.join(random.choice(alphabet) for _ in range(5))
                if txn.get(tmpcode.encode()) is None:
                    break
            txn.put(tmpcode.encode(), url.encode())

        url_key = self._url_key(url)
        # an empty key cannot be stored, so skip the reverse entry for ""
        if url_key:
            with self.dbenv.begin(write=True, db=self.rev_urldb) as txn:
                txn.put(url_key, tmpcode.encode())

        return tmpcode

    def _get_url(self, shortcode):
        """look up a URL from a shortcode
        returns full unicode url, or None if the shortcode is unknown
        """
        with self.dbenv.begin(db=self.urldb) as txn:
            raw = txn.get(shortcode.encode())
            if raw is None:
                return None
            return raw.decode()

    @add_command("new", "Create a new Publication/Site")
    def new_site(self):
        """
        Create a new Publication/Site, set up config and take a picture from
        the document camera as the index page. Finally, it will print out
        the main page with commands for working with the site.
        """
        self.print_small("SORRY! the publication/site feature is not done yet. So this does nothing for now.")
        return

        # TODO: finish and test this stuff
        # NOTE: everything below is unreachable until the early return above
        # is removed.
        import PIL.Image  # `import PIL` alone does not load the Image module

        # pick the first unused numeric site id
        site_dir = os.path.join(self.db, "1")
        site_id = 1
        while os.path.exists(site_dir):
            site_id += 1
            site_dir = os.path.join(self.db, str(site_id))
        os.mkdir(site_dir)

        root_d = {"template": "default", "id": site_id}
        # the mode belongs to open(), not to os.path.join()
        with open(os.path.join(site_dir, "root.json"), "w") as root_json:
            root_json.write(json.dumps(root_d))

        photo = self.send("PX", "photo")["photo"]

        # TODO: come up with a generic set of img form operations for Bureau
        #       should map regions defined with percentages to names
        form_img = PIL.Image.open(photo)
        fx, fy = form_img.size
        title_region = (0, 0, 0.5 * fx, 0.125 * fy)
        title_img = form_img.crop(title_region)
        content_region = (0, 0.125 * fy, fx, fy)
        content_img = form_img.crop(content_region)

    def _update_page(self, site, page):
        """Placeholder for updating a page of a site; not implemented yet."""
        pass

    @add_command("news", "Print a personalized daily newspaper")
    def daily_news(self):
        """
        Print out a selection of up-to-the-minute news items culled from various
        sources on the internet. Current unread Post, weather and finance
        information can also be shown.
        """
        news = self._get_news()
        # TODO: get finance
        inbox = self.send("PO", "unread")
        date = datetime.today().strftime("%A %B %e, %Y")
        if inbox is None:
            inbox = []  # if IMAP times out just move on...
        lat, lon = self.config["latlon"]
        forecast = weather.get_forecast(lat, lon)
        self.print_full("news.html", news=news, inbox=inbox, date=date,
                        forecast=forecast)

    @add_command("wttr", "Print out the local weather forecast")
    def print_weather(self):
        """
        Use the small printer to output the current local weather forecast
        pulled from met.no api.
        """
        # TODO: refactor this to not mess with opening and closing the printer
        lat, lon = self.config["latlon"]
        forecast = weather.get_forecast(lat, lon)
        prn = self._get_small_printer()
        # always hand the printer back, even if printing fails half way
        try:
            prn.textln("CURRENT WEATHER")
            day_count = 0
            current = "Today"
            for period in forecast:
                # TODO: make some nicer ascii art or b/w pngs for weather
                #       maybe steal these from the wego project?
                prn.text(period["day"])
                # NOTE(review): `current` is never updated, so this counts
                # every period that is not labelled "Today" rather than the
                # number of distinct days -- confirm which one is intended.
                if period["day"] != current:
                    day_count += 1
                prn.textln(" - " + period["period"])
                prn.textln(str(period["mintemp"]) + " - " + str(period["maxtemp"]) + "°C")
                prn.image(period["png"])
                prn.ln()
                if day_count > 4:
                    break
            prn.ln()
            prn.cut()
        finally:
            self._free_small_printer(prn)

    @add_command("r", "Print a web page for reading")
    def print_url(self, data):
        """
        Print out a web page for reading. The command requires a short-code,
        typically referenced via barcode. Short-codes refer to full resource
        URIs recorded in the Publications office 'urldb' database.

        Every link in the article gets a numbered footnote with a barcode
        that points back at this command, so linked pages can be printed too.
        """
        # the short-code is everything before the first "."; tolerate input
        # with no dot or with several dots instead of raising on unpacking
        shortcode = data.split(".", 1)[0]
        self.log.debug("looking up short-code:" + shortcode)
        url = self._get_url(shortcode)
        if not url:
            self.log.warning("no valid URL in db for short code: " + shortcode)
            return

        # download page with requests
        headers = {'User-Agent': 'Mozilla/5.0'}
        try:
            resp = requests.get(url, timeout=20.0, headers=headers)
        except requests.ReadTimeout:
            self.log.warning("Timeout reading url %s", url)
            self.print_small("Error: timed out reading " + url)
            return
        except requests.ConnectionError:
            self.log.warning("Error reading url %s", url)
            self.print_small("Error: connect error on " + url)
            return
        except requests.RequestException as err:
            # bad schema, invalid url, too many redirects, ...
            self.log.warning("Error fetching url %s: %s", url, err)
            self.print_small("Error: could not fetch " + url)
            return

        # re-render with readability
        # TODO: might be cool to try to use the "byline" and "title" fields of the doc
        if self.log.getEffectiveLevel() == logging.DEBUG:
            with open("/tmp/raw_article.html", "w", encoding="utf-8") as html_out:
                html_out.write(resp.text)
            self.log.debug("raw article html saved to /tmp/raw_article.html")

        doc = readabilipy.simple_json_from_html_string(resp.text, use_readability=True)
        if not doc.get("content"):
            # nothing readable was extracted; parsing None would raise
            self.log.warning("No readable content found at url %s", url)
            self.print_small("Error: no readable content at " + url)
            return

        timestamp = datetime.now().strftime("Sourced %d %B, %Y at %H:%M")
        html = lxml.html.document_fromstring(doc["content"])

        notecount = 0
        # store links then make corresponding svg barcodes
        for link in html.findall(".//a"):
            if "href" not in link.attrib:
                continue  # skip bogus links
            notecount += 1

            # relative links ("/page", "../x", "#top") are useless once stored
            # on their own, so resolve them against the page they came from
            target = urllib.parse.urljoin(resp.url, link.attrib["href"])
            tmpcode = self._make_shorturl(target)

            footlink = html.makeelement("span")
            footlink.attrib["class"] = "footlink"
            footlink.text = str(notecount)
            link.append(footlink)

            svg = kode256.svg("PBr." + tmpcode)
            #svg = self.bc_svg("PBr." + tmpcode, height=7.0)

            footnote = html.makeelement("div")
            footnote.attrib["class"] = "footnote"
            notetext = html.makeelement("div")
            notetext.text = str(notecount) + ". " + target
            footnote.append(notetext)

            # TODO: make this barcode inline thing a util method
            encoded_svg = b64encode(svg.encode()).decode()
            encoded_data = "data:image/svg+xml;charset=utf-8;base64," + encoded_svg
            svg = '<img class="endnotebc" src="%s"/>' % encoded_data
            footnote.append(lxml.html.fromstring(svg))
            html.append(footnote)

        self.print_full("article.html", title=doc["title"],
                        article=lxml.html.tostring(html).decode("utf-8"),
                        url=url, date=timestamp)

    def _get_ogdata(self, url):
        """
        returns a dict with OpenGraph metadata if available

        Keys are the property names with the "og:" prefix removed (e.g.
        "image", "title"). Any download or parse problem is logged and an
        empty dict is returned.
        """
        ogdata = {}

        # download page with requests
        headers = {'User-Agent': 'Mozilla/5.0'}
        try:
            resp = requests.get(url, timeout=20.0, headers=headers)
        except requests.exceptions.ReadTimeout:
            self.log.warning("Timeout fetching OpenGraph data from document %s",
                             url)
            return ogdata
        except requests.exceptions.ConnectionError:
            self.log.warning("Connection errors fetching OpenGraph from %s",
                             url)
            return ogdata
        except requests.exceptions.MissingSchema:
            self.log.warning("Can't get OpenGraph data from bogus URL %s", url)
            return ogdata
        except requests.exceptions.RequestException as err:
            # anything else (invalid url, redirect loop, ...) is not fatal
            self.log.warning("Error fetching OpenGraph data from %s: %s",
                             url, err)
            return ogdata

        try:
            html = lxml.html.document_fromstring(resp.text)
        except (lxml.etree.ParserError, ValueError) as err:
            # empty documents raise ParserError; unicode text carrying an
            # XML encoding declaration raises ValueError
            self.log.warning("Can't parse document %s for OpenGraph data: %s",
                             url, err)
            return ogdata

        # find all elements with property="og:<something>"
        for element in html.findall(".//*[@property]"):
            prop = element.get("property")
            if prop.startswith("og:"):
                ogdata[prop[3:]] = element.get("content")

        return ogdata

    def _download_image(self, img_url):
        """
        Download `img_url` into a fresh temporary file.

        returns the local file name, or None if the download failed
        """
        # derive the suffix from the url path only, ignoring any query string
        fileext = os.path.splitext(urllib.parse.urlparse(img_url).path)[1]
        if len(fileext) < 2 or len(fileext) > 4 or not fileext[1:].isalnum():
            fileext = ".jpg"

        # mkstemp creates the file atomically, unlike the race-prone mktemp
        handle, filename = tempfile.mkstemp(suffix=fileext)
        os.close(handle)
        try:
            self.log.debug("fetching %s %s", img_url, filename)
            urllib.request.urlretrieve(img_url, filename)
        except (OSError, ValueError) as err:
            # URLError and HTTPError are OSError subclasses
            self.log.error(err)
            try:
                os.remove(filename)
            except OSError:
                pass  # best effort clean-up of the unused temp file
            return None
        return filename

    def _get_news(self):
        """fetch a set of latest news entries from sources specified in config

        Feeds are downloaded in parallel, entries already recorded in the
        reverse url database are skipped, and each remaining entry gets a
        short-code, an inline barcode image, a trimmed and sanitised summary
        and (where available) a locally downloaded image.

        returns a list of feed entries
        """
        feeds = self.config["newsfeeds"]
        entries = []
        feed_data = {}
        # one lock shared by every fetcher thread
        feedlock = threading.Lock()

        def fetch_feed(url):
            """
            get feed data with requests using a timeout
            """
            payload = None
            try:
                resp = requests.get(url, timeout=20.0)
            except requests.ReadTimeout:
                self.log.warning("Timeout reading RSS feed %s", url)
            except requests.RequestException as err:
                self.log.warning("Error reading RSS feed %s: %s", url, err)
            else:
                payload = resp.content
            with feedlock:
                feed_data[url] = payload

        threads = [threading.Thread(target=fetch_feed, args=(source["url"],))
                   for source in feeds]
        for thread in threads:
            thread.start()
        for thread in threads:
            thread.join()

        for source in feeds:
            payload = feed_data.get(source["url"])
            if payload is None:
                self.log.debug("Skipping missing feed data (network problem?):%s", source["url"])
                continue

            feed = feedparser.parse(io.BytesIO(payload))
            feed_title = feed.feed.get("title", "")

            # work around if we don't have enough news
            num_entries = min(source["count"], len(feed.entries))
            en_count = 0
            for entry in feed.entries:
                if en_count >= num_entries:
                    break

                # skip bogus entries with no link or no text
                link = getattr(entry, "link", None)
                if not link or not hasattr(entry, "summary"):
                    continue

                # ignore the old news we've already seen; use the same
                # truncated key that _make_shorturl stored
                with self.dbenv.begin(db=self.rev_urldb) as txn:
                    seen = txn.get(self._url_key(link)) is not None
                if seen:
                    continue

                entry.source = feed_title
                entry.dbhash = self._make_shorturl(link)
                barcode = kode256.svg("PBr." + entry.dbhash)
                #entry.svg = self.bc_svg("PBr." + entry.dbhash, width=0.24,
                #                        height=7.0)
                encoded_svg = b64encode(barcode.encode()).decode()
                encoded_data = "data:image/svg+xml;charset=utf-8;base64," + encoded_svg
                entry.svg = '<img src="%s"/>' % encoded_data

                # limit summary to the last space below 500 characters
                summary = entry.summary
                if len(summary) > 500:
                    end = summary.rfind(" ", 0, 499)
                    if end == -1:
                        end = 499  # no space at all: hard cut instead
                    summary = summary[:end]
                entry.summary = bleach.clean(summary, strip=True)

                entries.append(entry)
                en_count += 1

        # do this multi-threaded cuz downloads can be slow
        # NOTE: this could be further optimized with 2 threads per host (chunks)
        def fetch_og(entry):
            """
            get OpenGraph data for entry
            and download image
            TODO: add microdata support here to get author
            """
            thumbnails = getattr(entry, "media_thumbnail", None)
            if thumbnails:
                img_url = thumbnails[-1].get("url")
            else:
                # only hit the article page when the feed gave us no image
                img_url = self._get_ogdata(entry.link).get("image")

            entry.img = " "
            if img_url:
                filename = self._download_image(img_url)
                if filename is not None:
                    entry.img = "file://" + filename
            return entry

        # the context manager shuts the worker threads down once map returns
        with multiprocessing.pool.ThreadPool(processes=2) as fetcher:
            return fetcher.map(fetch_og, entries)
def main():
    """Create the Publications Office and call its run() method."""
    Publications().run()


# only start the office when this file is executed directly
if __name__ == "__main__":
    main()