article scraping, printing and template. barcodes and db for links.

workspace
Brendan Howell 9 years ago
parent 85438784f8
commit 44914e90e7

@ -0,0 +1,55 @@
<!DOCTYPE html>
<!--
  Print template for a single article. The placeholders ${title}, ${date},
  ${article} and ${url} are substituted by the caller before rendering.
-->
<html>
<title>${title}</title>
<meta charset="utf-8">
<style type="text/css">
  body {
    font-family: Junicode;
    font-size: 10pt;
    line-height: 1;
    /* every link increments this counter via a::after below */
    counter-reset: sidenote-counter;
  }
  /* main column uses 67% of the width; sidenotes float into the rest */
  h1 {
    font-variant: small-caps;
    width: 67%;
  }
  p {
    width: 67%;
  }
  img {
    width: 67%;
  }
  a {
    text-decoration: none;
    color: inherit;
  }
  /* append a superscript running number after each link */
  a::after {
    counter-increment: sidenote-counter;
    content: counter(sidenote-counter);
    font-size: smaller;
    vertical-align: super;
  }
  /*
    Grouping selector: #date and #article-src are sibling paragraphs, so the
    former descendant selector "#date #article-src" never matched anything.
  */
  #date, #article-src {
    padding-bottom: 10%;
    font-style: italic;
  }
  /* sidenotes are pushed out of the text column into the right margin */
  .sidenote {
    float: right;
    clear: right;
    margin-right: -30%;
    width: 30%;
    height: 30%;
    position: relative;
  }
  .sidenote svg {
    width: 100%;
    height: 100%;
  }
</style>
<body>
  <h1>${title}</h1>
  <p id="date">${date}</p>
  ${article}
  <p id="article-src">Article Source: ${url}</p>
</body>
</html>

@ -1,7 +1,15 @@
# publications office for reading and writing
from datetime import datetime
import json import json
import os import os
import random
import string
import urllib.request, urllib.parse, urllib.error
import code128
import lxml.html
import PIL import PIL
from readability import readability
from bureau import Bureau, add_command, add_api from bureau import Bureau, add_command, add_api
@ -19,10 +27,16 @@ class Publications(Bureau):
def __init__(self):
    """
    Initialise the bureau, make sure the on-disk directory for published
    sites exists and open the 'urldb' database used for short-codes.
    """
    Bureau.__init__(self)

    # set up db for published sites
    # TODO: rename this to something less ambiguous
    self.db = os.path.expanduser("~/.screenless/PB.data")
    if not os.path.exists(self.db):
        # makedirs also creates a missing parent directory, and exist_ok
        # keeps this from failing if the directory appears after the check
        os.makedirs(self.db, exist_ok=True)

    # set up urldb for short-codes
    # NOTE(review): self.dbenv is not assigned here; presumably it is
    # provided by Bureau.__init__ - confirm.
    self.urldb = self.dbenv.open_db(b"urldb")
@add_command("new", "Create a new Publication/Site") @add_command("new", "Create a new Publication/Site")
def new_site(self): def new_site(self):
@ -56,7 +70,54 @@ class Publications(Bureau):
def _update_page(self, site, page): def _update_page(self, site, page):
pass pass
@add_command("r", "Print a web page for reading")
def print_url(self, data):
    """
    Print out a web page for reading. The command requires a short-code,
    typically referenced via barcode. Short-codes refer to full resource
    URIs recorded in the Publications office 'urldb' database.

    The page is downloaded, reduced to its readable content, and every
    link that has an href is stored in 'urldb' under a fresh random
    5-character short-code. A Code 128 barcode for "PBr." + short-code is
    inserted after the link as a sidenote, and the result is printed with
    the "article.html" template.

    data -- command payload; it is split on "." and the first part is
            used as the short-code.

    Returns None. If no URL is stored for the short-code an error is
    printed and nothing is downloaded or printed.
    """
    # NOTE(review): this unpacking requires exactly one "." in data and
    # raises ValueError otherwise - confirm the payload format sent by
    # the caller.
    shortcode, _ = data.split(".")
    with self.dbenv.begin(db=self.urldb) as txn:
        print("looking up short-code:", shortcode)
        raw_url = txn.get(shortcode.encode('utf-8'))

    # Check the raw value before decoding: the lookup presumably returns
    # None for an unknown key (lmdb-style API), and decoding None would
    # raise instead of reaching the error message below.
    url = raw_url.decode() if raw_url else ""
    if not url:
        print("ERROR: no valid URL in db for short code: ", shortcode)
        return

    # download; the context manager closes the connection once read
    headers = {'User-Agent': 'Mozilla/5.0'}
    req = urllib.request.Request(url, None, headers)
    with urllib.request.urlopen(req) as urldata:
        page_bytes = urldata.read()

    # re-render with readability
    doc = readability.Document(page_bytes, url=url)
    timestamp = datetime.now().strftime("Sourced %d %B, %Y at %H:%M")
    html = lxml.html.document_fromstring(doc.summary())

    # store links then make corresponding svg barcodes
    alphabet = string.ascii_letters + string.digits
    for link in html.findall(".//a"):
        href = link.attrib.get("href")
        if href is None:
            # an anchor without a target has nothing to look up later, so
            # it gets neither a short-code nor a barcode
            continue

        tmpcode = ''.join(random.choice(alphabet) for _ in range(5))
        # NOTE(review): random codes are not checked for collisions; an
        # existing entry with the same code may be replaced.
        with self.dbenv.begin(write=True, db=self.urldb) as txn:
            txn.put(tmpcode.encode(), href.encode())

        svg = code128.svg("PBr." + tmpcode)
        sidenote = html.makeelement("span")
        sidenote.attrib["class"] = "sidenote"
        sidenote.append(lxml.html.fromstring(svg.encode()))
        link.addnext(sidenote)

    self.print_full("article.html", title=doc.title(),
                    article=lxml.html.tostring(html).decode("utf-8"),
                    url=url, date=timestamp)
def main(): def main():

Loading…
Cancel
Save