# the-screenless-office/screenless/bureau/publications/publications.py

# publications office for reading and writing
from base64 import b64encode
from datetime import datetime
import io
import json
import os
import random
import string
import tempfile
import threading
import urllib.request, urllib.parse, urllib.error

import bleach
import feedparser
import kode256
import lxml.html
import PIL
from readability import readability
import requests

from bureau import Bureau, add_command, add_api


class Publications(Bureau):
    """
    The Publications Office serves as a kind of screenless content management
    system.  Create, update and organize your sites while doing most of the work
    on paper or anything you can photograph.

    It also prints a daily newspaper assembled from RSS feeds (the "news"
    command) and reader-friendly versions of web pages that are looked up
    by short-code (the "r" command).
    """

    # human-readable name of this bureau
    name = "Publications Office"
    # bureau prefix; the barcodes generated in this class start with it
    # ("PBr." + short-code) -- presumably used for command routing, confirm
    # against Bureau
    prefix = "PB"
    version = 0

    def __init__(self):
        """
        Initialise the bureau and prepare the storage used by this office.

        Makes sure the directory that holds published sites exists and
        opens the two named databases used for URL short-codes.
        """
        Bureau.__init__(self)

        # set up db for published sites
        # TODO: rename this to something less ambiguous
        self.db = os.path.expanduser("~/.screenless/PB.data")
        # makedirs(exist_ok=True) also creates a missing parent directory and
        # avoids the race between checking for and creating the directory
        os.makedirs(self.db, exist_ok=True)

        # set up urldb for short-codes (short-code -> URL)
        # NOTE(review): self.dbenv is presumably created by Bureau.__init__
        self.urldb = self.dbenv.open_db(b"urldb")
        # reverse mapping (URL -> short-code), used to spot already seen URLs
        self.rev_urldb = self.dbenv.open_db(b"rev_urldb")

    def _make_shorturl(self, url):
        """
        Record `url` under a newly generated short-code and return the code.

        The code is 5 random ASCII letters/digits.  It is stored in 'urldb'
        (code -> url) and the reverse mapping is stored in 'rev_urldb'
        (url -> code), with the url key cut to at most 500 bytes.
        """
        alphabet = string.ascii_letters + string.digits
        raw_url = url.encode()

        # we only have about a billion so make sure we don't collide keys
        with self.dbenv.begin(write=True, db=self.urldb) as txn:
            while True:
                code = ''.join(random.choice(alphabet) for _ in range(5))
                if txn.get(code.encode()) is None:
                    break  # this code is still free
            txn.put(code.encode(), raw_url)

        # chop weirdly long urls to be 500 bytes (LMDB limit for keys)
        # TODO: make sure we're not truncating some multi-byte unicode
        reverse_key = raw_url[:500]
        with self.dbenv.begin(write=True, db=self.rev_urldb) as txn:
            txn.put(reverse_key, code.encode())

        return code


    def _get_url(self, shortcode):
        """look up a URL from a shortcode
        returns full unicode url, or None if the shortcode is not in the
        'urldb' database
        """
        with self.dbenv.begin(db=self.urldb) as txn:
            raw_url = txn.get(shortcode.encode())
            # an unknown short-code yields None; report that to the caller
            # instead of failing on None.decode()
            if raw_url is None:
                return None
            return raw_url.decode()

    @add_command("new", "Create a new Publication/Site")
    def new_site(self):
        """
        Create a new Publication/Site, set up config and take a picture from
        the document camera as the index page.

        A new numbered directory is created below self.db and a "root.json"
        config file is written into it.  A photo is then requested from the
        "PX" bureau and split into a title region and a content region.

        NOTE: this is still a skeleton -- the cropped images are not stored
        or published yet and nothing is printed.
        """
        # a bare `import PIL` does not load the Image submodule, so import
        # it explicitly where it is needed
        from PIL import Image

        # claim the first free numeric directory name; letting mkdir fail
        # on existing names avoids a race between checking and creating
        site_id = 1
        while True:
            site_dir = os.path.join(self.db, str(site_id))
            try:
                os.mkdir(site_dir)
                break
            except FileExistsError:
                site_id += 1

        root_d = {"template": "default", "id": site_id}
        # the mode belongs to open(), not to os.path.join()
        with open(os.path.join(site_dir, "root.json"), "w") as root_json:
            json.dump(root_d, root_json)

        photo = self.send("PX", "photo")["photo"]

        # TODO: come up with a generic set of img form operations for Bureau
        #       should map regions defined with percentages to names
        form_img = Image.open(photo)
        fx, fy = form_img.size
        # title: left half of the top eighth of the form
        title_region = (0, 0, 0.5 * fx, 0.125 * fy)
        title_img = form_img.crop(title_region)
        # content: everything below the top eighth
        content_region = (0, 0.125 * fy, fx, fy)
        content_img = form_img.crop(content_region)
        # TODO: title_img and content_img are not used yet

    def _update_page(self, site, page):
        """Placeholder for updating a page of a site; not implemented yet."""
        return None

    @add_command("news", "Print a personalized daily newspaper")
    def daily_news(self):
        """
        Print out a selection of up-to-the-minute news items culled from various
        sources on the internet. Current unread Post, weather and finance
        information can also be shown.

        The news entries come from self._get_news(), the unread post from
        the "PO" bureau; everything is rendered with the "news.html"
        template.
        """
        news = self._get_news()
        # TODO: get weather
        # TODO: get finance
        inbox = self.send("PO", "unread")
        if inbox is None:
            inbox = []  # if IMAP times out just move on...

        # "%e" (space-padded day of month) is a platform extension of
        # strftime, so the day is formatted by hand; the result is the
        # same text that "%A %B %e, %Y" yields where "%e" is supported
        today = datetime.today()
        date = "{} {:2d}, {}".format(today.strftime("%A %B"), today.day,
                                     today.strftime("%Y"))
        self.print_full("news.html", news=news, inbox=inbox, date=date)

    @add_command("r", "Print a web page for reading")
    def print_url(self, data):
        """
        Print out a web page for reading.  The command requires a short-code,
        typically referenced via barcode.  Short-codes refer to full resource
        URIs recorded in the Publications office 'urldb' database.

        The page is downloaded, reduced to its main content with readability
        and printed with the "article.html" template.  Every link in the
        article gets a numbered marker and a footnote holding the link
        target plus a barcode ("PBr." + a new short-code), so linked pages
        can be printed the same way.

        data -- command payload; the short-code is the text before the
                first "."
        """
        # partition() keeps working for payloads without a "." or with more
        # than one, where unpacking the result of split() would raise
        shortcode = data.partition(".")[0]
        self.log.debug("looking up short-code:" + shortcode)
        with self.dbenv.begin(db=self.urldb) as txn:
            url = txn.get(shortcode.encode('utf-8'))

        if not url:
            self.log.warning("no valid URL in db for short code: " + shortcode)
            return
        url = url.decode()

        # download page with requests
        headers = {'User-Agent': 'Mozilla/5.0'}
        try:
            resp = requests.get(url, timeout=20.0, headers=headers)
        except requests.ReadTimeout:
            self.log.warning("Timeout reading url %s", url)
            self.print_small("Error: timed out reading " + url)
            return
        except requests.ConnectionError:
            self.log.warning("Error reading url %s", url)
            self.print_small("Error: connect error on " + url)
            return
        except requests.RequestException as err:
            # anything else requests can raise: URLs without a schema,
            # unsupported schemes, redirect loops, ...
            self.log.warning("Error fetching url %s: %s", url, err)
            self.print_small("Error: could not fetch " + url)
            return

        # re-render with readability
        doc = readability.Document(resp.text,
                                   url=url)
        timestamp = datetime.now().strftime("Sourced %d %B, %Y at %H:%M")
        html = lxml.html.document_fromstring(doc.summary())

        notecount = 0
        # store links then make corresponding svg barcodes
        for link in html.findall(".//a"):
            href = link.attrib.get("href")
            if not href:
                # skip bogus links: anchors without a target, and empty
                # targets which cannot be stored as a database key
                continue
            notecount += 1
            tmpcode = self._make_shorturl(href)

            # numbered marker placed inside the link itself
            footlink = html.makeelement("span")
            footlink.attrib["class"] = "footlink"
            footlink.text = str(notecount)
            link.append(footlink)

            svg = kode256.svg("PBr." + tmpcode)
            #svg = self.bc_svg("PBr." + tmpcode, height=7.0)

            # footnote: "<number>. <link target>" followed by the barcode
            footnote = html.makeelement("div")
            footnote.attrib["class"] = "footnote"
            notetext = html.makeelement("div")
            notetext.text = str(notecount) + ". " + href
            footnote.append(notetext)
            #TODO: make this barcode inline thing a util method
            # the svg is embedded as a base64 data URI in an <img> tag
            encoded_svg = b64encode(svg.encode()).decode()
            encoded_data = "data:image/svg+xml;charset=utf-8;base64," + encoded_svg
            img_tag = '<img class="endnotebc" src="%s"/>' % encoded_data
            footnote.append(lxml.html.fromstring(img_tag))
            html.append(footnote)

        self.print_full("article.html", title=doc.title(),
                        article=lxml.html.tostring(html).decode("utf-8"),
                        url=url, date=timestamp)

    def _get_ogdata(self, url):
        """
        returns an object with OpenGraph metadata if available

        The result is a dict mapping OpenGraph property names without the
        "og:" prefix (e.g. "image", "title") to the value of the element's
        "content" attribute.  The dict is empty when the page cannot be
        fetched or parsed; problems are logged, never raised.
        """
        ogdata = {}

        # download page with requests
        headers = {'User-Agent': 'Mozilla/5.0'}
        try:
            resp = requests.get(url, timeout=20.0, headers=headers)
        except requests.exceptions.ReadTimeout:
            self.log.warning("Timeout fetching OpenGraph data from document %s",
                             url)
            return ogdata
        except requests.exceptions.ConnectionError:
            self.log.warning("Connection errors fetching OpenGraph from %s",
                             url)
            return ogdata
        except requests.exceptions.MissingSchema:
            self.log.warning("Can't get OpenGraph data from bogus URL %s", url)
            return ogdata
        except requests.exceptions.RequestException as err:
            # everything else requests can raise (invalid URL, redirect
            # loop, broken content encoding, ...)
            self.log.warning("Error fetching OpenGraph data from %s: %s",
                             url, err)
            return ogdata

        try:
            html = lxml.html.document_fromstring(resp.text)
        except (lxml.etree.ParserError, ValueError) as err:
            # empty documents and text lxml refuses to parse
            self.log.warning("Can't parse document %s for OpenGraph data: %s",
                             url, err)
            return ogdata

        #find all elements with property="og:<something>"
        for element in html.findall(".//*[@property]"):
            prop = element.get("property")
            val = element.get("content")
            # elements without a content attribute carry no usable value
            if prop.startswith("og:") and val is not None:
                ogdata[prop[3:]] = val

        return ogdata

    def _get_news(self):
        """fetch a set of latest news entries from sources specified in config

        Each item of self.config["newsfeeds"] is read for its "url" (the
        feed address) and "count" (maximum number of entries to take from
        that feed).  The work is done in three steps:

        1. all feeds are downloaded in parallel threads,
        2. per feed, up to "count" entries are picked which have a link and
           a summary and whose link is not yet in 'rev_urldb' (i.e. has not
           been shown before); each one gets a short-code, an inline
           barcode image and a cleaned, shortened summary,
        3. for every picked entry an image (feed thumbnail or OpenGraph
           image) is downloaded to a temporary file, again in parallel.

        Returns the list of picked entries.  Added attributes are `source`,
        `dbhash`, `svg`, `summary` and `img`; `img` is a "file://" URL or
        " " when there is no image.  The temporary image files are not
        removed here.
        """
        feeds = self.config["newsfeeds"]
        entries = []
        feed_data = {}
        # one lock shared by all download threads
        feedlock = threading.Lock()

        def fetch_feed(url):
            """
            get feed data with requests using a timeout
            stores None for the feed if the download fails
            """
            try:
                resp = requests.get(url, timeout=20.0)
                raw = io.BytesIO(resp.content)
            except requests.ReadTimeout:
                self.log.warning("Timeout reading RSS feed %s", url)
                raw = None
            except requests.RequestException as err:
                self.log.warning("Error reading RSS feed %s: %s", url, err)
                raw = None
            with feedlock:
                feed_data[url] = raw

        threads = [threading.Thread(target=fetch_feed, args=(source["url"],))
                   for source in feeds]
        for thread in threads:
            thread.start()
        for thread in threads:
            thread.join()

        for source in feeds:
            raw = feed_data.get(source["url"])
            if raw is None:
                continue  # download failed, nothing to parse
            feed = feedparser.parse(raw)
            wanted = source["count"]

            taken = 0
            for entry in feed.entries:
                if taken >= wanted:
                    break

                # skip bogus entries with no link or no text
                link = getattr(entry, "link", None)
                if not link or not hasattr(entry, "summary"):
                    continue

                # ignore the old news we've already seen; the key is cut
                # to 500 bytes exactly like _make_shorturl stores it
                with self.dbenv.begin(db=self.rev_urldb) as txn:
                    seen = txn.get(link.encode()[:500]) is not None
                if seen:
                    continue

                entry.source = feed.feed.get("title", source["url"])
                entry.dbhash = self._make_shorturl(link)
                svg = kode256.svg("PBr." + entry.dbhash)
                #entry.svg = self.bc_svg("PBr." + entry.dbhash, width=0.24,
                #                        height=7.0)
                # embed the barcode as a base64 data URI in an <img> tag
                encoded_svg = b64encode(svg.encode()).decode()
                encoded_data = "data:image/svg+xml;charset=utf-8;base64," + encoded_svg
                entry.svg = '<img src="%s"/>' % encoded_data

                # limit summary to the last space below 500 characters
                summary = entry.summary
                if len(summary) > 500:
                    end = summary.rfind(" ", 0, 499)
                    if end == -1:
                        end = 499  # no space to break at: cut hard
                    summary = summary[0:end] + "…"

                # cleaning comes last so markup cut in half by the
                # truncation above is cleaned up as well
                entry.summary = bleach.clean(summary, strip=True)

                entries.append(entry)
                taken += 1

        def fetch_og(entry):
            """
            pick an image for entry (feed thumbnail first, OpenGraph image
            otherwise) and download it to a temporary file
            TODO: add microdata support here to get author
            """
            # placeholder that stays in place if no image can be fetched
            entry.img = " "

            img_url = None
            thumbnails = getattr(entry, "media_thumbnail", None)
            if thumbnails:
                img_url = thumbnails[-1].get("url")
            if not img_url:
                # only hit the article page when the feed has no thumbnail
                img_url = self._get_ogdata(entry.link).get("image")
            if not img_url:
                return

            # take the extension from the URL path only, so query strings
            # and dots in the host name do not end up in the file name
            fileext = os.path.splitext(urllib.parse.urlparse(img_url).path)[1]
            try:
                resp = requests.get(img_url, timeout=20.0)
                resp.raise_for_status()
            except requests.RequestException as err:
                self.log.warning("Error fetching image %s: %s", img_url, err)
                return

            # delete=False: the file has to outlive this function
            with tempfile.NamedTemporaryFile(suffix=fileext,
                                             delete=False) as img_file:
                img_file.write(resp.content)
            self.log.debug("fetched %s to %s", img_url, img_file.name)
            entry.img = "file://" + img_file.name

        # do this multi-threaded cuz downloads can be slow
        threads = [threading.Thread(target=fetch_og, args=(entry,))
                   for entry in entries]
        for thread in threads:
            thread.start()

        # wait till we're done
        for thread in threads:
            thread.join()

        return entries


def main():
    """Create the Publications office and call its run() method."""
    Publications().run()


# start the office only when this file is executed as a script
if __name__ == "__main__":
    main()
article scraping, printing and template. barcodes and db for links. 9 years ago			`# publications office for reading and writing`
try using b64 encoded inline svgs 6 years ago			`from base64 import b64encode`
article scraping, printing and template. barcodes and db for links. 9 years ago			`from datetime import datetime`
switch to using requests with timeout for rss. cope with slow imap servers. 8 years ago			`import io`
big cleanup and rearrange into modules. added skeleton for publications CMS. added a sales dept to play vids from barcodes. 9 years ago			`import json`
			`import os`
article scraping, printing and template. barcodes and db for links. 9 years ago			`import random`
			`import string`
try rolling our own image downloader for news 5 years ago			`import tempfile`
try some multi-threading to deal with slow downloads of feeds and metadata 7 years ago			`import threading`
article scraping, printing and template. barcodes and db for links. 9 years ago			`import urllib.request, urllib.parse, urllib.error`
big cleanup and rearrange into modules. added skeleton for publications CMS. added a sales dept to play vids from barcodes. 9 years ago
use bleach to clean up cruddy html 8 years ago			`import bleach`
fix missing module 8 years ago			`import feedparser`
ignore bogus links when assembling article footnotes 5 years ago			`import kode256`
article scraping, printing and template. barcodes and db for links. 9 years ago			`import lxml.html`
big cleanup and rearrange into modules. added skeleton for publications CMS. added a sales dept to play vids from barcodes. 9 years ago			`import PIL`
article scraping, printing and template. barcodes and db for links. 9 years ago			`from readability import readability`
switch to using requests with timeout for rss. cope with slow imap servers. 8 years ago			`import requests`
big cleanup and rearrange into modules. added skeleton for publications CMS. added a sales dept to play vids from barcodes. 9 years ago
			`from bureau import Bureau, add_command, add_api`


			`class Publications(Bureau):`
			`"""`
			`The Publications Office serves as a kind of screenless content management`
			`system. Create, update and organize your sites while doing most of the work`
			`on paper or anything you can photograph.`
			`"""`

			`name = "Publications Office"`
			`prefix = "PB"`
			`version = 0`

			`def __init__(self):`
			`Bureau.__init__(self)`
article scraping, printing and template. barcodes and db for links. 9 years ago
			`# set up db for published sites`
			`# TODO: rename this to something less ambiguous`
big cleanup and rearrange into modules. added skeleton for publications CMS. added a sales dept to play vids from barcodes. 9 years ago			`self.db = os.path.expanduser("~/.screenless/PB.data")`
			`if not os.path.exists(self.db):`
			`os.mkdir(self.db)`
article scraping, printing and template. barcodes and db for links. 9 years ago
			`# set up urldb for short-codes`
			`self.urldb = self.dbenv.open_db(b"urldb")`
ignore already read articles in feed 7 years ago			`self.rev_urldb = self.dbenv.open_db(b"rev_urldb")`
port news to the new system. 8 years ago
			`def _make_shorturl(self, url):`
			`def _shortcode():`
			`return ''.join(random.choice(string.ascii_letters + string.digits)`
			`for _ in range(5))`
big cleanup and rearrange into modules. added skeleton for publications CMS. added a sales dept to play vids from barcodes. 9 years ago
port news to the new system. 8 years ago			`# we only have about a billion so make sure we don't collide keys`
			`with self.dbenv.begin(write=True, db=self.urldb) as txn:`
			`res = "not None"`
			`while res is not None:`
			`tmpcode = _shortcode()`
news now working. template needs stylesheet work. 8 years ago			`res = txn.get(tmpcode.encode())`
port news to the new system. 8 years ago			`txn.put(tmpcode.encode(), url.encode())`

- Fixes for urls that are too long - started to clean up tweet detail print-out 7 years ago			`# chop wierdly long urls to be 500 chars (LMDB limit for keys)`
			`# TODO: make sure we're not truncating some multi-byte unicode`
			`if len(url.encode()) > 500:`
			`url_key = url.encode()[0:500]`
			`else:`
			`url_key = url.encode()`
ignore already read articles in feed 7 years ago			`with self.dbenv.begin(write=True, db=self.rev_urldb) as txn:`
- Fixes for urls that are too long - started to clean up tweet detail print-out 7 years ago			`txn.put(url_key, tmpcode.encode())`
ignore already read articles in feed 7 years ago
port news to the new system. 8 years ago			`return tmpcode`


			`def _get_url(self, shortcode):`
			`"""look up a URL from a shortcode`
			`returns full unicode url`
			`"""`
			`with self.dbenv.begin(db=self.urldb) as txn:`
			`return txn.get(shortcode.encode()).decode()`
big cleanup and rearrange into modules. added skeleton for publications CMS. added a sales dept to play vids from barcodes. 9 years ago
			`@add_command("new", "Create a new Publication/Site")`
			`def new_site(self):`
			`"""`
			`Create a new Publication/Site, set up config and tace a picture from`
			`the document camera as the index page. Finally, it will print out`
			`the main page with commands for working with the site.`
			`"""`
			`site_dir = os.path.join(self.db, "1")`
			`site_id = 1`
			`while os.path.exists(site_dir):`
			`site_id += 1`
			`site_dir = os.path.join(self.db, str(site_id))`
			`os.mkdir(site_dir)`

			`root_d = {"template": "default", "id": site_id}`
			`with open(os.path.join(site_dir, "root.json", "w")) as root_json:`
			`root_json.write(json.dumps(root_d))`

			`photo = self.send("PX", "photo")["photo"]`

			`# TODO: come up with a generic set of img form operations for Bureau`
			`# should map regions defined with percentages to names`
			`form_img = PIL.Image.open(photo)`
			`fx, fy = form_img.size`
			`title_region = (0, 0, 0.5 * fx, 0.125 * fy)`
			`title_img = form_img.crop(title_region)`
			`content_region = (0, 0.125 * fy, fx, fy)`
			`content_img = form_img.crop(content_region)`

			`def _update_page(self, site, page):`
			`pass`

port news to the new system. 8 years ago			`@add_command("news", "Print a personalized daily newspaper")`
			`def daily_news(self):`
update photography temporarily for logitech cam. docstring cleanup. 8 years ago			`"""`
			`Print out a selection of up-to-the-minute news items culled from various`
			`sources on the internet. Current unread Post, weather and finance`
			`information can also be shown.`
			`"""`
port news to the new system. 8 years ago			`news = self._get_news()`
			`# TODO: get weather`
			`# TODO: get finance`
switch to using requests with timeout for rss. cope with slow imap servers. 8 years ago			`inbox = self.send("PO", "unread")`
cleaned up layout and data for news 8 years ago			`date = datetime.today().strftime("%A %B %e, %Y")`
switch to using requests with timeout for rss. cope with slow imap servers. 8 years ago			`if inbox is None:`
			`inbox = [] # if IMAP times out just move on...`
cleaned up layout and data for news 8 years ago			`self.print_full("news.html", news=news, inbox=inbox, date=date)`
port news to the new system. 8 years ago
article scraping, printing and template. barcodes and db for links. 9 years ago			`@add_command("r", "Print a web page for reading")`
			`def print_url(self, data):`
			`"""`
			`Print out a web page for reading. The command requires a short-code,`
			`typically referenced via barcode. Short-codes refer to full resource`
			`URIs recorded in the Publications office 'urldb' database.`
			`"""`
			`shortcode, _ = data.split(".")`
			`with self.dbenv.begin(db=self.urldb) as txn:`
properly deal with unknown url short codes 8 years ago			`self.log.debug("looking up short-code:" + shortcode)`
			`url = txn.get(shortcode.encode('utf-8'))`
article scraping, printing and template. barcodes and db for links. 9 years ago
			`if not url:`
properly deal with unknown url short codes 8 years ago			`self.log.warning("no valid URL in db for short code: " + shortcode)`
article scraping, printing and template. barcodes and db for links. 9 years ago			`return`
properly deal with unknown url short codes 8 years ago			`else:`
			`url = url.decode()`
article scraping, printing and template. barcodes and db for links. 9 years ago
use requests, cope with site timeouts 8 years ago			`# download page with requests`
article scraping, printing and template. barcodes and db for links. 9 years ago			`headers = {'User-Agent': 'Mozilla/5.0'}`
use requests, cope with site timeouts 8 years ago			`try:`
			`resp = requests.get(url, timeout=20.0, headers=headers)`
			`except requests.ReadTimeout:`
add some nicer error handling 5 years ago			`self.log.warning("Timeout reading url %s", url)`
			`self.print_small("Error: timed out reading " + url)`
			`return`
			`except requests.ConnectionError as e:`
			`self.log.warning("Error reading url %s", url)`
			`self.print_small("Error: connect error on " + url)`
			`return`
article scraping, printing and template. barcodes and db for links. 9 years ago
			`# re-render with readability`
use requests, cope with site timeouts 8 years ago			`doc = readability.Document(resp.text,`
article scraping, printing and template. barcodes and db for links. 9 years ago			`url=url)`
			`timestamp = datetime.now().strftime("Sourced %d %B, %Y at %H:%M")`
			`html = lxml.html.document_fromstring(doc.summary())`

article template 2-col layout. typography tweaks. 9 years ago			`notecount = 0`
article scraping, printing and template. barcodes and db for links. 9 years ago			`# store links then make corresponding svg barcodes`
			`for link in html.findall(".//a"):`
skip bogus links in articles 7 years ago			`if "href" in link.attrib:`
			`notecount += 1`
			`else:`
			`continue # skip bogus links`
port news to the new system. 8 years ago			`tmpcode = self._make_shorturl(link.attrib["href"])`
article scraping, printing and template. barcodes and db for links. 9 years ago
ignore bogus links when assembling article footnotes 5 years ago			`footlink = html.makeelement("span")`
			`footlink.attrib["class"] = "footlink"`
			`footlink.text = str(notecount)`
			`link.append(footlink)`

			`svg = kode256.svg("PBr." + tmpcode)`
			`#svg = self.bc_svg("PBr." + tmpcode, height=7.0)`
article scraping, printing and template. barcodes and db for links. 9 years ago
article template 2-col layout. typography tweaks. 9 years ago			`footnote = html.makeelement("div")`
			`footnote.attrib["class"] = "footnote"`
			`notetext = html.makeelement("div")`
			`notetext.text = str(notecount) + ". " + link.attrib["href"]`
			`footnote.append(notetext)`
clean up article style. embed custom fonts. remove cruft from print_full mmethod 6 years ago			`#TODO: make this barcode inline thing a util method`
refactor barcode generation for svgs. 5 years ago			`encoded_svg = b64encode(svg.encode()).decode()`
clean up article style. embed custom fonts. remove cruft from print_full mmethod 6 years ago			`encoded_data = "data:image/svg+xml;charset=utf-8;base64," + encoded_svg`
layout tweaks to articles 6 years ago			`svg = '<img class="endnotebc" src="%s"/>' % encoded_data`
fix svg barcode inlines on article views 6 years ago			`footnote.append(lxml.html.fromstring(svg))`
article template 2-col layout. typography tweaks. 9 years ago			`html.append(footnote)`
article scraping, printing and template. barcodes and db for links. 9 years ago
			`self.print_full("article.html", title=doc.title(),`
			`article=lxml.html.tostring(html).decode("utf-8"),`
			`url=url, date=timestamp)`
big cleanup and rearrange into modules. added skeleton for publications CMS. added a sales dept to play vids from barcodes. 9 years ago
support OpenGraph data for news 7 years ago			`def _get_ogdata(self, url):`
			`"""`
			`returns an object with OpenGraph metadata if available`
			`"""`
			`ogdata = {}`

			`# download page with requests`
			`headers = {'User-Agent': 'Mozilla/5.0'}`
			`try:`
			`resp = requests.get(url, timeout=20.0, headers=headers)`
give up on slow loading news pages 7 years ago			`except (requests.ReadTimeout, requests.exceptions.ReadTimeout):`
support OpenGraph data for news 7 years ago			`self.log.warning("Timeout fetching OpenGraph data from document %s",`
			`url)`
			`return ogdata`
ignore messed up web servers 7 years ago			`except (requests.exceptions.ConnectionError):`
			`self.log.warning("Connection errors fetching OpenGraph from %s",`
			`url)`
			`return ogdata`
deal with bogus URLs 7 years ago			`except requests.exceptions.MissingSchema:`
			`self.log.warning("Can't get OpenGraph data from bogus URL %s", url)`
			`return ogdata`
support OpenGraph data for news 7 years ago
			`html = lxml.html.document_fromstring(resp.text)`

			`#find all elements with property="og:<something>"`
xpath to find non-meta tags with OG data 7 years ago			`elements = html.findall(".//*[@property]")`
support OpenGraph data for news 7 years ago			`for element in elements:`
			`prop = element.get("property")`
			`val = element.get("content")`
			`if prop.startswith("og:"):`
			`prop = prop[3:]`
			`ogdata[prop] = val`

			`return ogdata`

port news to the new system. 8 years ago			`def _get_news(self):`
			`"""fetch a set of latest news entries from sources specified in config`
			`"""`
RSS feeds now defined in YAML config file 7 years ago			`feeds = self.config["newsfeeds"]`
port news to the new system. 8 years ago			`entries = []`
try some multi-threading to deal with slow downloads of feeds and metadata 7 years ago			`feed_data = {}`
			`threads = []`
port news to the new system. 8 years ago
			`for source in feeds:`
RSS feeds now defined in YAML config file 7 years ago			`url = source["url"]`
use locking on rss entries 5 years ago			`feedlock = threading.RLock()`
switch to using requests with timeout for rss. cope with slow imap servers. 8 years ago
try some multi-threading to deal with slow downloads of feeds and metadata 7 years ago			`def fetch_feed(url, feed_data):`
			`"""`
			`get feed data with requests using a timeout`
			`"""`
			`try:`
			`resp = requests.get(url, timeout=20.0)`
use locking on rss entries 5 years ago			`with feedlock:`
			`feed_data[url] = io.BytesIO(resp.content)`
try some multi-threading to deal with slow downloads of feeds and metadata 7 years ago			`except requests.ReadTimeout:`
			`self.log.warning("Timeout reading RSS feed %s", url)`
use locking on rss entries 5 years ago			`with feedlock:`
			`feed_data[url] = None`
try some multi-threading to deal with slow downloads of feeds and metadata 7 years ago			`return`

			`thread = threading.Thread(target=fetch_feed, args=(url, feed_data))`
			`threads.append(thread)`
			`thread.start()`

			`for thread in threads:`
			`thread.join()`
switch to using requests with timeout for rss. cope with slow imap servers. 8 years ago
try some multi-threading to deal with slow downloads of feeds and metadata 7 years ago			`for source in feeds:`
			`feed = feedparser.parse(feed_data[source["url"]])`
			`if feed is None:`
			`continue`
			`num_entries = source["count"]`
port news to the new system. 8 years ago
			`# work around if we don't have enough news`
			`if num_entries > len(feed.entries):`
			`num_entries = len(feed.entries)`

ignore already read articles in feed 7 years ago			`en_count = 0`
			`while en_count < num_entries:`
			`try:`
			`entry = feed.entries.pop(0)`
			`except IndexError:`
			`# we are out of entries - quit`
			`en_count = num_entries`
			`continue`

			`# ignore the old news we've already seen`
- Fixes for urls that are too long - started to clean up tweet detail print-out 7 years ago			`#`
ignore already read articles in feed 7 years ago			`with self.dbenv.begin(db=self.rev_urldb) as txn:`
			`res = txn.get(entry.link.encode())`
			`if res is not None:`
			`continue`

port news to the new system. 8 years ago			`entry.source = feed.feed.title`
			`entry.dbhash = self._make_shorturl(entry.link)`
ignore bogus links when assembling article footnotes 5 years ago			`entry.svg = kode256.svg("PBr." + entry.dbhash)`
			`#entry.svg = self.bc_svg("PBr." + entry.dbhash, width=0.24,`
			`# height=7.0)`
refactor barcode generation for svgs. 5 years ago			`encoded_svg = b64encode(entry.svg.encode()).decode()`
try using b64 encoded inline svgs 6 years ago			`encoded_data = "data:image/svg+xml;charset=utf-8;base64," + encoded_svg`
			`entry.svg = '<img src="%s"/>' % encoded_data`
port news to the new system. 8 years ago
skip bogus or empty news feed items 7 years ago			`# skip bogus entries with no text`
			`if not hasattr(entry, "summary"):`
			`continue`

port news to the new system. 8 years ago			`# limit summary to the last space below 500 characters`
			`if len(entry.summary) > 500:`
			`end = entry.summary.rfind(" ", 0, 499)`
			`entry.summary = entry.summary[0:end] + "…"`

use bleach to clean up cruddy html 8 years ago			`entry.summary = bleach.clean(entry.summary, strip=True)`
refactor email message formatting. redo unread lister. keep track of imapid-shortcode mappings. 8 years ago
port news to the new system. 8 years ago			`entries.append(entry)`
ignore already read articles in feed 7 years ago			`en_count += 1`
port news to the new system. 8 years ago
try some multi-threading to deal with slow downloads of feeds and metadata 7 years ago			`# do this multi-threaded cuz downloads can be slow`
			`threads = []`
			`for i in range(len(entries)):`
			`entry = entries[i]`
			`def fetch_og(entry):`
			`"""`
			`get OpenGraph data for entry`
try rolling our own image downloader for news 5 years ago			`and download image`
			`TODO: add microdata support here to get author`
try some multi-threading to deal with slow downloads of feeds and metadata 7 years ago			`"""`
			`og_data = self._get_ogdata(entry.link)`
			`if hasattr(entry, "media_thumbnail"):`
			`entry.img = entry.media_thumbnail[-1]["url"]`
			`elif "image" in og_data:`
			`entry.img = og_data["image"]`
			`else:`
			`entry.img = " "`

try rolling our own image downloader for news 5 years ago			`if entry.img != " ":`
try rolling our own image downloader for news 5 years ago			`fileext = "." + entry.img.rsplit(".",1)[1]`
			`filename = tempfile.mktemp(fileext)`
			`print("fetching", entry.img, filename)`
			`urllib.request.urlretrieve(entry.img, filename)`
			`entry.img = "file://" + filename`


force args to a tuple 7 years ago			`thread = threading.Thread(target=fetch_og, args=(entry,))`
try some multi-threading to deal with slow downloads of feeds and metadata 7 years ago			`threads.append(thread)`
			`thread.start()`

			`# wait till we're done`
			`for thread in threads:`
			`thread.join()`

port news to the new system. 8 years ago			`return entries`

big cleanup and rearrange into modules. added skeleton for publications CMS. added a sales dept to play vids from barcodes. 9 years ago
			`def main():`
port news to the new system. 8 years ago			`pub = Publications()`
			`pub.run()`
big cleanup and rearrange into modules. added skeleton for publications CMS. added a sales dept to play vids from barcodes. 9 years ago

			`if __name__ == "__main__":`
			`main()`