From dcaa15aeec738875159c296e95d9189fb797ce21 Mon Sep 17 00:00:00 2001
From: Brendan Howell
Date: Sun, 29 Oct 2017 22:29:36 +0100
Subject: [PATCH] ignore already read articles in feed

---
 .../bureau/publications/publications.py | 22 +++++++++++++++++--
 1 file changed, 20 insertions(+), 2 deletions(-)

diff --git a/screenless/bureau/publications/publications.py b/screenless/bureau/publications/publications.py
index cce59a3..590c09d 100644
--- a/screenless/bureau/publications/publications.py
+++ b/screenless/bureau/publications/publications.py
@@ -41,6 +41,7 @@ class Publications(Bureau):
 
         # set up urldb for short-codes
         self.urldb = self.dbenv.open_db(b"urldb")
+        self.rev_urldb = self.dbenv.open_db(b"rev_urldb")
 
     def _make_shorturl(self, url):
         def _shortcode():
@@ -55,6 +56,9 @@ class Publications(Bureau):
                 res = txn.get(tmpcode.encode())
             txn.put(tmpcode.encode(), url.encode())
 
+        with self.dbenv.begin(write=True, db=self.rev_urldb) as txn:
+            txn.put(url.encode(), tmpcode.encode())
+
         return tmpcode
 
 
@@ -243,8 +247,21 @@ class Publications(Bureau):
         if num_entries > len(feed.entries):
             num_entries = len(feed.entries)
 
-        for _ in range(num_entries):
-            entry = feed.entries.pop(0)
+        en_count = 0
+        while en_count < num_entries:
+            try:
+                entry = feed.entries.pop(0)
+            except IndexError:
+                # we are out of entries - quit
+                en_count = num_entries
+                continue
+
+            # ignore the old news we've already seen
+            with self.dbenv.begin(db=self.rev_urldb) as txn:
+                res = txn.get(entry.link.encode())
+                if res is not None:
+                    continue
+
             entry.source = feed.feed.title
             entry.dbhash = self._make_shorturl(entry.link)
             entry.svg = code128.svg("PBr." + entry.dbhash)
@@ -261,6 +278,7 @@ class Publications(Bureau):
                 entry.summary = bleach.clean(entry.summary, strip=True)
 
             entries.append(entry)
+            en_count += 1
 
         # do this multi-threaded cuz downloads can be slow
         threads = []
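
Note (not part of the patch): the change pairs the existing short-code -> URL table with a reverse URL -> short-code table, so the feed loop can skip articles that were already issued a code. Below is a minimal standalone sketch of that lookup pattern with the py-lmdb API; the path and names (pubdb-example, seen_db, mark_seen, is_seen) are illustrative, not taken from publications.py.

import lmdb

# Illustrative sketch only: names and the db path are made up for this note.
env = lmdb.open("/tmp/pubdb-example", max_dbs=4)
seen_db = env.open_db(b"rev_urldb")   # reverse map: article URL -> short-code


def mark_seen(url, shortcode):
    # record that this URL already has a short-code
    with env.begin(write=True, db=seen_db) as txn:
        txn.put(url.encode(), shortcode.encode())


def is_seen(url):
    # True if the URL was handled in an earlier feed pass
    with env.begin(db=seen_db) as txn:
        return txn.get(url.encode()) is not None


# usage: collect only fresh links, skipping already-read ones
fresh = []
for link in ("http://example.com/a", "http://example.com/b"):
    if is_seen(link):
        continue
    mark_seen(link, "Ab3dE")
    fresh.append(link)

Keeping the reverse mapping in its own sub-database makes the "already seen" check a single txn.get() per entry instead of a scan of the forward urldb.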