From dcaa15aeec738875159c296e95d9189fb797ce21 Mon Sep 17 00:00:00 2001
From: Brendan Howell
Date: Sun, 29 Oct 2017 22:29:36 +0100
Subject: [PATCH] ignore already read articles in feed

---
 .../bureau/publications/publications.py | 22 +++++++++++++++++--
 1 file changed, 20 insertions(+), 2 deletions(-)

diff --git a/screenless/bureau/publications/publications.py b/screenless/bureau/publications/publications.py
index cce59a3..590c09d 100644
--- a/screenless/bureau/publications/publications.py
+++ b/screenless/bureau/publications/publications.py
@@ -41,6 +41,7 @@ class Publications(Bureau):
 
         # set up urldb for short-codes
         self.urldb = self.dbenv.open_db(b"urldb")
+        self.rev_urldb = self.dbenv.open_db(b"rev_urldb")
 
     def _make_shorturl(self, url):
         def _shortcode():
@@ -55,6 +56,9 @@ class Publications(Bureau):
                 res = txn.get(tmpcode.encode())
             txn.put(tmpcode.encode(), url.encode())
 
+        with self.dbenv.begin(write=True, db=self.rev_urldb) as txn:
+            txn.put(url.encode(), tmpcode.encode())
+
         return tmpcode
 
 
@@ -243,8 +247,21 @@ class Publications(Bureau):
         if num_entries > len(feed.entries):
             num_entries = len(feed.entries)
 
-        for _ in range(num_entries):
-            entry = feed.entries.pop(0)
+        en_count = 0
+        while en_count < num_entries:
+            try:
+                entry = feed.entries.pop(0)
+            except IndexError:
+                # we are out of entries - quit
+                en_count = num_entries
+                continue
+
+            # ignore the old news we've already seen
+            with self.dbenv.begin(db=self.rev_urldb) as txn:
+                res = txn.get(entry.link.encode())
+                if res is not None:
+                    continue
+
             entry.source = feed.feed.title
             entry.dbhash = self._make_shorturl(entry.link)
             entry.svg = code128.svg("PBr." + entry.dbhash)
@@ -261,6 +278,7 @@ class Publications(Bureau):
                 entry.summary = bleach.clean(entry.summary, strip=True)
 
             entries.append(entry)
+            en_count += 1
 
         # do this multi-threaded cuz downloads can be slow
         threads = []
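
Note (not part of the patch): the change pairs the existing short-code -> URL table with a reverse URL -> short-code table, so the feed loop can skip articles that were already issued a code. Below is a minimal standalone sketch of that lookup pattern with the py-lmdb API; the path and names (pubdb-example, seen_db, mark_seen, is_seen) are illustrative, not taken from publications.py.

import lmdb

# Illustrative sketch only: names and the db path are made up for this note.
env = lmdb.open("/tmp/pubdb-example", max_dbs=4)
seen_db = env.open_db(b"rev_urldb")   # reverse map: article URL -> short-code


def mark_seen(url, shortcode):
    # record that this URL already has a short-code
    with env.begin(write=True, db=seen_db) as txn:
        txn.put(url.encode(), shortcode.encode())


def is_seen(url):
    # True if the URL was handled in an earlier feed pass
    with env.begin(db=seen_db) as txn:
        return txn.get(url.encode()) is not None


# usage: collect only fresh links, skipping already-read ones
fresh = []
for link in ("http://example.com/a", "http://example.com/b"):
    if is_seen(link):
        continue
    mark_seen(link, "Ab3dE")
    fresh.append(link)

Keeping the reverse mapping in its own sub-database makes the "already seen" check a single txn.get() per entry instead of a scan of the forward urldb.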