ignore already read articles in feed

workspace
Brendan Howell 7 years ago
parent 79d5489f67
commit dcaa15aeec

@@ -41,6 +41,7 @@ class Publications(Bureau):
 
         # set up urldb for short-codes
         self.urldb = self.dbenv.open_db(b"urldb")
+        self.rev_urldb = self.dbenv.open_db(b"rev_urldb")
 
     def _make_shorturl(self, url):
         def _shortcode():
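This hunk registers a second named LMDB database, rev_urldb, next to the existing urldb. For context, a minimal sketch of how such an environment and its named databases are opened; the path and max_dbs value are assumptions, not taken from this repo:

    import lmdb

    # max_dbs must cover every named database opened from this environment
    dbenv = lmdb.open("office.db", max_dbs=4)   # hypothetical path and limit
    urldb = dbenv.open_db(b"urldb")             # shortcode -> url
    rev_urldb = dbenv.open_db(b"rev_urldb")     # url -> shortcode (added here)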
@@ -55,6 +56,9 @@ class Publications(Bureau):
             res = txn.get(tmpcode.encode())
             txn.put(tmpcode.encode(), url.encode())
 
+        with self.dbenv.begin(write=True, db=self.rev_urldb) as txn:
+            txn.put(url.encode(), tmpcode.encode())
+
         return tmpcode
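With the reverse mapping written here, a URL can later be checked for an existing shortcode. A sketch of that lookup, assuming the same dbenv and rev_urldb handles as above (shortcode_for is a hypothetical helper, not part of this commit):

    def shortcode_for(dbenv, rev_urldb, url):
        # read-only transaction against the url -> shortcode database
        with dbenv.begin(db=rev_urldb) as txn:
            code = txn.get(url.encode())
        return code.decode() if code is not None else None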
@@ -243,8 +247,21 @@ class Publications(Bureau):
         if num_entries > len(feed.entries):
             num_entries = len(feed.entries)
 
-        for _ in range(num_entries):
-            entry = feed.entries.pop(0)
+        en_count = 0
+        while en_count < num_entries:
+            try:
+                entry = feed.entries.pop(0)
+            except IndexError:
+                # we are out of entries - quit
+                en_count = num_entries
+                continue
+
+            # ignore the old news we've already seen
+            with self.dbenv.begin(db=self.rev_urldb) as txn:
+                res = txn.get(entry.link.encode())
+                if res is not None:
+                    continue
+
             entry.source = feed.feed.title
             entry.dbhash = self._make_shorturl(entry.link)
             entry.svg = code128.svg("PBr." + entry.dbhash)
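Note that the seen-entry continue never touches en_count, so already-read articles do not count toward the quota: the loop keeps popping entries until it has collected num_entries unseen ones or the feed runs out (the IndexError branch). A self-contained sketch of the same pattern, with an in-memory set standing in for the rev_urldb lookup:

    import feedparser

    def fresh_entries(feed_url, num_entries, seen):
        # 'seen' is a set of links playing the role of rev_urldb
        feed = feedparser.parse(feed_url)
        picked = []
        while len(picked) < num_entries and feed.entries:
            entry = feed.entries.pop(0)
            if entry.link in seen:
                continue  # old news: does not count toward the quota
            picked.append(entry)
        return picked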
@@ -261,6 +278,7 @@ class Publications(Bureau):
             entry.summary = bleach.clean(entry.summary, strip=True)
             entries.append(entry)
+            en_count += 1
 
         # do this multi-threaded cuz downloads can be slow
         threads = []
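The comment above points at the download stage that follows this hunk. A generic sketch of the fan-out-and-join threading pattern it refers to; _fetch is a hypothetical worker, not the function used in this file:

    import threading

    def _fetch(entry):
        ...  # hypothetical worker: download and process one entry

    threads = []
    for entry in entries:
        t = threading.Thread(target=_fetch, args=(entry,))
        t.start()
        threads.append(t)
    for t in threads:
        t.join()  # wait for all downloads to finish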
