@@ -5,6 +5,7 @@ import json
 import os
 import random
 import string
+import threading
 import urllib.request, urllib.parse, urllib.error
 
 import bleach
@@ -204,20 +205,36 @@ class Publications(Bureau):
         """
         feeds = self.config["newsfeeds"]
         entries = []
+        feed_data = {}
+        threads = []
 
         for source in feeds:
             url = source["url"]
-            num_entries = source["count"]
 
             # get feed data with requests using a timeout
-            try:
-                resp = requests.get(url, timeout=20.0)
-            except requests.ReadTimeout:
-                self.log.warning("Timeout reading RSS feed %s", url)
-                continue
-            feed_data = io.BytesIO(resp.content)
-            feed = feedparser.parse(feed_data)
+            def fetch_feed(url, feed_data):
+                """
+                get feed data with requests using a timeout
+                """
+                try:
+                    resp = requests.get(url, timeout=20.0)
+                    feed_data[url] = io.BytesIO(resp.content)
+                except requests.ReadTimeout:
+                    self.log.warning("Timeout reading RSS feed %s", url)
+                    feed_data[url] = None
+                    return
+
+            thread = threading.Thread(target=fetch_feed, args=(url, feed_data))
+            threads.append(thread)
+            thread.start()
+
+        for thread in threads:
+            thread.join()
+
+        for source in feeds:
+            if feed_data.get(source["url"]) is None:
+                continue
+            feed = feedparser.parse(feed_data[source["url"]])
+            num_entries = source["count"]
 
             # work around if we don't have enough news
             if num_entries > len(feed.entries):
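For reference, the fan-out/join pattern this hunk introduces looks like this in isolation. A minimal sketch, not part of the patch: fetch_all and its bare requests/threading usage here are illustrative only. Each worker writes to its own key, so under CPython the shared dict needs no explicit lock:

    import threading

    import requests

    def fetch_all(urls, timeout=20.0):
        """Fetch every URL on its own thread; results land in a shared dict."""
        results = {}
        threads = []

        def fetch(url):
            try:
                resp = requests.get(url, timeout=timeout)
                results[url] = resp.content
            except requests.RequestException:
                results[url] = None  # record the failure so callers can skip it

        for url in urls:
            thread = threading.Thread(target=fetch, args=(url,))
            threads.append(thread)
            thread.start()

        for thread in threads:
            thread.join()

        return results

Joining every thread before reading the results is what makes the second pass over feeds safe: by then each URL maps to either response bytes or None.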
@@ -228,13 +245,6 @@ class Publications(Bureau):
             entry.source = feed.feed.title
             entry.dbhash = self._make_shorturl(entry.link)
             entry.svg = code128.svg("PBr." + entry.dbhash)
-            og_data = self._get_ogdata(entry.link)
-            if hasattr(entry, "media_thumbnail"):
-                entry.img = entry.media_thumbnail[-1]["url"]
-            elif "image" in og_data:
-                entry.img = og_data["image"]
-            else:
-                entry.img = " "
 
             # skip bogus entries with no text
             if not hasattr(entry, "summary"):
@@ -249,6 +259,30 @@ class Publications(Bureau):
             entries.append(entry)
 
+        # do this multi-threaded cuz downloads can be slow
+        threads = []
+        for entry in entries:
+            def fetch_og(entry):
+                """
+                get OpenGraph data for entry
+                """
+                og_data = self._get_ogdata(entry.link)
+                if hasattr(entry, "media_thumbnail"):
+                    entry.img = entry.media_thumbnail[-1]["url"]
+                elif "image" in og_data:
+                    entry.img = og_data["image"]
+                else:
+                    entry.img = " "
+
+            thread = threading.Thread(target=fetch_og, args=(entry,))
+            threads.append(thread)
+            thread.start()
+
+        # wait till we're done
+        for thread in threads:
+            thread.join()
+
         return entries
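The same per-entry fan-out could also be expressed with concurrent.futures, which bounds the number of live threads instead of spawning one per entry. A hypothetical sketch, assuming resolve_image carries the body of fetch_og above and get_ogdata stands in for self._get_ogdata:

    from concurrent.futures import ThreadPoolExecutor

    def resolve_images(entries, get_ogdata, max_workers=8):
        """Set entry.img for each entry using a bounded thread pool."""
        def resolve_image(entry):
            og_data = get_ogdata(entry.link)
            if hasattr(entry, "media_thumbnail"):
                entry.img = entry.media_thumbnail[-1]["url"]
            elif "image" in og_data:
                entry.img = og_data["image"]
            else:
                entry.img = " "

        # exiting the context manager waits for all submitted work;
        # list() drains the iterator so any worker exception surfaces here
        with ThreadPoolExecutor(max_workers=max_workers) as pool:
            list(pool.map(resolve_image, entries))

One thread per entry is fine at this scale; a pool starts to matter once the feed list grows into the hundreds.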