try some multi-threading to deal with slow downloads of feeds and metadata

workspace
Brendan Howell 7 years ago
parent 1ad025c043
commit 74ca1e5aaf

@@ -5,6 +5,7 @@ import json
 import os
 import random
 import string
+import threading
 import urllib.request, urllib.parse, urllib.error
 
 import bleach
@@ -204,20 +205,38 @@ class Publications(Bureau):
         """
         feeds = self.config["newsfeeds"]
         entries = []
+        feed_data = {}
+        threads = []
 
         for source in feeds:
             url = source["url"]
-            num_entries = source["count"]
 
-            # get feed data with requests using a timeout
-            try:
-                resp = requests.get(url, timeout=20.0)
-            except requests.ReadTimeout:
-                self.log.warning("Timeout reading RSS feed %s", url)
-                continue
+            def fetch_feed(url, feed_data):
+                """
+                get feed data with requests using a timeout
+                """
+                try:
+                    resp = requests.get(url, timeout=20.0)
+                    feed_data[url] = io.BytesIO(resp.content)
+                except requests.ReadTimeout:
+                    self.log.warning("Timeout reading RSS feed %s", url)
+                    feed_data[url] = None
 
-            feed_data = io.BytesIO(resp.content)
-            feed = feedparser.parse(feed_data)
+            thread = threading.Thread(target=fetch_feed, args=(url, feed_data))
+            threads.append(thread)
+            thread.start()
+
+        for thread in threads:
+            thread.join()
+
+        # parse the feeds once every download has finished
+        for source in feeds:
+            data = feed_data[source["url"]]
+            if data is None:
+                # skip feeds that timed out
+                continue
+            feed = feedparser.parse(data)
+            num_entries = source["count"]
 
             # work around if we don't have enough news
             if num_entries > len(feed.entries):
@@ -228,13 +247,6 @@ class Publications(Bureau):
             entry.source = feed.feed.title
             entry.dbhash = self._make_shorturl(entry.link)
             entry.svg = code128.svg("PBr." + entry.dbhash)
-            og_data = self._get_ogdata(entry.link)
-            if hasattr(entry, "media_thumbnail"):
-                entry.img = entry.media_thumbnail[-1]["url"]
-            elif "image" in og_data:
-                entry.img = og_data["image"]
-            else:
-                entry.img = " "
 
             # skip bogus entries with no text
             if not hasattr(entry, "summary"):
@@ -249,6 +261,30 @@ class Publications(Bureau):
             entries.append(entry)
 
+        # do this multi-threaded cuz downloads can be slow
+        threads = []
+        for entry in entries:
+
+            def fetch_og(entry):
+                """
+                get OpenGraph data for entry
+                """
+                og_data = self._get_ogdata(entry.link)
+                if hasattr(entry, "media_thumbnail"):
+                    entry.img = entry.media_thumbnail[-1]["url"]
+                elif "image" in og_data:
+                    entry.img = og_data["image"]
+                else:
+                    entry.img = " "
+
+            # args must be a one-element tuple, hence the trailing comma
+            thread = threading.Thread(target=fetch_og, args=(entry,))
+            threads.append(thread)
+            thread.start()
+
+        # wait till we're done
+        for thread in threads:
+            thread.join()
 
         return entries
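
For context, the fan-out/join pattern this commit applies to both the feed downloads and the OpenGraph lookups looks like this in isolation. This is a minimal, self-contained sketch, not code from the repository: the urls list, results dict, and fetch_one() helper are illustrative stand-ins.

import io
import threading

import requests

# hypothetical inputs standing in for self.config["newsfeeds"]
urls = [
    "https://example.com/a.rss",
    "https://example.com/b.rss",
]

# each worker writes only its own key, so no lock is needed under CPython
results = {}

def fetch_one(url, results):
    """Download one URL with a timeout; record None on failure."""
    try:
        resp = requests.get(url, timeout=20.0)
        results[url] = io.BytesIO(resp.content)
    except requests.RequestException:
        results[url] = None

threads = []
for url in urls:
    # args must be a tuple of arguments, not a bare parenthesized value
    thread = threading.Thread(target=fetch_one, args=(url, results))
    threads.append(thread)
    thread.start()

# join() blocks until every download has finished or timed out
for thread in threads:
    thread.join()

for url, data in results.items():
    print(url, "failed" if data is None else "ok")

One thread per URL is fine for a handful of feeds; for longer lists, the stdlib's concurrent.futures.ThreadPoolExecutor would bound the number of simultaneous connections while keeping the same overall structure.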
