diff --git a/screenless/bureau/publications/publications.py b/screenless/bureau/publications/publications.py
index 71f2bd7..f53aeb8 100644
--- a/screenless/bureau/publications/publications.py
+++ b/screenless/bureau/publications/publications.py
@@ -3,6 +3,7 @@ from base64 import b64encode
 from datetime import datetime
 import io
 import json
+import multiprocessing.pool
 import os
 import random
 import string
@@ -313,40 +314,40 @@ class Publications(Bureau):
                 en_count += 1
 
         # do this multi-threaded cuz downloads can be slow
-        threads = []
-        for i in range(len(entries)):
-            entry = entries[i]
-            def fetch_og(entry):
-                """
-                get OpenGraph data for entry
-                and download image
-                TODO: add microdata support here to get author
-                """
-                og_data = self._get_ogdata(entry.link)
-                if hasattr(entry, "media_thumbnail"):
-                    entry.img = entry.media_thumbnail[-1]["url"]
-                elif "image" in og_data:
-                    entry.img = og_data["image"]
-                else:
-                    entry.img = " "
-
-                if entry.img != " ":
-                    fileext = "." + entry.img.rsplit(".",1)[1]
-                    filename = tempfile.mktemp(fileext)
-                    print("fetching", entry.img, filename)
-                    urllib.request.urlretrieve(entry.img, filename)
-                    entry.img = "file://" + filename
-
-
-            thread = threading.Thread(target=fetch_og, args=(entry,))
-            threads.append(thread)
-            thread.start()
+        # NOTE: this could be further optimized with 2 threads per host (chunks)
+        def fetch_og(entry):
+            """
+            get OpenGraph data for entry
+            and download image
+
+            sets entry.img to a file:// URL of the downloaded image
+            (or " " when the entry has no image) and returns the entry,
+            so that ThreadPool.map() hands back the entries themselves
+            TODO: add microdata support here to get author
+            """
+            og_data = self._get_ogdata(entry.link)
+            if hasattr(entry, "media_thumbnail"):
+                entry.img = entry.media_thumbnail[-1]["url"]
+            elif "image" in og_data:
+                entry.img = og_data["image"]
+            else:
+                entry.img = " "
 
-        # wait till we're done
-        for thread in threads:
-            thread.join()
+            if entry.img != " ":
+                fileext = "." + entry.img.rsplit(".",1)[1]
+                filename = tempfile.mktemp(fileext)
+                print("fetching", entry.img, filename)
+                urllib.request.urlretrieve(entry.img, filename)
+                entry.img = "file://" + filename
+
+            return entry
+
+        # ThreadPool.map() returns fetch_og's results in input order; the
+        # with-block stops the worker threads once all entries are done
+        with multiprocessing.pool.ThreadPool(processes=2) as fetcher:
+            entries_fetched = fetcher.map(fetch_og, entries)
 
-        return entries
+        return entries_fetched
 
 
 def main():