tweak connection pooling for news

workspace
Brendan Howell 4 years ago
parent 4c54748be3
commit 580c1888f5

@@ -3,6 +3,7 @@ from base64 import b64encode
 from datetime import datetime
 import io
 import json
+import multiprocessing
 import os
 import random
 import string
@@ -313,40 +314,32 @@ class Publications(Bureau):
                 en_count += 1
-        # do this multi-threaded cuz downloads can be slow
-        threads = []
-        for i in range(len(entries)):
-            entry = entries[i]
-            def fetch_og(entry):
-                """
-                get OpenGraph data for entry
-                and download image
-                TODO: add microdata support here to get author
-                """
-                og_data = self._get_ogdata(entry.link)
-                if hasattr(entry, "media_thumbnail"):
-                    entry.img = entry.media_thumbnail[-1]["url"]
-                elif "image" in og_data:
-                    entry.img = og_data["image"]
-                else:
-                    entry.img = " "
-                if entry.img != " ":
-                    fileext = "." + entry.img.rsplit(".",1)[1]
-                    filename = tempfile.mktemp(fileext)
-                    print("fetching", entry.img, filename)
-                    urllib.request.urlretrieve(entry.img, filename)
-                    entry.img = "file://" + filename
-            thread = threading.Thread(target=fetch_og, args=(entry,))
-            threads.append(thread)
-            thread.start()
-        # wait till we're done
-        for thread in threads:
-            thread.join()
-        return entries
+        # NOTE: this could be further optimized with 2 threads per host (chunks)
+        def fetch_og(entry):
+            """
+            get OpenGraph data for entry
+            and download image
+            TODO: add microdata support here to get author
+            """
+            og_data = self._get_ogdata(entry.link)
+            if hasattr(entry, "media_thumbnail"):
+                entry.img = entry.media_thumbnail[-1]["url"]
+            elif "image" in og_data:
+                entry.img = og_data["image"]
+            else:
+                entry.img = " "
+            if entry.img != " ":
+                fileext = "." + entry.img.rsplit(".",1)[1]
+                filename = tempfile.mktemp(fileext)
+                print("fetching", entry.img, filename)
+                urllib.request.urlretrieve(entry.img, filename)
+                entry.img = "file://" + filename
+        fetcher = multiprocessing.pool.ThreadPool(processes=2)
+        entries_fetched = fetcher.map(fetch_og, entries)
+        return entries_fetched
def main():

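The change above replaces one thread per feed entry with a fixed pool of two worker threads, which appears to be the connection pooling the commit title refers to. Below is a minimal standalone sketch of that ThreadPool pattern, assuming a hypothetical fetch_image worker and a small list of entry dicts (neither is code from this repository); the worker returns the object it filled in so that ThreadPool.map hands back the processed entries.

# Minimal sketch of the thread-pool fetch pattern, not code from this repo.
import tempfile
import urllib.request
from multiprocessing.pool import ThreadPool

# hypothetical stand-ins for the feed entries handled by Publications
FEED_ENTRIES = [
    {"img": "https://example.org/a.jpg"},
    {"img": "https://example.org/b.png"},
]

def fetch_image(entry):
    """Download entry["img"] to a temp file and rewrite it as a file:// URL."""
    fileext = "." + entry["img"].rsplit(".", 1)[1]
    filename = tempfile.mktemp(fileext)
    urllib.request.urlretrieve(entry["img"], filename)
    entry["img"] = "file://" + filename
    return entry  # returning the entry lets map() collect the processed items

# two worker threads cap the number of simultaneous downloads
pool = ThreadPool(processes=2)
fetched = pool.map(fetch_image, FEED_ENTRIES)
pool.close()
pool.join()

ThreadPool.map blocks until every item has been handled and preserves the order of the input list, so the caller can use the returned list the same way it used the original entries.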