use requests, cope with site timeouts

workspace
Brendan Howell 8 years ago
parent 7e5ad5cd71
commit 1ff5b3e886

@@ -124,13 +124,16 @@ class Publications(Bureau):
 else:
     url = url.decode()
-# download
+# download page with requests
 headers = {'User-Agent': 'Mozilla/5.0'}
-req = urllib.request.Request(url, None, headers)
-urldata = urllib.request.urlopen(req)
+try:
+    resp = requests.get(url, timeout=20.0, headers=headers)
+except requests.ReadTimeout:
+    self.log.warning("Timeout reading RSS feed %s", url)
+    return  # TODO: do we need to spit out an error?
 # re-render with readability
-doc = readability.Document(urldata.read(),
+doc = readability.Document(resp.text,
                            url=url)
 timestamp = datetime.now().strftime("Sourced %d %B, %Y at %H:%M")
 html = lxml.html.document_fromstring(doc.summary())
@@ -175,7 +178,7 @@ class Publications(Bureau):
 try:
     resp = requests.get(url, timeout=20.0)
 except requests.ReadTimeout:
-    self.log("Timeout reading RSS feed %s", url)
+    self.log.warning("Timeout reading RSS feed %s", url)
     continue
 feed_data = io.BytesIO(resp.content)

Loading…
Cancel
Save