use requests, cope with site timeouts

workspace
Brendan Howell 7 years ago
parent 7e5ad5cd71
commit 1ff5b3e886

@ -124,13 +124,16 @@ class Publications(Bureau):
else:
url = url.decode()
# download
# download page with requests
headers = {'User-Agent': 'Mozilla/5.0'}
req = urllib.request.Request(url, None, headers)
urldata = urllib.request.urlopen(req)
try:
resp = requests.get(url, timeout=20.0, headers=headers)
except requests.ReadTimeout:
self.log.warning("Timeout reading RSS feed %s", url)
return # TODO: do we need to spit out an error?
# re-render with readability
doc = readability.Document(urldata.read(),
doc = readability.Document(resp.text,
url=url)
timestamp = datetime.now().strftime("Sourced %d %B, %Y at %H:%M")
html = lxml.html.document_fromstring(doc.summary())
@ -175,7 +178,7 @@ class Publications(Bureau):
try:
resp = requests.get(url, timeout=20.0)
except requests.ReadTimeout:
self.log("Timeout reading RSS feed %s", url)
self.log.warning("Timeout reading RSS feed %s", url)
continue
feed_data = io.BytesIO(resp.content)

Loading…
Cancel
Save