|
|
@ -124,13 +124,16 @@ class Publications(Bureau):
|
|
|
|
else:
|
|
|
|
else:
|
|
|
|
url = url.decode()
|
|
|
|
url = url.decode()
|
|
|
|
|
|
|
|
|
|
|
|
# download
|
|
|
|
# download page with requests
|
|
|
|
headers = {'User-Agent': 'Mozilla/5.0'}
|
|
|
|
headers = {'User-Agent': 'Mozilla/5.0'}
|
|
|
|
req = urllib.request.Request(url, None, headers)
|
|
|
|
try:
|
|
|
|
urldata = urllib.request.urlopen(req)
|
|
|
|
resp = requests.get(url, timeout=20.0, headers=headers)
|
|
|
|
|
|
|
|
except requests.ReadTimeout:
|
|
|
|
|
|
|
|
self.log.warning("Timeout reading RSS feed %s", url)
|
|
|
|
|
|
|
|
return # TODO: do we need to spit out an error?
|
|
|
|
|
|
|
|
|
|
|
|
# re-render with readability
|
|
|
|
# re-render with readability
|
|
|
|
doc = readability.Document(urldata.read(),
|
|
|
|
doc = readability.Document(resp.text,
|
|
|
|
url=url)
|
|
|
|
url=url)
|
|
|
|
timestamp = datetime.now().strftime("Sourced %d %B, %Y at %H:%M")
|
|
|
|
timestamp = datetime.now().strftime("Sourced %d %B, %Y at %H:%M")
|
|
|
|
html = lxml.html.document_fromstring(doc.summary())
|
|
|
|
html = lxml.html.document_fromstring(doc.summary())
|
|
|
@ -175,7 +178,7 @@ class Publications(Bureau):
|
|
|
|
try:
|
|
|
|
try:
|
|
|
|
resp = requests.get(url, timeout=20.0)
|
|
|
|
resp = requests.get(url, timeout=20.0)
|
|
|
|
except requests.ReadTimeout:
|
|
|
|
except requests.ReadTimeout:
|
|
|
|
self.log("Timeout reading RSS feed %s", url)
|
|
|
|
self.log.warning("Timeout reading RSS feed %s", url)
|
|
|
|
continue
|
|
|
|
continue
|
|
|
|
|
|
|
|
|
|
|
|
feed_data = io.BytesIO(resp.content)
|
|
|
|
feed_data = io.BytesIO(resp.content)
|
|
|
|