|
|
|
@ -164,6 +164,35 @@ class Publications(Bureau):
|
|
|
|
|
article=lxml.html.tostring(html).decode("utf-8"),
|
|
|
|
|
url=url, date=timestamp)
|
|
|
|
|
|
|
|
|
|
def _get_ogdata(self, url):
    """
    Return a dict with the OpenGraph metadata of the page at ``url``.

    Keys are the OpenGraph property names with the leading ``og:``
    removed (``og:image`` is stored under ``"image"``); values are the
    ``content`` attribute of the matching element.

    The lookup is best-effort: if the page cannot be downloaded or
    parsed, a warning is logged and an empty dict is returned instead of
    raising.
    """
    og_prefix = "og:"
    ogdata = {}

    # download page with requests
    headers = {'User-Agent': 'Mozilla/5.0'}
    try:
        resp = requests.get(url, timeout=20.0, headers=headers)
    except requests.Timeout:
        # covers both connect and read timeouts
        self.log.warning("Timeout fetching OpenGraph data from document %s",
                         url)
        return ogdata
    except requests.RequestException as err:
        # connection refused, DNS failure, invalid URL, too many redirects...
        self.log.warning("Error fetching OpenGraph data from document %s: %s",
                         url, err)
        return ogdata

    try:
        html = lxml.html.document_fromstring(resp.text)
    except (lxml.etree.ParserError, ValueError) as err:
        # ParserError: empty document; ValueError: unicode text that
        # carries an XML encoding declaration
        self.log.warning("Could not parse document %s for OpenGraph data: %s",
                         url, err)
        return ogdata

    # find all elements with property="og:<something>"
    # (the "*" is required: ".//[@property]" is not a valid path and
    # makes lxml raise SyntaxError)
    for element in html.iterfind(".//*[@property]"):
        prop = element.get("property")
        if not prop.startswith(og_prefix):
            continue

        val = element.get("content")
        if val is None:
            # a tag without content carries no usable value
            continue

        key = prop[len(og_prefix):]
        ogdata[key] = val
        self.log.debug("set og: %s to %s", key, val)

    return ogdata
|
|
|
|
|
|
|
|
|
|
def _get_news(self):
|
|
|
|
|
"""fetch a set of latest news entries from sources specified in config
|
|
|
|
|
"""
|
|
|
|
@ -189,12 +218,15 @@ class Publications(Bureau):
|
|
|
|
|
num_entries = len(feed.entries)
|
|
|
|
|
|
|
|
|
|
for _ in range(num_entries):
|
|
|
|
|
entry = feed.entries.pop()
|
|
|
|
|
entry = feed.entries.pop(0)
|
|
|
|
|
entry.source = feed.feed.title
|
|
|
|
|
entry.dbhash = self._make_shorturl(entry.link)
|
|
|
|
|
entry.svg = code128.svg("PBr." + entry.dbhash)
|
|
|
|
|
og_data = self._get_ogdata(entry.link)
|
|
|
|
|
if hasattr(entry, "media_thumbnail"):
|
|
|
|
|
entry.img = entry.media_thumbnail[-1]["url"]
|
|
|
|
|
elif hasattr(og_data, "image"):
|
|
|
|
|
entry.img = og_data["image"]
|
|
|
|
|
else:
|
|
|
|
|
entry.img = " "
|
|
|
|
|
|
|
|
|
|