diff --git a/screenless/bureau/publications/publications.py b/screenless/bureau/publications/publications.py
index 0c61565..1efe2fb 100644
--- a/screenless/bureau/publications/publications.py
+++ b/screenless/bureau/publications/publications.py
@@ -164,6 +164,46 @@ class Publications(Bureau):
                         article=lxml.html.tostring(html).decode("utf-8"),
                         url=url, date=timestamp)
 
+    def _get_ogdata(self, url):
+        """
+        returns a dict with the OpenGraph metadata of the page at url
+
+        Keys are the property names with the "og:" prefix removed (e.g.
+        "image"), values are the matching "content" attributes. Fetching
+        is best-effort: an empty dict is returned when the page cannot
+        be downloaded or parsed.
+        """
+        ogdata = {}
+
+        # download page with requests
+        headers = {'User-Agent': 'Mozilla/5.0'}
+        try:
+            resp = requests.get(url, timeout=20.0, headers=headers)
+        except requests.RequestException as err:
+            # covers timeouts as well as DNS / connection / TLS failures
+            self.log.warning("Error fetching OpenGraph data from %s: %s",
+                             url, err)
+            return ogdata
+
+        try:
+            html = lxml.html.document_fromstring(resp.text)
+        except (lxml.etree.ParserError, ValueError) as err:
+            # lxml rejects empty documents and unicode input that carries
+            # an XML encoding declaration
+            self.log.warning("Could not parse document %s: %s", url, err)
+            return ogdata
+
+        # find all elements with a property="og:..." attribute
+        for element in html.iterfind(".//*[@property]"):
+            prop = element.get("property")
+            val = element.get("content")
+            if not prop.startswith("og:") or val is None:
+                continue
+            # the first occurrence of a repeated tag takes precedence
+            ogdata.setdefault(prop[3:], val)
+
+        return ogdata
+
     def _get_news(self):
         """fetch a set of latest news entries from sources specified in config
         """
@@ -189,12 +229,15 @@ class Publications(Bureau):
             num_entries = len(feed.entries)
 
             for _ in range(num_entries):
-                entry = feed.entries.pop()
+                entry = feed.entries.pop(0)
                 entry.source = feed.feed.title
                 entry.dbhash = self._make_shorturl(entry.link)
                 entry.svg = code128.svg("PBr." + entry.dbhash)
+                og_data = self._get_ogdata(entry.link)
                 if hasattr(entry, "media_thumbnail"):
                     entry.img = entry.media_thumbnail[-1]["url"]
+                elif og_data.get("image"):
+                    entry.img = og_data["image"]
                 else:
                     entry.img = " "
 