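# NOTE: the lines below are residue from the repository web page this file was
# copied from; they are not part of the program.
# You cannot select more than 25 topics. Topics must start with a letter or
# number, can include dashes ('-') and can be up to 35 characters long.
#
# 56 lines
# 1.8 KiB
# Python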

import logging
logging.getLogger('flask_assistant').setLevel(logging.DEBUG)
from flask import Flask
from flask_assistant import Assistant, ask, tell, event, build_item
# This script scrapes the text on a page that contains a keyword.
# Based on a tutorial by Michael Murtaugh at the Piet Zwart Institute:
# https://pad.xpub.nl/p/prototyping_02102018
import sys
import html5lib
import xml.etree.ElementTree as ET
from urllib.request import urlopen
# Flask application object that the assistant below is attached to.
app = Flask(__name__)
# Assistant bound to the app with route='/'; the @assist.action handlers in
# this file register themselves on this object.
assist = Assistant(app, route='/')
@assist.action('greeting')
def greet_and_start():
    """Handle the 'greeting' action by asking what to look up on NYTimes.com."""
    return ask("Hey! What should I check for you at NYTimes.com?")
@assist.action('give-keyword')
def read_headlines(keyword):
    """Reply with the texts on the NYTimes front page that mention *keyword*.

    The page at https://nytimes.com is downloaded and parsed with html5lib,
    and the text of every element (except ``script`` elements) that contains
    the keyword is collected. Matching is case-insensitive.

    Args:
        keyword: The word to look for, as supplied to the 'give-keyword'
            action.

    Returns:
        The result of ``ask()`` for a message made of a fixed introduction
        followed by the matching texts, one per line.
    """
    # The element text is lower-cased before the comparison, so the keyword
    # must be lower-cased as well; otherwise a keyword such as "Trump" could
    # never match anything.
    needle = keyword.lower()

    # Open the page and parse it into an element tree.
    url = 'https://nytimes.com'
    with urlopen(url) as f:
        t = html5lib.parse(f, namespaceHTMLElements=False)

    # Walk the whole tree and keep every text that contains the keyword.
    # Text inside <script> tags is skipped so that JavaScript which happens
    # to contain the word is not reported.
    text_list = [
        x.text
        for x in t.iter()
        if x.text is not None and x.tag != 'script' and needle in x.text.lower()
    ]

    # Turn the list into one newline-separated string and send it back,
    # using the keyword exactly as the user gave it.
    text_list_strings = "\n".join(text_list)
    speech = 'Ok, I found the following news about ' + keyword + '. It is juicy!'
    return ask('{} {}'.format(speech, text_list_strings))
# Start Flask's built-in server only when this file is run as a script.
# debug=True turns on Flask's debug mode, which is intended for local
# development only.
if __name__ == "__main__":
    app.run(debug=True)