"""Flask-Assistant webhook that reads out NYTimes.com text matching a keyword.

The script scrapes the text of a page and keeps the pieces that contain a
keyword supplied by the user.

Based on a tutorial of Michael Murtaugh at Piet Zwart Institute:
https://pad.xpub.nl/p/prototyping_02102018
"""
import logging
logging.getLogger('flask_assistant').setLevel(logging.DEBUG)

from flask import Flask
from flask_assistant import Assistant, ask, tell, event, build_item

import sys
import html5lib
import xml.etree.ElementTree as ET
from urllib.request import urlopen

logger = logging.getLogger(__name__)

# Page that is scraped for matching text.
NYT_URL = 'https://nytimes.com'

# Upper bound on how long the page fetch may block the webhook response.
FETCH_TIMEOUT_SECONDS = 10

app = Flask(__name__)
assist = Assistant(app, route='/')


@assist.action('greeting')
def greet_and_start():
    """Greet the user and ask which keyword to look up on NYTimes.com."""
    speech = "Hey! What should I check for you at NYTimes.com?"
    return ask(speech)


def _matching_texts(tree, keyword):
    """Return the text of every non-script element that mentions *keyword*.

    Args:
        tree: Root element of the parsed page; only ``iter()`` and each
            element's ``tag`` and ``text`` attributes are used.
        keyword: Word to look for. Matching is case-insensitive.

    Returns:
        A list with the ``text`` of each matching element, in document order.
    """
    # Lowercase once so the comparison against the lowercased element text is
    # case-insensitive on both sides.
    needle = keyword.lower()
    return [
        element.text
        for element in tree.iter()
        # Skip <script> elements so JavaScript that happens to contain the
        # keyword is not read out as news.
        if element.tag != 'script'
        and element.text is not None
        and needle in element.text.lower()
    ]


@assist.action('give-keyword')
def read_headlines(keyword):
    """Answer with all text on the NYTimes.com front page containing *keyword*.

    Args:
        keyword: Word to search for, as passed in by the 'give-keyword'
            action. May be empty or None if no value was provided.

    Returns:
        An ``ask`` response: the matching text joined by newlines, or a
        re-prompt when the keyword is missing, the page cannot be fetched,
        or nothing matches.
    """
    if not keyword:
        return ask("Sorry, I did not catch a keyword. "
                   "What should I check for you at NYTimes.com?")

    # Open the page and parse it into an element tree.
    try:
        with urlopen(NYT_URL, timeout=FETCH_TIMEOUT_SECONDS) as page:
            tree = html5lib.parse(page, namespaceHTMLElements=False)
    except OSError:
        # URLError, HTTPError and socket timeouts are all OSError subclasses.
        logger.exception('Could not fetch %s', NYT_URL)
        return ask("Sorry, I could not reach NYTimes.com right now. "
                   "Do you want to try again?")

    headlines = _matching_texts(tree, keyword)
    if not headlines:
        return ask('Sorry, I found no news about {}. '
                   'Do you want to try another keyword?'.format(keyword))

    # Join the collected pieces of text into one spoken answer.
    text_list_strings = "\n".join(headlines)
    speech = 'Ok, I found the following news about ' + keyword + '. It is juicy!'
    return ask('{} {}'.format(speech, text_list_strings))


if __name__ == '__main__':
    # NOTE(review): debug=True enables Flask's debugger and reloader; use it
    # for local development only.
    app.run(debug=True)