# This script scrapes text on a page that contains a keyword
# Based on a tutorial by Michael Murtaugh at the Piet Zwart Institute
# https://pad.xpub.nl/p/prototyping_02102018

import html5lib
from urllib.request import urlopen

# Get the url from the terminal
input_var_url = input("Enter a url to scrape (include https:// etc.): ")

# Get the keyword to look for from the terminal
input_var_keyword = input("Enter a keyword to look for: ")

print("Great, let's start!")
print("Scraping " + input_var_url + " to look for " + input_var_keyword)

# This is the function that actually does the scraping.
# A function is a block of code that only runs when it is called.
# You can pass data, known as parameters, into a function
# (in this case the url and the keyword).
# A function can return data as a result.

# First define the function
def scrape_text_by_keyword(url, keyword):
    # Open the page and parse it into an element tree
    with urlopen(url) as f:
        t = html5lib.parse(f, namespaceHTMLElements=False)

    # Make a list to put the matching text in
    text_list = []

    # Walk the element tree of the webpage and go through all text
    # containing the keyword. To ignore javascript that happens to
    # contain the keyword, skip elements whose tag is 'script'.
    # Append each found text to the list.
    for x in t.iter():
        if x.text is not None and keyword in x.text.lower() and x.tag != 'script':
            text_list.append(x.text)

    # Done? Great!
    # Join the list into a single string
    # and write it to a txt file.
    # In case the file already exists, the results are pasted at the bottom.
    # Super handy if you want to run the script for multiple sites and
    # collect the text in just one file.
    text_list_strings = "\n".join(text_list)
    print(text_list_strings)
    with open("results.txt", "a+") as text_file:
        text_file.write(text_list_strings)

# Let's call the function defined above
# and run it with the url and keyword that were filled in
scrape_text_by_keyword(input_var_url, input_var_keyword.lower())

# That's it, folks!
print("Done! Run the script again to scrape another page.")
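
# A small optional sketch: because results.txt is opened in append mode
# ("a+"), calling the function several times collects everything in one
# file. The url and keywords below are made-up examples, not part of the
# original tutorial.
#
#     for kw in ["cats", "dogs"]:
#         scrape_text_by_keyword("https://example.com", kw)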