# grad_prototypes/workshop_pyratechnic_session_1/scrapescripts/html5lib/scrape_txt_keyword.py

# This script scrapes text on a page that contains a keyword
# Based on a tutorial of Michael Murtaugh at Piet Zwart Institute
# https://pad.xpub.nl/p/prototyping_02102018


import sys
import html5lib
import xml.etree.ElementTree as ET
from urllib.request import urlopen

# Ask on the terminal which page should be scraped
input_var_url = input("Enter a url to scrape (include https:// etc.): ")

# Ask on the terminal which keyword to search that page for
input_var_keyword = input("Enter a keyword to look for: ")

# Tell the user what is about to happen
print("Great, let's start!")
print(f"Scraping {input_var_url} to look for {input_var_keyword}")

# This is the function that actually does the scraping
# A function is a block of code which only runs when it is called.
# You can pass data, known as parameters, into a function. (in this case the url and keyword)
# A function can return data as a result.

# First define the function
def scrape_text_by_keyword(url, keyword, output_path="results.txt"):
    """Collect the text of every element on a web page that mentions a keyword.

    The page at *url* is downloaded and parsed with html5lib. The text of
    each element is compared with *keyword*, ignoring upper/lower case.
    Every match is printed and appended to *output_path*.

    Parameters:
        url: Address of the page, including the scheme (https:// etc.).
        keyword: Word or phrase to look for. An empty keyword matches
            every element that has text.
        output_path: Text file the matches are appended to. The file is
            created when it does not exist yet.

    Returns:
        The list of matching texts, in the order the tree is walked.
    """
    # Lowercase the keyword here as well, because it is compared with
    # lowercased page text below. This way a keyword with capitals in it
    # still finds its matches.
    needle = keyword.lower()

    # Open the page and parse it into an element tree
    with urlopen(url) as f:
        tree = html5lib.parse(f, namespaceHTMLElements=False)

    # Go through every element of the tree and keep the text of those that
    # contain the keyword. Elements with the html tag 'script' are skipped,
    # so javascript that happens to contain the keyword is ignored.
    text_list = [
        element.text
        for element in tree.iter()
        if element.tag != 'script'
        and element.text is not None
        and needle in element.text.lower()
    ]

    # Change the list into one string with every match on its own line
    text_list_strings = "\n".join(text_list)
    print(text_list_strings)

    # Append to the txt file, so the script can be run for multiple sites
    # and all text is collected in just one file.
    # utf-8 is set explicitly because web pages often contain characters
    # that the default encoding of some systems cannot write.
    with open(output_path, "a", encoding="utf-8") as text_file:
        if text_list:
            # End with a newline, otherwise the first match of the next
            # run is glued to the last match of this one.
            text_file.write(text_list_strings + "\n")

    return text_list


# Now run the scraper defined above on the url and keyword that were typed in.
# The keyword is handed over in lowercase.
scrape_text_by_keyword(url=input_var_url, keyword=input_var_keyword.lower())

# All finished, tell the user
print("Done! Run the script again to scrape another page.")
Added sketches from t4 6 years ago			`# This script scrapes text on a page that contains a keyword`
			`# Based on a tutorial of Michael Murtaugh at Piet Zwart Institute`
			`# https://pad.xpub.nl/p/prototyping_02102018`


			`import sys`
			`import html5lib`
			`import xml.etree.ElementTree as ET`
			`from urllib.request import urlopen`

			`# get the url from the terminal`
			`input_var_url = input("Enter a url to scrape (include https:// etc.): ")`

			`# get the keyword to look for from the terminal`
			`input_var_keyword = input("Enter a keyword to look for: ")`

			`print ("Great, let's start!")`
			`print ("Scraping " + input_var_url + " to look for " + input_var_keyword)`

			`# This is the function that actually does the scraping`
			`# A function is a block of code which only runs when it is called.`
			`# You can pass data, known as parameters, into a function. (in this case the url and keyword)`
			`# A function can return data as a result.`

			`# First define the function`
			`def scrape_text_by_keyword (url, keyword):`
			`# Open the page`
			`with urlopen(url) as f:`
			`t = html5lib.parse(f, namespaceHTMLElements=False)`

			`# Make lists to put the headings in`
			`text_list = []`


			`# Open the xml tree map of the webpage. Go through all text containing`
			`# the word 'trump'. To ignore javascript having that word, ignore when`
			`# the html tag of the text is script.`
			`# Append each found text to the list with headings.`

			`for x in t.iter():`
			`if x.text != None and keyword in x.text.lower() and x.tag != 'script':`
			`text_list.append(x.text)`

			`# Done? Great!`
			`# Change the list into a collection of strings`
			`# Open a txt file and put them there`
			`# In case the file already exists, then just paste it at the bottom`
			`# Super handy if you want to run the script for multiple sites and collect text`
			`# in just one file`
			`text_list_strings = "\n".join(text_list)`
			`print(text_list_strings)`
			`text_file = open("results.txt", "a+")`
			`text_file.write(text_list_strings)`
			`text_file.close()`


			`# Let's call the function above`
			`# and run it with the url and keyword that is filled in`
			`scrape_text_by_keyword(input_var_url, input_var_keyword.lower())`

			`# That's it, folks!`
			`print("Done! Run the script again to scrape another page.")`