# Script used during Breaking News 02/10
# Scrapes text on a page based on keywords specified in the code
# Based on a tutorial by Michael Murtaugh at the Piet Zwart Institute
# https://pad.xpub.nl/p/prototyping_02102018

import sys
import html5lib
import xml.etree.ElementTree as ET
from urllib.request import urlopen

# Get the url from the terminal
input_var = input("Enter a url to scrape (include https:// etc.): ")
print("Scraping " + input_var)
url = input_var  # could also come from sys.argv[1]; see the sketch at the bottom

# Open the page and parse it into an element tree
with urlopen(url) as f:
    t = html5lib.parse(f, namespaceHTMLElements=False)

# Make lists to put the headings in
trump = []
putin = []
jongun = []

# Walk the element tree of the webpage and collect every text node that
# contains the word 'trump'. Skip <script> tags, so that javascript that
# happens to contain the word is ignored.
for x in t.iter():
    if x.text is not None and 'trump' in x.text.lower() and x.tag != 'script':
        trump.append(x.text)

# Same for Putin
for y in t.iter():
    if y.text is not None and 'putin' in y.text.lower() and y.tag != 'script':
        putin.append(y.text)

# Same for Kim Jong-un
for z in t.iter():
    if z.text is not None and 'jong-un' in z.text.lower() and z.tag != 'script':
        jongun.append(z.text)

# Done? Great!
# Join each list into a single string and write it to a txt file.
# If the file already exists, the text is appended at the bottom.
# Super handy if you want to run the script on multiple sites and collect
# the headings in just one file.
trumpheadings = " ".join(trump)
print(trumpheadings)
text_file = open("trump.txt", "a+")
text_file.write(trumpheadings)
text_file.close()

putinheadings = " ".join(putin)
print(putinheadings)
text_file = open("putin.txt", "a+")
text_file.write(putinheadings)
text_file.close()

unheadings = " ".join(jongun)
print(unheadings)
text_file = open("jongun.txt", "a+")
text_file.write(unheadings)
text_file.close()

# That's it, folks!
print("Done! Run the script again to scrape another page.")

# Super sketchy, so a bit repetitive. This could be done more efficiently:
# a list of keywords, a list of websites, and a function that loops over
# both. Maybe next time; a sketch of that refactor is at the bottom of
# this file.

# BONUS
# Open a local html file instead of a live page:
# with open('nytimes.html') as f:
#     t = html5lib.parse(f, namespaceHTMLElements=False)

# Find the first link (find() returns only the first match).
# The notation used to select elements is XPath.
# See: https://www.w3schools.com/xml/xpath_syntax.asp
# a = t.find('.//a')

# Find all links, print the href and the text of each link
# for a in t.findall('.//a[@href]'):
#     href = a.attrib.get('href')
#     if not href.startswith('https://www.nytimes.com'):
#         # print(ET.tostring(a, encoding='unicode'))
#         print(href, a.text)  # link, label
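
# --------------------------------------------------------------------
# SKETCH: the refactor mentioned above. This is a minimal sketch, not
# part of the original script; the names KEYWORDS, URLS and
# scrape_keywords are made up for illustration. One function walks the
# tree once per page and sorts matching text nodes into a dict keyed by
# keyword; each bucket is then appended to its own txt file, so adding
# a keyword or a site is a one-line change.
#
# KEYWORDS = ['trump', 'putin', 'jong-un']
# URLS = ['https://www.nytimes.com']  # add more sites here
#
# def scrape_keywords(url, keywords):
#     """Return {keyword: [matching text nodes]} for one page."""
#     with urlopen(url) as f:
#         tree = html5lib.parse(f, namespaceHTMLElements=False)
#     found = {kw: [] for kw in keywords}
#     for el in tree.iter():
#         if el.text is None or el.tag == 'script':
#             continue
#         for kw in keywords:
#             if kw in el.text.lower():
#                 found[kw].append(el.text)
#     return found
#
# for site in URLS:
#     for kw, texts in scrape_keywords(site, KEYWORDS).items():
#         with open(kw.replace('-', '') + '.txt', 'a+') as out:
#             out.write(" ".join(texts))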
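
# --------------------------------------------------------------------
# SKETCH: reading the url from the command line. Also not part of the
# original script, just the sys.argv[1] idea from the comment near the
# top, with input() kept as a fallback when no argument is given.
#
# url = sys.argv[1] if len(sys.argv) > 1 else input("Enter a url to scrape: ")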