# Script used during Breaking News 02/10
# Scrapes text on a page based on keywords specified in the code
# Based on a tutorial by Michael Murtaugh at the Piet Zwart Institute
# https://pad.xpub.nl/p/prototyping_02102018

import sys  # only needed for the commented-out sys.argv alternative below
import html5lib
import xml.etree.ElementTree as ET  # only needed for the commented-out BONUS below
from urllib.request import urlopen

# Get the url from the terminal
input_var = input("Enter a url to scrape (include https:// etc.): ")
print("Scraping " + input_var)
url = input_var  # alternatively, take it as an argument: sys.argv[1]

# Open the page and parse it into an element tree
with urlopen(url) as f:
    t = html5lib.parse(f, namespaceHTMLElements=False)

# Make lists to put the headings in
trump = []
putin = []
jongun = []

# Walk the element tree of the webpage and go through all text containing
# the word 'trump'. To ignore javascript that happens to contain that word,
# skip any element whose html tag is 'script'.
# Append each found text to the list of headings.

for x in t.iter():
    if x.text is not None and 'trump' in x.text.lower() and x.tag != 'script':
        trump.append(x.text)

# Same for Putin
for y in t.iter():
    if y.text is not None and 'putin' in y.text.lower() and y.tag != 'script':
        putin.append(y.text)

# Same for Kim Jong-un
for z in t.iter():
    if z.text is not None and 'jong-un' in z.text.lower() and z.tag != 'script':
        jongun.append(z.text)

# Done? Great!
# Join each list into a single string,
# then open a txt file and put it there.
# In case the file already exists, the text is just appended at the bottom.
# Super handy if you want to run the script on multiple sites and collect
# headings in just one file.

trumpheadings = " ".join(trump)
print(trumpheadings)
with open("trump.txt", "a+") as text_file:
    text_file.write(trumpheadings)

putinheadings = " ".join(putin)
print(putinheadings)
with open("putin.txt", "a+") as text_file:
    text_file.write(putinheadings)

unheadings = " ".join(jongun)
print(unheadings)
with open("jongun.txt", "a+") as text_file:
    text_file.write(unheadings)

# That's it, folks!
print("Done! Run the script again to scrape another page.")

# Super sketchy, so a bit repetitive. This could be done more efficiently:
# an array of keywords, an array of websites, and then pass those through
# a function. Maybe next time; a commented sketch follows below.

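# A minimal sketch of that refactor (untested; it assumes the parsed tree
# `t` from above and derives each txt filename from its keyword):
#
# keywords = ['trump', 'putin', 'jong-un']
#
# def scrape_keyword(tree, keyword):
#     # Collect all text nodes containing the keyword, skipping <script> tags
#     return [el.text for el in tree.iter()
#             if el.text is not None
#             and keyword in el.text.lower()
#             and el.tag != 'script']
#
# for keyword in keywords:
#     headings = " ".join(scrape_keyword(t, keyword))
#     print(headings)
#     with open(keyword.replace('-', '') + '.txt', 'a+') as f:
#         f.write(headings)
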
# BONUS

# Open a local html file instead of a url
# with open('nytimes.html') as f:
#     t = html5lib.parse(f, namespaceHTMLElements=False)

# Find a link (t.find returns the first match only)
# The notation used to select elements is XPath
# See: https://www.w3schools.com/xml/xpath_syntax.asp
# a = t.find('.//a')

# Find all links, print the href and the text of each link
# for a in t.findall('.//a[@href]'):
#     href = a.attrib.get('href')
#     if not href.startswith('https://www.nytimes.com'):
#         # print(ET.tostring(a, encoding='unicode'))
#         print(href, a.text)  # link, label