# Script used during Breaking News 02/10
# Scrapes text on a page based on keywords specified in the code
# Based on a tutorial by Michael Murtaugh at the Piet Zwart Institute
# https://pad.xpub.nl/p/prototyping_02102018
import sys
import html5lib
import xml.etree.ElementTree as ET
from urllib.request import urlopen
# Get the URL from the terminal
input_var = input("Enter a url to scrape (include https:// etc.): ")
print("Scraping " + input_var)
url = input_var  # sys.argv[1]
# Open the page
with urlopen(url) as f:
    t = html5lib.parse(f, namespaceHTMLElements=False)
# Make lists to put the headings in
trump = []
putin = []
jongun = []
# Walk the XML tree of the webpage and collect every text node that
# contains the word 'trump'. To skip JavaScript that happens to mention
# the word, ignore text whose HTML tag is 'script'.
# Append each matching text to the list of headings.
for x in t.iter():
    if x.text is not None and 'trump' in x.text.lower() and x.tag != 'script':
        trump.append(x.text)
# Same for Putin
for y in t.iter():
    if y.text is not None and 'putin' in y.text.lower() and y.tag != 'script':
        putin.append(y.text)
# Same for Kim Jong-Un
for z in t.iter():
    if z.text is not None and 'jong-un' in z.text.lower() and z.tag != 'script':
        jongun.append(z.text)
# Done? Great!
# Join each list into a single string,
# open a txt file and write the string there.
# If the file already exists, the text is appended at the bottom.
# Super handy if you want to run the script for multiple sites and collect
# headings in just one file.
trumpheadings = " ".join(trump)
print(trumpheadings)
with open("trump.txt", "a+") as text_file:
    text_file.write(trumpheadings)
putinheadings = " ".join(putin)
print(putinheadings)
with open("putin.txt", "a+") as text_file:
    text_file.write(putinheadings)
unheadings = " ".join(jongun)
print(unheadings)
with open("jongun.txt", "a+") as text_file:
    text_file.write(unheadings)
# That's it, folks!
print("Done! Run the script again to scrape another page.")
# Super sketchy, so a bit repetitive. This could be done more efficiently:
# a list of keywords, a list of websites, and a function that takes both.
# Maybe next time. A minimal sketch of that refactor follows.
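# Left commented out, like the BONUS below. The keyword and URL lists are
# hypothetical examples, not part of the original script:
#
# def collect_texts(tree, keyword):
#     # Return all non-script text nodes that mention the keyword.
#     return [el.text for el in tree.iter()
#             if el.text is not None
#             and keyword in el.text.lower()
#             and el.tag != 'script']
#
# keywords = ['trump', 'putin', 'jong-un']
# urls = ['https://www.nytimes.com/']  # hypothetical list of pages
# for page in urls:
#     with urlopen(page) as g:
#         tree = html5lib.parse(g, namespaceHTMLElements=False)
#     for keyword in keywords:
#         # One output file per keyword, appended to on every run
#         with open(keyword.replace('-', '') + '.txt', 'a+') as out:
#             out.write(" ".join(collect_texts(tree, keyword)))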
# BONUS
# Open a local html file instead of a live URL:
# with open('nytimes.html') as f:
#     t = html5lib.parse(f, namespaceHTMLElements=False)
# Find the first link (find() returns only the first match)
# The notation used to select elements is XPath
# See: https://www.w3schools.com/xml/xpath_syntax.asp
# a = t.find('.//a')
# Find all links, print the href and the text of each link
# for a in t.findall('.//a[@href]'):
#     href = a.attrib.get('href')
#     if not href.startswith('https://www.nytimes.com'):
#         # print(ET.tostring(a, encoding='unicode'))
#         print(href, a.text)  # link, label