# Script used during Breaking News 02/10
# Scrapes text on a page based on keywords specified in the code
# Based on a tutorial by Michael Murtaugh at the Piet Zwart Institute
# https://pad.xpub.nl/p/prototyping_02102018

import sys  # only needed for the commented-out sys.argv alternative below
import html5lib
import xml.etree.ElementTree as ET  # only needed for the commented-out BONUS below
from urllib.request import urlopen

# Get the url from the terminal
input_var = input("Enter a url to scrape (include https:// etc.): ")
print("Scraping " + input_var)
url = input_var  # alternatively, take it as an argument: sys.argv[1]

# Open the page and parse it into an element tree
with urlopen(url) as f:
    t = html5lib.parse(f, namespaceHTMLElements=False)

# Make lists to put the headings in
trump = []
putin = []
jongun = []

# Walk the element tree of the webpage and go through all text containing
# the word 'trump'. To ignore javascript that happens to contain that word,
# skip any element whose html tag is 'script'.
# Append each found text to the list of headings.

for x in t.iter():
    if x.text is not None and 'trump' in x.text.lower() and x.tag != 'script':
        trump.append(x.text)

# Same for Putin
for y in t.iter():
    if y.text is not None and 'putin' in y.text.lower() and y.tag != 'script':
        putin.append(y.text)

# Same for Kim Jong-un
for z in t.iter():
    if z.text is not None and 'jong-un' in z.text.lower() and z.tag != 'script':
        jongun.append(z.text)

# Done? Great!
# Join each list into a single string,
# then open a txt file and put it there.
# In case the file already exists, the text is just appended at the bottom.
# Super handy if you want to run the script on multiple sites and collect
# headings in just one file.

trumpheadings = " ".join(trump)
print(trumpheadings)
with open("trump.txt", "a+") as text_file:
    text_file.write(trumpheadings)

putinheadings = " ".join(putin)
print(putinheadings)
with open("putin.txt", "a+") as text_file:
    text_file.write(putinheadings)

unheadings = " ".join(jongun)
print(unheadings)
with open("jongun.txt", "a+") as text_file:
    text_file.write(unheadings)

# That's it, folks!
print("Done! Run the script again to scrape another page.")

# Super sketchy, so a bit repetitive. This could be done more efficiently:
# an array of keywords, an array of websites, and then pass those through
# a function. Maybe next time; a commented sketch follows below.

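# A minimal sketch of that refactor (untested; it assumes the parsed tree
# `t` from above and derives each txt filename from its keyword):
#
# keywords = ['trump', 'putin', 'jong-un']
#
# def scrape_keyword(tree, keyword):
#     # Collect all text nodes containing the keyword, skipping <script> tags
#     return [el.text for el in tree.iter()
#             if el.text is not None
#             and keyword in el.text.lower()
#             and el.tag != 'script']
#
# for keyword in keywords:
#     headings = " ".join(scrape_keyword(t, keyword))
#     print(headings)
#     with open(keyword.replace('-', '') + '.txt', 'a+') as f:
#         f.write(headings)
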
# BONUS

# Open a local html file instead of a url
# with open('nytimes.html') as f:
#     t = html5lib.parse(f, namespaceHTMLElements=False)

# Find a link (t.find returns the first match only)
# The notation used to select elements is XPath
# See: https://www.w3schools.com/xml/xpath_syntax.asp
# a = t.find('.//a')

# Find all links, print the href and the text of each link
# for a in t.findall('.//a[@href]'):
#     href = a.attrib.get('href')
#     if not href.startswith('https://www.nytimes.com'):
#         # print(ET.tostring(a, encoding='unicode'))
#         print(href, a.text)  # link, label