You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
64 lines
2.1 KiB
Python
64 lines
2.1 KiB
Python
6 years ago
|
# This script scrapes text on a page that contains a keyword
|
||
|
# Based on a tutorial by Michael Murtaugh at the Piet Zwart Institute
|
||
|
# https://pad.xpub.nl/p/prototyping_02102018
|
||
|
|
||
|
|
||
|
import sys
|
||
|
import html5lib
|
||
|
import xml.etree.ElementTree as ET
|
||
|
from urllib.request import urlopen
|
||
|
|
||
|
# Ask on the terminal which page should be scraped
input_var_url = input("Enter a url to scrape (include https:// etc.): ")

# Ask on the terminal which keyword to search that page for
input_var_keyword = input("Enter a keyword to look for: ")

# Confirm to the user what is about to happen
print("Great, let's start!")
print("Scraping {} to look for {}".format(input_var_url, input_var_keyword))
|
||
|
|
||
|
# This is the function that actually does the scraping
|
||
|
# A function is a block of code which only runs when it is called.
|
||
|
# You can pass data, known as parameters, into a function (in this case the url and keyword).
|
||
|
# A function can return data as a result.
|
||
|
|
||
|
# First define the function
|
||
|
def scrape_text_by_keyword(url, keyword):
    """Collect the text of every element on a web page that mentions a keyword.

    The page at *url* is downloaded and parsed with html5lib. The text of
    each element containing *keyword* (compared case-insensitively) is
    printed and then appended to ``results.txt`` in the current working
    directory, so the results of several runs accumulate in one file.

    Args:
        url: Address of the page to scrape, including the scheme
            (for example ``https://``).
        keyword: Word or phrase to look for.

    Returns:
        None. The results are printed and written to ``results.txt``.
    """
    # Element text is lowercased before matching, so the keyword has to be
    # lowercase too; otherwise a capitalised keyword could never match.
    needle = keyword.lower()

    # Open the page and parse it into an element tree
    with urlopen(url) as f:
        tree = html5lib.parse(f, namespaceHTMLElements=False)

    # Walk every element of the tree and keep the text that contains the
    # keyword. <script> elements are skipped so that JavaScript source which
    # happens to contain the keyword is not reported.
    text_list = [
        element.text
        for element in tree.iter()
        if element.text is not None
        and element.tag != 'script'
        and needle in element.text.lower()
    ]

    # Show what was found on the terminal
    print("\n".join(text_list))

    # Append the results to the bottom of the file (it is created if it does
    # not exist yet). Every entry ends with its own newline so the last line
    # of this run does not run into the first line of the next run. UTF-8 is
    # set explicitly because scraped pages often contain characters that the
    # platform's default encoding cannot represent.
    with open("results.txt", "a", encoding="utf-8") as text_file:
        text_file.writelines(text + "\n" for text in text_list)
|
||
|
|
||
|
|
||
|
# Run the scraper defined above on the url the user typed in.
# The keyword is handed over in lowercase.
scrape_text_by_keyword(url=input_var_url, keyword=input_var_keyword.lower())

# Tell the user that this run has finished
print("Done! Run the script again to scrape another page.")
|
||
|
|