# This script scrapes the links and the images from a webpage
# Based on a tutorial by Michael Murtaugh at Piet Zwart Institute
# https://pad.xpub.nl/p/prototyping_02102018

import sys
import html5lib                       # parses HTML into an element tree
import xml.etree.ElementTree as ET
import requests
import urllib.request
from urllib.request import urlopen    # to open the webpage and the image files
from urllib.parse import urljoin      # to turn relative links into absolute urls

# Get the url from the terminal
input_var_url = input("Enter a url to scrape (include https:// etc.): ")

print("SCRAPE, RINSE, REPEAT! Luvvit")
print("Scraping links and images from " + input_var_url)

# A function is a block of code which only runs when it is called.
# You can pass data, known as parameters, into a function (in this case the url).
# A function can return data as a result.
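
# As a minimal illustration of those three ideas (a hypothetical example, not
# used by the scraper itself):
def shout(text):
    # Take a parameter, transform it, and return the result
    return text.upper() + "!"

# Calling shout("hello") returns the string "HELLO!"
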
# First define the functions that do the scraping

# This function makes a file with all links on a webpage
def scrape_links(url):
    # Open the page
    with urlopen(url) as f:
        t = html5lib.parse(f, namespaceHTMLElements=False)

    # Make a list to put the links in
    link_list = []

    # Walk the element tree of the webpage. Catch all elements which are links
    # (<a> so to say), then get the href attribute and the link text
    print("Links found:")
    with open("link_results.txt", "a+") as link_file:  # open the results file once, in append mode
        for a in t.findall('.//a[@href]'):  # the syntax used here to select elements is XPath
            href = urljoin(url, a.attrib.get('href'))
            link_list.append(href)

            print(a.text, href)
            print(a.text, href, file=link_file)  # Print and write to file

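# A few more selector patterns that ElementTree's findall() understands, shown
# only as an illustration; they are not used by this script:
#
#   t.findall('.//h1')          # every <h1> heading anywhere in the tree
#   t.findall('.//a[@class]')   # every <a> element that has a class attribute
#   t.findall('.//ul/li')       # <li> elements that are direct children of a <ul>
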
# This function downloads the images on a webpage
def scrape_images(url):
    # Open the page
    with urlopen(url) as f:
        t = html5lib.parse(f, namespaceHTMLElements=False)

    # Make a list to put the sources of the images in
    img_list = []

    # Walk the element tree of the webpage. Catch all elements that are images
    # (<img> so to say), then get the src attribute
    print('Images found:')
    for img in t.findall('.//img[@src]'):  # the syntax used here to select elements is XPath
        img_src = urljoin(url, img.attrib.get('src'))
        print(img_src)
        img_list.append(img_src)

    # Download each image and save it locally under the filename specified in its url
    for src in img_list:
        file_name = src.rsplit('/', 1)[-1]  # Get the url part after the last slash, e.g. "cat.png"
        with urllib.request.urlopen(src) as response, open(file_name, 'wb') as out_file:
            data = response.read()  # a `bytes` object
            out_file.write(data)

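# The same download step could also be written with the `requests` library that
# is imported above. This is only a sketch of an alternative, not what the
# script runs:
#
#   response = requests.get(src)
#   with open(file_name, 'wb') as out_file:
#       out_file.write(response.content)
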
# Let's call the functions above
# and run them with the url that was filled in
scrape_links(input_var_url)
scrape_images(input_var_url)

# That's it, folks!
print("Done! Run the script again to scrape another page.")
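
# An example session might look like this (assuming the script is saved as
# scrape.py; the links and images found depend on the page you enter):
#
#   $ python3 scrape.py
#   Enter a url to scrape (include https:// etc.): https://example.com
#   SCRAPE, RINSE, REPEAT! Luvvit
#   Scraping links and images from https://example.com
#   Links found:
#   ...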