# This script scrapes the links and images from a webpage
# Based on a tutorial by Michael Murtaugh at the Piet Zwart Institute
# https://pad.xpub.nl/p/prototyping_02102018

import html5lib
from urllib.request import urlopen
from urllib.parse import urljoin

# Get the url from the terminal
input_var_url = input("Enter a url to scrape (include https:// etc.): ")
print("SCRAPE, RINSE, REPEAT! Luvvit")
print("Scraping links and images from " + input_var_url)

# A function is a block of code which only runs when it is called.
# You can pass data, known as parameters, into a function (in this case, the url).
# A function can return data as a result.
# First define the functions.

# This function writes all links on a webpage to a file
def scrape_links(url):
    # Open the page and parse it into an element tree
    with urlopen(url) as f:
        t = html5lib.parse(f, namespaceHTMLElements=False)
    # Walk the tree of the webpage and catch every element that is a link,
    # then read its href attribute and its link text
    print("Links found:")
    # Open the results file once; mode "a" appends, so results pile up across runs
    with open("link_results.txt", "a") as link_file:
        for a in t.findall('.//a[@href]'):
            # the syntax used here to select elements is XPath
            href = urljoin(url, a.attrib.get('href'))
            print(a.text, href)
            print(a.text, href, file=link_file)  # Print and write to file

# This function downloads the images on a webpage
def scrape_images(url):
    # Open the page and parse it into an element tree
    with urlopen(url) as f:
        t = html5lib.parse(f, namespaceHTMLElements=False)
    # Make a list to put the sources of the images in
    img_list = []
    # Walk the tree of the webpage and catch every element that is an image,
    # then read its src attribute
    print('Images found:')
    for img in t.findall('.//img[@src]'):
        # the syntax used here to select elements is XPath
        img_src = urljoin(url, img.attrib.get('src'))
        print(img_src)
        img_list.append(img_src)
    # Download each file from `src` and save it locally
    # under the filename specified in the url:
    for src in img_list:
        file_name = src.rsplit('/', 1)[-1]  # Get the url part after the last slash
        with urlopen(src) as response, open(file_name, 'wb') as out_file:
            data = response.read()  # a `bytes` object
            out_file.write(data)

# Call the functions above
# and run them with the url that was filled in
scrape_links(input_var_url)
scrape_images(input_var_url)

# That's it, folks!
print("Done! Run the script again to scrape another page.")
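
# One caveat about the download loop above: if a single image url is broken,
# urlopen raises an exception and the whole script stops. Below is a minimal
# sketch of the same loop hardened with try/except, so one bad src is skipped
# instead of crashing the run. This function is an addition for illustration,
# not part of the original tutorial, and is defined here but never called.
def scrape_images_safely(url):
    with urlopen(url) as f:
        t = html5lib.parse(f, namespaceHTMLElements=False)
    for img in t.findall('.//img[@src]'):
        img_src = urljoin(url, img.attrib.get('src'))
        file_name = img_src.rsplit('/', 1)[-1]
        try:
            with urlopen(img_src) as response, open(file_name, 'wb') as out_file:
                out_file.write(response.read())
            print("Saved", file_name)
        except OSError as err:  # urllib.error.URLError is a subclass of OSError
            print("Skipped", img_src, "-", err)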