# This script scrapes the links and the images from a webpage
# Based on a tutorial by Michael Murtaugh at Piet Zwart Institute
# https://pad.xpub.nl/p/prototyping_02102018

import sys
import html5lib                       # parses HTML into an element tree
import xml.etree.ElementTree as ET
import requests
import urllib.request
from urllib.request import urlopen    # to open the webpage and the image files
from urllib.parse import urljoin      # to turn relative links into absolute urls

# Get the url from the terminal
input_var_url = input("Enter a url to scrape (include https:// etc.): ")

print("SCRAPE, RINSE, REPEAT! Luvvit")
print("Scraping links and images from " + input_var_url)

# A function is a block of code which only runs when it is called.
# You can pass data, known as parameters, into a function (in this case the url).
# A function can return data as a result.
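
# As a minimal illustration of those three ideas (a hypothetical example, not
# used by the scraper itself):
def shout(text):
    # Take a parameter, transform it, and return the result
    return text.upper() + "!"

# Calling shout("hello") returns the string "HELLO!"
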
# First define the functions that do the scraping

# This function makes a file with all links on a webpage
def scrape_links(url):
    # Open the page
    with urlopen(url) as f:
        t = html5lib.parse(f, namespaceHTMLElements=False)

    # Make a list to put the links in
    link_list = []

    # Walk the element tree of the webpage. Catch all elements which are links
    # (<a> so to say), then get the href attribute and the link text
    print("Links found:")
    with open("link_results.txt", "a+") as link_file:  # open the results file once, in append mode
        for a in t.findall('.//a[@href]'):  # the syntax used here to select elements is XPath
            href = urljoin(url, a.attrib.get('href'))
            link_list.append(href)

            print(a.text, href)
            print(a.text, href, file=link_file)  # Print and write to file

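# A few more selector patterns that ElementTree's findall() understands, shown
# only as an illustration; they are not used by this script:
#
#   t.findall('.//h1')          # every <h1> heading anywhere in the tree
#   t.findall('.//a[@class]')   # every <a> element that has a class attribute
#   t.findall('.//ul/li')       # <li> elements that are direct children of a <ul>
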
# This function downloads the images on a webpage
def scrape_images(url):
    # Open the page
    with urlopen(url) as f:
        t = html5lib.parse(f, namespaceHTMLElements=False)

    # Make a list to put the sources of the images in
    img_list = []

    # Walk the element tree of the webpage. Catch all elements that are images
    # (<img> so to say), then get the src attribute
    print('Images found:')
    for img in t.findall('.//img[@src]'):  # the syntax used here to select elements is XPath
        img_src = urljoin(url, img.attrib.get('src'))
        print(img_src)
        img_list.append(img_src)

    # Download each image and save it locally under the filename specified in its url
    for src in img_list:
        file_name = src.rsplit('/', 1)[-1]  # Get the url part after the last slash, e.g. "cat.png"
        with urllib.request.urlopen(src) as response, open(file_name, 'wb') as out_file:
            data = response.read()  # a `bytes` object
            out_file.write(data)

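# The same download step could also be written with the `requests` library that
# is imported above. This is only a sketch of an alternative, not what the
# script runs:
#
#   response = requests.get(src)
#   with open(file_name, 'wb') as out_file:
#       out_file.write(response.content)
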
# Let's call the functions above
# and run them with the url that was filled in
scrape_links(input_var_url)
scrape_images(input_var_url)

# That's it, folks!
print("Done! Run the script again to scrape another page.")
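
# An example session might look like this (assuming the script is saved as
# scrape.py; the links and images found depend on the page you enter):
#
#   $ python3 scrape.py
#   Enter a url to scrape (include https:// etc.): https://example.com
#   SCRAPE, RINSE, REPEAT! Luvvit
#   Scraping links and images from https://example.com
#   Links found:
#   ...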