# This script scrapes images
# Based on a tutorial by Michael Murtaugh at Piet Zwart Institute,
# https://pad.xpub.nl/p/prototyping_02102018 + scraping workshop at Piet Zwart Institute

import os
import html5lib  # html5lib is a Python package that implements the HTML5 parsing algorithm
from urllib.request import urlopen
from urllib.error import HTTPError
from urllib.parse import urljoin, urlparse, urlunparse

# creates a url list
input_var_url = ["http://nothat.bad.mn/"]
done = []

# reconstructs the url, dropping its params, query and fragment
def nurl(url):
    p = urlparse(url)
    return urlunparse((p.scheme, p.netloc, p.path, None, None, None))

while len(input_var_url) > 0:
    url = input_var_url[0]
    input_var_url = input_var_url[1:]
    done.append(url)
    print('scraping:', url)
    try:
        with urlopen(url) as f:
            p = urlparse(url)
            # this keeps the crawl contained to nothat.bad.mn; otherwise it
            # would follow every link out to external sites
            if p.netloc == 'nothat.bad.mn':
                t = html5lib.parse(f, namespaceHTMLElements=False)

                # check for redirection
                if url != f.url:
                    print("REDIRECT", url, f.url)
                    url = f.url
                    done.append(url)

                # Make a list to put the sources of the images in
                img_list = []

                # Walk the element tree of the page, catch all <img> elements
                # and read their src attribute
                print('----- Images found:')
                for img in t.findall('.//img[@src]'):  # the syntax used here to select elements is XPath
                    img_src = urljoin(url, img.attrib.get('src'))
                    img_list.append(img_src)
                    print(img_src)

                # find links (dir and subdir) to open and search for more imgs
                for link in t.findall(".//a[@href]"):
                    href = nurl(urljoin(url, link.attrib.get('href')))
                    if href not in done:
                        print('adding href:', href)
                        input_var_url.append(href)

                # Download each image and save it locally under the filename
                # specified in its url
                for src in img_list:
                    file_name = src.rsplit('/', 1)[-1]  # the url part after the last slash
                    with urlopen(src) as response, open(file_name, 'wb') as out_file:
                        data = response.read()  # a `bytes` object
                        out_file.write(data)

                # 'os.path.basename' gets the last folder of a path
                if img_list:
                    print('----- These imgs were inside the folder:',
                          os.path.basename(os.path.dirname(img_list[-1])))

                # for every item inside img_list that has one of these extensions
                # and is not inside the 'icons' folder (which holds only
                # uninteresting thumbnails), append an img tag to index.html
                extensions = [".jpg", ".png", ".gif"]
                stream = 'index.html'
                for img_src in img_list:
                    path = os.path.dirname(img_src)
                    if img_src.endswith(tuple(extensions)) and os.path.basename(path) != 'icons':
                        # the markup literals originally written around each tag
                        # were lost when the script was published; one bare <img>
                        # tag per line is an assumed reconstruction
                        img = '<img src="{}">'.format(img_src)
                        with open(stream, 'a') as out:
                            out.write(img + '\n')

                # creates a stream2 file with no duplicate lines
                stream2 = 'index2.html'
                lines_seen = set()  # holds lines already seen
                with open(stream2, "w") as out:
                    for line in open(stream, "r"):
                        if line not in lines_seen:  # not a duplicate
                            out.write(line)
                            lines_seen.add(line)

    # this helps debugging errors
    except HTTPError as e:
        print("HTTPERROR", e)
    print("Done! Going to the next url")
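
# Usage (a minimal sketch; the filename scraper.py is an assumption, the
# script can be saved under any name):
#
#   python3 scraper.py
#
# Downloaded images land in the current working directory; index.html
# accumulates one <img> tag per matching image, and index2.html is the same
# list with duplicate lines removed.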