diff --git a/img_index/.DS_Store b/img_index/.DS_Store
new file mode 100644
index 0000000..f342e6e
Binary files /dev/null and b/img_index/.DS_Store differ
diff --git a/img_index/img_index.py b/img_index/img_index.py
new file mode 100644
index 0000000..d47bcea
--- /dev/null
+++ b/img_index/img_index.py
@@ -0,0 +1,96 @@
+# This script scrapes images from nothat.bad.mn and builds an HTML index of them.
+# Based on a tutorial by Michael Murtaugh at the Piet Zwart Institute,
+# https://pad.xpub.nl/p/prototyping_02102018 + scraping workshop at Piet Zwart Institute
+
+import os
+import html5lib  # html5lib is a Python package that implements the HTML5 parsing algorithm
+import urllib.request
+from urllib.request import urlopen
+from urllib.error import HTTPError
+from urllib.parse import urljoin, urlparse, urlunparse
+
+# queue of urls to scrape, and a list of urls that are already done
+input_var_url = ["http://nothat.bad.mn/"]
+done = []
+
+# reconstructs the url, dropping its query string and fragment
+def nurl(url):
+    p = urlparse(url)
+    return urlunparse((p.scheme, p.netloc, p.path, None, None, None))
+
+while len(input_var_url) > 0:
+    url = input_var_url[0]
+    input_var_url = input_var_url[1:]
+    done.append(url)
+    print('scraping:', url)
+
+    try:
+        with urlopen(url) as f:
+            p = urlparse(url)
+            # this keeps the crawl contained to nothat.bad.mn; otherwise it would follow every link to external sites
+            if p.netloc == 'nothat.bad.mn':
+                t = html5lib.parse(f, namespaceHTMLElements=False)
+                # check for redirection
+                if url != f.url:
+                    print("REDIRECT", url, f.url)
+                    url = f.url
+                    done.append(url)
+
+                # list to collect the src urls of the images
+                img_list = []
+
+                # Walk the element tree of the page, catch all img elements
+                # and read their src attribute
+                print('----- Images found:')
+                for img in t.findall('.//img[@src]'):  # the selector syntax used here is XPath
+                    img_src = urljoin(url, img.attrib.get('src'))
+                    img_list.append(img_src)
+                    print(img_src)
+
+                # find links (dirs and subdirs) to open and search for more imgs
+                for link in t.findall(".//a[@href]"):
+                    href = nurl(urljoin(url, link.attrib.get('href')))
+                    if href not in done:
+                        print('adding href:', href)
+                        input_var_url.append(href)
+
+                # Download each image and save it locally under the filename given in its url
+                for src in img_list:
+                    file_name = src.rsplit('/', 1)[-1]  # get the url part after the last slash
+                    with urllib.request.urlopen(src) as response, open(file_name, 'wb') as out_file:
+                        data = response.read()  # a `bytes` object
+                        out_file.write(data)
+
+                # report which folder these imgs came from;
+                # 'os.path.basename' gets the last folder of the path
+                if img_list:
+                    path = os.path.dirname(img_list[-1])
+                    print('----- These imgs were inside the folder:', os.path.basename(path))
+
+                # for every item in img_list that has one of these extensions and is not
+                # inside the 'icons' folder (which only holds uninteresting thumbnails)
+                extensions = [".jpg", ".png", ".gif"]
+                stream = 'index.html'
+                for img_src in img_list:
+                    folder = os.path.basename(os.path.dirname(img_src))
+                    if img_src.endswith(tuple(extensions)) and folder != 'icons':
+                        img = '<p><a href="{0}"><img src="{0}"></a></p>'.format(img_src)
+                        with open(stream, 'a') as f_out:
+                            f_out.write('<link rel="stylesheet" href="style.css">' + '\n')
+                            f_out.write(img + '\n')
+
+                # write a stream2 file with the duplicate lines removed
+                stream2 = 'index2.html'
+                if os.path.exists(stream):
+                    lines_seen = set()  # holds lines already seen
+                    with open(stream2, "w") as f_out:
+                        for line in open(stream, "r"):
+                            if line not in lines_seen:  # not a duplicate
+                                f_out.write(line)
+                                lines_seen.add(line)
+
+    # this helps with debugging errors
+    except HTTPError as e:
+        print("HTTPERROR", e)
+
+    print("Done! Going to the next url")
diff --git a/img_index/style.css b/img_index/style.css
new file mode 100644
index 0000000..13a1a00
--- /dev/null
+++ b/img_index/style.css
@@ -0,0 +1,7 @@
+p {
+    font-size: calc(9px + 0.4vw);
+}
+
+a {
+    color: black;
+}
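
A note on the nurl() helper in the diff above: it rebuilds each discovered link without its query string and fragment before queueing it, so url variants that point at the same page (for instance the ?C=M;O=A style sort links that server directory listings often append) collapse to one entry in `done` and are only scraped once. A minimal standalone check of that behaviour (the example urls are made up):

    from urllib.parse import urlparse, urlunparse

    def nurl(url):
        p = urlparse(url)
        return urlunparse((p.scheme, p.netloc, p.path, None, None, None))

    # both variants collapse to the same normalized url
    print(nurl("http://nothat.bad.mn/pics/?C=M;O=A"))  # http://nothat.bad.mn/pics/
    print(nurl("http://nothat.bad.mn/pics/#top"))      # http://nothat.bad.mn/pics/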