From 5c97712a2b05461f0bb9e435b21fd6e338c2b327 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rita=20Gra=C3=A7a?=
Date: Sat, 30 Mar 2019 16:43:31 +0100
Subject: [PATCH] tool for image scraping

---
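Notes: the heart of img_index.py is an html5lib parse plus XPath-style
queries over the resulting element tree. A minimal, self-contained
sketch of that step (assuming html5lib is installed; the seed URL
below matches the netloc the script restricts itself to):

    from urllib.request import urlopen
    from urllib.parse import urljoin
    import html5lib

    url = 'http://nothat.bad.mn/'
    with urlopen(url) as f:
        # html5lib builds an ElementTree-compatible tree even from
        # broken markup, so ElementTree's limited XPath works on it
        tree = html5lib.parse(f, namespaceHTMLElements=False)
    for img in tree.findall('.//img[@src]'):
        # resolve relative src attributes against the page URL
        print(urljoin(url, img.attrib.get('src')))

html5lib is a sensible choice here because it parses pages the way
browsers do, which matters on the hand-written index pages this kind
of crawl tends to hit.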
 img_index/img_index.py | 99 +++++++++++++++++++++++++++++++++++++++++
 img_index/style.css    |  7 +++
 2 files changed, 106 insertions(+)
 create mode 100644 img_index/img_index.py
 create mode 100644 img_index/style.css

diff --git a/img_index/img_index.py b/img_index/img_index.py
new file mode 100644
--- /dev/null
+++ b/img_index/img_index.py
@@ -0,0 +1,99 @@
+#!/usr/bin/env python3
+# Crawl a site, download its images, and build an HTML index of them.
+
+import os
+import urllib.request
+from urllib.request import urlopen
+from urllib.error import HTTPError
+from urllib.parse import urlparse, urljoin, urlunparse
+
+import html5lib
+
+
+def nurl(url):
+    # normalize a URL: drop the fragment so the same page is not queued twice
+    p = urlparse(url)
+    return urlunparse((p.scheme, p.netloc, p.path, p.params, p.query, ''))
+
+
+input_var_url = ['http://nothat.bad.mn/']  # pages still to visit
+done = []                                  # pages already visited
+
+while len(input_var_url) > 0:
+    url = input_var_url[0]
+    input_var_url = input_var_url[1:]
+    done.append(url)
+    print('scraping:', url)
+    try:
+        with urlopen(url) as f:
+            p = urlparse(url)
+            # this keeps the crawl contained to nothat.bad.mn; otherwise it
+            # would follow every link out to external sites
+            if p.netloc == 'nothat.bad.mn':
+                t = html5lib.parse(f, namespaceHTMLElements=False)
+
+                # check for redirection
+                if url != f.url:
+                    print("REDIRECT", url, f.url)
+                    url = f.url
+                    done.append(url)
+
+                # list to collect the src of every image on this page
+                img_list = []
+
+                # walk the element tree of the page and grab the src attribute
+                # of every image; the selection syntax used here is XPath
+                print('----- Images found:')
+                for img in t.findall('.//img[@src]'):
+                    img_src = urljoin(url, img.attrib.get('src'))
+                    img_list.append(img_src)
+                    print(img_src)
+
+                # find links (dirs and subdirs) to open and search for more imgs
+                for link in t.findall('.//a[@href]'):
+                    href = nurl(urljoin(url, link.attrib.get('href')))
+                    if href not in done and href not in input_var_url:
+                        print('adding href:', href)
+                        input_var_url.append(href)
+
+                # download each image and save it locally under the
+                # filename taken from its URL
+                for src in img_list:
+                    file_name = src.rsplit('/', 1)[-1]  # part after the last slash
+                    with urllib.request.urlopen(src) as response, open(file_name, 'wb') as out_file:
+                        data = response.read()  # a `bytes` object
+                        out_file.write(data)
+
+                # skip the 'icons' folder, which only holds uninteresting
+                # thumbnails; os.path.basename gets the last folder of the path
+                path = os.path.dirname(img_list[-1]) if img_list else ''
+                print('----- These imgs were inside the folder:', os.path.basename(path))
+
+                # keep every image with one of these extensions that does
+                # not live inside the 'icons' folder
+                extensions = ['.jpg', '.png', '.gif']
+
+                stream = 'index.html'
+                for img_src in img_list:
+                    if img_src.endswith(tuple(extensions)) and os.path.basename(path) != 'icons':
+                        img = '<p><a href="{0}"><img src="{0}"></a></p>'.format(img_src)
+                        out = open(stream, 'a')
+                        out.write(img + '\n')
+                        out.close()
+
+                # write a second file, index2.html, with no duplicate lines
+                if os.path.exists(stream):
+                    stream2 = 'index2.html'
+                    lines_seen = set()  # holds lines already seen
+                    out = open(stream2, 'w')
+                    for line in open(stream, 'r'):
+                        if line not in lines_seen:  # not a duplicate
+                            out.write(line)
+                            lines_seen.add(line)
+                    out.close()
+
+    # this helps debugging errors
+    except HTTPError as e:
+        print("HTTPERROR", e)
+
+    print("Done! Going to the next url")
diff --git a/img_index/style.css b/img_index/style.css
new file mode 100644
index 0000000..13a1a00
--- /dev/null
+++ b/img_index/style.css
@@ -0,0 +1,7 @@
+p {
+    font-size: calc(9px + 0.4vw);
+}
+
+a {
+    color: black;
+}
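-- 
Dependency note: html5lib is the only import outside the standard
library (pip install html5lib). The script is best run from an empty
directory, since the downloaded images and the generated index.html /
index2.html all land in the current working directory.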