tool for image scraping

master
Rita Graça 5 years ago
parent c39440d879
commit 5c97712a2b

BIN
img_index/.DS_Store vendored

Binary file not shown.

@@ -0,0 +1,99 @@
# This script scrapes images
# Based on a tutorial by Michael Murtaugh at the Piet Zwart Institute, https://pad.xpub.nl/p/prototyping_02102018 + a scraping workshop at the Piet Zwart Institute
import html5lib  # html5lib is a Python package that implements the HTML5 parsing algorithm
import urllib.request
from urllib.request import urlopen
from urllib.error import HTTPError
from urllib.parse import urljoin, urlparse, urlunparse
import os
# list of urls still to scrape, seeded with the starting page
input_var_url = ["http://nothat.bad.mn/"]
# list of urls already visited
done = []
# reconstructs the url, dropping any query string and fragment
def nurl(url):
    p = urlparse(url)
    return urlunparse((p.scheme, p.netloc, p.path, None, None, None))
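# e.g. nurl('http://nothat.bad.mn/page?x=1#top') returns 'http://nothat.bad.mn/page'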
while len(input_var_url) > 0:
    url = input_var_url[0]
    input_var_url = input_var_url[1:]
    done.append(url)
    print('scraping:', url)
    try:
        with urlopen(url) as f:
            p = urlparse(url)
            # this keeps the crawl contained to nothat.bad.mn; otherwise it would follow links to external sites
            if p.netloc == 'nothat.bad.mn':
                t = html5lib.parse(f, namespaceHTMLElements=False)
                # check for redirection
                if url != f.url:
                    print("REDIRECT", url, f.url)
                    url = f.url
                    done.append(url)
                # make a list to put the sources of the images in
                img_list = []
                # walk the element tree of the webpage, catch all image
                # elements (<img>), and then read their src attribute
                print('----- Images found:')
                for img in t.findall('.//img[@src]'):  # the syntax used here to select elements is XPath
                    img_src = urljoin(url, img.attrib.get('src'))
                    img_list.append(img_src)
                    print(img_src)
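                # e.g. src="pic.jpg" found on http://nothat.bad.mn/dir/ resolves to http://nothat.bad.mn/dir/pic.jpg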
                # find links (dir and subdir) to open and search for more imgs
                for link in t.findall(".//a[@href]"):
                    href = nurl(urljoin(url, link.attrib.get('href')))
                    if href not in done:
                        print('adding href:', href)
                        input_var_url.append(href)
                # download each file and save it locally under the filename specified in its url
                for src in img_list:
                    file_name = src.rsplit('/', 1)[-1]  # get the url part after the last slash
                    with urllib.request.urlopen(src) as response, open(file_name, 'wb') as out_file:
                        data = response.read()  # a `bytes` object
                        out_file.write(data)
                if img_list:
                    # 'os.path.basename' gets the last folder of the path
                    path = os.path.dirname(img_list[-1])
                    print('----- These imgs were inside the folder:', os.path.basename(path))
                # skip any 'icons' folder, which holds only uninteresting thumbnails
                extensions = [".jpg", ".png", ".gif"]
                stream = 'index.html'
                # for every item inside img_list that has one of these extensions and is not inside an 'icons' folder
                for img_src in img_list:
                    path = os.path.dirname(img_src)
                    if img_src.endswith(tuple(extensions)) and os.path.basename(path) != 'icons':
                        img = '<img class="image" src="{}" width="200">'.format(img_src)
                        with open(stream, 'a') as page:
                            page.write("""<!DOCTYPE html>
<html lang="en">""" + '\n' +
                                       """<head> <meta charset="UTF-8">
<link rel="stylesheet" type="text/css" href="style.css"/> </head>""" + '\n' + img)
                # creates a second file, stream2, with the duplicate lines removed
                stream2 = 'index2.html'
                lines_seen = set()  # holds lines already seen
                if os.path.exists(stream):
                    with open(stream2, "w") as cleaned:
                        for line in open(stream, "r"):
                            if line not in lines_seen:  # not a duplicate
                                cleaned.write(line)
                                lines_seen.add(line)
    # printing the error helps with debugging
    except HTTPError as e:
        print("HTTPERROR", e)
    print("Done! Going to the next url")

@@ -0,0 +1,7 @@
p {
    font-size: calc(9px + 0.4vw);
}
a {
    color: black;
}