# This script scrapes images
# Based on a tutorial by Michael Murtaugh at Piet Zwart Institute,
# https://pad.xpub.nl/p/prototyping_02102018 + scraping workshop at Piet Zwart Institute

import os
import html5lib  # html5lib is a Python package that implements the HTML5 parsing algorithm
from urllib.request import urlopen
from urllib.error import HTTPError
from urllib.parse import urljoin, urlparse, urlunparse

# creates a url list
input_var_url = ["http://nothat.bad.mn/"]
done = []

# reconstructs the url, dropping its params, query and fragment
def nurl(url):
    p = urlparse(url)
    return urlunparse((p.scheme, p.netloc, p.path, None, None, None))

while len(input_var_url) > 0:
    url = input_var_url[0]
    input_var_url = input_var_url[1:]
    done.append(url)
    print('scraping:', url)
    try:
        with urlopen(url) as f:
            p = urlparse(url)
            # this keeps the crawl contained to nothat.bad.mn; otherwise it
            # would follow every link out to external sites
            if p.netloc == 'nothat.bad.mn':
                t = html5lib.parse(f, namespaceHTMLElements=False)

                # check for redirection
                if url != f.url:
                    print("REDIRECT", url, f.url)
                    url = f.url
                    done.append(url)

                # Make a list to put the sources of the images in
                img_list = []

                # Walk the element tree of the page, catch all <img> elements
                # and read their src attribute
                print('----- Images found:')
                for img in t.findall('.//img[@src]'):  # the syntax used here to select elements is XPath
                    img_src = urljoin(url, img.attrib.get('src'))
                    img_list.append(img_src)
                    print(img_src)

                # find links (dir and subdir) to open and search for more imgs
                for link in t.findall(".//a[@href]"):
                    href = nurl(urljoin(url, link.attrib.get('href')))
                    if href not in done:
                        print('adding href:', href)
                        input_var_url.append(href)

                # Download each image and save it locally under the filename
                # specified in its url
                for src in img_list:
                    file_name = src.rsplit('/', 1)[-1]  # the url part after the last slash
                    with urlopen(src) as response, open(file_name, 'wb') as out_file:
                        data = response.read()  # a `bytes` object
                        out_file.write(data)

                # 'os.path.basename' gets the last folder of a path
                if img_list:
                    print('----- These imgs were inside the folder:',
                          os.path.basename(os.path.dirname(img_list[-1])))

                # for every item inside img_list that has one of these extensions
                # and is not inside the 'icons' folder (which holds only
                # uninteresting thumbnails), append an img tag to index.html
                extensions = [".jpg", ".png", ".gif"]
                stream = 'index.html'
                for img_src in img_list:
                    path = os.path.dirname(img_src)
                    if img_src.endswith(tuple(extensions)) and os.path.basename(path) != 'icons':
                        # the markup literals originally written around each tag
                        # were lost when the script was published; one bare <img>
                        # tag per line is an assumed reconstruction
                        img = '<img src="{}">'.format(img_src)
                        with open(stream, 'a') as out:
                            out.write(img + '\n')

                # creates a stream2 file with no duplicate lines
                stream2 = 'index2.html'
                lines_seen = set()  # holds lines already seen
                with open(stream2, "w") as out:
                    for line in open(stream, "r"):
                        if line not in lines_seen:  # not a duplicate
                            out.write(line)
                            lines_seen.add(line)

    # this helps debugging errors
    except HTTPError as e:
        print("HTTPERROR", e)
    print("Done! Going to the next url")
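
# Usage (a minimal sketch; the filename scraper.py is an assumption, the
# script can be saved under any name):
#
#   python3 scraper.py
#
# Downloaded images land in the current working directory; index.html
# accumulates one <img> tag per matching image, and index2.html is the same
# list with duplicate lines removed.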