tool for image scraping

master
Rita Graça 5 years ago
parent c39440d879
commit 5c97712a2b

BIN
img_index/.DS_Store vendored

Binary file not shown.

@@ -0,0 +1,99 @@
# This script scrapes images
# Based on a tutorial by Michael Murtaugh at the Piet Zwart Institute, https://pad.xpub.nl/p/prototyping_02102018 + a scraping workshop at the Piet Zwart Institute
import html5lib  # html5lib is a Python package that implements the HTML5 parsing algorithm
import urllib.request
from urllib.request import urlopen
from urllib.error import HTTPError
from urllib.parse import urljoin, urlparse, urlunparse
import os
# list of urls still to scrape, seeded with the starting page
input_var_url = ["http://nothat.bad.mn/"]
# list of urls already visited
done = []
# reconstructs the url, dropping any query string and fragment
def nurl(url):
    p = urlparse(url)
    return urlunparse((p.scheme, p.netloc, p.path, None, None, None))
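# e.g. nurl('http://nothat.bad.mn/page?x=1#top') returns 'http://nothat.bad.mn/page'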
while len(input_var_url) > 0:
    url = input_var_url[0]
    input_var_url = input_var_url[1:]
    done.append(url)
    print('scraping:', url)
    try:
        with urlopen(url) as f:
            p = urlparse(url)
            # this keeps the crawl contained to nothat.bad.mn; otherwise it would follow links to external sites
            if p.netloc == 'nothat.bad.mn':
                t = html5lib.parse(f, namespaceHTMLElements=False)
                # check for redirection
                if url != f.url:
                    print("REDIRECT", url, f.url)
                    url = f.url
                    done.append(url)
                # make a list to put the sources of the images in
                img_list = []
                # walk the element tree of the webpage, catch all image
                # elements (<img>), and then read their src attribute
                print('----- Images found:')
                for img in t.findall('.//img[@src]'):  # the syntax used here to select elements is XPath
                    img_src = urljoin(url, img.attrib.get('src'))
                    img_list.append(img_src)
                    print(img_src)
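                # e.g. src="pic.jpg" found on http://nothat.bad.mn/dir/ resolves to http://nothat.bad.mn/dir/pic.jpg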
                # find links (dir and subdir) to open and search for more imgs
                for link in t.findall(".//a[@href]"):
                    href = nurl(urljoin(url, link.attrib.get('href')))
                    if href not in done:
                        print('adding href:', href)
                        input_var_url.append(href)
                # download each file and save it locally under the filename specified in its url
                for src in img_list:
                    file_name = src.rsplit('/', 1)[-1]  # get the url part after the last slash
                    with urllib.request.urlopen(src) as response, open(file_name, 'wb') as out_file:
                        data = response.read()  # a `bytes` object
                        out_file.write(data)
                if img_list:
                    # 'os.path.basename' gets the last folder of the path
                    path = os.path.dirname(img_list[-1])
                    print('----- These imgs were inside the folder:', os.path.basename(path))
                # skip any 'icons' folder, which holds only uninteresting thumbnails
                extensions = [".jpg", ".png", ".gif"]
                stream = 'index.html'
                # for every item inside img_list that has one of these extensions and is not inside an 'icons' folder
                for img_src in img_list:
                    path = os.path.dirname(img_src)
                    if img_src.endswith(tuple(extensions)) and os.path.basename(path) != 'icons':
                        img = '<img class="image" src="{}" width="200">'.format(img_src)
                        with open(stream, 'a') as page:
                            page.write("""<!DOCTYPE html>
<html lang="en">""" + '\n' +
                                       """<head> <meta charset="UTF-8">
<link rel="stylesheet" type="text/css" href="style.css"/> </head>""" + '\n' + img)
                # creates a second file, stream2, with the duplicate lines removed
                stream2 = 'index2.html'
                lines_seen = set()  # holds lines already seen
                if os.path.exists(stream):
                    with open(stream2, "w") as cleaned:
                        for line in open(stream, "r"):
                            if line not in lines_seen:  # not a duplicate
                                cleaned.write(line)
                                lines_seen.add(line)
    # printing the error helps with debugging
    except HTTPError as e:
        print("HTTPERROR", e)
    print("Done! Going to the next url")

@@ -0,0 +1,7 @@
p {
    font-size: calc(9px + 0.4vw);
}
a {
    color: black;
}