tool for image scraping
parent c39440d879 · commit 5c97712a2b
@@ -0,0 +1,99 @@
# This script scrapes images
# Based on a tutorial by Michael Murtaugh at the Piet Zwart Institute,
# https://pad.xpub.nl/p/prototyping_02102018 + a scraping workshop at the Piet Zwart Institute

import os
import urllib.request
import html5lib  # html5lib is a Python package that implements the HTML5 parsing algorithm
from urllib.request import urlopen
from urllib.error import HTTPError
from urllib.parse import urljoin, urlparse, urlunparse

# seed list of urls to crawl, plus a list of the urls already visited
input_var_url = ["http://nothat.bad.mn/"]
done = []

# nurl() rebuilds a url from its scheme, host and path only, dropping the
# query and fragment so the same page is not queued twice
def nurl(url):
    p = urlparse(url)
    return urlunparse((p.scheme, p.netloc, p.path, None, None, None))
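
# a quick illustration of nurl() (hypothetical path and query, shown only
# as an example -- the call itself matches the function above):
#   nurl("http://nothat.bad.mn/photos/cat.html?page=2#top")
#   -> "http://nothat.bad.mn/photos/cat.html"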

while len(input_var_url) > 0:
    # take the next url off the front of the queue and mark it as visited
    url = input_var_url[0]
    input_var_url = input_var_url[1:]
    done.append(url)
    print('scraping:', url)
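    # note: rebuilding the list on every pop is O(n); for a larger crawl the
    # same queue could be a collections.deque, where popleft() is O(1) --
    # an optional swap, not part of the original script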

    try:
        with urlopen(url) as f:
            p = urlparse(url)
            # keep the crawl contained to nothat.bad.mn; without this check it
            # would follow every link out to external sites as well
            if p.netloc == 'nothat.bad.mn':
                t = html5lib.parse(f, namespaceHTMLElements=False)
                # check for redirection: f.url holds the url that was actually served
                if url != f.url:
                    print("REDIRECT", url, f.url)
                    url = f.url
                    done.append(url)
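                # (for instance, a request for http://nothat.bad.mn/photos might
                # be answered from http://nothat.bad.mn/photos/ after a 301 --
                # a hypothetical redirect, given only to illustrate f.url)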

                # list to collect the source of every image on the page
                img_list = []

                # walk the element tree of the parsed page and catch every
                # element that is an image -- <img>, so to say -- then read
                # its src attribute
                print('----- Images found:')
                for img in t.findall('.//img[@src]'):  # elements are selected with XPath syntax
                    img_src = urljoin(url, img.attrib.get('src'))
                    img_list.append(img_src)
                    print(img_src)
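                # a quick illustration of the urljoin() call above, with a
                # hypothetical relative src value:
                #   urljoin("http://nothat.bad.mn/photos/index.html", "cats/01.jpg")
                #   -> "http://nothat.bad.mn/photos/cats/01.jpg"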

                # find links (to dirs and subdirs) to open and search for more imgs
                for link in t.findall(".//a[@href]"):
                    href = nurl(urljoin(url, link.attrib.get('href')))
                    # queue the link only if it has not been visited or queued yet
                    if href not in done and href not in input_var_url:
                        print('adding href:', href)
                        input_var_url.append(href)

                # download each file and save it locally under the filename
                # specified in the url
                for src in img_list:
                    file_name = src.rsplit('/', 1)[-1]  # the url part after the last slash
                    with urllib.request.urlopen(src) as response, open(file_name, 'wb') as out_file:
                        data = response.read()  # a `bytes` object
                        out_file.write(data)
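                # note: rsplit() keeps any query string in the saved filename;
                # a stricter variant (an alternative, not what this script does)
                # would be os.path.basename(urlparse(src).path)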

                # skip the 'icons' folder, which is just uninteresting thumbnails
                # ('os.path.dirname' gives the folder part of the url and
                # 'os.path.basename' its last component)
                if img_list:  # guard: img_src only exists if the page had any images
                    path = os.path.dirname(img_src)
                    print('----- These imgs were inside the folder:', os.path.basename(path))

                # for every item inside img_list that has one of these extensions
                # and is not inside the 'icons' folder, append an <img> tag to
                # the output page
                extensions = (".jpg", ".png", ".gif")
                stream = 'index.html'
                for img_src in img_list:
                    # check each image's own folder, not just the last one found
                    folder = os.path.basename(os.path.dirname(img_src))
                    if img_src.endswith(extensions) and folder != 'icons':
                        img = '<img class="image" src="{}" width="200">'.format(img_src)
                        f = open(stream, 'a')
                        f.write("""<!DOCTYPE html>
<html lang="en">""" + '\n' +
                                """<head> <meta charset="UTF-8">
<link rel="stylesheet" type="text/css" href="style.css"/> </head>""" + '\n' + img + '\n')  # trailing newline so the dedup pass below compares whole lines
                        f.close()
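                # str.endswith() accepts a tuple of suffixes, so the single call
                # above tests all three extensions at once, e.g.
                #   "cat.jpg".endswith(extensions) -> True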

                # create a second file, index2.html, with no duplicate lines, so
                # the repeated doctype/head blocks collapse into one
                stream2 = 'index2.html'
                lines_seen = set()  # holds lines already seen
                if os.path.exists(stream):  # guard: index.html may not exist yet
                    f = open(stream2, "w")
                    for line in open(stream, "r"):
                        if line not in lines_seen:  # not a duplicate
                            f.write(line)
                            lines_seen.add(line)
                    f.close()
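                # the same first-occurrence dedup could also be written as
                #   open(stream2, 'w').writelines(dict.fromkeys(open(stream)))
                # since dicts keep insertion order (Python 3.7+) -- an
                # equivalent one-liner, not part of the original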

    # this helps debugging errors
    except HTTPError as e:
        print("HTTPERROR", e)

    print("Done! Going to the next url")
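
# to run (assuming the script is saved as, say, scraper.py -- no filename is
# given in the original):
#   python3 scraper.py
# the downloaded images, index.html and index2.html all land in the current
# working directory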
@@ -0,0 +1,7 @@
p {
    font-size: calc(9px + 0.4vw);
}

a {
    color: black;
}
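
/* a quick check of the calc() above: at a 1000px-wide viewport 0.4vw is 4px,
   so paragraph text renders at 9px + 4px = 13px */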