From 5c97712a2b05461f0bb9e435b21fd6e338c2b327 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rita=20Gra=C3=A7a?=
Date: Sat, 30 Mar 2019 16:43:31 +0100
Subject: [PATCH] tool for image scraping

---
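Notes: the heart of img_index.py is an html5lib parse plus XPath-style
queries over the resulting element tree. A minimal, self-contained
sketch of that step (assuming html5lib is installed; the seed URL
below matches the netloc the script restricts itself to):

    from urllib.request import urlopen
    from urllib.parse import urljoin
    import html5lib

    url = 'http://nothat.bad.mn/'
    with urlopen(url) as f:
        # html5lib builds an ElementTree-compatible tree even from
        # broken markup, so ElementTree's limited XPath works on it
        tree = html5lib.parse(f, namespaceHTMLElements=False)
    for img in tree.findall('.//img[@src]'):
        # resolve relative src attributes against the page URL
        print(urljoin(url, img.attrib.get('src')))

html5lib is a sensible choice here because it parses pages the way
browsers do, which matters on the hand-written index pages this kind
of crawl tends to hit.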
 img_index/img_index.py | 99 +++++++++++++++++++++++++++++++++++++++++
 img_index/style.css    |  7 +++
 2 files changed, 106 insertions(+)
 create mode 100644 img_index/img_index.py
 create mode 100644 img_index/style.css

diff --git a/img_index/img_index.py b/img_index/img_index.py
new file mode 100644
--- /dev/null
+++ b/img_index/img_index.py
@@ -0,0 +1,99 @@
+#!/usr/bin/env python3
+# Crawl a site, download its images, and build an HTML index of them.
+
+import os
+import urllib.request
+from urllib.request import urlopen
+from urllib.error import HTTPError
+from urllib.parse import urlparse, urljoin, urlunparse
+
+import html5lib
+
+
+def nurl(url):
+    # normalize a URL: drop the fragment so the same page is not queued twice
+    p = urlparse(url)
+    return urlunparse((p.scheme, p.netloc, p.path, p.params, p.query, ''))
+
+
+input_var_url = ['http://nothat.bad.mn/']  # pages still to visit
+done = []                                  # pages already visited
+
+while len(input_var_url) > 0:
+    url = input_var_url[0]
+    input_var_url = input_var_url[1:]
+    done.append(url)
+    print('scraping:', url)
+    try:
+        with urlopen(url) as f:
+            p = urlparse(url)
+            # this keeps the crawl contained to nothat.bad.mn; otherwise it
+            # would follow every link out to external sites
+            if p.netloc == 'nothat.bad.mn':
+                t = html5lib.parse(f, namespaceHTMLElements=False)
+
+                # check for redirection
+                if url != f.url:
+                    print("REDIRECT", url, f.url)
+                    url = f.url
+                    done.append(url)
+
+                # list to collect the src of every image on this page
+                img_list = []
+
+                # walk the element tree of the page and grab the src attribute
+                # of every image; the selection syntax used here is XPath
+                print('----- Images found:')
+                for img in t.findall('.//img[@src]'):
+                    img_src = urljoin(url, img.attrib.get('src'))
+                    img_list.append(img_src)
+                    print(img_src)
+
+                # find links (dirs and subdirs) to open and search for more imgs
+                for link in t.findall('.//a[@href]'):
+                    href = nurl(urljoin(url, link.attrib.get('href')))
+                    if href not in done and href not in input_var_url:
+                        print('adding href:', href)
+                        input_var_url.append(href)
+
+                # download each image and save it locally under the
+                # filename taken from its URL
+                for src in img_list:
+                    file_name = src.rsplit('/', 1)[-1]  # part after the last slash
+                    with urllib.request.urlopen(src) as response, open(file_name, 'wb') as out_file:
+                        data = response.read()  # a `bytes` object
+                        out_file.write(data)
+
+                # skip the 'icons' folder, which only holds uninteresting
+                # thumbnails; os.path.basename gets the last folder of the path
+                path = os.path.dirname(img_list[-1]) if img_list else ''
+                print('----- These imgs were inside the folder:', os.path.basename(path))
+
+                # keep every image with one of these extensions that does
+                # not live inside the 'icons' folder
+                extensions = ['.jpg', '.png', '.gif']
+
+                stream = 'index.html'
+                for img_src in img_list:
+                    if img_src.endswith(tuple(extensions)) and os.path.basename(path) != 'icons':
+                        img = '<p><a href="{0}"><img src="{0}"></a></p>'.format(img_src)
+                        out = open(stream, 'a')
+                        out.write(img + '\n')
+                        out.close()
+
+                # write a second file, index2.html, with no duplicate lines
+                if os.path.exists(stream):
+                    stream2 = 'index2.html'
+                    lines_seen = set()  # holds lines already seen
+                    out = open(stream2, 'w')
+                    for line in open(stream, 'r'):
+                        if line not in lines_seen:  # not a duplicate
+                            out.write(line)
+                            lines_seen.add(line)
+                    out.close()
+
+    # this helps debugging errors
+    except HTTPError as e:
+        print("HTTPERROR", e)
+
+    print("Done! Going to the next url")
diff --git a/img_index/style.css b/img_index/style.css
new file mode 100644
index 0000000..13a1a00
--- /dev/null
+++ b/img_index/style.css
@@ -0,0 +1,7 @@
+p {
+    font-size: calc(9px + 0.4vw);
+}
+
+a {
+    color: black;
+}
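-- 
Dependency note: html5lib is the only import outside the standard
library (pip install html5lib). The script is best run from an empty
directory, since the downloaded images and the generated index.html /
index2.html all land in the current working directory.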