import sys
import os
import time
import argparse

import html5lib  # html5lib is a Python package that implements the HTML5 parsing algorithm
from urllib.request import urlopen
from urllib.parse import urljoin, urlparse, urlunparse

ap = argparse.ArgumentParser()
ap.add_argument("--url", default="http://b-e-e-t.r-o-o-t.net/")
args = ap.parse_args()

# creates the list of URLs still to visit, seeded with the starting URL
input_var_url = [args.url]
input_var_url_str = ' '.join(input_var_url)  # turns the list into a string
done = []

# rebuilds the URL without query and fragment to 'clean' it
def nurl(url):
    p = urlparse(url)
    return urlunparse((p.scheme, p.netloc, p.path, None, None, None))

while input_var_url:
    url = input_var_url.pop(0)
    done.append(url)
    print('scraping:', url, file=sys.stderr)
    try:
        with urlopen(url, timeout=4) as f:
            p = urlparse(url)
            if p.netloc in input_var_url_str:  # keeps the crawl from getting out of control: only parse pages on the starting site
                t = html5lib.parse(f, namespaceHTMLElements=False)
                # check for redirection
                if url != f.url:
                    print("REDIRECT", url, f.url, file=sys.stderr)
                    url = f.url
                    done.append(url)
                print(url, "DONE", file=sys.stderr)
                #time.sleep(5)
                extensions = ("jpg", "pdf")  # file types to skip
                # Walk the parsed element tree and catch all elements that are links:
                for link in t.findall(".//a[@href]"):
                    href = nurl(urljoin(url, link.attrib.get('href')))
                    path = urlparse(href).path
                    _, ext = os.path.splitext(path)
                    ext = ext.lower().lstrip('.')
                    #print('EXT-----------', href, ext, file=sys.stderr)
                    if href not in done and href not in input_var_url and ext not in extensions:
                        print(href)
                        input_var_url.append(href)
    except Exception as e:  # when an HTTP error is returned (e.g. a URL doesn't exist), this keeps the crawl from breaking
        print("oh no, a broken link:", url, e, file=sys.stderr)
    time.sleep(0.1)
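# Usage sketch (the filename crawl.py is an assumption, not part of the script):
#   python3 crawl.py --url http://b-e-e-t.r-o-o-t.net/ > links.txt
# Discovered links are printed to stdout, while status messages ('scraping:',
# 'REDIRECT', 'DONE', broken-link warnings) go to stderr, so redirecting stdout
# as above collects only the harvested URLs.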