import sys
import os
import time
import argparse
import html5lib # html5lib is a Python package that implements the HTML5 parsing algorithm
from urllib.request import urlopen
from urllib.parse import urljoin, urlparse, urlunparse
ap = argparse.ArgumentParser()
ap.add_argument("--url", default="http://b-e-e-t.r-o-o-t.net/")
args = ap.parse_args()
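# Example invocation (the script filename here is an assumption):
#   python scraper.py --url http://b-e-e-t.r-o-o-t.net/
# Discovered links are printed to stdout; progress messages go to stderr.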
# the list of URLs still to be scraped, seeded with the start URL
input_var_url = [args.url]
input_var_url_str = ' '.join(input_var_url) # turns the seed list into a string, used to stay on the start domain
done = [] # URLs that have already been visited
# reconstructs the URL without query, parameters or fragment, to 'clean' it
def nurl(url):
    p = urlparse(url)
    return urlunparse((p.scheme, p.netloc, p.path, None, None, None))
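# For example, nurl("http://example.org/page?sort=asc#top") returns
# "http://example.org/page" (the query and fragment are dropped).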
while len(input_var_url) > 0:
    url = input_var_url[0]
    input_var_url = input_var_url[1:]
    done.append(url)
    print('scraping:', url, file=sys.stderr)
    try:
        with urlopen(url, timeout=4) as f:
            p = urlparse(url)
            if p.netloc in input_var_url_str: # keeps the scraping from getting out of control: only follow pages on the start domain
                t = html5lib.parse(f, namespaceHTMLElements=False) # parse into an xml.etree.ElementTree tree
                # check for redirection
                if url != f.url:
                    print("REDIRECT", url, f.url, file=sys.stderr)
                    url = f.url
                    done.append(url)
                print(url, "DONE", file=sys.stderr)
                #time.sleep(5)
                extensions = ("jpg", "pdf") # file extensions to skip
                # Walk the element tree of the page and catch all elements that are links:
                for link in t.findall(".//a[@href]"):
                    href = nurl(urljoin(url, link.attrib.get('href')))
                    path = urlparse(href).path
                    _, ext = os.path.splitext(path)
                    ext = ext.lower().lstrip('.')
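                    # e.g. os.path.splitext("/images/photo.jpg") returns ("/images/photo", ".jpg"),
                    # so ext ends up as "jpg" after lower() and lstrip('.')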
                    #print('EXT-----------', href, ext, file=sys.stderr)
                    if (href not in done and href not in input_var_url) and ext not in extensions:
                        print(href)
                        input_var_url.append(href)
    except Exception: # when an HTTP error is returned (e.g. a URL doesn't exist), this keeps the crawl from breaking
        print("oh no, a broken link", file=sys.stderr)
    time.sleep(0.1)