import sys
import os
import time
import argparse
import html5lib  # html5lib is a Python package that implements the HTML5 parsing algorithm
from urllib.request import urlopen
from urllib.parse import urljoin, urlparse, urlunparse
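
# note: of the imports used here, html5lib is the only third-party dependency
# (installable with: pip install html5lib); the rest is standard library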

ap = argparse.ArgumentParser()
ap.add_argument("--url", default="http://b-e-e-t.r-o-o-t.net/")
args = ap.parse_args()
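
# usage sketch (the filename "scrape.py" is an assumption, not part of the source):
#   python3 scrape.py --url http://example.org/ > links.txt
# discovered links go to stdout; progress and error messages go to stderr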

# seed the crawl frontier with the start URL
input_var_url = [args.url]
input_var_url_str = ' '.join(input_var_url)  # the seed URL(s) joined into one string, used for the same-domain check below

done = []  # URLs that have already been scraped
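
# input_var_url acts as a FIFO queue: the crawl loop further down pops URLs
# from the front and appends newly found links at the back, so the crawl
# proceeds breadth-first from the seed page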

# rebuild a URL without its params, query string, or fragment, so that
# variants of the same page compare equal in the queue and the done list
def nurl(url):
    p = urlparse(url)
    return urlunparse((p.scheme, p.netloc, p.path, None, None, None))
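# e.g. nurl("http://example.com/page?q=1#top") returns "http://example.com/page"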

while len(input_var_url) > 0:
    url = input_var_url[0]
    input_var_url = input_var_url[1:]
    done.append(url)
    print('scraping:', url, file=sys.stderr)

    try:
        with urlopen(url, timeout=4) as f:
            p = urlparse(url)
            if p.netloc in input_var_url_str:  # only parse pages on the seed domain, so the scrape cannot run out of control
                t = html5lib.parse(f, namespaceHTMLElements=False)
                # check for redirection: f.url is the final URL after any redirects
                if url != f.url:
                    print("REDIRECT", url, f.url, file=sys.stderr)
                    url = f.url
                    done.append(url)
                print(url, "DONE", file=sys.stderr)
                #time.sleep(5)
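
                # html5lib builds a standard xml.etree.ElementTree tree, so the
                # ElementPath query ".//a[@href]" below matches every <a>
                # element in the document that carries an href attribute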
                extensions = ("jpg", "pdf")

                # walk the parsed element tree and queue every new link target
                for link in t.findall(".//a[@href]"):
                    href = nurl(urljoin(url, link.attrib.get('href')))
                    path = urlparse(href).path
                    _, ext = os.path.splitext(path)
                    ext = ext.lower().lstrip('.')
                    #print('EXT-----------', href, ext, file=sys.stderr)
                    if (href not in done and href not in input_var_url) and ext not in extensions:
                        print(href)
                        input_var_url.append(href)
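
                # e.g. a link ending in ".pdf" gives ext == "pdf", so it is
                # neither printed nor queued; a not-yet-seen ".html" page is
                # printed to stdout and appended to the frontier for later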

    except Exception as e:  # a broken link (e.g. an HTTP error for a missing page) must not crash the crawl
        print("oh no, a broken link:", url, e, file=sys.stderr)

    time.sleep(0.1)  # short politeness pause between requests