import argparse
import os
import sys
import time
import xml.etree.ElementTree as ET
from urllib.request import urlopen
from urllib.error import HTTPError
from urllib.parse import urljoin, urlparse, urlunparse

import html5lib  # html5lib is a Python package that implements the HTML5 parsing algorithm
import requests
from bs4 import BeautifulSoup, SoupStrainer
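# html5lib parses real-world HTML into an xml.etree.ElementTree element (namespaceHTMLElements=False
# keeps the tag names unprefixed), so the parsed tree can be searched with findall() further down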

ap = argparse.ArgumentParser()
ap.add_argument("--url", default="http://b-e-e-t.r-o-o-t.net/")
args = ap.parse_args()
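# the crawl starts from --url; discovered links are printed to stdout, progress messages go to stderr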

# creates a url list: the frontier of urls still to be visited, seeded with the start url
input_var_url = [args.url]
input_var_url_str = ' '.join(input_var_url)  # turns the list into a string

done = []  # urls that have already been visited

# reconstructs the url to 'clean' it: parameters, query and fragment are dropped
def nurl(url):
    p = urlparse(url)
    return urlunparse((p.scheme, p.netloc, p.path, None, None, None))
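# e.g. nurl("http://example.org/page.html?x=1#top") returns "http://example.org/page.html"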

# breadth-first crawl: keep going until the frontier is empty
while len(input_var_url) > 0:
    url = input_var_url[0]
    input_var_url = input_var_url[1:]
    done.append(url)
    print('scraping:', url, file=sys.stderr)
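
    # fetch the page; the 4 second timeout keeps a single dead host from stalling the whole crawl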
    try:
        with urlopen(url, timeout=4) as f:
            p = urlparse(url)
            if p.netloc in input_var_url_str:  # keeps the crawl under control: only parse pages whose host appears in the seed url
                t = html5lib.parse(f, namespaceHTMLElements=False)
                # check for redirection
                if url != f.url:
                    print("REDIRECT", url, f.url, file=sys.stderr)
                    url = f.url
                    done.append(url)
                print(url, "DONE", file=sys.stderr)
                #time.sleep(5)

                extensions = ("jpg", "pdf")  # do not follow links to these file types

                # Walk the element tree of the page and catch all elements that are links: <a href="...">
                for link in t.findall(".//a[@href]"):
                    href = nurl(urljoin(url, link.attrib.get('href')))
                    path = urlparse(href).path
                    _, ext = os.path.splitext(path)
                    ext = ext.lower().lstrip('.')
                    #print('EXT-----------', href, ext, file=sys.stderr)
                    if (href not in done and href not in input_var_url) and ext not in extensions:
                        print(href)
                        input_var_url.append(href)
    except Exception as e:  # when a request fails (e.g. a url doesn't exist), keep crawling instead of crashing
        print("oh no, a broken link:", e, file=sys.stderr)

    time.sleep(0.1)  # small pause between requests, to not hammer the server