from __future__ import print_function
from __future__ import unicode_literals

from builtins import str, bytes, dict, int

import os
import sys
import time

sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))

from pattern.web import Crawler, DEPTH, BREADTH, FIFO, LIFO, crawl, asynchronous

# This example demonstrates how to use the Crawler class for web crawling.

# -------------------------------------------------------------------------------------------------
# First, we need a subclass of Crawler with its own Crawler.visit() method.
# The visit() method takes two parameters: the visited link and the HTML source.
# We could parse the HTML DOM to extract information we need, for example.
# Anything that is not HTML (e.g., a JPEG file) is passed to Crawler.fail().

# class Polly(Crawler):
#     def visit(self, link, source=None):
#         print("visited:", link.url, "from:", link.referrer)
#     def fail(self, link):
#         print("failed:", link.url)
#
# p = Polly(links=["http://nodebox.net/"], domains=["nodebox.net"], delay=5)
# while not p.done:
#     p.crawl(method=DEPTH, cached=True, throttle=5)

# for link, source in crawl("http://www.clips.ua.ac.be/", delay=0, throttle=1, cached=False):
#     print(link)
#
# g = crawl("http://www.clips.ua.ac.be/")
# for i in range(10):
#     p = asynchronous(lambda: next(g))  # next(g) works in both Python 2 and 3.
#     while not p.done:
#         print("zzz...")
#         time.sleep(0.1)
#     link, source = p.value
#     print(link)


class SimpleCrawler1(Crawler):

    def visit(self, link, source=None):
        print("visiting: %s from: %s" % (link.url, link.referrer))

    def fail(self, link):
        print("failed: %s" % link.url)

# Create a new crawler.
# 1) The links parameter is a list of URLs to visit.
#    The crawler will visit the first link, extract new links from the HTML, and queue these for a visit too.
# 2) The domains parameter is a list of allowed domains.
#    The crawler will never leave these domains.
# 3) The delay parameter specifies the number of seconds to wait before revisiting the same domain.
#    In the meantime, other queued links will be crawled if possible.

crawler1 = SimpleCrawler1(links=["http://nodebox.net/"], domains=["nodebox.net"], delay=1)

print("CRAWLER 1 " + "-" * 50)
while len(crawler1.visited) < 5:  # Crawler.visited is a dictionary of all URLs visited so far.
    # The Crawler.crawl() method has the same optional parameters as URL.download(),
    # for example: cached=True, proxy=("proxy.com", "https"), ...
    crawler1.crawl(cached=True, throttle=5)

# -------------------------------------------------------------------------------------------------
# Typically, you'll want a crawler that runs in an endless loop as a background process,
# and just keeps on visiting new URLs (see the background-thread sketch after this block).
# In this case, it is rude to use a delay of 0.0,
# because you will keep hammering servers with automated requests.
# A higher delay (in a real-world scenario, say 30 seconds) is better:

crawler2 = SimpleCrawler1(
    links=["http://nodebox.net/"],
    domains=["nodebox.net"],
    delay=0.1
)

print("")
print("CRAWLER 2 " + "-" * 50)
while True:
    crawler2.crawl(cached=False)
    print("wait...")
    # Of course we don't want this example to run forever,
    # so we still add a stop condition:
    if len(crawler2.visited) > 2:
        break

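# A minimal sketch of the "background process" idea above: run the crawl loop in a
# daemon thread from the standard threading module, so the main program stays free
# to do other work. The crawl_forever() helper and its parameters are illustrative
# assumptions, not part of the pattern.web API.
#
# import threading
#
# def crawl_forever(crawler):
#     while not crawler.done:  # Crawler.done becomes True when nothing is left to visit.
#         crawler.crawl(method=DEPTH, cached=False, throttle=5)
#
# background = threading.Thread(target=crawl_forever, args=(crawler2,))
# background.daemon = True  # Don't keep the interpreter alive just for the crawler.
# background.start()
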
# -------------------------------------------------------------------------------------------------
# If you create a crawler without a domains=[...] restriction, it is free to roam the entire web.
# What to visit first? You can use Crawler.crawl() with an optional "method" parameter.
# When set to DEPTH, it prefers to visit links in the same domain.
# When set to BREADTH, it prefers to visit links to other domains.
# Observe the difference between crawler3 and crawler4,
# which use DEPTH and BREADTH respectively.

crawler3 = SimpleCrawler1(
    links=["http://nodebox.net/"],
    delay=0.0
)

print("")
print("CRAWLER 3 " + "-" * 50)
while len(crawler3.visited) < 3:
    crawler3.crawl(method=DEPTH)

crawler4 = SimpleCrawler1(
    links=["http://nodebox.net/"],
    delay=0.0
)

print("")
print("CRAWLER 4 " + "-" * 50)
while len(crawler4.visited) < 3:
    crawler4.crawl(method=BREADTH)

# -------------------------------------------------------------------------------------------------
# With Crawler.crawl(method=DEPTH) and a delay,
# the crawler will wait between requests to the same domain.
# In the meantime, it will visit other links.
# Usually this means that it will alternate between a couple of domains:

crawler5 = SimpleCrawler1(
    links=["http://nodebox.net/"],
    delay=0.1
)

print("")
print("CRAWLER 5 " + "-" * 50)
while len(crawler5.visited) < 4:
    crawler5.crawl(method=DEPTH)

# -------------------------------------------------------------------------------------------------
# A BREADTH-crawler in an endless crawl loop will eventually queue the entire web for a visit.
# Of course this is not possible: we can't keep the entire web in memory.
# When the number of queued links exceeds Crawler.QUEUE (10,000 by default),
# less relevant queued links will be discarded.
# "Less relevant" depends on two settings:
# 1) First, there is the Crawler.priority() method that returns a number between 0.0 and 1.0 for a link.
#    Links with a higher priority are more relevant and will be visited sooner.
# 2) Links with an equal priority are queued either FIFO or LIFO.
#    FIFO means first-in-first-out: the earliest queued links will be visited sooner.
#    LIFO means last-in-first-out: more recently queued links will be visited sooner.


class SimpleCrawler2(Crawler):

    def visit(self, link, source=None):
        print("visiting: %s from: %s" % (link.url, link.referrer))

    def priority(self, link, method=DEPTH):
        if "?" in link.url:
            # This ignores links with a querystring.
            return 0.0
        else:
            # Otherwise use the default priority ranker,
            # i.e. the priority depends on DEPTH or BREADTH crawl mode.
            return Crawler.priority(self, link, method)

# Note the LIFO sort order.
# This will make more recently queued links more relevant.
# If you observe the given URL in a browser,
# you'll notice that the last external link at the bottom of the page is now visited first.
crawler6 = SimpleCrawler2(
    links=["http://nodebox.net/"],
    delay=0.1,
    sort=LIFO
)

print("")
print("CRAWLER 6 " + "-" * 50)
while len(crawler6.visited) < 4:
    crawler6.crawl(method=BREADTH)

# -------------------------------------------------------------------------------------------------
# In the long run, the Crawler.visited dictionary will start filling up memory too.
# If you want a single crawler that runs forever, you should empty this dictionary every now and then,
# and instead use a persistent database of visited links, in combination with Crawler.follow()
# (a minimal sketch follows below).
# Another strategy would be to use different DEPTH-crawlers for different domains,
# and delete them when they are done.
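
# A possible sketch of that persistent strategy: store visited URLs in a plain text
# file and override Crawler.follow() so that links seen in a previous run are never
# queued again. The class name PersistentCrawler and the file name "visited.txt"
# are illustrative assumptions, not part of pattern.web.
#
# class PersistentCrawler(Crawler):
#
#     def __init__(self, *args, **kwargs):
#         Crawler.__init__(self, *args, **kwargs)
#         self.seen = set()
#         if os.path.exists("visited.txt"):
#             self.seen = set(open("visited.txt").read().splitlines())
#
#     def follow(self, link):
#         # Don't queue links that were already visited in this or an earlier run.
#         return link.url not in self.seen and Crawler.follow(self, link)
#
#     def visit(self, link, source=None):
#         self.seen.add(link.url)
#         with open("visited.txt", "a") as f:
#             f.write(link.url + "\n")
#         if len(self.visited) > 10000:
#             self.visited.clear()  # Empty the in-memory dictionary every now and then.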