#### DOCUMENTATION GENERATOR ########################################################################## # Keeps the offline documentation in sync with the online documentation. # Simply run "python update.py" to generate the latest version. import os, sys; sys.path.insert(0, os.path.join("..")) import codecs import re from pattern.web import URL, Document, strip_javascript, strip_between url = "http://www.clips.ua.ac.be/pages/" #--- HTML TEMPLATE ----------------------------------------------------------------------------------- # Use a simplified HTML template based on the online documentation. template = """ %s

%s

%s
""".strip() #--- DOWNLOAD & UPDATE ------------------------------------------------------------------------------- for p in ("-", "-web", "-db", "-search", "-vector", "-graph", "-canvas", "-metrics", "-de", "-en", "-es", "-fr", "-it", "-nl", "-shell", "stop-words", "mbsp-tags", "-dev"): # We include some useful pages (Penn Treebank tags, stop words) referenced in the documentation. if p.startswith("-"): p = "pattern" + p.rstrip("-") title = p.replace("-", ".") if p == "stop-words": title = "Stop words" if p == "mbsp-tags": title = "Penn Treebank II tag set" # Download the online documentation pages. print("Retrieving", url + p) html = URL(url + p).download(cached=False) # Parse the actual documentation, we don't need the website header, footer, navigation, search. html = Document(html) html = html.by_id("content-area") html = html.by_class("node-type-page")[0] html = html.source html = strip_javascript(html) html = strip_between('