diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..cdd3855 --- /dev/null +++ b/Makefile @@ -0,0 +1,8 @@ + +.PHONY: tasks +tasks: + mkdir -p tasks + python scripts/pulltasks.py --output tasks + +clean: + rm tasks/*.html diff --git a/scripts/pulltasks.py b/scripts/pulltasks.py index 999ff7a..84ed56a 100644 --- a/scripts/pulltasks.py +++ b/scripts/pulltasks.py @@ -1,18 +1,84 @@ -import argparse +import argparse, os, sys from mwclient import Site +from urllib.parse import quote as urlquote +import html5lib +from xml.etree import ElementTree as ET + NS_CATEGORY = 14 p = argparse.ArgumentParser(description="Dump wiki files to html") p.add_argument("--host", metavar='', default="pzwiki.wdka.nl", help='wiki host') p.add_argument("--path", metavar='', default="/mw-mediadesign/", help="Wiki path. Should end with /") -p.add_argument("--output", default="../", help="Output path for pages") +p.add_argument("--output", default="pages", help="Output path for pages") +p.add_argument("--category", default="Tasks of the Contingent Librarian", help="Category to query") args = p.parse_args() # print(args) # site and login -print (""" +def catmembers (c): + prefix = c.get_prefix('cm', True) + kwargs = dict(c.generate_kwargs(prefix, prop='ids|title', namespace=None, + sort='sortkey', dir='asc', start=None, end=None, + title=c.name, type="page")) + return c.get_list(True)(c.site, 'categorymembers', 'cm', **kwargs) + +NS_MAIN = 0 +NS_TALK = 1 +NS_USER = 2 +NS_USER_TALK = 3 + +def path4page(p): + """ Returns the local path for a page """ + ret = p.page_title + if "/" in ret: + ret = ret.split("/")[-1] + ret = ret.replace(" ", "_") + if p.namespace == NS_USER_TALK: + ret = ret + "_rvrs" + return ret + ".html" + +def href4page(p): + p = path4page(p) + return urlquote(p) + +def filenameforlink(href): + """ todo: deal with namespaces? 
""" + path = href + if "/" in href: + path = path.split("/")[-1] + path = path+".html" + return path + +def rewritelinks(html): + t = html5lib.parseFragment(html, treebuilder = "etree", namespaceHTMLElements = False) + for a in t.findall(".//*[@href]"): + linkclass = a.attrib.get("class", "") + href = a.attrib.get("href") + if "external" in linkclass: + # leave external links alone + continue + # print ("LINK", href) + if href.startswith("/mediadesign/"): + new_href = filenameforlink(href) + # print ("Rewriting link {} to {}".format(href, new_href), file=sys.stderr) + a.attrib['href'] = new_href + html = ET.tostring(t, method="html", encoding="unicode") + return html + + +site = Site(host=args.host, path=args.path) +tasks=site.Categories[args.category] +# FOR EVERY CARD +for card in catmembers(tasks): + # FRONT + cardfilename = path4page(card) + cardpath = os.path.join(args.output, cardfilename) + print ("Saving page to {}".format(cardpath)) + + with open(cardpath, "w") as f: + print ("""
@@ -22,19 +88,36 @@ print (""" -""") - -site = Site(host=args.host, path=args.path) -tasks=site.Categories['Tasks of the Contingent Librarian'] -for card in tasks.members(): - print ("