diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..cdd3855
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,8 @@
+
+.PHONY: tasks clean
+tasks:
+	mkdir -p tasks
+	python scripts/pulltasks.py --output tasks
+
+clean:
+	rm tasks/*.html
diff --git a/scripts/pulltasks.py b/scripts/pulltasks.py
index 999ff7a..84ed56a 100644
--- a/scripts/pulltasks.py
+++ b/scripts/pulltasks.py
@@ -1,18 +1,84 @@
-import argparse
+import argparse, os, sys
 from mwclient import Site
+from urllib.parse import quote as urlquote
+import html5lib
+from xml.etree import ElementTree as ET
+
 NS_CATEGORY = 14
 
 p = argparse.ArgumentParser(description="Dump wiki files to html")
 p.add_argument("--host", metavar='', default="pzwiki.wdka.nl", help='wiki host')
 p.add_argument("--path", metavar='', default="/mw-mediadesign/", help="Wiki path. Should end with /")
-p.add_argument("--output", default="../", help="Output path for pages")
+p.add_argument("--output", default="pages", help="Output path for pages")
+p.add_argument("--category", default="Tasks of the Contingent Librarian", help="Category to query")
 args = p.parse_args()
 # print(args)
 
 # site and login
-print ("""
+def catmembers (c):
+    prefix = c.get_prefix('cm', True)
+    kwargs = dict(c.generate_kwargs(prefix, prop='ids|title', namespace=None,
+        sort='sortkey', dir='asc', start=None, end=None,
+        title=c.name, type="page"))
+    return c.get_list(True)(c.site, 'categorymembers', 'cm', **kwargs)
+
+NS_MAIN = 0
+NS_TALK = 1
+NS_USER = 2
+NS_USER_TALK = 3
+
+def path4page(p):
+    """ Returns the local path for a page """
+    ret = p.page_title
+    if "/" in ret:
+        ret = ret.split("/")[-1]
+    ret = ret.replace(" ", "_")
+    if p.namespace == NS_USER_TALK:
+        ret = ret + "_rvrs"
+    return ret + ".html"
+
+def href4page(p):
+    p = path4page(p)
+    return urlquote(p)
+
+def filenameforlink(href):
+    """ todo: deal with namespaces? 
""" + path = href + if "/" in href: + path = path.split("/")[-1] + path = path+".html" + return path + +def rewritelinks(html): + t = html5lib.parseFragment(html, treebuilder = "etree", namespaceHTMLElements = False) + for a in t.findall(".//*[@href]"): + linkclass = a.attrib.get("class", "") + href = a.attrib.get("href") + if "external" in linkclass: + # leave external links alone + continue + # print ("LINK", href) + if href.startswith("/mediadesign/"): + new_href = filenameforlink(href) + # print ("Rewriting link {} to {}".format(href, new_href), file=sys.stderr) + a.attrib['href'] = new_href + html = ET.tostring(t, method="html", encoding="unicode") + return html + + +site = Site(host=args.host, path=args.path) +tasks=site.Categories[args.category] +# FOR EVERY CARD +for card in catmembers(tasks): + # FRONT + cardfilename = path4page(card) + cardpath = os.path.join(args.output, cardfilename) + print ("Saving page to {}".format(cardpath)) + + with open(cardpath, "w") as f: + print (""" @@ -22,19 +88,36 @@ print (""" -""") - -site = Site(host=args.host, path=args.path) -tasks=site.Categories['Tasks of the Contingent Librarian'] -for card in tasks.members(): - print ("
") - print ("

{}

".format(card.page_title)) - print () - # print (card.text()) - htmlsrc = site.parse(page=card.name)['text']['*'] - print ("""
{}
""".format(htmlsrc)) - -print (""" +""", file=f) + htmlsrc = site.parse(page=card.name)['text']['*'] + htmlsrc = rewritelinks(htmlsrc) + print ("""
{}
""".format(htmlsrc), file=f) + print (""" -""") +""", file=f) + # BACK + talk = site.pages["User_talk:"+card.page_title] + if talk.exists: + print ("OUTPUTTING CARD BACK {}".format(talk.page_title)) + + talkfilename = path4page(talk) + talkpath = os.path.join(args.output, talkfilename) + with open(talkpath, "w") as f: + print (""" + + + + + Tasks of the Contingent Librarian + + + + +""", file=f) + htmlsrc = site.parse(page=talk.name)['text']['*'] + htmlsrc = rewritelinks(htmlsrc) + print ("""
{}
""".format(htmlsrc), file=f) + print (""" + +""", file=f) diff --git a/tasks/tasks.css b/tasks/tasks.css new file mode 100644 index 0000000..0be9497 --- /dev/null +++ b/tasks/tasks.css @@ -0,0 +1,3 @@ +span.mw-editsection { +display: none; +}