"""Dump the pages of a wiki category to static HTML files.

For every page in the selected category a "front" HTML file is written;
when a matching ``User_talk:`` page exists it is written as the card's
"back".  Links inside the rendered HTML are rewritten to point at the
dumped files, and image links are rewritten to scaled image URLs.
"""
import argparse
import os
import sys
from urllib.parse import quote as urlquote
from xml.etree import ElementTree as ET

import html5lib
from mwclient import Site

# Pixel widths requested from the wiki for inline thumbnails / link targets.
THUMB_SIZE = 320
FULL_SIZE = 640

# Namespace numbers used by the wiki.
NS_MAIN = 0
NS_TALK = 1
NS_USER = 2
NS_USER_TALK = 3
NS_CATEGORY = 14

# Text printed before, around and after the rendered page body.
# NOTE(review): these templates carry no HTML markup as seen here --
# confirm they match the intended page skeleton.
PAGE_HEAD = " \nTasks of the Contingent Librarian "
PAGE_BODY = "\n{}\n"
PAGE_FOOT = " "

p = argparse.ArgumentParser(description="Dump wiki files to html")
p.add_argument("--host", metavar='', default="pzwiki.wdka.nl", help='wiki host')
p.add_argument("--path", metavar='', default="/mw-mediadesign/", help="Wiki path. Should end with /")
p.add_argument("--output", default="tasks", help="Output path for pages")
p.add_argument("--category", default="Tasks of the Contingent Librarian", help="Category to query")
args = p.parse_args()


def catmembers(c):
    """Return the members of category *c* that are pages, by ascending sortkey.

    Builds a ``categorymembers`` list query restricted to ``type="page"``
    using the category object's own mwclient helpers (``get_prefix``,
    ``generate_kwargs``, ``get_list``).
    """
    prefix = c.get_prefix('cm', True)
    kwargs = dict(c.generate_kwargs(prefix, prop='ids|title', namespace=None,
                                    sort='sortkey', dir='asc', start=None,
                                    end=None, title=c.name, type="page"))
    return c.get_list(True)(c.site, 'categorymembers', 'cm', **kwargs)


def path4page(p):
    """Return the local file name for page *p*.

    Only the last ``/``-separated component of the page title is kept,
    spaces become underscores, and pages in the user-talk namespace get a
    ``_rvrs`` suffix so that a card's back does not overwrite its front.
    """
    ret = p.page_title.split("/")[-1].replace(" ", "_")
    if p.namespace == NS_USER_TALK:
        ret = ret + "_rvrs"
    return ret + ".html"


def href4page(p):
    """Return the URL-quoted local file name for page *p*."""
    # Fixed: previously called the undefined name `path4path` and returned
    # nothing.
    return urlquote(path4page(p))


def filenameforlink(href):
    """Return the local file name that wiki link *href* should point to.

    Keeps the last path component and appends ``.html``.  A ``#fragment``
    is preserved *after* the extension so in-page anchors keep working.

    TODO: deal with namespaces?
    """
    target, sep, fragment = href.partition("#")
    return target.split("/")[-1] + ".html" + sep + fragment


def _scaled_image_url(title, width):
    """Ask the wiki for the URL of *title* scaled to *width* pixels.

    Returns ``None`` when the response carries no usable ``thumburl``.
    """
    r = site.api("query", prop="imageinfo", titles=title, iiprop="url",
                 iiurlwidth=str(width), formatversion=2)
    pages = r.get('query', {}).get('pages') or [{}]
    infos = pages[0].get('imageinfo') or [{}]
    return infos[0].get('thumburl')


def rewriteimagelink(a):
    """Point image link *a* at a scaled image and its ``<img>`` at a thumbnail.

    The anchor's ``href`` becomes the FULL_SIZE-wide rendition and the
    nested image's ``src`` the THUMB_SIZE-wide one.  The image's ``width``,
    ``height`` and ``srcset`` attributes are dropped.  If the anchor has no
    ``<img>`` child or the wiki returns no URL, the link is left untouched
    and a warning is written to stderr instead of aborting the whole dump.
    """
    href = a.attrib.get("href") or ""
    path = href.split("/")[-1]
    print("rewriteimagelink", path)

    img = a.find("img")
    if img is None:
        print("rewriteimagelink: no <img> inside link {}".format(href), file=sys.stderr)
        return

    thumburl = _scaled_image_url(path, THUMB_SIZE)
    fullsizeurl = _scaled_image_url(path, FULL_SIZE)
    if thumburl is None or fullsizeurl is None:
        print("rewriteimagelink: no image info for {}".format(path), file=sys.stderr)
        return

    a.attrib['href'] = fullsizeurl
    img.attrib['src'] = thumburl
    for name in ("width", "height", "srcset"):
        img.attrib.pop(name, None)
    print("rewriteimagelink", thumburl, fullsizeurl)


def rewritelinks(html):
    """Return *html* with its links rewritten for the static dump.

    Links whose class contains ``external`` are left alone, links whose
    class is exactly ``image`` go through ``rewriteimagelink``, and links
    starting with ``/mediadesign/`` are pointed at the local ``.html`` file.
    """
    t = html5lib.parseFragment(html, treebuilder="etree", namespaceHTMLElements=False)
    for a in t.findall(".//*[@href]"):
        linkclass = a.attrib.get("class", "")
        href = a.attrib.get("href")
        if "external" in linkclass:
            # leave external links alone
            continue
        if linkclass == "image":
            # link to presentation version of image,
            # change img.src to a thumbnail
            rewriteimagelink(a)
        elif href.startswith("/mediadesign/"):
            a.attrib['href'] = filenameforlink(href)
    return ET.tostring(t, method="html", encoding="unicode")


def _write_page(page, path):
    """Render wiki *page* and write it to the file at *path*."""
    # Fetch and rewrite first so a failed request does not leave a
    # truncated file behind.
    htmlsrc = site.parse(page=page.name)['text']['*']
    htmlsrc = rewritelinks(htmlsrc)
    # Explicit UTF-8: wiki content is not limited to the locale's encoding.
    with open(path, "w", encoding="utf-8") as f:
        print(PAGE_HEAD, file=f)
        print(PAGE_BODY.format(htmlsrc), file=f)
        print(PAGE_FOOT, file=f)


# site and login
site = Site(host=args.host, path=args.path)
tasks = site.Categories[args.category]

if args.output:
    os.makedirs(args.output, exist_ok=True)

# FOR EVERY CARD
for card in catmembers(tasks):
    # FRONT
    cardpath = os.path.join(args.output, path4page(card))
    print("Saving page to {}".format(cardpath))
    _write_page(card, cardpath)

    # BACK
    talk = site.pages["User_talk:" + card.page_title]
    if talk.exists:
        print("OUTPUTTING CARD BACK {}".format(talk.page_title))
        talkpath = os.path.join(args.output, path4page(talk))
        _write_page(talk, talkpath)