You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
161 lines
5.0 KiB
Python
161 lines
5.0 KiB
Python
import argparse, os, sys
|
|
from mwclient import Site
|
|
from urllib.parse import quote as urlquote
|
|
import html5lib
|
|
from xml.etree import ElementTree as ET
|
|
|
|
|
|
# Widths requested from the wiki's imageinfo API (passed as iiurlwidth)
# when rewriting image links: the thumbnail shown inline, and the larger
# rendition the thumbnail links to.
THUMB_SIZE = 320
FULL_SIZE = 640

# Numeric id of the Category namespace.
NS_CATEGORY = 14
|
|
|
|
# Command-line interface. The parsed options are kept in the module-level
# name ``args`` and read by the rest of the script.
p = argparse.ArgumentParser(description="Dump wiki files to html")
p.add_argument("--host", metavar='', default="pzwiki.wdka.nl", help='wiki host')
p.add_argument("--path", metavar='', default="/mw-mediadesign/", help="Wiki path. Should end with /")
p.add_argument("--output", default="pages", help="Output path for pages")
p.add_argument("--category", default="Tasks of the Contingent Librarian", help="Category to query")

args = p.parse_args()
|
|
# print(args)
|
|
# site and login
|
|
|
|
def catmembers(c):
    """Return the members of category *c*, restricted to type "page".

    Builds the ``categorymembers`` query arguments by hand (via the
    ``get_prefix`` / ``generate_kwargs`` / ``get_list`` helpers found on
    *c*) so that ``type="page"`` can be passed along with the query.

    c -- category object; must provide ``name``, ``site`` and the three
         helpers named above (presumably an mwclient Category -- confirm
         against the installed mwclient version).

    Returns whatever ``c.get_list(True)(...)`` produces; the caller
    iterates over it.
    """
    prefix = c.get_prefix('cm', True)
    kwargs = dict(c.generate_kwargs(
        prefix,
        prop='ids|title',
        namespace=None,
        sort='sortkey',
        dir='asc',
        start=None,
        end=None,
        title=c.name,
        type="page",
    ))
    return c.get_list(True)(c.site, 'categorymembers', 'cm', **kwargs)
|
|
|
|
# Numeric namespace ids used to tell page kinds apart
# (compared against a page's ``namespace`` attribute).
NS_MAIN = 0
NS_TALK = 1
NS_USER = 2
NS_USER_TALK = 3
|
|
|
|
def path4page(p):
    """Return the local file name for a page.

    The name is the last "/"-separated component of ``p.page_title`` with
    spaces replaced by underscores and ".html" appended. Pages in the
    user-talk namespace get an extra "_rvrs" suffix before the extension,
    so their file does not collide with the one for the matching page of
    the same title.

    p -- page object providing ``page_title`` and ``namespace``.
    """
    ret = p.page_title
    if "/" in ret:
        # Subpage: keep only the final component of the title.
        ret = ret.split("/")[-1]
    ret = ret.replace(" ", "_")
    if p.namespace == NS_USER_TALK:
        ret = ret + "_rvrs"
    return ret + ".html"
|
|
|
|
def href4page(p):
    """Return the URL-quoted local file name for page *p*.

    This is ``path4page(p)`` passed through ``urlquote`` so the result can
    be used as an href.
    """
    # The original called the undefined name ``path4path`` and dropped the
    # quoted value instead of returning it.
    return urlquote(path4page(p))
|
|
|
|
def filenameforlink(href):
    """Return the local ".html" file name for a wiki link href.

    Only the last "/"-separated component of the path is kept and ".html"
    is appended to it. A "#fragment", if present, is preserved *after* the
    extension ("Page#Sec" -> "Page.html#Sec") so section links keep
    pointing at the right file.

    todo: deal with namespaces?
    """
    # Split the fragment off first so ".html" lands on the page name and a
    # "/" inside the fragment cannot be mistaken for a path separator.
    path, hash_sign, fragment = href.partition("#")
    if "/" in path:
        path = path.split("/")[-1]
    return path + ".html" + hash_sign + fragment
|
|
|
|
def rewriteimagelink(a):
    """Point an image link at scaled renditions served by the wiki.

    The last path component of the anchor's href is used as the file title
    for two ``imageinfo`` API queries on the module-level ``site``: one at
    THUMB_SIZE and one at FULL_SIZE. The anchor's href is set to the
    FULL_SIZE url; its <img> child (if any) gets the THUMB_SIZE url as
    ``src`` and loses ``width``, ``height`` and ``srcset`` so the new
    source is not overridden by stale attributes.

    a -- anchor element (ElementTree); modified in place. Returns None.

    Raises KeyError/IndexError if the API response carries no imageinfo
    for the title (e.g. the file does not exist on the wiki).
    """
    from urllib.parse import unquote

    href = a.attrib.get("href")
    path = href
    if "/" in href:
        path = path.split("/")[-1]
    # hrefs are percent-encoded, API titles are not; decode so file names
    # with non-ASCII or reserved characters are looked up correctly.
    path = unquote(path)
    print ("rewriteimagelink", path)

    def scaled_url(width):
        # Ask the wiki for a rendition of the file scaled to *width* pixels.
        r = site.api("query", prop="imageinfo", titles=path, iiprop="url", iiurlwidth=str(width), formatversion=2)
        iinfo = r['query']['pages'][0]['imageinfo'][0]
        return iinfo['thumburl']

    thumburl = scaled_url(THUMB_SIZE)
    fullsizeurl = scaled_url(FULL_SIZE)

    a.attrib['href'] = fullsizeurl
    img = a.find("img")
    if img is not None:
        img.attrib['src'] = thumburl
        for stale in ("width", "height", "srcset"):
            img.attrib.pop(stale, None)
    print ("rewriteimagelink", thumburl, fullsizeurl)
|
|
|
|
def rewritelinks(html):
    """Rewrite the links in an HTML fragment for the local dump.

    Every element carrying an ``href`` attribute is visited:

    * links whose class contains "external" are left alone;
    * links with the class "image" are handed to ``rewriteimagelink``;
    * links whose href starts with "/mediadesign/" are pointed at the
      local file name given by ``filenameforlink``.

    html -- HTML fragment as a string.

    Returns the rewritten fragment serialized back to a string.
    """
    t = html5lib.parseFragment(html, treebuilder="etree", namespaceHTMLElements=False)
    for a in t.findall(".//*[@href]"):
        linkclass = a.attrib.get("class", "")
        href = a.attrib.get("href")
        if "external" in linkclass:
            # leave external links alone
            continue
        # Compare class tokens rather than the whole attribute so anchors
        # carrying additional classes are still recognised as image links.
        if "image" in linkclass.split():
            # link to presentation version of image
            # change img.src to a thumbnail
            rewriteimagelink(a)
        elif href.startswith("/mediadesign/"):
            # NOTE(review): this prefix is hard-coded and independent of
            # the --path option -- confirm it matches the wiki's article path.
            a.attrib['href'] = filenameforlink(href)
    return ET.tostring(t, method="html", encoding="unicode")
|
|
|
|
|
|
# Connect to the wiki and look up the category whose members are dumped.
# ``site`` stays a module-level name because rewriteimagelink() uses it.
site = Site(host=args.host, path=args.path)
tasks = site.Categories[args.category]


def _write_card_page(path, css_class, htmlsrc):
    """Write a standalone HTML page to *path*, wrapping *htmlsrc* in a div of class *css_class*."""
    # The page declares utf-8 in its <meta> tag, so write it as utf-8
    # regardless of the platform's default encoding.
    with open(path, "w", encoding="utf-8") as f:
        print ("""<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title>Tasks of the Contingent Librarian</title>
<link rel="stylesheet" type="text/css" href="tasks.css">
<script src="tasks.js"></script>
</head>
<body>
""", file=f)
        print ("""<div class="{}">{}</div>""".format(css_class, htmlsrc), file=f)
        print ("""
</body>
</html>""", file=f)


# Make sure the output directory exists before any page is written.
os.makedirs(args.output, exist_ok=True)

# FOR EVERY CARD
for card in catmembers(tasks):
    # FRONT
    cardfilename = path4page(card)
    cardpath = os.path.join(args.output, cardfilename)
    print ("Saving page to {}".format(cardpath))

    htmlsrc = site.parse(page=card.name)['text']['*']
    htmlsrc = rewritelinks(htmlsrc)
    _write_card_page(cardpath, "card", htmlsrc)

    # BACK
    # The back of a card is the User_talk page with the same title, if any.
    talk = site.pages["User_talk:"+card.page_title]
    if talk.exists:
        print ("OUTPUTTING CARD BACK {}".format(talk.page_title))

        talkfilename = path4page(talk)
        talkpath = os.path.join(args.output, talkfilename)

        htmlsrc = site.parse(page=talk.name)['text']['*']
        htmlsrc = rewritelinks(htmlsrc)
        _write_card_page(talkpath, "cardback", htmlsrc)
|