You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

161 lines
5.0 KiB
Python

import argparse, os, sys
from mwclient import Site
from urllib.parse import quote as urlquote
import html5lib
from xml.etree import ElementTree as ET
# Pixel widths requested from the wiki when rewriting image links.
THUMB_SIZE = 320  # rendition used as the inline <img> source
FULL_SIZE = 960  # rendition the image link points at
# MediaWiki namespace number of Category pages.
NS_CATEGORY = 14

# Command-line options; they are parsed immediately into the module-level
# `args` namespace that the rest of the script reads.
p = argparse.ArgumentParser(description="Dump wiki files to html")
p.add_argument(
    "--host",
    metavar="",
    default="pzwiki.wdka.nl",
    help="wiki host",
)
p.add_argument(
    "--path",
    metavar="",
    default="/mw-mediadesign/",
    help="Wiki path. Should end with /",
)
p.add_argument(
    "--output",
    default="tasks",
    help="Output path for pages",
)
p.add_argument(
    "--category",
    default="Tasks of the Contingent Librarian",
    help="Category to query",
)
args = p.parse_args()
# site and login
def catmembers(c):
    """Build a 'categorymembers' listing for category *c*.

    The query asks for members of type "page" with their ids and titles,
    ordered by sortkey in ascending order.

    c -- category object providing ``get_prefix``, ``generate_kwargs``,
         ``get_list``, ``site`` and ``name`` (the script passes an entry of
         ``site.Categories``).

    Returns whatever the class obtained from ``c.get_list(True)`` produces;
    presumably an iterable of page objects -- the script loops over it.
    """
    api_prefix = c.get_prefix('cm', True)
    query = {
        "prop": 'ids|title',
        "namespace": None,
        "sort": 'sortkey',
        "dir": 'asc',
        "start": None,
        "end": None,
        "title": c.name,
        "type": "page",
    }
    params = dict(c.generate_kwargs(api_prefix, **query))
    listing_factory = c.get_list(True)
    return listing_factory(c.site, 'categorymembers', 'cm', **params)
# Wiki namespace numbers used by this script (0 through 3, in this order).
NS_MAIN, NS_TALK, NS_USER, NS_USER_TALK = range(4)
def path4page(p):
    """Return the local HTML filename for wiki page *p*.

    Only the part of ``p.page_title`` after the last "/" is kept, spaces
    become underscores, and pages in the User_talk namespace get an
    ``_rvrs`` suffix before the ``.html`` extension.
    """
    leaf = p.page_title.rsplit("/", 1)[-1].replace(" ", "_")
    suffix = "_rvrs" if p.namespace == NS_USER_TALK else ""
    return leaf + suffix + ".html"
def href4page(p):
    """Return the URL-quoted href of the local HTML file for wiki page *p*.

    The filename comes from ``path4page`` and is percent-encoded with
    ``urllib.parse.quote`` so it can be used as a link target.
    """
    # The original called the undefined name `path4path` and dropped the
    # quoted result instead of returning it.
    return urlquote(path4page(p))
def filenameforlink(href):
    """Map a wiki link target to the filename of its local HTML dump.

    The last "/"-separated segment of *href* is kept and ``.html`` is
    appended. A ``#fragment``, if present, is moved after the extension so
    that section links keep working (``Page#Sec`` -> ``Page.html#Sec``).

    todo: deal with namespaces?
    """
    # Split the fragment off first; otherwise ".html" would be appended to
    # the anchor rather than to the page name.
    target, hash_sep, fragment = href.partition("#")
    leaf = target.rsplit("/", 1)[-1]
    return leaf + ".html" + hash_sep + fragment
def rewriteimagelink(a):
    """Point an image link and its thumbnail at sized renditions, in place.

    *a* is the link element wrapping an ``<img>``. The last path segment of
    its href is used as the title for two imageinfo queries on the global
    ``site``: one at THUMB_SIZE for the ``<img>`` src and one at FULL_SIZE
    for the link's href. The img's width, height and srcset attributes are
    removed. Returns None.
    """
    href = a.attrib.get("href")
    title = href.split("/")[-1] if "/" in href else href
    print ("rewriteimagelink", title)

    def rendition_url(width):
        # Ask the wiki for the URL of a rendition scaled to `width`.
        reply = site.api("query", prop="imageinfo", titles=title, iiprop="url", iiurlwidth=str(width), formatversion=2)
        return reply['query']['pages'][0]['imageinfo'][0]['thumburl']

    thumburl = rendition_url(THUMB_SIZE)
    fullsizeurl = rendition_url(FULL_SIZE)

    a.attrib['href'] = fullsizeurl
    img = a.find("img")
    img.attrib['src'] = thumburl
    # Remove the sizing hints that came with the original markup.
    for stale in ("width", "height", "srcset"):
        img.attrib.pop(stale, None)
    print ("rewriteimagelink", thumburl, fullsizeurl)
def rewritelinks(html):
    """Rewrite the links in an HTML fragment for the local dump.

    Every element carrying an href is visited:
    - class containing "external": left untouched;
    - class exactly "image": handed to ``rewriteimagelink``;
    - href starting with "/mediadesign/": replaced by the local filename
      from ``filenameforlink``.

    Returns the fragment serialised back to an HTML string.
    """
    tree = html5lib.parseFragment(html, treebuilder="etree", namespaceHTMLElements=False)
    for node in tree.findall(".//*[@href]"):
        css_class = node.attrib.get("class", "")
        target = node.attrib.get("href")
        if "external" not in css_class:
            if css_class == "image":
                # link to presentation version of image,
                # and change img.src to a thumbnail
                rewriteimagelink(node)
            elif target.startswith("/mediadesign/"):
                node.attrib['href'] = filenameforlink(target)
    return ET.tostring(tree, method="html", encoding="unicode")
# Page skeleton shared by card fronts and backs. The original emitted the
# doctype twice; a document has exactly one.
_PAGE_HEAD = """<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title>Tasks of the Contingent Librarian</title>
<link rel="stylesheet" type="text/css" href="tasks.css">
<script src="tasks.js"></script>
</head>
<body>
"""
_PAGE_TAIL = """
</body>
</html>"""


def _dump_page(pagename, outpath, cssclass):
    """Render wiki page *pagename* and write it to *outpath* as a full HTML page.

    The parsed page HTML has its links rewritten by ``rewritelinks`` and is
    wrapped in a ``<div>`` with class *cssclass*.
    """
    # Fetch and rewrite before opening the file, so a failed request does
    # not leave a truncated page behind.
    htmlsrc = site.parse(page=pagename)['text']['*']
    htmlsrc = rewritelinks(htmlsrc)
    # The page declares charset utf-8, so write it as utf-8 instead of the
    # platform's default encoding.
    with open(outpath, "w", encoding="utf-8") as f:
        print (_PAGE_HEAD, file=f)
        print ("""<div class="{}">{}</div>""".format(cssclass, htmlsrc), file=f)
        print (_PAGE_TAIL, file=f)


site = Site(host=args.host, path=args.path)
tasks = site.Categories[args.category]

# Make sure the output directory exists before writing into it.
os.makedirs(args.output, exist_ok=True)

# FOR EVERY CARD
for card in catmembers(tasks):
    # FRONT: the category member page itself
    cardfilename = path4page(card)
    cardpath = os.path.join(args.output, cardfilename)
    print ("Saving page to {}".format(cardpath))
    _dump_page(card.name, cardpath, "card")

    # BACK: the User_talk page with the same title, when it exists
    talk = site.pages["User_talk:"+card.page_title]
    if talk.exists:
        print ("OUTPUTTING CARD BACK {}".format(talk.page_title))
        talkfilename = path4page(talk)
        talkpath = os.path.join(args.output, talkfilename)
        _dump_page(talk.name, talkpath, "cardback")