You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

124 lines
3.7 KiB
Python

5 years ago
import argparse, os, sys
from mwclient import Site
5 years ago
from urllib.parse import quote as urlquote
import html5lib
from xml.etree import ElementTree as ET
# Numeric id of the "Category" namespace in MediaWiki.
NS_CATEGORY = 14

# Command-line interface. Every option has a default, so the script can be
# run without arguments.
p = argparse.ArgumentParser(description="Dump wiki files to html")
p.add_argument(
    "--host",
    metavar='',
    default="pzwiki.wdka.nl",
    help='wiki host',
)
p.add_argument(
    "--path",
    metavar='',
    default="/mw-mediadesign/",
    help="Wiki path. Should end with /",
)
p.add_argument(
    "--output",
    default="pages",
    help="Output path for pages",
)
p.add_argument(
    "--category",
    default="Tasks of the Contingent Librarian",
    help="Category to query",
)
# Parsed once at start-up; the rest of the script reads its settings here.
args = p.parse_args()
5 years ago
# print(args)
# site and login
5 years ago
def catmembers(c):
    """List the members of category *c*, restricted to type "page".

    The 'categorymembers' query is assembled from the category object's own
    helpers (``get_prefix``, ``generate_kwargs``, ``get_list``). It requests
    the ``ids`` and ``title`` properties, ordered by sort key, ascending,
    with no namespace filter and no start/end bounds.

    Returns whatever list object ``c.get_list(True)`` constructs for the
    query (presumably an iterable of page objects -- the caller loops over
    it).
    """
    cm_prefix = c.get_prefix('cm', True)
    query_options = c.generate_kwargs(
        cm_prefix,
        prop='ids|title',
        namespace=None,
        sort='sortkey',
        dir='asc',
        start=None,
        end=None,
        title=c.name,
        type="page",
    )
    query = dict(query_options)
    list_factory = c.get_list(True)
    return list_factory(c.site, 'categorymembers', 'cm', **query)
# Numeric ids of the first four built-in MediaWiki namespaces.
NS_MAIN, NS_TALK, NS_USER, NS_USER_TALK = 0, 1, 2, 3
def path4page(p):
    """Return the local HTML file name for wiki page *p*.

    Only the last "/"-separated segment of ``p.page_title`` is kept and its
    spaces are turned into underscores. Pages whose ``namespace`` equals
    ``NS_USER_TALK`` get an ``_rvrs`` suffix, so their file name differs
    from that of the page with the same title in another namespace.
    The result always ends in ``.html``.
    """
    leaf = p.page_title.rsplit("/", 1)[-1].replace(" ", "_")
    suffix = "_rvrs" if p.namespace == NS_USER_TALK else ""
    return leaf + suffix + ".html"
def href4page(p):
    """Return the URL-quoted link target for the local file of page *p*.

    The file name comes from ``path4page`` and is percent-encoded with
    ``urllib.parse.quote`` so it can be used as an ``href`` value.

    Fixes two defects of the previous version: it called the non-existent
    ``path4path`` (a NameError at call time) and it computed the quoted
    value without returning it.
    """
    return urlquote(path4page(p))
def filenameforlink(href):
    """Map a wiki link *href* to the name of the corresponding local file.

    The last "/"-separated segment of the link is kept and ``.html`` is
    appended. A ``#fragment`` is carried over *after* the extension
    (``/x/Foo#Bar`` -> ``Foo.html#Bar``); previously the extension was
    appended to the fragment, producing a link to a non-existent file.

    If the link contains no page name at all (empty string, a path ending
    in "/", or a bare ``#anchor``) it is returned unchanged.

    TODO: deal with namespaces?
    """
    target, hash_sep, fragment = href.partition("#")
    name = target.rsplit("/", 1)[-1]
    if not name:
        # Nothing to turn into a file name; leave the link as it is.
        return href
    return name + ".html" + hash_sep + fragment
def rewritelinks(html):
    """Point wiki-internal links in an HTML fragment at the local files.

    *html* is parsed as a fragment with html5lib. Every element that has an
    ``href`` attribute is inspected: elements whose ``class`` attribute
    contains "external" are skipped, and hrefs starting with
    ``/mediadesign/`` are replaced by the result of ``filenameforlink``.
    All other links are kept as they are.

    Returns the modified fragment serialised back to an HTML string.
    """
    fragment = html5lib.parseFragment(
        html,
        treebuilder="etree",
        namespaceHTMLElements=False,
    )
    for link in fragment.findall(".//*[@href]"):
        if "external" in link.get("class", ""):
            # External links keep their original target.
            continue
        target = link.get("href")
        if target.startswith("/mediadesign/"):
            link.set("href", filenameforlink(target))
    return ET.tostring(fragment, method="html", encoding="unicode")
# Static markup written before and after the card content of every page.
_PAGE_HEAD = """<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title>Tasks of the Contingent Librarian</title>
<link rel="stylesheet" type="text/css" href="tasks.css">
<script src="tasks.js"></script>
</head>
<body>
"""
_PAGE_FOOT = """
</body>
</html>"""


def _write_card(wiki, page_name, css_class, outpath):
    """Render wiki page *page_name* and save it as an HTML file at *outpath*.

    The page is rendered through ``wiki.parse``, its links are rewritten
    with ``rewritelinks``, and the result is wrapped in a ``<div>`` with
    class *css_class* between the shared page head and foot.
    """
    # Fetch and convert first, so a failing request does not leave an
    # empty or truncated file behind.
    htmlsrc = rewritelinks(wiki.parse(page=page_name)['text']['*'])
    # The page declares charset utf-8, so write it as utf-8 regardless of
    # the platform's default encoding.
    with open(outpath, "w", encoding="utf-8") as f:
        print(_PAGE_HEAD, file=f)
        print("""<div class="{}">{}</div>""".format(css_class, htmlsrc), file=f)
        print(_PAGE_FOOT, file=f)


site = Site(host=args.host, path=args.path)
tasks = site.Categories[args.category]

# Make sure the output directory exists before the first file is written.
os.makedirs(args.output, exist_ok=True)

# FOR EVERY CARD
for card in catmembers(tasks):
    # FRONT
    cardfilename = path4page(card)
    cardpath = os.path.join(args.output, cardfilename)
    print("Saving page to {}".format(cardpath))
    _write_card(site, card.name, "card", cardpath)

    # BACK: the "User_talk:" page with the same title, if there is one.
    talk = site.pages["User_talk:" + card.page_title]
    if talk.exists:
        print("OUTPUTTING CARD BACK {}".format(talk.page_title))
        talkfilename = path4page(talk)
        talkpath = os.path.join(args.output, talkfilename)
        _write_card(site, talk.name, "cardback", talkpath)