You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
special-issue-11-wiki2html/dumpwiki.py

113 lines
3.9 KiB
Python

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

import os, json, sys, urllib
from mwclient import Site
from pprint import pprint
from jinja2 import Template
from functions import unpack_response, clean_dir, remove_nonwords
import html5lib
from functions import Colors
import argparse
from xml.etree import ElementTree as ET
from urllib.parse import quote as urlquote, unquote as urlunquote
# Command-line interface. The parsed `args` namespace is a module-level
# global that the link-rewriting and page-dumping code below reads directly.
p = argparse.ArgumentParser(description="Dump wiki files to html",
                            formatter_class=argparse.ArgumentDefaultsHelpFormatter)
p.add_argument("--host", metavar='', default="hub.xpub.nl/sandbox", help='wiki host')
p.add_argument("--path", metavar='', default="/itchwiki/", help="Wiki path. Should end with /")
p.add_argument("--output", default="/var/www/html/archive", help="Output path for pages")
p.add_argument("--one", default=False, action="store_true", help="Output one page from each category only")
p.add_argument("--local", default=False, action="store_true", help="When creating a local archives. Add full URL to images")
args = p.parse_args()
print(args)

# site and login
site = Site(host=args.host, path=args.path)
with open('login.txt', 'r') as login:  # read login user & pwd
    # splitlines() copes with a trailing newline and with CRLF line endings.
    # Splitting on '\n' produced a third, empty item for a file ending in a
    # newline, which made the two-name unpacking below raise ValueError.
    credentials = login.read().splitlines()
if len(credentials) < 2:
    raise ValueError(
        "login.txt must contain the username on the first line "
        "and the password on the second line")
user, pwd = credentials[:2]
site.login(username=user, password=pwd)  # login to wiki
# Characters substituted into generated file names.
SLASH = "\u2044"   # FRACTION SLASH: replaces "/" so a page title never creates a sub-directory
HYPHEN = "\u2010"  # HYPHEN: kept for compatibility; no longer applied to links (see filenameforlink)

# Path prefix of internal wiki links as they appear in the rendered HTML.
_WIKI_LINK_PREFIX = "/sandbox/itchwiki/index.php/"


def _filename_for_title(title):
    """Map a wiki page title to the name of its dumped .html file."""
    return title.replace(' ', '_').replace('/', SLASH) + '.html'


def filenameforpage(p):
    """Return the output file name for a wiki page.

    p: an object whose ``name`` attribute holds the page title.
    """
    return _filename_for_title(p.name)


def filenameforlink(href):
    """Return the URL-quoted file name that an internal wiki link should point to.

    href: the link target, possibly percent-encoded and possibly starting
          with the wiki's index.php path prefix.

    The result is built with the same title-to-filename mapping as
    filenameforpage(), so a rewritten link resolves to the file written for
    the linked page.
    """
    href = urlunquote(href)
    if href.startswith(_WIKI_LINK_PREFIX):
        href = href[len(_WIKI_LINK_PREFIX):]
    # NOTE(review): this code used to end with `.replace('', HYPHEN)`. The
    # search string was empty (the original character was evidently lost), so
    # U+2010 got inserted between every character and corrupted every link.
    # The call is dropped; if a specific hyphen-like character must be
    # normalised, apply it in _filename_for_title so pages and links agree.
    return urlquote(_filename_for_title(href))
def rewritelinks (html):
    """Point internal wiki links in an HTML fragment at the dumped files.

    html: HTML source of a rendered page, as a string.

    Every element carrying an ``href`` is inspected. Elements whose class
    contains "external" are skipped; hrefs under the wiki's index.php path
    are replaced with the result of filenameforlink(). When the script runs
    with --local, image sources that do not start with "http" are prefixed
    with the hub's origin. Returns the serialised fragment as a string.
    """
    wiki_prefix = "/sandbox/itchwiki/index.php/"
    fragment = html5lib.parseFragment(html, treebuilder="etree", namespaceHTMLElements=False)

    for node in fragment.findall(".//*[@href]"):
        # leave external links alone
        if "external" in node.attrib.get("class", ""):
            continue
        target = node.attrib.get("href")
        if target.startswith(wiki_prefix):
            node.attrib['href'] = filenameforlink(target)

    if args.local is True:
        # A local archive has no copy of the images, so use absolute URLs.
        for image in fragment.findall(".//img[@src]"):
            source = image.attrib.get("src")
            if not source.startswith('http'):
                image.attrib['src'] = 'https://hub.xpub.nl' + source

    return ET.tostring(fragment, method="html", encoding="unicode")
def rewriteimgs(html):
    """Blank the ``srcset`` attribute of every image in an HTML fragment.

    html: HTML source of a rendered page, as a string.

    Returns the serialised fragment as a string. The attribute is kept but
    set to an empty string.
    """
    fragment = html5lib.parseFragment(html, treebuilder="etree", namespaceHTMLElements=False)
    # empty the srcset value as it prevents images from displaying
    for image in fragment.findall(".//img[@srcset]"):
        image.set('srcset', "")
    return ET.tostring(fragment, method="html", encoding="unicode")
# Dump every page of every sub-category of the wiki's "Publish" category.
publish = site.Categories['Publish']
for cat in publish.members():
    # Namespace 14 is MediaWiki's Category namespace; skip ordinary pages
    # that are direct members of "Publish".
    if cat.namespace != 14:
        continue
    print('dumping category {}'.format(cat.page_title))

    # Use a template named after the category when one exists, otherwise
    # fall back to the default. Templates are read as UTF-8 explicitly so
    # the result does not depend on the machine's locale
    # (assumes the template files are UTF-8 -- TODO confirm).
    try:
        with open('templates/{}.html'.format(cat.page_title.lower()), encoding='utf-8') as templatefile:
            template = Template(templatefile.read())
    except FileNotFoundError:
        with open('templates/default.html', encoding='utf-8') as templatefile:
            template = Template(templatefile.read())

    # The static path handed to the template depends only on --local.
    staticpath = '..' if args.local else '0'

    for p in cat.members():
        print(p)
        htmlsrc = site.parse(page=p.name)['text']['*']
        htmlsrc = rewritelinks(htmlsrc)
        htmlsrc = rewriteimgs(htmlsrc)
        html = template.render(page=p, body=htmlsrc, staticpath=staticpath)
        # Write UTF-8 explicitly: with the locale default encoding, pages
        # containing non-ASCII text can raise UnicodeEncodeError.
        with open(os.path.join(args.output, filenameforpage(p)), 'w', encoding='utf-8') as f:
            f.write(html)
        if args.one:
            # --one: only the first page of each category is dumped
            break