|
|
|
import os, json, sys, urllib
|
|
|
|
from mwclient import Site
|
|
|
|
from pprint import pprint
|
|
|
|
from jinja2 import Template
|
|
|
|
from functions import unpack_response, clean_dir, remove_nonwords
|
|
|
|
import html5lib
|
|
|
|
from functions import Colors
|
|
|
|
import argparse
|
|
|
|
from xml.etree import ElementTree as ET
|
|
|
|
from urllib.parse import quote as urlquote, unquote as urlunquote
|
|
|
|
|
|
|
|
|
|
|
|
# Command-line interface. ArgumentDefaultsHelpFormatter makes --help show
# each option's default value next to its description.
p = argparse.ArgumentParser(
    description="Dump wiki files to html",
    formatter_class=argparse.ArgumentDefaultsHelpFormatter,
)

# Where the wiki lives.
p.add_argument(
    "--host",
    metavar='',
    default="hub.xpub.nl/sandbox",
    help='wiki host',
)
p.add_argument(
    "--path",
    metavar='',
    default="/itchwiki/",
    help="Wiki path. Should end with /",
)

# Where the generated pages are written.
p.add_argument(
    "--output",
    default="/var/www/html/archive",
    help="Output path for pages",
)

# Boolean switches; "store_true" already defaults to False.
p.add_argument(
    "--one",
    action="store_true",
    help="Output one page from each category only",
)
p.add_argument(
    "--local",
    action="store_true",
    help="When creating a local archives. Add full URL to images",
)

args = p.parse_args()
print(args)  # echo the effective settings for this run
|
|
|
|
# site and login
# Connect to the wiki selected by --host / --path.
site = Site(host=args.host, path=args.path)

# login.txt is read as two lines: user name first, password second.
with open('login.txt', 'r') as login:
    loginlines = login.read()

# splitlines() copes with a trailing newline and with \r\n line endings;
# split('\n') produced an extra empty field in that case and the two-name
# unpacking failed with ValueError. Only the first two lines are used.
user, pwd = loginlines.splitlines()[:2]

site.login(username=user, password=pwd)  # login to wiki
|
|
|
|
|
|
|
|
# NOTE: template files are read per category, inside the dump loop at the end of this script
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# "/" in a page title is replaced by U+2044 (FRACTION SLASH) when building
# file names, so a title with "/" maps to a single file name.
SLASH = "\u2044"


def filenameforpage(p):
    """Return the output file name for a wiki page.

    p: object with a string ``name`` attribute (the page title).
    Spaces become "_", "/" becomes SLASH, and ".html" is appended.
    """
    return p.name.replace(' ', '_').replace('/', SLASH) + '.html'


def filenameforlink(href):
    """Return the relative link to the dumped file for a wiki page href.

    href: the value of an ``href`` attribute, possibly percent-encoded and
    possibly carrying a ``#fragment``.

    The wiki prefix "/sandbox/itchwiki/index.php/" is stripped when present,
    the remaining title is turned into a file name the same way
    ``filenameforpage`` does, and the result is percent-encoded again.
    A ``#fragment`` is kept verbatim and re-attached *after* the file name,
    so "Page#Section" links to "Page.html#Section" instead of the
    non-existent file "Page%23Section.html".
    """
    prefix = "/sandbox/itchwiki/index.php/"

    # Split the fragment off first: it must neither be percent-encoded nor
    # end up in front of the ".html" suffix.
    path, hashmark, fragment = href.partition('#')

    path = urlunquote(path)
    if path.startswith(prefix):
        path = path[len(prefix):]

    filename = path.replace(' ', '_').replace('/', SLASH) + '.html'
    return urlquote(filename) + hashmark + fragment
|
|
|
|
|
|
|
|
def rewritelinks(html):
    """Rewrite internal wiki links in an HTML fragment to the dumped files.

    The fragment is parsed with html5lib. For every element that has an
    ``href`` attribute: elements whose ``class`` contains "external" are
    left alone; hrefs starting with "/sandbox/itchwiki/index.php/" are
    replaced by ``filenameforlink(href)``. When ``args.local`` is True,
    every ``<img>`` whose ``src`` does not start with "http" gets
    "https://hub.xpub.nl" prepended. Returns the serialised HTML string.
    """
    tree = html5lib.parseFragment(html, treebuilder="etree", namespaceHTMLElements=False)

    for element in tree.findall(".//*[@href]"):
        attributes = element.attrib
        if "external" in attributes.get("class", ""):
            # external links keep their original target
            continue
        target = attributes.get("href")
        if target.startswith("/sandbox/itchwiki/index.php/"):
            attributes['href'] = filenameforlink(target)

    if args.local is True:
        # a local archive cannot resolve site-relative image paths
        for image in tree.findall(".//img[@src]"):
            source = image.attrib.get("src")
            if source.startswith('http'):
                continue
            image.attrib['src'] = 'https://hub.xpub.nl' + source

    return ET.tostring(tree, method="html", encoding="unicode")
|
|
|
|
|
|
|
|
|
|
|
|
def rewriteimgs(html):
    """Blank the ``srcset`` attribute of every image in an HTML fragment.

    The fragment is parsed with html5lib, each ``<img>`` that carries a
    ``srcset`` attribute has it set to the empty string (the attribute
    prevents the images from displaying in the dump), and the fragment is
    returned serialised back to an HTML string.
    """
    fragment = html5lib.parseFragment(html, treebuilder="etree", namespaceHTMLElements=False)

    for image in fragment.findall(".//img[@srcset]"):
        image.set('srcset', "")

    return ET.tostring(fragment, method="html", encoding="unicode")
|
|
|
|
|
|
|
|
def _load_template(title):
    """Return the Jinja2 template for a category title.

    Reads ``templates/<title lower-cased>.html``; when that file does not
    exist, ``templates/default.html`` is used instead. Files are read as
    UTF-8 rather than the locale default (assumes the templates are stored
    as UTF-8 -- confirm if any use another encoding).
    """
    try:
        with open('templates/{}.html'.format(title.lower()), encoding='utf-8') as templatefile:
            return Template(templatefile.read())
    except FileNotFoundError:
        with open('templates/default.html', encoding='utf-8') as templatefile:
            return Template(templatefile.read())


# Dump every page of every sub-category of the "Publish" category.
publish = site.Categories['Publish']

# Value handed to the templates as ``staticpath``; it depends only on --local.
staticpath = '..' if args.local is True else '0'

for cat in publish.members():
    # only members in namespace 14 are processed; everything else is skipped
    if cat.namespace != 14:
        continue
    print('dumping category {}'.format(cat.page_title))

    template = _load_template(cat.page_title)

    for p in cat.members():
        print(p)
        # rendered HTML of the page, as returned by the wiki's parse API
        htmlsrc = site.parse(page=p.name)['text']['*']
        htmlsrc = rewritelinks(htmlsrc)
        htmlsrc = rewriteimgs(htmlsrc)

        html = template.render(page=p, body=htmlsrc, staticpath=staticpath)

        # Write as UTF-8 explicitly: with the locale default encoding,
        # non-ASCII page text can raise UnicodeEncodeError or be written
        # in an encoding browsers do not expect.
        with open(os.path.join(args.output, filenameforpage(p)), 'w', encoding='utf-8') as f:
            print(html, file=f)

        if args.one:
            # --one: stop after the first page of this category
            break
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|