"""Dump wiki pages from the 'Publish' category to static HTML files.

Connects to a MediaWiki instance (credentials read from login.txt),
walks every category tagged Publish, renders each member page through a
Jinja2 template and writes the result as a standalone .html file.
"""

import argparse
import json
import os
import sys
import urllib
from pprint import pprint
from urllib.parse import quote as urlquote, unquote as urlunquote
from xml.etree import ElementTree as ET

import html5lib
from jinja2 import Template
from mwclient import Site

from functions import unpack_response, clean_dir, remove_nonwords
from functions import Colors

p = argparse.ArgumentParser(
    description="Dump wiki files to html",
    formatter_class=argparse.ArgumentDefaultsHelpFormatter,
)
p.add_argument("--host", metavar='', default="hub.xpub.nl/sandbox",
               help='wiki host')
p.add_argument("--path", metavar='', default="/itchwiki/",
               help="Wiki path. Should end with /")
p.add_argument("--output", default="/var/www/html/archive",
               help="Output path for pages")
p.add_argument("--one", default=False, action="store_true",
               help="Output one page from each category only")
p.add_argument("--local", default=False, action="store_true",
               help="When creating a local archives. Add full URL to images")
args = p.parse_args()
print(args)

# site and login
site = Site(host=args.host, path=args.path)
with open('login.txt', 'r') as login:  # read login user & pwd
    loginlines = login.read()
# splitlines() tolerates a trailing newline in login.txt; a bare
# split('\n') would produce a third, empty element and fail to unpack.
user, pwd = loginlines.splitlines()
site.login(username=user, password=pwd)  # login to wiki

# Substitute characters that are unsafe in filenames:
SLASH = "\u2044"   # fraction slash stands in for '/'
HYPHEN = "\u2010"  # unicode hyphen

# Internal-link prefix this wiki uses; hrefs starting with it are rewritten
# to point at the dumped HTML files.
WIKI_PREFIX = "/sandbox/itchwiki/index.php/"


def filenameforpage(p):
    """Return the output HTML filename for wiki page object *p*."""
    return p.name.replace(' ', '_').replace('/', SLASH) + '.html'


def filenameforlink(href):
    """Convert an internal wiki href into the matching dumped-HTML filename."""
    href = urlunquote(href)
    if href.startswith(WIKI_PREFIX):
        href = href[len(WIKI_PREFIX):]
    href = href.replace(' ', '_').replace('/', SLASH).replace('‐', HYPHEN) + '.html'
    return urlquote(href)


def rewritelinks(html):
    """Rewrite internal wiki links in *html* to the dumped .html filenames.

    Links whose class contains "external" are left untouched.  With
    --local, relative image srcs are additionally made absolute so a
    local archive still loads images hosted on hub.xpub.nl.
    """
    t = html5lib.parseFragment(html, treebuilder="etree",
                               namespaceHTMLElements=False)
    for a in t.findall(".//*[@href]"):
        linkclass = a.attrib.get("class", "")
        href = a.attrib.get("href")
        if "external" in linkclass:
            # leave external links alone
            continue
        if href.startswith(WIKI_PREFIX):
            a.attrib['href'] = filenameforlink(href)
    if args.local is True:
        for img in t.findall(".//img[@src]"):
            src = img.attrib.get("src")
            if not src.startswith('http'):
                img.attrib['src'] = 'https://hub.xpub.nl' + src
    return ET.tostring(t, method="html", encoding="unicode")


def rewriteimgs(html):
    """Blank all srcset attributes — they prevent images from displaying."""
    t = html5lib.parseFragment(html, treebuilder="etree",
                               namespaceHTMLElements=False)
    for img in t.findall(".//img[@srcset]"):
        img.attrib['srcset'] = ""
    return ET.tostring(t, method="html", encoding="unicode")


publish = site.Categories['Publish']
for cat in publish.members():
    if cat.namespace != 14:
        # 14 is the Category namespace; skip ordinary member pages here.
        continue
    print('dumping category {}'.format(cat.page_title))
    # Prefer a category-specific template; fall back to the default one.
    try:
        with open('templates/{}.html'.format(cat.page_title.lower())) as templatefile:
            template = Template(templatefile.read())
    except FileNotFoundError:
        with open('templates/default.html') as templatefile:
            template = Template(templatefile.read())
    for p in cat.members():
        print(p)
        htmlsrc = site.parse(page=p.name)['text']['*']
        htmlsrc = rewritelinks(htmlsrc)
        htmlsrc = rewriteimgs(htmlsrc)
        if args.local is True:
            html = template.render(page=p, body=htmlsrc, staticpath='..')
        else:
            html = template.render(page=p, body=htmlsrc, staticpath='0')
        with open(os.path.join(args.output, filenameforpage(p)), 'w') as f:
            print(html, file=f)
        if args.one:
            break