special-issue-11-wiki2html/dumpwiki.py

import os, json, sys
from mwclient import Site
from jinja2 import Template
import html5lib
from functions import Colors
import argparse
from xml.etree import ElementTree as ET
from urllib.parse import quote as urlquote, unquote as urlunquote


# MediaWiki namespace ids (0 = main/article pages, 14 = category pages)
NS_MAIN = 0
NS_CATEGORY = 14

# Command line interface. ArgumentDefaultsHelpFormatter appends each option's
# default value to its --help text.
p = argparse.ArgumentParser(
    formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    description="Dump wiki files to html",
)

# Where the wiki lives
p.add_argument(
    "--host",
    default="hub.xpub.nl/sandbox",
    metavar='',
    help='wiki host',
)
p.add_argument(
    "--path",
    default="/itchwiki/",
    metavar='',
    help="Wiki path. Should end with /",
)

# Where the generated HTML goes
p.add_argument(
    "--output",
    default="../archive",
    help="Output path for pages",
)

# Switches that shorten a run while testing
p.add_argument(
    "--one",
    action="store_true",
    default=False,
    help="Output one page from each category only",
)
p.add_argument(
    "--skipimages",
    action="store_true",
    default=False,
    help="Don't do images (for testing)",
)

# How <img> sources are rewritten in the dumped pages
p.add_argument(
    "--imgsrc",
    choices=['archive', 'remote'],
    default='archive',
    help="What is the source of the images?",
)

args = p.parse_args()
print(args)
# site and login

site = Site(host=args.host, path=args.path)
wd = os.path.dirname(os.path.abspath(__file__))  # directory containing this script
wd_name = os.path.split(wd)[-1]  # name of that directory
with open('login.txt', 'r') as login:  # read login user & pwd
    loginlines = login.read()
# login.txt holds the username on its first line and the password on its
# second. splitlines() (rather than split('\n')) tolerates a trailing newline
# and CRLF line endings, which previously made the unpacking fail or left a
# stray '\r' on the credentials.
user, pwd = loginlines.splitlines()[:2]
site.login(username=user, password=pwd)  # login to wiki

if not args.skipimages:
    # images.json is looked up next to this script; it is indexed by
    # 'File:<name>' page titles when image sources are rewritten.
    imgsjson_fn = os.path.join(wd, 'images.json')
    with open(imgsjson_fn, 'r') as imgsjson_file:
        images_info = json.load(imgsjson_file)


# U+2044 FRACTION SLASH: substituted for "/" in page titles so that a title
# containing slashes still maps to a single file name.
SLASH = "\u2044"

# URL path prefix of wiki article links as they appear in the parsed HTML.
_WIKI_ARTICLE_PREFIX = "/sandbox/itchwiki/index.php/"


def _filenamefortitle(title):
    """Return the name of the dumped HTML file for the wiki page *title*."""
    return title.replace(' ', '_').replace('/', SLASH) + '.html'


def filenameforpage(p):
    """Return the output file name for page object *p* (uses ``p.name``)."""
    return _filenamefortitle(p.name)


def filenameforlink(href):
    """Translate a wiki article href into a link to its dumped HTML file.

    The part of *href* before any ``#`` is percent-decoded; if it starts with
    the wiki article prefix, the prefix is dropped and the remaining title is
    mapped to a file name exactly as ``filenameforpage`` does. The result is
    percent-encoded again.

    A ``#fragment`` is kept verbatim and re-attached after the file name, so
    ``.../Page#Section`` becomes ``Page.html#Section`` rather than the broken
    ``Page%23Section.html``.
    """
    # Split the fragment off first: it must not end up inside the file name,
    # and its existing encoding is preserved as-is.
    path, hashmark, fragment = href.partition('#')
    path = urlunquote(path)
    if path.startswith(_WIKI_ARTICLE_PREFIX):
        path = _filenamefortitle(path[len(_WIKI_ARTICLE_PREFIX):])
    return urlquote(path) + hashmark + fragment


def rewriteimglinks(tree):
    """Point image-wrapping links at the image itself.

    For every ``<a class="image">`` in *tree* that contains an ``<img>`` with
    a ``src``, the link's ``href`` is replaced by that ``src`` (instead of the
    wiki File: page) and ``target="_blank"`` is set. Call this after the img
    ``src`` attributes have been rewritten.

    Links without an ``<img>``, or whose ``<img>`` has no ``src``, are left
    untouched. The tree is modified in place and also returned.
    """
    for a in tree.findall(".//a[@class='image']"):  # select img wrapping a
        img = a.find(".//img")
        if img is None:  # explicit None test: an Element's truth value is unreliable
            continue
        img_src = img.attrib.get('src')
        if img_src is None:
            # nothing to link to; previously this raised KeyError
            continue
        a.attrib['href'] = img_src
        a.attrib['target'] = "_blank"
        # trace each rewritten link on stdout
        print(a)
        print(ET.tostring(a, method="html", encoding="unicode"))
    return tree

def rewritelinks (html):
    """Redirect wiki article links in an HTML fragment to the dumped files.

    Every element carrying an ``href`` is inspected. Elements whose class
    contains "external" are skipped; of the rest, only hrefs that start with
    the wiki article prefix are replaced by the result of ``filenameforlink``.
    Returns the fragment serialized back to an HTML string.
    """
    fragment = html5lib.parseFragment(
        html,
        treebuilder="etree",
        namespaceHTMLElements=False,
    )
    for node in fragment.findall(".//*[@href]"):
        attrs = node.attrib
        is_external = "external" in attrs.get("class", "")
        target = attrs.get("href")
        # external links stay as they are; only wiki article links are mapped
        if not is_external and target.startswith("/sandbox/itchwiki/index.php/"):
            attrs['href'] = filenameforlink(target)
    return ET.tostring(fragment, method="html", encoding="unicode")


def rewriteimgs(html):
    """Rewrite the ``src`` of every image in an HTML fragment.

    Behaviour depends on ``args.imgsrc``:

    * ``'remote'``: a ``src`` that does not start with "http" is prefixed
      with ``https://hub.xpub.nl``; other URLs are kept.
    * otherwise (``'archive'``): the image is looked up in ``images_info``
      under ``File:<alt text>`` and ``src`` becomes ``./images/<filename>``,
      i.e. relative to the directory holding the dumped HTML files.

    In both modes ``srcset``, ``width`` and ``height`` are blanked, and the
    links wrapping images are then redirected by ``rewriteimglinks``.

    Returns the fragment serialized back to an HTML string. If an image is
    not listed in ``images_info`` a hint is printed and the script exits with
    status 1.
    """
    t = html5lib.parseFragment(html, treebuilder = "etree", namespaceHTMLElements = False)

    for img in t.findall(".//img[@src]"):
        if args.imgsrc == 'remote':
            src = img.attrib.get("src")
            if not src.startswith('http'):
                img.attrib['src'] = 'https://hub.xpub.nl' + src
        else:  # archive imgsrc
            # relies on the alt text being the wiki file name
            img_alt = img.attrib.get("alt")
            img_page = f'File:{img_alt}'  # key of the image in images.json
            try:
                # get its filename
                img_filename = images_info[img_page]['filename']
            except KeyError:
                print(Colors.WARNING, f"{img_page} is missing from the local downloaded images")
                print(Colors.GREEN, 'run python3 download_imgs.py to fix the issue', Colors.ENDC)
                # non-zero status: this is a failure, not a clean exit
                sys.exit(1)
            # images/ sits in the same dir as the HTML files
            img.attrib['src'] = f'./images/{img_filename}'

        img.attrib['srcset'] = ""  # blank srcset: it prevents imgs displaying
        img.attrib['width'] = ""
        img.attrib['height'] = ""

    t = rewriteimglinks(tree=t)

    html = ET.tostring(t, method="html", encoding="unicode")
    return html

def dumppage(p, template, rewrite_images=True):
    """Render wiki page *p* through *template* and write it to the output dir.

    The page's parsed HTML is fetched with ``site.parse``, its wiki links are
    rewritten with ``rewritelinks`` and, when *rewrite_images* is true, its
    images with ``rewriteimgs``. The template receives ``page``, ``body`` and
    ``staticpath``. The result is written to ``args.output`` under the name
    given by ``filenameforpage(p)``.
    """
    htmlsrc = site.parse(page=p.name)['text']['*']
    htmlsrc = rewritelinks(htmlsrc)
    if rewrite_images:
        htmlsrc = rewriteimgs(htmlsrc)
    # TODO: Andre: structure of archive: from ./archive/0 to: ./archive ./0
    html = template.render(page=p, body=htmlsrc, staticpath=f'../{wd_name}')
    outpath = os.path.join(args.output, filenameforpage(p))
    # Explicit UTF-8: page content is arbitrary Unicode, and the platform
    # default encoding is not guaranteed to be able to represent it.
    with open(outpath, 'w', encoding='utf-8') as f:
        f.write(html)

def _readtemplate(path):
    """Load the file at *path* as a jinja2 Template."""
    with open(path) as templatefile:
        return Template(templatefile.read())


# Walk the members of the 'Publish' category: a member that is itself a
# category has its member pages dumped with that category's template, any
# other member is dumped directly with the default template.
publish = site.Categories['Publish']
for cat in publish.members():
    if cat.namespace != NS_CATEGORY:
        print("Dumping page {}".format(cat.page_title))
        template = _readtemplate('templates/default.html')
        dumppage(cat, template, rewrite_images=not args.skipimages)
        continue

    print('dumping category {}'.format(cat.page_title))
    try:
        # template named after the category, lower-cased
        template = _readtemplate('templates/{}.html'.format(cat.page_title.lower()))
    except FileNotFoundError:
        # no dedicated template for this category
        template = _readtemplate('templates/default.html')
    for p in cat.members():
        print(p)
        dumppage(p, template, rewrite_images=not args.skipimages)
        if args.one:
            # --one: stop after the first page of the category
            break