special-issue-11-wiki2html/dumpwiki.py

import os, json, sys, urllib
from mwclient import Site
from pprint import pprint
from jinja2 import Template
from functions import unpack_response, clean_dir, remove_nonwords
import html5lib
from functions import Colors
import argparse
from xml.etree import ElementTree as ET
from urllib.parse import quote as urlquote, unquote as urlunquote


# Command-line interface. ArgumentDefaultsHelpFormatter makes --help show the
# default value of every option next to its help text.
# NOTE: the module-level name `p` is rebound later by the page loop
# (`for p in cat.members()`), so it only refers to the parser up to that point.
p = argparse.ArgumentParser(description="Dump wiki files to html",
                            formatter_class=argparse.ArgumentDefaultsHelpFormatter)
# --host / --path are handed to mwclient's Site() to locate the wiki.
p.add_argument("--host",  metavar='', default="hub.xpub.nl/sandbox", help='wiki host')
p.add_argument("--path", metavar='', default="/itchwiki/", help="Wiki path. Should end with /")
# Directory the rendered .html pages are written into.
p.add_argument("--output", default="/var/www/html/archive", help="Output path for pages")
# --one stops after the first page of each category.
p.add_argument("--one", default=False, action="store_true", help="Output one page from each category only")
# --local prefixes relative image URLs with the wiki's absolute URL and
# switches the `staticpath` value passed to the templates.
p.add_argument("--local", default=False, action="store_true", help="When creating a local archives. Add full URL to images")

# Parsed at import time; `args` is read as a global by rewritelinks() and by
# the dump loop at the bottom of the file.
args = p.parse_args()
print(args)  # echo the effective options for the run log
# site and login

site = Site(host=args.host, path=args.path)

# login.txt holds the wiki credentials: user name on the first line,
# password on the second.
with open('login.txt', 'r') as login:  # read login user & pwd
    loginlines = login.read()

# splitlines() instead of split('\n'): a trailing newline at the end of the
# file used to yield a third, empty item and make the two-name unpacking
# raise ValueError, and CRLF files left a stray '\r' in the credentials.
# Only the first two lines are used; fewer than two still raises ValueError.
user, pwd = loginlines.splitlines()[:2]
site.login(username=user, password=pwd)  # login to wiki

# read template files


# FRACTION SLASH (U+2044): substituted for "/" in page titles when building
# output file names.
SLASH = "\u2044"

def filenameforpage(p):
    """Return the output file name for wiki page *p*.

    Spaces in ``p.name`` become underscores, every "/" becomes SLASH, and
    ".html" is appended.
    """
    safe_title = p.name.translate(str.maketrans({' ': '_', '/': SLASH}))
    return safe_title + '.html'

def filenameforlink(href):
    """Translate a wiki href into the (URL-quoted) file name of its dumped page.

    For hrefs under "/sandbox/itchwiki/index.php/" the prefix is dropped and
    the remaining title is converted the same way filenameforpage() converts
    page names (spaces -> "_", "/" -> SLASH, ".html" appended). Other hrefs
    only go through an unquote/quote round trip.

    A "#fragment" suffix is kept verbatim and re-attached after the file
    name, so section links keep pointing at "<page>.html#<section>" instead
    of the fragment being folded into the file name and percent-encoded.
    """
    wiki_prefix = "/sandbox/itchwiki/index.php/"
    # Split before unquoting so that an encoded "%23" inside a page title is
    # not mistaken for the anchor separator.
    path, hash_sep, fragment = href.partition('#')
    path = urlunquote(path)
    if path.startswith(wiki_prefix):
        path = path[len(wiki_prefix):]
        path = path.replace(' ', '_').replace('/', SLASH) + '.html'
    # hash_sep and fragment are both "" when the href had no anchor.
    return urlquote(path) + hash_sep + fragment

def rewritelinks (html):
    """Point internal wiki links in an HTML fragment at the dumped files.

    Every element carrying an href is inspected: elements whose class
    contains "external" are skipped, and hrefs under
    "/sandbox/itchwiki/index.php/" are replaced by filenameforlink(href).
    When --local is set, image src values that do not start with "http" are
    additionally prefixed with "https://hub.xpub.nl".

    Returns the re-serialised fragment as a string.
    """
    fragment = html5lib.parseFragment(
        html, treebuilder="etree", namespaceHTMLElements=False)

    for node in fragment.findall(".//*[@href]"):
        if "external" in node.get("class", ""):
            # leave external links alone
            continue
        target = node.get("href")
        if target.startswith("/sandbox/itchwiki/index.php/"):
            node.set("href", filenameforlink(target))

    if args.local is True:
        for image in fragment.findall(".//img[@src]"):
            source = image.get("src")
            if source.startswith('http'):
                continue
            image.set("src", 'https://hub.xpub.nl' + source)

    return ET.tostring(fragment, method="html", encoding="unicode")


def rewriteimgs(html):
    """Blank the srcset attribute of every <img> in an HTML fragment.

    The srcset value is emptied (the attribute itself stays) because it
    prevents the images from displaying. Returns the re-serialised fragment
    as a string.
    """
    fragment = html5lib.parseFragment(
        html, treebuilder="etree", namespaceHTMLElements=False)
    for image in fragment.findall(".//img[@srcset]"):
        image.set("srcset", "")
    return ET.tostring(fragment, method="html", encoding="unicode")

# Dump every page of every sub-category of the wiki's "Publish" category.
publish = site.Categories['Publish']
for cat in publish.members():
    # Namespace 14 is MediaWiki's Category namespace; members of "Publish"
    # that are not categories themselves are skipped.
    if cat.namespace != 14:
        continue
    print('dumping category {}'.format(cat.page_title))

    # Each category may have its own template (templates/<lowercased name>.html);
    # fall back to templates/default.html when there is none.
    # Files are opened as UTF-8 explicitly so the result does not depend on
    # the locale the script happens to run under (e.g. cron with LANG=C).
    try:
        with open('templates/{}.html'.format(cat.page_title.lower()),
                  encoding='utf-8') as templatefile:
            template = Template(templatefile.read())
    except FileNotFoundError:
        with open('templates/default.html', encoding='utf-8') as templatefile:
            template = Template(templatefile.read())

    # Value the templates receive as `staticpath`; it only depends on --local,
    # so it is computed once per category rather than once per page.
    staticpath = '..' if args.local is True else '0'

    for p in cat.members():
        print(p)
        # Rendered HTML of the page body as returned by the wiki's parse API.
        htmlsrc = site.parse(page=p.name)['text']['*']
        htmlsrc = rewritelinks(htmlsrc)
        htmlsrc = rewriteimgs(htmlsrc)

        html = template.render(page=p, body=htmlsrc, staticpath=staticpath)

        # Wiki content and the file name itself (SLASH is U+2044) can contain
        # non-ASCII characters, hence the explicit encoding.
        with open(os.path.join(args.output, filenameforpage(p)), 'w',
                  encoding='utf-8') as f:
            print(html, file=f)
        if args.one:
            # --one: only the first page of each category is dumped.
            break
dumpwiki 5 years ago			`import os, json, sys, urllib`
			`from mwclient import Site`
			`from pprint import pprint`
			`from jinja2 import Template`
			`from functions import unpack_response, clean_dir, remove_nonwords`
rewritelinks starting 5 years ago			`import html5lib`
dumpwiki 5 years ago			`from functions import Colors`
			`import argparse`
something 5 years ago			`from xml.etree import ElementTree as ET`
urlquote/unquote for link rewriting 5 years ago			`from urllib.parse import quote as urlquote, unquote as urlunquote`

dumpwiki 5 years ago
			`p = argparse.ArgumentParser(description="Dump wiki files to html",`
			`formatter_class=argparse.ArgumentDefaultsHelpFormatter)`
			`p.add_argument("--host", metavar='', default="hub.xpub.nl/sandbox", help='wiki host')`
			`p.add_argument("--path", metavar='', default="/itchwiki/", help="Wiki path. Should end with /")`
			`p.add_argument("--output", default="/var/www/html/archive", help="Output path for pages")`
something 5 years ago			`p.add_argument("--one", default=False, action="store_true", help="Output one page from each category only")`
option --local; documentation 5 years ago			`p.add_argument("--local", default=False, action="store_true", help="When creating a local archives. Add full URL to images")`
dumpwiki 5 years ago
			`args = p.parse_args()`
added var staticpath to templates to allow JS/CSS files to be reached with running on --local or archive 5 years ago			`print(args)`
dumpwiki 5 years ago			`# site and login`

			`site = Site(host=args.host, path=args.path)`

			`with open('login.txt', 'r') as login: # read login user & pwd`
			`loginlines = login.read()`
			`user, pwd = loginlines.split('\n')`
			`site.login(username=user, password=pwd) # login to wiki`

			`# read template files`

publish cat 5 years ago
dumpwiki 5 years ago
fix slash 5 years ago			`SLASH = "\u2044"`

dumpwiki 5 years ago			`def filenameforpage(p):`
something 5 years ago			`f = p.name.replace(' ','_').replace('/', SLASH) + '.html'`
dumpwiki 5 years ago			`return f`

filenameforhref 5 years ago			`def filenameforlink(href):`
urlquote/unquote for link rewriting 5 years ago			`href = urlunquote(href)`
filenameforhref 5 years ago			`if href.startswith("/sandbox/itchwiki/index.php/"):`
			`href = href[len("/sandbox/itchwiki/index.php/"):]`
urlquote/unquote for link rewriting 5 years ago			`href = href.replace(' ','_').replace('/', SLASH) + '.html'`
			`href = urlquote(href)`
filenameforhref 5 years ago			`return href`

something 5 years ago			`def rewritelinks (html):`
			`t = html5lib.parseFragment(html, treebuilder = "etree", namespaceHTMLElements = False)`
rewritelinks starting 5 years ago			`for a in t.findall(".//*[@href]"):`
			`linkclass = a.attrib.get("class", "")`
			`href = a.attrib.get("href")`
			`if "external" in linkclass:`
			`# leave external links alone`
			`continue`
href 5 years ago			`# print ("LINK", href)`
something 5 years ago			`if href.startswith("/sandbox/itchwiki/index.php/"):`
href 5 years ago			`new_href = filenameforlink(href)`
			`a.attrib['href'] = new_href`
option --local; documentation 5 years ago			`if args.local is True:`
			`for img in t.findall(".//img[@src]"):`
			`src = img.attrib.get("src")`
			`if not src.startswith('http'):`
			`img.attrib['src'] = 'https://hub.xpub.nl' + src`
something 5 years ago			`html = ET.tostring(t, method="html", encoding="unicode")`
			`return html`
rewritelinks starting 5 years ago
removing imgs srcset value 5 years ago
			`def rewriteimgs(html):`
			`t = html5lib.parseFragment(html, treebuilder = "etree", namespaceHTMLElements = False)`
			`# remove the srcset value as it prevents images from displaying`
			`for img in t.findall(".//img[@srcset]"):`
			`img.attrib['srcset'] = ""`
			`html = ET.tostring(t, method="html", encoding="unicode")`
			`return html`

publish cat 5 years ago			`publish=site.Categories['Publish']`
			`for cat in publish.members():`
something 5 years ago			`if cat.namespace != 14:`
publish cat 5 years ago			`continue`
			`print('dumping category {}'.format(cat.page_title))`
			`# title=site.Categories['Title']`
			`try:`
			`with open('templates/{}.html'.format(cat.page_title.lower())) as templatefile:`
			`template = Template(templatefile.read())`
			`except FileNotFoundError:`
			`with open('templates/default.html') as templatefile:`
			`template = Template(templatefile.read())`
			`for p in cat.members():`
			`print(p)`
			`htmlsrc = site.parse(page=p.name)['text']['*']`
something 5 years ago			`htmlsrc = rewritelinks(htmlsrc)`
removing imgs srcset value 5 years ago			`htmlsrc = rewriteimgs(htmlsrc)`

added var staticpath to templates to allow JS/CSS files to be reached with running on --local or archive 5 years ago			`if args.local is True:`
			`html = template.render(page=p, body=htmlsrc, staticpath='..')`
			`else:`
			`html = template.render(page=p, body=htmlsrc, staticpath='0')`

publish cat 5 years ago			`with open(os.path.join(args.output, filenameforpage(p)), 'w') as f:`
			`print(html, file=f)`
something 5 years ago			`if args.one:`
			`break`
dumpwiki 5 years ago