special-issue-11-wiki2html/dumpwiki.py

import os, json, sys
from mwclient import Site
from jinja2 import Template
import html5lib
from functions import Colors
import argparse
from xml.etree import ElementTree as ET
from urllib.parse import quote as urlquote, unquote as urlunquote


# Command-line interface: which wiki to dump, where the pages are written and
# how image sources are rewritten.
p = argparse.ArgumentParser(
    description="Dump wiki files to html",
    formatter_class=argparse.ArgumentDefaultsHelpFormatter,
)
p.add_argument(
    "--host",
    metavar='',
    default="hub.xpub.nl/sandbox",
    help='wiki host',
)
p.add_argument(
    "--path",
    metavar='',
    default="/itchwiki/",
    help="Wiki path. Should end with /",
)
p.add_argument(
    "--output",
    default="../archive",
    help="Output path for pages",
)
p.add_argument(
    "--one",
    default=False,
    action="store_true",
    help="Output one page from each category only",
)
p.add_argument(
    "--imgsrc",
    default='archive',
    choices=['archive', 'remote'],
    help="What is the source of the images?",
)

args = p.parse_args()
print(args)  # echo the effective settings for this run
# site and login

site = Site(host=args.host, path=args.path)
wd = os.path.dirname(os.path.abspath(__file__))  # directory containing this script
wd_name = os.path.split(wd)[-1]  # name of that directory

# login.txt holds the user name on its first line and the password on the
# second. splitlines() is used instead of split('\n') so that a trailing
# newline or Windows line endings do not break the two-value unpacking.
with open('login.txt', 'r') as login:  # read login user & pwd
    loginlines = login.read()
try:
    user, pwd = loginlines.splitlines()[:2]
except ValueError:
    sys.exit('login.txt must contain the user name on the first line '
             'and the password on the second line')
site.login(username=user, password=pwd)  # login to wiki

# images.json sits next to this script; it is indexed below by
# 'File:<name>' page titles, each entry carrying a 'filename' key.
imgsjson_fn = os.path.join(wd, 'images.json')
with open(imgsjson_fn, 'r') as imgsjson_file:
    images_info = json.load(imgsjson_file)


# Stand-in characters used when turning wiki page titles into file names.
SLASH = "\N{FRACTION SLASH}"  # U+2044, substituted for "/" in page titles
HYPHEN = "\N{HYPHEN}"  # U+2010, substituted for hyphens in link targets


def filenameforpage(p):
    """Return the HTML file name under which wiki page *p* is saved.

    Spaces in ``p.name`` become underscores and every "/" becomes SLASH,
    then the ".html" extension is appended.
    """
    safe_name = p.name.replace(' ', '_')
    safe_name = safe_name.replace('/', SLASH)
    return f'{safe_name}.html'

def filenameforlink(href):
    """Translate a wiki-internal link into the file name of the dumped page.

    An href whose path starts with "/sandbox/itchwiki/index.php/" is turned
    into "<Page_name>.html", using the same substitutions as the dumped file
    names (spaces -> "_", "/" -> SLASH, hyphen -> HYPHEN), and then
    URL-quoted. A "#fragment" is kept verbatim after the file name so that
    links to page sections keep working.

    Any other href (external URL, query-style wiki link, bare "#anchor") is
    returned unchanged: pushing it through unquote/quote would escape
    characters such as ":" and "?" and break the link.

    :param href: value of an ``href`` attribute, possibly percent-encoded.
    :return: the rewritten href, or *href* itself when it is not a wiki link.
    """
    prefix = "/sandbox/itchwiki/index.php/"
    # Split the fragment off first so it is neither unquoted nor made part of
    # the file name.
    path, hash_sep, fragment = href.partition('#')
    path = urlunquote(path)
    if not path.startswith(prefix):
        return href
    name = path[len(prefix):]
    # NOTE(review): confirm which hyphen character the literal below is; if
    # it is already U+2010 this last replace() is a no-op.
    name = name.replace(' ','_').replace('/', SLASH).replace('‐', HYPHEN) + '.html'
    return urlquote(name) + hash_sep + fragment

def rewritelinks (html):
    """Point wiki-internal links in *html* at the dumped HTML files.

    The fragment is parsed with html5lib. Every element carrying an ``href``
    that starts with "/sandbox/itchwiki/index.php/" gets that href replaced
    by the result of ``filenameforlink``. Elements whose ``class`` contains
    "external" are left alone.

    The modified tree is serialized and returned. Returning the input string
    instead would silently throw away every rewritten link.

    :param html: HTML fragment as a string.
    :return: the fragment, re-serialized as HTML, with rewritten links.
    """
    # Local import: the parameter name ``html`` shadows the stdlib module.
    from html import escape

    t = html5lib.parseFragment(html, treebuilder = "etree", namespaceHTMLElements = False)
    for a in t.findall(".//*[@href]"):
        linkclass = a.attrib.get("class", "")
        if "external" in linkclass:
            # leave external links alone
            continue
        href = a.attrib.get("href")
        if href.startswith("/sandbox/itchwiki/index.php/"):
            a.attrib['href'] = filenameforlink(href)

    # Serialize only the fragment's content (leading text plus each child,
    # whose trailing text ElementTree emits with it), not the wrapper element
    # html5lib returns, because the result is parsed again by rewriteimgs.
    pieces = [escape(t.text, quote=False)] if t.text else []
    pieces.extend(ET.tostring(child, method="html", encoding="unicode") for child in t)
    return "".join(pieces)


def rewriteimgs(html):
    """Rewrite the ``src`` of every image in *html* according to ``args.imgsrc``.

    * ``remote``: a src that does not start with "http" is prefixed with
      "https://hub.xpub.nl"; other srcs are kept.
    * otherwise (``archive``): the image is looked up in ``images_info``
      under the key ``File:<alt text>`` and its src becomes
      ``./images/<filename>``. If the image is not listed, a hint is printed
      and the script exits with a non-zero status.

    In both modes ``srcset``, ``width`` and ``height`` are set to empty
    strings.

    :param html: HTML fragment as a string.
    :return: the fragment serialized by ElementTree with the rewritten images.
    """
    t = html5lib.parseFragment(html, treebuilder = "etree", namespaceHTMLElements = False)

    for img in t.findall(".//img[@src]"):
        if args.imgsrc == 'remote':
            # keep the remote image, but make the URL absolute
            src = img.attrib.get("src")
            if not src.startswith('http'):
                img.attrib['src'] = 'https://hub.xpub.nl' + src
        else:  # archive imgsrc
            img_alt = img.attrib.get("alt")  # alt property has filename
            img_page = f'File:{img_alt}'  # key of the image in images.json
            try:
                # get its filename
                img_filename = images_info[img_page]['filename']
            except KeyError:
                # Name the missing image itself; the previous message read an
                # undefined variable and crashed with a NameError.
                print(Colors.WARNING, f"{img_page} is missing from the local downloaded images")
                print(Colors.GREEN, 'run python3 download_imgs.py to fix the issue', Colors.ENDC)
                sys.exit(1)
            # same dir as HTML files: archive/
            img.attrib['src'] = f'./images/{img_filename}'

        img.attrib['srcset'] = ""  # blank srcset: it prevents imgs displaying
        img.attrib['width'] = ""
        img.attrib['height'] = ""
    html = ET.tostring(t, method="html", encoding="unicode")
    return html

# Dump every page of every sub-category of the wiki's "Publish" category.
publish = site.Categories['Publish']

# Create the output directory up front so the first write cannot fail on a
# fresh checkout.
os.makedirs(args.output, exist_ok=True)

for cat in publish.members():
    if cat.namespace != 14:
        # namespace 14 is MediaWiki's Category namespace; skip everything else
        continue
    print('dumping category {}'.format(cat.page_title))
    # title=site.Categories['Title']
    # Use the category's own template when one exists, else the default one.
    # Files are read and written as UTF-8 explicitly: the generated names and
    # the page content are not ASCII, and the platform default may not be UTF-8.
    try:
        with open('templates/{}.html'.format(cat.page_title.lower()), encoding='utf-8') as templatefile:
            template = Template(templatefile.read())
    except FileNotFoundError:
        with open('templates/default.html', encoding='utf-8') as templatefile:
            template = Template(templatefile.read())
    for p in cat.members():
        print(p)
        htmlsrc = site.parse(page=p.name)['text']['*']
        htmlsrc = rewritelinks(htmlsrc)
        htmlsrc = rewriteimgs(htmlsrc)

        # TODO: Andre: structure of archive: from ./archive/0 to: ./archive ./0

        html = template.render(page=p, body=htmlsrc, staticpath=f'../{wd_name}')

        with open(os.path.join(args.output, filenameforpage(p)), 'w', encoding='utf-8') as f:
            f.write(html)
        if args.one:
            # --one: only the first page of each category is dumped
            break
-												images: remote, local, archoive

											
										
										
											4 years ago
+								import os, json, sys
-												dumpwiki

											
										
										
											4 years ago
+								from mwclient import Site
 								from jinja2 import Template
-												rewritelinks starting

											
										
										
											4 years ago
+								import html5lib
-												dumpwiki

											
										
										
											4 years ago
+								from functions import Colors
 								import argparse
-												something

											
										
										
											4 years ago
+								from xml.etree import ElementTree as ET
-												urlquote/unquote for link rewriting

											
										
										
											4 years ago
+								from urllib.parse import quote as urlquote, unquote as urlunquote
-												dumpwiki

											
										
										
											4 years ago
 								p = argparse.ArgumentParser(description="Dump wiki files to html",
 								                            formatter_class=argparse.ArgumentDefaultsHelpFormatter)
 								p.add_argument("--host",  metavar='', default="hub.xpub.nl/sandbox", help='wiki host')
 								p.add_argument("--path", metavar='', default="/itchwiki/", help="Wiki path. Should end with /")
-												removed --local

											
										
										
											4 years ago
+								p.add_argument("--output", default="../archive", help="Output path for pages")
-												something

											
										
										
											4 years ago
+								p.add_argument("--one", default=False, action="store_true", help="Output one page from each category only")
-												images: remote, local, archoive

											
										
										
											4 years ago
+								p.add_argument("--imgsrc", default='archive',
-												removed --local

											
										
										
											4 years ago
+								               choices=['archive', 'remote'],
-												images: remote, local, archoive

											
										
										
											4 years ago
+								               help="What is the source of the images?")
-												dumpwiki

											
										
										
											4 years ago
 								args = p.parse_args()
-												added var staticpath to templates to allow JS/CSS files to be reached with running on --local or archive

											
										
										
											4 years ago
+								print(args)
-												dumpwiki

											
										
										
											4 years ago
+								# site and login
 								site = Site(host=args.host, path=args.path)
-												images: remote, local, archoive

											
										
										
											4 years ago
+								wd = os.path.dirname(os.path.abspath(__file__))  # working directory
-												removed --local

											
										
										
											4 years ago
+								wd_name = os.path.split(wd)[-1] # name of dir running script
-												dumpwiki

											
										
										
											4 years ago
+								with open('login.txt', 'r') as login:  # read login user & pwd
 								    loginlines = login.read()
 								    user, pwd = loginlines.split('\n')
 								    site.login(username=user, password=pwd)  # login to wiki
-												images: remote, local, archoive

											
										
										
											4 years ago
+								imgsjson_fn = os.path.join(wd, 'images.json') # read images.json file
 								with open(imgsjson_fn, 'r') as imgsjson_file:
 								    images_info = json.load(imgsjson_file)
-												publish cat

											
										
										
											4 years ago
-												dumpwiki

											
										
										
											4 years ago
-												fix slash

											
										
										
											4 years ago
+								SLASH = "\u2044"
-												writing html content to file

											
										
										
											4 years ago
+								HYPHEN = "\u2010"
-												replacing hyphen

											
										
										
											4 years ago
-												fix slash

											
										
										
											4 years ago
-												dumpwiki

											
										
										
											4 years ago
+								def filenameforpage(p):
-												something

											
										
										
											4 years ago
+								    f = p.name.replace(' ','_').replace('/', SLASH) + '.html'
-												dumpwiki

											
										
										
											4 years ago
+								    return f
-												filenameforhref

											
										
										
											4 years ago
+								def filenameforlink(href):
-												urlquote/unquote for link rewriting

											
										
										
											4 years ago
+								    href = urlunquote(href)
-												filenameforhref

											
										
										
											4 years ago
+								    if href.startswith("/sandbox/itchwiki/index.php/"):
 								        href = href[len("/sandbox/itchwiki/index.php/"):]
-												writing html content to file

											
										
										
											4 years ago
+								        href = href.replace(' ','_').replace('/', SLASH).replace('‐', HYPHEN) + '.html'
-												urlquote/unquote for link rewriting

											
										
										
											4 years ago
+								    href = urlquote(href)
-												filenameforhref

											
										
										
											4 years ago
+								    return href
-												something

											
										
										
											4 years ago
+								def rewritelinks (html):
 								    t = html5lib.parseFragment(html, treebuilder = "etree", namespaceHTMLElements = False)
-												rewritelinks starting

											
										
										
											4 years ago
+								    for a in t.findall(".//*[@href]"):
 								        linkclass = a.attrib.get("class", "")
 								        href = a.attrib.get("href")
 								        if "external" in linkclass:
 								            # leave external links alone
 								            continue
-												href

											
										
										
											4 years ago
+								        # print ("LINK", href)
-												something

											
										
										
											4 years ago
+								        if href.startswith("/sandbox/itchwiki/index.php/"):
-												href

											
										
										
											4 years ago
+								            new_href = filenameforlink(href)
 								            a.attrib['href'] = new_href
-												something

											
										
										
											4 years ago
+								    return html
-												rewritelinks starting

											
										
										
											4 years ago
-												removing imgs srcset value

											
										
										
											4 years ago
 								def rewriteimgs(html):
 								    t = html5lib.parseFragment(html, treebuilder = "etree", namespaceHTMLElements = False)
-												images: remote, local, archoive

											
										
										
											4 years ago
 								    # replace images url with local image in ../images
 								    for img in t.findall(".//img[@src]"):
 								        # imgsrc can be:
 								            # remote: url remains
 								            # archive f' images/{img_filename}'
 								            # local: f'../../images/{img_filename}'
 								        if args.imgsrc == 'remote':
 								            src = img.attrib.get("src")
 								            if not src.startswith('http'):
 								                img.attrib['src'] = 'https://hub.xpub.nl' + src
 								        else:  # local / archive imgsrc
 								            img_alt = img.attrib.get("alt")  # alt property has filename
 								            img_page = f'File:{img_alt}' # find image it images.json
 								            try:
 								                # get its filename
 								                img_filename = images_info[img_page]['filename']
 								            except KeyError:
 								                print(Colors.WARNING, f"{printout_dict['page']} is not is missing from the local downloaded images")
 								                print(Colors.GREEN, 'run python3 download_imgs.py to fix the issue', Colors.ENDC)
 								                sys.exit()
-												removed --local

											
										
										
											4 years ago
+								            # same dir as HTML files: archive/
 								            img.attrib['src'] = f'./images/{img_filename}'
-												images: remote, local, archoive

											
										
										
											4 years ago
 								        img.attrib['srcset'] = ""  # rm srcset value:it prevent imgs displaying
 								        img.attrib['width'] = ""
 								        img.attrib['height'] = ""
-												removing imgs srcset value

											
										
										
											4 years ago
+								    html = ET.tostring(t, method="html", encoding="unicode")
 								    return html
-												publish cat

											
										
										
											4 years ago
+								publish=site.Categories['Publish']
 								for cat in publish.members():
-												something

											
										
										
											4 years ago
+								    if cat.namespace != 14:
-												publish cat

											
										
										
											4 years ago
+								        continue
 								    print('dumping category {}'.format(cat.page_title))
 								    # title=site.Categories['Title']
 								    try:
 								        with open('templates/{}.html'.format(cat.page_title.lower())) as templatefile:
 								            template = Template(templatefile.read())
 								    except FileNotFoundError:
 								        with open('templates/default.html') as templatefile:
 								            template = Template(templatefile.read())
 								    for p in cat.members():
 								        print(p)
 								        htmlsrc = site.parse(page=p.name)['text']['*']
-												something

											
										
										
											4 years ago
+								        htmlsrc = rewritelinks(htmlsrc)
-												removing imgs srcset value

											
										
										
											4 years ago
+								        htmlsrc = rewriteimgs(htmlsrc)
-												removed --local

											
										
										
											4 years ago
+								        # TODO: ANdre structure of archive: from ./archive/0 to: ./archive ./0
 								        html = template.render(page=p, body=htmlsrc, staticpath=f'../{wd_name}')
-												added var staticpath to templates to allow JS/CSS files to be reached with running on --local or archive

											
										
										
											4 years ago
-												publish cat

											
										
										
											4 years ago
+								        with open(os.path.join(args.output, filenameforpage(p)), 'w') as f:
-												writing html content to file

											
										
										
											4 years ago
+								            f.write(html)
 								            # print(html, file=f)
-												something

											
										
										
											4 years ago
+								        if args.one:
 								            break
-												dumpwiki

											
										
										
											4 years ago