special-issue-11-wiki2html/dumpwiki.py

import os, json, sys
from mwclient import Site
from jinja2 import Template
from shutil import copy
import html5lib
from functions import Colors
import argparse
from xml.etree import ElementTree as ET
from urllib.parse import quote as urlquote, unquote as urlunquote


NS_MAIN = 0
NS_CATEGORY = 14

# Command-line interface. Parsed once at import time; the resulting ``args``
# namespace is read by the rest of the script.
p = argparse.ArgumentParser(
    description="Dump wiki files to html",
    formatter_class=argparse.ArgumentDefaultsHelpFormatter,
)
p.add_argument(
    "--host",
    metavar='',
    default="hub.xpub.nl/sandbox",
    help='wiki host',
)
p.add_argument(
    "--path",
    metavar='',
    default="/itchwiki/",
    help="Wiki path. Should end with /",
)
p.add_argument(
    "--output",
    default="../archive",
    help="Output path for pages",
)
p.add_argument(
    "--one",
    action="store_true",
    default=False,
    help="Output one page from each category only",
)
p.add_argument(
    "--skipimages",
    action="store_true",
    default=False,
    help="Don't do images (for testing)",
)
p.add_argument(
    "--imgsrc",
    choices=['archive', 'remote'],
    default='archive',
    help="What is the source of the images?",
)

args = p.parse_args()
print(args)  # echo the effective options for the run log
# site and login

site = Site(host=args.host, path=args.path)
wd = os.path.dirname(os.path.abspath(__file__))  # directory containing this script
wd_name = os.path.split(wd)[-1] # name of dir running script

# Copy every file in ./static into <output>/static so the dumped pages can
# reference their CSS/JS next to them.
# NOTE(review): './static' and 'login.txt' are resolved against the current
# working directory, while images.json below is resolved against the script
# directory -- confirm the script is always run from its own directory.
repo_static_path = './static'
archive_static_path = os.path.join(args.output, repo_static_path)
os.makedirs(archive_static_path, exist_ok=True) # create static/ dir in archive
for static_file in os.listdir(path=repo_static_path):
    copy(src=os.path.join(repo_static_path, static_file),
         dst=os.path.join(archive_static_path, static_file))


# login.txt holds the user name on the first line and the password on the
# second. Only the first two lines are used, so a trailing newline (or any
# further lines) no longer breaks the unpacking; a file with fewer than two
# lines still raises ValueError.
with open('login.txt', 'r') as login:  # read login user & pwd
    user, pwd = login.read().splitlines()[:2]
site.login(username=user, password=pwd)  # login to wiki

# images_info is only defined when images are processed; rewriteimgs() is the
# only consumer and is not called when --skipimages is given.
if not args.skipimages:
    imgsjson_fn = os.path.join(wd, 'images.json') # read images.json file
    with open(imgsjson_fn, 'r') as imgsjson_file:
        images_info = json.load(imgsjson_file)


# U+2044 FRACTION SLASH: replaces "/" in page titles so that a title such as
# "Parent/Child" maps to one flat file name instead of a nested path.
SLASH = "\u2044"


def filenameforpage(p):
    """Return the output file name for page *p*.

    Spaces in ``p.name`` become underscores, "/" becomes SLASH, and
    ".html" is appended.
    """
    return p.name.replace(' ', '_').replace('/', SLASH) + '.html'


def filenameforlink(href):
    """Translate a wiki link target into a link to the dumped HTML file.

    The path part of *href* is percent-decoded; when it starts with the
    wiki's page prefix, the prefix is stripped and the remaining title is
    turned into a file name the same way ``filenameforpage`` does. The
    result is percent-encoded again.

    A ``#fragment`` is split off first and re-attached unchanged after the
    file name, so ``.../Page#Section`` becomes ``Page.html#Section`` rather
    than having the fragment folded into the file name.
    """
    wiki_prefix = "/sandbox/itchwiki/index.php/"
    # Split on the raw href: a literal '#' always starts the fragment, while
    # an encoded '%23' stays part of the path.
    path, hash_sep, fragment = href.partition('#')
    path = urlunquote(path)
    if path.startswith(wiki_prefix):
        path = path[len(wiki_prefix):]
        path = path.replace(' ', '_').replace('/', SLASH) + '.html'
    return urlquote(path) + hash_sep + fragment


def rewriteimglinks(tree, page):
    """Rewrite the links that wrap images and return *tree*.

    Meant to be invoked after the img ``src`` values have been rewritten.

    * On the page named 'Overview main page', each ``div.tooltip`` whose
      image anchor (``.//div/a`` containing an ``<img>``) and publication
      link (``.//p/span/a`` with an ``href``) are both present gets the
      image anchor pointed at the publication link's target. Tooltips
      missing either piece are left untouched instead of raising.
    * On every other page, ``a.image`` anchors that wrap an ``<img>`` have
      their ``href`` disabled, removing the link to the wiki File page.

    The tree is modified in place; the same object is returned.
    """
    if page.name == 'Overview main page':
        for div_parent in tree.findall(".//div[@class='tooltip']"):
            anchor_of_img = div_parent.find(".//div/a")
            # <a> must exist and needs a child <img>
            if anchor_of_img is None or anchor_of_img.find(".//img") is None:
                continue
            a_tag = div_parent.find(".//p/span/a")
            if a_tag is None:
                continue
            publication_href = a_tag.attrib.get('href')
            if publication_href is None:
                # Assigning None would make the tree unserialisable.
                continue
            anchor_of_img.attrib['href'] = publication_href
    else:
        for a in tree.findall(".//a[@class='image']"):  # select img wrapping a
            if a.findall(".//img"):  # ensure a has child: img
                a.attrib['href'] = 'javascript:void(0);'  # disable href
    return tree

def rewritelinks(html):
    """Return *html* with internal wiki links pointing at dumped files.

    Every element carrying an ``href`` is inspected. Elements whose class
    contains "external" are left alone; hrefs that start with the wiki page
    prefix are replaced by the result of ``filenameforlink``. The fragment
    is then serialised back to a string.
    """
    wiki_prefix = "/sandbox/itchwiki/index.php/"
    fragment = html5lib.parseFragment(
        html,
        treebuilder="etree",
        namespaceHTMLElements=False,
    )
    for element in fragment.findall(".//*[@href]"):
        attributes = element.attrib
        target = attributes.get("href")
        is_external = "external" in attributes.get("class", "")
        if not is_external and target.startswith(wiki_prefix):
            attributes['href'] = filenameforlink(target)
    return ET.tostring(fragment, method="html", encoding="unicode")


def rewriteimgs(html, page):
    """Return *html* with image sources and image links rewritten.

    For each ``<img>`` that has a ``src``:

    * with ``--imgsrc remote`` the URL is kept, and prefixed with
      ``https://hub.xpub.nl`` when it does not already start with "http";
    * otherwise (archive) the ``alt`` text is taken as the file name, looked
      up as ``File:<alt>`` in ``images_info``, and ``src`` is set to
      ``./images/<filename>`` (relative to the dumped HTML files).

    ``srcset``, ``width`` and ``height`` are blanked on every such image.
    Afterwards ``rewriteimglinks`` adjusts the anchors wrapping images.

    Exits the program with status 1 when an image is not listed in
    ``images_info``.
    """
    t = html5lib.parseFragment(html, treebuilder = "etree", namespaceHTMLElements = False)

    for img in t.findall(".//img[@src]"):
        if args.imgsrc == 'remote':
            src = img.attrib.get("src")
            if not src.startswith('http'):
                img.attrib['src'] = 'https://hub.xpub.nl' + src
        else:  # archive imgsrc
            img_alt = img.attrib.get("alt")  # alt property has filename
            img_page = f'File:{img_alt}' # key of the image in images.json
            try:
                # get its filename
                img_filename = images_info[img_page]['filename']
            except KeyError:
                print(Colors.WARNING, f"{img_page} is missing from the local downloaded images")
                print(Colors.GREEN, 'run python3 download_imgs.py to fix the issue', Colors.ENDC)
                # Non-zero status so callers can tell the dump was aborted.
                sys.exit(1)
            # same dir as HTML files: archive/
            img.attrib['src'] = f'./images/{img_filename}'

        img.attrib['srcset'] = ""  # rm srcset value: it prevents imgs displaying
        img.attrib['width'] = ""
        img.attrib['height'] = ""

    t = rewriteimglinks(tree=t, page=page)

    html = ET.tostring(t, method="html", encoding="unicode")
    return html

def dumppage(p, template, rewrite_images=True):
    """Render wiki page *p* through *template* and write it to the output dir.

    The page's parsed text is fetched via ``site.parse``, its internal links
    are rewritten, and (when *rewrite_images* is true) its images are
    rewritten as well. The template receives ``page``, ``body`` and
    ``staticpath``. The result is written to
    ``<args.output>/<filenameforpage(p)>``.
    """
    htmlsrc = site.parse(page=p.name)['text']['*']
    htmlsrc = rewritelinks(htmlsrc)
    if rewrite_images:
        htmlsrc = rewriteimgs(html=htmlsrc, page=p)
    html = template.render(page=p, body=htmlsrc, staticpath='.')
    # Explicit UTF-8 so non-ASCII page content is written the same way on
    # every platform instead of depending on the locale's default encoding.
    with open(os.path.join(args.output, filenameforpage(p)), 'w', encoding='utf-8') as f:
        f.write(html)

def _read_template(path):
    """Load the file at *path* and return it as a jinja2 Template."""
    with open(path) as templatefile:
        return Template(templatefile.read())


# Walk the members of Category:Publish. Category members have all their
# pages dumped; any other member is dumped directly as a single page.
publish = site.Categories['Publish']
for cat in publish.members():
    if cat.namespace != NS_CATEGORY:
        print("Dumping page {}".format(cat.page_title))
        template = _read_template('templates/default.html')
        dumppage(cat, template, rewrite_images=not args.skipimages)
        continue

    print('dumping category {}'.format(cat.page_title))
    # A template named after the category (lower-cased) takes precedence;
    # fall back to the default template when no such file exists.
    try:
        template = _read_template('templates/{}.html'.format(cat.page_title.lower()))
    except FileNotFoundError:
        template = _read_template('templates/default.html')
    for member in cat.members():
        print(member)
        dumppage(member, template, rewrite_images=not args.skipimages)
        if args.one:
            # --one: only the first page of each category
            break
images: remote, local, archoive 4 years ago			`import os, json, sys`
dumpwiki 4 years ago			`from mwclient import Site`
			`from jinja2 import Template`
dumpwiki.py copy ./static to ../archive/static and points to it on templates 4 years ago			`from shutil import copy`
rewritelinks starting 4 years ago			`import html5lib`
dumpwiki 4 years ago			`from functions import Colors`
			`import argparse`
something 4 years ago			`from xml.etree import ElementTree as ET`
urlquote/unquote for link rewriting 4 years ago			`from urllib.parse import quote as urlquote, unquote as urlunquote`

dumpwiki 4 years ago
dump (main) pages tagged with Category:Publish as well, added --skipimages option for testing 4 years ago			`NS_MAIN = 0`
			`NS_CATEGORY = 14`

dumpwiki 4 years ago			`p = argparse.ArgumentParser(description="Dump wiki files to html",`
			`formatter_class=argparse.ArgumentDefaultsHelpFormatter)`
			`p.add_argument("--host", metavar='', default="hub.xpub.nl/sandbox", help='wiki host')`
			`p.add_argument("--path", metavar='', default="/itchwiki/", help="Wiki path. Should end with /")`
removed --local 4 years ago			`p.add_argument("--output", default="../archive", help="Output path for pages")`
something 4 years ago			`p.add_argument("--one", default=False, action="store_true", help="Output one page from each category only")`
dump (main) pages tagged with Category:Publish as well, added --skipimages option for testing 4 years ago			`p.add_argument("--skipimages", default=False, action="store_true", help="Don't do images (for testing)")`
images: remote, local, archoive 4 years ago			`p.add_argument("--imgsrc", default='archive',`
removed --local 4 years ago			`choices=['archive', 'remote'],`
images: remote, local, archoive 4 years ago			`help="What is the source of the images?")`
dumpwiki 4 years ago
			`args = p.parse_args()`
added var staticpath to templates to allow JS/CSS files to be reached with running on --local or archive 4 years ago			`print(args)`
dumpwiki 4 years ago			`# site and login`

			`site = Site(host=args.host, path=args.path)`
images: remote, local, archoive 4 years ago			`wd = os.path.dirname(os.path.abspath(__file__)) # working directory`
removed --local 4 years ago			`wd_name = os.path.split(wd)[-1] # name of dir running script`
dumpwiki.py copy ./static to ../archive/static and points to it on templates 4 years ago
			`# copy static/ to ../archive/static`
			`repo_static_path = './static'`
			`archive_static_path = os.path.join(args.output, repo_static_path)`
			`os.makedirs(archive_static_path, exist_ok=True) # create static/ dir in archive`
			`for static_file in os.listdir(path='./static'):`
			`copy(src=os.path.join(repo_static_path, static_file),`
			`dst=os.path.join(archive_static_path, static_file))`


dumpwiki 4 years ago			`with open('login.txt', 'r') as login: # read login user & pwd`
			`loginlines = login.read()`
			`user, pwd = loginlines.split('\n')`
			`site.login(username=user, password=pwd) # login to wiki`

dump (main) pages tagged with Category:Publish as well, added --skipimages option for testing 4 years ago			`if not args.skipimages:`
			`imgsjson_fn = os.path.join(wd, 'images.json') # read images.json file`
			`with open(imgsjson_fn, 'r') as imgsjson_file:`
			`images_info = json.load(imgsjson_file)`
publish cat 4 years ago
dumpwiki 4 years ago
fix slash 4 years ago			`SLASH = "\u2044"`
replacing hyphen 4 years ago
fix slash 4 years ago
dumpwiki 4 years ago			`def filenameforpage(p):`
something 4 years ago			`f = p.name.replace(' ','_').replace('/', SLASH) + '.html'`
dumpwiki 4 years ago			`return f`

filenameforhref 4 years ago			`def filenameforlink(href):`
urlquote/unquote for link rewriting 4 years ago			`href = urlunquote(href)`
filenameforhref 4 years ago			`if href.startswith("/sandbox/itchwiki/index.php/"):`
			`href = href[len("/sandbox/itchwiki/index.php/"):]`
removing unecessary hyphen replacement (which i introduced) 4 years ago			`href = href.replace(' ','_').replace('/', SLASH) + '.html'`
urlquote/unquote for link rewriting 4 years ago			`href = urlquote(href)`
filenameforhref 4 years ago			`return href`

disabeling links to wiki File pages 4 years ago
Overview main page: image as links (wip) 4 years ago			`def rewriteimglinks(tree, page):`
Overview main page: img link to publication pages 4 years ago			`# invoke after img src has been rewritten`
			`# To: remove links to wiki File on all pages`
			`# but Overview_main_page page where link to publication page is added`
			`if page.name == 'Overview main page':`
			`for div_parent in tree.findall(".//div[@class='tooltip']"):`
			`anchor_of_img = div_parent.find(".//div/a")`
			`if anchor_of_img.find(".//img") is not None: # <a> needs child <img>`
			`a_tag = div_parent.find(".//p/span/a")`
			`publication_href = a_tag.attrib.get('href')`
			`anchor_of_img.attrib['href'] = publication_href`
			`else:`
			`for a in tree.findall(".//a[@class='image']"): # select img wrapping a`
			`if a.findall(".//img"): # ensure a has child: img`
Overview main page: image as links (wip) 4 years ago			`a.attrib['href'] = 'javascript:void(0);' # disable href`
image parent links: link to image file 4 years ago			`return tree`
disabeling links to wiki File pages 4 years ago
Overview main page: image as links (wip) 4 years ago			`def rewritelinks(html):`
image parent links: link to image file 4 years ago			`t = html5lib.parseFragment(html, treebuilder = "etree", namespaceHTMLElements = False)`
rewritelinks starting 4 years ago			`for a in t.findall(".//*[@href]"):`
			`linkclass = a.attrib.get("class", "")`
			`href = a.attrib.get("href")`
			`if "external" in linkclass:`
			`# leave external links alone`
			`continue`
href 4 years ago			`# print ("LINK", href)`
something 4 years ago			`if href.startswith("/sandbox/itchwiki/index.php/"):`
href 4 years ago			`new_href = filenameforlink(href)`
			`a.attrib['href'] = new_href`
added back missing ET.tostring in rewritelinks 4 years ago			`html = ET.tostring(t, method="html", encoding="unicode")`
something 4 years ago			`return html`
rewritelinks starting 4 years ago
removing imgs srcset value 4 years ago
Overview main page: image as links (wip) 4 years ago			`def rewriteimgs(html, page):`
removing imgs srcset value 4 years ago			`t = html5lib.parseFragment(html, treebuilder = "etree", namespaceHTMLElements = False)`
images: remote, local, archoive 4 years ago
			`# replace images url with local image in ../images`
			`for img in t.findall(".//img[@src]"):`
			`# imgsrc can be:`
			`# remote: url remains`
			`# archive f' images/{img_filename}'`
			`# local: f'../../images/{img_filename}'`

			`if args.imgsrc == 'remote':`
			`src = img.attrib.get("src")`
			`if not src.startswith('http'):`
			`img.attrib['src'] = 'https://hub.xpub.nl' + src`
			`else: # local / archive imgsrc`
			`img_alt = img.attrib.get("alt") # alt property has filename`
			`img_page = f'File:{img_alt}' # find image it images.json`
			`try:`
			`# get its filename`
			`img_filename = images_info[img_page]['filename']`
			`except KeyError:`
removed uncessary reference to printout_dict 4 years ago			`print(Colors.WARNING, f"{img_page} is not is missing from the local downloaded images")`
images: remote, local, archoive 4 years ago			`print(Colors.GREEN, 'run python3 download_imgs.py to fix the issue', Colors.ENDC)`
			`sys.exit()`
removed --local 4 years ago			`# same dir as HTML files: archive/`
			`img.attrib['src'] = f'./images/{img_filename}'`
images: remote, local, archoive 4 years ago
			`img.attrib['srcset'] = "" # rm srcset value:it prevent imgs displaying`
			`img.attrib['width'] = ""`
			`img.attrib['height'] = ""`
image parent links: link to image file 4 years ago
Overview main page: image as links (wip) 4 years ago			`t = rewriteimglinks(tree=t, page=page)`
image parent links: link to image file 4 years ago
removing imgs srcset value 4 years ago			`html = ET.tostring(t, method="html", encoding="unicode")`
			`return html`

dump (main) pages tagged with Category:Publish as well, added --skipimages option for testing 4 years ago			`def dumppage(p, template, rewrite_images=True):`
			`htmlsrc = site.parse(page=p.name)['text']['*']`
			`htmlsrc = rewritelinks(htmlsrc)`
			`if rewrite_images:`
Overview main page: image as links (wip) 4 years ago			`htmlsrc = rewriteimgs(html=htmlsrc, page=p)`
dumpwiki.py copy ./static to ../archive/static and points to it on templates 4 years ago			`html = template.render(page=p, body=htmlsrc, staticpath='.')`
dump (main) pages tagged with Category:Publish as well, added --skipimages option for testing 4 years ago			`with open(os.path.join(args.output, filenameforpage(p)), 'w') as f:`
			`f.write(html)`
			`# print(html, file=f)`

publish cat 4 years ago			`publish=site.Categories['Publish']`
			`for cat in publish.members():`
dump (main) pages tagged with Category:Publish as well, added --skipimages option for testing 4 years ago			`if cat.namespace == NS_CATEGORY:`
			`print('dumping category {}'.format(cat.page_title))`
			`# title=site.Categories['Title']`
			`try:`
			`with open('templates/{}.html'.format(cat.page_title.lower())) as templatefile:`
			`template = Template(templatefile.read())`
			`except FileNotFoundError:`
			`with open('templates/default.html') as templatefile:`
dumpwiki.py copy ./static to ../archive/static and points to it on templates 4 years ago			`template = Template(templatefile.read())`
dump (main) pages tagged with Category:Publish as well, added --skipimages option for testing 4 years ago			`for p in cat.members():`
			`print(p)`
			`dumppage(p, template, rewrite_images=not args.skipimages)`
			`if args.one:`
			`break`
			`else:`
			`print("Dumping page {}".format(cat.page_title))`
publish cat 4 years ago			`with open('templates/default.html') as templatefile:`
dumpwiki.py copy ./static to ../archive/static and points to it on templates 4 years ago			`template = Template(templatefile.read())`
dump (main) pages tagged with Category:Publish as well, added --skipimages option for testing 4 years ago			`dumppage(cat, template, rewrite_images=not args.skipimages)`
dumpwiki 4 years ago