special-issue-11-wiki2html/dumpwiki.py


import os, json, sys
from mwclient import Site
from jinja2 import Template
from shutil import copy
import html5lib
from functions import Colors
import argparse
from xml.etree import ElementTree as ET
from urllib.parse import quote as urlquote, unquote as urlunquote
NS_MAIN = 0
NS_CATEGORY = 14
p = argparse.ArgumentParser(description="Dump wiki files to html",
                            formatter_class=argparse.ArgumentDefaultsHelpFormatter)
p.add_argument("--host", metavar='', default="hub.xpub.nl/sandbox", help='wiki host')
p.add_argument("--path", metavar='', default="/itchwiki/", help="Wiki path. Should end with /")
p.add_argument("--output", default="../archive", help="Output path for pages")
p.add_argument("--one", default=False, action="store_true", help="Output one page from each category only")
p.add_argument("--skipimages", default=False, action="store_true", help="Don't do images (for testing)")
p.add_argument("--imgsrc", default='archive',
choices=['archive', 'remote'],
help="What is the source of the images?")
args = p.parse_args()
print(args)
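# Illustrative invocation (these are just the argparse defaults spelled out;
# adjust host/path/output to the local setup):
#   python3 dumpwiki.py --host hub.xpub.nl/sandbox --path /itchwiki/ --output ../archive --imgsrc archive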
# site and login
site = Site(host=args.host, path=args.path)
wd = os.path.dirname(os.path.abspath(__file__)) # working directory
wd_name = os.path.split(wd)[-1] # name of dir running script
# copy static/ to ../archive/static
repo_static_path = './static'
archive_static_path = os.path.join(args.output, repo_static_path)
os.makedirs(archive_static_path, exist_ok=True) # create static/ dir in archive
for static_file in os.listdir(path='./static'):
    copy(src=os.path.join(repo_static_path, static_file),
         dst=os.path.join(archive_static_path, static_file))
with open('login.txt', 'r') as login: # read login user & pwd
    loginlines = login.read()
    user, pwd = loginlines.split('\n')
    site.login(username=user, password=pwd)  # login to wiki
if not args.skipimages:
    imgsjson_fn = os.path.join(wd, 'images.json')  # read images.json file
    with open(imgsjson_fn, 'r') as imgsjson_file:
        images_info = json.load(imgsjson_file)
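# images.json is presumably produced by download_imgs.py (see the error hint in
# rewriteimgs below) and is assumed to map wiki File pages to their locally
# downloaded filenames, roughly like this hypothetical entry:
#   {"File:Example.jpg": {"filename": "Example.jpg", ...}}
# Only the 'filename' field is read further down.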
SLASH = "\u2044"
def filenameforpage(p):
    f = p.name.replace(' ', '_').replace('/', SLASH) + '.html'
    return f
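# Example (hypothetical page name): "Some Page/Part 1" maps to "Some_Page⁄Part_1.html";
# spaces become underscores and '/' becomes the fraction slash (U+2044, SLASH above),
# so subpages stay single files instead of nested directories.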
def filenameforlink(href):
    href = urlunquote(href)
    if href.startswith("/sandbox/itchwiki/index.php/"):
        href = href[len("/sandbox/itchwiki/index.php/"):]
    href = href.replace(' ', '_').replace('/', SLASH) + '.html'
    href = urlquote(href)
    return href
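# Example (hypothetical href): "/sandbox/itchwiki/index.php/Some%20Page" is unquoted,
# stripped of the wiki prefix, mapped to "Some_Page.html" and URL-quoted again,
# so internal links match the filenames produced by filenameforpage().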
def rewriteimglinks(tree, page):
    # invoked after img src attributes have been rewritten:
    # removes links to wiki File pages on every page except 'Overview main page',
    # where the image link is pointed at the publication page instead
    if page.name == 'Overview main page':
        for div_parent in tree.findall(".//div[@class='tooltip']"):
            anchor_of_img = div_parent.find(".//div/a")
            if anchor_of_img.find(".//img") is not None:  # <a> needs child <img>
                a_tag = div_parent.find(".//p/span/a")
                publication_href = a_tag.attrib.get('href')
                anchor_of_img.attrib['href'] = publication_href
    else:
        for a in tree.findall(".//a[@class='image']"):  # select <a> elements that wrap an image
            if a.findall(".//img"):  # ensure <a> has an <img> child
                a.attrib['href'] = 'javascript:void(0);'  # disable the href
    return tree
def rewritelinks(html):
    t = html5lib.parseFragment(html, treebuilder="etree", namespaceHTMLElements=False)
    for a in t.findall(".//*[@href]"):
        linkclass = a.attrib.get("class", "")
        href = a.attrib.get("href")
        if "external" in linkclass:
            # leave external links alone
            continue
        # print("LINK", href)
        if href.startswith("/sandbox/itchwiki/index.php/"):
            new_href = filenameforlink(href)
            a.attrib['href'] = new_href
    html = ET.tostring(t, method="html", encoding="unicode")
    return html
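# Example: an internal anchor with href="/sandbox/itchwiki/index.php/Some_Page"
# ends up pointing at "Some_Page.html"; anchors whose class contains "external"
# are left untouched.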
def rewriteimgs(html, page):
    t = html5lib.parseFragment(html, treebuilder="etree", namespaceHTMLElements=False)
    # replace image URLs with local images stored next to the HTML files
    for img in t.findall(".//img[@src]"):
        # imgsrc can be:
        # remote: url remains
        # archive: f'./images/{img_filename}'
        # local: f'../../images/{img_filename}'
        if args.imgsrc == 'remote':
            src = img.attrib.get("src")
            if not src.startswith('http'):
                img.attrib['src'] = 'https://hub.xpub.nl' + src
        else:  # local / archive imgsrc
            img_alt = img.attrib.get("alt")  # alt attribute holds the filename
            img_page = f'File:{img_alt}'  # look the image up in images.json
            try:
                # get its local filename
                img_filename = images_info[img_page]['filename']
            except KeyError:
                print(Colors.WARNING, f"{img_page} is missing from the locally downloaded images")
                print(Colors.GREEN, 'run python3 download_imgs.py to fix the issue', Colors.ENDC)
                sys.exit()
            # same dir as the HTML files: archive/
            img.attrib['src'] = f'./images/{img_filename}'
        img.attrib['srcset'] = ""  # remove srcset value: it prevents the local imgs from displaying
        img.attrib['width'] = ""
        img.attrib['height'] = ""
    t = rewriteimglinks(tree=t, page=page)
    html = ET.tostring(t, method="html", encoding="unicode")
    return html
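# Example (the paths here are only illustrative): with --imgsrc archive, an
# <img alt="Cover.jpg"> is looked up as "File:Cover.jpg" in images.json and
# rewritten to src="./images/<filename>"; with --imgsrc remote, a relative src
# such as "/sandbox/itchwiki/images/..." is prefixed with https://hub.xpub.nl instead.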
def dumppage(p, template, rewrite_images=True):
    htmlsrc = site.parse(page=p.name)['text']['*']
    htmlsrc = rewritelinks(htmlsrc)
    if rewrite_images:
        htmlsrc = rewriteimgs(html=htmlsrc, page=p)
    html = template.render(page=p, body=htmlsrc, staticpath='.')
    with open(os.path.join(args.output, filenameforpage(p)), 'w') as f:
        f.write(html)
        # print(html, file=f)
publish = site.Categories['Publish']
for cat in publish.members():
    if cat.namespace == NS_CATEGORY:
        print('dumping category {}'.format(cat.page_title))
        # title=site.Categories['Title']
        try:
            with open('templates/{}.html'.format(cat.page_title.lower())) as templatefile:
                template = Template(templatefile.read())
        except FileNotFoundError:
            with open('templates/default.html') as templatefile:
                template = Template(templatefile.read())
        for p in cat.members():
            print(p)
            dumppage(p, template, rewrite_images=not args.skipimages)
            if args.one:
                break
    else:
        print("Dumping page {}".format(cat.page_title))
        with open('templates/default.html') as templatefile:
            template = Template(templatefile.read())
        dumppage(cat, template, rewrite_images=not args.skipimages)
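# Typical workflow (an assumption about the intended order, not enforced here):
# run download_imgs.py first so images.json and the local image copies exist,
# then run this script to dump every member of Category:Publish into --output.
#   python3 download_imgs.py
#   python3 dumpwiki.py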