You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
special-issue-11-wiki2html/dumpwiki.py

172 lines
6.6 KiB
Python

import os, json, sys
5 years ago
from mwclient import Site
from jinja2 import Template
from shutil import copy
import html5lib
5 years ago
from functions import Colors
import argparse
5 years ago
from xml.etree import ElementTree as ET
from urllib.parse import quote as urlquote, unquote as urlunquote
5 years ago
# MediaWiki namespace numbers used below.
NS_MAIN = 0
NS_CATEGORY = 14

# Command-line interface for the dump script.
parser = argparse.ArgumentParser(
    description="Dump wiki files to html",
    formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument("--host", metavar='', default="hub.xpub.nl/sandbox",
                    help='wiki host')
parser.add_argument("--path", metavar='', default="/itchwiki/",
                    help="Wiki path. Should end with /")
parser.add_argument("--output", default="../archive",
                    help="Output path for pages")
parser.add_argument("--one", default=False, action="store_true",
                    help="Output one page from each category only")
parser.add_argument("--skipimages", default=False, action="store_true",
                    help="Don't do images (for testing)")
parser.add_argument("--imgsrc", default='archive',
                    choices=['archive', 'remote'],
                    help="What is the source of the images?")

args = parser.parse_args()
print(args)
# site and login
site = Site(host=args.host, path=args.path)
wd = os.path.dirname(os.path.abspath(__file__))  # working directory

wd_name = os.path.split(wd)[-1]  # name of dir running script

# copy static/ to ../archive/static
repo_static_path = './static'
archive_static_path = os.path.join(args.output, repo_static_path)
os.makedirs(archive_static_path, exist_ok=True)  # create static/ dir in archive
for static_file in os.listdir(path=repo_static_path):
    copy(src=os.path.join(repo_static_path, static_file),
         dst=os.path.join(archive_static_path, static_file))

# Read credentials from login.txt: first line user, second line password.
# strip() tolerates a trailing newline in the file, which previously made
# the two-value unpack raise ValueError.
with open('login.txt', 'r') as login:
    user, pwd = login.read().strip().split('\n', 1)
site.login(username=user, password=pwd)  # login to wiki

if not args.skipimages:
    # images.json maps wiki "File:..." pages to locally downloaded
    # filenames (produced by download_imgs.py — see rewriteimgs).
    imgsjson_fn = os.path.join(wd, 'images.json')
    with open(imgsjson_fn, 'r') as imgsjson_file:
        images_info = json.load(imgsjson_file)

# U+2044 FRACTION SLASH: stands in for '/' in output filenames so page
# names containing '/' do not create subdirectories.
SLASH = "\u2044"
def filenameforpage(p):
    """Map a wiki page object to its output HTML filename.

    Spaces become underscores and '/' becomes the fraction-slash
    placeholder so the name stays a single safe path component.
    """
    safe_name = p.name.replace(' ', '_').replace('/', SLASH)
    return safe_name + '.html'
def filenameforlink(href):
    """Turn an internal wiki href into a link to the dumped HTML file.

    Decodes percent-escapes, strips the wiki index.php prefix, applies
    the same name mangling as filenameforpage, then re-quotes the result.
    """
    wiki_prefix = "/sandbox/itchwiki/index.php/"
    decoded = urlunquote(href)
    if decoded.startswith(wiki_prefix):
        decoded = decoded[len(wiki_prefix):]
    local_name = decoded.replace(' ', '_').replace('/', SLASH) + '.html'
    return urlquote(local_name)
def rewriteimglinks(tree, page):
    """Adjust the <a> wrappers around images; call after img srcs are rewritten.

    On the 'Overview main page' each image anchor is pointed at its
    publication page; on every other page, links to wiki File: pages are
    disabled. Returns the (mutated) tree.
    """
    if page.name == 'Overview main page':
        for div_parent in tree.findall(".//div[@class='tooltip']"):
            anchor_of_img = div_parent.find(".//div/a")
            # Guard against tooltip divs lacking the expected structure;
            # previously a missing anchor raised AttributeError.
            if anchor_of_img is None or anchor_of_img.find(".//img") is None:
                continue  # <a> needs a child <img>
            a_tag = div_parent.find(".//p/span/a")
            if a_tag is None:
                continue
            publication_href = a_tag.attrib.get('href')
            if publication_href is not None:
                anchor_of_img.attrib['href'] = publication_href
    else:
        for a in tree.findall(".//a[@class='image']"):  # select img-wrapping a
            if a.findall(".//img"):  # ensure a has child: img
                a.attrib['href'] = 'javascript:void(0);'  # disable href
    return tree
def rewritelinks(html):
    """Rewrite internal wiki hrefs in an HTML fragment to local .html files.

    Links whose class contains 'external' are left untouched; only hrefs
    under the wiki's index.php path are rewritten via filenameforlink.
    """
    fragment = html5lib.parseFragment(html, treebuilder="etree",
                                      namespaceHTMLElements=False)
    for element in fragment.findall(".//*[@href]"):
        if "external" in element.attrib.get("class", ""):
            # leave external links alone
            continue
        href = element.attrib.get("href")
        if href.startswith("/sandbox/itchwiki/index.php/"):
            element.attrib['href'] = filenameforlink(href)
    return ET.tostring(fragment, method="html", encoding="unicode")
def rewriteimgs(html, page):
    """Point <img> tags at local archive copies (or absolute remote URLs).

    With --imgsrc=remote, relative srcs are made absolute against
    hub.xpub.nl. Otherwise the img's alt text (the wiki filename) is
    looked up in images.json and src is rewritten to ./images/<file>.
    Exits with status 1 if a referenced image was never downloaded.
    """
    t = html5lib.parseFragment(html, treebuilder="etree",
                               namespaceHTMLElements=False)
    # replace images url with local image in ../images
    for img in t.findall(".//img[@src]"):
        # imgsrc can be:
        # remote: url remains
        # archive: f'./images/{img_filename}'
        if args.imgsrc == 'remote':
            src = img.attrib.get("src")
            if not src.startswith('http'):
                img.attrib['src'] = 'https://hub.xpub.nl' + src
        else:  # local / archive imgsrc
            img_alt = img.attrib.get("alt")  # alt property has filename
            img_page = f'File:{img_alt}'  # key to find image in images.json
            try:
                # get its local filename
                img_filename = images_info[img_page]['filename']
            except KeyError:
                # Fixed garbled message (was "is not is missing"), close
                # the color escape, and exit non-zero so callers notice.
                print(Colors.WARNING,
                      f"{img_page} is missing from the local downloaded images",
                      Colors.ENDC)
                print(Colors.GREEN,
                      'run python3 download_imgs.py to fix the issue',
                      Colors.ENDC)
                sys.exit(1)
            # same dir as HTML files: archive/
            img.attrib['src'] = f'./images/{img_filename}'
            img.attrib['srcset'] = ""  # rm srcset value: it prevents imgs displaying
            img.attrib['width'] = ""
            img.attrib['height'] = ""
    t = rewriteimglinks(tree=t, page=page)
    return ET.tostring(t, method="html", encoding="unicode")
def dumppage(p, template, rewrite_images=True):
    """Render wiki page p through template and write it to args.output.

    Fetches the parsed HTML from the wiki, rewrites internal links (and,
    unless rewrite_images is False, image srcs), renders the Jinja
    template with the page object and body, and writes the result.
    """
    htmlsrc = site.parse(page=p.name)['text']['*']
    htmlsrc = rewritelinks(htmlsrc)
    if rewrite_images:
        htmlsrc = rewriteimgs(html=htmlsrc, page=p)
    html = template.render(page=p, body=htmlsrc, staticpath='.')
    # Explicit UTF-8: wiki content is Unicode and the platform default
    # encoding (e.g. cp1252 on Windows) could fail to encode it.
    with open(os.path.join(args.output, filenameforpage(p)), 'w',
              encoding='utf-8') as f:
        f.write(html)
def _load_template(name):
    """Load templates/<name>.html, falling back to templates/default.html."""
    try:
        with open('templates/{}.html'.format(name), encoding='utf-8') as templatefile:
            return Template(templatefile.read())
    except FileNotFoundError:
        with open('templates/default.html', encoding='utf-8') as templatefile:
            return Template(templatefile.read())


# Everything in Category:Publish gets dumped: sub-categories page by page,
# directly-published pages with the default template.
publish = site.Categories['Publish']
for cat in publish.members():
    if cat.namespace == NS_CATEGORY:
        print('dumping category {}'.format(cat.page_title))
        # Per-category template when one exists, e.g. templates/title.html.
        template = _load_template(cat.page_title.lower())
        for p in cat.members():
            print(p)
            dumppage(p, template, rewrite_images=not args.skipimages)
            if args.one:
                break
    else:
        print("Dumping page {}".format(cat.page_title))
        template = _load_template('default')
        dumppage(cat, template, rewrite_images=not args.skipimages)