"""Dump wiki pages to static HTML files.

Walks every member of Category:Publish on the configured MediaWiki
instance and renders each page through a Jinja2 template into the
output directory, rewriting internal links and image sources so the
result works as a self-contained static archive.

Requires a ``login.txt`` file (username on line 1, password on line 2)
and, unless ``--skipimages`` is given, an ``images.json`` index
produced by ``download_imgs.py``.
"""

import argparse
import json
import os
import sys
from shutil import copy
from urllib.parse import quote as urlquote, unquote as urlunquote
from xml.etree import ElementTree as ET

import html5lib
from jinja2 import Template
from mwclient import Site

from functions import Colors

# MediaWiki namespace numbers (https://www.mediawiki.org/wiki/Manual:Namespace)
NS_MAIN = 0
NS_CATEGORY = 14

p = argparse.ArgumentParser(
    description="Dump wiki files to html",
    formatter_class=argparse.ArgumentDefaultsHelpFormatter)
p.add_argument("--host", metavar='', default="hub.xpub.nl/sandbox",
               help='wiki host')
p.add_argument("--path", metavar='', default="/itchwiki/",
               help="Wiki path. Should end with /")
p.add_argument("--output", default="../archive",
               help="Output path for pages")
p.add_argument("--one", default=False, action="store_true",
               help="Output one page from each category only")
p.add_argument("--skipimages", default=False, action="store_true",
               help="Don't do images (for testing)")
p.add_argument("--imgsrc", default='archive',
               choices=['archive', 'remote'],
               help="What is the source of the images?")
args = p.parse_args()
print(args)

# site and login
site = Site(host=args.host, path=args.path)
wd = os.path.dirname(os.path.abspath(__file__))  # working directory
wd_name = os.path.split(wd)[-1]  # name of dir running script

# copy static/ to ../archive/static
repo_static_path = './static'
archive_static_path = os.path.join(args.output, repo_static_path)
os.makedirs(archive_static_path, exist_ok=True)  # create static/ dir in archive
for static_file in os.listdir(path='./static'):
    copy(src=os.path.join(repo_static_path, static_file),
         dst=os.path.join(archive_static_path, static_file))

with open('login.txt', 'r') as login:  # read login user & pwd
    # splitlines() tolerates a trailing newline at the end of login.txt,
    # which would make a bare split('\n') produce a spurious third value
    # and crash the unpacking below.
    user, pwd = login.read().splitlines()[:2]
site.login(username=user, password=pwd)  # login to wiki

if not args.skipimages:
    imgsjson_fn = os.path.join(wd, 'images.json')  # read images.json file
    with open(imgsjson_fn, 'r') as imgsjson_file:
        images_info = json.load(imgsjson_file)

# Unicode FRACTION SLASH: stands in for '/' in output filenames so page
# titles containing slashes don't create subdirectories.
SLASH = "\u2044"


def filenameforpage(p):
    """Return the local HTML filename for wiki page object *p*."""
    f = p.name.replace(' ', '_').replace('/', SLASH) + '.html'
    return f


def filenameforlink(href):
    """Rewrite an internal wiki href to its local HTML filename (URL-quoted)."""
    href = urlunquote(href)
    if href.startswith("/sandbox/itchwiki/index.php/"):
        href = href[len("/sandbox/itchwiki/index.php/"):]
        href = href.replace(' ', '_').replace('/', SLASH) + '.html'
    href = urlquote(href)
    return href


def rewriteimglinks(tree, page):
    """Adjust anchors that wrap images in *tree*.

    Invoke after img src has been rewritten.
    Removes links to the wiki File page on all pages, except on the
    'Overview main page' page where the image link is redirected to the
    corresponding publication page instead.
    """
    if page.name == 'Overview main page':
        for div_parent in tree.findall(".//div[@class='tooltip']"):
            anchor_of_img = div_parent.find(".//div/a")
            if anchor_of_img.find(".//img") is not None:  # needs child <img>
                a_tag = div_parent.find(".//p/span/a")
                publication_href = a_tag.attrib.get('href')
                anchor_of_img.attrib['href'] = publication_href
    else:
        for a in tree.findall(".//a[@class='image']"):  # select img-wrapping <a>
            if a.findall(".//img"):  # ensure a has child: img
                a.attrib['href'] = 'javascript:void(0);'  # disable href
    return tree


def rewritelinks(html):
    """Rewrite internal wiki links in *html* to local HTML filenames."""
    t = html5lib.parseFragment(html, treebuilder="etree",
                               namespaceHTMLElements=False)
    for a in t.findall(".//*[@href]"):
        linkclass = a.attrib.get("class", "")
        href = a.attrib.get("href")
        if "external" in linkclass:
            # leave external links alone
            continue
        # print ("LINK", href)
        if href.startswith("/sandbox/itchwiki/index.php/"):
            new_href = filenameforlink(href)
            a.attrib['href'] = new_href
    html = ET.tostring(t, method="html", encoding="unicode")
    return html


def rewriteimgs(html, page):
    """Point <img> tags in *html* at local or remote image sources.

    Exits with status 1 if a referenced image is missing from the local
    images.json index (run download_imgs.py to fix).
    """
    t = html5lib.parseFragment(html, treebuilder="etree",
                               namespaceHTMLElements=False)
    # replace images url with local image in ../images
    for img in t.findall(".//img[@src]"):
        # imgsrc can be:
        # remote: url remains
        # archive f' images/{img_filename}'
        # local: f'../../images/{img_filename}'
        if args.imgsrc == 'remote':
            src = img.attrib.get("src")
            if not src.startswith('http'):
                img.attrib['src'] = 'https://hub.xpub.nl' + src
        else:  # local / archive imgsrc
            img_alt = img.attrib.get("alt")  # alt property has filename
            img_page = f'File:{img_alt}'
            # find image in images.json
            try:
                # get its filename
                img_filename = images_info[img_page]['filename']
            except KeyError:
                print(Colors.WARNING,
                      f"{img_page} is missing from the local downloaded images")
                print(Colors.GREEN,
                      'run python3 download_imgs.py to fix the issue',
                      Colors.ENDC)
                sys.exit(1)  # non-zero: signal failure to the shell
            # same dir as HTML files: archive/
            img.attrib['src'] = f'./images/{img_filename}'
            img.attrib['srcset'] = ""  # rm srcset value: it prevents imgs displaying
            img.attrib['width'] = ""
            img.attrib['height'] = ""
    t = rewriteimglinks(tree=t, page=page)
    html = ET.tostring(t, method="html", encoding="unicode")
    return html


def dumppage(p, template, rewrite_images=True):
    """Render wiki page *p* through *template* and write it to the output dir."""
    htmlsrc = site.parse(page=p.name)['text']['*']
    htmlsrc = rewritelinks(htmlsrc)
    if rewrite_images:
        htmlsrc = rewriteimgs(html=htmlsrc, page=p)
    html = template.render(page=p, body=htmlsrc, staticpath='.')
    with open(os.path.join(args.output, filenameforpage(p)), 'w') as f:
        f.write(html)
        # print(html, file=f)


publish = site.Categories['Publish']
for cat in publish.members():
    if cat.namespace == NS_CATEGORY:
        print('dumping category {}'.format(cat.page_title))
        # title=site.Categories['Title']
        # use a per-category template when one exists, else the default
        try:
            with open('templates/{}.html'.format(cat.page_title.lower())) as templatefile:
                template = Template(templatefile.read())
        except FileNotFoundError:
            with open('templates/default.html') as templatefile:
                template = Template(templatefile.read())
        for p in cat.members():
            print(p)
            dumppage(p, template, rewrite_images=not args.skipimages)
            if args.one:
                break
    else:
        print("Dumping page {}".format(cat.page_title))
        with open('templates/default.html') as templatefile:
            template = Template(templatefile.read())
        dumppage(cat, template, rewrite_images=not args.skipimages)