From dea78943e09125760c640cea5fd00dd219e790ca Mon Sep 17 00:00:00 2001 From: Michael Murtaugh Date: Wed, 15 Apr 2020 15:57:56 +0200 Subject: [PATCH] dump (main) pages tagged with Category:Publish as well, added --skipimages option for testing --- dumpwiki.py | 64 ++++++++++++++++++++++++++++++----------------------- 1 file changed, 36 insertions(+), 28 deletions(-) diff --git a/dumpwiki.py b/dumpwiki.py index e57ad49..7a37930 100644 --- a/dumpwiki.py +++ b/dumpwiki.py @@ -8,12 +8,16 @@ from xml.etree import ElementTree as ET from urllib.parse import quote as urlquote, unquote as urlunquote +NS_MAIN = 0 +NS_CATEGORY = 14 + p = argparse.ArgumentParser(description="Dump wiki files to html", formatter_class=argparse.ArgumentDefaultsHelpFormatter) p.add_argument("--host", metavar='', default="hub.xpub.nl/sandbox", help='wiki host') p.add_argument("--path", metavar='', default="/itchwiki/", help="Wiki path. Should end with /") p.add_argument("--output", default="../archive", help="Output path for pages") p.add_argument("--one", default=False, action="store_true", help="Output one page from each category only") +p.add_argument("--skipimages", default=False, action="store_true", help="Don't do images (for testing)") p.add_argument("--imgsrc", default='archive', choices=['archive', 'remote'], help="What is the source of the images?") @@ -30,9 +34,10 @@ with open('login.txt', 'r') as login: # read login user & pwd user, pwd = loginlines.split('\n') site.login(username=user, password=pwd) # login to wiki -imgsjson_fn = os.path.join(wd, 'images.json') # read images.json file -with open(imgsjson_fn, 'r') as imgsjson_file: - images_info = json.load(imgsjson_file) +if not args.skipimages: + imgsjson_fn = os.path.join(wd, 'images.json') # read images.json file + with open(imgsjson_fn, 'r') as imgsjson_file: + images_info = json.load(imgsjson_file) SLASH = "\u2044" @@ -106,34 +111,37 @@ def rewriteimgs(html): html = ET.tostring(t, method="html", encoding="unicode") return html +def dumppage(p, template, rewrite_images=True): + htmlsrc = site.parse(page=p.name)['text']['*'] + htmlsrc = rewritelinks(htmlsrc) + if rewrite_images: + htmlsrc = rewriteimgs(htmlsrc) + # TODO: ANdre structure of archive: from ./archive/0 to: ./archive ./0 + html = template.render(page=p, body=htmlsrc, staticpath=f'../{wd_name}') + with open(os.path.join(args.output, filenameforpage(p)), 'w') as f: + f.write(html) + # print(html, file=f) + publish=site.Categories['Publish'] for cat in publish.members(): - if cat.namespace != 14: - continue - print('dumping category {}'.format(cat.page_title)) - # title=site.Categories['Title'] - try: - with open('templates/{}.html'.format(cat.page_title.lower())) as templatefile: - template = Template(templatefile.read()) - except FileNotFoundError: + if cat.namespace == NS_CATEGORY: + print('dumping category {}'.format(cat.page_title)) + # title=site.Categories['Title'] + try: + with open('templates/{}.html'.format(cat.page_title.lower())) as templatefile: + template = Template(templatefile.read()) + except FileNotFoundError: + with open('templates/default.html') as templatefile: + template = Template(templatefile.read()) + for p in cat.members(): + print(p) + dumppage(p, template, rewrite_images=not args.skipimages) + if args.one: + break + else: + print("Dumping page {}".format(cat.page_title)) with open('templates/default.html') as templatefile: template = Template(templatefile.read()) - for p in cat.members(): - print(p) - htmlsrc = site.parse(page=p.name)['text']['*'] - htmlsrc = rewritelinks(htmlsrc) - htmlsrc = rewriteimgs(htmlsrc) - - # TODO: ANdre structure of archive: from ./archive/0 to: ./archive ./0 - - html = template.render(page=p, body=htmlsrc, staticpath=f'../{wd_name}') - - with open(os.path.join(args.output, filenameforpage(p)), 'w') as f: - f.write(html) - # print(html, file=f) - if args.one: - break - - + dumppage(cat, template, rewrite_images=not args.skipimages)