From a57bfedd682ac6b79c636e78fd111b4ce417889b Mon Sep 17 00:00:00 2001
From: Sandra
Date: Wed, 1 Apr 2020 19:47:22 +0200
Subject: [PATCH] dumpwiki: add --one flag, rewrite wiki links in dumped pages

Remove the commented-out query options, add a --one switch that stops
after the first page of each category, and pass every parsed page
through rewritelinks() so that links to wiki pages point at the dumped
.html files.

---
Review notes (not part of the commit message):
* Line breaks, indentation and blank context lines were reconstructed
  from a whitespace-collapsed copy of this patch. If it does not apply
  cleanly, try `git apply --ignore-whitespace` or adjust the context by
  hand.
* The original patch added `new_href =` with no right-hand side, which is
  a SyntaxError in the resulting dumpwiki.py. The assignment is completed
  below; the hunk counts and the diffstat were updated accordingly.

 dumpwiki.py | 43 +++++++++++++++++++------------------------------
 1 file changed, 19 insertions(+), 24 deletions(-)

diff --git a/dumpwiki.py b/dumpwiki.py
index 522470d..1a97ef0 100644
--- a/dumpwiki.py
+++ b/dumpwiki.py
@@ -6,34 +6,17 @@ from functions import unpack_response, clean_dir, remove_nonwords
 import html5lib
 from functions import Colors
 import argparse
+from xml.etree import ElementTree as ET
 
 p = argparse.ArgumentParser(description="Dump wiki files to html",
                             formatter_class=argparse.ArgumentDefaultsHelpFormatter)
 p.add_argument("--host", metavar='', default="hub.xpub.nl/sandbox", help='wiki host')
 p.add_argument("--path", metavar='', default="/itchwiki/", help="Wiki path. Should end with /")
 p.add_argument("--output", default="/var/www/html/archive", help="Output path for pages")
-# p.add_argument("--conditions", "-c", metavar='',
-#                     default='[[File:+]][[Title::+]][[Part::+]][[Date::+]]',
-#                     help='The query conditions')
-# p.add_argument("--printouts", "-p", metavar='',
-#                     default='?Title|?Date|?Part|?Partof|?Creator|?Organization|?Format|?Event|?Topic|?Language',
-#                     help='Selection of properties to printout')
-# p.add_argument("--sort", "-s", metavar='',
-#                     default='Date,Title,Part',
-#                     help='Sorting according to conditions')
-# p.add_argument("--order", "-o", metavar='',
-#                     default='asc,asc,asc',
-#                     help='Order of sorting conditions. Should same amount as the --sort properties')
-# p.add_argument('--limit', '-l', help='(optional) Limit the number of returned '
-#                                      'items')
-# # TODO: GET limit to work.Perhaps with a site.raw_api method
-# p.add_argument('--dry', '-d', action='store_true',
-#                     help='dry-run: will only show the query but not run it')
+p.add_argument("--one", default=False, action="store_true", help="Output one page from each category only")
 
 args = p.parse_args()
 
-
-
 # site and login
 
 site = Site(host=args.host, path=args.path)
@@ -50,11 +33,11 @@ with open('login.txt', 'r') as login: # read login user & pwd
 SLASH = "\u2044"
 
 def filenameforpage(p):
-    f=p.name.replace(' ','_').replace('/', SLASH) + '.html'
+    f = p.name.replace(' ','_').replace('/', SLASH) + '.html'
     return f
 
-def rewritelinks (html)
-    t = html5lib.parseFragment(html, treebuilder="etree", namespaceHTMLElements=False)
+def rewritelinks (html):
+    t = html5lib.parseFragment(html, treebuilder = "etree", namespaceHTMLElements = False)
     for a in t.findall(".//*[@href]"):
         linkclass = a.attrib.get("class", "")
         href = a.attrib.get("href")
@@ -62,11 +45,21 @@ def rewritelinks (html)
             # leave external links alone
             continue
         print ("LINK", href)
+        if href.startswith("/sandbox/itchwiki/index.php/"):
+            # Point links to wiki pages at the dumped file, using the naming
+            # scheme of filenameforpage(); a #fragment stays at the end.
+            pagename = href[len("/sandbox/itchwiki/index.php/"):]
+            pagename, hashmark, fragment = pagename.partition("#")
+            # "./" keeps a colon in the name (Category:X) from parsing as a URL scheme.
+            new_href = "./" + pagename.replace('/', SLASH) + '.html' + hashmark + fragment
+            a.attrib['href'] = new_href
+    html = ET.tostring(t, method="html", encoding="unicode")
+    return html
 
 
 publish=site.Categories['Publish']
 for cat in publish.members():
-    if cat.namespace!=14:
+    if cat.namespace != 14:
         continue
     print('dumping category {}'.format(cat.page_title))
     # title=site.Categories['Title']
@@ -79,10 +72,12 @@ for cat in publish.members():
     for p in cat.members():
         print(p)
         htmlsrc = site.parse(page=p.name)['text']['*']
+        htmlsrc = rewritelinks(htmlsrc)
         html = template.render(page=p, body=htmlsrc)
         with open(os.path.join(args.output, filenameforpage(p)), 'w') as f:
             print(html, file=f)
-        # break
+        if args.one:
+            break
 
 
 