something

master
Sandra 5 years ago
parent bf73e71300
commit a57bfedd68

@ -6,34 +6,17 @@ from functions import unpack_response, clean_dir, remove_nonwords
import html5lib import html5lib
from functions import Colors from functions import Colors
import argparse import argparse
from xml.etree import ElementTree as ET
p = argparse.ArgumentParser(description="Dump wiki files to html", p = argparse.ArgumentParser(description="Dump wiki files to html",
formatter_class=argparse.ArgumentDefaultsHelpFormatter) formatter_class=argparse.ArgumentDefaultsHelpFormatter)
p.add_argument("--host", metavar='', default="hub.xpub.nl/sandbox", help='wiki host') p.add_argument("--host", metavar='', default="hub.xpub.nl/sandbox", help='wiki host')
p.add_argument("--path", metavar='', default="/itchwiki/", help="Wiki path. Should end with /") p.add_argument("--path", metavar='', default="/itchwiki/", help="Wiki path. Should end with /")
p.add_argument("--output", default="/var/www/html/archive", help="Output path for pages") p.add_argument("--output", default="/var/www/html/archive", help="Output path for pages")
# p.add_argument("--conditions", "-c", metavar='', p.add_argument("--one", default=False, action="store_true", help="Output one page from each category only")
# default='[[File:+]][[Title::+]][[Part::+]][[Date::+]]',
# help='The query conditions')
# p.add_argument("--printouts", "-p", metavar='',
# default='?Title|?Date|?Part|?Partof|?Creator|?Organization|?Format|?Event|?Topic|?Language',
# help='Selection of properties to printout')
# p.add_argument("--sort", "-s", metavar='',
# default='Date,Title,Part',
# help='Sorting according to conditions')
# p.add_argument("--order", "-o", metavar='',
# default='asc,asc,asc',
# help='Order of sorting conditions. Should same amount as the --sort properties')
# p.add_argument('--limit', '-l', help='(optional) Limit the number of returned '
# 'items')
# # TODO: GET limit to work.Perhaps with a site.raw_api method
# p.add_argument('--dry', '-d', action='store_true',
# help='dry-run: will only show the query but not run it')
args = p.parse_args() args = p.parse_args()
# site and login # site and login
site = Site(host=args.host, path=args.path) site = Site(host=args.host, path=args.path)
@ -50,11 +33,11 @@ with open('login.txt', 'r') as login: # read login user & pwd
SLASH = "\u2044" SLASH = "\u2044"
def filenameforpage(p): def filenameforpage(p):
f=p.name.replace(' ','_').replace('/', SLASH) + '.html' f = p.name.replace(' ','_').replace('/', SLASH) + '.html'
return f return f
def rewritelinks (html) def rewritelinks (html):
t = html5lib.parseFragment(html, treebuilder="etree", namespaceHTMLElements=False) t = html5lib.parseFragment(html, treebuilder = "etree", namespaceHTMLElements = False)
for a in t.findall(".//*[@href]"): for a in t.findall(".//*[@href]"):
linkclass = a.attrib.get("class", "") linkclass = a.attrib.get("class", "")
href = a.attrib.get("href") href = a.attrib.get("href")
@ -62,11 +45,15 @@ def rewritelinks (html)
# leave external links alone # leave external links alone
continue continue
print ("LINK", href) print ("LINK", href)
if href.startswith("/sandbox/itchwiki/index.php/"):
new_href =
# a.attrib['href'] = new_href # a.attrib['href'] = new_href
html = ET.tostring(t, method="html", encoding="unicode")
return html
publish=site.Categories['Publish'] publish=site.Categories['Publish']
for cat in publish.members(): for cat in publish.members():
if cat.namespace!=14: if cat.namespace != 14:
continue continue
print('dumping category {}'.format(cat.page_title)) print('dumping category {}'.format(cat.page_title))
# title=site.Categories['Title'] # title=site.Categories['Title']
@ -79,10 +66,12 @@ for cat in publish.members():
for p in cat.members(): for p in cat.members():
print(p) print(p)
htmlsrc = site.parse(page=p.name)['text']['*'] htmlsrc = site.parse(page=p.name)['text']['*']
htmlsrc = rewritelinks(htmlsrc)
html = template.render(page=p, body=htmlsrc) html = template.render(page=p, body=htmlsrc)
with open(os.path.join(args.output, filenameforpage(p)), 'w') as f: with open(os.path.join(args.output, filenameforpage(p)), 'w') as f:
print(html, file=f) print(html, file=f)
# break if args.one:
break

Loading…
Cancel
Save