diff --git a/dumpwiki.py b/dumpwiki.py
index 601d692..522470d 100644
--- a/dumpwiki.py
+++ b/dumpwiki.py
@@ -3,7 +3,7 @@
 from mwclient import Site
 from pprint import pprint
 from jinja2 import Template
 from functions import unpack_response, clean_dir, remove_nonwords
-
+import html5lib
 from functions import Colors
 import argparse
@@ -53,6 +53,18 @@
 def filenameforpage(p):
     f=p.name.replace(' ','_').replace('/', SLASH) + '.html'
     return f
+def rewritelinks(html):
+    """Parse an HTML fragment and print the href of each non-external link (rewriting not implemented yet)."""
+    t = html5lib.parseFragment(html, treebuilder="etree", namespaceHTMLElements=False)
+    for a in t.findall(".//*[@href]"):
+        linkclass = a.attrib.get("class", "")
+        href = a.attrib.get("href")
+        if "external" in linkclass:
+            # leave external links alone
+            continue
+        print ("LINK", href)
+        # a.attrib['href'] = new_href
+
 publish=site.Categories['Publish']
 for cat in publish.members():
     if cat.namespace!=14: