|
|
|
@ -66,15 +66,26 @@ def filenameforlink(href):
|
|
|
|
|
href = urlquote(href)
|
|
|
|
|
return href
|
|
|
|
|
|
|
|
|
|
def rewritelinks (html):
|
|
|
|
|
t = html5lib.parseFragment(html, treebuilder = "etree", namespaceHTMLElements = False)
|
|
|
|
|
|
|
|
|
|
# remove links to wiki File: pages
|
|
|
|
|
for a in t.findall(".//a[@class='image']"): # select img wrapping a
|
|
|
|
|
href = a.attrib.get('href')
|
|
|
|
|
if a.findall(".//img") and 'File:' in href: # ensure a has child: img
|
|
|
|
|
def rewriteimglinks(tree, page):
    """Rewrite the href of anchors that wrap images in a parsed HTML tree.

    Invoke after the img src attributes have been rewritten.

    * On the page named 'Overview main page': for every
      ``div[@class='tooltip']`` whose image anchor (``.//div/a``) contains an
      ``<img>``, the anchor's href is replaced by the href of the publication
      link found at ``.//p/span/a`` inside the same div.
    * On every other page: each ``a[@class='image']`` that contains an
      ``<img>`` gets its href set to ``'javascript:void(0);'``, which
      disables the link to the wiki File: page.

    Tooltip divs that lack the image anchor, the publication anchor, or the
    publication href are left untouched instead of raising.

    Args:
        tree: ElementTree-style element to process; it is modified in place.
        page: object exposing a ``name`` attribute (the wiki page title).

    Returns:
        The same ``tree`` object that was passed in.
    """
    if page.name == 'Overview main page':
        for div_parent in tree.findall(".//div[@class='tooltip']"):
            anchor_of_img = div_parent.find(".//div/a")
            # find() returns None when nothing matches; compare with None
            # explicitly rather than relying on element truthiness.
            if anchor_of_img is None or anchor_of_img.find(".//img") is None:
                continue  # <a> needs child <img>
            a_tag = div_parent.find(".//p/span/a")
            if a_tag is None:
                continue  # no publication link in this tooltip
            publication_href = a_tag.attrib.get('href')
            if publication_href is None:
                # A None attribute value cannot be serialized later on.
                continue
            anchor_of_img.attrib['href'] = publication_href
    else:
        for a in tree.findall(".//a[@class='image']"):  # select img wrapping a
            if a.find(".//img") is not None:  # ensure a has child: img
                a.attrib['href'] = 'javascript:void(0);'  # disable href
    return tree
|
|
|
|
|
|
|
|
|
|
def rewritelinks(html):
|
|
|
|
|
t = html5lib.parseFragment(html, treebuilder = "etree", namespaceHTMLElements = False)
|
|
|
|
|
for a in t.findall(".//*[@href]"):
|
|
|
|
|
linkclass = a.attrib.get("class", "")
|
|
|
|
|
href = a.attrib.get("href")
|
|
|
|
@ -89,7 +100,7 @@ def rewritelinks (html):
|
|
|
|
|
return html
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def rewriteimgs(html):
|
|
|
|
|
def rewriteimgs(html, page):
|
|
|
|
|
t = html5lib.parseFragment(html, treebuilder = "etree", namespaceHTMLElements = False)
|
|
|
|
|
|
|
|
|
|
# replace images url with local image in ../images
|
|
|
|
@ -119,6 +130,9 @@ def rewriteimgs(html):
|
|
|
|
|
img.attrib['srcset'] = "" # rm srcset value:it prevent imgs displaying
|
|
|
|
|
img.attrib['width'] = ""
|
|
|
|
|
img.attrib['height'] = ""
|
|
|
|
|
|
|
|
|
|
t = rewriteimglinks(tree=t, page=page)
|
|
|
|
|
|
|
|
|
|
html = ET.tostring(t, method="html", encoding="unicode")
|
|
|
|
|
return html
|
|
|
|
|
|
|
|
|
@ -126,7 +140,7 @@ def dumppage(p, template, rewrite_images=True):
|
|
|
|
|
htmlsrc = site.parse(page=p.name)['text']['*']
|
|
|
|
|
htmlsrc = rewritelinks(htmlsrc)
|
|
|
|
|
if rewrite_images:
|
|
|
|
|
htmlsrc = rewriteimgs(htmlsrc)
|
|
|
|
|
htmlsrc = rewriteimgs(html=htmlsrc, page=p)
|
|
|
|
|
html = template.render(page=p, body=htmlsrc, staticpath='.')
|
|
|
|
|
with open(os.path.join(args.output, filenameforpage(p)), 'w') as f:
|
|
|
|
|
f.write(html)
|
|
|
|
|