You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
319 lines
11 KiB
Python
319 lines
11 KiB
Python
from __future__ import print_function
|
|
|
|
import os, sys, re, urllib, urlparse, html5lib, json
|
|
from PIL import Image
|
|
from math import log
|
|
from argparse import ArgumentParser
|
|
from urllib2 import urlopen
|
|
|
|
from xml.etree import ElementTree as ET
|
|
|
|
# from wiki_get_html import page_html
|
|
from mwclient import Site
|
|
from mwclient.page import Page
|
|
|
|
from leaflet import tiles_wrapper, recursiverender, gridrender, html
|
|
from imagetile2 import tile_image
|
|
|
|
|
|
def wiki_url_to_title (url):
    """Return the page title carried by the last path segment of a wiki URL.

    Everything after the final "/" of *url* is taken and percent-decoded.
    A string without any "/" is decoded and returned as a whole.
    """
    _, _, last_segment = url.rpartition("/")
    return urllib.unquote(last_segment)
|
|
|
|
def parse_gallery(t):
    """Extract the entries of a rendered gallery.

    t is an ElementTree element; every descendant <li class="gallerybox">
    yields one tuple.

    returns [(imagepageurl, caption, articleurl), ...] where
      imagepageurl -- href of the item's <a class="image"> link, or None
      caption      -- the item's "gallerytext" element serialized as HTML,
                      or None when the item has no (or an empty) caption
      articleurl   -- href of the first link inside the caption, or None
    """
    items = []
    for box in t.findall(".//li[@class='gallerybox']"):
        src = None
        captiontext = None
        article = None

        image_link = box.find(".//a[@class='image']")
        if image_link is not None:
            src = image_link.attrib.get("href")

        caption = box.find(".//*[@class='gallerytext']")
        # An Element is falsy when it has no child elements, so a bare
        # truth test would throw away captions that consist of text only.
        # Test for presence explicitly and treat only captions with neither
        # children nor text as missing.
        has_content = caption is not None and (
            len(caption) > 0 or (caption.text or "").strip())
        if has_content:
            captiontext = ET.tostring(caption, method="html")
            articlelink = caption.find(".//a")
            if articlelink is not None:
                article = articlelink.attrib.get("href")

        items.append((src, captiontext, article))
    return items
|
|
|
|
def mwfilepage_to_url (wiki, url):
    """Resolve a file-page URL to its page object and image URL.

    The last "/"-separated segment of *url* is percent-decoded and used as
    the key into wiki.Pages.  Returns the tuple
    (page, page.imageinfo['url']).
    """
    encoded_name = url.rsplit("/", 1)[-1]
    page = wiki.Pages[urllib.unquote(encoded_name)]
    image_url = page.imageinfo['url']
    return page, image_url
|
|
|
|
def url_to_path (url):
    """Turn an URL into a relative path by dropping its leading directories.

    The URL's path component is percent-decoded and split on "/"; the first
    three pieces (the empty string before the leading slash plus the first
    two directories) are discarded and the rest is re-joined with "/".

    e.g. https://pzwiki.wdka.nl/mw-mediadesign/images/a/ab/X.png -> a/ab/X.png
    """
    decoded_path = urllib.unquote(urlparse.urlparse(url).path)
    segments = decoded_path.split("/")
    del segments[:3]
    return "/".join(segments)
|
|
|
|
def wiki_absurl (wiki, url):
    """Make *url* absolute relative to the root of the wiki's host.

    wiki.host may be a (scheme, hostname) tuple or a plain hostname.  For a
    plain hostname the scheme is read from wiki.scheme when that attribute
    exists (and is non-empty) instead of always assuming "http", so links on
    an https site are not downgraded; without it "http" is used as before.

    Returns urljoin(<scheme>://<hostname>, url); an already absolute *url*
    is therefore returned unchanged.
    """
    host = wiki.host
    if isinstance(host, tuple):
        scheme, netloc = host[0], host[1]
    else:
        scheme = getattr(wiki, "scheme", None) or "http"
        netloc = host
    return urlparse.urljoin(scheme + "://" + netloc, url)
|
|
|
|
def wiki_title_to_url (wiki, title):
    """Build the public URL of the page called *title*.

    Relies on wiki.site['base'] being set to the public facing URL of the
    Main page: its last path component is replaced by the page's own name,
    passed through the page's normalize_title().
    """
    base_parts = urlparse.urlparse(wiki.site['base'])
    # Keep only the directory part of the Main page path.
    base_dir = os.path.split(base_parts.path)[0]
    page = wiki.pages[title]
    return "".join([
        base_parts.scheme, "://", base_parts.netloc, base_dir,
        "/", page.normalize_title(page.name),
    ])
|
|
|
|
def ensure_wiki_image_tiles (wiki, imagepageurl, text='', basepath="tiles", force=False, bgcolor=None, tilewidth=256, tileheight=256, zoom=3):
    """Make sure the tiles for a wiki image exist and return its tiles_wrapper.

    wiki         -- site object, handed to mwfilepage_to_url
    imagepageurl -- URL of the image's file page
    text         -- passed through to tiles_wrapper as text
    basepath     -- directory below which the tile folder is created
    force        -- re-create the tiles even when tile (0, 0, 0) exists
    bgcolor, tilewidth, tileheight, zoom -- passed through to tile_image

    The image is only downloaded and tiled when tile (0, 0, 0) is missing
    or *force* is set.  Raises OSError when the tile folder cannot be
    created.
    """
    print ("ensure_wiki_image_tiles", imagepageurl, file=sys.stderr)
    page, imageurl = mwfilepage_to_url(wiki, imagepageurl)
    path = os.path.join(basepath, url_to_path(imageurl))
    print ("imageurl, path", imageurl, path, file=sys.stderr)
    ret = tiles_wrapper(path, imagepageurl, text=text)
    if os.path.exists(ret.get_tile_path(0, 0, 0)) and not force:
        return ret

    try:
        os.makedirs(path)
    except OSError:
        # An already existing folder is fine; anything else (permissions,
        # a file in the way, ...) must not be silently ignored.
        if not os.path.isdir(path):
            raise

    response = urlopen(imageurl)
    try:
        # Tile while the response is still open: the image data may be read
        # lazily from it.
        im = Image.open(response)
        tile_image(im, zoom, tilewidth, tileheight, path+"/", ret.tilename, bgcolor)
    finally:
        response.close()
    return ret
|
|
|
|
def textcell (paras):
    """Build a text node from a sequence of paragraphs.

    The node's 'text' is the slice holding the first paragraph (empty for an
    empty sequence).  Every further paragraph becomes a child node of its
    own under 'children'; the key is absent when there is at most one
    paragraph.
    """
    head, rest = paras[:1], paras[1:]
    node = {'text': head}
    if rest:
        # A single-paragraph cell never has children of its own.
        node['children'] = [{'text': [para]} for para in rest]
    return node
|
|
|
|
def name_to_path (name):
    """Return *name* with every "/" replaced by "_"."""
    return "_".join(name.split("/"))
|
|
|
|
|
|
def render_article (wiki, ref, basepath="tiles", depth=0, maxdepth=3):
    """Render one wiki article into a node built by recursiverender.

    wiki     -- site object used for page lookup and parsing
    ref      -- a Page, an absolute URL (anything starting with "http") or
                a page title
    basepath -- passed to ensure_wiki_image_tiles and used as the directory
                prefix of the name handed to recursiverender
    depth, maxdepth -- accepted for compatibility; not used here

    The page is parsed with wiki.parse.  Direct <p> children of the body
    with non-empty text are collected into one text cell; images from
    gallery lists and from <a class="image"> links are tiled with
    ensure_wiki_image_tiles, each source once, SVG sources skipped.

    Returns the node from recursiverender with 'text' set to a caption
    linking back to the wiki and, when there are images, 'image' copied
    from the first image.
    """
    print ("render_article", ref, file=sys.stderr)
    if isinstance(ref, Page):
        page = ref
        title = page.name
        ref = wiki_title_to_url(wiki, page.name)
    elif ref.startswith("http"):
        title = wiki_url_to_title(ref)
        page = wiki.pages[title]
    else:
        title = ref
        page = wiki.pages[title]
        ref = wiki_title_to_url(wiki, page.name)

    parse = wiki.parse(page=title)
    # Not called "html": that name is the renderer imported at module level.
    page_html = parse['text']['*']
    tree = html5lib.parse(page_html, treebuilder="etree", namespaceHTMLElements=False)
    body = tree.find("./body")
    paras = []
    images = []
    seen = set()

    for c in body:
        if c.tag == "p":
            # filter out paras like <p><br></p> by checking the text-only render
            if ET.tostring(c, encoding="utf-8", method="text").strip():
                paras.append(ET.tostring(c, encoding="utf-8", method="html").strip())

        elif c.tag == "ul" and "gallery" in (c.attrib.get("class") or ""):
            for src, caption, article in parse_gallery(c):
                if src is None:
                    # gallery box without an image link: nothing to tile
                    continue
                src = wiki_absurl(wiki, src)
                # svg can't go straight to PIL, same as for plain image links
                if src.lower().endswith(".svg") or src in seen:
                    continue
                seen.add(src)
                print ("GalleryImage", src, caption, article, file=sys.stderr)
                images.append(ensure_wiki_image_tiles(wiki, src, caption, basepath).zoom())

    for a in body.findall('.//a[@class="image"]'):
        caption = a.attrib.get("title", '')
        src = wiki_absurl(wiki, a.attrib.get("href"))
        # skipping svg for the moment (can't go straight to PIL)
        if src.lower().endswith(".svg"):
            continue
        print (u"Image_link {0}:'{1}'".format(src, caption).encode("utf-8"), file=sys.stderr)
        if src in seen:
            continue
        seen.add(src)
        images.append(ensure_wiki_image_tiles(wiki, src, caption, basepath).zoom())

    print ("{0} paras, {1} images".format(len(paras), len(images)), file=sys.stderr)

    # Honour basepath here too; with the default this is still "tiles/<name>".
    basename = basepath + "/" + name_to_path(page.name)

    cells = []
    if paras:
        cells.append(textcell(paras))
    cells.extend(images)

    ret = recursiverender(cells, basename)
    ret['text'] = u"""<p class="caption"><span class="text">{0}</span><a class="url" href="{1}">WIKI</a></p>""".format(title, ref)
    if images:
        ret['image'] = images[0]['image']
    return ret
|
|
|
|
# article = {}
|
|
# article['text'] = title
|
|
# article['children'] = children = []
|
|
# children.append(textcell(paras))
|
|
# for iz in images[:2]:
|
|
# if 'image' not in article and 'image' in iz:
|
|
# article['image'] = iz['image']
|
|
# children.append(iz)
|
|
# restimages = images[2:]
|
|
# if len(restimages) == 1:
|
|
# children.append(restimages[0])
|
|
# elif len(restimages) > 1:
|
|
# children.append(gridrender(restimages, basename))
|
|
# return article
|
|
|
|
def render_category (wiki, cat, output="tiles"):
    """Render every member of a category into one grid node.

    wiki   -- site object used for page lookup
    cat    -- category given as absolute URL (starting with "http") or as
              page title
    output -- directory prefix of the name handed to gridrender

    Members are sorted by name and rendered with render_article; each member
    node and the returned grid node get a caption linking back to the wiki.
    """
    print ("Render Category", cat, file=sys.stderr)
    # The non-URL branch used to read an undefined name ("ref") and raised
    # NameError; the title is only needed for the page lookup.
    title = wiki_url_to_title(cat) if cat.startswith("http") else cat
    cat = wiki.pages[title]
    print ("cat", cat, file=sys.stderr)

    pages = sorted(cat.members(), key=lambda x: x.name)
    pagenodes = []
    for page in pages:
        # NOTE(review): articles are rendered with render_article's default
        # basepath, not with *output* -- confirm whether that is intended.
        node = render_article(wiki, page.name)
        node['text'] = u"""<p class="caption"><span class="text">{0}</span><a class="url" href="{1}">WIKI</a></p>""".format(page.name, wiki_title_to_url(wiki, page.name))
        pagenodes.append(node)

    ret = gridrender(pagenodes, output+"/"+cat.name.replace(":", "_"))
    ret['text'] = u"""<p class="caption"><a class="url" href="{0}">{1}</a></p>""".format(wiki_title_to_url(wiki, cat.name), cat.name)
    return ret
|
|
# for p in pages:
|
|
# print (p.name, wiki_title_to_url(wiki, p.name))
|
|
|
|
def make_category (args):
    """Sub-command handler: render args.category and print the result.

    The node from render_category is written to stdout, through html() when
    args.html is set and as indented JSON otherwise.
    """
    site = Site((args.wikiprotocol, args.wikihost), path=args.wikipath)
    node = render_category(site, args.category)
    rendered = html(node, "") if args.html else json.dumps(node, indent=2)
    print (rendered)
|
|
|
|
|
|
def make_article (args):
    """Sub-command handler: render args.wikipage and print the result.

    The node from render_article is written to stdout, through html() when
    args.html is set and as indented JSON otherwise.
    """
    site = Site((args.wikiprotocol, args.wikihost), path=args.wikipath)
    node = render_article(site, args.wikipage)
    rendered = html(node, "") if args.html else json.dumps(node, indent=2)
    print (rendered)
|
|
|
|
def make_gallery(args):
    """Sub-command handler: render one or more pages and print the result.

    A single page is rendered as an article.  Several pages are rendered
    one by one (render_category for names containing "Category:",
    render_article otherwise) and combined with recursiverender when
    args.recursive is set, else with gridrender.  The combined node is named
    args.output + "/" + args.name; when --name was not given "gallery" is
    used instead of failing on the None default.

    The result is written to stdout, through html() when args.html is set
    and as indented JSON otherwise.
    """
    wiki = Site((args.wikiprotocol, args.wikihost), path=args.wikipath)
    if len(args.wikipage) == 1:
        root_node = render_article(wiki, args.wikipage[0])
    else:
        children = []
        for wikipage in args.wikipage:
            print ("rendering", wikipage, file=sys.stderr)
            if "Category:" in wikipage:
                cnode = render_category(wiki, wikipage, args.output)
            else:
                cnode = render_article(wiki, wikipage)
            children.append(cnode)

        # --name defaults to None, which cannot be concatenated to a path.
        name = args.name if args.name is not None else "gallery"
        basename = args.output + "/" + name
        # NOTE(review): args.direction is parsed but not used; the direction
        # stays fixed at 1 as before -- confirm before wiring the option in.
        if args.recursive:
            root_node = recursiverender(children, basename, direction=1)
        else:
            root_node = gridrender(children, basename, direction=1)

    if args.html:
        print (html(root_node, ""))
    else:
        print (json.dumps(root_node, indent=2))
|
|
|
|
|
|
def testwiki (args):
    """Sub-command handler: build and return the Site for the given options.

    Uses args.wikiprotocol and args.wikihost as host pair and args.wikipath
    as path.
    """
    host = (args.wikiprotocol, args.wikihost)
    return Site(host, path=args.wikipath)
|
|
|
|
if __name__ == "__main__":

    # Command line entry point: global wiki/tile options plus one
    # sub-command, each of which stores its handler function in args.func.
    ap = ArgumentParser("")
    ap.add_argument("--wikiprotocol", default="https")
    ap.add_argument("--wikihost", default="pzwiki.wdka.nl")
    ap.add_argument("--wikipath", default="/mw-mediadesign/")
    ap.add_argument("--wikishortpath", default="/mediadesign/")

    ap.add_argument("--tilewidth", type=int, default=256)
    ap.add_argument("--tileheight", type=int, default=256)

    ap.add_argument("--output", default="tiles")

    subparsers = ap.add_subparsers(help='sub-command help')

    ap_article = subparsers.add_parser('article', help='Render an article')
    ap_article.add_argument("wikipage")
    ap_article.add_argument("--html", default=False, action="store_true")
    ap_article.set_defaults(func=make_article)

    ap_gallery = subparsers.add_parser('gallery', help='Render a gallery of articles')
    ap_gallery.add_argument("wikipage", nargs="+")
    ap_gallery.add_argument("--html", default=False, action="store_true")
    ap_gallery.add_argument("--recursive", default=False, action="store_true")
    ap_gallery.add_argument("--direction", type=int, default=3, help="cell to recursively expand into, 0-3, default: 3 (bottom-right)")
    ap_gallery.add_argument("--name", default=None)
    ap_gallery.set_defaults(func=make_gallery)

    # The help texts of the next two sub-commands were copies of the ones
    # above and described the wrong command.
    ap_testwiki = subparsers.add_parser('testwiki', help='Create the wiki site object from the given options')
    ap_testwiki.set_defaults(func=testwiki)

    ap_category = subparsers.add_parser('category', help='Render a category')
    ap_category.add_argument("category")
    ap_category.add_argument("--html", default=False, action="store_true")
    ap_category.set_defaults(func=make_category)

    args = ap.parse_args()
    # Python 3's argparse does not insist on a sub-command, in which case
    # no handler has been stored.
    if not hasattr(args, "func"):
        ap.error("a sub-command is required")
    ret = args.func(args)
|
|
|