# wiki_tiles: render MediaWiki articles/categories into leaflet-style image tile trees.
# (Scraped page chrome — repository topic note, "320 lines / 12 KiB / Python", timestamps —
# removed; it was not part of the source.)
from __future__ import print_function
import os, sys, re, urllib, urlparse, html5lib, json
from PIL import Image
from math import log
from argparse import ArgumentParser
from urllib2 import urlopen
from xml.etree import ElementTree as ET
# from wiki_get_html import page_html
from mwclient import Site
from mwclient.page import Page
from leaflet import tiles_wrapper, recursiverender, gridrender, html
from imagetile2 import tile_image
def wiki_url_to_title (url):
    """ Return the percent-decoded page title: the segment after the last '/' of a wiki URL. """
    last_segment = url.rsplit("/", 1)[-1]
    return urllib.unquote(last_segment)
def parse_gallery(t):
    """ Extract gallery items from a parsed wiki page fragment.

    t: an ElementTree element containing mediawiki gallery markup
       (<li class="gallerybox"> items).
    returns [(imagepageurl, caption, articleurl), ...] where caption is the
    serialized HTML of the gallerytext node (or None) and imagepageurl /
    articleurl may be None when absent.
    """
    items = []
    for box in t.findall(".//li[@class='gallerybox']"):
        src = None
        captiontext = None
        article = None
        image_link = box.find(".//a[@class='image']")
        # NB: compare against None explicitly -- ElementTree elements with no
        # children are falsy, so a bare truth test would skip valid nodes.
        if image_link is not None:
            src = image_link.attrib.get("href")
        caption = box.find(".//*[@class='gallerytext']")
        # bug fix: was `if caption:`, which is False for a caption element
        # that has text but no child elements, silently dropping the caption
        if caption is not None:
            captiontext = ET.tostring(caption, method="html")
            articlelink = caption.find(".//a")
            if articlelink is not None:
                article = articlelink.attrib.get("href")
        items.append((src, captiontext, article))
    return items
def mwfilepage_to_url (wiki, url):
    """ Resolve a File: page URL to (mwclient page, direct image URL) via the wiki API. """
    name = urllib.unquote(url.rsplit("/", 1)[-1])
    page = wiki.Pages[name]
    return page, page.imageinfo['url']
def url_to_path (url):
    """ Map an image URL to a local relative path by decoding it and dropping
    the first three path components,
    e.g. https://host/mw-mediadesign/images/a/ab/x.png -> a/ab/x.png """
    segments = urllib.unquote(urlparse.urlparse(url).path).split("/")
    return "/".join(segments[3:])
def wiki_absurl (wiki, url):
    """ Make url absolute against the wiki's host.

    wiki.host is either a (scheme, host) tuple or a bare hostname string
    (mwclient accepts both); bare hostnames are assumed to be http.
    """
    # idiom fix: isinstance instead of type(...) == tuple
    if isinstance(wiki.host, tuple):
        base = wiki.host[0] + "://" + wiki.host[1]
    else:
        base = "http://" + wiki.host
    return urlparse.urljoin(base, url)
def wiki_title_to_url (wiki, title):
    """ Build the public-facing URL for a page title.

    Relies on wiki.site['base'] being set to the public URL of the Main page;
    the Main page component is stripped and replaced by the normalized title.
    """
    parts = urlparse.urlparse(wiki.site['base'])
    base, _main_page = os.path.split(parts.path)
    page = wiki.pages[title]
    return parts.scheme + "://" + parts.netloc + base + "/" + page.normalize_title(page.name)
def ensure_wiki_image_tiles (wiki, imagepageurl, text='', basepath="tiles", force=False, bgcolor=None, tilewidth=256, tileheight=256, zoom=3, margin_right=0, margin_bottom=0):
    """ Download and tile a wiki image unless its tiles already exist.

    Returns the tiles_wrapper node for the image. Tiling is skipped when the
    zoom-0 tile is already on disk, unless force is given.
    """
    print ("ensure_wiki_image_tiles", imagepageurl, file=sys.stderr)
    page, imageurl = mwfilepage_to_url(wiki, imagepageurl)
    path = os.path.join(basepath, url_to_path(imageurl))
    print ("imageurl, path", imageurl, path, file=sys.stderr)
    node = tiles_wrapper(path, imagepageurl, text=text)
    zerotile = node.get_tile_path(0, 0, 0)
    if not force and os.path.exists(zerotile):
        # tiles already rendered -- nothing to do
        return node
    try:
        os.makedirs(path)
    except OSError:
        pass  # directory already exists
    im = Image.open(urlopen(imageurl))
    tile_image(im, zoom, tilewidth, tileheight, path + "/", node.tilename, bgcolor, margin_right, margin_bottom)
    return node
def textcell (paras):
    """ Wrap a list of HTML paragraphs in a cell node: the first paragraph
    becomes the cell text, each remaining paragraph becomes a child cell. """
    first, rest = paras[:1], paras[1:]
    node = {'text': first}
    if rest:
        node['children'] = [textcell([p]) for p in rest]
    return node
def name_to_path (name):
    """ Make a page name filesystem-safe by replacing slashes with underscores. """
    return "_".join(name.split("/"))
def render_article (wiki, ref, basepath="tiles", depth=0, maxdepth=3, tilewidth=256, tileheight=256):
    """ Render a wiki article to a tile node tree.

    ref may be a mwclient Page, a full article URL, or a bare title.
    Paragraph text becomes a leading text cell; gallery images and inline
    image links are downloaded/tiled via ensure_wiki_image_tiles. Returns a
    node dict (from recursiverender) whose 'text' is a caption linking back
    to the wiki, and whose 'image' is taken from the first image, if any.
    (depth/maxdepth are reserved for recursive gallery-article rendering.)
    """
    print ("render_article", ref, file=sys.stderr)
    # Normalize ref into (page, title, public URL)
    if isinstance(ref, Page):
        page = ref
        title = page.name
        ref = wiki_title_to_url(wiki, page.name)
    elif ref.startswith("http"):
        title = wiki_url_to_title(ref)
        page = wiki.pages[title]
    else:
        title = ref
        page = wiki.pages[title]
        ref = wiki_title_to_url(wiki, page.name)
    parse = wiki.parse(page=title)
    # renamed from `html`: that name would shadow the imported leaflet.html
    pagehtml = parse['text']['*']
    tree = html5lib.parse(pagehtml, treebuilder="etree", namespaceHTMLElements=False)
    body = tree.find("./body")
    paras = []
    images = []
    imgsrcs = {}  # dedup: image URL -> True once queued
    for c in body:
        if c.tag == "p":
            # filter out paras like <p><br></p> by checking text-only render length
            ptext = ET.tostring(c, encoding="utf-8", method="text").strip()
            if len(ptext) > 0:
                paras.append(ET.tostring(c, encoding="utf-8", method="html").strip())
        elif c.tag == "ul" and "gallery" in (c.attrib.get("class") or ""):
            # Ensure each gallery image is downloaded/tiled (at least the 00 tile)
            for src, caption, article in parse_gallery(c):
                src = wiki_absurl(wiki, src)
                if src in imgsrcs:
                    continue
                imgsrcs[src] = True
                print ("GalleryImage", src, caption, article, file=sys.stderr)
                images.append(ensure_wiki_image_tiles(wiki, src, caption, basepath, tilewidth=tilewidth, tileheight=tileheight).zoom())
    for a in body.findall('.//a[@class="image"]'):
        caption = a.attrib.get("title", '')
        src = wiki_absurl(wiki, a.attrib.get("href"))
        # OEI... skipping svg for the moment (can't go straight to PIL)
        if src.endswith(".svg"):
            continue
        print (u"Image_link {0}:'{1}'".format(src, caption).encode("utf-8"), file=sys.stderr)
        if src in imgsrcs:
            continue
        imgsrcs[src] = True
        images.append(ensure_wiki_image_tiles(wiki, src, caption, basepath, tilewidth=tilewidth, tileheight=tileheight).zoom())
    print ("{0} paras, {1} images".format(len(paras), len(images)), file=sys.stderr)
    if title is None:
        title = page.name
    # bug fix: basename previously hard-coded "tiles/", ignoring the basepath parameter
    basename = os.path.join(basepath, name_to_path(page.name))
    cells = []
    if len(paras) > 0:
        cells.append(textcell(paras))
    cells.extend(images)
    ret = recursiverender(cells, basename, tilewidth=tilewidth, tileheight=tileheight)
    ret['text'] = u"""<p class="caption"><span class="text">{0}</span><a class="url" href="{1}">WIKI</a></p>""".format(title, ref)
    if images:
        ret['image'] = images[0]['image']
    return ret
# article = {}
# article['text'] = title
# article['children'] = children = []
# children.append(textcell(paras))
# for iz in images[:2]:
# if 'image' not in article and 'image' in iz:
# article['image'] = iz['image']
# children.append(iz)
# restimages = images[2:]
# if len(restimages) == 1:
# children.append(restimages[0])
# elif len(restimages) > 1:
# children.append(gridrender(restimages, basename))
# return article
def render_category (wiki, cat, output="tiles", tilewidth=256, tileheight=256):
    """ Render every member page of a category into a grid of article nodes.

    cat may be a full category URL or a bare category title. Returns the
    gridrender node, captioned with a link back to the category page.
    """
    print ("Render Category", cat, file=sys.stderr)
    if cat.startswith("http"):
        title = wiki_url_to_title(cat)
        cat = wiki.pages[title]
    else:
        # bug fix: this branch previously read an undefined name (`ref`),
        # raising NameError whenever a bare title was given
        title = cat
        cat = wiki.pages[cat]
    print ("cat", cat, file=sys.stderr)
    pages = list(cat.members())
    pages.sort(key=lambda p: p.name)
    pagenodes = [render_article(wiki, p.name, tilewidth=tilewidth, tileheight=tileheight) for p in pages]
    # caption each article node with a link back to its wiki page
    for page, node in zip(pages, pagenodes):
        node['text'] = u"""<p class="caption"><span class="text">{0}</span><a class="url" href="{1}">WIKI</a></p>""".format(page.name, wiki_title_to_url(wiki, page.name))
    ret = gridrender(pagenodes, output + "/" + cat.name.replace(":", "_"), tilewidth=tilewidth, tileheight=tileheight)
    ret['text'] = u"""<p class="caption"><a class="url" href="{0}">{1}</a></p>""".format(wiki_title_to_url(wiki, cat.name), cat.name)
    return ret
# for p in pages:
# print (p.name, wiki_title_to_url(wiki, p.name))
def make_category (args):
    """ CLI entry point: render a category and print it as JSON (or HTML with --html). """
    wiki = Site((args.wikiprotocol, args.wikihost), path=args.wikipath)
    root_node = render_category(wiki, args.category)
    output = html(root_node, "") if args.html else json.dumps(root_node, indent=2)
    print (output)
def make_article (args):
    """ CLI entry point: render a single article and print it as JSON (or HTML with --html). """
    wiki = Site((args.wikiprotocol, args.wikihost), path=args.wikipath)
    root_node = render_article(wiki, args.wikipage, tilewidth=args.tilewidth, tileheight=args.tileheight)
    output = html(root_node, "") if args.html else json.dumps(root_node, indent=2)
    print (output)
def make_gallery(args):
    """ CLI entry point: render one or more pages/categories into a combined node.

    A single page renders directly; multiple pages (categories are detected by
    a "Category:" prefix in the name) are combined with recursiverender when
    --recursive is given, otherwise gridrender. Prints JSON, or HTML with --html.
    """
    wiki = Site((args.wikiprotocol, args.wikihost), path=args.wikipath)
    if len(args.wikipage) == 1:
        root_node = render_article(wiki, args.wikipage[0], tilewidth=args.tilewidth, tileheight=args.tileheight)
    else:
        children = []
        for wikipage in args.wikipage:
            print ("rendering", wikipage, file=sys.stderr)
            if "Category:" in wikipage:
                cnode = render_category(wiki, wikipage, args.output)
            else:
                cnode = render_article(wiki, wikipage, tilewidth=args.tilewidth, tileheight=args.tileheight)
            children.append(cnode)
        # bug fix: --name defaults to None, which used to crash the path join below
        name = args.name if args.name is not None else "gallery"
        combine = recursiverender if args.recursive else gridrender
        # bug fix: --direction was parsed but ignored (direction was hard-coded to 1)
        root_node = combine(children, args.output + "/" + name, direction=args.direction, tilewidth=args.tilewidth, tileheight=args.tileheight)
    if args.html:
        print (html(root_node, ""))
    else:
        print (json.dumps(root_node, indent=2))
def testwiki (args):
    """ Sanity-check subcommand: construct the mwclient Site connection and return it. """
    return Site((args.wikiprotocol, args.wikihost), path=args.wikipath)
if __name__ == "__main__":
    ap = ArgumentParser("")
    # wiki connection options
    ap.add_argument("--wikiprotocol", default="https")
    ap.add_argument("--wikihost", default="pzwiki.wdka.nl")
    ap.add_argument("--wikipath", default="/mw-mediadesign/")
    ap.add_argument("--wikishortpath", default="/mediadesign/")
    # tile output options
    ap.add_argument("--tilewidth", type=int, default=256)
    ap.add_argument("--tileheight", type=int, default=256)
    ap.add_argument("--output", default="tiles")
    subparsers = ap.add_subparsers(help='sub-command help')

    ap_article = subparsers.add_parser('article', help='Render an article')
    ap_article.add_argument("wikipage")
    ap_article.add_argument("--html", default=False, action="store_true")
    ap_article.set_defaults(func=make_article)

    ap_gallery = subparsers.add_parser('gallery', help='Render a gallery of articles')
    ap_gallery.add_argument("wikipage", nargs="+")
    ap_gallery.add_argument("--html", default=False, action="store_true")
    ap_gallery.add_argument("--recursive", default=False, action="store_true")
    ap_gallery.add_argument("--direction", type=int, default=3, help="cell to recursively expand into, 0-3, default: 3 (bottom-right)")
    ap_gallery.add_argument("--name", default=None)
    ap_gallery.set_defaults(func=make_gallery)

    # fix: help strings were copy-pasted from the other subcommands
    ap_testwiki = subparsers.add_parser('testwiki', help='Test the wiki connection')
    ap_testwiki.set_defaults(func=testwiki)

    ap_category = subparsers.add_parser('category', help='Render a category')
    ap_category.add_argument("category")
    ap_category.add_argument("--html", default=False, action="store_true")
    ap_category.set_defaults(func=make_category)

    args = ap.parse_args()
    # newer argparse treats subcommands as optional; fail clearly instead of
    # crashing with AttributeError on args.func
    if not hasattr(args, "func"):
        ap.error("a sub-command is required")
    ret = args.func(args)