You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

319 lines
11 KiB
Python

7 years ago
from __future__ import print_function
import os, sys, re, urllib, urlparse, html5lib, json
from PIL import Image
from math import log
from argparse import ArgumentParser
from urllib2 import urlopen
from xml.etree import ElementTree as ET
# from wiki_get_html import page_html
from mwclient import Site
from mwclient.page import Page
from leaflet import tiles_wrapper, recursiverender, gridrender, html
from imagetile2 import tile_image
def wiki_url_to_title (url):
    """ Return the page title carried by a wiki URL: the text after the last "/", percent-decoded. """
    last_segment = url.rsplit("/", 1)[-1]
    return urllib.unquote(last_segment)
def parse_gallery(t):
    """ returns [(imagepageurl, caption, articleurl), ...]

    t is the element tree of a gallery; one tuple is produced per li.gallerybox found in it.
    imagepageurl is the href of the item's a.image link, caption is the .gallerytext element
    serialized as HTML, articleurl is the href of the first link inside the caption.
    Any of the three is None when the corresponding element is missing.
    """
    items = []
    for box in t.findall(".//li[@class='gallerybox']"):
        src = None
        captiontext = None
        article = None

        image_link = box.find(".//a[@class='image']")
        if image_link is not None:
            src = image_link.attrib.get("href")

        caption = box.find(".//*[@class='gallerytext']")
        # Compare with None explicitly: an Element without child elements is falsy,
        # so a plain truth test would drop captions that contain only text.
        if caption is not None:
            captiontext = ET.tostring(caption, method="html")
            articlelink = caption.find(".//a")
            if articlelink is not None:
                article = articlelink.attrib.get("href")

        items.append((src, captiontext, article))
    return items
def mwfilepage_to_url (wiki, url):
    """ Look up the wiki page named by the last segment of url (percent-decoded) and
    return (page, page.imageinfo['url']). """
    encoded_name = url.split("/")[-1]
    filepage = wiki.Pages[urllib.unquote(encoded_name)]
    image_url = filepage.imageinfo['url']
    return filepage, image_url
def url_to_path (url):
    """ Return the percent-decoded path of url without its first two path components.

    e.g. https://pzwiki.wdka.nl/mw-mediadesign/images/a/ab/foo.png -> a/ab/foo.png
    URLs with fewer than three path components yield the empty string.
    """
    decoded_path = urllib.unquote(urlparse.urlparse(url).path)
    segments = decoded_path.split("/")
    # segments[0] is the empty string in front of the leading "/", so [3:] skips two components
    return "/".join(segments[3:])
def wiki_absurl (wiki, url):
    """ Resolve url (possibly relative, e.g. "/wiki/File:X.png") against the wiki's host.

    wiki.host may be a (scheme, hostname) pair or a bare hostname. For a bare hostname
    the scheme is taken from wiki.scheme when the object has one, otherwise "http".
    Returns the absolute URL as produced by urlparse.urljoin.
    """
    host = wiki.host
    if isinstance(host, (tuple, list)):
        base = host[0] + "://" + host[1]
    else:
        # NOTE(review): newer mwclient releases appear to keep the scheme in Site.scheme
        # and a plain string in Site.host -- confirm against the installed version.
        scheme = getattr(wiki, "scheme", None) or "http"
        base = scheme + "://" + host
    return urlparse.urljoin(base, url)
def wiki_title_to_url (wiki, title):
    """ Build the public URL of the page called title.

    relies on wiki.site['base'] being set to the public facing URL of the Main page:
    the last path component of that URL is replaced by the normalized page title.
    """
    main_page = urlparse.urlparse(wiki.site['base'])
    # everything up to (not including) the Main page's own name
    article_dir = os.path.dirname(main_page.path)
    prefix = main_page.scheme + "://" + main_page.netloc + article_dir
    page = wiki.pages[title]
    return prefix + "/" + page.normalize_title(page.name)
def ensure_wiki_image_tiles (wiki, imagepageurl, text='', basepath="tiles", force=False, bgcolor=None, tilewidth=256, tileheight=256, zoom=3):
    """ Make sure tiles exist on disk for the image shown on a wiki File: page.

    wiki: the site object used to resolve imagepageurl to the actual image URL.
    imagepageurl: URL of the wiki page of the image.
    text: passed on to tiles_wrapper.
    basepath: directory below which the tile directory (derived from the image URL) is created.
    force: re-download and re-tile even when tile (0, 0, 0) already exists.
    bgcolor, tilewidth, tileheight, zoom: passed on to tile_image.

    Returns the tiles_wrapper object for the image. Progress is logged to stderr.
    """
    print ("ensure_wiki_image_tiles", imagepageurl, file=sys.stderr)
    _page, imageurl = mwfilepage_to_url(wiki, imagepageurl)
    path = os.path.join(basepath, url_to_path(imageurl))
    print ("imageurl, path", imageurl, path, file=sys.stderr)
    ret = tiles_wrapper(path, imagepageurl, text=text)

    # The presence of the first tile is taken as "this image has already been tiled".
    tp = ret.get_tile_path(0, 0, 0)
    if os.path.exists(tp) and not force:
        return ret

    try:
        os.makedirs(path)
    except OSError:
        # An already existing directory is fine; anything else (permissions, a file
        # in the way, ...) is a real error and must not be swallowed.
        if not os.path.isdir(path):
            raise

    response = urlopen(imageurl)
    try:
        im = Image.open(response)
        # PIL may read pixel data lazily, so keep the response open until tiling is done
        tile_image(im, zoom, tilewidth, tileheight, path+"/", ret.tilename, bgcolor)
    finally:
        response.close()
    return ret
def textcell (paras):
    """ Turn a list of paragraphs into a text node.

    The first paragraph (if any) is kept as the node's 'text'; each following paragraph
    becomes its own node in 'children'. The 'children' key is only present when there is
    more than one paragraph.
    """
    first, rest = paras[:1], paras[1:]
    cell = {'text': first}
    if rest:
        cell['children'] = [textcell([extra]) for extra in rest]
    return cell
def name_to_path (name):
    """ Make a page name usable as a single path component by turning every "/" into "_". """
    return "_".join(name.split("/"))
def render_article (wiki, ref, basepath="tiles", depth=0, maxdepth=3):
    """ Render one wiki article into a node built by recursiverender.

    ref may be a mwclient Page, the full http(s) URL of the article, or a page title.
    The article is parsed by the wiki; the direct <p> children of the resulting <body>
    are collected as HTML text, and every gallery item and a.image link is tiled with
    ensure_wiki_image_tiles (each image page only once, .svg images are skipped).

    basepath: directory used both for the image tiles and for the article's own output.
    depth, maxdepth: not used at the moment (recursing into articles linked from
    gallery captions is disabled).

    Returns the node from recursiverender with 'text' set to a caption that links back
    to the wiki page and, if the article has images, 'image' taken from the first one.
    Progress is logged to stderr.
    """
    print ("render_article", ref, file=sys.stderr)
    # isinstance (not an exact type match) so that Page subclasses are accepted as well
    if isinstance(ref, Page):
        page = ref
        title = page.name
        ref = wiki_title_to_url(wiki, page.name)
    elif ref.startswith("http"):
        title = wiki_url_to_title(ref)
        page = wiki.pages[title]
    else:
        title = ref
        page = wiki.pages[title]
        ref = wiki_title_to_url(wiki, page.name)

    parse = wiki.parse(page=title)
    # not called "html": that name is the renderer imported from leaflet
    article_html = parse['text']['*']
    tree = html5lib.parse(article_html, treebuilder="etree", namespaceHTMLElements=False)
    body = tree.find("./body")
    paras = []
    images = []
    imgsrcs = set()

    for c in body:
        if c.tag == "p":
            # filter out paras like <p><br></p> by checking the text-only render length
            if ET.tostring(c, encoding="utf-8", method="text").strip():
                paras.append(ET.tostring(c, encoding="utf-8", method="html").strip())
        elif c.tag == "ul" and "gallery" in (c.attrib.get("class") or ""):
            for src, caption, article in parse_gallery(c):
                if not src:
                    # gallery box without an image link: nothing to tile
                    continue
                src = wiki_absurl(wiki, src)
                # same restriction as for plain image links below: svg can't go straight to PIL
                if src.endswith(".svg"):
                    continue
                if src in imgsrcs:
                    continue
                imgsrcs.add(src)
                print ("GalleryImage", src, caption, article, file=sys.stderr)
                images.append(ensure_wiki_image_tiles(wiki, src, caption, basepath).zoom())

    for a in body.findall('.//a[@class="image"]'):
        caption = a.attrib.get("title", '')
        src = wiki_absurl(wiki, a.attrib.get("href"))
        # OEI... skipping svg for the moment (can't go straight to PIL)
        if src.endswith(".svg"):
            continue
        print (u"Image_link {0}:'{1}'".format(src, caption).encode("utf-8"), file=sys.stderr)
        if src in imgsrcs:
            continue
        imgsrcs.add(src)
        images.append(ensure_wiki_image_tiles(wiki, src, caption, basepath).zoom())

    print ("{0} paras, {1} images".format(len(paras), len(images)), file=sys.stderr)

    # honour basepath here too (with the default this is still "tiles/<name>")
    basename = basepath + "/" + name_to_path(page.name)

    cells = []
    if paras:
        cells.append(textcell(paras))
    cells.extend(images)

    ret = recursiverender(cells, basename)
    ret['text'] = u"""<p class="caption"><span class="text">{0}</span><a class="url" href="{1}">WIKI</a></p>""".format(title, ref)
    if images:
        ret['image'] = images[0]['image']
    return ret
# article = {}
# article['text'] = title
# article['children'] = children = []
# children.append(textcell(paras))
# for iz in images[:2]:
# if 'image' not in article and 'image' in iz:
# article['image'] = iz['image']
# children.append(iz)
# restimages = images[2:]
# if len(restimages) == 1:
# children.append(restimages[0])
# elif len(restimages) > 1:
# children.append(gridrender(restimages, basename))
# return article
def render_category (wiki, cat, output="tiles"):
    """ Render every member page of a wiki category into one grid node.

    cat: the category, given either as its full http(s) URL or as its page title.
    output: directory prefix handed to gridrender for the category's own output.

    The members are rendered with render_article in order of page name and combined
    with gridrender. Returns the grid node, with 'text' set to a caption linking to
    the category page. Progress is logged to stderr.
    """
    print ("Render Category", cat, file=sys.stderr)
    # A plain title is looked up as is (the previous code referenced an undefined
    # name in this branch and raised NameError for every non-URL category).
    if cat.startswith("http"):
        cat = wiki.pages[wiki_url_to_title(cat)]
    else:
        cat = wiki.pages[cat]
    print ("cat", cat, file=sys.stderr)

    pages = sorted(cat.members(), key=lambda x: x.name)
    pagenodes = [render_article(wiki, x.name) for x in pages]
    for page, node in zip(pages, pagenodes):
        node['text'] = u"""<p class="caption"><span class="text">{0}</span><a class="url" href="{1}">WIKI</a></p>""".format(page.name, wiki_title_to_url(wiki, page.name))

    ret = gridrender(pagenodes, output+"/"+cat.name.replace(":", "_"))
    ret['text'] = u"""<p class="caption"><a class="url" href="{0}">{1}</a></p>""".format(wiki_title_to_url(wiki, cat.name), cat.name)
    return ret
# for p in pages:
# print (p.name, wiki_title_to_url(wiki, p.name))
def make_category (args):
    """ 'category' sub-command: render args.category and print the result to stdout,
    as HTML when args.html is set, otherwise as indented JSON.

    The output directory is taken from args.output (as the gallery sub-command does),
    falling back to "tiles" when args carries no such attribute.
    """
    wiki = Site((args.wikiprotocol, args.wikihost), path=args.wikipath)
    root_node = render_category(wiki, args.category, getattr(args, "output", "tiles"))
    if args.html:
        print (html(root_node, ""))
    else:
        print (json.dumps(root_node, indent=2))
def make_article (args):
    """ 'article' sub-command: render args.wikipage and print the result to stdout,
    as HTML when args.html is set, otherwise as indented JSON. """
    site = Site((args.wikiprotocol, args.wikihost), path=args.wikipath)
    node = render_article(site, args.wikipage)
    rendered = html(node, "") if args.html else json.dumps(node, indent=2)
    print (rendered)
def make_gallery(args):
    """ 'gallery' sub-command: render one or more wiki pages into a single node and
    print it to stdout, as HTML when args.html is set, otherwise as indented JSON.

    With several pages, each one is rendered on its own (names containing "Category:"
    via render_category, everything else via render_article) and the results are
    combined with recursiverender when args.recursive is set, else with gridrender.
    The combined output goes to args.output + "/" + args.name; when no name was
    given, "gallery" is used.
    """
    wiki = Site((args.wikiprotocol, args.wikihost), path=args.wikipath)
    if len(args.wikipage) == 1:
        # NOTE(review): a single page always goes through render_article, even when
        # it names a category -- confirm this is intended.
        root_node = render_article(wiki, args.wikipage[0])
    else:
        children = []
        for wikipage in args.wikipage:
            print ("rendering", wikipage, file=sys.stderr)
            if "Category:" in wikipage:
                cnode = render_category(wiki, wikipage, args.output)
            else:
                cnode = render_article(wiki, wikipage)
            children.append(cnode)

        # --name defaults to None, which used to make the concatenation below raise TypeError
        name = args.name if args.name is not None else "gallery"
        basename = args.output+"/"+name
        # NOTE(review): direction is fixed to 1 here; args.direction is not consulted.
        if args.recursive:
            root_node = recursiverender(children, basename, direction=1)
        else:
            root_node = gridrender(children, basename, direction=1)

    if args.html:
        print (html(root_node, ""))
    else:
        print (json.dumps(root_node, indent=2))
def testwiki (args):
    """ 'testwiki' sub-command: build and return the Site object for the wiki described by args. """
    endpoint = (args.wikiprotocol, args.wikihost)
    site = Site(endpoint, path=args.wikipath)
    return site
if __name__ == "__main__":
    # Command line entry point: global wiki/tile options followed by one sub-command,
    # each of which stores its handler function in args.func.
    ap = ArgumentParser("")
    ap.add_argument("--wikiprotocol", default="https")
    ap.add_argument("--wikihost", default="pzwiki.wdka.nl")
    ap.add_argument("--wikipath", default="/mw-mediadesign/")
    ap.add_argument("--wikishortpath", default="/mediadesign/")
    ap.add_argument("--tilewidth", type=int, default=256)
    ap.add_argument("--tileheight", type=int, default=256)
    # ap.add_argument("--zoom", type=int, default=3)
    ap.add_argument("--output", default="tiles")
    # ap.add_argument("--title", default="TITLE")
    subparsers = ap.add_subparsers(help='sub-command help')

    ap_article = subparsers.add_parser('article', help='Render an article')
    ap_article.add_argument("wikipage")
    ap_article.add_argument("--html", default=False, action="store_true")
    ap_article.set_defaults(func=make_article)

    ap_gallery = subparsers.add_parser('gallery', help='Render a gallery of articles')
    ap_gallery.add_argument("wikipage", nargs="+")
    ap_gallery.add_argument("--html", default=False, action="store_true")
    ap_gallery.add_argument("--recursive", default=False, action="store_true")
    ap_gallery.add_argument("--direction", type=int, default=3, help="cell to recursively expand into, 0-3, default: 3 (bottom-right)")
    ap_gallery.add_argument("--name", default=None)
    ap_gallery.set_defaults(func=make_gallery)

    # each sub-command gets its own parser variable and a help text that matches it
    ap_testwiki = subparsers.add_parser('testwiki', help='Create the wiki site connection and exit')
    ap_testwiki.set_defaults(func=testwiki)

    ap_category = subparsers.add_parser('category', help='Render a category')
    ap_category.add_argument("category")
    ap_category.add_argument("--html", default=False, action="store_true")
    ap_category.set_defaults(func=make_category)

    args = ap.parse_args()
    args.func(args)