from __future__ import print_function
import os, sys, re, urllib, urlparse, html5lib, json
from PIL import Image
from math import log
from argparse import ArgumentParser
from urllib2 import urlopen
from xml.etree import ElementTree as ET
# from wiki_get_html import page_html
from mwclient import Site
from mwclient.page import Page
from leaflet import tiles_wrapper, recursiverender, gridrender, html
from imagetile2 import tile_image


def wiki_url_to_title(url):
    """Return the (URL-unquoted) page title: the last path segment of *url*."""
    return urllib.unquote(url.split("/")[-1])


def parse_gallery(t):
    """Extract gallery entries from a parsed <ul class="gallery"> element.

    Returns [(imagepageurl, captionhtml, articleurl), ...]; any of the three
    items may be None when the corresponding markup is missing.
    """
    items = []
    for box in t.findall(".//li[@class='gallerybox']"):
        src = None
        captiontext = None
        article = None
        image_link = box.find(".//a[@class='image']")
        if image_link is not None:
            src = image_link.attrib.get("href")
            caption = box.find(".//*[@class='gallerytext']")
            # BUGFIX: must test "is not None" -- an ElementTree element with
            # text but no child elements is falsy, so "if caption:" silently
            # dropped text-only captions.
            if caption is not None:
                captiontext = ET.tostring(caption, method="html")
                articlelink = caption.find(".//a")
                if articlelink is not None:
                    article = articlelink.attrib.get("href")
        items.append((src, captiontext, article))
    return items


def mwfilepage_to_url(wiki, url):
    """Resolve a File: page URL to (mwclient page object, direct image URL)."""
    filename = urllib.unquote(url.split("/")[-1])
    page = wiki.Pages[filename]
    return page, page.imageinfo['url']


def url_to_path(url):
    """Map an image URL to a relative path by dropping the first three
    path components, e.g.
    https://pzwiki.wdka.nl/mediadesign/File:I-could-have-written-that_these-are-the-words_mb_300dpi.png
    """
    path = urllib.unquote(urlparse.urlparse(url).path)
    return "/".join(path.split("/")[3:])


def wiki_absurl(wiki, url):
    """Make *url* absolute against the wiki's host.

    mwclient's Site.host is either a (scheme, host) tuple or a bare
    hostname string; assume plain http for the latter.
    """
    if isinstance(wiki.host, tuple):
        root = wiki.host[0] + "://" + wiki.host[1]
    else:
        root = "http://" + wiki.host
    return urlparse.urljoin(root, url)


def wiki_title_to_url(wiki, title):
    """Public-facing URL for *title*.

    Relies on wiki.site['base'] being set to the public facing URL of the
    Main page.
    """
    parts = urlparse.urlparse(wiki.site['base'])
    base, main_page = os.path.split(parts.path)
    ret = parts.scheme + "://" + parts.netloc + base
    p = wiki.pages[title]
    return ret + "/" + p.normalize_title(p.name)


def ensure_wiki_image_tiles(wiki, imagepageurl, text='', basepath="tiles",
                            force=False, bgcolor=None, tilewidth=256,
                            tileheight=256, zoom=3):
    """Download a wiki image and cut it into map tiles (unless already done).

    The zoom-0 tile is used as the "already tiled" marker; pass force=True
    to re-tile regardless. Returns the tiles_wrapper for the tile directory.
    """
    print("ensure_wiki_image_tiles", imagepageurl, file=sys.stderr)
    page, imageurl = mwfilepage_to_url(wiki, imagepageurl)
    path = os.path.join(basepath, url_to_path(imageurl))
    print("imageurl, path", imageurl, path, file=sys.stderr)
    ret = tiles_wrapper(path, imagepageurl, text=text)
    tp = ret.get_tile_path(0, 0, 0)
    if os.path.exists(tp) and not force:
        return ret
    try:
        os.makedirs(path)
    except OSError:
        pass  # directory already exists
    im = Image.open(urlopen(imageurl))
    tile_image(im, zoom, tilewidth, tileheight, path + "/", ret.tilename, bgcolor)
    return ret


def textcell(paras):
    """Nest a list of paragraph strings into a cell tree: the first
    paragraph is this node's text, each remaining one becomes a child."""
    node = {'text': paras[:1]}
    moretext = paras[1:]
    if moretext:
        node['children'] = [textcell([x]) for x in moretext]
    return node


def name_to_path(name):
    """Make a page name filesystem-safe (slashes would create subdirs)."""
    return name.replace("/", "_")


def render_article(wiki, ref, basepath="tiles", depth=0, maxdepth=3):
    """Render one article into a leaflet cell tree.

    *ref* may be an mwclient Page, a full article URL, or a plain title.
    Produces a text cell from the article's paragraphs plus one image cell
    per unique image (gallery images and inline image links), with tiles
    written under *basepath*. depth/maxdepth are reserved for recursive
    gallery-article expansion (currently disabled).
    """
    print("render_article", ref, file=sys.stderr)
    if isinstance(ref, Page):
        page = ref
        title = page.name
        ref = wiki_title_to_url(wiki, page.name)
    elif ref.startswith("http"):
        title = wiki_url_to_title(ref)
        page = wiki.pages[title]
    else:
        title = ref
        page = wiki.pages[title]
        ref = wiki_title_to_url(wiki, page.name)

    parse = wiki.parse(page=title)
    html = parse['text']['*']
    tree = html5lib.parse(html, treebuilder="etree", namespaceHTMLElements=False)
    body = tree.find("./body")

    paras = []
    images = []
    imgsrcs = {}  # de-duplicate images by absolute source URL
    for c in body:
        if c.tag == "p":
            # Filter out visually-empty paragraphs by checking the length
            # of a text-only render first.
            ptext = ET.tostring(c, encoding="utf-8", method="text").strip()
            if len(ptext) > 0:
                paras.append(ET.tostring(c, encoding="utf-8", method="html").strip())
        elif c.tag == "ul" and c.attrib.get("class") is not None \
                and "gallery" in c.attrib.get("class"):
            # Ensure each gallery image is downloaded/tiled (at least 0/0/0).
            for src, caption, article in parse_gallery(c):
                src = wiki_absurl(wiki, src)
                if src in imgsrcs:
                    continue
                imgsrcs[src] = True
                print("GalleryImage", src, caption, article, file=sys.stderr)
                images.append(
                    ensure_wiki_image_tiles(wiki, src, caption, basepath).zoom())

    for a in body.findall('.//a[@class="image"]'):
        caption = a.attrib.get("title", '')
        src = wiki_absurl(wiki, a.attrib.get("href"))
        # OEI... skipping svg for the moment (can't go straight to PIL)
        if src.endswith(".svg"):
            continue
        print(u"Image_link {0}:'{1}'".format(src, caption).encode("utf-8"),
              file=sys.stderr)
        if src in imgsrcs:
            continue
        imgsrcs[src] = True
        images.append(
            ensure_wiki_image_tiles(wiki, src, caption, basepath).zoom())

    print("{0} paras, {1} images".format(len(paras), len(images)), file=sys.stderr)
    if title is None:
        title = page.name
    # BUGFIX: was hard-coded "tiles/"; honor the basepath parameter.
    basename = basepath + "/" + name_to_path(page.name)

    cells = []
    if len(paras) > 0:
        cells.append(textcell(paras))
    cells.extend(images)
    ret = recursiverender(cells, basename)
    # NOTE(review): this template appears to have lost its HTML markup in
    # transit ({1}/ref is formatted but unused) -- preserved as found.
    ret['text'] = u"""
{0}WIKI
""".format(title, ref)
    if images:
        ret['image'] = images[0]['image']
    return ret


def render_category(wiki, cat, output="tiles"):
    """Render every member page of a category into a grid under *output*.

    *cat* may be a category page URL or a plain category name.
    """
    print("Render Category", cat, file=sys.stderr)
    if cat.startswith("http"):
        title = wiki_url_to_title(cat)
        cat = wiki.pages[title]
    else:
        # BUGFIX: was `title = ref` -- NameError, ref is undefined here.
        title = cat
        cat = wiki.pages[cat]
    print("cat", cat, file=sys.stderr)

    pages = sorted(cat.members(), key=lambda x: x.name)
    pagenodes = [render_article(wiki, x.name) for x in pages]
    for page, node in zip(pages, pagenodes):
        node['text'] = u"""
{0}WIKI
""".format(page.name, wiki_title_to_url(wiki, page.name))
    ret = gridrender(pagenodes, output + "/" + cat.name.replace(":", "_"))
    ret['text'] = u"""
{1}
""".format(wiki_title_to_url(wiki, cat.name), cat.name)
    return ret


def _connect(args):
    """Open the mwclient Site described by the common CLI arguments."""
    return Site((args.wikiprotocol, args.wikihost), path=args.wikipath)


def _emit(root_node, as_html):
    """Print the finished cell tree as HTML or indented JSON."""
    if as_html:
        print(html(root_node, ""))
    else:
        print(json.dumps(root_node, indent=2))


def make_category(args):
    """CLI entry: render a single category."""
    wiki = _connect(args)
    _emit(render_category(wiki, args.category), args.html)


def make_article(args):
    """CLI entry: render a single article."""
    wiki = _connect(args)
    _emit(render_article(wiki, args.wikipage), args.html)


def make_gallery(args):
    """CLI entry: render one or more pages/categories into a combined gallery."""
    wiki = _connect(args)
    if len(args.wikipage) == 1:
        root_node = render_article(wiki, args.wikipage[0])
    else:
        children = []
        for wikipage in args.wikipage:
            print("rendering", wikipage, file=sys.stderr)
            if "Category:" in wikipage:
                cnode = render_category(wiki, wikipage, args.output)
            else:
                cnode = render_article(wiki, wikipage)
            children.append(cnode)
        # BUGFIX: --name defaults to None, which made the path join below
        # raise TypeError; fall back to a sane directory name.
        name = args.name if args.name is not None else "gallery"
        if args.recursive:
            root_node = recursiverender(children, args.output + "/" + name,
                                        direction=1)
        else:
            root_node = gridrender(children, args.output + "/" + name,
                                   direction=1)
    _emit(root_node, args.html)


def testwiki(args):
    """CLI entry: smoke-test the wiki connection by constructing the Site."""
    return Site((args.wikiprotocol, args.wikihost), path=args.wikipath)


if __name__ == "__main__":
    ap = ArgumentParser("")
    ap.add_argument("--wikiprotocol", default="https")
    ap.add_argument("--wikihost", default="pzwiki.wdka.nl")
    ap.add_argument("--wikipath", default="/mw-mediadesign/")
    ap.add_argument("--wikishortpath", default="/mediadesign/")
    ap.add_argument("--tilewidth", type=int, default=256)
    ap.add_argument("--tileheight", type=int, default=256)
    # ap.add_argument("--zoom", type=int, default=3)
    ap.add_argument("--output", default="tiles")
    subparsers = ap.add_subparsers(help='sub-command help')

    ap_article = subparsers.add_parser('article', help='Render an article')
    ap_article.add_argument("wikipage")
    ap_article.add_argument("--html", default=False, action="store_true")
    ap_article.set_defaults(func=make_article)

    ap_gallery = subparsers.add_parser('gallery',
                                       help='Render a gallery of articles')
    ap_gallery.add_argument("wikipage", nargs="+")
    ap_gallery.add_argument("--html", default=False, action="store_true")
    ap_gallery.add_argument("--recursive", default=False, action="store_true")
    ap_gallery.add_argument("--direction", type=int, default=3,
        help="cell to recursively expand into, 0-3, default: 3 (bottom-right)")
    ap_gallery.add_argument("--name", default=None)
    ap_gallery.set_defaults(func=make_gallery)

    # BUGFIX: help texts below were copy-pasted from other subcommands.
    ap_testwiki = subparsers.add_parser('testwiki',
                                        help='Test the wiki connection')
    ap_testwiki.set_defaults(func=testwiki)

    ap_category = subparsers.add_parser('category', help='Render a category')
    ap_category.add_argument("category")
    ap_category.add_argument("--html", default=False, action="store_true")
    ap_category.set_defaults(func=make_category)

    args = ap.parse_args()
    ret = args.func(args)