from __future__ import print_function
import os, sys, re, urllib, urlparse, html5lib, json
from PIL import Image
from math import log
from argparse import ArgumentParser
from urllib2 import urlopen
from xml.etree import ElementTree as ET
# from wiki_get_html import page_html
from mwclient import Site
from mwclient.page import Page
from mwclient.errors import APIError
from leaflet import tiles_wrapper, recursiverender, gridrender, html
from imagetile2 import tile_image
from urllib import quote as urlquote
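
# Overview: this script renders MediaWiki pages into a "leaflet"-style tile structure.
# It uses mwclient to talk to the wiki, html5lib/ElementTree to pick paragraphs, galleries
# and image links out of the parsed page HTML, and tiles_wrapper/tile_image (from the
# leaflet and imagetile2 modules) to cut images into tile pyramids. Output is a node tree
# printed as JSON (or as HTML with --html); the recentfiles subcommand appends recently
# uploaded images to a JSON stream file.
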
def wget (url, path, blocksize=4*1000):
    if type(url) == unicode:
        url = url.encode("utf-8")
    count = 0
    with open(path, "wb") as fout:
        fin = urlopen(url)
        while True:
            data = fin.read(blocksize)
            if not data:
                break
            fout.write(data)
            count += len(data)
    return count
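
# Illustrative usage of wget (hypothetical URL and path, not from the original source):
#   nbytes = wget("https://pzwiki.wdka.nl/mw-mediadesign/images/thumb/Example.png", "wiki/thumb/Example.png")
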
def page_url (site, page):
    # print ("[page_url]", page.name, file=sys.stderr)
    base = os.path.split(site.site['base'])[0]
    path = page.normalize_title(page.name)
    if type(path) == unicode:
        path = path.encode("utf-8")
    path = urlquote(path)
    uret = os.path.join(base, path)
    # assert type(uret) == str
    return uret
def wiki_url_to_title (url):
    return urllib.unquote(url.split("/")[-1])

def parse_gallery(t):
    """ returns [(imagepageurl, caption, articleurl), ...] """
    galleryitems = t.findall(".//li[@class='gallerybox']")
    items = []
    for i in galleryitems:
        image_link = i.find(".//a[@class='image']")
        src = None
        captiontext = None
        article = None
        if image_link != None:
            src = image_link.attrib.get("href")
            # src = src.split("/")[-1]
        caption = i.find(".//*[@class='gallerytext']")
        if caption:
            captiontext = ET.tostring(caption, method="html")
            articlelink = caption.find(".//a")
            if articlelink != None:
                article = articlelink.attrib.get("href")
        # f = wiki.Pages[imgname]
        # items.append((f.imageinfo['url'], captiontext))
        items.append((src, captiontext, article))
    return items
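
# parse_gallery expects MediaWiki's rendered gallery markup, roughly of this shape (simplified sketch):
#   <ul class="gallery">
#     <li class="gallerybox"> ... <a class="image" href=".../File:Some_image.png">...</a>
#       <div class="gallerytext"><p><a href=".../Some_article">caption</a></p></div>
#     </li>
#   </ul>
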
def mwfilepage_to_url (wiki, url):
    filename = urllib.unquote(url.split("/")[-1])
    page = wiki.Pages[filename]
    return page, page.imageinfo['url']

def url_to_path (url):
    """ https://pzwiki.wdka.nl/mediadesign/File:I-could-have-written-that_these-are-the-words_mb_300dpi.png """
    path = urllib.unquote(urlparse.urlparse(url).path)
    return "/".join(path.split("/")[3:])

def wiki_absurl (wiki, url):
    ret = ''
    if type(wiki.host) == tuple:
        ret = wiki.host[0]+"://"+wiki.host[1]
    else:
        ret = "http://"+wiki.host
    return urlparse.urljoin(ret, url)

def wiki_title_to_url (wiki, title):
    """ relies on wiki.site['base'] being set to the public facing URL of the Main page """
    ret = ''
    parts = urlparse.urlparse(wiki.site['base'])
    base, main_page = os.path.split(parts.path)
    ret = parts.scheme+"://"+parts.netloc+base
    p = wiki.pages[title]
    ret += "/" + p.normalize_title(p.name)
    return ret
def ensure_wiki_image_tiles (wiki, imagepageurl, text='', basepath="tiles", force=False, bgcolor=None, tilewidth=256, tileheight=256, zoom=3, margin_right=0, margin_bottom=0):
    print ("ensure_wiki_image_tiles", imagepageurl, file=sys.stderr)
    page, imageurl = mwfilepage_to_url(wiki, imagepageurl)
    path = os.path.join(basepath, url_to_path(imageurl))
    print ("imageurl, path", imageurl, path, file=sys.stderr)
    ret = tiles_wrapper(path, imagepageurl, text=text)
    tp = ret.get_tile_path(0, 0, 0)
    if os.path.exists(tp) and not force:
        return ret
    try:
        os.makedirs(path)
    except OSError:
        pass
    im = Image.open(urlopen(imageurl))
    tile_image(im, zoom, tilewidth, tileheight, path+"/", ret.tilename, bgcolor, margin_right, margin_bottom)
    return ret
def textcell (paras):
    node = {}
    node['text'] = paras[:1]
    moretext = paras[1:]
    if moretext:
        node['children'] = [textcell([x]) for x in moretext]
    return node

def name_to_path (name):
    return name.replace("/", "_")
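
# Illustrative examples (not from the original source):
#   textcell(["<p>one</p>", "<p>two</p>"]) returns
#       {'text': ['<p>one</p>'], 'children': [{'text': ['<p>two</p>']}]}
#   name_to_path("Category:Foo/Bar") returns "Category:Foo_Bar"
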
def render_article (wiki, ref, basepath="tiles", depth=0, maxdepth=3, tilewidth=256, tileheight=256):
    print ("render_article", ref, file=sys.stderr)
    if type(ref) == Page:
        page = ref
        title = page.name
        ref = wiki_title_to_url(wiki, page.name)
    elif ref.startswith("http"):
        title = wiki_url_to_title(ref)
        page = wiki.pages[title]
    else:
        title = ref
        page = wiki.pages[title]
        ref = wiki_title_to_url(wiki, page.name)
    # pagetext = page.text()
    # print ("WIKI PARSE", title, file=sys.stderr)
    parse = wiki.parse(page=title)
    html = parse['text']['*']
    # print ("GOT HTML ", html, file=sys.stderr)
    tree = html5lib.parse(html, treebuilder="etree", namespaceHTMLElements=False)
    body = tree.find("./body")
    paras = []
    images = []
    imgsrcs = {}
    for c in body:
        if c.tag == "p":
            # filter out paras like <p><br></p> by checking the length of a text-only render
            ptext = ET.tostring(c, encoding="utf-8", method="text").strip()
            if len(ptext) > 0:
                ptext = ET.tostring(c, encoding="utf-8", method="html").strip()
                paras.append(ptext)
        elif c.tag == "ul" and c.attrib.get("class") != None and "gallery" in c.attrib.get("class"):
            # print ("GALLERY")
            gallery = parse_gallery(c)
            # Ensure image is downloaded ... at least the 00 image...
            for src, caption, article in gallery:
                src = wiki_absurl(wiki, src)
                if src in imgsrcs:
                    continue
                imgsrcs[src] = True
                print ("GalleryImage", src, caption, article, file=sys.stderr)
                # if article and depth < maxdepth:
                #     article = wiki_absurl(wiki, article)
                #     images.append(render_article(wiki, article, caption, basepath, depth+1, maxdepth))
                # else:
                images.append(ensure_wiki_image_tiles(wiki, src, caption, basepath, tilewidth=tilewidth, tileheight=tileheight).zoom())
    for a in body.findall('.//a[@class="image"]'):
        caption = a.attrib.get("title", '')
        src = wiki_absurl(wiki, a.attrib.get("href"))
        # OEI... skipping svg for the moment (can't go straight to PIL)
        if src.endswith(".svg"):
            continue
        print (u"Image_link {0}:'{1}'".format(src, caption).encode("utf-8"), file=sys.stderr)
        if src in imgsrcs:
            continue
        imgsrcs[src] = True
        images.append(ensure_wiki_image_tiles(wiki, src, caption, basepath, tilewidth=tilewidth, tileheight=tileheight).zoom())
    print ("{0} paras, {1} images".format(len(paras), len(images)), file=sys.stderr)
    if title == None:
        title = page.name
    basename = "tiles/" + name_to_path(page.name)
    # gallerynode = gridrender(images, basename)
    # return gallerynode
    cells = []
    if len(paras) > 0:
        cells.append(textcell(paras))
    cells.extend(images)
    ret = recursiverender(cells, basename, tilewidth=tilewidth, tileheight=tileheight)
    ret['text'] = u"""<p class="caption"><a class="url" href="{1}">{0}</a></p>""".format(title, ref)
    if images:
        ret['image'] = images[0]['image']
    return ret
    # alternative layout code, kept commented out:
    # article = {}
    # article['text'] = title
    # article['children'] = children = []
    # children.append(textcell(paras))
    # for iz in images[:2]:
    #     if 'image' not in article and 'image' in iz:
    #         article['image'] = iz['image']
    #     children.append(iz)
    # restimages = images[2:]
    # if len(restimages) == 1:
    #     children.append(restimages[0])
    # elif len(restimages) > 1:
    #     children.append(gridrender(restimages, basename))
    # return article
def render_category (wiki, cat, output="tiles", tilewidth=256, tileheight=256):
    print ("Render Category", cat, file=sys.stderr)
    # if type(cat) == Page:
    #     page = ref
    #     title = page.name
    #     ref = wiki_title_to_url(wiki, page.name)
    if cat.startswith("http"):
        title = wiki_url_to_title(cat)
        cat = wiki.pages[title]
    else:
        title = cat
        cat = wiki.pages[cat]
    # ref = wiki_title_to_url(wiki, cat.name)
    print ("cat", cat, file=sys.stderr)
    pages = []
    for m in cat.members():
        pages.append(m)
    pages.sort(key=lambda x: x.name)
    pagenodes = [render_article(wiki, x.name, tilewidth=tilewidth, tileheight=tileheight) for x in pages]
    for page, node in zip(pages, pagenodes):
        node['text'] = u"""<p class="caption"><a class="url" href="{1}">{0}</a></p>""".format(page.name, wiki_title_to_url(wiki, page.name))
    ret = gridrender(pagenodes, output+"/"+cat.name.replace(":", "_"), tilewidth=tilewidth, tileheight=tileheight)
    ret['text'] = u"""<p class="caption"><a class="url" href="{0}">{1}</a></p>""".format(wiki_title_to_url(wiki, cat.name), cat.name)
    return ret
    # for p in pages:
    #     print (p.name, wiki_title_to_url(wiki, p.name))
def make_category (args):
    wiki = Site((args.wikiprotocol, args.wikihost), path=args.wikipath)
    root_node = render_category(wiki, args.category)
    if args.html:
        print (html(root_node, ""))
    else:
        print (json.dumps(root_node, indent=2))

def make_article (args):
    wiki = Site((args.wikiprotocol, args.wikihost), path=args.wikipath)
    root_node = render_article(wiki, args.wikipage, tilewidth=args.tilewidth, tileheight=args.tileheight)
    if args.html:
        print (html(root_node, ""))
    else:
        print (json.dumps(root_node, indent=2))
def make_gallery(args):
    wiki = Site((args.wikiprotocol, args.wikihost), path=args.wikipath)
    # apiurl = args.wikiprotocol+"://"+args.wikihost+args.wikipath+"api.php"
    if len(args.wikipage) == 1:
        root_node = render_article(wiki, args.wikipage[0], tilewidth=args.tilewidth, tileheight=args.tileheight)
    else:
        children = []
        for wikipage in args.wikipage:
            print ("rendering", wikipage, file=sys.stderr)
            if "Category:" in wikipage:
                cnode = render_category(wiki, wikipage, args.output)
            else:
                cnode = render_article(wiki, wikipage, tilewidth=args.tilewidth, tileheight=args.tileheight)
            children.append(cnode)
        if args.recursive:
            root_node = recursiverender(children, args.output+"/"+args.name, direction=1, tilewidth=args.tilewidth, tileheight=args.tileheight)
        else:
            root_node = gridrender(children, args.output+"/"+args.name, direction=1, tilewidth=args.tilewidth, tileheight=args.tileheight)
    if args.html:
        print (html(root_node, ""))
    else:
        print (json.dumps(root_node, indent=2))

from time import sleep
def testwiki (args):
    site = Site((args.wikiprotocol, args.wikihost), path=args.wikipath)
    return site

USER_NS = 2

def imageinfo_with_thumbnail (site, name):
    d = site.api(
        "query",
        titles=name,
        prop="imageinfo",
        iiprop="url|mime",
        iiurlwidth=1024
    )
    pp = d['query']['pages']
    for key in pp:
        return pp[key]['imageinfo'][0]
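
# The imageinfo query above (iiprop=url|mime, iiurlwidth=1024) returns roughly this shape (illustrative):
#   {'query': {'pages': {'<pageid>': {'imageinfo': [
#       {'url': '...', 'mime': 'image/png', 'thumburl': '...', 'thumbwidth': 1024, ...}]}}}}
# so the function returns the first imageinfo dict for the requested title.
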
def recentfiles (args):
    # open connection to wiki
    wiki = Site((args.wikiprotocol, args.wikihost), path=args.wikipath)
    # Prepare user list to filter on (if args.usercategory)
    filter_by_users = None
    if args.usercategory:
        filter_by_users = set()
        usercategory = wiki.categories.get(args.usercategory)
        for p in usercategory.members():
            if p.namespace == USER_NS:
                filter_by_users.add(p.page_title)
    # Read args.json (if given) to find the most recently recorded timestamp
    last_date = None
    if args.json:
        try:
            with open (args.json) as f:
                print ("Reading {0}".format(args.json), file=sys.stderr)
                for line in f:
                    data = json.loads(line)
                    if 'date' in data:
                        last_date = data['date']
        except IOError as e:
            pass
    # Prepare the query arguments
    qargs = {
        'list': "allimages",
        'ailimit': 50,
        'aisort': 'timestamp',
        'aidir': 'descending',
        'aiprop': "timestamp|url|user|userid"
    }
    if args.oldest:
        qargs['aiend'] = args.oldest
    if last_date:
        print ("Using aiend {0}".format(last_date), file=sys.stderr)
        qargs['aiend'] = last_date
    count = 0  # used to satisfy --limit when given
    skipped_users = set()  # nicety for outputting names only once when skipped
    items_to_output = []
    # LOOP for continuing queries as needed
    while True:
        qq = wiki.api('query', **qargs)
        # print ("Got {0} results".format(len(qq['query']['allimages'])), file=sys.stderr)
        results = qq['query']['allimages']
        for r in results:
            # Filter on user
            if filter_by_users != None:
                if r['user'] not in filter_by_users:
                    if r['user'] not in skipped_users:
                        print ("Skipping user {0}".format(r['user']), file=sys.stderr)
                        skipped_users.add(r['user'])
                    continue
            try:
                # Filter on mime type (image/*)
                filepage = wiki.pages.get(r['title'])
                # mwclient's imageinfo doesn't have mime (or thumbnail info)
                # imageinfo = filepage.imageinfo
                imageinfo = imageinfo_with_thumbnail(wiki, r['title'])
                if not imageinfo['mime'].startswith("image/"):
                    print (u"Skipping non image ({0}) {1}".format(imageinfo['mime'], r['title']).encode("utf-8"), file=sys.stderr)
                    continue
                if 'thumburl' not in imageinfo:
                    print (u"Skipping item with no thumburl {0}".format(r['title']).encode("utf-8"), file=sys.stderr)
                    continue
                # Deal with the edge case where items at exactly aiend are returned
                if last_date and r['timestamp'] == last_date:
                    # print ("SKIPPING AIEND item", file=sys.stderr)
                    break
                # Construct an item for output
                print (u"[{0}], date:{1}".format(filepage.page_title, r['timestamp']).encode("utf-8"), file=sys.stderr)
                usagepage = None
                for usagepage in filepage.imageusage():
                    break  # just grab the first usage page
                # url : local path to file
                # imageurl = imageinfo['url']
                imageurl = imageinfo['thumburl']
                localpath = imageurl.replace("https://pzwiki.wdka.nl/mw-mediadesign/images/", "wiki/")
                # wget image from wiki to local folder
                if not os.path.exists(localpath):
                    try:
                        os.makedirs(os.path.split(localpath)[0])
                    except OSError:
                        pass
                    print (" downloading {0} to {1}".format(imageurl, localpath), file=sys.stderr)
                    wget(imageurl, localpath)
                item = {}
                item['url'] = localpath
                item['date'] = r['timestamp']
                item['src'] = page_url(wiki, filepage)
                userpage = wiki.pages.get('User:'+r['user'])
                if usagepage:
                    item['text'] = u'<a href="{0}">{1}</a><br>Uploaded by <a href="{2}">{3}</a>'.format(
                        page_url(wiki, usagepage),
                        usagepage.page_title,
                        page_url(wiki, userpage),
                        r['user']).encode("utf-8")
                else:
                    item['text'] = u'<a href="{0}">{1}</a><br>Uploaded by <a href="{2}">{3}</a>'.format(
                        page_url(wiki, filepage),
                        filepage.page_title,
                        page_url(wiki, userpage),
                        r['user']).encode("utf-8")
                # print (json.dumps(item))
                items_to_output.append(item)
                # honor --limit
                count += 1
                if args.limit and count == args.limit:
                    break
            except APIError as e:
                print ("Error {0}, skipping".format(e), file=sys.stderr)
        if args.limit and count == args.limit:
            break
        # continue the query if possible (then loop again)...
        if 'continue' in qq:
            qargs['aicontinue'] = qq['continue']['aicontinue']
        else:
            # we've reached the end of the query data
            break
    # OUTPUT RESULTS
    # reverse to be chronological
    items_to_output.reverse()
    if args.json:
        with open(args.json, "a") as f:
            for x in items_to_output:
                print (json.dumps(x), file=f)
    else:
        for x in items_to_output:
            print (json.dumps(x))
if __name__ == "__main__":
    ap = ArgumentParser("")
    ap.add_argument("--wikiprotocol", default="https")
    ap.add_argument("--wikihost", default="pzwiki.wdka.nl")
    ap.add_argument("--wikipath", default="/mw-mediadesign/")
    ap.add_argument("--wikishortpath", default="/mediadesign/")
    ap.add_argument("--tilewidth", type=int, default=256)
    ap.add_argument("--tileheight", type=int, default=256)
    # ap.add_argument("--zoom", type=int, default=3)
    ap.add_argument("--output", default="tiles")
    # ap.add_argument("--title", default="TITLE")
    subparsers = ap.add_subparsers(help='sub-command help')

    ap_article = subparsers.add_parser('article', help='Render an article')
    ap_article.add_argument("wikipage")
    ap_article.add_argument("--html", default=False, action="store_true")
    ap_article.set_defaults(func=make_article)

    ap_gallery = subparsers.add_parser('gallery', help='Render a gallery of articles')
    ap_gallery.add_argument("wikipage", nargs="+")
    ap_gallery.add_argument("--html", default=False, action="store_true")
    ap_gallery.add_argument("--recursive", default=False, action="store_true")
    ap_gallery.add_argument("--direction", type=int, default=3, help="cell to recursively expand into, 0-3, default: 3 (bottom-right)")
    ap_gallery.add_argument("--name", default=None)
    ap_gallery.set_defaults(func=make_gallery)

    ap_testwiki = subparsers.add_parser('testwiki', help='Open a connection to the wiki (connection test)')
    ap_testwiki.set_defaults(func=testwiki)

    ap_category = subparsers.add_parser('category', help='Render a category')
    ap_category.add_argument("category")
    ap_category.add_argument("--html", default=False, action="store_true")
    ap_category.set_defaults(func=make_category)

    ap_recentfiles = subparsers.add_parser('recentfiles', help='Incrementally update a json file with information about recent files')
    ap_recentfiles.add_argument("--usercategory", help="limit to activity by users that are members of this category")
    ap_recentfiles.add_argument("--limit", type=int, help="limit")
    ap_recentfiles.add_argument("--oldest", default=None, help="No results earlier than this timestamp (e.g. 2018-01-01T00:00:00Z)")
    ap_recentfiles.add_argument("--json", default=None, help="Use this json file as both input (to check the last timestamp) and output -- results are appended chronologically as a JSON stream.")
    ap_recentfiles.set_defaults(func=recentfiles)

    args = ap.parse_args()
    ret = args.func(args)
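
# Example invocations (illustrative; the script filename and page names are placeholders):
#   python wikitiles.py article "Some_Page" --html > article.html
#   python wikitiles.py gallery "Page_One" "Category:Some_Category" --name mygallery > gallery.json
#   python wikitiles.py recentfiles --usercategory "Some_User_Category" --limit 50 --json recentfiles.json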