# xpub.nl/scripts/mediawiki.py

from __future__ import print_function

import os, sys, re, urllib, urlparse, html5lib, json
from PIL import Image
from math import log
from argparse import ArgumentParser
from urllib2 import urlopen

from xml.etree import ElementTree as ET

# from wiki_get_html import page_html
from mwclient import Site
from mwclient.page import Page
from mwclient.errors import APIError

from leaflet import tiles_wrapper, recursiverender, gridrender, html
from imagetile2 import tile_image
from urllib import quote as urlquote


def wget (url, path, blocksize=4*1000):
    """Download url to the local file path.

    The response is copied in chunks of at most blocksize bytes.
    unicode urls are encoded as UTF-8 before being handed to urlopen.

    Returns the number of bytes written. Errors raised by urlopen or by
    opening/writing path propagate to the caller.
    """
    if isinstance(url, unicode):
        url = url.encode("utf-8")
    count = 0
    # Open the URL before creating the file: if the request fails no empty
    # file is left behind at path (callers test os.path.exists(path) to decide
    # whether a download is still needed).
    fin = urlopen(url)
    try:
        with open(path, "wb") as fout:
            while True:
                data = fin.read(blocksize)
                if not data:
                    break
                fout.write(data)
                count += len(data)
    finally:
        # always release the connection, also when reading or writing fails
        fin.close()
    return count

def page_url (site, page):
    """Return the URL of page, built next to the site's base URL.

    site.site['base'] has its last "/"-separated component removed and the
    url-quoted, normalized page name is appended in its place.
    """
    # URLs always use "/" as separator; os.path would follow the conventions
    # of the platform the script happens to run on.
    import posixpath
    base = posixpath.split(site.site['base'])[0]
    return posixpath.join(base, urlquote(page.normalize_title(page.name)))

def wiki_url_to_title (url):
    """Return the percent-decoded text after the last "/" of url."""
    last_segment = url.rsplit("/", 1)[-1]
    return urllib.unquote(last_segment)

def parse_gallery(t):
    """ returns [(imagepageurl, caption, articleurl), ...]

    t is an ElementTree element; every descendant <li class="gallerybox">
    yields one tuple:

    imagepageurl -- href of the first <a class="image"> in the box, or None
    caption      -- the element with class "gallerytext" serialized as HTML,
                    or None when the box has no such element
    articleurl   -- href of the first <a> inside the caption, or None
    """
    items = []
    for box in t.findall(".//li[@class='gallerybox']"):
        src = None
        captiontext = None
        article = None

        image_link = box.find(".//a[@class='image']")
        if image_link is not None:
            src = image_link.attrib.get("href")

        caption = box.find(".//*[@class='gallerytext']")
        # Compare with None explicitly: an Element without child elements is
        # falsy, so a plain truth test would drop captions that hold only text.
        if caption is not None:
            captiontext = ET.tostring(caption, method="html")
            articlelink = caption.find(".//a")
            if articlelink is not None:
                article = articlelink.attrib.get("href")

        items.append((src, captiontext, article))
    return items

def mwfilepage_to_url (wiki, url):
    """Look up the wiki page named by the last segment of url.

    Returns (page, imageurl) where page is wiki.Pages[<decoded last segment>]
    and imageurl is that page's imageinfo['url'].
    """
    last_segment = url.rsplit("/", 1)[-1]
    filepage = wiki.Pages[urllib.unquote(last_segment)]
    return filepage, filepage.imageinfo['url']

def url_to_path (url):
    """ Return the percent-decoded path of url without its first two directories.

    The decoded path is split on "/" and the first three pieces (the empty
    piece before the leading slash plus two directory names) are dropped, e.g.
    https://pzwiki.wdka.nl/mw-mediadesign/images/a/ab/X.png -> a/ab/X.png
    """
    decoded = urllib.unquote(urlparse.urlparse(url).path)
    pieces = decoded.split("/")
    return "/".join(pieces[3:])

def wiki_absurl (wiki, url):
    """Resolve url against the root of the wiki's host and return the result.

    wiki.host is either a (scheme, hostname) tuple or a bare hostname string;
    a bare hostname is assumed to be served over http.
    """
    # isinstance also accepts tuple subclasses (e.g. a namedtuple host)
    if isinstance(wiki.host, tuple):
        root = wiki.host[0] + "://" + wiki.host[1]
    else:
        root = "http://" + wiki.host

    return urlparse.urljoin(root, url)

def wiki_title_to_url (wiki, title):
    """ relies on wiki.site['base'] being set to the public facing URL of the Main page

    Returns scheme://netloc/<directory of the base URL>/<normalized page name>,
    where the page is looked up as wiki.pages[title]. The page name is not
    url-quoted.
    """
    # URL paths always use "/"; os.path would follow the host platform instead
    import posixpath
    parts = urlparse.urlparse(wiki.site['base'])
    # strip the trailing slash so a base page at the site root ("/Main_Page")
    # does not produce "//" in the result
    basedir = posixpath.split(parts.path)[0].rstrip("/")
    p = wiki.pages[title]
    return parts.scheme + "://" + parts.netloc + basedir + "/" + p.normalize_title(p.name)

def ensure_wiki_image_tiles (wiki, imagepageurl, text='', basepath="tiles", force=False, bgcolor=None, tilewidth=256, tileheight=256, zoom=3, margin_right=0, margin_bottom=0):
    """Return a tiles_wrapper for a wiki image, generating the tiles if needed.

    imagepageurl is the URL of the image's file page; the actual image URL is
    looked up through the wiki. The tile directory is basepath joined with
    url_to_path(image URL). Tiling is skipped when the (0, 0, 0) tile already
    exists, unless force is set.
    """
    print ("ensure_wiki_image_tiles", imagepageurl, file=sys.stderr)
    _page, imageurl = mwfilepage_to_url(wiki, imagepageurl)
    tiledir = os.path.join(basepath, url_to_path(imageurl))
    print ("imageurl, path", imageurl, tiledir, file=sys.stderr)
    tiles = tiles_wrapper(tiledir, imagepageurl, text=text)
    first_tile_exists = os.path.exists(tiles.get_tile_path(0, 0, 0))

    if force or not first_tile_exists:
        try:
            os.makedirs(tiledir)
        except OSError:
            # makedirs failed (e.g. the directory is already there); carry on
            pass
        source_image = Image.open(urlopen(imageurl))
        tile_image(source_image, zoom, tilewidth, tileheight, tiledir+"/", tiles.tilename, bgcolor, margin_right, margin_bottom)

    return tiles

def textcell (paras):
    """Build a text node from a list of paragraphs.

    The node's 'text' is the first paragraph (as a slice of paras); every
    further paragraph becomes its own single-paragraph node under 'children'.
    The 'children' key is absent when there is at most one paragraph.
    """
    first, rest = paras[:1], paras[1:]
    cell = {'text': first}
    if rest:
        cell['children'] = [textcell([para]) for para in rest]
    return cell

def name_to_path (name):
    """Return name with every "/" replaced by "_"."""
    return "_".join(name.split("/"))


def render_article (wiki, ref, basepath="tiles", depth=0, maxdepth=3, tilewidth=256, tileheight=256):
    """Render one wiki article and return the node built by recursiverender.

    ref       -- an mwclient Page, a page URL (anything starting with "http")
                 or a page title
    basepath  -- directory under which image tiles and the rendered article
                 are written
    depth, maxdepth -- currently unused (only referenced by disabled code)
    tilewidth, tileheight -- tile size passed on to the tiling/render helpers

    The page is parsed by the wiki, its top-level non-empty <p> elements are
    collected as text and every gallery image / image link is tiled once.
    The returned node gets a caption linking to the article and, when the
    article has images, the 'image' of the first one.
    """
    print ("render_article", ref, file=sys.stderr)
    if isinstance(ref, Page):
        page = ref
        title = page.name
        ref = wiki_title_to_url(wiki, page.name)
    elif ref.startswith("http"):
        title = wiki_url_to_title(ref)
        page = wiki.pages[title]
    else:
        title = ref
        page = wiki.pages[title]
        ref = wiki_title_to_url(wiki, page.name)

    parse = wiki.parse(page=title)
    # named html_text so the imported html() helper is not shadowed
    html_text = parse['text']['*']
    tree = html5lib.parse(html_text, treebuilder="etree", namespaceHTMLElements=False)
    body = tree.find("./body")
    paras = []
    images = []
    imgsrcs = {}  # absolute image page URLs already handled

    for c in body:
        if c.tag == "p":
            # filter out paras like <p><br></p> by checking text-only render length
            ptext = ET.tostring(c, encoding="utf-8", method="text").strip()
            if len(ptext) > 0:
                ptext = ET.tostring(c, encoding="utf-8", method="html").strip()
                paras.append(ptext)

        elif c.tag == "ul" and "gallery" in (c.attrib.get("class") or ""):
            # Ensure each gallery image is downloaded and tiled
            for src, caption, article in parse_gallery(c):
                if src is None:
                    # gallery box without an image link: nothing to tile
                    continue
                src = wiki_absurl(wiki, src)
                if src in imgsrcs:
                    continue
                imgsrcs[src] = True
                print ("GalleryImage", src, caption, article, file=sys.stderr)
                images.append(ensure_wiki_image_tiles(wiki, src, caption, basepath, tilewidth=tilewidth, tileheight=tileheight).zoom())

    for a in body.findall('.//a[@class="image"]'):
        caption = a.attrib.get("title", '')
        href = a.attrib.get("href")
        if href is None:
            # an image link without a target cannot be resolved to a file page
            continue
        src = wiki_absurl(wiki, href)
        # OEI... skipping svg for the moment (can't go straight to PIL)
        if src.endswith(".svg"):
            continue
        print (u"Image_link {0}:'{1}'".format(src, caption).encode("utf-8"), file=sys.stderr)
        if src in imgsrcs:
            continue
        imgsrcs[src] = True
        images.append(ensure_wiki_image_tiles(wiki, src, caption, basepath, tilewidth=tilewidth, tileheight=tileheight).zoom())

    print ("{0} paras, {1} images".format(len(paras), len(images)), file=sys.stderr)

    if title is None:
        title = page.name

    # honour basepath here too (the image tiles above already go there);
    # with the default basepath this is the same "tiles/..." name as before
    basename = basepath + "/" + name_to_path(page.name)

    cells = []
    if len(paras) > 0:
        cells.append(textcell(paras))
    cells.extend(images)

    ret = recursiverender(cells, basename, tilewidth=tilewidth, tileheight=tileheight)
    ret['text'] = u"""<p class="caption"><a class="url" href="{1}">{0}</a></p>""".format(title, ref)
    if images:
        ret['image'] = images[0]['image']
    return ret

    # article = {}
    # article['text'] = title
    # article['children'] = children = []
    # children.append(textcell(paras))
    # for iz in images[:2]:
    #     if 'image' not in article and 'image' in iz:
    #         article['image'] = iz['image']
    #     children.append(iz)
    # restimages = images[2:]
    # if len(restimages) == 1:
    #     children.append(restimages[0])
    # elif len(restimages) > 1:
    #     children.append(gridrender(restimages, basename))
    # return article

def render_category (wiki, cat, output="tiles", tilewidth=256, tileheight=256):
    """Render every member page of a category and lay the results out with gridrender.

    cat    -- category page URL (anything starting with "http") or page title
    output -- directory prefix for the grid render; the category name with
              ":" replaced by "_" is appended to it
    tilewidth, tileheight -- tile size passed on to the render helpers

    Member pages are rendered in order of page name. Returns the grid node,
    captioned with a link to the category page.
    """
    print ("Render Category", cat, file=sys.stderr)
    # Resolve cat to a page object. (The non-URL branch used to read an
    # undefined name and raised NameError for plain titles.)
    if cat.startswith("http"):
        cat = wiki.pages[wiki_url_to_title(cat)]
    else:
        cat = wiki.pages[cat]
    print ("cat", cat, file=sys.stderr)

    pages = sorted(cat.members(), key=lambda x: x.name)
    pagenodes = [render_article(wiki, x.name, tilewidth=tilewidth, tileheight=tileheight) for x in pages]
    for page, node in zip(pages, pagenodes):
        node['text'] = u"""<p class="caption"><a class="url" href="{1}">{0}</a></p>""".format(page.name, wiki_title_to_url(wiki, page.name))
    ret = gridrender(pagenodes, output+"/"+cat.name.replace(":", "_"), tilewidth=tilewidth, tileheight=tileheight)
    ret['text'] = u"""<p class="caption"><a class="url" href="{0}">{1}</a></p>""".format(wiki_title_to_url(wiki, cat.name), cat.name)
    return ret
    # for p in pages:
    #     print (p.name, wiki_title_to_url(wiki, p.name))

def make_category (args):
    """'category' sub-command: render args.category and print the result.

    The node is printed to stdout as HTML when args.html is set, otherwise as
    indented JSON.
    """
    wiki = Site((args.wikiprotocol, args.wikihost), path=args.wikipath)
    # pass the global --output / --tilewidth / --tileheight options along, as
    # the other sub-commands do; their defaults equal render_category's own
    root_node = render_category(wiki, args.category, output=args.output, tilewidth=args.tilewidth, tileheight=args.tileheight)
    if args.html:
        print (html(root_node, ""))
    else:
        print (json.dumps(root_node, indent=2))


def make_article (args):
    """'article' sub-command: render args.wikipage and print the node.

    Output goes to stdout, as HTML when args.html is set and as indented JSON
    otherwise.
    """
    site = Site((args.wikiprotocol, args.wikihost), path=args.wikipath)
    node = render_article(
        site,
        args.wikipage,
        tilewidth=args.tilewidth,
        tileheight=args.tileheight)
    print (html(node, "") if args.html else json.dumps(node, indent=2))

def make_gallery(args):
    """'gallery' sub-command: render one or more pages and print the result.

    A single page is rendered as an article. Several pages are rendered one by
    one (names containing "Category:" as categories, the rest as articles) and
    combined under args.output/args.name with recursiverender (--recursive)
    or gridrender. The result is printed to stdout as HTML when args.html is
    set, otherwise as indented JSON.

    Raises ValueError when several pages are given without --name.
    """
    several = len(args.wikipage) > 1
    if several and args.name is None:
        # fail before any rendering; the combined output path needs the name
        raise ValueError("--name is required when rendering more than one page")

    wiki = Site((args.wikiprotocol, args.wikihost), path=args.wikipath)
    if not several:
        root_node = render_article(wiki, args.wikipage[0], tilewidth=args.tilewidth, tileheight=args.tileheight)
    else:
        children = []
        for wikipage in args.wikipage:
            print ("rendering", wikipage, file=sys.stderr)
            if "Category:" in wikipage:
                # same tile size as the articles rendered next to it
                cnode = render_category(wiki, wikipage, args.output, tilewidth=args.tilewidth, tileheight=args.tileheight)
            else:
                cnode = render_article(wiki, wikipage, tilewidth=args.tilewidth, tileheight=args.tileheight)
            children.append(cnode)
        combined_path = args.output+"/"+args.name
        # NOTE(review): --direction is parsed but direction=1 is hard-coded
        # here; confirm which one is intended before wiring the option in.
        if args.recursive:
            root_node = recursiverender(children, combined_path, direction=1, tilewidth=args.tilewidth, tileheight=args.tileheight)
        else:
            root_node = gridrender(children, combined_path, direction=1, tilewidth=args.tilewidth, tileheight=args.tileheight)

    if args.html:
        print (html(root_node, ""))
    else:
        print (json.dumps(root_node, indent=2))

from time import sleep
def testwiki (args):
    """'testwiki' sub-command: create the Site object for the configured wiki and return it."""
    return Site((args.wikiprotocol, args.wikihost), path=args.wikipath)

# Namespace id that recentfiles compares page.namespace against to pick user
# pages out of a category (2 is MediaWiki's built-in "User" namespace).
USER_NS = 2

def imageinfo_with_thumbnail (site, name):
    """Query the wiki API for the imageinfo of the file page called name.

    Asks for the url and mime properties with iiurlwidth=1024 and returns the
    first imageinfo entry of the first page in the response, or None when the
    response lists no pages.
    """
    params = {
        "titles": name,
        "prop": "imageinfo",
        "iiprop": "url|mime",
        "iiurlwidth": 1024,
    }
    response = site.api("query", **params)
    for pagedata in response['query']['pages'].values():
        # only the first page of the response is of interest
        return pagedata['imageinfo'][0]
    return None

def recentfiles (args):
    """'recentfiles' sub-command: list recently uploaded images as a JSON stream.

    Walks the wiki's allimages list from newest to oldest and, for each
    upload that passes the filters, downloads the file to a local path and
    builds an item {'url': local path, 'date': upload timestamp,
    'text': HTML linking to a page using the file (or the file page itself)
    and to the uploader}.

    Options read from args:
    usercategory -- only keep uploads by users whose user page is a member
                    of this category
    json         -- file to append the items to; the 'date' of its last line
                    that has one is used as the cut-off (aiend) for the query
    oldest       -- cut-off timestamp, used when no date comes from the
                    json file
    limit        -- stop after this many items

    Items are written oldest first, one JSON object per line, to args.json
    (append mode) or to stdout. All diagnostics go to stderr so they never
    mix with the JSON written to stdout.
    """
    # open connection to wiki
    wiki = Site((args.wikiprotocol, args.wikihost), path=args.wikipath)

    # Prepare user list to filter (if args.usercategory)
    filter_by_users = None
    if args.usercategory:
        filter_by_users = set()
        usercategory = wiki.categories.get(args.usercategory)
        for p in usercategory.members():
            if p.namespace == USER_NS:
                filter_by_users.add(p.page_title)

    # Load args.json: the date on its last dated line is where the previous run stopped
    last_date = None
    if args.json:
        try:
            with open (args.json) as f:
                print ("Reading {0}".format(args.json), file=sys.stderr)
                for line in f:
                    data = json.loads(line)
                    if 'date' in data:
                        last_date = data['date']
        except IOError:
            # file cannot be read (e.g. first run): start without a cut-off
            pass

    # Prepare the query arguments
    qargs = {
        'list': "allimages",
        'ailimit': 50,
        'aisort': 'timestamp',
        'aidir': 'descending',
        'aiprop': "timestamp|url|user|userid"
    }
    if args.oldest:
        qargs['aiend'] = args.oldest
    if last_date:
        # the date from the json file takes precedence over --oldest
        print ("Using aiend {0}".format(last_date), file=sys.stderr)
        qargs['aiend'] = last_date

    count = 0 # used to satisfy --limit when given
    skipped_users = set() # nicety for outputting names only once when skipped
    items_to_output = []

    # LOOP for continuing queries as needed
    while True:
        qq = wiki.api('query', **qargs)
        results = qq['query']['allimages']
        for r in results:
            # Filter on user
            if filter_by_users is not None:
                if r['user'] not in filter_by_users:
                    if r['user'] not in skipped_users:
                        print ("Skipping user {0}".format(r['user']), file=sys.stderr)
                        skipped_users.add(r['user'])
                    continue

            try:
                # Filter on mime type (image/*)
                filepage = wiki.pages.get(r['title'])
                # mwclient's imageinfo doesn't have mime (or thumbnail info)
                imageinfo = imageinfo_with_thumbnail(wiki, r['title'])
                if not imageinfo['mime'].startswith("image/"):
                    # diagnostic: stderr, stdout may be carrying the JSON stream
                    print ("Skipping non image ({0}) {1}".format(imageinfo['mime'], r['title']), file=sys.stderr)
                    continue

                # Deal with edge case: the item at timestamp == aiend is returned too
                if last_date and r['timestamp'] == last_date:
                    print ("SKIPPING AIEND item", file=sys.stderr)
                    break

                # Construct an item for output
                print ("[{0}], date:{1}".format(filepage.page_title, r['timestamp']), file=sys.stderr)
                usagepage = None
                for usagepage in filepage.imageusage():
                    break # just grab the first usage page
                # url : local path to file
                imageurl = imageinfo['url']
                localpath = imageurl.replace("https://pzwiki.wdka.nl/mw-mediadesign/images/", "wiki/")
                # wget image from wiki to local folder
                if not os.path.exists(localpath):
                    try:
                        os.makedirs(os.path.split(localpath)[0])
                    except OSError:
                        # makedirs failed (e.g. directory already there); carry on
                        pass
                    print ("  downloading {0} to {1}".format(imageurl, localpath), file=sys.stderr)
                    wget(imageurl, localpath)

                item = {}
                item['url'] = localpath
                item['date'] = r['timestamp']
                userpage = wiki.pages.get('User:'+r['user'])
                # link to a page that uses the file when there is one,
                # otherwise to the file page itself
                linkpage = usagepage if usagepage else filepage
                item['text'] = '<a href="{0}">{1}</a><br>Uploaded by <a href="{2}">{3}</a>'.format(
                        page_url(wiki, linkpage),
                        linkpage.page_title,
                        page_url(wiki, userpage),
                        r['user'])

                items_to_output.append(item)
                # honor --limit
                count += 1
                if args.limit and count == args.limit:
                    break

            except APIError as e:
                print ("Error {0}, skipping".format(e), file=sys.stderr)

        if args.limit and count == args.limit:
            break
        # continue the query if possible (pre-loop)...
        if 'continue' in qq:
            qargs['aicontinue'] = qq['continue']['aicontinue']
        else:
            # we've reached the end of the query data
            break

    # OUTPUT RESULTS
    # reverse to be chronological
    items_to_output.reverse()
    if args.json:
        with open(args.json, "a") as f:
            for x in items_to_output:
                print (json.dumps(x), file=f)
    else:
        for x in items_to_output:
            print (json.dumps(x))


# Command line entry point: global wiki/tile options plus one sub-command
# (article, gallery, testwiki, category, recentfiles); each sub-command stores
# its handler in args.func.
if __name__ == "__main__":

    ap = ArgumentParser("")
    ap.add_argument("--wikiprotocol", default="https")
    ap.add_argument("--wikihost", default="pzwiki.wdka.nl")
    ap.add_argument("--wikipath", default="/mw-mediadesign/")
    ap.add_argument("--wikishortpath", default="/mediadesign/")

    ap.add_argument("--tilewidth", type=int, default=256)
    ap.add_argument("--tileheight", type=int, default=256)

    ap.add_argument("--output", default="tiles")

    subparsers = ap.add_subparsers(help='sub-command help')
    ap_article = subparsers.add_parser('article', help='Render an article')
    ap_article.add_argument("wikipage")
    ap_article.add_argument("--html", default=False, action="store_true")
    ap_article.set_defaults(func=make_article)

    ap_gallery = subparsers.add_parser('gallery', help='Render a gallery of articles')
    ap_gallery.add_argument("wikipage", nargs="+")
    ap_gallery.add_argument("--html", default=False, action="store_true")
    ap_gallery.add_argument("--recursive", default=False, action="store_true")
    ap_gallery.add_argument("--direction", type=int, default=3, help="cell to recursively expand into, 0-3, default: 3 (bottom-right)")
    ap_gallery.add_argument("--name", default=None)
    ap_gallery.set_defaults(func=make_gallery)

    # help text used to be a copy of the gallery one
    ap_testwiki = subparsers.add_parser('testwiki', help='Open a connection to the wiki')
    ap_testwiki.set_defaults(func=testwiki)

    # help text used to be a copy of the article one
    ap_category = subparsers.add_parser('category', help='Render the pages of a category')
    ap_category.add_argument("category")
    ap_category.add_argument("--html", default=False, action="store_true")
    ap_category.set_defaults(func=make_category)

    ap_recentfiles = subparsers.add_parser('recentfiles', help='Incrementally update a json file with information about recent files')
    ap_recentfiles.add_argument("--usercategory", help="limit to activity by users that are members of this category")
    ap_recentfiles.add_argument("--limit", type=int, help="limit")
    ap_recentfiles.add_argument("--oldest", default=None, help="No results earlier than this timestamp (e.g. 2018-01-01T00:00:00Z)")
    ap_recentfiles.add_argument("--json", default=None, help="Use this json file as both input (to check last timestamp) and output -- append results chronologically as json-stream.")
    ap_recentfiles.set_defaults(func=recentfiles)


    args = ap.parse_args()
    ret = args.func(args)