@@ -11,10 +11,33 @@ from xml.etree import ElementTree as ET
# from wiki_get_html import page_html
from mwclient import Site
from mwclient.page import Page
from mwclient.errors import APIError
from leaflet import tiles_wrapper, recursiverender, gridrender, html
from imagetile2 import tile_image
from urllib import quote as urlquote
def wget(url, path, blocksize=4 * 1000):
    """Download url to path in blocksize chunks; return the number of bytes written."""
    if type(url) == unicode:
        url = url.encode("utf-8")
    count = 0
    with open(path, "wb") as fout:
        fin = urlopen(url)
        while True:
            data = fin.read(blocksize)
            if not data:
                break
            fout.write(data)
            count += len(data)
    return count
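# Usage sketch (URL and path are hypothetical, not taken from the wiki): the helper streams
# the file straight to disk, e.g. wget("https://pzwiki.wdka.nl/.../Example.png", "wiki/Example.png").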
def page_url(site, page):
    # print ("[page_url]", page.name, file=sys.stderr)
    base = os.path.split(site.site['base'])[0]
    uret = os.path.join(base, urlquote(page.normalize_title(page.name)))
    # assert type(uret) == str
    return uret
def wiki_url_to_title(url):
    return urllib.unquote(url.split("/")[-1])
@@ -269,9 +292,162 @@ def make_gallery(args):
    else:
        print(json.dumps(root_node, indent=2))
from time import sleep
def testwiki(args):
    site = Site((args.wikiprotocol, args.wikihost), path=args.wikipath)
    return site
USER_NS = 2

def imageinfo_with_thumbnail(site, name):
    d = site.api(
        "query",
        titles=name,
        prop="imageinfo",
        iiprop="url|mime",
        iiurlwidth=1024
    )
    pp = d['query']['pages']
    for key in pp:
        return pp[key]['imageinfo'][0]
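# Note on the response shape (an assumption about the MediaWiki API, not verified here):
# with iiprop="url|mime" and iiurlwidth set, each imageinfo entry should carry at least
# 'url' and 'mime', plus thumbnail fields such as 'thumburl'; the code below relies only
# on 'url' and 'mime'.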
def recentfiles(args):
    # open connection to wiki
    wiki = Site((args.wikiprotocol, args.wikihost), path=args.wikipath)
    # Prepare user list to filter (if args.usercategory)
    filter_by_users = None
    if args.usercategory:
        filter_by_users = set()
        usercategory = wiki.categories.get(args.usercategory)
        for p in usercategory.members():
            if p.namespace == USER_NS:
                filter_by_users.add(p.page_title)
    # Read args.json (if given) to find the newest timestamp already recorded
    last_date = None
    if args.json:
        try:
            with open(args.json) as f:
                print("Reading {0}".format(args.json), file=sys.stderr)
                for line in f:
                    data = json.loads(line)
                    if 'date' in data:
                        last_date = data['date']
        except IOError:
            pass
    # Prepare the query arguments
    qargs = {
        'list': "allimages",
        'ailimit': 50,
        'aisort': 'timestamp',
        'aidir': 'descending',
        'aiprop': "timestamp|url|user|userid"
    }
    if args.oldest:
        qargs['aiend'] = args.oldest
    if last_date:
        print("Using aiend {0}".format(last_date), file=sys.stderr)
        qargs['aiend'] = last_date
    count = 0  # used to satisfy --limit when given
    skipped_users = set()  # nicety for outputting names only once when skipped
    items_to_output = []
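    # Query sketch: allimages is walked newest-first (aisort=timestamp, aidir=descending)
    # and stops at aiend, so roughly only uploads at or after the last recorded date come
    # back; items exactly equal to aiend are filtered out in the loop below, and the API's
    # 'continue'/'aicontinue' token is used to fetch further batches when needed.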
    # LOOP for continuing queries as needed
    while True:
        qq = wiki.api('query', **qargs)
        # print ("Got {0} results".format(len(qq['query']['allimages'])), file=sys.stderr)
        results = qq['query']['allimages']
        for r in results:
            # Filter on user
            if filter_by_users is not None:
                if r['user'] not in filter_by_users:
                    if r['user'] not in skipped_users:
                        print("Skipping user {0}".format(r['user']), file=sys.stderr)
                        skipped_users.add(r['user'])
                    continue
            try:
                # Filter on mime type (image/*)
                filepage = wiki.pages.get(r['title'])
                # mwclient's imageinfo doesn't have mime (or thumbnail info)
                # imageinfo = filepage.imageinfo
                imageinfo = imageinfo_with_thumbnail(wiki, r['title'])
                if not imageinfo['mime'].startswith("image/"):
                    print("Skipping non-image ({0}) {1}".format(imageinfo['mime'], r['title']), file=sys.stderr)
                    continue
                # Deal with the edge case where items equal to aiend are returned
                if last_date and r['timestamp'] == last_date:
                    print("SKIPPING AIEND item", file=sys.stderr)
                    break
                # Construct an item for output
                print("[{0}], date: {1}".format(filepage.page_title, r['timestamp']), file=sys.stderr)
                usagepage = None
                for usagepage in filepage.imageusage():
                    break  # just grab the first usage page
                # url: local path to file
                imageurl = imageinfo['url']
                localpath = imageurl.replace("https://pzwiki.wdka.nl/mw-mediadesign/images/", "wiki/")
                # wget image from wiki to local folder
                if not os.path.exists(localpath):
                    try:
                        os.makedirs(os.path.split(localpath)[0])
                    except OSError:
                        pass
                    print("downloading {0} to {1}".format(imageurl, localpath), file=sys.stderr)
                    wget(imageurl, localpath)
                item = {}
                item['url'] = localpath
                item['date'] = r['timestamp']
                userpage = wiki.pages.get('User:' + r['user'])
                if usagepage:
                    item['text'] = '<a href="{0}">{1}</a><br>Uploaded by <a href="{2}">{3}</a>'.format(
                        page_url(wiki, usagepage),
                        usagepage.page_title,
                        page_url(wiki, userpage),
                        r['user'])
                else:
                    item['text'] = '<a href="{0}">{1}</a><br>Uploaded by <a href="{2}">{3}</a>'.format(
                        page_url(wiki, filepage),
                        filepage.page_title,
                        page_url(wiki, userpage),
                        r['user'])
                # print (json.dumps(item))
                items_to_output.append(item)
                # honor --limit
                count += 1
                if args.limit and count == args.limit:
                    break
            except APIError as e:
                print("Error {0}, skipping".format(e), file=sys.stderr)
        if args.limit and count == args.limit:
            break
        # continue the query if possible (pre-loop)...
        if 'continue' in qq:
            qargs['aicontinue'] = qq['continue']['aicontinue']
        else:
            # we've reached the end of the query data
            break
    # OUTPUT RESULTS
    # reverse to be chronological
    items_to_output.reverse()
    if args.json:
        with open(args.json, "a") as f:
            for x in items_to_output:
                print(json.dumps(x), file=f)
    else:
        for x in items_to_output:
            print(json.dumps(x))
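# Output format note: each emitted line is one JSON object; an illustrative (made-up) line
# would look like {"url": "wiki/a/ab/Example.png", "date": "2018-03-01T12:00:00Z", "text": "<a ...>...</a>"},
# so the --json file grows as a line-per-item JSON stream, oldest entries first.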
if __name__ == "__main__":
@@ -312,6 +488,13 @@ if __name__ == "__main__":
    ap_article.add_argument("--html", default=False, action="store_true")
    ap_article.set_defaults(func=make_category)
    ap_recentfiles = subparsers.add_parser('recentfiles', help='Incrementally update a json file with information about recent files')
    ap_recentfiles.add_argument("--usercategory", help="limit to activity by users that are members of this category")
    ap_recentfiles.add_argument("--limit", type=int, help="limit")
    ap_recentfiles.add_argument("--oldest", default=None, help="no results earlier than this timestamp (e.g. 2018-01-01T00:00:00Z)")
    ap_recentfiles.add_argument("--json", default=None, help="use this json file as both input (to check the last timestamp) and output -- append results chronologically as a json stream")
    ap_recentfiles.set_defaults(func=recentfiles)
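    # Example invocation (the script name and option values here are placeholders):
    #   python wiki_gallery.py recentfiles --usercategory Students --limit 20 --json recent.json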
    args = ap.parse_args()