mediawiki recentfiles

7 years ago · bb8c09f9f1
parent 82bd63835f
commit bb8c09f9f1
3 changed files with 190 additions and 5 deletions
--- a/4
+++ b/4
@ -17,7 +17,7 @@ archive.json:
 		https://pzwiki.wdka.nl/mediadesign/Category:2004 > archive.json

 drop.node.json: drop.json
-	cat drop.json | python scripts/leaflet.py gallery --recursive --direction 2 > drop.node.json
+	cat drop.json | python scripts/leaflet.py gallery --reverse --recursive --direction 2 > drop.node.json

 about.json: about.txt
 	python scripts/texthierarchy.py < about.txt > about.json
@ -25,3 +25,5 @@ about.json: about.txt
 index.json: archive.json about.json drop.node.json
 	python scripts/includenodes.py xpub.top.json > index.json

+recent.json: web.touch
+	python scripts/mediawiki.py recentfiles --usercategory Xpub ...
--- a/scripts/leaflet.py
+++ b/scripts/leaflet.py
@ -20,7 +20,7 @@ def tiles_path_for (n):
 def autolink (text):
+    """ Wrap every bare http(s) URL in text in an <a> tag with the label LINK.
+    URLs directly preceded by a double quote (e.g. inside an existing href="...") are left alone. """
     def sub (m):
         return u'<a href="{0}">LINK</a>'.format(m.group(0))
-    return re.sub(r"https?://[\S]+", sub, text, re.I)
+    # flags has to be given by keyword: the 4th positional argument of re.sub is count,
+    # so passing re.I there limited the substitution to the first 2 matches
+    return re.sub(r'(?<!")https?://[\S]+(?!")', sub, text, flags=re.I)

 def parse8601 (t, fmt=None):
    """ simple 8601 parser that doesn't care about more than YMDHMS"""
--- a/scripts/mediawiki.py
+++ b/scripts/mediawiki.py
@ -11,10 +11,33 @@ from xml.etree import ElementTree as ET
 # from wiki_get_html import page_html
 from mwclient import Site
 from mwclient.page import Page
+from mwclient.errors import APIError

 from leaflet import tiles_wrapper, recursiverender, gridrender, html
 from imagetile2 import tile_image
-
+from urllib import quote as urlquote
+
+
+def wget (url, path, blocksize=4*1000):
+    """ Download url to the local file path, reading blocksize bytes at a time.
+    A unicode url is encoded as utf-8 first. Returns the number of bytes written. """
+    if isinstance(url, unicode):
+        url = url.encode("utf-8")
+    count = 0
+    # open the connection before creating the file, so that a request that fails
+    # outright doesn't leave an empty file behind at path
+    fin = urlopen(url)
+    try:
+        with open(path, "wb") as fout:
+            while True:
+                data = fin.read(blocksize)
+                if not data:
+                    break
+                fout.write(data)
+                count += len(data)
+    finally:
+        # always release the connection, also when reading or writing fails
+        fin.close()
+    return count
+
+def page_url (site, page):
+    """ Return the URL of page on site: the parent of the site's 'base' URL
+    joined with the percent-quoted, normalized name of the page. """
+    # print ("[page_url]", page.name, file=sys.stderr)
+    base = os.path.split(site.site['base'])[0]
+    title = page.normalize_title(page.name)
+    # urllib.quote (python 2) raises a KeyError on unicode containing non-ascii
+    # characters, so quote the utf-8 encoded bytes instead
+    if isinstance(title, unicode):
+        title = title.encode("utf-8")
+    uret = os.path.join(base, urlquote(title))
+    # assert type(uret) == str
+    return uret

 def wiki_url_to_title (url):
    return urllib.unquote(url.split("/")[-1])
@ -269,9 +292,162 @@ def make_gallery(args):
    else:
        print (json.dumps(root_node, indent=2))

-
+from time import sleep
 def testwiki (args):
-    return Site((args.wikiprotocol, args.wikihost), path=args.wikipath)
+    """ Open and return a connection to the wiki given by args.wikiprotocol, args.wikihost and args.wikipath. """
+    site = Site((args.wikiprotocol, args.wikihost), path=args.wikipath)
+    return site
+
+# namespace number compared against page.namespace to recognize user pages
+# (2 is the "User" namespace in MediaWiki)
+USER_NS = 2
+
+def imageinfo_with_thumbnail (site, name):
+    """ Ask the wiki api for the imageinfo (url and mime, with a 1024 wide thumbnail requested)
+    of the page titled name. Returns the first imageinfo entry of the first page in the
+    response, or None when the response contains no pages. """
+    response = site.api(
+        "query",
+        titles=name,
+        prop="imageinfo",
+        iiprop="url|mime",
+        iiurlwidth=1024
+    )
+    pages = response['query']['pages']
+    # only the first page entry of the response is used
+    first_key = next(iter(pages), None)
+    if first_key is None:
+        return None
+    return pages[first_key]['imageinfo'][0]
+
+def recentfiles (args):
+    """ Collect recently uploaded image files from the wiki and output one json object per file.
+
+    Reads from args: wikiprotocol, wikihost, wikipath, usercategory, json, oldest, limit.
+    The api's allimages list is walked newest first; uploads by users outside of
+    args.usercategory (when given) and non image/* files are skipped. Each file that
+    doesn't exist yet at its local path (derived from the image url) is downloaded.
+    Every item has 'url' (local path), 'date' (upload timestamp) and 'text' (html linking
+    the first page using the file -- or else the file page itself -- and the uploader).
+    Items are appended to args.json when given, else printed to stdout, oldest first.
+    Progress and diagnostics go to stderr so that stdout stays a clean json stream.
+    """
+    # open connection to wiki
+    wiki = Site((args.wikiprotocol, args.wikihost), path=args.wikipath)
+
+    # Prepare user list to filter (if args.usercategory)
+    filter_by_users = None
+    if args.usercategory:
+        filter_by_users = set()
+        usercategory = wiki.categories.get(args.usercategory)
+        for p in usercategory.members():
+            if p.namespace == USER_NS:
+                filter_by_users.add(p.page_title)
+
+    # Load args.json to find the date of the last item it contains
+    # (items are appended chronologically, so this is the most recent one)
+    last_date = None
+    if args.json:
+        try:
+            with open (args.json) as f:
+                print ("Reading {0}".format(args.json), file=sys.stderr)
+                for line in f:
+                    if not line.strip():
+                        continue # tolerate blank lines in the json stream
+                    data = json.loads(line)
+                    if 'date' in data:
+                        last_date = data['date']
+        except IOError:
+            # the file can't be read (e.g. it doesn't exist yet): go on without a last date
+            pass
+
+
+    # Prepare the query arguments
+    qargs = {
+        'list': "allimages",
+        'ailimit': 50,
+        'aisort': 'timestamp',
+        'aidir': 'descending',
+        'aiprop': "timestamp|url|user|userid"
+    }
+    if args.oldest:
+        qargs['aiend'] = args.oldest
+    # the date found in the json file takes precedence over --oldest
+    if last_date:
+        print ("Using aiend {0}".format(last_date), file=sys.stderr)
+        qargs['aiend'] = last_date
+
+    count = 0 # used to satisfy --limit when given
+    skipped_users = set() # nicety for outputting names only once when skipped
+    items_to_output = []
+
+    # LOOP for continuing queries as needed
+    while True:
+        qq = wiki.api('query', **qargs)
+        # print ("Got {0} results".format(len(qq['query']['allimages'])), file=sys.stderr)
+        results = qq['query']['allimages']
+        for r in results:
+            # Filter on user
+            if filter_by_users is not None:
+                if r['user'] not in filter_by_users:
+                    if r['user'] not in skipped_users:
+                        print ("Skipping user {0}".format(r['user']), file=sys.stderr)
+                        skipped_users.add(r['user'])
+                    continue
+
+            try:
+                # Filter on mime type (image/*)
+                filepage = wiki.pages.get(r['title'])
+                # mwclient's imageinfo doesn't have mime (or thumbnail info)
+                # imageinfo = filepage.imageinfo
+                imageinfo = imageinfo_with_thumbnail(wiki, r['title'])
+                if not imageinfo['mime'].startswith("image/"):
+                    # diagnostics go to stderr: stdout carries the json stream when --json isn't used
+                    print ("Skipping non image ({0}) {1}".format(imageinfo['mime'], r['title']), file=sys.stderr)
+                    continue
+
+                # Deal with the edge case that items with timestamp == aiend are returned too
+                # (NB: this only leaves the loop over the current batch of results)
+                if last_date and r['timestamp'] == last_date:
+                    print ("SKIPPING AIEND item", file=sys.stderr)
+                    break
+
+                # Construct an item for output
+                print ("[{0}], date:{1}".format(filepage.page_title, r['timestamp']), file=sys.stderr)
+                usagepage = None
+                for usagepage in filepage.imageusage():
+                    break # just grab the first usage page
+                # url : local path to file
+                imageurl = imageinfo['url']
+                localpath = imageurl.replace("https://pzwiki.wdka.nl/mw-mediadesign/images/", "wiki/")
+                # wget image from wiki to local folder
+                if not os.path.exists(localpath):
+                    try:
+                        os.makedirs(os.path.split(localpath)[0])
+                    except OSError:
+                        # raised when the folder can't be created, including when it already exists
+                        pass
+                    print ("  downloading {0} to {1}".format(imageurl, localpath), file=sys.stderr)
+                    wget(imageurl, localpath)
+
+                item = {}
+                item['url'] = localpath
+                item['date'] = r['timestamp']
+                userpage = wiki.pages.get('User:'+r['user'])
+                # link to the first page using the file, falling back on the file page itself
+                linkpage = usagepage if usagepage else filepage
+                item['text'] = '<a href="{0}">{1}</a><br>Uploaded by <a href="{2}">{3}</a>'.format(
+                        page_url(wiki, linkpage),
+                        linkpage.page_title,
+                        page_url(wiki, userpage),
+                        r['user'])
+
+                # print (json.dumps(item))
+                items_to_output.append(item)
+                # honor --limit
+                count += 1
+                if args.limit and count == args.limit:
+                    break
+
+            except APIError as e:
+                print ("Error {0}, skipping".format(e), file=sys.stderr)
+
+        if args.limit and count == args.limit:
+            break
+        # continue the query if possible (pre-loop)...
+        if 'continue' in qq:
+            # merge *all* the continuation values into the next request,
+            # as the api asks for (not just aicontinue)
+            qargs.update(qq['continue'])
+        else:
+            # we've reached the end of the query data
+            break
+
+    # OUTPUT RESULTS
+    # reverse to be chronological
+    items_to_output.reverse()
+    if args.json:
+        with open(args.json, "a") as f:
+            for x in items_to_output:
+                print (json.dumps(x), file=f)
+    else:
+        for x in items_to_output:
+            print (json.dumps(x))


 if __name__ == "__main__":
@ -312,6 +488,13 @@ if __name__ == "__main__":
    ap_article.add_argument("--html", default=False, action="store_true")
    ap_article.set_defaults(func=make_category)

+    # recentfiles: incrementally collect information about recently uploaded files (see recentfiles)
+    ap_recentfiles = subparsers.add_parser('recentfiles', help='Incrementally update a json file with information about recent files')
+    ap_recentfiles.add_argument("--usercategory", help="limit to activity by users that are members of this category")
+    ap_recentfiles.add_argument("--limit", type=int, help="stop after this many files have been collected")
+    ap_recentfiles.add_argument("--oldest", default=None, help="No results earlier than this timestamp (e.g. 2018-01-01T00:00:00Z)")
+    ap_recentfiles.add_argument("--json", default=None, help="Use this json file as both input (to check last timestamp) and output -- append results chronologically as json-stream.")
+    ap_recentfiles.set_defaults(func=recentfiles)
+


    args = ap.parse_args()