
mediawiki recentfiles

master
Michael Murtaugh, 3 years ago
parent commit bb8c09f9f1
3 changed files with 190 additions and 5 deletions:
  1. makefile (+3, -1)
  2. scripts/leaflet.py (+1, -1)
  3. scripts/mediawiki.py (+186, -3)

makefile (+3, -1)

@@ -17,7 +17,7 @@ archive.json:
 		https://pzwiki.wdka.nl/mediadesign/Category:2004 > archive.json
 
 drop.node.json: drop.json
-	cat drop.json | python scripts/leaflet.py gallery --recursive --direction 2 > drop.node.json
+	cat drop.json | python scripts/leaflet.py gallery --reverse --recursive --direction 2 > drop.node.json
 
 about.json: about.txt
 	python scripts/texthierarchy.py < about.txt > about.json
@@ -25,3 +25,5 @@ about.json: about.txt
 index.json: archive.json about.json drop.node.json
 	python scripts/includenodes.py xpub.top.json > index.json
 
+recent.json: web.touch
+	python scripts/mediawiki.py recentfiles --usercategory Xpub ...

scripts/leaflet.py (+1, -1)

@@ -20,7 +20,7 @@ def tiles_path_for (n):
 def autolink (text):
     def sub (m):
         return u'<a href="{0}">LINK</a>'.format(m.group(0))
-    return re.sub(r"https?://[\S]+", sub, text, re.I)
+    return re.sub(r'(?<!")https?://[\S]+(?!")', sub, text, re.I)
 
 def parse8601 (t, fmt=None):
     """ simple 8601 parser that doesn't care about more than YMDHMS"""

scripts/mediawiki.py (+186, -3)

@@ -11,10 +11,33 @@ from xml.etree import ElementTree as ET
 # from wiki_get_html import page_html
 from mwclient import Site
 from mwclient.page import Page
+from mwclient.errors import APIError
 
 from leaflet import tiles_wrapper, recursiverender, gridrender, html
 from imagetile2 import tile_image
-
+from urllib import quote as urlquote
+
+
+def wget (url, path, blocksize=4*1000):
+    if type(url) == unicode:
+        url = url.encode("utf-8")
+    count = 0
+    with open(path, "wb") as fout:
+        fin = urlopen(url)
+        while True:
+            data = fin.read(blocksize)
+            if not data:
+                break
+            fout.write(data)
+            count += len(data)
+    return count
+
+def page_url (site, page):
+    # print ("[page_url]", page.name, file=sys.stderr)
+    base = os.path.split(site.site['base'])[0]
+    uret = os.path.join(base, urlquote(page.normalize_title(page.name)))
+    # assert type(uret) == str
+    return uret
 
 def wiki_url_to_title (url):
     return urllib.unquote(url.split("/")[-1])
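
Two standalone helpers arrive in this hunk, both Python 2 code (unicode, from urllib import quote); urlopen, os and sys are presumably imported earlier in the file, outside this hunk. wget streams a URL to disk in 4 KB blocks and returns the byte count; page_url joins the wiki's 'base' URL (minus its final path segment) with the URL-quoted, normalized page title. A minimal usage sketch, with a hypothetical upload URL and filename:

    # hypothetical values, for illustration only
    nbytes = wget(u"https://pzwiki.wdka.nl/mw-mediadesign/images/a/a0/Example.jpg",
                  "Example.jpg")
    print("wrote {0} bytes".format(nbytes))
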
@@ -269,9 +292,162 @@ def make_gallery(args):
     else:
         print (json.dumps(root_node, indent=2))
 
-
+from time import sleep
 def testwiki (args):
-    return Site((args.wikiprotocol, args.wikihost), path=args.wikipath)
+    site = Site((args.wikiprotocol, args.wikihost), path=args.wikipath)
+    return site
+
+USER_NS = 2
+
+def imageinfo_with_thumbnail (site, name):
+    d = site.api(
+        "query",
+        titles=name,
+        prop="imageinfo",
+        iiprop="url|mime",
+        iiurlwidth=1024
+    )
+    pp = d['query']['pages']
+    for key in pp:
+        return pp[key]['imageinfo'][0]
+
+def recentfiles (args):
+    # open connection to wiki
+    wiki = Site((args.wikiprotocol, args.wikihost), path=args.wikipath)
+
+    # Prepare user list to filter (if args.usercategory)
+    filter_by_users = None
+    if args.usercategory:
+        filter_by_users = set()
+        usercategory = wiki.categories.get(args.usercategory)
+        for p in usercategory.members():
+            if p.namespace == USER_NS:
+                filter_by_users.add(p.page_title)
+
+    # Load args.json for oldest timestamp
+    last_date = None
+    if args.json:
+        try:
+            with open (args.json) as f:
+                print ("Reading {0}".format(args.json), file=sys.stderr)
+                for line in f:
+                    data = json.loads(line)
+                    if 'date' in data:
+                        last_date = data['date']
+        except IOError as e:
+            pass
+
+
+    # Prepare the query arguments
+    qargs = {
+        'list': "allimages",
+        'ailimit': 50,
+        'aisort': 'timestamp',
+        'aidir': 'descending',
+        'aiprop': "timestamp|url|user|userid"
+    }
+    if args.oldest:
+        qargs['aiend'] = args.oldest
+    if last_date:
+        print ("Using aiend {0}".format(last_date), file=sys.stderr)
+        qargs['aiend'] = last_date
+
+    count = 0 # used to satisfy --limit when given
+    skipped_users = set() # nicety for outputting names only once when skipped
+    items_to_output = []
+
+    # LOOP for continuing queries as needed
+    while True:
+        qq = wiki.api('query', **qargs)
+        # print ("Got {0} results".format(len(qq['query']['allimages'])), file=sys.stderr)
+        results = qq['query']['allimages']
+        for r in results:
+            # Filter on user
+            if filter_by_users != None:
+                if r['user'] not in filter_by_users:
+                    if r['user'] not in skipped_users:
+                        print ("Skipping user {0}".format(r['user']), file=sys.stderr)
+                        skipped_users.add(r['user'])
+                    continue
+
+            try:
+                # Filter on mime type (image/*)
+                filepage = wiki.pages.get(r['title'])
+                # mwclient's imageinfo doesn't have mime (or thumbnail info)
+                # imageinfo = filepage.imageinfo
+                imageinfo = imageinfo_with_thumbnail(wiki, r['title'])
+                if not imageinfo['mime'].startswith("image/"):
+                    print ("Skipping non image ({0}) {1}".format(imageinfo['mime'], r['title']))
+                    continue
+
+                # Deal with edge case at items == aiend are returned
+                if last_date and r['timestamp'] == last_date:
+                    print ("SKIPPING AIEND item", file=sys.stderr)
+                    break
+
+                # Construct an item for output
+                print ("[{0}], date:{1}".format(filepage.page_title, r['timestamp']), file=sys.stderr)
+                usagepage = None
+                for usagepage in filepage.imageusage():
+                    break # just grab the first usage page
+                # url : local path to file
+                imageurl = imageinfo['url']
+                localpath = imageurl.replace("https://pzwiki.wdka.nl/mw-mediadesign/images/", "wiki/")
+                # wget image from wiki to local folder
+                if not os.path.exists(localpath):
+                    try:
+                        os.makedirs(os.path.split(localpath)[0])
+                    except OSError:
+                        pass
+                    print ("  downloading {0} to {1}".format(imageurl, localpath), file=sys.stderr)
+                    wget(imageurl, localpath)
+
+                item = {}
+                item['url'] = localpath
+                item['date'] = r['timestamp']
+                userpage = wiki.pages.get('User:'+r['user'])
+                if usagepage:
+                    item['text'] = '<a href="{0}">{1}</a><br>Uploaded by <a href="{2}">{3}</a>'.format(
+                            page_url(wiki, usagepage),
+                            usagepage.page_title,
+                            page_url(wiki, userpage),
+                            r['user'])
+                else:
+                    item['text'] = '<a href="{0}">{1}</a><br>Uploaded by <a href="{2}">{3}</a>'.format(
+                            page_url(wiki, filepage),
+                            filepage.page_title,
+                            page_url(wiki, userpage),
+                            r['user'])
+
+                # print (json.dumps(item))
+                items_to_output.append(item)
+                # honor --limit
+                count += 1
+                if args.limit and count == args.limit:
+                    break
+
+            except APIError as e:
+                print ("Error {0}, skipping".format(e))
+
+        if args.limit and count == args.limit:
+            break
+        # continue the query if possible (pre-loop)...
+        if 'continue' in qq:
+            qargs['aicontinue'] = qq['continue']['aicontinue']
+        else:
+            # we've reached the end of the query data
+            break
+
+    # OUTPUT RESULTS
+    # reverse to be chronological
+    items_to_output.reverse()
+    if args.json:
+        with open(args.json, "a") as f:
+            for x in items_to_output:
+                print (json.dumps(x), file=f)
+    else:
+        for x in items_to_output:
+            print (json.dumps(x))
 
 
 if __name__ == "__main__":
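
The core of recentfiles is the MediaWiki API continuation protocol: a list=allimages response may carry a 'continue' object, and feeding its 'aicontinue' token into the next request resumes the listing where the previous batch ended. A stripped-down sketch of just that loop (connection values are assumptions for illustration, not taken from this commit):

    from __future__ import print_function
    from mwclient import Site

    # host and path are assumed here
    site = Site(('https', 'pzwiki.wdka.nl'), path='/mw-mediadesign/')
    qargs = {'list': 'allimages', 'ailimit': 50, 'aisort': 'timestamp',
             'aidir': 'descending', 'aiprop': 'timestamp|url|user|userid'}
    while True:
        qq = site.api('query', **qargs)
        for r in qq['query']['allimages']:
            print(r['timestamp'], r['title'])
        if 'continue' in qq:
            # token handed back by the API; resumes at the next batch
            qargs['aicontinue'] = qq['continue']['aicontinue']
        else:
            break
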
@@ -312,6 +488,13 @@ if __name__ == "__main__":
     ap_article.add_argument("--html", default=False, action="store_true")
     ap_article.set_defaults(func=make_category)
 
+    ap_recentfiles = subparsers.add_parser('recentfiles', help='Incrementally update a json file with information about recent files')
+    ap_recentfiles.add_argument("--usercategory", help="limit to activity by users that are members of this category")
+    ap_recentfiles.add_argument("--limit", type=int, help="limit")
+    ap_recentfiles.add_argument("--oldest", default=None, help="No results earlier than this timestamp (e.g. 2018-01-01T00:00:00Z)")
+    ap_recentfiles.add_argument("--json", default=None, help="Use this json file as both input (to check last timestampt) and output -- append results chronologically as json-stream.")
+    ap_recentfiles.set_defaults(func=recentfiles)
+
 
 
     args = ap.parse_args()
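
With the subcommand registered, the makefile's recent.json target can drive it; given the flags above, the invocation would look something like "python scripts/mediawiki.py recentfiles --usercategory Xpub --json recent.json" (the recipe is truncated in this diff, so the exact arguments are an assumption). Since --json appends one JSON object per line, the file reads back as a json-stream; a minimal consumer sketch, assuming the recent.json filename from the makefile:

    import json

    # each line of recent.json is one item appended by recentfiles()
    with open("recent.json") as f:
        items = [json.loads(line) for line in f]
    for item in items:
        # 'date', 'url' and 'text' are the fields recentfiles() writes
        print("{0} {1}".format(item['date'], item['url']))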
