mediawiki recentfiles

master
Michael Murtaugh 7 years ago
parent 82bd63835f
commit bb8c09f9f1

@ -17,7 +17,7 @@ archive.json:
https://pzwiki.wdka.nl/mediadesign/Category:2004 > archive.json
# Build drop.node.json from drop.json by piping it through the "gallery"
# subcommand of scripts/leaflet.py. Both recipe lines below redirect to the
# same target and differ only in the --reverse option.
# NOTE(review): this text is a diff view, so the second line presumably
# replaces the first rather than running after it -- confirm in the Makefile.
drop.node.json: drop.json
cat drop.json | python scripts/leaflet.py gallery --recursive --direction 2 > drop.node.json
cat drop.json | python scripts/leaflet.py gallery --reverse --recursive --direction 2 > drop.node.json
# Build about.json by feeding about.txt to scripts/texthierarchy.py on stdin
# and capturing its stdout.
about.json: about.txt
python scripts/texthierarchy.py < about.txt > about.json
@ -25,3 +25,5 @@ about.json: about.txt
# Build index.json by running scripts/includenodes.py on xpub.top.json.
# archive.json, about.json and drop.node.json are prerequisites, so they are
# brought up to date first.
index.json: archive.json about.json drop.node.json
python scripts/includenodes.py xpub.top.json > index.json
recent.json: web.touch
python scripts/mediawiki.py recentfiles --usercategory Xpub ...

@ -20,7 +20,7 @@ def tiles_path_for (n):
def autolink (text):
    """Wrap bare http(s) URLs in *text* in HTML anchor tags.

    Every matched URL is replaced by ``<a href="URL">LINK</a>``. A URL that
    is directly preceded by a double quote is left untouched, so URLs that
    already sit inside an attribute such as ``href="..."`` are not wrapped a
    second time.

    :param text: the string to scan
    :returns: *text* with every matching URL replaced
    """
    def sub (m):
        return u'<a href="{0}">LINK</a>'.format(m.group(0))

    # flags must be given by keyword: the fourth positional argument of
    # re.sub is ``count``, so a positional re.I meant "replace at most 2
    # matches" and never enabled case-insensitive matching.
    return re.sub(r'(?<!")https?://[\S]+(?!")', sub, text, flags=re.I)
def parse8601 (t, fmt=None):
""" simple 8601 parser that doesn't care about more than YMDHMS"""

@ -11,10 +11,33 @@ from xml.etree import ElementTree as ET
# from wiki_get_html import page_html
from mwclient import Site
from mwclient.page import Page
from mwclient.errors import APIError
from leaflet import tiles_wrapper, recursiverender, gridrender, html
from imagetile2 import tile_image
from urllib import quote as urlquote
def wget (url, path, blocksize=4*1000):
    """Download *url* into the local file *path*.

    :param url: address to fetch; a unicode value is encoded as UTF-8 first
    :param path: file to write (opened in binary mode, overwritten)
    :param blocksize: number of bytes requested per read
    :returns: total number of bytes written
    """
    if isinstance(url, unicode):
        url = url.encode("utf-8")
    # Open the remote side before creating the file: if the request fails,
    # no empty file is left behind at path (callers that check whether path
    # exists would otherwise consider the download done).
    fin = urlopen(url)
    try:
        count = 0
        with open(path, "wb") as fout:
            while True:
                data = fin.read(blocksize)
                if not data:
                    break
                fout.write(data)
                count += len(data)
    finally:
        # always release the connection, also when reading or writing fails
        fin.close()
    return count
def page_url (site, page):
    """Return the URL of *page* on *site*.

    The URL is the site's ``base`` address with its last path component
    dropped, joined with the URL-quoted, normalized title of the page.
    """
    wiki_root, _ = os.path.split(site.site['base'])
    quoted_title = urlquote(page.normalize_title(page.name))
    return os.path.join(wiki_root, quoted_title)
def wiki_url_to_title (url):
    """Return the percent-decoded text after the last ``/`` in *url*."""
    last_segment = url.rsplit("/", 1)[-1]
    return urllib.unquote(last_segment)
@ -269,9 +292,162 @@ def make_gallery(args):
else:
print (json.dumps(root_node, indent=2))
from time import sleep
def testwiki (args):
    """Open a connection to the wiki described by *args*.

    :param args: object providing ``wikiprotocol``, ``wikihost`` and
        ``wikipath`` attributes
    :returns: the ``Site`` object created from those settings
    """
    site = Site((args.wikiprotocol, args.wikihost), path=args.wikipath)
    return site
# Namespace number that recentfiles compares page.namespace against when it
# collects user pages from a category (2 is MediaWiki's "User" namespace).
USER_NS = 2
def imageinfo_with_thumbnail (site, name, width=1024):
    """Query the wiki API for the imageinfo record of the file page *name*.

    The request asks for the ``url`` and ``mime`` properties and passes
    *width* as ``iiurlwidth`` (the width of the scaled image the API is asked
    to describe).

    :param site: object with an ``api(action, **params)`` method
    :param name: title of the file page to look up
    :param width: value sent as ``iiurlwidth``; defaults to 1024
    :returns: the first ``imageinfo`` entry of the first page in the
        response, or None when the response lists no pages
    :raises KeyError: when the returned page carries no ``imageinfo``
    """
    d = site.api(
        "query",
        titles=name,
        prop="imageinfo",
        iiprop="url|mime",
        iiurlwidth=width
    )
    pages = d['query']['pages']
    # a single title was requested, so only the first page is of interest
    first_page = next(iter(pages.values()), None)
    if first_page is None:
        return None
    return first_page['imageinfo'][0]
def _category_usernames (wiki, category_name):
    """Return the page titles of all user-namespace members of a category."""
    usernames = set()
    for p in wiki.categories.get(category_name).members():
        if p.namespace == USER_NS:
            usernames.add(p.page_title)
    return usernames


def _last_recorded_date (path):
    """Return the 'date' of the last dated entry in the json-stream at path.

    Returns None when the file cannot be opened or holds no dated entry.
    """
    last_date = None
    try:
        with open(path) as f:
            print ("Reading {0}".format(path), file=sys.stderr)
            for line in f:
                if not line.strip():
                    # tolerate blank lines instead of failing in json.loads
                    continue
                data = json.loads(line)
                if 'date' in data:
                    last_date = data['date']
    except IOError:
        # deliberate best-effort: an unreadable file (e.g. one that does not
        # exist yet) simply means there is no previous timestamp
        pass
    return last_date


def recentfiles (args):
    """Collect recently uploaded wiki images and emit them as a json-stream.

    Queries the wiki's ``allimages`` list (newest first), optionally keeps
    only uploads by users that are members of ``args.usercategory``, skips
    files whose mime type is not ``image/*``, downloads each remaining image
    to a local path if it is not there yet, and builds one item per image
    with ``url``, ``date`` and ``text`` keys.

    Items are written oldest first, one JSON object per line: appended to
    ``args.json`` when given, otherwise printed to stdout. All diagnostics go
    to stderr so that stdout carries nothing but the JSON lines.

    :param args: parsed arguments providing ``wikiprotocol``, ``wikihost``,
        ``wikipath``, ``usercategory``, ``json``, ``oldest`` and ``limit``
    """
    # open connection to wiki
    wiki = Site((args.wikiprotocol, args.wikihost), path=args.wikipath)

    # Prepare user list to filter (if args.usercategory)
    filter_by_users = None
    if args.usercategory:
        filter_by_users = _category_usernames(wiki, args.usercategory)

    # Load args.json for the timestamp of the last item already recorded
    last_date = None
    if args.json:
        last_date = _last_recorded_date(args.json)

    # Prepare the query arguments
    qargs = {
        'list': "allimages",
        'ailimit': 50,
        'aisort': 'timestamp',
        'aidir': 'descending',
        'aiprop': "timestamp|url|user|userid"
    }
    if args.oldest:
        qargs['aiend'] = args.oldest
    if last_date:
        # a timestamp read from the json file takes precedence over --oldest
        print ("Using aiend {0}".format(last_date), file=sys.stderr)
        qargs['aiend'] = last_date

    count = 0  # used to satisfy --limit when given
    skipped_users = set()  # nicety for outputting names only once when skipped
    items_to_output = []

    # LOOP for continuing queries as needed
    while True:
        qq = wiki.api('query', **qargs)
        results = qq['query']['allimages']
        for r in results:
            # Filter on user
            if filter_by_users is not None and r['user'] not in filter_by_users:
                if r['user'] not in skipped_users:
                    print ("Skipping user {0}".format(r['user']), file=sys.stderr)
                    skipped_users.add(r['user'])
                continue
            try:
                # Filter on mime type (image/*)
                filepage = wiki.pages.get(r['title'])
                # mwclient's imageinfo doesn't have mime (or thumbnail info)
                imageinfo = imageinfo_with_thumbnail(wiki, r['title'])
                if not imageinfo['mime'].startswith("image/"):
                    print ("Skipping non image ({0}) {1}".format(imageinfo['mime'], r['title']), file=sys.stderr)
                    continue

                # The item whose timestamp equals aiend is returned again by
                # the API; it is already recorded, so stop at it.
                if last_date and r['timestamp'] == last_date:
                    print ("SKIPPING AIEND item", file=sys.stderr)
                    break

                # Construct an item for output
                print ("[{0}], date:{1}".format(filepage.page_title, r['timestamp']), file=sys.stderr)
                # just grab the first page using the file (None if unused)
                usagepage = next(iter(filepage.imageusage()), None)

                # url : local path to file
                imageurl = imageinfo['url']
                localpath = imageurl.replace("https://pzwiki.wdka.nl/mw-mediadesign/images/", "wiki/")
                # wget image from wiki to local folder
                if not os.path.exists(localpath):
                    try:
                        os.makedirs(os.path.split(localpath)[0])
                    except OSError:
                        # directory already exists (or cannot be created, in
                        # which case the download below reports the failure)
                        pass
                    print ("  downloading {0} to {1}".format(imageurl, localpath), file=sys.stderr)
                    wget(imageurl, localpath)

                userpage = wiki.pages.get('User:'+r['user'])
                # link to the page using the file, or to the file page itself
                linkpage = usagepage if usagepage else filepage
                item = {}
                item['url'] = localpath
                item['date'] = r['timestamp']
                item['text'] = '<a href="{0}">{1}</a><br>Uploaded by <a href="{2}">{3}</a>'.format(
                    page_url(wiki, linkpage),
                    linkpage.page_title,
                    page_url(wiki, userpage),
                    r['user'])
                items_to_output.append(item)

                # honor --limit
                count += 1
                if args.limit and count == args.limit:
                    break
            except APIError as e:
                print ("Error {0}, skipping".format(e), file=sys.stderr)

        if args.limit and count == args.limit:
            break
        # continue the query if possible (pre-loop)...
        if 'continue' in qq:
            qargs['aicontinue'] = qq['continue']['aicontinue']
        else:
            # we've reached the end of the query data
            break

    # OUTPUT RESULTS
    # reverse to be chronological
    items_to_output.reverse()
    if args.json:
        with open(args.json, "a") as f:
            for x in items_to_output:
                print (json.dumps(x), file=f)
    else:
        for x in items_to_output:
            print (json.dumps(x))
if __name__ == "__main__":
@ -312,6 +488,13 @@ if __name__ == "__main__":
ap_article.add_argument("--html", default=False, action="store_true")
ap_article.set_defaults(func=make_category)
# 'recentfiles' subcommand: its options are read by recentfiles(), which
# set_defaults registers as the function to run for this subcommand.
ap_recentfiles = subparsers.add_parser('recentfiles', help='Incrementally update a json file with information about recent files')
ap_recentfiles.add_argument("--usercategory", help="limit to activity by users that are members of this category")
ap_recentfiles.add_argument("--limit", type=int, help="limit")
ap_recentfiles.add_argument("--oldest", default=None, help="No results earlier than this timestamp (e.g. 2018-01-01T00:00:00Z)")
ap_recentfiles.add_argument("--json", default=None, help="Use this json file as both input (to check last timestamp) and output -- append results chronologically as json-stream.")
ap_recentfiles.set_defaults(func=recentfiles)
args = ap.parse_args()

Loading…
Cancel
Save