# You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
#
# 503 lines
# 19 KiB
# Python
#
# 7 years ago
from __future__ import print_function
import os, sys, re, urllib, urlparse, html5lib, json
from PIL import Image
from math import log
from argparse import ArgumentParser
from urllib2 import urlopen
from xml.etree import ElementTree as ET
# from wiki_get_html import page_html
from mwclient import Site
from mwclient.page import Page
from mwclient.errors import APIError
# 7 years ago
from leaflet import tiles_wrapper, recursiverender, gridrender, html
from imagetile2 import tile_image
from urllib import quote as urlquote
def wget (url, path, blocksize=4*1000):
    """Download url to the local file path and return the byte count.

    url -- address to fetch; a unicode value is encoded as UTF-8 first
    path -- local file to (over)write, opened in binary mode
    blocksize -- number of bytes requested per read

    Returns the total number of bytes written.
    """
    if isinstance(url, unicode):
        url = url.encode("utf-8")
    count = 0
    # Open the connection before creating the file, so a request that fails
    # outright does not leave an empty file behind at path.
    fin = urlopen(url)
    try:
        with open(path, "wb") as fout:
            while True:
                data = fin.read(blocksize)
                if not data:
                    break
                fout.write(data)
                count += len(data)
    finally:
        # Release the connection even when a read or write fails.
        fin.close()
    return count
def page_url (site, page):
    """Return the public URL of page on site.

    The result is site.site['base'] with its last path component replaced
    by the page's normalized, percent-quoted name.
    """
    # posixpath always uses "/" as separator; os.path.join would build the
    # URL with backslashes on Windows.
    import posixpath
    base = posixpath.split(site.site['base'])[0]
    return posixpath.join(base, urlquote(page.normalize_title(page.name)))
# 7 years ago
def wiki_url_to_title (url):
    """Return the percent-decoded text that follows the last "/" of url."""
    last_segment = url.rsplit("/", 1)[-1]
    return urllib.unquote(last_segment)
def parse_gallery(t):
    """Collect the items of a gallery element.

    t -- element that is searched for <li class="gallerybox"> descendants

    Returns [(imagepageurl, caption, articleurl), ...], one tuple per box:
      imagepageurl -- href of the box's <a class="image"> link, or None
      caption -- the class="gallerytext" element serialized as HTML, or None
      articleurl -- href of the first link inside the caption, or None
    """
    items = []
    for box in t.findall(".//li[@class='gallerybox']"):
        src = None
        captiontext = None
        article = None

        image_link = box.find(".//a[@class='image']")
        if image_link is not None:
            src = image_link.attrib.get("href")

        caption = box.find(".//*[@class='gallerytext']")
        # Compare with None: an Element without child elements is falsy, so a
        # plain truth test would drop captions that contain only text.
        if caption is not None:
            captiontext = ET.tostring(caption, method="html")
            articlelink = caption.find(".//a")
            if articlelink is not None:
                article = articlelink.attrib.get("href")

        # NOTE(review): boxes without an image link are returned too (with
        # imagepageurl None) -- confirm that callers expect this.
        items.append((src, captiontext, article))
    return items
def mwfilepage_to_url (wiki, url):
    """Look up the file page named by the last path segment of url.

    Returns (page, imageurl): the wiki.Pages entry for the percent-decoded
    segment and the 'url' field of that page's imageinfo.
    """
    encoded_name = url.split("/")[-1]
    page = wiki.Pages[urllib.unquote(encoded_name)]
    imageurl = page.imageinfo['url']
    return page, imageurl
def url_to_path (url):
    """Return the percent-decoded path of url without its first two segments.

    e.g. https://pzwiki.wdka.nl/mw-mediadesign/images/a/ab/Name.png
    gives a/ab/Name.png; a path with fewer than three segments gives "".
    """
    decoded = urllib.unquote(urlparse.urlparse(url).path)
    # maxsplit=3 leaves everything after the third "/" in one piece
    pieces = decoded.split("/", 3)
    return pieces[3] if len(pieces) > 3 else ""
def wiki_absurl (wiki, url):
    """Resolve url against the root of the wiki's host.

    wiki.host is either a (scheme, hostname) tuple or a bare hostname; for a
    bare hostname the scheme is http.
    """
    host = wiki.host
    if type(host) is not tuple:
        return urlparse.urljoin("http://" + host, url)
    scheme, hostname = host[0], host[1]
    return urlparse.urljoin(scheme + "://" + hostname, url)
def wiki_title_to_url (wiki, title):
    """ relies on wiki.site['base'] being set to the public facing URL of the Main page

    Returns that URL with its last path segment replaced by the normalized
    name of wiki.pages[title].
    """
    parts = urlparse.urlparse(wiki.site['base'])
    # directory part of the Main page path, i.e. the path minus its last segment
    base_dir = os.path.split(parts.path)[0]
    page = wiki.pages[title]
    pieces = (parts.scheme, "://", parts.netloc, base_dir, "/", page.normalize_title(page.name))
    return "".join(pieces)
def ensure_wiki_image_tiles (wiki, imagepageurl, text='', basepath="tiles", force=False, bgcolor=None, tilewidth=256, tileheight=256, zoom=3, margin_right=0, margin_bottom=0):
    """Make sure the tiles of a wiki image exist on disk.

    wiki -- site object used to resolve the file page
    imagepageurl -- URL of the image's file page
    text -- passed on to tiles_wrapper
    basepath -- folder under which the image's tile folder is created
    force -- regenerate the tiles even when tile (0, 0, 0) already exists
    bgcolor, tilewidth, tileheight, zoom, margin_right, margin_bottom --
        passed on to tile_image

    Returns the tiles_wrapper created for the image.
    """
    print ("ensure_wiki_image_tiles", imagepageurl, file=sys.stderr)
    page, imageurl = mwfilepage_to_url(wiki, imagepageurl)
    path = os.path.join(basepath, url_to_path(imageurl))
    print ("imageurl, path", imageurl, path, file=sys.stderr)
    ret = tiles_wrapper(path, imagepageurl, text=text)

    # The presence of tile (0, 0, 0) is taken as "tiles already generated".
    tp = ret.get_tile_path(0, 0, 0)
    if os.path.exists(tp) and not force:
        return ret

    try:
        os.makedirs(path)
    except OSError:
        # Ignored so that an already existing folder is not an error.
        pass

    response = urlopen(imageurl)
    try:
        im = Image.open(response)
        tile_image(im, zoom, tilewidth, tileheight, path+"/", ret.tilename, bgcolor, margin_right, margin_bottom)
    finally:
        # Close the connection once the tiles are written (or on failure).
        response.close()
    return ret
def textcell (paras):
    """Build a text node from a list of paragraphs.

    The first paragraph (if any) becomes the node's 'text' list; every
    further paragraph becomes a child node of its own under 'children'.
    The 'children' key is absent when there is at most one paragraph.
    """
    first, rest = paras[:1], paras[1:]
    node = {'text': first}
    if rest:
        node['children'] = [{'text': [para]} for para in rest]
    return node
def name_to_path (name):
    """Return name with every "/" replaced by "_"."""
    return "_".join(name.split("/"))
def render_article (wiki, ref, basepath="tiles", depth=0, maxdepth=3, tilewidth=256, tileheight=256):
    """Render one wiki article as a node of text and image cells.

    wiki -- site object used for the API calls
    ref -- a Page, a page URL (anything starting with "http") or a page title
    basepath -- folder that receives the image tiles and the rendered article
    depth, maxdepth -- accepted but not used by the current implementation
    tilewidth, tileheight -- tile dimensions passed on to the renderers

    Returns the node built by recursiverender, with 'text' set to a caption
    that links to the article and, when the article has images, 'image'
    copied from the first one.
    """
    print ("render_article", ref, file=sys.stderr)
    # Normalize ref so that page, title and ref (as URL) are all available.
    if isinstance(ref, Page):
        page = ref
        title = page.name
        ref = wiki_title_to_url(wiki, page.name)
    elif ref.startswith("http"):
        title = wiki_url_to_title(ref)
        page = wiki.pages[title]
    else:
        title = ref
        page = wiki.pages[title]
        ref = wiki_title_to_url(wiki, page.name)

    parse = wiki.parse(page=title)
    html_text = parse['text']['*']
    tree = html5lib.parse(html_text, treebuilder="etree", namespaceHTMLElements=False)
    body = tree.find("./body")

    paras = []
    images = []
    seen_srcs = set()   # image page URLs already turned into a cell

    for c in body:
        if c.tag == "p":
            # filter out paras like <p><br></p> by checking the text-only render length
            ptext = ET.tostring(c, encoding="utf-8", method="text").strip()
            if len(ptext) > 0:
                paras.append(ET.tostring(c, encoding="utf-8", method="html").strip())
        elif c.tag == "ul" and "gallery" in (c.attrib.get("class") or ""):
            for src, caption, article in parse_gallery(c):
                src = wiki_absurl(wiki, src)
                if src in seen_srcs:
                    continue
                seen_srcs.add(src)
                print ("GalleryImage", src, caption, article, file=sys.stderr)
                images.append(ensure_wiki_image_tiles(wiki, src, caption, basepath, tilewidth=tilewidth, tileheight=tileheight).zoom())

    # Image links anywhere in the body that were not already part of a gallery
    for a in body.findall('.//a[@class="image"]'):
        caption = a.attrib.get("title", '')
        src = wiki_absurl(wiki, a.attrib.get("href"))
        # skipping svg for the moment (can't go straight to PIL)
        if src.endswith(".svg"):
            continue
        print (u"Image_link {0}:'{1}'".format(src, caption).encode("utf-8"), file=sys.stderr)
        if src in seen_srcs:
            continue
        seen_srcs.add(src)
        images.append(ensure_wiki_image_tiles(wiki, src, caption, basepath, tilewidth=tilewidth, tileheight=tileheight).zoom())

    print ("{0} paras, {1} images".format(len(paras), len(images)), file=sys.stderr)

    # Honor basepath for the rendered article as well (it is "tiles" by default).
    basename = basepath + "/" + name_to_path(page.name)

    cells = []
    if paras:
        cells.append(textcell(paras))
    cells.extend(images)
    ret = recursiverender(cells, basename, tilewidth=tilewidth, tileheight=tileheight)
    ret['text'] = u"""<p class="caption"><a class="url" href="{1}">{0}</a></p>""".format(title, ref)
    if images:
        ret['image'] = images[0]['image']
    return ret
# article = {}
# article['text'] = title
# article['children'] = children = []
# children.append(textcell(paras))
# for iz in images[:2]:
# if 'image' not in article and 'image' in iz:
# article['image'] = iz['image']
# children.append(iz)
# restimages = images[2:]
# if len(restimages) == 1:
# children.append(restimages[0])
# elif len(restimages) > 1:
# children.append(gridrender(restimages, basename))
# return article
def render_category (wiki, cat, output="tiles", tilewidth=256, tileheight=256):
    """Render every member page of a category and lay the results out in a grid.

    wiki -- site object used for the API calls
    cat -- category page URL (anything starting with "http") or page title
    output -- folder of the grid; the grid itself is named after the
        category with ":" replaced by "_"
    tilewidth, tileheight -- tile dimensions passed on to the renderers

    Returns the node built by gridrender, with 'text' set to a caption that
    links to the category page.
    """
    print ("Render Category", cat, file=sys.stderr)
    # Resolve cat (URL or title) to the wiki page object of the category.
    if cat.startswith("http"):
        cat = wiki.pages[wiki_url_to_title(cat)]
    else:
        cat = wiki.pages[cat]
    print ("cat", cat, file=sys.stderr)

    pages = sorted(cat.members(), key=lambda x: x.name)
    pagenodes = [render_article(wiki, x.name, tilewidth=tilewidth, tileheight=tileheight) for x in pages]
    for page, node in zip(pages, pagenodes):
        node['text'] = u"""<p class="caption"><a class="url" href="{1}">{0}</a></p>""".format(page.name, wiki_title_to_url(wiki, page.name))

    ret = gridrender(pagenodes, output+"/"+cat.name.replace(":", "_"), tilewidth=tilewidth, tileheight=tileheight)
    ret['text'] = u"""<p class="caption"><a class="url" href="{0}">{1}</a></p>""".format(wiki_title_to_url(wiki, cat.name), cat.name)
    return ret
# for p in pages:
# print (p.name, wiki_title_to_url(wiki, p.name))
def make_category (args):
    """Sub-command 'category': render a category and print the result.

    The output folder and tile dimensions come from args.output,
    args.tilewidth and args.tileheight, as for the other sub-commands.
    Prints HTML when args.html is set, otherwise indented JSON.
    """
    wiki = Site((args.wikiprotocol, args.wikihost), path=args.wikipath)
    root_node = render_category(wiki, args.category, output=args.output, tilewidth=args.tilewidth, tileheight=args.tileheight)
    if args.html:
        print (html(root_node, ""))
    else:
        print (json.dumps(root_node, indent=2))
def make_article (args):
    """Sub-command 'article': render args.wikipage and print the result.

    Prints HTML when args.html is set, otherwise indented JSON.
    """
    site = Site((args.wikiprotocol, args.wikihost), path=args.wikipath)
    node = render_article(site, args.wikipage, tilewidth=args.tilewidth, tileheight=args.tileheight)
    rendered = html(node, "") if args.html else json.dumps(node, indent=2)
    print (rendered)
def make_gallery(args):
    """Sub-command 'gallery': render one or more pages and print the result.

    A single page is rendered as an article. Several pages are rendered one
    by one (names containing "Category:" as categories, the rest as
    articles) and combined with recursiverender when args.recursive is set,
    otherwise with gridrender. The combined render is written under
    args.output using args.name, or "gallery" when no name was given.
    Prints HTML when args.html is set, otherwise indented JSON.
    """
    wiki = Site((args.wikiprotocol, args.wikihost), path=args.wikipath)
    if len(args.wikipage) == 1:
        root_node = render_article(wiki, args.wikipage[0], tilewidth=args.tilewidth, tileheight=args.tileheight)
    else:
        children = []
        for wikipage in args.wikipage:
            print ("rendering", wikipage, file=sys.stderr)
            if "Category:" in wikipage:
                # Use the same tile dimensions as the enclosing layout.
                cnode = render_category(wiki, wikipage, args.output, tilewidth=args.tilewidth, tileheight=args.tileheight)
            else:
                cnode = render_article(wiki, wikipage, tilewidth=args.tilewidth, tileheight=args.tileheight)
            children.append(cnode)

        # args.name is None when --name is omitted; concatenating it into the
        # path would raise TypeError, so fall back to a fixed name.
        name = args.name if args.name else "gallery"
        layout = recursiverender if args.recursive else gridrender
        # NOTE(review): direction is fixed to 1 here although the command
        # line defines --direction -- confirm which one is intended.
        root_node = layout(children, args.output+"/"+name, direction=1, tilewidth=args.tilewidth, tileheight=args.tileheight)

    if args.html:
        print (html(root_node, ""))
    else:
        print (json.dumps(root_node, indent=2))
from time import sleep
# 7 years ago
def testwiki (args):
    """Sub-command 'testwiki': create the Site for the wiki described by args and return it."""
    return Site((args.wikiprotocol, args.wikihost), path=args.wikipath)
# Namespace number that recentfiles compares page.namespace against to pick
# user pages out of a category (2 is MediaWiki's standard "User" namespace).
USER_NS = 2
def imageinfo_with_thumbnail (site, name):
    """Query the API for the imageinfo of the page titled name.

    Requests the url and mime properties with iiurlwidth=1024 and returns
    the first imageinfo entry of the first page in the response, or None
    when the response lists no pages.
    """
    query = dict(titles=name, prop="imageinfo", iiprop="url|mime", iiurlwidth=1024)
    response = site.api("query", **query)
    pages = response['query']['pages']
    for page_id in pages:
        # only the first page of the response is looked at
        return pages[page_id]['imageinfo'][0]
    return None
def recentfiles (args):
    """Sub-command 'recentfiles': output recently uploaded images as a JSON stream.

    Queries the wiki's allimages list (newest first), optionally keeps only
    uploads by users that are members of args.usercategory, downloads every
    image to a local folder and produces one JSON object per image with the
    keys 'url' (local path), 'date' (upload timestamp) and 'text' (HTML
    caption linking to a page that uses the file, or else the file page).

    With args.json the 'date' of the last line of that file is used as the
    end of the query and the new items are appended to the file; otherwise
    the items are printed on stdout. Items are output oldest first.
    Progress and diagnostic messages go to stderr, so stdout carries only
    the JSON stream.
    """
    # open connection to wiki
    wiki = Site((args.wikiprotocol, args.wikihost), path=args.wikipath)

    # Prepare user list to filter (if args.usercategory)
    filter_by_users = None
    if args.usercategory:
        filter_by_users = set()
        usercategory = wiki.categories.get(args.usercategory)
        for p in usercategory.members():
            if p.namespace == USER_NS:
                filter_by_users.add(p.page_title)

    # Load args.json and remember the date of its last line
    last_date = None
    if args.json:
        try:
            with open (args.json) as f:
                print ("Reading {0}".format(args.json), file=sys.stderr)
                for line in f:
                    data = json.loads(line)
                    if 'date' in data:
                        last_date = data['date']
        except IOError:
            # The file cannot be read (e.g. it does not exist yet):
            # continue without a date taken from it.
            pass

    # Prepare the query arguments
    qargs = {
        'list': "allimages",
        'ailimit': 50,
        'aisort': 'timestamp',
        'aidir': 'descending',
        'aiprop': "timestamp|url|user|userid"
    }
    if args.oldest:
        qargs['aiend'] = args.oldest
    if last_date:
        # the date read from the json file takes precedence over --oldest
        print ("Using aiend {0}".format(last_date), file=sys.stderr)
        qargs['aiend'] = last_date

    count = 0 # used to satisfy --limit when given
    skipped_users = set() # nicety for outputting names only once when skipped
    items_to_output = []

    # LOOP for continuing queries as needed
    while True:
        qq = wiki.api('query', **qargs)
        results = qq['query']['allimages']
        for r in results:
            # Filter on user
            if filter_by_users is not None and r['user'] not in filter_by_users:
                if r['user'] not in skipped_users:
                    print ("Skipping user {0}".format(r['user']), file=sys.stderr)
                    skipped_users.add(r['user'])
                continue

            try:
                # Filter on mime type (image/*)
                filepage = wiki.pages.get(r['title'])
                # mwclient's imageinfo doesn't have mime (or thumbnail info)
                imageinfo = imageinfo_with_thumbnail(wiki, r['title'])
                if not imageinfo['mime'].startswith("image/"):
                    print ("Skipping non image ({0}) {1}".format(imageinfo['mime'], r['title']), file=sys.stderr)
                    continue

                # Deal with the edge case that items with timestamp == aiend are returned
                if last_date and r['timestamp'] == last_date:
                    print ("SKIPPING AIEND item", file=sys.stderr)
                    break

                # Construct an item for output
                print ("[{0}], date:{1}".format(filepage.page_title, r['timestamp']), file=sys.stderr)
                usagepage = None
                for usagepage in filepage.imageusage():
                    break # just grab the first usage page

                # url : local path to file
                # NOTE(review): the URL prefix is hard-coded for pzwiki.wdka.nl;
                # image URLs with another prefix are left unchanged by replace().
                imageurl = imageinfo['url']
                localpath = imageurl.replace("https://pzwiki.wdka.nl/mw-mediadesign/images/", "wiki/")

                # wget image from wiki to local folder
                if not os.path.exists(localpath):
                    try:
                        os.makedirs(os.path.split(localpath)[0])
                    except OSError:
                        # Ignored so that an already existing folder is not an error.
                        pass
                    print (" downloading {0} to {1}".format(imageurl, localpath), file=sys.stderr)
                    wget(imageurl, localpath)

                item = {}
                item['url'] = localpath
                item['date'] = r['timestamp']
                userpage = wiki.pages.get('User:'+r['user'])
                # Link to the first page using the file, or to the file page itself.
                linked = usagepage if usagepage else filepage
                item['text'] = '<a href="{0}">{1}</a><br>Uploaded by <a href="{2}">{3}</a>'.format(
                    page_url(wiki, linked),
                    linked.page_title,
                    page_url(wiki, userpage),
                    r['user'])
                items_to_output.append(item)

                # honor --limit
                count += 1
                if args.limit and count == args.limit:
                    break
            except APIError as e:
                print ("Error {0}, skipping".format(e), file=sys.stderr)

        if args.limit and count == args.limit:
            break
        # continue the query if possible (pre-loop)...
        if 'continue' in qq:
            qargs['aicontinue'] = qq['continue']['aicontinue']
        else:
            # we've reached the end of the query data
            break

    # OUTPUT RESULTS
    # reverse to be chronological
    items_to_output.reverse()
    if args.json:
        with open(args.json, "a") as f:
            for x in items_to_output:
                print (json.dumps(x), file=f)
    else:
        for x in items_to_output:
            print (json.dumps(x))
# 7 years ago
# 7 years ago
if __name__ == "__main__":
    # Command line entry point: options shared by all sub-commands first,
    # then one sub-parser per sub-command, each bound to its handler via func.
    ap = ArgumentParser("")
    ap.add_argument("--wikiprotocol", default="https")
    ap.add_argument("--wikihost", default="pzwiki.wdka.nl")
    ap.add_argument("--wikipath", default="/mw-mediadesign/")
    ap.add_argument("--wikishortpath", default="/mediadesign/")
    ap.add_argument("--tilewidth", type=int, default=256)
    ap.add_argument("--tileheight", type=int, default=256)
    ap.add_argument("--output", default="tiles")

    subparsers = ap.add_subparsers(help='sub-command help')

    ap_article = subparsers.add_parser('article', help='Render an article')
    ap_article.add_argument("wikipage")
    ap_article.add_argument("--html", default=False, action="store_true")
    ap_article.set_defaults(func=make_article)

    ap_gallery = subparsers.add_parser('gallery', help='Render a gallery of articles')
    ap_gallery.add_argument("wikipage", nargs="+")
    ap_gallery.add_argument("--html", default=False, action="store_true")
    ap_gallery.add_argument("--recursive", default=False, action="store_true")
    ap_gallery.add_argument("--direction", type=int, default=3, help="cell to recursively expand into, 0-3, default: 3 (bottom-right)")
    ap_gallery.add_argument("--name", default=None)
    ap_gallery.set_defaults(func=make_gallery)

    ap_testwiki = subparsers.add_parser('testwiki', help='Open the wiki connection and exit')
    ap_testwiki.set_defaults(func=testwiki)

    ap_category = subparsers.add_parser('category', help='Render a category')
    ap_category.add_argument("category")
    ap_category.add_argument("--html", default=False, action="store_true")
    ap_category.set_defaults(func=make_category)

    ap_recentfiles = subparsers.add_parser('recentfiles', help='Incrementally update a json file with information about recent files')
    ap_recentfiles.add_argument("--usercategory", help="limit to activity by users that are members of this category")
    ap_recentfiles.add_argument("--limit", type=int, help="limit")
    ap_recentfiles.add_argument("--oldest", default=None, help="No results earlier than this timestamp (e.g. 2018-01-01T00:00:00Z)")
    ap_recentfiles.add_argument("--json", default=None, help="Use this json file as both input (to check last timestamp) and output -- append results chronologically as json-stream.")
    ap_recentfiles.set_defaults(func=recentfiles)

    args = ap.parse_args()
    ret = args.func(args)