#! /usr/bin/env python
# -*- coding: utf-8 -*-
"""Download media files from a wiki, selected by a semantic ask query.

The script sends ``[[File:+]]`` plus the user supplied ask conditions to the
wiki, prints the name of every matching file and, unless the download
directory is an empty string, stores each file locally.

The code is written so that it runs unchanged on Python 2 and Python 3.
"""

import os
import pprint
import shutil
from argparse import ArgumentParser
from argparse import RawTextHelpFormatter
from contextlib import closing

try:
    from urllib2 import urlopen  # Python 2
except ImportError:
    from urllib.request import urlopen  # Python 3

####
# Arguments
# todo add search query
####
DESCRIPTION = """
# Description:
Script downloads media files from a wiki, according to a semantic ask query.

# Examples:
Downloads files with property Modification_date:
    python wiki-download.py -d imgs -a [[Modification_date::+]]

Downloads files semantically tagged as belonging to Year 2007 and having Document_Type Poster:
    python wiki-download.py -d imgs -a [[Year::2007]][[Document_Type::Poster]]"""


def build_parser():
    """Return the ArgumentParser describing the command line options."""
    p = ArgumentParser(description=DESCRIPTION,
                       formatter_class=RawTextHelpFormatter)
    p.add_argument("--host", metavar='', default="aa.xpub.nl")
    p.add_argument("--path", metavar='', default="/",
                   help="Wiki path. Should end with /")
    p.add_argument("--ask", "-a", metavar='', default="",
                   help="Ask query to be sent to the wiki API.")
    p.add_argument("--download", "-d", metavar='', default='imgs',
                   help="Local directory to store files from wiki. "
                        "If not specified, defaults to imgs. "
                        "Pass an empty string to skip downloading.")
    p.add_argument("--verbose", "-v", action='store_true',
                   help="Increase verbosity: also print the imageinfo "
                        "metadata of each file.")
    return p


#########
# defs
#########
def mwsite(host, path):
    """Return an mwclient ``Site`` for ``host`` at ``path``, using http."""
    # Imported lazily so that --help and the helpers below keep working on a
    # machine where mwclient is not installed.
    from mwclient import Site
    site = Site(('http', host), path)
    return site


def force_http(url):
    """Return ``url`` with a leading ``https://`` scheme replaced by ``http://``.

    Only the scheme is rewritten; any other occurrence of "https" in the URL
    (for example inside a file name) is left untouched.
    """
    prefix = 'https://'
    if url.startswith(prefix):
        return 'http://' + url[len(prefix):]
    return url


def download_file(url, directory, name):
    """Fetch ``url`` and store it as ``name`` inside ``directory``.

    ``directory`` is created when it does not exist yet.  Returns the path of
    the written file.
    """
    if not os.path.isdir(directory):
        os.makedirs(directory)
    target = os.path.join(directory, name)
    # closing()/with guarantee that both the HTTP response and the local file
    # are released even when the transfer fails half-way; copyfileobj streams
    # in chunks instead of holding the whole file in memory.
    with closing(urlopen(url)) as response:
        with open(target, 'wb') as out:
            shutil.copyfileobj(response, out)
    return target


def main(argv=None):
    """Run the ask query and list (and optionally download) matching files.

    ``argv`` is the list of command line arguments; ``None`` makes argparse
    read ``sys.argv``.
    """
    from mwclient import image  # lazy for the same reason as in mwsite()

    args = build_parser().parse_args(argv)
    print(args)

    site = mwsite(args.host, args.path)

    query = '[[File:+]]' + args.ask
    print('Query: {}'.format(query))
    # examples:
    # [[File:+]][[Year::+]][[Document_Type::Poster]]
    # [[File:+]][[Year::2007]][[Document_Type::Poster]]
    # same as: api.php?action=ask&query=[[File:%2B]][[Year::2007]][[Document_Type::Poster]]
    # can write compound query such as Year::2000 Medium::Flyer

    for answer in site.ask(query):
        img_key = list(answer.keys())[0]
        img_name = answer[img_key]['fulltext']
        # NOTE(review): the whole answer dict (not img_name) is handed to
        # Image as the page name, exactly as before -- confirm this is what
        # the installed mwclient version expects.
        img = image.Image(site, answer)
        img_info = img.imageinfo  # dict includes url and extensive metadata
        print(img_name)

        # FILE DOWNLOAD
        if args.download and 'url' in img_info:
            saved_path = download_file(force_http(img_info['url']),
                                       args.download, img_name)
            print(u'Saved to {}'.format(saved_path))

        if args.verbose:
            pprint.pprint(img_info)
        # Blank line between the reports of two files.
        print('')


if __name__ == '__main__':
    main()