scan-utils/wiki-download.py

#! /usr/bin/env python
# -*- coding: utf-8 -*-

from mwclient import Site, image
from argparse import RawTextHelpFormatter
from argparse import ArgumentParser
import pprint, os, urllib2

####
# Arguments
# todo add search query
####
# Command-line interface. RawTextHelpFormatter keeps the line breaks of the
# description below intact, so the example commands print exactly as written.
p = ArgumentParser(description="""
# Description: Script dowloads media files from a wiki, according to a sematic ask query.

# Examples:
Downloads files with property Modification_date:
python wiki-download.py -d imgs -a [[Modification_date::+]]

Downloads files semantically tagged as belonging to Year 2007 and having Document_Type Poster:
python wiki-download.py -d imgs -a [[Year::2007]][[Document_Type::Poster]]""", formatter_class=RawTextHelpFormatter)

# metavar='' suppresses the upper-cased placeholder in the generated usage text.
p.add_argument("--host", metavar='', default="aa.xpub.nl")
p.add_argument("--path", metavar='', default="/", help="Wiki path. Should end with /")
p.add_argument("--ask", "-a", metavar='', default="", help="Ask query to be sent to the wiki API.")
p.add_argument("--download", "-d", metavar='', default='', help="Local directory to store files from wiki. If no directory provided files wont be downloaded")
# The previous help text claimed that omitting --verbose disables downloads;
# that sentence belonged to --download and was wrong for this flag.
p.add_argument("--verbose", "-v", action='store_true', help="Increase verbosity: also print each file's page text and metadata.")

args = p.parse_args()
# Echo the parsed options; the parenthesised form is valid on Python 2 and 3.
print(args)

#########
# defs
#########
def mwsite(host, path):
    """Open a connection to a MediaWiki installation over plain HTTP.

    host -- wiki host name, passed to mwclient together with the 'http' scheme
    path -- script path of the wiki on that host

    Returns the mwclient Site object built from those two values.
    """
    endpoint = ('http', host)
    return Site(endpoint, path)


site = mwsite(args.host, args.path)
#print site


# Restrict results to pages in the File namespace, then append whatever
# extra conditions the user supplied with --ask.
query = '[[File:+]]' + args.ask

# examples:
# [[File:+]][[Year::+]][[Document_Type::Poster]]
# [[File:+]][[Year::2007]][[Document_Type::Poster]]
# same as: api.php?action=ask&query=[[File:%2B]][[Year::2007]][[Document_Type::Poster]]
# can write compound query such as Year::2000 Medium::Flyer


for answer in site.ask(query):
    # The code below treats each answer as a one-entry dict whose single
    # value holds the page properties. next(iter(...)) fetches that one key
    # on both Python 2 and 3.
    img_key = next(iter(answer))
    img_name = answer[img_key]['fulltext']
    # Build the Image from the page title rather than from the whole result
    # dict: a title is what mwclient's page constructor expects.
    img = image.Image(site, img_name)
    img_info = img.imageinfo  # dict includes url and extensive metadata

    # FILE DOWNLOAD: only when the wiki reports a file URL and the user
    # asked for a download directory.
    if 'url' in img_info and args.download:
        img_url = img_info['url']
        # Downgrade the scheme only. A blanket str.replace would also rewrite
        # any "https" that happens to appear inside the file name.
        if img_url.startswith('https://'):
            img_url = 'http://' + img_url[len('https://'):]

        # Created lazily so that a query without downloadable results does
        # not leave an empty directory behind.
        if not os.path.exists(args.download):
            os.makedirs(args.download)

        # One path for both saving and reporting; os.path.join copes with a
        # trailing slash on the directory argument.
        target_path = os.path.join(args.download, img_name)

        img_data = urllib2.urlopen(img_url)
        try:
            # The with-block closes the file even if the transfer fails.
            with open(target_path, 'wb') as img_file:
                img_file.write(img_data.read())
        finally:
            img_data.close()
        print(u'Saved to {}'.format(target_path))

    if args.verbose:
        # Fetch the page text only on request: it costs an extra API call.
        # (This previously referenced an undefined name and raised NameError.)
        img_text = img.text()
        print(u'File text: {}'.format(img_text))
        pprint.pprint(img_info)

    # Blank line between results.
    print('')