#! /usr/bin/env python
# -*- coding: utf-8 -*-
from mwclient import Site , image
from argparse import RawTextHelpFormatter
from argparse import ArgumentParser
import pprint , os , urllib2
####
# Arguments
# todo add search query
####
# Command-line interface.
# The description doubles as the usage examples printed by --help, so
# RawTextHelpFormatter is used to keep its line breaks intact.
# The ask queries in the examples are quoted so the shell passes each one to
# the script as a single argument.
p = ArgumentParser(
    description="""
# Description: Script downloads media files from a wiki, according to a semantic ask query.
# Examples:
Downloads files with property Modification_date:
    python wiki-download.py -d imgs -a "[[Modification_date::+]]"
Downloads files semantically tagged as belonging to Year 2007 and having Document_Type Poster:
    python wiki-download.py -d imgs -a "[[Year::2007]][[Document_Type::Poster]]"
""",
    formatter_class=RawTextHelpFormatter)
p.add_argument("--host", metavar='', default="aa.xpub.nl",
               help="Host name of the wiki.")
p.add_argument("--path", metavar='', default="/",
               help="Wiki path. Should end with /")
p.add_argument("--ask", "-a", metavar='', default="",
               help="Ask query to be sent to the wiki API.")
# The default must be the empty string: the download step is skipped when no
# directory is given.
p.add_argument("--download", "-d", metavar='', default='',
               help="Local directory to store files from wiki. "
                    "If no directory is provided, files won't be downloaded.")
p.add_argument("--verbose", "-v", action='store_true',
               help="Increase verbosity: pretty-print the image info of every file found.")
args = p.parse_args()
print(args)
#########
# defs
#########
def mwsite(host, path):
    """Build and return the mwclient ``Site`` for the wiki at *host*.

    host -- host name of the wiki, e.g. ``aa.xpub.nl``
    path -- path of the wiki on that host (should end with ``/``)

    The wiki is addressed over plain ``http``.
    """
    # The scheme has to be the bare token 'http'; padding around it would
    # end up in the URLs built from it.
    return Site(('http', host), path)
site = mwsite(args.host, args.path)
#print site

# Every query is restricted to pages in the File namespace; whatever was given
# with --ask is appended to narrow the selection further.
query = '[[File:+]]' + args.ask
print('Query: ' + query)
# examples:
# [[File:+]][[Year::+]][[Document_Type::Poster]]
# [[File:+]][[Year::2007]][[Document_Type::Poster]]
# same as: api.php?action=ask&query=[[File:%2B]][[Year::2007]][[Document_Type::Poster]]
# can write compound query such as Year::2000 Medium::Flyer

for answer in site.ask(query):
    # Each answer is a dict; the entry under its first key carries the
    # 'fulltext' title of the file page.
    img_key = next(iter(answer))
    img_name = answer[img_key]['fulltext']
    # Hand the page title (not the whole answer dict) to Image.
    # NOTE(review): assumes 'fulltext' is the full page title incl. namespace.
    img = image.Image(site, img_name)
    img_info = img.imageinfo  # dict includes url and extensive metadata
    print(img_name)

    # FILE DOWNLOAD: only when the wiki reports a file URL and a target
    # directory was given on the command line.
    if 'url' in img_info and args.download:
        img_url = img_info['url']
        # Fetch over plain http. Only the scheme is rewritten, so an 'https'
        # that happens to occur later in the URL is left alone.
        if img_url.startswith('https://'):
            img_url = 'http://' + img_url[len('https://'):]

        if not os.path.exists(args.download):
            os.makedirs(args.download)

        # One path is used both for writing and for the report below.
        target = os.path.join(args.download, img_name)
        response = urllib2.urlopen(img_url)
        try:
            # 'with' closes the output file even if reading/writing fails.
            with open(target, 'wb') as img_file:
                img_file.write(response.read())
        finally:
            response.close()
        print(u'Saved to {}'.format(target))

    if args.verbose:
        pprint.pprint(img_info)
    # Blank line separates the output of consecutive files.
    print('')