You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

88 lines
2.7 KiB
Python

#! /usr/bin/env python
# -*- coding: utf-8 -*-
from mwclient import Site, image
from argparse import RawTextHelpFormatter
from argparse import ArgumentParser
import pprint, os, urllib2
####
# Arguments
# todo add search query
####
# Command-line interface. Every option has a default, so the script can be
# started without any arguments.
p = ArgumentParser(description="""
# Description: Script dowloads media files from a wiki, according to a sematic ask query.
# Examples:
Downloads files with property Modification_date:
python wiki-download.py -d imgs -a [[Modification_date::+]]
Downloads files semantically tagged as belonging to Year 2007 and having Document_Type Poster:
python wiki-download.py -d imgs -a [[Year::2007]][[Document_Type::Poster]]""", formatter_class=RawTextHelpFormatter)
# metavar='' keeps the usage/help output free of placeholder value names.
p.add_argument("--host", metavar='', default="aa.xpub.nl")
p.add_argument("--path", metavar='', default="/", help="Wiki path. Should end with /")
p.add_argument("--ask", "-a", metavar='', default="", help="Ask query to be sent to the wiki API.")
p.add_argument("--download", "-d", metavar='', default='', help="Local directory to store files from wiki. If no directory provided files wont be downloaded")
# Verbosity only controls extra output; whether files are downloaded is
# decided by --download alone, so the help text must not suggest otherwise.
p.add_argument("--verbose", "-v", action='store_true', help="Increase verbosity: print extra information about each file.")
args = p.parse_args()
# Echo the parsed options so the effective settings are visible in the output.
print(args)
#########
# defs
#########
def mwsite(host, path):
    """Build the mwclient Site for the wiki at ``host`` under ``path``.

    The connection is always requested over plain 'http'.
    """
    return Site(('http', host), path)
site = mwsite(args.host, args.path)
# The query always starts with the [[File:+]] condition; the user-supplied
# ask conditions are appended to it.
query = '[[File:+]]' + args.ask
# examples:
# [[File:+]][[Year::+]][[Document_Type::Poster]]
# [[File:+]][[Year::2007]][[Document_Type::Poster]]
# same as: api.php?action=ask&query=[[File:%2B]][[Year::2007]][[Document_Type::Poster]]
# can write compound query such as Year::2000 Medium::Flyer
for answer in site.ask(query):
    # The first key of the answer leads to the result record.
    img_key = next(iter(answer))
    img_name = answer[img_key]['fulltext']
    # NOTE(review): the whole answer mapping (not img_name) is passed as the
    # page name -- confirm this is what the installed mwclient expects.
    img = image.Image(site, answer)
    img_info = img.imageinfo  # dict includes url and extensive metadata

    # FILE DOWNLOAD: only when the wiki reports a URL and a target directory
    # was given on the command line.
    if 'url' in img_info and args.download:
        img_url = img_info['url']
        # Downgrade only the scheme; a blanket replace would also rewrite any
        # 'https' that happens to appear later in the URL (e.g. in a file name).
        if img_url.startswith('https://'):
            img_url = 'http://' + img_url[len('https://'):]
        if not os.path.exists(args.download):
            os.makedirs(args.download)
        # One path is used both for writing and for the report below, so the
        # message always names the file that was actually written.
        target_path = os.path.join(args.download, img_name)
        img_data = urllib2.urlopen(img_url)
        try:
            # 'with' closes the output file even if the read or write fails.
            with open(target_path, 'wb') as img_file:
                img_file.write(img_data.read())
        finally:
            img_data.close()
        print(u'Saved to {}'.format(target_path))

    if args.verbose:
        # The page text is fetched only on request, since it costs an extra
        # call to the wiki.
        img_text = img.text()
        print(u'File text: {}'.format(img_text))
        pprint.pprint(img_info)
        print('')