You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
88 lines
2.7 KiB
Python
88 lines
2.7 KiB
Python
#! /usr/bin/env python
|
|
# -*- coding: utf-8 -*-
|
|
|
|
from mwclient import Site, image
|
|
from argparse import RawTextHelpFormatter
|
|
from argparse import ArgumentParser
|
|
import pprint, os, urllib2
|
|
|
|
####
|
|
# Arguments
|
|
# todo add search query
|
|
####
|
|
# Command-line interface. RawTextHelpFormatter keeps the line breaks of the
# description below intact when it is shown by --help.
p = ArgumentParser(description="""
# Description: Script dowloads media files from a wiki, according to a sematic ask query.

# Examples:
Downloads files with property Modification_date:
python wiki-download.py -d imgs -a [[Modification_date::+]]

Downloads files semantically tagged as belonging to Year 2007 and having Document_Type Poster:
python wiki-download.py -d imgs -a [[Year::2007]][[Document_Type::Poster]]""", formatter_class=RawTextHelpFormatter)

# Location of the wiki: host name and the path under which it is served.
p.add_argument("--host", metavar='', default="aa.xpub.nl")
p.add_argument("--path", metavar='', default="/", help="Wiki path. Should end with /")

# Extra ask-query conditions; they are appended to the file filter later on.
p.add_argument("--ask", "-a", metavar='', default="", help="Ask query to be sent to the wiki API.")

# An empty --download (the default) means nothing is written to disk.
p.add_argument("--download", "-d", metavar='', default='', help="Local directory to store files from wiki. If no directory provided files wont be downloaded")

# --verbose only controls extra output; the help text previously claimed it
# also controlled downloading, which is decided by --download alone.
p.add_argument("--verbose", "-v", action='store_true', help="Increase verbosity. Prints the page text and image metadata of every matched file")

args = p.parse_args()
# Echo the parsed options so the effective settings are visible in the output.
print(args)
|
|
|
|
#########
|
|
# defs
|
|
#########
|
|
def mwsite(host, path):
    """Create and return the ``Site`` object for the given wiki.

    ``host`` is passed to ``Site`` paired with the ``'http'`` scheme and
    ``path`` is passed through unchanged as the wiki path.
    """
    return Site(('http', host), path)
|
|
|
|
|
|
# Connect to the wiki selected on the command line.
site = mwsite(args.host, args.path)

# The query always starts with the file filter; the user's --ask conditions
# are appended to it verbatim.
#
# Example queries:
#   [[File:+]][[Year::+]][[Document_Type::Poster]]
#   [[File:+]][[Year::2007]][[Document_Type::Poster]]
# The second one is the same as requesting:
#   api.php?action=ask&query=[[File:%2B]][[Year::2007]][[Document_Type::Poster]]
# Conditions can be combined into a compound query, e.g. Year::2000 Medium::Flyer
query = ''.join(('[[File:+]]', args.ask))
|
|
|
|
|
|
# Walk over every page returned by the ask query. For each one, optionally
# save its media file into the --download directory and, with --verbose,
# print the page text and the image metadata.
for answer in site.ask(query):
    # Each answer is indexed by a single key; the entry under that key holds
    # the page's 'fulltext' (used as page title and as local file name).
    img_key = next(iter(answer))
    img_name = answer[img_key]['fulltext']
    # Look the image up by its title rather than by the raw answer dict.
    # NOTE(review): assumes 'fulltext' is the full page title - confirm.
    img = image.Image(site, img_name)
    img_info = img.imageinfo  # dict includes url and extensive metadata

    # FILE DOWNLOAD: only when the image has a URL and a target directory
    # was given on the command line.
    if 'url' in img_info and args.download:
        img_url = img_info['url']
        # The file is fetched over plain http. Rewrite only the scheme so a
        # URL whose path happens to contain "https" is left intact.
        if img_url.startswith('https://'):
            img_url = 'http://' + img_url[len('https://'):]

        if not os.path.exists(args.download):
            os.makedirs(args.download)

        # os.path.join copes with --download given with or without a
        # trailing slash, so the written and the reported path always agree.
        img_path = os.path.join(args.download, img_name)
        img_data = urllib2.urlopen(img_url)
        try:
            with open(img_path, 'wb') as img_file:
                img_file.write(img_data.read())
        finally:
            # Release the HTTP response even if reading or writing failed.
            img_data.close()
        print(u'Saved to {}'.format(img_path))

    if args.verbose:
        # The page text is read from the wiki here; the variable printed
        # before was never assigned, so --verbose raised a NameError.
        print(u'File text: {}'.format(img.text()))
        pprint.pprint(img_info)

    # Blank line between the output of consecutive files.
    print('')
|
|
|
|
|
|
|
|
|
|
|
|
|