scan-utils/wiki-download.py

#! /usr/bin/env python
# -*- coding: utf-8 -*-

from mwclient import Site, image
from argparse import RawTextHelpFormatter
from argparse import ArgumentParser
import pprint, os, urllib2

####
# Arguments
# todo add search query
####
# Command-line interface. RawTextHelpFormatter keeps the line breaks of the
# description below, so the example commands show up verbatim in --help.
p = ArgumentParser(description="""
# Description: Script downloads media files from a wiki, according to a semantic ask query. 

# Examples: 
Downloads files with property Modification_date: 
python wiki-download.py -d imgs -a [[Modification_date::+]]  

Downloads files semantically tagged as belonging to Year 2007 and having Document_Type Poster: 
python wiki-download.py -d imgs -a [[Year::2007]][[Document_Type::Poster]]""", formatter_class=RawTextHelpFormatter)

# metavar='' hides the placeholder value name in the generated usage text.
p.add_argument("--host",  metavar='', default="aa.xpub.nl")
p.add_argument("--path", metavar='', default="/", help="Wiki path. Should end with /")
# The ask conditions are appended to '[[File:+]]' when the query is built.
p.add_argument("--ask", "-a", metavar='', default="", help="Ask query to be sent to the wiki API.")
# An empty value disables downloading (the download step checks for a non-empty directory).
p.add_argument("--download", "-d", metavar='', default='imgs', help="Local directory to store files from wiki. If not specified, defaults to imgs")
# --verbose only adds the metadata dump; files are downloaded with or without it.
p.add_argument("--verbose", "-v", action='store_true', help="Increase verbosity: also print the full image info of each file")

args = p.parse_args()
# Echo the parsed options; a parenthesised single argument prints the same
# text as the Python 2 print statement.
print(args)

#########
# defs
#########
def mwsite(host, path):
    """Build the mwclient Site for a wiki.

    host -- wiki host name, paired with the 'http' scheme
    path -- wiki path handed to Site unchanged

    Returns the Site instance.
    """
    return Site(('http', host), path)


# Connect to the wiki selected on the command line.
site = mwsite(args.host, args.path)
#print site


# Every query is restricted to file pages; the user's conditions are appended.
query = '[[File:+]]' + args.ask
# Single-argument parenthesised prints give the same output as the
# Python 2 print statement.
print('Query: ' + query)

# examples:
# [[File:+]][[Year::+]][[Document_Type::Poster]]
# [[File:+]][[Year::2007]][[Document_Type::Poster]]
# same as: api.php?action=ask&query=[[File:%2B]][[Year::2007]][[Document_Type::Poster]]
# can write compound query such as Year::2000 Medium::Flyer


for answer in site.ask(query):
    # The entry under the answer's first key carries the page's 'fulltext' name.
    img_key = next(iter(answer))
    img_name = answer[img_key]['fulltext']
    # NOTE(review): the whole answer mapping (not a title string) is passed
    # as the image name, as in the original -- confirm mwclient resolves it.
    img = image.Image(site, answer)
    img_info = img.imageinfo  # dict includes url and extensive metadata
    print(img_name)

    # FILE DOWNLOAD -- skipped when no url is reported or --download is empty
    if 'url' in img_info and args.download:
        img_url = img_info['url']
        # Downgrade only the scheme to http; a blanket replace() would also
        # rewrite "https" occurring inside the file name part of the URL.
        if img_url.startswith('https://'):
            img_url = 'http://' + img_url[len('https://'):]

        if not os.path.exists(args.download):
            os.makedirs(args.download)

        # Build the target once so the file written and the path reported
        # are the same, with exactly one '/' after the directory.
        if args.download.endswith('/'):
            download_path = args.download
        else:
            download_path = args.download + '/'
        img_path = download_path + img_name

        img_data = urllib2.urlopen(img_url)
        try:
            # 'with' closes the output file even if the read or write fails.
            with open(img_path, 'wb') as img_file:
                img_file.write(img_data.read())
        finally:
            # urllib2 responses are not context managers; close explicitly.
            img_data.close()
        print(u'Saved to {}'.format(img_path))

    if args.verbose:
        pprint.pprint(img_info)

    # Blank line between files.
    print('')
imposition and wiki-download in da house 7 years ago			`#! /usr/bin/env python`
			`# -- coding: utf-8 --`

			`from mwclient import Site, image`
			`from argparse import RawTextHelpFormatter`
			`from argparse import ArgumentParser`
			`import pprint, os, urllib2`

			`####`
			`# Arguments`
			`# todo add search query`
			`####`
			`p = ArgumentParser(description="""`
			`# Description: Script dowloads media files from a wiki, according to a sematic ask query.`

			`# Examples:`
			`Downloads files with property Modification_date:`
			`python wiki-download.py -d imgs -a [[Modification_date::+]]`

			`Downloads files semantically tagged as belonging to Year 2007 and having Document_Type Poster:`
			`python wiki-download.py -d imgs -a [[Year::2007]][[Document_Type::Poster]]""", formatter_class=RawTextHelpFormatter)`

			`p.add_argument("--host", metavar='', default="aa.xpub.nl")`
			`p.add_argument("--path", metavar='', default="/", help="Wiki path. Should end with /")`
			`p.add_argument("--ask", "-a", metavar='', default="", help="Ask query to be sent to the wiki API.")`
Fixed help message 7 years ago			`p.add_argument("--download", "-d", metavar='', default='imgs', help="Local directory to store files from wiki. If not specified, defaults to imgs")`
imposition and wiki-download in da house 7 years ago			`p.add_argument("--verbose", "-v", action='store_true', help="Increase verbosity. If not given no files will be downloaded")`

			`args = p.parse_args()`
			`print args`

			`#########`
			`# defs`
			`#########`
			`def mwsite(host, path):`
			`site = Site(('http',host), path)`
			`return site`


			`site = mwsite(args.host, args.path)`
running on mwclient 0.8.6 (current version) 7 years ago			`#print site`
imposition and wiki-download in da house 7 years ago


			`query= ('[[File:+]]'+ args.ask)`
adding print statements 7 years ago			`print 'Query:', query`
imposition and wiki-download in da house 7 years ago
			`# examples:`
			`# [[File:+]][[Year::+]][[Document_Type::Poster]]`
			`# [[File:+]][[Year::2007]][[Document_Type::Poster]]`
			`# same as: api.php?action=ask&query=[[File:%2B]][[Year::2007]][[Document_Type::Poster]]`
			`# can write compound query such as Year::2000 Medium::Flyer`


			`for answer in site.ask(query):`
running on mwclient 0.8.6 (current version) 7 years ago			`img_key = answer.keys()[0]`
			`img_name = answer[img_key]['fulltext']`
			`img_page_url = answer[img_key]['fullurl']`
imposition and wiki-download in da house 7 years ago			`img = image.Image(site, answer)`
			`img_info = img.imageinfo # dict includes url and extensive metadata`
adding print statements 7 years ago			`print img_name`
imposition and wiki-download in da house 7 years ago			`# FILE DOWNLOAD`
			`if 'url' in img_info.keys() and len(args.download) > 0:`
			`img_url = (img_info['url']).replace('https','http')`

			`if os.path.exists(args.download) is False:`
			`os.makedirs(args.download)`

			`img_data = urllib2.urlopen( img_url )`
running on mwclient 0.8.6 (current version) 7 years ago			`img_file = open(u'{}/{}'.format(args.download, img_name), 'wb')`
imposition and wiki-download in da house 7 years ago			`img_file.write(img_data.read())`
			`img_file.close()`
running on mwclient 0.8.6 (current version) 7 years ago			`global download_path`
			`if args.download[-1] != '/':`
			`download_path = args.download+'/'`
			`else:`
			`download_path = args.download`
			`print u'Saved to {}'.format(download_path + img_name)`
imposition and wiki-download in da house 7 years ago			`if args.verbose:`
			`pprint.pprint( img_info )`

			`print`