#! /usr/bin/env python
# -*- coding: utf-8 -*-
import urllib2 , json , pprint , re
import xml . etree . ElementTree as ET
import subprocess , shlex
# Presumably a session id; nothing in the visible part of this file reads it.
sid = ' 1234 '
# Browser-style User-Agent string; no visible request attaches it.
useragent = " Mozilla/5.001 (windows; U; NT4.0; en-US; rv:1.0) Gecko/25250101 "
# Base URL of the wiki API. api_request() appends the query string directly
# to this value, so it has to end with '&' and must not contain whitespace.
endpoint = "http://pzwiki.wdka.nl/mw-mediadesign/api.php?format=json&"
# API MODULES
def api_request(action, pagename):
    """Run one API query and return the record of the first page in the reply.

    Used to get a page's content, metadata, images or image info.

    action   -- query-string template with one `{}` placeholder.
    pagename -- value substituted into the placeholder (a page id or a title).

    The formatted query string is appended to the module-level `endpoint`,
    the reply is decoded as JSON, and the value stored under the first key of
    reply['query']['pages'] is returned.

    Raises AttributeError when the reply has no 'query'/'pages' section and
    IndexError when 'pages' is empty.
    """
    print(' API REQUEST ')
    print(pagename)
    query_string = action.format(pagename)
    print('%s %s' % (' TEST ', query_string))
    url = endpoint + query_string
    print('%s %s' % (' API REQUEST ', url))
    request = urllib2.urlopen(url)
    try:
        jsonp = json.loads(request.read())
    finally:
        # Release the connection even when reading or decoding fails.
        request.close()
    # The JSON reply nests the page records under "query" -> "pages".
    json_dic = jsonp.get('query').get('pages')
    # pprint.pprint( json_dic )
    # 'pages' is keyed by page id; only the first entry is used.
    page_id = list(json_dic)[0]
    return json_dic.get(page_id)
def api_page(pageid, query):
    """Fetch one kind of information about a wiki page through api_request().

    pageid -- page id for 'content', 'metadata' and 'articleimgs'; for 'file'
              and 'imageinfo' it is a file title instead (without 'File:').
    query  -- one of 'content', 'metadata', 'articleimgs', 'file',
              'imageinfo'. Surrounding whitespace is ignored.

    Returns the wikitext of the first revision for 'content'; for every other
    query the page record returned by api_request().

    Raises ValueError for an unknown `query` (previously this surfaced as an
    unbound local at the return statement).
    """
    templates = {
        'content': 'action=query&pageids={}&prop=revisions&rvprop=content',
        'metadata': 'action=query&pageids={}&prop=info',
        'articleimgs': 'action=query&pageids={}&prop=images',
        'file': 'action=query&titles=File:{}&prop=imageinfo&iiprop=url',
        # iiurlwidth determines the width of the thumbnail
        'imageinfo': 'action=query&titles=File:{}&prop=imageinfo&iiprop=url&iiurlwidth=500',
    }
    selector = query.strip()
    if selector not in templates:
        raise ValueError('unsupported query: %r' % (query,))

    if selector == 'imageinfo':
        # in imageinfo titles are used instead of id
        print('%s %s' % (' IMAGEINFO ', pageid))

    response = api_request(templates[selector], pageid)
    if selector == 'content':
        # Wikitext lives under the '*' key of the first revision.
        response = response.get('revisions')[0]['*']
    return response
def api_file_url(filename):
    """Return the full URL of an uploaded file, or None when none is listed.

    filename -- file title without the 'File:' prefix.

    Looks the file up through api_page() and reads the 'url' field of the
    first 'imageinfo' entry in the returned page record.
    """
    # The query selector is spelled exactly as api_page() in this file
    # compares it.
    page_content_dict = api_page(filename, ' file ')
    imageinfo = page_content_dict.get('imageinfo')
    if not imageinfo:
        # No (or an empty) imageinfo section in the page record.
        return None
    return imageinfo[0].get('url')
def api_thumb_url(filename):
    """Return the thumbnail URL of an uploaded file, or None if unavailable.

    filename -- file title without the 'File:' prefix.

    Looks the file up through api_page() and reads the 'thumburl' field of
    the first 'imageinfo' entry. A page record without image info yields
    None, matching api_file_url() and the None check in img_fullurl().
    The result is also printed.
    """
    print(''' get thumbnail url of image ''')
    # The query selector is spelled exactly as api_page() in this file
    # compares it.
    page_content_dict = api_page(filename, ' imageinfo ')
    imageinfo = page_content_dict.get('imageinfo')
    thumburl = imageinfo[0].get('thumburl') if imageinfo else None
    print(thumburl)
    return thumburl
# http://pzwiki.wdka.nl/mw-mediadesign/api.php?format=json&action=query&titles=File:2x2 905.jpg&prop=imageinfo&iiprop=url&iiurlwidth=300
# http://pzwiki.wdka.nl/mw-mediadesign/api.php?format=json&action=query&pageids=10603&prop=revisions&rvprop=content
# http://pzwiki.wdka.nl/mw-mediadesign/api.php?format=json&action=query&titles=Graduation_Website_Braindump&prop=revisions&rvprop=content
# XML MODULES
def write_html_file(html_tree, filename):
    """Serialize an ElementTree element as HTML and write it to `filename`.

    html_tree -- root element to serialize.
    filename  -- path of the file to create or overwrite.

    The doctype line is followed by the UTF-8 encoded HTML serialization of
    the tree. The file is written in binary mode so the bytes produced by
    ET.tostring() reach the disk unchanged.
    """
    doctype = " <!DOCTYPE HTML> "
    markup = ET.tostring(html_tree, encoding='utf-8', method='html')
    # The context manager closes the file even if the write fails.
    with open(filename, 'wb') as edited:
        edited.write(doctype.encode('utf-8') + markup)
# mw article modules
def parse_work_page(title, content):
    """Collect the fields of a "Graduation work" template into a dict.

    title   -- stored in the result under the title key.
    content -- page text that is matched against the template patterns.

    When the opening pattern matches at the start of `content`, the text is
    split into the template body and whatever follows it (`extra`). Every
    key/value pair found in the template body is copied into the dict; a
    non-empty `extra` is appended to the pairs as an ('Extra'-keyed) tuple
    and therefore stored as well. For a thumbnail key the thumbnail URL is
    looked up with api_thumb_url() and stored under a separate key.

    Returns a tuple (work_dict, extra); `extra` is the appended tuple when
    the trailing text was non-empty.

    NOTE(review): the pattern and key literals below contain spaces exactly
    as written here; confirm they match the real template syntax and the
    keys that consumers of the dict expect.
    NOTE(review): the return is assumed to belong to the `if` body, so a
    non-matching page yields None -- confirm.
    """
    # content = content.encode('utf-8')
    if re.match(' \ { \ { \ Graduation work ', content):
        work_dict = {}
        work_dict[' Title '] = title
        # First (and only used) match: template body and the trailing text.
        template, extra = (re.findall(' \ { \ { Graduation work \n (.*?) \ } \ }(.*) ', content, re.DOTALL))[0]
        # template's key/value pair
        keyval = re.findall(' \ |(.*?) \ =(.*? \n ) ', template, re.DOTALL)
        if extra:
            # Treat the trailing text like one more key/value pair.
            extra = (' Extra ', extra)
            keyval.append(extra)
        # checkkeys = [keyval[i] for i in range(len(keyval)) if keyval[i][0] in mainkeys and len(keyval[i][1])>3] #list mainkeys present, w/ values, in tuples [(key, val),(key, val)...]
        # if len(checkkeys) == 3 : # checkkeys contains all mainkeys and values
        for pair in keyval:
            key = pair[0]
            val = pair[1]
            val = val.replace(' \n ', ' ')
            if ' Creator ' in key:
                val = val.replace(' , ', ' ')
            elif ' Thumbnail ' in key:
                # The original value is still stored under `key` below; the
                # resolved URL goes under its own key.
                thumburl = api_thumb_url(val)
                work_dict[' Thumbnail_url '] = thumburl
                print ' THUMB: ', thumburl
            work_dict[key] = val
        return work_dict, extra
# Alternative to parse_work_page
def parse_work(title, content):
    """Alternative to parse_work_page: always returns a pre-filled dict.

    title   -- stored in the result under the title key.
    content -- page text that is matched against the template patterns.

    The dict starts with a fixed set of keys holding placeholder values.
    When the opening pattern matches at the start of `content`, the text
    after the template is stored under the extra key and every key/value
    pair found in the template body is copied in. For a thumbnail key the
    value is replaced by the result of api_thumb_url().

    The dict is pretty-printed and returned whether or not the pattern
    matched.

    NOTE(review): the pattern and key literals below contain spaces exactly
    as written here; confirm they match the real template syntax and that
    parsed keys are meant to overwrite the pre-filled ones.
    """
    workdict = {' Title ': title, ' Creator ': ' ', ' Date ': ' ', ' Website ': ' ', ' Thumbnail ': ' ', ' Bio ': ' ', ' Description ': ' ', ' Extra ': ' '}
    if re.match(' \ { \ { \ Graduation work ', content):
        # First (and only used) match: template body and the trailing text.
        template, extra = (re.findall(' \ { \ { Graduation work \n (.*?) \ } \ }(.*) ', content, re.DOTALL))[0]
        workdict[' Extra '] = extra  #.encode('utf-8')
        # template's key/value pair
        keyval = re.findall(' \ |(.*?) \ =(.*? \n ) ', template, re.DOTALL)
        for pair in keyval:
            key = pair[0]
            val = (pair[1]).replace(' \n ', ' ')
            if ' Creator ' in key:
                val = val.replace(' , ', ' ')
            elif ' Thumbnail ' in key:
                # Unlike parse_work_page, the stored value itself becomes
                # the thumbnail URL.
                print ' calling API '
                val = api_thumb_url(val)
                print ' THUMB ', val
            workdict[key] = val
    pprint.pprint(workdict)
    return workdict
# Conversion Modules
def pandoc2html(mw_content):
    """Convert MediaWiki markup to HTML5 by piping it through pandoc.

    mw_content -- markup to convert; text is encoded as UTF-8 before it is
                  handed to pandoc.

    Returns pandoc's standard output, or None when `mw_content` is empty.

    The markup is written straight to pandoc's stdin (followed by a newline,
    as `echo` produced). It no longer passes through an `echo "<text>"`
    command line, where any double quote in the markup broke the argument
    splitting.
    """
    if not mw_content:
        return None
    if not isinstance(mw_content, bytes):
        mw_content = mw_content.encode('utf-8')
    # convert from mw to html
    args_pandoc = ['pandoc', '-f', 'mediawiki', '-t', 'html5']
    converter = subprocess.Popen(args_pandoc,
                                 stdin=subprocess.PIPE,
                                 stdout=subprocess.PIPE)
    html = converter.communicate(mw_content + b'\n')[0]
    return html
def pandoc(filename, title, creator, date, website, thumbnail, bio, description, extra, template):
    """Render `extra` (MediaWiki markup) into a standalone HTML file.

    filename    -- output path handed to pandoc's -o option.
    title, creator, date, website, thumbnail, bio
                -- passed to the template as --variable NAME=VALUE.
    description -- accepted for interface compatibility; it is not forwarded
                   to pandoc (the original command never used it either).
    extra       -- markup sent to pandoc on stdin; None is sent as empty.
    template    -- path of the pandoc template (--template).

    Returns None; pandoc writes the result to `filename`.

    The command is built as an argument list instead of formatting and
    re-splitting one command string. Values containing quotes or spaces
    therefore stay intact, and the markup no longer goes through `echo`.
    """
    # NOTE(review): 'website' is passed twice, as in the original command;
    # confirm the repetition is intended.
    variables = [
        ('title', title),
        ('creator', creator),
        ('date', date),
        ('website', website),
        ('website', website),
        ('thumbnail', thumbnail),
        ('bio', bio),
    ]
    args_pandoc = ['pandoc', '-s', '-f', 'mediawiki', '-t', 'html',
                   '--template', template]
    for name, value in variables:
        args_pandoc.extend(['--variable', '%s=%s' % (name, value)])
    args_pandoc.extend(['-o', filename])
    print(args_pandoc)

    markup = '' if extra is None else extra
    if not isinstance(markup, bytes):
        markup = markup.encode('utf-8')
    converter = subprocess.Popen(args_pandoc,
                                 stdin=subprocess.PIPE,
                                 stdout=subprocess.PIPE)
    # Wait for pandoc to finish; its stdout is drained but unused because
    # the document goes to `filename`.
    converter.communicate(markup + b'\n')
    # return html
# pandoc reads its input either from stdin or from an input file
# pandoc DOES NOT convert variables; it has to receive the input from stdin.
# to create the html, convert bio, description and extra one at a time,
# then insert them into the HTML template by:
## generating html in python? and inserting sub elements ?
## ??
# pandoc = 'pandoc -s -f mediawiki -t html5 \
# --template template_article.html \
# --variable title="{title}" \
# --variable section="{section}" \
# --variable topics="{topics}" \
# --variable issueName="{iname}" \
# --variable issueNumber="{inum}" \
# "articles/tmp_content.mw" -o "{articlepath}/{htmlfile}.html"'.format(articlepath=path, title=(pagename).replace("_"," "), section=in_section, topics=in_topic, iname=in_issuename, inum=in_issue, htmlfile=pagename)
# subprocess.call(pandoc, shell=True) # saved in tmp_content.html html
# html = open('tmp_content.html', 'r') #write mediawiki content to html in tmp_content.html
# html = html.read()
# return html
def img_fullurl(parent):
    """Point every <img> below `parent` at the URL reported by the wiki API.

    parent -- ElementTree element that is searched (at any depth) for img
              elements; it is modified in place.

    Each image's current `src` value is passed to api_thumb_url(); when a
    URL comes back it replaces `src`, otherwise the attribute is left
    untouched. Returns None.
    """
    imgs = parent.findall('.//img')
    print('%s %s' % (' len IMG ', len(imgs)))
    for img in imgs:
        src = img.get('src')
        fullurl = api_thumb_url(src)
        print('%s %s %s %s' % (' ----- IMG ', ET.tostring(img), src, fullurl))
        if fullurl is not None:
            img.set('src', fullurl)
        # fileurl = api_request(src, endpoint)# find url of file