#! /usr/bin/env python
# -*- coding: utf-8 -*-
import urllib2 , json , pprint , re
import xml . etree . ElementTree as ET
import subprocess , shlex
# Session id placeholder; no code visible in this section reads it.
# NOTE(review): the surrounding spaces are kept exactly as found -- confirm
# whether any consumer depends on them.
sid = ' 1234 '
# Browser-like User-Agent string; no code visible in this section reads it.
useragent = " Mozilla/5.001 (windows; U; NT4.0; en-US; rv:1.0) Gecko/25250101 "
# Base URL of the wiki API.  It already ends in "format=json&", so callers
# only append the rest of the query string.  It must not contain whitespace:
# api_request() builds the request URL by plain concatenation, so a trailing
# space here would end up in the middle of the URL.
endpoint = "http://pzwiki.wdka.nl/mw-mediadesign/api.php?format=json&"
# API MODULES
def _quote_query_value(value):
    """Percent-encode *value* so it can be placed inside a URL query string.

    Non-string values (e.g. integer page ids) are converted to text first.
    '%' is left untouched so a name that is already percent-encoded is not
    encoded a second time.
    """
    try:
        from urllib import quote  # Python 2
    except ImportError:
        from urllib.parse import quote  # Python 3
    if not isinstance(value, (bytes, str)):
        try:
            value = value.encode('utf-8')  # Python 2 unicode objects
        except AttributeError:
            value = str(value)  # numbers and other non-text values
    return quote(value, safe='/%')


def api_request(action, pagename):
    """Query the wiki API and return the entry of the first page in the reply.

    action   -- query-string template with one ``{}`` placeholder
    pagename -- page id or title inserted (percent-encoded) into the template

    Returns the dict found under ``query -> pages -> <first page id>`` of the
    JSON reply, or an empty dict when the reply has no such section.
    Errors raised by ``urllib2.urlopen`` and ``json.loads`` propagate.
    """
    print(' API REQUEST ')
    print(pagename)
    query_string = action.format(_quote_query_value(pagename))
    print('%s %s' % (' TEST ', query_string))
    # strip() tolerates stray whitespace around the configured endpoint.
    url = endpoint.strip() + query_string
    print('%s %s' % (' API REQUEST ', url))
    response = urllib2.urlopen(url)
    try:
        payload = json.loads(response.read())
    finally:
        # Always release the HTTP connection, even if decoding fails.
        response.close()
    pages = (payload.get('query') or {}).get('pages') or {}
    if not pages:
        return {}
    # The reply maps page ids to page entries; only the first one is used.
    first_id = next(iter(pages))
    return pages[first_id]
def api_page(pageid, query):
    """Fetch one aspect of a wiki page through api_request().

    pageid -- page id; for the 'file' and 'imageinfo' queries this is the
              file title instead, because those queries look pages up by
              title
    query  -- one of 'content', 'metadata', 'articleimgs', 'file' or
              'imageinfo'; surrounding whitespace is ignored

    Returns the wikitext of the first revision for 'content', otherwise the
    page entry returned by api_request().
    Raises ValueError for an unknown *query*.
    """
    templates = {
        'content': 'action=query&pageids={}&prop=revisions&rvprop=content',
        'metadata': 'action=query&pageids={}&prop=info',
        'articleimgs': 'action=query&pageids={}&prop=images',
        'file': 'action=query&titles=File:{}&prop=imageinfo&iiprop=url',
        # iiurlwidth determines the width of the thumbnail
        'imageinfo': ('action=query&titles=File:{}&prop=imageinfo'
                      '&iiprop=url&iiurlwidth=500'),
    }
    kind = query.strip() if hasattr(query, 'strip') else query
    if kind not in templates:
        # Previously an unknown query fell through every branch and failed
        # on an unassigned local variable.
        raise ValueError('unknown api_page query: %r' % (query,))
    if kind == 'imageinfo':
        print('%s %s' % (' IMAGEINFO ', pageid))
    page = api_request(templates[kind], pageid)
    if kind == 'content':
        # The wikitext sits under the '*' key of the first revision.
        return page.get('revisions')[0]['*']
    return page
def api_file_url(filename):
    """Return the full URL of the wiki file *filename*, or None.

    None is returned when the API reply carries no image information for
    the file.
    """
    # The selector literal is kept exactly as the original call spelled it.
    page = api_page(filename, ' file ')
    image_info = (page or {}).get('imageinfo')
    if not image_info:
        return None
    return image_info[0].get('url')
def api_thumb_url(filename):
    """Return the thumbnail URL of the wiki file *filename*, or None.

    None is returned when the API reply carries no image information for
    the file, instead of failing while indexing a missing list.
    """
    print(''' get thumbnail url of image ''')
    # The selector literal is kept exactly as the original call spelled it.
    page = api_page(filename, ' imageinfo ')
    image_info = (page or {}).get('imageinfo')
    thumburl = image_info[0].get('thumburl') if image_info else None
    print(thumburl)
    return thumburl
# http://pzwiki.wdka.nl/mw-mediadesign/api.php?format=json&action=query&titles=File:2x2 905.jpg&prop=imageinfo&iiprop=url&iiurlwidth=300
# http://pzwiki.wdka.nl/mw-mediadesign/api.php?format=json&action=query&pageids=10603&prop=revisions&rvprop=content
# http://pzwiki.wdka.nl/mw-mediadesign/api.php?format=json&action=query&titles=Graduation_Website_Braindump&prop=revisions&rvprop=content
# XML MODULES
def write_html_file(html_tree, filename):
    """Serialise *html_tree* as HTML and write it to *filename*.

    html_tree -- ElementTree element that is the root of the document
    filename  -- path of the file to create or overwrite

    The output is the doctype line followed by the UTF-8 encoded tree.
    """
    doctype = b" <!DOCTYPE HTML> "
    markup = ET.tostring(html_tree, encoding='utf-8', method='html')
    # tostring() returns encoded bytes here, so the file is written in binary
    # mode; the with-block closes it even when the write fails.
    with open(filename, 'wb') as out:
        out.write(doctype + markup)
# mw article modules
def parse_work_page(title, content):
    """Parse a page that starts with a ``{{Graduation work`` template.

    title   -- page title, stored under the ' Title ' key
    content -- wikitext of the page

    Returns ``(work_dict, extra)``.  work_dict maps each template key to its
    value with newlines removed.  extra is the text after the template: an
    empty string when there is none, otherwise the tuple
    ``(' Extra ', text)``, whose text is also stored in work_dict without
    newlines.  When the page does not hold the template, work_dict contains
    only the title and extra is ''.

    NOTE(review): the ' Title ', ' Thumbnail_url ' and ' Extra ' keys keep
    the surrounding spaces of the original literals -- confirm with callers.
    """
    work_dict = {' Title ': title}
    extra = ''
    if not re.match(r'\{\{Graduation work', content):
        return work_dict, extra
    found = re.search(r'\{\{Graduation work\n(.*?)\}\}(.*)', content,
                      re.DOTALL)
    if found is None:
        # The prefix matched but the template body did not; treat the page
        # as having no template instead of indexing an empty result list.
        return work_dict, extra
    template, extra = found.groups()
    # key/value pairs of the template: "|key=value<newline>"
    keyval = re.findall(r'\|(.*?)=(.*?\n)', template, re.DOTALL)
    if extra:
        extra = (' Extra ', extra)
        keyval.append(extra)
    for key, val in keyval:
        # Every captured value ends with the newline that closed its line.
        val = val.replace('\n', '')
        if 'Creator' in key:
            val = val.replace(' , ', ' ')
        elif 'Thumbnail' in key:
            thumburl = api_thumb_url(val)
            work_dict[' Thumbnail_url '] = thumburl
            print('%s %s' % (' THUMB: ', thumburl))
        work_dict[key] = val
    return work_dict, extra
# Alternative to parse_work_page
def parse_work(title, content):
    """Parse a ``{{Graduation work`` page into a dict with default entries.

    title   -- page title, stored under the ' Title ' key
    content -- wikitext of the page

    Returns a dict that always holds the default keys.  When the page starts
    with the template, each template key/value pair is added (values without
    newlines), the text after the template is stored under ' Extra ', and the
    value of a key containing 'Thumbnail' is replaced by the result of
    api_thumb_url().  The dict is pretty-printed before it is returned.

    NOTE(review): the default keys and values keep the spaces of the original
    literals, so a template key written without them is stored as a separate
    entry -- confirm with callers.
    """
    workdict = {' Title ': title, ' Creator ': ' ', ' Date ': ' ',
                ' Website ': ' ', ' Thumbnail ': ' ', ' Bio ': ' ',
                ' Description ': ' ', ' Extra ': ' '}
    found = None
    if re.match(r'\{\{Graduation work', content):
        # None here means the prefix matched but the template body did not;
        # the defaults are then returned instead of indexing an empty list.
        found = re.search(r'\{\{Graduation work\n(.*?)\}\}(.*)', content,
                          re.DOTALL)
    if found is not None:
        template, extra = found.groups()
        workdict[' Extra '] = extra
        # key/value pairs of the template: "|key=value<newline>"
        for key, val in re.findall(r'\|(.*?)=(.*?\n)', template, re.DOTALL):
            # Every captured value ends with the newline closing its line.
            val = val.replace('\n', '')
            if 'Creator' in key:
                val = val.replace(' , ', ' ')
            elif 'Thumbnail' in key:
                print(' calling API ')
                val = api_thumb_url(val)
                print('%s %s' % (' THUMB ', val))
            workdict[key] = val
    pprint.pprint(workdict)
    return workdict
# Conversion Modules
def pandoc2html(mw_content):
    """Convert MediaWiki markup to HTML5 with the external ``pandoc`` program.

    mw_content -- wikitext to convert

    Returns pandoc's standard output, or None when *mw_content* is empty.
    """
    if not mw_content:
        return None
    if isinstance(mw_content, bytes):
        source = mw_content
    else:
        source = mw_content.encode('utf-8')
    # The markup is written straight to pandoc's stdin.  Splicing it into an
    # ``echo "..."`` command and re-splitting that with shlex broke on
    # content containing quotes or backslashes.
    converter = subprocess.Popen(['pandoc', '-f', 'mediawiki', '-t', 'html5'],
                                 stdin=subprocess.PIPE,
                                 stdout=subprocess.PIPE)
    # The trailing newline mirrors what ``echo`` used to append.
    html = converter.communicate(source + b'\n')[0]
    return html
def pandoc(filename, title, creator, date, website, thumbnail, bio,
           description, extra, template):
    """Render *extra* (MediaWiki markup) to a standalone HTML file.

    Runs the external ``pandoc`` program with *template* as the output
    template, passes title, creator, date, website, thumbnail and bio as
    template variables, and writes the result to *filename*.  Returns None.

    *description* is accepted but not forwarded to pandoc; the original
    command line did not use it either.
    """
    # Build the argument list directly.  The values used to be formatted
    # into one command string and re-split with shlex, which failed on the
    # ``{ template }`` placeholder and mangled values holding quotes or
    # spaces.
    args_pandoc = ['pandoc', '-s', '-f', 'mediawiki', '-t', 'html',
                   '--template', '{}'.format(template)]
    # NOTE(review): 'website' is passed twice, exactly as in the original
    # command line -- confirm whether the template relies on that.
    variables = (('title', title), ('creator', creator), ('date', date),
                 ('website', website), ('website', website),
                 ('thumbnail', thumbnail), ('bio', bio))
    for name, value in variables:
        args_pandoc.extend(['--variable', '{}={}'.format(name, value)])
    args_pandoc.extend(['-o', '{}'.format(filename)])
    print(args_pandoc)
    if isinstance(extra, bytes):
        source = extra
    elif hasattr(extra, 'encode'):
        source = extra.encode('utf-8')
    else:
        source = str(extra).encode('utf-8')
    converter = subprocess.Popen(args_pandoc, stdin=subprocess.PIPE,
                                 stdout=subprocess.PIPE)
    # The trailing newline mirrors what ``echo`` used to append.
    converter.communicate(source + b'\n')
def img_fullurl(parent):
    """Point every ``<img>`` below *parent* at its wiki thumbnail URL.

    For each img element the ``src`` value is looked up with
    api_thumb_url(); when a URL is found it replaces the ``src`` attribute
    in place.  Images without a ``src`` attribute are left untouched and no
    lookup is made for them.  Returns None.
    """
    imgs = parent.findall('.//img')
    print('%s %s' % (' len IMG ', len(imgs)))
    for img in imgs:
        src = img.get('src')
        fullurl = api_thumb_url(src) if src is not None else None
        print('%s %s %s %s' % (' ----- IMG ', ET.tostring(img), src, fullurl))
        if fullurl is not None:
            img.set('src', fullurl)
def replace_gallery(content):
    """Cut gallery markup out of *content* and resolve its image URLs.

    Every match of ``gallery_exp`` is collected and then replaced in
    *content* by the literal passed to ``re.sub`` below.  Inside each
    collected match, every ``img_exp`` hit contributes the result of
    api_file_url() for its second capture group.

    Returns ``(content_without_galleries, list_of_image_urls)``.

    NOTE(review): the original trailing comment said galleries are replaced
    with '' while the code passes ' ' -- confirm which one is intended.
    """
    galleries = re.findall(gallery_exp, content)
    content = re.sub(gallery_exp, ' ', content)
    gallery_imgs = []
    for markup in galleries:  # a page may hold more than one <gallery>
        for groups in re.findall(img_exp, markup):
            filename = groups[1]
            # look up the URL of the original image
            gallery_imgs.append(api_file_url(filename))
    print('%s %s' % (' gallery_imgs ', gallery_imgs))
    return content, gallery_imgs
def replace_video ( content ) :
videos = [ ]
videos_found = re . findall ( video_exp , content )
for video in videos_found :
video_provider = str ( video [ 0 ] )
video_hash = str ( video [ 1 ] )
video_src = None
if ( video_provider . lower ( ) ) == ' youtube ' :
video_src = " https://www.youtube.com/embed/ " + video_hash
elif ( video_provider . lower ( ) ) == ' vimeo ' :
video_src = " https://player.vimeo.com/video/ " + video_hash
if video_src :
videos . append ( video_src )
iframe = " <iframe src= ' {} ' width= ' 600px ' height= ' 450px ' ></iframe> " . format ( video_src )
# content = re.sub(video_exp, ' iframe ', content)
else :
content = re . sub ( video_exp , ' ' , content )