#! /usr/bin/env python
# -*- coding: utf-8 -*-
import urllib2 , json , pprint , re
import xml . etree . ElementTree as ET
import subprocess , shlex , urllib
# Identifier constant; nothing in the visible part of this file reads it.
sid = ' 1234 '
# Browser-style User-Agent text; the request helpers visible here do not send it.
useragent = " Mozilla/5.001 (windows; U; NT4.0; en-US; rv:1.0) Gecko/25250101 "
# Base URL of the wiki API; the helpers below append their query string to it.
endpoint = " http://pzwiki.wdka.nl/mw-mediadesign/api.php?format=json& "
# API MODULES
def api_request(action, pagename):
    '''Query the wiki API and return the entry of the first page in the reply.

    action   -- query-string template containing one {} placeholder.
    pagename -- value substituted into the placeholder (the callers in this
                file pass either a page id or a file title).

    The request URL is endpoint + the formatted query.  The reply is decoded
    as JSON and the value stored under the first key of its query/pages
    mapping is returned.  Progress lines are printed to stdout.
    '''
    print(' API REQUEST ')
    print(pagename)
    query = action.format(pagename)
    print('%s %s' % (' TEST ', query))
    url = endpoint + query
    print('%s %s' % (' API REQUEST ', url))
    response = urllib2.urlopen(url)
    try:
        payload = json.loads(response.read())
    finally:
        # Release the connection even when reading or decoding fails.
        response.close()
    # NOTE(review): the key literals below include surrounding spaces exactly
    # as written in this file -- confirm they match the keys of the API reply.
    pages = payload.get(' query ').get(' pages ')
    # Only the first entry of the pages mapping is used.
    first_id = list(pages)[0]
    return pages.get(first_id)
def api_page(pageid, query):
    '''Fetch one aspect of a wiki page through api_request.

    pageid -- page id; for the file and imageinfo queries it is a file
              title, which the query string prefixes with "File:".
    query  -- selects the request; must equal one of the literals compared
              below.

    Returns the text of the first revision for the content query, and the
    page entry returned by api_request for every other query.
    Raises ValueError when query matches none of the known values.
    '''
    if query == ' content ':
        api_response = api_request(' action=query&pageids= {} &prop=revisions&rvprop=content ', pageid)
        first_revision = api_response.get(' revisions ')[0]
        response = first_revision[' * ']
    elif query == ' metadata ':
        response = api_request(' action=query&pageids= {} &prop=info ', pageid)
    elif query == ' articleimgs ':
        response = api_request(' action=query&pageids= {} &prop=images ', pageid)
    elif query == ' file ':
        response = api_request(' action=query&titles=File: {} &prop=imageinfo&iiprop=url ', pageid)
    elif query == ' imageinfo ':
        # imageinfo is looked up by file title, not by page id
        pagename = pageid
        print('%s %s' % (' IMAGEINFO ', pagename))
        # iiurlwidth sets the width of the thumbnail the API returns
        response = api_request(" action=query&titles=File: {} &prop=imageinfo&iiprop=url&iiurlwidth=500 ", pagename)
    else:
        # Fail with a clear message instead of an unbound-local error.
        raise ValueError('unknown query: {!r}'.format(query))
    return response
##############################
# CATEGORIES AND PAGES
################
# * MUST BE REPLACED BY SMARTER CODE (USING PY MD LIB)
##############################
def api_pagecategories(pageid):
    '''Return the category titles of a page as a list of encoded strings.

    pageid -- id of the page to look up.

    Sends a prop=categories query to endpoint and reads the first entry of
    the reply's query/pages mapping.  A page entry that carries no
    categories list yields an empty list.
    '''
    query = ' action=query&pageids= {} &prop=categories '.format(pageid)
    response = urllib2.urlopen(endpoint + query)
    try:
        payload = json.loads(response.read())
    finally:
        # Release the connection even when reading or decoding fails.
        response.close()
    # NOTE(review): the key literals below include surrounding spaces exactly
    # as written in this file -- confirm they match the keys of the API reply.
    pages = payload[' query '][' pages ']
    first_id = list(pages)[0]
    # Tolerate a page entry that has no categories list.
    entries = pages[first_id].get(u' categories ', [])
    return [entry[u' title '].encode(' utf-8 ') for entry in entries]
def api_pagesincategories(category, year):
    '''List the members of a category, optionally filtered by a year category.

    category -- category name; its spaces are substituted before the request.
    year     -- when truthy, only members whose own categories include the
                formatted year category are kept; when falsy, every member
                is returned.

    Returns a list of member entries as delivered by the categorymembers
    query.  With a year filter, one extra api_pagecategories request is made
    per member.
    '''
    category = category.replace(' ', ' _ ')
    url = endpoint + ' action=query&list=categorymembers&cmlimit=1000&cmtitle=Category: {} '.format(category)
    response = urllib2.urlopen(url)
    try:
        payload = json.loads(response.read())
    finally:
        # Release the connection even when reading or decoding fails.
        response.close()
    members = payload[' query '][' categorymembers ']
    if not year:
        # No filter requested: hand back the full member list.
        return members
    wanted = ' Category: {} '.format(year)
    matching = []
    for member in members:
        page_cats = api_pagecategories(member[' pageid '])
        if wanted in page_cats:
            print('%s %s %s' % (year, ' in ', page_cats))
            matching.append(member)
    return matching
# for page in intersectCatMembers:
# title = ((page['title']).encode('utf-8') ).replace(" ", "_") #snakecase for page titles
# pageid = page['pageid']
# article = api_page(pageid, 'content')
# # print title
# # pprint.pprint(article)
# work = parse_work_page(title, article)
# if work:
# allworks[pageid] = work #dictionary(allworks) entry
# print pprint.pprint( work )
# # Create work page
# else:
# print 'WORK DOES NOT CONTAIN REQUIRED CONTENT'
# print '-------------'
# print
def api_file_url(filename):
    '''Return the url field of a file's first imageinfo entry.

    filename -- file title passed to the file query of api_page.

    Returns None when the page entry has no imageinfo key.
    '''
    page = api_page(filename, ' file ')
    if ' imageinfo ' not in page:
        return None
    first_info = page.get(' imageinfo ')[0]
    return first_info.get(' url ')
def api_thumb_url(filename):
    '''Return the thumburl field of a file's first imageinfo entry.

    filename -- file title passed to the imageinfo query of api_page.

    Returns None when the page entry has no imageinfo data, or when the
    first imageinfo entry has no thumburl field.  Prints a status line and,
    when an entry is found, the resulting value.
    '''
    print(''' get thumbnail url of image ''')
    page = api_page(filename, ' imageinfo ')
    infos = page.get(' imageinfo ')
    if not infos:
        # No image data for this title: report None, as api_file_url does.
        return None
    thumburl = infos[0].get(' thumburl ')
    print(thumburl)
    return thumburl
# http://pzwiki.wdka.nl/mw-mediadesign/api.php?format=json&action=query&titles=File:2x2 905.jpg&prop=imageinfo&iiprop=url&iiurlwidth=300
# http://pzwiki.wdka.nl/mw-mediadesign/api.php?format=json&action=query&pageids=10603&prop=revisions&rvprop=content
# http://pzwiki.wdka.nl/mw-mediadesign/api.php?format=json&action=query&titles=Graduation_Website_Braindump&prop=revisions&rvprop=content
# PROCESSING MODULES
def write_html_file(html_tree, filename):
    '''Serialize an element tree as HTML and write it to a file.

    html_tree -- element handed to ET.tostring with the html method.
    filename  -- path of the file to write; it is opened for writing.

    The serialized markup is prefixed with a doctype line.
    '''
    doctype = " <!DOCTYPE HTML> "
    html = doctype + ET.tostring(html_tree, encoding=' utf-8 ', method=' html ')
    # The with-block closes the file even if the write fails.
    with open(filename, ' w ') as out:
        out.write(html)
# mw article modules
def parse_work_page(title, content):
    '''Parse a "Graduation work" template page into a dictionary.

    title   -- stored in the result under the title key.
    content -- page text; it must match the opening-template pattern.

    Returns None when content does not match.  Otherwise returns a tuple
    (work_dict, extra): work_dict maps each template key to its value, and
    extra is the text after the template -- wrapped as a (key, text) pair
    when that text is non-empty.  A key containing the thumbnail marker also
    triggers an api_thumb_url lookup whose result is stored separately.
    '''
    # NOTE(review): the patterns and keys below include spaces exactly as
    # written in this file -- confirm they match the wiki markup.
    if not re.match(' \ { \ { \ Graduation work ', content):
        return None
    work_dict = {' Title ': title}
    matches = re.findall(' \ { \ { Graduation work \n (.*?) \ } \ }(.*) ', content, re.DOTALL)
    template, extra = matches[0]
    # Key/value pairs of the template body.
    keyval = re.findall(' \ |(.*?) \ =(.*? \n ) ', template, re.DOTALL)
    if extra:
        # Trailing text is processed like one more key/value pair.
        extra = (' Extra ', extra)
        keyval.append(extra)
    for key, val in keyval:
        val = val.replace(' \n ', ' ')
        if ' Creator ' in key:
            val = val.replace(' , ', ' ')
        elif ' Thumbnail ' in key:
            thumburl = api_thumb_url(val)
            work_dict[' Thumbnail_url '] = thumburl
            print('%s %s' % (' THUMB: ', thumburl))
        work_dict[key] = val
    return work_dict, extra
# Alternative to parse_work_page
def parse_work(title, content):
    '''Parse a "Graduation work" template page into a dictionary of fields.

    title   -- stored in the result under the title key.
    content -- page text.

    Always returns a dictionary pre-filled with the eight default entries.
    When content matches the opening-template pattern, each template key is
    stored with its encoded value, and any text after the template is stored
    under the extra key.  A thumbnail value is replaced by the result of
    api_thumb_url; when that lookup returns None the pair is skipped, so the
    pre-filled default stays in place.
    '''
    workdict = {
        ' Title ': title,
        ' Creator ': ' ',
        ' Date ': ' ',
        ' Website ': ' ',
        ' Thumbnail ': ' ',
        ' Bio ': ' ',
        ' Description ': ' ',
        ' Extra ': ' ',
    }
    # NOTE(review): the patterns and keys in this function include spaces
    # exactly as written in this file -- confirm they match the wiki markup.
    if not re.match(' \ { \ { \ Graduation work ', content):
        return workdict
    matches = re.findall(' \ { \ { Graduation work \n (.*?) \ } \ }(.*) ', content, re.DOTALL)
    template, extra = matches[0]
    if extra:
        workdict[' Extra '] = extra.encode(' utf-8 ')
    # Key/value pairs of the template body; the trailing text is not among them.
    keyval = re.findall(' \ |(.*?) \ =(.*? \n ) ', template, re.DOTALL)
    for key, val in keyval:
        val = val.replace(' \n ', ' ')
        if ' Creator ' in key:
            val = val.replace(' , ', ' ')
        elif ' Thumbnail ' in key:
            val = api_thumb_url(val)
            if val is None:
                # No thumbnail URL available: None cannot be encoded, so
                # skip this pair instead of failing on it.
                continue
        elif ' Website ' in key:
            val = urllib.unquote(val)
        workdict[key] = val.encode(' utf-8 ')
    return workdict
# Conversion Modules
def pandoc2html(mw_content):
    '''Convert a piece of MediaWiki markup to HTML5 with pandoc.

    mw_content -- markup text; it is encoded before being sent to pandoc.

    Returns whatever pandoc writes to its standard output.

    The markup is written to pandoc's standard input directly instead of
    being embedded in an echo command line, so quotes and backslashes in the
    markup reach pandoc unchanged and no second process is left behind.
    '''
    mw_content = mw_content.encode(' utf-8 ')
    args_pandoc = shlex.split(' pandoc -f mediawiki -t html5 ')
    pandoc = subprocess.Popen(args_pandoc, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
    # A newline terminates the input, as echo did for the original pipeline.
    html = pandoc.communicate(mw_content + b'\n')[0]
    return html
def img_fullurl(parent):
    '''Rewrite the src of the img elements found under parent, in place.

    parent -- element whose findall method is used to locate the images.

    Each src value is passed to api_thumb_url; the attribute is replaced
    only when that lookup returns something other than None.  Returns None.
    '''
    for img in parent.findall(' .//img '):
        resolved = api_thumb_url(img.get(' src '))
        if resolved is not None:
            img.set(' src ', resolved)
# Body of a gallery element; re.S lets the body span several lines.
gallery_exp = re.compile(' <gallery>(.*?)</gallery> ', re.S)
# A File: reference followed by one of the listed image extensions.
imgfile_exp = re.compile(' (File:(.*?) \ .(gif|jpg|jpeg|png)) ')


def replace_gallery(content):
    '''Rewrite gallery markup in content and return the new text.

    First every imgfile_exp match is wrapped in double square brackets,
    then every gallery_exp match is reduced to its captured body.
    '''
    linked = imgfile_exp.sub(' [[ \ g<1>]] ', content)
    return gallery_exp.sub(' \ g<1> ', linked)
# Generic two-part template pattern; replace_video does not use it.
video_exp = re.compile(' \ { \ { (.*?) \ |(.*?) \ } \ } ')
# Template patterns whose single captured group is the video identifier.
vimeo_exp = re.compile(' \ { \ { vimeo \ |(.*?) \ } \ } ')
youtube_exp = re.compile(' \ { \ { youtube \ |(.*?) \ } \ } ')


def replace_video(content):
    '''Replace vimeo and youtube templates in content with iframe markup.

    The vimeo pattern is substituted first, then the youtube pattern; the
    captured identifier is inserted into the iframe source address.
    Returns the rewritten text.
    '''
    with_vimeo = vimeo_exp.sub(" <iframe src= ' https://player.vimeo.com/video/ \ g<1> ' width= ' 600px ' height= ' 450px ' > </iframe> ", content)
    return youtube_exp.sub(" <iframe src= ' https://www.youtube.com/embed/ \ g<1> ' width= ' 600px ' height= ' 450px ' > </iframe> ", with_vimeo)