@ -41,6 +41,60 @@ def api_page(pageid, query):
response = api_request ( " action=query&titles=File: {} &prop=imageinfo&iiprop=url&iiurlwidth=500 " , pagename ) # iiurlwidht dermines with of thumbnail
return response
##############################
# CATEGORIES AND PAGES
################
# * MUST BE REPLACED BY SMARTER CODE (USING PY MD LIB)
##############################
def api_pagecategories(pageid):
    '''Return the titles of the categories that page `pageid` belongs to.

    Sends an ``action=query&prop=categories`` request to the module-level
    ``endpoint`` and collects the ``title`` of every category entry in the
    response.

    pageid -- id of the wiki page to look up.

    Returns a list of category titles encoded as UTF-8 (the ``Category:``
    prefix is kept). The list is empty when the response carries no
    ``categories`` entry for the page.

    Raises whatever ``urllib2.urlopen`` / ``json.loads`` raise on transport
    or decoding failures, and KeyError if the response has no ``query`` /
    ``pages`` section.
    '''
    query = 'action=query&pageids={}&prop=categories'.format(pageid)
    response = urllib2.urlopen(endpoint + query)
    try:
        payload = json.loads(response.read())
    finally:
        # Always release the connection, even if reading/decoding fails
        response.close()

    pages = payload['query']['pages']
    # The result is keyed by page id; a single page was requested, so take
    # the first (only) value instead of indexing keys()
    page = next(iter(pages.values()), {})
    # 'categories' is absent for uncategorised pages: treat that as "none"
    categories = page.get('categories', [])
    return [entry['title'].encode('utf-8') for entry in categories]
def api_pagesincategories(category, year):
    '''Return the members of `category`, optionally restricted to `year`.

    Requests ``list=categorymembers`` for ``Category:<category>`` from the
    module-level ``endpoint``. When `year` is truthy, each member's own
    categories are fetched with ``api_pagecategories`` and only members that
    are also in ``Category:<year>`` are kept (one extra API request per
    member). When `year` is falsy, all members are returned unfiltered.

    category -- category name without the ``Category:`` prefix; spaces are
                replaced by underscores before building the URL.
    year     -- name of the year category to intersect with, or a falsy
                value to skip the filtering.

    Returns a list of the ``categorymembers`` entries from the API response
    (dicts; ``pageid`` is the key read here).
    '''
    category = category.replace(' ', '_')
    query = 'action=query&list=categorymembers&cmlimit=1000&cmtitle=Category:{}'
    response = urllib2.urlopen(endpoint + query.format(category))
    try:
        payload = json.loads(response.read())
    finally:
        # Always release the connection, even if reading/decoding fails
        response.close()

    members = payload['query']['categorymembers']
    if not year:
        # No year filter requested: every member of the category qualifies
        return members

    year_category = 'Category:{}'.format(year)
    matching = []
    for member in members:
        page_cats = api_pagecategories(member['pageid'])
        if year_category in page_cats:
            # Single-argument form prints identically on Python 2 and 3
            print('{} in {}'.format(year, page_cats))
            matching.append(member)
    return matching
# for page in intersectCatMembers:
# title = ((page['title']).encode('utf-8') ).replace(" ", "_") #snakecase for page titles
# pageid = page['pageid']
# article = api_page(pageid, 'content')
# # print title
# # pprint.pprint(article)
# work = parse_work_page(title, article)
# if work:
# allworks[pageid] = work #dictionary(allworks) entry
# print pprint.pprint( work )
# # Create work page
# else:
# print 'WORK DOES NOT CONTAIN REQUIRED CONTENT'
# print '-------------'
# print
def api_file_url ( filename ) : # get full urls
page_content_dict = api_page ( filename , ' file ' )
if ' imageinfo ' in page_content_dict . keys ( ) :
@ -60,7 +114,7 @@ def api_thumb_url(filename):
# http://pzwiki.wdka.nl/mw-mediadesign/api.php?format=json&action=query&pageids=10603&prop=revisions&rvprop=content
# http://pzwiki.wdka.nl/mw-mediadesign/api.php?format=json&action=query&titles=Graduation_Website_Braindump&prop=revisions&rvprop=content
# XML MODULES
# PROCESSING MODULES
def write_html_file ( html_tree , filename ) :
doctype = " <!DOCTYPE HTML> "
@ -77,11 +131,13 @@ def parse_work_page(title, content):
work_dict = { }
work_dict [ ' Title ' ] = title
template , extra = ( re . findall ( ' \ { \ { Graduation work \n (.*?) \ } \ }(.*) ' , content , re . DOTALL ) ) [ 0 ]
print ' template ' , template ,
print ' extra ' , extra
# template's key/value pair
keyval = re . findall ( ' \ |(.*?) \ =(.*? \n ) ' , template , re . DOTALL )
if extra :
extra = ( ' Extra ' , extra )
keyval . append ( extra )
keyval . append ( extra ) #?
# checkkeys = [keyval[i] for i in range(len(keyval)) if keyval[i][0] in mainkeys and len(keyval[i][1])>3] #list mainkeys present, w/ values, in tuples [(key, val),(key, val)...]
# if len(checkkeys) == 3 : # checkkeys contains all mainkeys and values
@ -106,7 +162,7 @@ def parse_work(title, content):
if re . match ( ' \ { \ { \ Graduation work ' , content ) :
template , extra = ( re . findall ( ' \ { \ { Graduation work \n (.*?) \ } \ }(.*) ' , content , re . DOTALL ) ) [ 0 ]
workdict [ ' Extra ' ] = extra #.encode('utf-8' )
workdict [ ' Extra ' ] = extra . encode ( ' utf-8 ' )
# template's key/value pair
keyval = re . findall ( ' \ |(.*?) \ =(.*? \n ) ' , template , re . DOTALL )
for pair in keyval :
@ -118,7 +174,7 @@ def parse_work(title, content):
print ' calling API '
val = api_thumb_url ( val )
print ' THUMB ' , val
workdict [ key ] = val
workdict [ key ] = val . encode ( ' utf-8 ' )
pprint . pprint ( workdict )
return workdict
@ -128,7 +184,7 @@ def parse_work(title, content):
# Conversion Modules
def pandoc2html ( mw_content ) :
if mw_content :
mw_content = mw_content #.encode('utf-8' )
mw_content = mw_content . encode ( ' utf-8 ' )
# convert from mw to html
args_echo = shlex . split ( ( ' echo " {} " ' . format ( mw_content ) ) )
args_pandoc = shlex . split ( ' pandoc -f mediawiki -t html5 ' )