@ -3,7 +3,7 @@
import xml . etree . ElementTree as ET
import html5lib , pprint
from mmdc_modules import api_page, pandoc2html, parse_work , replace_gallery , replace_video , index_addwork , write_html_file , mw_cats , mw_imgsurl , site
from mmdc_modules import pandoc2html, parse_work , replace_gallery , replace_video , index_addwork , write_html_file , mw_cats , mw_imgsurl , site , mw_page_text
from argparse import ArgumentParser
from mwclient import Site
@ -15,14 +15,7 @@ args = p.parse_args()
# Echo the parsed command-line arguments (Python 2 print statement).
print args
########
# QUERY API
########
# NOTE(review): sid, useragent and endpoint are assigned here but are not
# referenced again in the visible lines - confirm whether they are still needed.
sid = ' 1234 '
useragent = " Mozilla/5.001 (windows; U; NT4.0; en-US; rv:1.0) Gecko/25250101 "
endpoint = " http://pzwiki.wdka.nl/mw-mediadesign/api.php?format=json& "
########
# CREATE INDEX
# Index
########
# memberpages is the collection iterated further down as `member`;
# presumably the wiki pages selected through the command-line arguments - confirm in mw_cats.
memberpages = mw_cats ( site , args )
#memberpages['Ctrl-F Reader','As We Speak']
@ -37,20 +30,15 @@ index_tree = html5lib.parse(index_file, namespaceHTMLElements=False)
# Look up a div whose class is 'isotope' in index_tree; the result is later
# handed to index_addwork as its `parent` argument.
index_container = index_tree . find ( " .//div[@class= ' isotope ' ] " ) # maybe the id is important, to distinguish it
########
# CREATE PAGE
# Create Page
########
# One iteration per entry of memberpages: fetch the page text, post-process it,
# parse it into workdict and collect the page's image URLs.
for member in memberpages :
print ' member ' , member
# download mw work page
# pageid=member['pageid']
# pagetitle=(member['title'].encode('utf-8'))
# NOTE(review): the next two statements both assign workpage_mw, so the second
# result replaces the first; they look like the old and new side of a change - confirm.
workpage_mw = api_page ( member , ' content ' )
workpage_mw = mw_page_text ( site , member ) # CONTENT
# Pass the page text through the gallery and video replacement helpers, in that order.
workpage_mw = replace_gallery ( workpage_mw )
workpage_mw = replace_video ( workpage_mw )
# NOTE(review): parse_work is called twice with identical arguments; only the comment differs.
workdict = parse_work ( member , workpage_mw ) # create dictionary from workpage_mw template
workdict = parse_work ( member , workpage_mw ) # create dictionary w/ page content
# workpage_imgs is consumed later as a sequence of pairs when rewriting <img> src values.
workpage_imgs = mw_imgsurl ( site , member )
print ' WORKPAGE_IMGS ' , workpage_imgs
# only parse pages whose Creator, Title, Description and Thumbnail are each longer than one character
if len ( workdict [ ' Creator ' ] ) > 1 and len ( workdict [ ' Title ' ] ) > 1 and len ( workdict [ ' Description ' ] ) > 1 and len ( workdict [ ' Thumbnail ' ] ) > 1 :
@ -58,12 +46,9 @@ for member in memberpages:
# Non-empty Extra/Description/Bio values are decoded from UTF-8 and passed
# through pandoc2html (presumably wiki markup to HTML - confirm in mmdc_modules).
if key in [ ' Extra ' , ' Description ' , ' Bio ' ] and workdict [ key ] :
workdict [ key ] = pandoc2html ( ( workdict [ key ] . decode ( ' utf-8 ' ) ) )
elif key in [ ' Creator ' ] :
workdict [ key ] = workdict [ key ] . replace ( ' , ' , ' ' ) # remove comma
# replace empty dict values with ' ' # to avoid empty tags
for key in workdict . keys ( ) :
# NOTE(review): `is` compares object identity, not string equality, so this
# test depends on interning - confirm whether `==` was intended.
if workdict [ key ] is ' ' : # and key is not 'Thumbnail':
workdict [ key ] = ' '
# NOTE(review): same statement as the Creator comma replacement above - the
# duplicate looks like the other side of a diff; confirm which one is current.
workdict [ key ] = workdict [ key ] . replace ( ' , ' , ' ' )
# Encode every value whose type is exactly `unicode` as a UTF-8 byte string.
for key in workdict . keys ( ) :
if type ( workdict [ key ] ) is unicode :
workdict [ key ] = workdict [ key ] . encode ( ' utf-8 ' )
@ -98,16 +83,12 @@ for member in memberpages:
# Point the src of each image at the matching entry from workpage_imgs.
for img in imgs :
img_class = img . get ( ' class ' )
# images whose class is 'template' are left untouched
if img_class != ' template ' :
print ' img_class ' , img_class
src = unicode ( img . get ( ' src ' ) )
print src
# Each pair is used as (pair[0]: text searched for src, pair[1]: replacement src).
# The underscore in src is turned into a space before the substring test.
for pair in workpage_imgs :
if src . replace ( " _ " , " " ) in pair [ 0 ] : # if img in html matches img in workpage_imgs
print ' FOUND IMG ' , pair
if src . replace ( " _ " , " " ) in pair [ 0 ] : # if img in html matches img in workpage_imgs
img . set ( ' src ' , pair [ 1 ] )
# newsrc = api_file_url(src) MOVE FULL URL OPERATION TO MW CONTENT
# if newsrc:
# img.set('src', newsrc)
# save work page
creator = workdict [ ' Creator ' ] # .decode('ascii', 'ignore')
# spaces in the creator name become underscores
creator = creator . replace ( ' ' , ' _ ' )
@ -115,9 +96,8 @@ for member in memberpages:
# Hand page_tree and work_filename to write_html_file
# (presumably serialises the tree to that path - confirm in mmdc_modules).
write_html_file ( page_tree , work_filename )
#######
# INDEX
# Insert Work to Index
#######
# insert work to index
index_addwork ( parent = index_container ,
workid = key ,
href = work_filename . replace ( ' web/ ' , ' ' ) ,