@ -77,6 +77,7 @@ def find_imgs(article):
return thumbs_list
return thumbs_list
def parse_work_page ( title , content ) :
def parse_work_page ( title , content ) :
content = content . encode ( ' utf-8 ' )
content = content . encode ( ' utf-8 ' )
if re . match ( ' \ { \ { \ Graduation work ' , content ) :
if re . match ( ' \ { \ { \ Graduation work ' , content ) :
@ -87,30 +88,21 @@ def parse_work_page(title, content):
keyval = re . findall ( ' \ |(.*?) \ =(.*? \n ) ' , template , re . DOTALL )
keyval = re . findall ( ' \ |(.*?) \ =(.*? \n ) ' , template , re . DOTALL )
extra = ( ' Extra ' , extra )
extra = ( ' Extra ' , extra )
keyval . append ( extra )
keyval . append ( extra )
for pair in keyval :
keys = [ keyval [ i ] [ 0 ] for i in range ( len ( keyval ) ) ]
key = pair [ 0 ]
#checkkeys: list of mainkeys present, w/ values, in tuples [(key, val),(key, val)...]
val = pair [ 1 ]
checkkeys = [ keyval [ i ] for i in range ( len ( keyval ) ) if keyval [ i ] [ 0 ] in mainkeys and len ( keyval [ i ] [ 1 ] ) > 3 ]
val = val . replace ( ' \n ' , ' ' )
if len ( checkkeys ) == 3 : # checkkeys contains all mainkeys and values
if ' Creator ' in key :
for pair in keyval :
val = val . replace ( ' , ' , ' ' )
key = pair [ 0 ]
work_dict [ key ] = val
val = pair [ 1 ]
val = val . replace ( ' \n ' , ' ' )
return work_dict
if ' Creator ' in key :
val = val . replace ( ' , ' , ' ' )
'''
work_dict [ key ] = val
TEMPLATE
return work_dict
| Description =
| Creator =
| Date =
| Thumbnail =
| Website =
Description = Based on her written thesis : The Web Cheated on Me , Marie is trying to figure out where her disappointment with the web comes from . She analyzed her webbrowser history for half a year to find out what kind of information she is looking up . Her graduation work is an audio installation based on this research . \n | Creator = Marie Wocher , \n | Date = 2013 \n | Thumbnail = 4 FromHypertextToApplePie . jpg \n
'''
def api_category ( category , year ) :
def api_category ( category , year ) :
''' Finds all pages within category and returns a dictionary with info on those pages '''
''' Finds all pages within category and eact to allworks dictionary '''
category = category . replace ( ' ' , ' _ ' )
category = category . replace ( ' ' , ' _ ' )
if year :
if year :
api_url = endpoint + ' action=query&list=categorymembers&cmlimit=500&cmtitle=Category: {} &cmtitle=Category: {} ' . format ( category , year ) #BUG: API only queries last cmtitle: YEAR
api_url = endpoint + ' action=query&list=categorymembers&cmlimit=500&cmtitle=Category: {} &cmtitle=Category: {} ' . format ( category , year ) #BUG: API only queries last cmtitle: YEAR
@ -119,21 +111,23 @@ def api_category(category, year):
request = urllib2 . urlopen ( api_url )
request = urllib2 . urlopen ( api_url )
jsonp = json . loads ( request . read ( ) )
jsonp = json . loads ( request . read ( ) )
# dict_page = {}
for page in jsonp [ ' query ' ] [ ' categorymembers ' ] :
for page in jsonp [ ' query ' ] [ ' categorymembers ' ] :
print ' Page: ' , page
print ' Page: ' , page
title = ( ( page [ ' title ' ] ) . encode ( ' utf-8 ' ) ) . replace ( " " , " _ " ) #snakecase for page titles
title = ( ( page [ ' title ' ] ) . encode ( ' utf-8 ' ) ) . replace ( " " , " _ " ) #snakecase for page titles
pageid = page [ ' pageid ' ]
pageid = page [ ' pageid ' ]
print ' Pageid: ' , pageid
## NOTE: instead of using page name to query page, use PAGE ID
## NOTE: instead of using page name to query page, use PAGE ID
article = api_page ( pageid , ' content ' )
article = api_page ( pageid , ' content ' )
print ' Content: '
print title
pprint . pprint ( article )
# pprint.pprint(article)
work = parse_work_page ( title , article )
if work :
allworks [ pageid ] = work #dictionary(allworks) entry
print work
else :
print ' WORK DOES NOT CONTAIN REQUIRED CONTENT '
print ' ------------- '
print
print
work = parse_work_page ( title , article ) #
if work and set ( mainkeys ) . issubset ( work . keys ( ) ) and len ( [ work [ key ] for key in mainkeys if work [ key ] ] ) == 3 : # work must exist, have mainkeys as keys w/ values
allworks [ pageid ] = work
# Script entry point: run the category query at module level. In the visible
# part of api_category, each page whose parsed work passes its checks is
# stored in the module-level `allworks` dict under its page id.
# NOTE(review): api_category's own comment flags that the API only queries the
# last cmtitle parameter, so this presumably returns members of the year
# category only - confirm before relying on the result.
api_category ( ' Graduation work ' , ' 2013 ' )
api_category ( ' Graduation work ' , ' 2013 ' )
# Debug aid (disabled): pretty-print everything collected so far.
#pprint.pprint(allworks)
#pprint.pprint(allworks)
@ -141,8 +135,10 @@ api_category('Graduation work', '2013')
# save json
# save json
# Serialise the collected `allworks` dict into the file opened below
# (opened in 'w' mode, so any existing file is overwritten).
# NOTE(review): the handle returned by open() is not closed in the visible
# lines; consider `with open(...) as f:` or an explicit close() so the data
# is reliably flushed - confirm nothing later in the file closes it.
json_allworks = open ( ' md_allworks.json ' , ' w ' )
json_allworks = open ( ' md_allworks.json ' , ' w ' )
json . dump ( allworks , json_allworks )
json . dump ( allworks , json_allworks )
# Progress message; a disabled copy is kept on the following line.
print " wrote json dictionary to: " , ' md_allworks.json '
#print "wrote json dictionary to:", 'md_allworks.json '
## TO DO
## TO DO
# How to handle the work['Extra'] value?
# How to handle the work['Extra'] value? For some titles work['Extra'] contains: <gallery>, [[Pages]], text, etc
# For some titles work['Extra'] contains: <gallery>, [[Pages]], text, etc
# Do template values need to be converted to html?
# Thumbnails need a full url