md_index.py: JSON only with works that contain essential info

master
Castro0o 10 years ago
parent 4a8acfb6a0
commit 11386978fd

@ -77,6 +77,7 @@ def find_imgs(article):
return thumbs_list return thumbs_list
def parse_work_page(title, content): def parse_work_page(title, content):
content = content.encode('utf-8') content = content.encode('utf-8')
if re.match('\{\{\Graduation work', content): if re.match('\{\{\Graduation work', content):
@ -87,6 +88,10 @@ def parse_work_page(title, content):
keyval = re.findall('\|(.*?)\=(.*?\n)', template, re.DOTALL) keyval = re.findall('\|(.*?)\=(.*?\n)', template, re.DOTALL)
extra = ('Extra', extra) extra = ('Extra', extra)
keyval.append(extra) keyval.append(extra)
keys = [keyval[i][0] for i in range(len(keyval))]
#checkkeys: list of mainkeys present, w/ values, in tuples [(key, val),(key, val)...]
checkkeys = [keyval[i] for i in range(len(keyval)) if keyval[i][0] in mainkeys and len(keyval[i][1])>3]
if len(checkkeys) == 3 : # checkkeys contains all mainkeys and values
for pair in keyval: for pair in keyval:
key = pair[0] key = pair[0]
val = pair[1] val = pair[1]
@ -94,23 +99,10 @@ def parse_work_page(title, content):
if 'Creator' in key: if 'Creator' in key:
val = val.replace(', ', '') val = val.replace(', ', '')
work_dict[key]=val work_dict[key]=val
return work_dict return work_dict
'''
TEMPLATE
|Description=
|Creator=
|Date=
|Thumbnail=
|Website=
Description=Based on her written thesis: The Web Cheated on Me, Marie is trying to figure out where her disappointment with the web comes from. She analyzed her webbrowser history for half a year to find out what kind of information she is looking up. Her graduation work is an audio installation based on this research.\n|Creator=Marie Wocher,\n|Date=2013\n|Thumbnail=4 FromHypertextToApplePie.jpg\n
'''
def api_category(category, year): def api_category(category, year):
'''Finds all pages within category and returns a dictionary with info on those pages''' '''Finds all pages within category and adds each to allworks dictionary'''
category = category.replace(' ', '_') category = category.replace(' ', '_')
if year: if year:
api_url = endpoint + 'action=query&list=categorymembers&cmlimit=500&cmtitle=Category:{}&cmtitle=Category:{}'.format(category, year) #BUG: API only queries last cmtitle: YEAR api_url = endpoint + 'action=query&list=categorymembers&cmlimit=500&cmtitle=Category:{}&cmtitle=Category:{}'.format(category, year) #BUG: API only queries last cmtitle: YEAR
@ -119,21 +111,23 @@ def api_category(category, year):
request = urllib2.urlopen(api_url) request = urllib2.urlopen(api_url)
jsonp = json.loads(request.read()) jsonp = json.loads(request.read())
# dict_page = {}
for page in jsonp['query']['categorymembers']: for page in jsonp['query']['categorymembers']:
print 'Page:', page print 'Page:', page
title = ((page['title']).encode('utf-8') ).replace(" ", "_") #snakecase for page titles title = ((page['title']).encode('utf-8') ).replace(" ", "_") #snakecase for page titles
pageid = page['pageid'] pageid = page['pageid']
print 'Pageid:', pageid
## NOTE: instead of using page name to query page, use PAGE ID ## NOTE: instead of using page name to query page, use PAGE ID
article = api_page(pageid, 'content') article = api_page(pageid, 'content')
print 'Content:' print title
pprint.pprint(article) # pprint.pprint(article)
work = parse_work_page(title, article)
if work:
allworks[pageid] = work #dictionary(allworks) entry
print work
else:
print 'WORK DOES NOT CONTAIN REQUIRED CONTENT'
print '-------------'
print print
work = parse_work_page(title, article) #
if work and set(mainkeys).issubset(work.keys()) and len([ work[key] for key in mainkeys if work[key] ])==3: # work must exist, have mainkeys as keys w/ values
allworks[pageid] = work
api_category('Graduation work', '2013') api_category('Graduation work', '2013')
#pprint.pprint(allworks) #pprint.pprint(allworks)
@ -141,8 +135,10 @@ api_category('Graduation work', '2013')
# save json # save json
json_allworks = open('md_allworks.json', 'w') json_allworks = open('md_allworks.json', 'w')
json.dump(allworks, json_allworks ) json.dump(allworks, json_allworks )
print "wrote json dictionary to:", 'md_allworks.json' #print "wrote json dictionary to:", 'md_allworks.json'
## TO DO ## TO DO
# How to handle work['Extra'] value? # How to handle work['Extra'] value? some tiles work['Extra'] contains: <gallery>, [[Pages]], text, etc
# some tiles work['Extra'] contains: <gallery>, [[Pages]], text, etc # Do template values need to be converted to html?
# Thumbnails need a full url

@ -2,25 +2,46 @@
<html> <html>
<head> <head>
<meta charset="utf-8" /> <meta charset="utf-8" />
<script type="text/javascript" src="jquery-1.10.2.js"></script> <script src="https://ajax.googleapis.com/ajax/libs/jquery/1.11.2/jquery.min.js"></script>
<!--script type="text/javascript" src="jquery-1.10.2.js"></script-->
<script type="text/javascript"> <script type="text/javascript">
var myjson; var myjson;
/**
 * Log the title, creator and description of a single work to the console.
 *
 * Looks the work up in the global `myjson` object, which is filled in by
 * the `$.getJSON` callback in `readJSON`.
 *
 * @param {string} workid - Key of the work inside `myjson`; the hover
 *     handler passes the `id` attribute of the hovered <span>.
 * @returns {undefined} Nothing; the result is only written to the console.
 */
function query(workid){
    console.log(workid);
    // `myjson` stays undefined until the asynchronous JSON request has
    // completed, and a given id may have no entry in the loaded data, so
    // guard the lookup instead of throwing a TypeError on hover.
    var work = myjson && myjson[workid];
    if (!work) {
        console.warn('No work found for id:', workid);
        return;
    }
    var title = work['Title'];
    var creator = work['Creator'];
    var description = work['Description'];
    console.log(title, creator, description);
}
function readJSON(){ function readJSON(){
$.getJSON( "md_allworks.json", function(data){ $.getJSON( "md_allworks.json", function(data){
myjson=data; myjson=data;
console.log(data); console.log(myjson);
console.log(Object.keys(myjson));
testJSON(myjson);
hover();
}) })
} $('span').hover(
function(){
var thisid = $(this).attr('id')
query(thisid);
}
)
}
</script> </script>
</head> </head>
<body onload="javascript:readJSON();" > <body onload="javascript:readJSON();" >
Testing JSON é <h3>Testing <span id="9961">JSON</span></h3>
<h3>Hover over the words <span id="9939">JSON</span> and look at the console</h3>
</body> </body>
</html> </html>

Loading…
Cancel
Save