md_index.py: JSON only with works that contain essential info

master
Castro0o 10 years ago
parent 4a8acfb6a0
commit 11386978fd

@ -77,6 +77,7 @@ def find_imgs(article):
return thumbs_list
def parse_work_page(title, content):
content = content.encode('utf-8')
if re.match('\{\{\Graduation work', content):
@ -87,30 +88,21 @@ def parse_work_page(title, content):
keyval = re.findall('\|(.*?)\=(.*?\n)', template, re.DOTALL)
extra = ('Extra', extra)
keyval.append(extra)
for pair in keyval:
key = pair[0]
val = pair[1]
val = val.replace('\n','')
if 'Creator' in key:
val = val.replace(', ', '')
work_dict[key]=val
return work_dict
'''
TEMPLATE
|Description=
|Creator=
|Date=
|Thumbnail=
|Website=
Description=Based on her written thesis: The Web Cheated on Me, Marie is trying to figure out where her disappointment with the web comes from. She analyzed her webbrowser history for half a year to find out what kind of information she is looking up. Her graduation work is an audio installation based on this research.\n|Creator=Marie Wocher,\n|Date=2013\n|Thumbnail=4 FromHypertextToApplePie.jpg\n
'''
keys = [keyval[i][0] for i in range(len(keyval))]
#checkkeys: list of mainkeys present, w/ values, in tuples [(key, val),(key, val)...]
checkkeys = [keyval[i] for i in range(len(keyval)) if keyval[i][0] in mainkeys and len(keyval[i][1])>3]
if len(checkkeys) == 3 : # checkkeys contains all mainkeys and values
for pair in keyval:
key = pair[0]
val = pair[1]
val = val.replace('\n','')
if 'Creator' in key:
val = val.replace(', ', '')
work_dict[key]=val
return work_dict
def api_category(category, year):
'''Finds all pages within category and returns a dictionary with info on those pages'''
'''Finds all pages within category and adds each to the allworks dictionary'''
category = category.replace(' ', '_')
if year:
api_url = endpoint + 'action=query&list=categorymembers&cmlimit=500&cmtitle=Category:{}&cmtitle=Category:{}'.format(category, year) #BUG: API only queries last cmtitle: YEAR
@ -119,21 +111,23 @@ def api_category(category, year):
request = urllib2.urlopen(api_url)
jsonp = json.loads(request.read())
# dict_page = {}
for page in jsonp['query']['categorymembers']:
print 'Page:', page
title = ((page['title']).encode('utf-8') ).replace(" ", "_") #snakecase for page titles
pageid = page['pageid']
print 'Pageid:', pageid
## NOTE: instead of using page name to query page, use PAGE ID
article = api_page(pageid, 'content')
print 'Content:'
pprint.pprint(article)
print title
# pprint.pprint(article)
work = parse_work_page(title, article)
if work:
allworks[pageid] = work #dictionary(allworks) entry
print work
else:
print 'WORK DOES NOT CONTAIN REQUIRED CONTENT'
print '-------------'
print
work = parse_work_page(title, article) #
if work and set(mainkeys).issubset(work.keys()) and len([ work[key] for key in mainkeys if work[key] ])==3: # work must exist, have mainkeys as keys w/ values
allworks[pageid] = work
api_category('Graduation work', '2013')
#pprint.pprint(allworks)
@ -141,8 +135,10 @@ api_category('Graduation work', '2013')
# save json
json_allworks = open('md_allworks.json', 'w')
json.dump(allworks, json_allworks )
print "wrote json dictionary to:", 'md_allworks.json'
#print "wrote json dictionary to:", 'md_allworks.json'
## TO DO
# How to handle the work['Extra'] value?
# for some titles, work['Extra'] contains: <gallery>, [[Pages]], text, etc.
# How to handle the work['Extra'] value? For some titles, work['Extra'] contains: <gallery>, [[Pages]], text, etc.
# Do template values need to be converted to html?
# Thumbnails need a full url

@ -2,25 +2,46 @@
<html>
<head>
<meta charset="utf-8" />
<script type="text/javascript" src="jquery-1.10.2.js"></script>
<script src="https://ajax.googleapis.com/ajax/libs/jquery/1.11.2/jquery.min.js"></script>
<!--script type="text/javascript" src="jquery-1.10.2.js"></script-->
<script type="text/javascript">
var myjson;
// Look up one work in the loaded JSON and log its Title, Creator and
// Description to the console.
// workid: key into the global `myjson` object (the hover handler passes the
//         id attribute of the hovered <span>).
function query(workid){
    console.log(workid);
    // `myjson` stays undefined until the $.getJSON callback has run, and an
    // id may have no entry in it; indexing either case directly would throw
    // a TypeError, so bail out with a log message instead.
    var work = myjson && myjson[workid];
    if (!work) {
        console.log('no work found for id', workid);
        return;
    }
    console.log(work['Title'], work['Creator'], work['Description']);
}
// Load md_allworks.json and keep the parsed data in the global `myjson`.
// Invoked from the <body onload> attribute.
function readJSON(){
// the callback below receives the parsed JSON as `data`
$.getJSON( "md_allworks.json", function(data){
// stored globally so query() can read it later
myjson=data;
console.log(data);
console.log(myjson);
console.log(Object.keys(myjson));
testJSON(myjson);
hover();
})
}
$('span').hover(
function(){
var thisid = $(this).attr('id')
query(thisid);
}
)
}
</script>
</head>
<body onload="javascript:readJSON();" >
Testing JSON é
<h3>Testing <span id="9961">JSON</span></h3>
<h3>Hover over the words <span id="9939">JSON</span> and look at the console</h3>
</body>
</html>

Loading…
Cancel
Save