md_index.py: JSON only with works that contain essential info

10 years ago · 11386978fd
parent 4a8acfb6a0
commit 11386978fd
2 changed files with 56 additions and 39 deletions
--- a/md_index.py
+++ b/md_index.py
@@ -77,6 +77,7 @@ def find_imgs(article):
        return thumbs_list 
 def parse_work_page(title, content):
    content = content.encode('utf-8')
    if re.match('\{\{\Graduation work', content):
@@ -87,30 +88,21 @@ def parse_work_page(title, content):
        keyval = re.findall('\|(.*?)\=(.*?\n)', template, re.DOTALL)
        extra = ('Extra', extra)
        keyval.append(extra)
-        for pair in keyval:
+        keys = [keyval[i][0] for i in range(len(keyval))]
-            key = pair[0]
+        #checkkeys: list of mainkeys present, w/ values, in tuples [(key, val),(key, val)...]
-            val = pair[1]
+        checkkeys = [keyval[i] for i in range(len(keyval)) if keyval[i][0] in mainkeys and len(keyval[i][1])>3] 
-            val = val.replace('\n','')            
+        if len(checkkeys) == 3 : # checkkeys contains all mainkeys and values 
-            if 'Creator' in key:
+            for pair in keyval:
-                val = val.replace(', ', '')
+                key = pair[0]
-            work_dict[key]=val
+                val = pair[1]
-            
+                val = val.replace('\n','')            
-        return work_dict
+                if 'Creator' in key:
-
+                    val = val.replace(', ', '')
-'''
+                work_dict[key]=val            
-TEMPLATE
+            return work_dict
 |Description=
 |Creator=
 |Date=
 |Thumbnail=
 |Website=
 Description=Based on her written thesis: The Web Cheated on Me, Marie is trying to figure out where her disappointment with the web comes from. She analyzed her webbrowser history for half a year to find out what kind of information she is looking up. Her graduation work is an audio installation based on this research.\n|Creator=Marie Wocher,\n|Date=2013\n|Thumbnail=4 FromHypertextToApplePie.jpg\n
 '''
 def api_category(category, year):
-    '''Finds all pages within  category and returns a dictionary with info on those pages'''
+    '''Finds all pages within category and adds each to the allworks dictionary'''
    category =  category.replace(' ', '_')
    if year:
        api_url = endpoint + 'action=query&list=categorymembers&cmlimit=500&cmtitle=Category:{}&cmtitle=Category:{}'.format(category, year) #BUG: API only queries last cmtitle: YEAR
@@ -119,21 +111,23 @@ def api_category(category, year):
    request = urllib2.urlopen(api_url)
    jsonp = json.loads(request.read())    
-#    dict_page = {}
+
    for page in  jsonp['query']['categorymembers']:
        print 'Page:', page
        title = ((page['title']).encode('utf-8') ).replace(" ", "_") #snakecase for page titles
        pageid = page['pageid']
        print 'Pageid:', pageid
        ## NOTE: instead of using page name to query page, use PAGE ID 
        article = api_page(pageid, 'content')
-        print 'Content:'
+        print title
-        pprint.pprint(article)
+        #        pprint.pprint(article)
        work = parse_work_page(title, article)
        if work:
            allworks[pageid] = work #dictionary(allworks) entry
            print work
        else:
            print 'WORK DOES NOT CONTAIN REQUIRED CONTENT'
        print '-------------'
        print 
        work = parse_work_page(title, article)  #
        if work and set(mainkeys).issubset(work.keys()) and len([ work[key] for key in mainkeys if work[key] ])==3: # work must exist, have mainkeys as keys w/ values
            allworks[pageid] = work
 api_category('Graduation work', '2013')
 #pprint.pprint(allworks)
@@ -141,8 +135,10 @@ api_category('Graduation work', '2013')
 # save json 
 json_allworks = open('md_allworks.json', 'w')
 json.dump(allworks, json_allworks )
-print "wrote json dictionary to:", 'md_allworks.json'
+#print "wrote json dictionary to:", 'md_allworks.json'
 ## TO DO
-# How do handle work['Extra'] value?
+# How to handle the work['Extra'] value? For some titles work['Extra'] contains: <gallery>, [[Pages]], text, etc.
-# some tiles  work['Extra'] contains: <gallery>, [[Pages]], text, etc
+# Do template values need to be converted to html?
 # Thumbnails need a full url
--- a/prototype_json.html
+++ b/prototype_json.html
@@ -2,25 +2,46 @@
 <html>
  <head>
    <meta charset="utf-8" />
-    <script type="text/javascript" src="jquery-1.10.2.js"></script>
+ <script src="https://ajax.googleapis.com/ajax/libs/jquery/1.11.2/jquery.min.js"></script>
    <!--script type="text/javascript" src="jquery-1.10.2.js"></script-->
    <script type="text/javascript">
 var myjson;
 function query(workid){
    console.log(workid);
    var title = myjson[workid]['Title'];
    var creator = myjson[workid]['Creator'];
    var description = myjson[workid]['Description'];
    console.log(title, creator, description);
 }
 function readJSON(){
    $.getJSON( "md_allworks.json", function(data){
    	myjson=data;
-	console.log(data);
+	console.log(myjson);	
-    })
+        console.log(Object.keys(myjson));
-    
+	testJSON(myjson);
-}
+	hover();
    })    
 $('span').hover(
    function(){
 	var thisid = $(this).attr('id')
 	query(thisid);
    }
 )
 }
    </script>
 </head>
  <body onload="javascript:readJSON();" >
-    Testing JSON é
+    <h3>Testing <span id="9961">JSON</span></h3>
    <h3>Hover over the words <span id="9939">JSON</span> and look at the console</h3>
  </body>
 </html>