md_index.py: JSON only with works that contain essential info

10 years ago · 11386978fd
parent 4a8acfb6a0
commit 11386978fd
2 changed files with 56 additions and 39 deletions
--- a/md_index.py
+++ b/md_index.py
@ -77,6 +77,7 @@ def find_imgs(article):
                
        return thumbs_list 

+    
 def parse_work_page(title, content):
    content = content.encode('utf-8')
    if re.match('\{\{\Graduation work', content):
@ -87,30 +88,21 @@ def parse_work_page(title, content):
        keyval = re.findall('\|(.*?)\=(.*?\n)', template, re.DOTALL)
        extra = ('Extra', extra)
        keyval.append(extra)
-        for pair in keyval:
-            key = pair[0]
-            val = pair[1]
-            val = val.replace('\n','')            
-            if 'Creator' in key:
-                val = val.replace(', ', '')
-            work_dict[key]=val
-            
-        return work_dict
-
-'''
-TEMPLATE
-
-|Description=
-|Creator=
-|Date=
-|Thumbnail=
-|Website=
-
-Description=Based on her written thesis: The Web Cheated on Me, Marie is trying to figure out where her disappointment with the web comes from. She analyzed her webbrowser history for half a year to find out what kind of information she is looking up. Her graduation work is an audio installation based on this research.\n|Creator=Marie Wocher,\n|Date=2013\n|Thumbnail=4 FromHypertextToApplePie.jpg\n
-'''
+        keys = [keyval[i][0] for i in range(len(keyval))]
+        #checkkeys: list of mainkeys present, w/ values, in tuples [(key, val),(key, val)...]
+        checkkeys = [keyval[i] for i in range(len(keyval)) if keyval[i][0] in mainkeys and len(keyval[i][1])>3] 
+        if len(checkkeys) == 3 : # checkkeys contains all mainkeys and values 
+            for pair in keyval:
+                key = pair[0]
+                val = pair[1]
+                val = val.replace('\n','')            
+                if 'Creator' in key:
+                    val = val.replace(', ', '')
+                work_dict[key]=val            
+            return work_dict

 def api_category(category, year):
-    '''Finds all pages within  category and returns a dictionary with info on those pages'''
+    '''Finds all pages within category and adds each to the allworks dictionary'''
    category =  category.replace(' ', '_')
    if year:
        api_url = endpoint + 'action=query&list=categorymembers&cmlimit=500&cmtitle=Category:{}&cmtitle=Category:{}'.format(category, year) #BUG: API only queries last cmtitle: YEAR
@ -119,21 +111,23 @@ def api_category(category, year):
    
    request = urllib2.urlopen(api_url)
    jsonp = json.loads(request.read())    
-#    dict_page = {}
+
    for page in  jsonp['query']['categorymembers']:
        print 'Page:', page
        title = ((page['title']).encode('utf-8') ).replace(" ", "_") #snakecase for page titles
        pageid = page['pageid']
-        print 'Pageid:', pageid
        ## NOTE: instead of using page name to query page, use PAGE ID 
        article = api_page(pageid, 'content')
-        print 'Content:'
-        pprint.pprint(article)
+        print title
+        #        pprint.pprint(article)
+        work = parse_work_page(title, article)
+        if work:
+            allworks[pageid] = work #dictionary(allworks) entry
+            print work
+        else:
+            print 'WORK DOES NOT CONTAIN REQUIRED CONTENT'
+        print '-------------'
        print 
-        work = parse_work_page(title, article)  #
-        if work and set(mainkeys).issubset(work.keys()) and len([ work[key] for key in mainkeys if work[key] ])==3: # work must exist, have mainkeys as keys w/ values
-            allworks[pageid] = work
-

 api_category('Graduation work', '2013')
 #pprint.pprint(allworks)
@ -141,8 +135,10 @@ api_category('Graduation work', '2013')
 # save json 
 json_allworks = open('md_allworks.json', 'w')
 json.dump(allworks, json_allworks )
-print "wrote json dictionary to:", 'md_allworks.json'
+#print "wrote json dictionary to:", 'md_allworks.json'

 ## TO DO
-# How do handle work['Extra'] value?
-# some tiles  work['Extra'] contains: <gallery>, [[Pages]], text, etc
+# How to handle the work['Extra'] value? Sometimes work['Extra'] contains: <gallery>, [[Pages]], text, etc.
+# Do template values need to be converted to html?
+
+# Thumbnails need a full url
--- a/prototype_json.html
+++ b/prototype_json.html
@ -2,25 +2,46 @@
 <html>
  <head>
    <meta charset="utf-8" />
-    <script type="text/javascript" src="jquery-1.10.2.js"></script>
+ <script src="https://ajax.googleapis.com/ajax/libs/jquery/1.11.2/jquery.min.js"></script>
+    <!--script type="text/javascript" src="jquery-1.10.2.js"></script-->
    <script type="text/javascript">

+
 var myjson;

+function query(workid){
+    console.log(workid);
+    var title = myjson[workid]['Title'];
+    var creator = myjson[workid]['Creator'];
+    var description = myjson[workid]['Description'];
+    console.log(title, creator, description);
+
+}
+
 function readJSON(){
    $.getJSON( "md_allworks.json", function(data){
    	myjson=data;
-	console.log(data);
+	console.log(myjson);	
+        console.log(Object.keys(myjson));
+	testJSON(myjson);
+	hover();
    })    

-}
+$('span').hover(
+    function(){
+	var thisid = $(this).attr('id')
+	query(thisid);
+    }
+)

+}
      
    </script>

 </head>

  <body onload="javascript:readJSON();" >
-    Testing JSON é
+    <h3>Testing <span id="9961">JSON</span></h3>
+    <h3>Hover over the words <span id="9939">JSON</span> and look at the console</h3>
  </body>
 </html>