cleaning

10 years ago · 1ef796e8ba
parent 9e3e0fb91f
commit 1ef796e8ba
4 changed files with 17 additions and 126 deletions
--- a/README.md
+++ b/README.md
@ -13,7 +13,7 @@ Or index all the graduation works:


 ## To Do
-* remove thumbnail from page_imgs
+



--- a/mmdc_create_json.py
+++ b/mmdc_create_json.py
@ -1,113 +0,0 @@
-#! /usr/bin/env python
-# -*- coding: utf-8 -*-
-
-##############
-# CREATE JSON DICTIONARY WITH AN ENTRY FOR EACH WORK
-#####
-import urllib2, json, pprint, re
-from mmdc_modules import api_request, api_page, api_thumb_url
-
-sid = '1234'
-useragent = "Mozilla/5.001 (windows; U; NT4.0; en-US; rv:1.0) Gecko/25250101"
-endpoint = "http://pzwiki.wdka.nl/mw-mediadesign/api.php?format=json&"
-allworks = {}
-mainkeys = ['Thumbnail','Date','Creator']
-
-# http://pzwiki.wdka.nl/mw-mediadesign/api.php?format=json&action=query&titles=File:2x2 905.jpg&prop=imageinfo&iiprop=url&iiurlwidth=300
-# http://pzwiki.wdka.nl/mw-mediadesign/api.php?format=json&action=query&pageids=10603&prop=revisions&rvprop=content
-# http://pzwiki.wdka.nl/mw-mediadesign/api.php?format=json&action=query&titles=Graduation_Website_Braindump&prop=revisions&rvprop=content
-    
-def parse_work_page(title, content):
-    content = content.encode('utf-8')
-    if re.match('\{\{\Graduation work', content):
-        work_dict = {}
-        work_dict['Title']=title
-        template, extra = (re.findall('\{\{Graduation work\n(.*?)\}\}(.*)', content, re.DOTALL))[0]
-        # template's key/value pair
-        keyval = re.findall('\|(.*?)\=(.*?\n)', template, re.DOTALL)
-        if extra:
-            extra = ('Extra', extra)
-            keyval.append(extra)
-
-        checkkeys = [keyval[i] for i in range(len(keyval)) if keyval[i][0] in mainkeys and len(keyval[i][1])>3] #list mainkeys present, w/ values, in tuples [(key, val),(key, val)...]
-        if len(checkkeys) == 3 : # checkkeys contains all mainkeys and values 
-            for pair in keyval:
-                key = pair[0]
-                val = pair[1]
-                val = val.replace('\n','')            
-                if 'Creator' in key:
-                    val = val.replace(', ', '')
-                elif 'Thumbnail' in key:
-                    thumburl = api_thumb_url(val)
-                    work_dict['Thumbnail_url']=thumburl
-                    print 'THUMB:', thumburl
-                work_dict[key]=val
-            return work_dict
-
-def api_PageCategories(pageid):
-    '''Find all the categories, and their parent category of a page '''
-    query = 'action=query&pageids={}&prop=categories'.format(pageid)
-    url = endpoint + query
-    request = urllib2.urlopen(url)
-    jsonp = json.loads(request.read())    
-    json_dic = jsonp['query']['pages']
-    page_id =  json_dic.keys()[0]
-    page_categories = json_dic[page_id][u'categories']
-    all_cats = [ entry[u'title'].encode('utf-8') for entry in page_categories ] #.replace('Category:', '')
-    return all_cats
-
-        
-def api_category(category, year): #Find all pages incategory and add to allworks dictionary
-    category =  category.replace(' ', '_')
-    apiCatMembers = endpoint + 'action=query&list=categorymembers&cmlimit=1000&cmtitle=Category:{}'.format(category)    
-    request = urllib2.urlopen(apiCatMembers)
-    jsonp = json.loads(request.read())    
-    Graduation_work_Members = jsonp['query']['categorymembers']
-    intersectCatMembers = []
-    if year:          
-        for member in Graduation_work_Members:
-            page_cats = api_PageCategories(member['pageid'])
-            if ('Category:{}'.format(year)) in page_cats:
-                print year, 'in', page_cats
-                intersectCatMembers.append(member)# add member to intersectCatMembers
-    else:
-        intersectCatMembers =  Graduation_work_Members
-        
-    for page in  intersectCatMembers: 
-        title = ((page['title']).encode('utf-8') ).replace(" ", "_") #snakecase for page titles
-        pageid = page['pageid']
-        article = api_page(pageid, 'content')
-        #        print title
-        #        pprint.pprint(article)
-        work = parse_work_page(title, article)
-        if work:
-            allworks[pageid] = work #dictionary(allworks) entry
-            print pprint.pprint( work )
-            # Create work page                    
-        else:
-            print 'WORK DOES NOT CONTAIN REQUIRED CONTENT'
-        print '-------------'
-        print 
-
-api_category('Graduation work', '2015')
-json_allworks = open('allworks_mmdc.json', 'w') # save json 
-json.dump(allworks, json_allworks )
-
-
-
-'''
-Title
-{{Graduation_work
-|Description=
-|Creator=
-|Date=
-|Bio=
-|Thumbnail=
-|Website=
-}}
-Description=
-Extra=
-'''
-
-
-
--- a/mmdc_wiki2web.py
+++ b/mmdc_wiki2web.py
@ -7,7 +7,6 @@ from mmdc_modules import pandoc2html, parse_work, write_html_file, mw_cats, mw_p
 from argparse import ArgumentParser
 from random import shuffle as shuffle

-
 #####
 # Args
 ####
@ -16,6 +15,7 @@ p.add_argument("--host", default="pzwiki.wdka.nl")
 p.add_argument("--path", default="/mw-mediadesign/", help="nb: should end with /")
 p.add_argument("--category", "-c", nargs="*", default=[["2015", "Graduation_work"]], action="append", help="category to query, use -c foo -c bar to intersect multiple categories")
 p.add_argument("--preview", help='Preview page. Will override category querying. Use: --page "Name Of Wiki Page"')
+
 args = p.parse_args()
 print 'args', args

@ -32,7 +32,7 @@ def create_page(memberpages, mode):
        articledict = parse_work(site, member, page_text) # create dictionary
        # Title, Creator, Date, Website, Thumbnail, Bio, Description, Extra
        if len(articledict['Creator'])>0 and len(articledict['Title'])>0  and len(articledict['Thumbnail'])>0:
-            for key in articledict.keys():# convert Extra, Description, Bio to HTML
+            for key in articledict.keys():
                if key in ['Extra', 'Description', 'Bio']:
                    articledict[key] =  pandoc2html(articledict[key])
                elif key in ['Creator']:
@ -67,14 +67,22 @@ def create_page(memberpages, mode):
        page_thumb = page_tree.find('.//img[@id="thumbnail"]')
        page_thumb.set('src', articledict['Thumbnail'])

-        # give work page's imgs full url
-        imgs = page_tree.findall('.//img')        
-        for img in imgs: #replace src: full url
-            src = (('File:'+img.get('src')).capitalize()).decode('utf-8')
-            if src in articledict['Imgs'].keys():
+        figures = page_tree.findall('.//figure')
+        for figure in figures:
+            img = figure.find('.//img')            
+            figcaption = figure.find('.//figcaption')
+            img_src = img.get('src')
+            figcaption_text = figcaption.text
+            if figcaption_text == img_src:# remove figcaption if it equals src
+                figure.remove(figcaption)                 
+
+            src = (('File:'+img_src).capitalize()).decode('utf-8')
+            if src in articledict['Imgs'].keys(): #full-url
                url = articledict['Imgs'][src]
                img.set('src', url)

+
+                
        # save work page
        creator = articledict['Creator'].encode('ascii', 'ignore')
        creator = creator.replace(' ','_')
@ -109,18 +117,16 @@ site = mwsite(args.host, args.path)

 if args.preview is not None:
    print "** Page Preview Mode**"
-    memberpages = [args.preview.encode('utf-8')]
+    memberpages = [(args.preview).decode('utf-8')]
    print 'memberpages:', memberpages
    create_page(memberpages, 'preview')
    
 else:
    print "** New Index Mode **"
    memberpages=mw_cats(site, args)
-    #memberpages=[u'Unintended Images']
    shuffle(memberpages)
    print 'memberpages:', memberpages
    indexdict = create_page(memberpages, 'index')
-#    pprint.pprint(indexdict)
    create_index(indexdict)


--- a/update.sh
+++ b/update.sh
@ -1,2 +0,0 @@
-#!/bin/sh
-python mmdc_wiki2web.py --category Graduation_work --category 2015