cleaning

10 years ago · 1ef796e8ba
parent 9e3e0fb91f
commit 1ef796e8ba
4 changed files with 17 additions and 126 deletions
--- a/README.md
+++ b/README.md
@ -13,7 +13,7 @@ Or index all the gaduation works:
 ## To Do
-* remove thumbnail from page_imgs
+
--- a/mmdc_create_json.py
+++ b/mmdc_create_json.py
@ -1,113 +0,0 @@
 #! /usr/bin/env python
 # -*- coding: utf-8 -*-
 ##############
 # CREATE JSON DICTIONARY WITH AN ENTRY FOR EACH WORK
 #####
 import urllib2, json, pprint, re
 from mmdc_modules import api_request, api_page, api_thumb_url
 # NOTE(review): `sid` and `useragent` are not referenced anywhere in the
 # visible code of this script -- confirm whether they are still needed.
 sid = '1234'
 useragent = "Mozilla/5.001 (windows; U; NT4.0; en-US; rv:1.0) Gecko/25250101"
 # Base URL of the wiki API; the functions below append their query string to it.
 endpoint = "http://pzwiki.wdka.nl/mw-mediadesign/api.php?format=json&"
 # Filled by api_category(): maps a page id to the dictionary parsed from that page.
 allworks = {}
 # Template keys that parse_work_page() requires to be present with a value.
 mainkeys = ['Thumbnail','Date','Creator']
 # Example API queries (thumbnail url of a file, page content by id, page content by title):
 # http://pzwiki.wdka.nl/mw-mediadesign/api.php?format=json&action=query&titles=File:2x2 905.jpg&prop=imageinfo&iiprop=url&iiurlwidth=300
 # http://pzwiki.wdka.nl/mw-mediadesign/api.php?format=json&action=query&pageids=10603&prop=revisions&rvprop=content
 # http://pzwiki.wdka.nl/mw-mediadesign/api.php?format=json&action=query&titles=Graduation_Website_Braindump&prop=revisions&rvprop=content
 def parse_work_page(title, content):
    content = content.encode('utf-8')
    if re.match('\{\{\Graduation work', content):
        work_dict = {}
        work_dict['Title']=title
        template, extra = (re.findall('\{\{Graduation work\n(.*?)\}\}(.*)', content, re.DOTALL))[0]
        # template's key/value pair
        keyval = re.findall('\|(.*?)\=(.*?\n)', template, re.DOTALL)
        if extra:
            extra = ('Extra', extra)
            keyval.append(extra)
        checkkeys = [keyval[i] for i in range(len(keyval)) if keyval[i][0] in mainkeys and len(keyval[i][1])>3] #list mainkeys present, w/ values, in tuples [(key, val),(key, val)...]
        if len(checkkeys) == 3 : # checkkeys contains all mainkeys and values 
            for pair in keyval:
                key = pair[0]
                val = pair[1]
                val = val.replace('\n','')            
                if 'Creator' in key:
                    val = val.replace(', ', '')
                elif 'Thumbnail' in key:
                    thumburl = api_thumb_url(val)
                    work_dict['Thumbnail_url']=thumburl
                    print 'THUMB:', thumburl
                work_dict[key]=val
            return work_dict
 def api_PageCategories(pageid):
    '''Return the titles of the categories a page belongs to.

    pageid -- wiki page id, inserted into the "prop=categories" API query.

    Returns a list of UTF-8 encoded category titles exactly as the API
    reports them, or an empty list when the response carries no
    'categories' entry for the page.
    '''
    query = 'action=query&pageids={}&prop=categories'.format(pageid)
    url = endpoint + query
    request = urllib2.urlopen(url)
    try:
        jsonp = json.loads(request.read())
    finally:
        request.close()  # release the connection even if reading/parsing fails
    pages = jsonp['query']['pages']
    # the result is keyed by page id; a single page was requested
    page = next(iter(pages.values()))
    # the 'categories' key is absent for a page without categories
    page_categories = page.get(u'categories', [])
    return [entry[u'title'].encode('utf-8') for entry in page_categories]
 def api_category(category, year): #Find all pages incategory and add to allworks dictionary
    category =  category.replace(' ', '_')
    apiCatMembers = endpoint + 'action=query&list=categorymembers&cmlimit=1000&cmtitle=Category:{}'.format(category)    
    request = urllib2.urlopen(apiCatMembers)
    jsonp = json.loads(request.read())    
    Graduation_work_Members = jsonp['query']['categorymembers']
    intersectCatMembers = []
    if year:          
        for member in Graduation_work_Members:
            page_cats = api_PageCategories(member['pageid'])
            if ('Category:{}'.format(year)) in page_cats:
                print year, 'in', page_cats
                intersectCatMembers.append(member)# add member to intersectCatMembers
    else:
        intersectCatMembers =  Graduation_work_Members
    for page in  intersectCatMembers: 
        title = ((page['title']).encode('utf-8') ).replace(" ", "_") #snakecase for page titles
        pageid = page['pageid']
        article = api_page(pageid, 'content')
        #        print title
        #        pprint.pprint(article)
        work = parse_work_page(title, article)
        if work:
            allworks[pageid] = work #dictionary(allworks) entry
            print pprint.pprint( work )
            # Create work page                    
        else:
            print 'WORK DOES NOT CONTAIN REQUIRED CONTENT'
        print '-------------'
        print 
 # Gather the 2015 graduation works into `allworks`, then save them as JSON.
 api_category('Graduation work', '2015')
 # `with` guarantees the file is flushed and closed, also if json.dump raises
 with open('allworks_mmdc.json', 'w') as json_allworks:
    json.dump(allworks, json_allworks)
 '''
 Title
 {{Graduation_work
 |Description=
 |Creator=
 |Date=
 |Bio=
 |Thumbnail=
 |Website=
 }}
 Description=
 Extra=
 '''
--- a/mmdc_wiki2web.py
+++ b/mmdc_wiki2web.py
@ -7,7 +7,6 @@ from mmdc_modules import pandoc2html, parse_work, write_html_file, mw_cats, mw_p
 from argparse import ArgumentParser
 from random import shuffle as shuffle
 #####
 # Args
 ####
@ -16,6 +15,7 @@ p.add_argument("--host", default="pzwiki.wdka.nl")
 p.add_argument("--path", default="/mw-mediadesign/", help="nb: should end with /")
 p.add_argument("--category", "-c", nargs="*", default=[["2015", "Graduation_work"]], action="append", help="category to query, use -c foo -c bar to intersect multiple categories")
 p.add_argument("--preview", help='Preview page. Will override category querying. Use: --page "Name Of Wiki Page"')
 args = p.parse_args()
 print 'args', args
@ -32,7 +32,7 @@ def create_page(memberpages, mode):
        articledict = parse_work(site, member, page_text) # create dictionary
        # Title, Creator, Date, Website, Thumbnail, Bio, Description, Extra
        if len(articledict['Creator'])>0 and len(articledict['Title'])>0  and len(articledict['Thumbnail'])>0:
-            for key in articledict.keys():# convert Extra, Description, Bio to HTML
+            for key in articledict.keys():
                if key in ['Extra', 'Description', 'Bio']:
                    articledict[key] =  pandoc2html(articledict[key])
                elif key in ['Creator']:
@ -67,14 +67,22 @@ def create_page(memberpages, mode):
        page_thumb = page_tree.find('.//img[@id="thumbnail"]')
        page_thumb.set('src', articledict['Thumbnail'])
-        # give work page's imgs full url
+        figures = page_tree.findall('.//figure')
-        imgs = page_tree.findall('.//img')        
+        for figure in figures:
-        for img in imgs: #replace src: full url
+            img = figure.find('.//img')            
-            src = (('File:'+img.get('src')).capitalize()).decode('utf-8')
+            figcaption = figure.find('.//figcaption')
-            if src in articledict['Imgs'].keys():
+            img_src = img.get('src')
            figcaption_text = figcaption.text
            if figcaption_text == img_src:# remove figcation if ==  src
                figure.remove(figcaption)                 
            src = (('File:'+img_src).capitalize()).decode('utf-8')
            if src in articledict['Imgs'].keys(): #full-url
                url = articledict['Imgs'][src]
                img.set('src', url)
        # save work page
        creator = articledict['Creator'].encode('ascii', 'ignore')
        creator = creator.replace(' ','_')
@ -109,18 +117,16 @@ site = mwsite(args.host, args.path)
 if args.preview is not None:
    print "** Page Preview Mode**"
-    memberpages = [args.preview.encode('utf-8')]
+    memberpages = [(args.preview).decode('utf-8')]
    print 'memberpages:', memberpages
    create_page(memberpages, 'preview')
 else:
    print "** New Index Mode **"
    memberpages=mw_cats(site, args)
    #memberpages=[u'Unintended Images']
    shuffle(memberpages)
    print 'memberpages:', memberpages
    indexdict = create_page(memberpages, 'index')
 #    pprint.pprint(indexdict)
    create_index(indexdict)
--- a/update.sh
+++ b/update.sh
@ -1,2 +0,0 @@
 #!/bin/sh
 python mmdc_wiki2web.py --category Graduation_work --category 2015
`@ -13,7 +13,7 @@ Or index all the gaduation works:`


	`## To Do`	`## To Do`
	`* remove thumbnail from page_imgs`
		`@ -1,2 +0,0 @@`
			`#!/bin/sh`
			`python mmdc_wiki2web.py --category Graduation_work --category 2015`