From 1ef796e8ba864d3430e224b1cf6577523112e66f Mon Sep 17 00:00:00 2001 From: Castro0o Date: Sun, 31 May 2015 21:54:26 +0200 Subject: [PATCH] cleaning --- README.md | 2 +- mmdc_create_json.py | 113 ---------------------------------- mmdc_x.py => mmdc_wiki2web.py | 26 +++++--- update.sh | 2 - 4 files changed, 17 insertions(+), 126 deletions(-) delete mode 100755 mmdc_create_json.py rename mmdc_x.py => mmdc_wiki2web.py (89%) delete mode 100755 update.sh diff --git a/README.md b/README.md index 14c4771..0b31dbb 100644 --- a/README.md +++ b/README.md @@ -13,7 +13,7 @@ Or index all the gaduation works: ## To Do -* remove thumbnail from page_imgs + diff --git a/mmdc_create_json.py b/mmdc_create_json.py deleted file mode 100755 index afdb268..0000000 --- a/mmdc_create_json.py +++ /dev/null @@ -1,113 +0,0 @@ -#! /usr/bin/env python -# -*- coding: utf-8 -*- - -############## -# CREATE JSON DICTIONARY WITH AN ENTRY FOR EACH WORK -##### -import urllib2, json, pprint, re -from mmdc_modules import api_request, api_page, api_thumb_url - -sid = '1234' -useragent = "Mozilla/5.001 (windows; U; NT4.0; en-US; rv:1.0) Gecko/25250101" -endpoint = "http://pzwiki.wdka.nl/mw-mediadesign/api.php?format=json&" -allworks = {} -mainkeys = ['Thumbnail','Date','Creator'] - -# http://pzwiki.wdka.nl/mw-mediadesign/api.php?format=json&action=query&titles=File:2x2 905.jpg&prop=imageinfo&iiprop=url&iiurlwidth=300 -# http://pzwiki.wdka.nl/mw-mediadesign/api.php?format=json&action=query&pageids=10603&prop=revisions&rvprop=content -# http://pzwiki.wdka.nl/mw-mediadesign/api.php?format=json&action=query&titles=Graduation_Website_Braindump&prop=revisions&rvprop=content - -def parse_work_page(title, content): - content = content.encode('utf-8') - if re.match('\{\{\Graduation work', content): - work_dict = {} - work_dict['Title']=title - template, extra = (re.findall('\{\{Graduation work\n(.*?)\}\}(.*)', content, re.DOTALL))[0] - # template's key/value pair - keyval = re.findall('\|(.*?)\=(.*?\n)', template, re.DOTALL) - if extra: - extra = ('Extra', extra) - keyval.append(extra) - - checkkeys = [keyval[i] for i in range(len(keyval)) if keyval[i][0] in mainkeys and len(keyval[i][1])>3] #list mainkeys present, w/ values, in tuples [(key, val),(key, val)...] - if len(checkkeys) == 3 : # checkkeys contains all mainkeys and values - for pair in keyval: - key = pair[0] - val = pair[1] - val = val.replace('\n','') - if 'Creator' in key: - val = val.replace(', ', '') - elif 'Thumbnail' in key: - thumburl = api_thumb_url(val) - work_dict['Thumbnail_url']=thumburl - print 'THUMB:', thumburl - work_dict[key]=val - return work_dict - -def api_PageCategories(pageid): - '''Find all the categories, and their parent category of a page ''' - query = 'action=query&pageids={}&prop=categories'.format(pageid) - url = endpoint + query - request = urllib2.urlopen(url) - jsonp = json.loads(request.read()) - json_dic = jsonp['query']['pages'] - page_id = json_dic.keys()[0] - page_categories = json_dic[page_id][u'categories'] - all_cats = [ entry[u'title'].encode('utf-8') for entry in page_categories ] #.replace('Category:', '') - return all_cats - - -def api_category(category, year): #Find all pages incategory and add to allworks dictionary - category = category.replace(' ', '_') - apiCatMembers = endpoint + 'action=query&list=categorymembers&cmlimit=1000&cmtitle=Category:{}'.format(category) - request = urllib2.urlopen(apiCatMembers) - jsonp = json.loads(request.read()) - Graduation_work_Members = jsonp['query']['categorymembers'] - intersectCatMembers = [] - if year: - for member in Graduation_work_Members: - page_cats = api_PageCategories(member['pageid']) - if ('Category:{}'.format(year)) in page_cats: - print year, 'in', page_cats - intersectCatMembers.append(member)# add member to intersectCatMembers - else: - intersectCatMembers = Graduation_work_Members - - for page in intersectCatMembers: - title = ((page['title']).encode('utf-8') ).replace(" ", "_") #snakecase for page titles - pageid = page['pageid'] - article = api_page(pageid, 'content') - # print title - # pprint.pprint(article) - work = parse_work_page(title, article) - if work: - allworks[pageid] = work #dictionary(allworks) entry - print pprint.pprint( work ) - # Create work page - else: - print 'WORK DOES NOT CONTAIN REQUIRED CONTENT' - print '-------------' - print - -api_category('Graduation work', '2015') -json_allworks = open('allworks_mmdc.json', 'w') # save json -json.dump(allworks, json_allworks ) - - - -''' -Title -{{Graduation_work -|Description= -|Creator= -|Date= -|Bio= -|Thumbnail= -|Website= -}} -Description= -Extra= -''' - - - diff --git a/mmdc_x.py b/mmdc_wiki2web.py similarity index 89% rename from mmdc_x.py rename to mmdc_wiki2web.py index 227f39b..6edf20e 100755 --- a/mmdc_x.py +++ b/mmdc_wiki2web.py @@ -7,7 +7,6 @@ from mmdc_modules import pandoc2html, parse_work, write_html_file, mw_cats, mw_p from argparse import ArgumentParser from random import shuffle as shuffle - ##### # Args #### @@ -16,6 +15,7 @@ p.add_argument("--host", default="pzwiki.wdka.nl") p.add_argument("--path", default="/mw-mediadesign/", help="nb: should end with /") p.add_argument("--category", "-c", nargs="*", default=[["2015", "Graduation_work"]], action="append", help="category to query, use -c foo -c bar to intersect multiple categories") p.add_argument("--preview", help='Preview page. Will override category querying. Use: --page "Name Of Wiki Page"') + args = p.parse_args() print 'args', args @@ -32,7 +32,7 @@ def create_page(memberpages, mode): articledict = parse_work(site, member, page_text) # create dictionary # Title, Creator, Date, Website, Thumbnail, Bio, Description, Extra if len(articledict['Creator'])>0 and len(articledict['Title'])>0 and len(articledict['Thumbnail'])>0: - for key in articledict.keys():# convert Extra, Description, Bio to HTML + for key in articledict.keys(): if key in ['Extra', 'Description', 'Bio']: articledict[key] = pandoc2html(articledict[key]) elif key in ['Creator']: @@ -67,14 +67,22 @@ def create_page(memberpages, mode): page_thumb = page_tree.find('.//img[@id="thumbnail"]') page_thumb.set('src', articledict['Thumbnail']) - # give work page's imgs full url - imgs = page_tree.findall('.//img') - for img in imgs: #replace src: full url - src = (('File:'+img.get('src')).capitalize()).decode('utf-8') - if src in articledict['Imgs'].keys(): + figures = page_tree.findall('.//figure') + for figure in figures: + img = figure.find('.//img') + figcaption = figure.find('.//figcaption') + img_src = img.get('src') + figcaption_text = figcaption.text + if figcaption_text == img_src:# remove figcation if == src + figure.remove(figcaption) + + src = (('File:'+img_src).capitalize()).decode('utf-8') + if src in articledict['Imgs'].keys(): #full-url url = articledict['Imgs'][src] img.set('src', url) + + # save work page creator = articledict['Creator'].encode('ascii', 'ignore') creator = creator.replace(' ','_') @@ -109,18 +117,16 @@ site = mwsite(args.host, args.path) if args.preview is not None: print "** Page Preview Mode**" - memberpages = [args.preview.encode('utf-8')] + memberpages = [(args.preview).decode('utf-8')] print 'memberpages:', memberpages create_page(memberpages, 'preview') else: print "** New Index Mode **" memberpages=mw_cats(site, args) - #memberpages=[u'Unintended Images'] shuffle(memberpages) print 'memberpages:', memberpages indexdict = create_page(memberpages, 'index') -# pprint.pprint(indexdict) create_index(indexdict) diff --git a/update.sh b/update.sh deleted file mode 100755 index 8f34378..0000000 --- a/update.sh +++ /dev/null @@ -1,2 +0,0 @@ -#!/bin/sh -python mmdc_wiki2web.py --category Graduation_work --category 2015