page creation - without errors

master
Castro0o 10 years ago
parent d947cfaf57
commit 908655871a

3
.gitignore vendored

@@ -1,2 +1,5 @@
*~
*.pyc
web/20*.html
*.json
\#*\#

@@ -3,7 +3,7 @@
import urllib2, json, pprint, re
import xml.etree.ElementTree as ET
import subprocess, shlex
import subprocess, shlex, urllib
sid = '1234'
useragent = "Mozilla/5.001 (windows; U; NT4.0; en-US; rv:1.0) Gecko/25250101"
endpoint = "http://pzwiki.wdka.nl/mw-mediadesign/api.php?format=json&"
@@ -131,11 +131,8 @@ def parse_work_page(title, content):
work_dict = {}
work_dict['Title']=title
template, extra = (re.findall('\{\{Graduation work\n(.*?)\}\}(.*)', content, re.DOTALL))[0]
print 'template', template,
print 'extra', extra
# template's key/value pair
keyval = re.findall('\|(.*?)\=(.*?\n)', template, re.DOTALL)
if extra:
keyval = re.findall('\|(.*?)\=(.*?\n)', template, re.DOTALL) # template's key/value pair
if extra: #append extra
extra = ('Extra', extra)
keyval.append(extra) #?
@@ -162,8 +159,10 @@ def parse_work(title, content):
if re.match('\{\{\Graduation work', content):
template, extra = (re.findall('\{\{Graduation work\n(.*?)\}\}(.*)', content, re.DOTALL))[0]
workdict['Extra'] = extra.encode('utf-8')
if extra:
workdict['Extra'] = extra.encode('utf-8')
# template's key/value pair
# Note:Extra value is NOT CAPTURED by this regex
keyval = re.findall('\|(.*?)\=(.*?\n)', template, re.DOTALL)
for pair in keyval:
key = pair[0]
@@ -171,38 +170,26 @@ def parse_work(title, content):
if 'Creator' in key:
val = val.replace(', ', '')
elif 'Thumbnail' in key:
print 'calling API'
val = api_thumb_url(val)
print 'THUMB', val
elif 'Website' in key:
val = urllib.unquote(val)
workdict[key]=val.encode('utf-8')
pprint.pprint(workdict)
# pprint.pprint(workdict)
return workdict
# Conversion Modules
def pandoc2html(mw_content):
    """Convert MediaWiki markup to an HTML5 fragment using the external pandoc tool.

    mw_content: unicode MediaWiki source (may be None/empty).
    Returns the HTML bytes produced by pandoc, or None when mw_content is falsy.
    """
    if mw_content:
        mw_content = mw_content.encode('utf-8')
        # Feed the markup to pandoc on stdin directly instead of round-tripping
        # it through `echo "{}"`: the shell-quoted format string broke (or was
        # mis-parsed by shlex.split) whenever the wiki content contained double
        # quotes, backslashes, or backticks.
        args_pandoc = shlex.split('pandoc -f mediawiki -t html5')
        p = subprocess.Popen(args_pandoc,
                             stdin=subprocess.PIPE,
                             stdout=subprocess.PIPE)
        html = (p.communicate(mw_content))[0]
        return html
# NOTE(review): this block is a git-diff fragment; the diff rendering stripped
# the original leading indentation, so body lines sit at column 0 here.
def pandoc(filename, title, creator, date, website, thumbnail, bio, description, extra, template) :
'''pandoc: convert mediawiki syntax to html'''
# mw_content = mw_content.encode('utf-8')
# Builds an `echo` argv so the Extra section can be piped into pandoc; this
# mis-parses if `extra` contains double quotes (shlex sees unbalanced quoting).
args_echo =shlex.split( ('echo "{}"'.format( extra )) )
# Standalone (-s) pandoc invocation templated with the work's metadata.
# NOTE(review): --variable website="{website}" appears twice, and the output
# target reads `-o (unknown)` -- presumably a mangled output-filename
# placeholder; confirm against the repository history before trusting this.
args_pandoc = shlex.split( 'pandoc -s -f mediawiki -t html \
--template {template} --variable title="{title}" --variable creator="{creator}" --variable date="{date}" --variable website="{website}" --variable website="{website}" --variable thumbnail="{thumbnail}" --variable bio="""{bio}""" -o (unknown)'.format(template=template, title=title, creator=creator, date=date, website=website, thumbnail=thumbnail, bio=bio, description=description, extra=extra, filename=filename) )
# Python 2 print statement: dumps the assembled argv for debugging.
print args_pandoc
'''convert individual mw sections to html'''
# NOTE(review): `mw_content` is not a parameter of pandoc() -- this line would
# raise NameError if reached; it looks like merged-in diff residue from
# pandoc2html. Verify before relying on anything below this point.
mw_content = mw_content.encode('utf-8')
# convert from mw to html
args_echo =shlex.split( ('echo "{}"'.format(mw_content)) )
args_pandoc = shlex.split( 'pandoc -f mediawiki -t html5' )
# Pipe: echo -> pandoc; capture pandoc's stdout as the HTML result.
p1 = subprocess.Popen(args_echo, stdout=subprocess.PIPE)
p2 = subprocess.Popen(args_pandoc, stdin=p1.stdout, stdout=subprocess.PIPE)
html = (p2.communicate())[0]
return html
def img_fullurl(parent):
imgs = parent.findall('.//img')

@@ -32,24 +32,21 @@ endpoint = "http://pzwiki.wdka.nl/mw-mediadesign/api.php?format=json&"
# CREATE INDEX
########
memberpages = api_pagesincategories('Graduation work', '2015') #list, containing dictionary of all page ids. Example: [{u'ns': 0, u'pageid': 15974, u'title': u'Ahhhh'}, {u'ns': 0, u'pageid': 16005, u'title': u'Artyom-graduation-work'}]
#memberpages = [{u'ns': 0, u'pageid': 16005, u'title': u'Artyom-graduation-work'}]
#memberpages = [{u'ns': 0, u'pageid': 16007, u'title': u'U ntitled'}]
print 'memberpages', memberpages
########
# CREATE PAGE
########
page_template = open("web/page-template.html", "r")
page_template = page_template.read()
for member in memberpages:
print member
#print member
# download mw work page
pageid=member['pageid']
pagetitle=(member['title'].encode('utf-8'))
print pageid
workpage_mw = api_page(pageid, 'content')
# parse workpage_mw
@@ -58,7 +55,7 @@ for member in memberpages:
workdict = parse_work(pagetitle, workpage_mw) # create dictionary workpage_mw template
for key in workdict.keys(): # convert Extra, Description, Bio to HTML
if key in ['Extra', 'Description', 'Bio']:
if key in ['Extra', 'Description', 'Bio'] and workdict[key]:
workdict[key] = pandoc2html( (workdict[key].decode('utf-8')) )
# fill template with dictionary/mw_page values
@@ -79,6 +76,14 @@ for member in memberpages:
newsrc = api_file_url(src)
if newsrc:
img.set('src', newsrc)
website = tree.find('.//div[@id="website"]/a')
print 'website', ET.tostring(website)
if not website.get('href'):
#remove empty .//div[@id="website"]/a
# This can be applied to more fields
website_parent = tree.find('.//div[@id="website"]')
website_parent.remove(website)
# save workpage_html
workpage_html = ET.tostring(tree)

Loading…
Cancel
Save