|
|
|
@ -3,7 +3,7 @@
|
|
|
|
|
|
|
|
|
|
import urllib2, json, pprint, re
|
|
|
|
|
import xml.etree.ElementTree as ET
|
|
|
|
|
import subprocess, shlex
|
|
|
|
|
import subprocess, shlex, urllib
|
|
|
|
|
# NOTE(review): the meaning of sid is not visible in this chunk -- confirm
# where it is consumed before changing the value.
sid = '1234'
|
|
|
|
|
# Browser-like User-Agent string; presumably sent with the HTTP requests made
# to the wiki -- verify against the request code.
useragent = "Mozilla/5.001 (windows; U; NT4.0; en-US; rv:1.0) Gecko/25250101"
|
|
|
|
|
# Base URL of the wiki's api.php with format=json already selected; it ends in
# '&', presumably so further query parameters can be appended directly.
endpoint = "http://pzwiki.wdka.nl/mw-mediadesign/api.php?format=json&"
|
|
|
|
@ -131,11 +131,8 @@ def parse_work_page(title, content):
|
|
|
|
|
work_dict = {}
|
|
|
|
|
work_dict['Title']=title
|
|
|
|
|
template, extra = (re.findall('\{\{Graduation work\n(.*?)\}\}(.*)', content, re.DOTALL))[0]
|
|
|
|
|
print 'template', template,
|
|
|
|
|
print 'extra', extra
|
|
|
|
|
# template's key/value pair
|
|
|
|
|
keyval = re.findall('\|(.*?)\=(.*?\n)', template, re.DOTALL)
|
|
|
|
|
if extra:
|
|
|
|
|
keyval = re.findall('\|(.*?)\=(.*?\n)', template, re.DOTALL) # template's key/value pair
|
|
|
|
|
if extra: #append extra
|
|
|
|
|
extra = ('Extra', extra)
|
|
|
|
|
keyval.append(extra) #?
|
|
|
|
|
|
|
|
|
@ -162,47 +159,37 @@ def parse_work(title, content):
|
|
|
|
|
|
|
|
|
|
if re.match('\{\{\Graduation work', content):
|
|
|
|
|
template, extra = (re.findall('\{\{Graduation work\n(.*?)\}\}(.*)', content, re.DOTALL))[0]
|
|
|
|
|
workdict['Extra'] = extra.encode('utf-8')
|
|
|
|
|
if extra:
|
|
|
|
|
workdict['Extra'] = extra.encode('utf-8')
|
|
|
|
|
# template's key/value pair
|
|
|
|
|
keyval = re.findall('\|(.*?)\=(.*?\n)', template, re.DOTALL)
|
|
|
|
|
# Note:Extra value is NOT CAPTURED by this regex
|
|
|
|
|
keyval = re.findall('\|(.*?)\=(.*?\n)', template, re.DOTALL)
|
|
|
|
|
for pair in keyval:
|
|
|
|
|
key = pair[0]
|
|
|
|
|
val = (pair[1]).replace('\n', '')
|
|
|
|
|
if 'Creator' in key:
|
|
|
|
|
val = val.replace(', ', '')
|
|
|
|
|
elif 'Thumbnail' in key:
|
|
|
|
|
print 'calling API'
|
|
|
|
|
val = api_thumb_url(val)
|
|
|
|
|
print 'THUMB', val
|
|
|
|
|
elif 'Website' in key:
|
|
|
|
|
val = urllib.unquote(val)
|
|
|
|
|
|
|
|
|
|
workdict[key]=val.encode('utf-8')
|
|
|
|
|
pprint.pprint(workdict)
|
|
|
|
|
# pprint.pprint(workdict)
|
|
|
|
|
return workdict
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Conversion Modules
|
|
|
|
|
def pandoc2html(mw_content):
    '''Convert a piece of MediaWiki markup to an HTML5 fragment with pandoc.

    mw_content -- MediaWiki source text; it is UTF-8 encoded before being
                  handed to pandoc.

    Returns whatever pandoc writes to its stdout, or None when mw_content is
    empty or None (pandoc is not started in that case).
    Raises OSError when the pandoc executable cannot be started.
    '''
    if not mw_content:
        return None

    mw_content = mw_content.encode('utf-8')

    # Hand the markup to pandoc through its stdin. The previous approach
    # spliced the text into an `echo "..."` command line and re-tokenised it
    # with shlex.split, which raised ValueError on an unbalanced double quote,
    # dropped quotes/backslashes and collapsed runs of whitespace and newlines
    # before pandoc ever saw the text.
    args_pandoc = ['pandoc', '-f', 'mediawiki', '-t', 'html5']
    proc = subprocess.Popen(args_pandoc,
                            stdin=subprocess.PIPE,
                            stdout=subprocess.PIPE)

    # echo used to terminate its output with a newline; keep doing so, so the
    # input pandoc receives for simple text is unchanged.
    # communicate() writes the input, closes stdin and waits for pandoc.
    html = proc.communicate(mw_content + b'\n')[0]
    return html
|
|
|
|
|
|
|
|
|
|
# pandoc(): builds a `pandoc -s -f mediawiki -t html` command line from the
# template path, the title/creator/date/website/thumbnail/bio variables and
# the output filename, and prints the resulting argument list.
# NOTE(review): this span looks like a rendered diff in which lines of two
# revisions are interleaved; the statements after `print args_pandoc` may
# belong to a different function (presumably pandoc2html) -- confirm against
# the real file before relying on the notes below.
def pandoc(filename, title, creator, date, website, thumbnail, bio, description, extra, template) :
|
|
|
|
|
'''pandoc: convert mediawiki syntax to html'''
|
|
|
|
|
# mw_content = mw_content.encode('utf-8')
|
|
|
|
|
# Argument list for an `echo` of the `extra` text. shlex.split re-parses the
# formatted string, so quotes inside `extra` affect the tokenisation.
args_echo =shlex.split( ('echo "{}"'.format( extra )) )
|
|
|
|
|
# Templated pandoc command. `--variable website` appears twice in the string,
# and `description`/`extra` are passed to format() although the string has no
# {description} or {extra} placeholder.
args_pandoc = shlex.split( 'pandoc -s -f mediawiki -t html \
|
|
|
|
|
--template {template} --variable title="{title}" --variable creator="{creator}" --variable date="{date}" --variable website="{website}" --variable website="{website}" --variable thumbnail="{thumbnail}" --variable bio="""{bio}""" -o {filename}'.format(template=template, title=title, creator=creator, date=date, website=website, thumbnail=thumbnail, bio=bio, description=description, extra=extra, filename=filename) )
|
|
|
|
|
print args_pandoc
|
|
|
|
|
# NOTE(review): bare string statement here, not the function's docstring (it
# is not the first statement after the def line).
'''convert individual mw sections to html'''
|
|
|
|
|
# NOTE(review): mw_content is not a parameter of pandoc(); read as part of
# this function, this line would fail because the name is unbound -- confirm
# which function it really belongs to.
mw_content = mw_content.encode('utf-8')
|
|
|
|
|
# convert from mw to html
|
|
|
|
|
# Rebinds args_echo, replacing the `extra`-based value built above.
args_echo =shlex.split( ('echo "{}"'.format(mw_content)) )
|
|
|
|
|
# Rebinds args_pandoc, so the templated command built above is not the one
# that gets executed below.
args_pandoc = shlex.split( 'pandoc -f mediawiki -t html5' )
|
|
|
|
|
# Start echo with its stdout captured in a pipe ...
p1 = subprocess.Popen(args_echo, stdout=subprocess.PIPE)
|
|
|
|
|
# ... and connect that pipe to pandoc's stdin.
p2 = subprocess.Popen(args_pandoc, stdin=p1.stdout, stdout=subprocess.PIPE)
|
|
|
|
|
# communicate() returns (stdout, stderr); keep pandoc's stdout only.
html = (p2.communicate())[0]
|
|
|
|
|
return html
|
|
|
|
|
|
|
|
|
|
def img_fullurl(parent):
|
|
|
|
|
imgs = parent.findall('.//img')
|
|
|
|
|