page creation - w/OUT errors

master
Castro0o 10 years ago
parent d947cfaf57
commit 908655871a

5
.gitignore vendored

@ -1,2 +1,5 @@
*~ *~
*.pyc *.pyc
web/20*.html
*.json
\#*\#

@ -3,7 +3,7 @@
import urllib2, json, pprint, re import urllib2, json, pprint, re
import xml.etree.ElementTree as ET import xml.etree.ElementTree as ET
import subprocess, shlex import subprocess, shlex, urllib
sid = '1234' sid = '1234'
useragent = "Mozilla/5.001 (windows; U; NT4.0; en-US; rv:1.0) Gecko/25250101" useragent = "Mozilla/5.001 (windows; U; NT4.0; en-US; rv:1.0) Gecko/25250101"
endpoint = "http://pzwiki.wdka.nl/mw-mediadesign/api.php?format=json&" endpoint = "http://pzwiki.wdka.nl/mw-mediadesign/api.php?format=json&"
@ -131,11 +131,8 @@ def parse_work_page(title, content):
work_dict = {} work_dict = {}
work_dict['Title']=title work_dict['Title']=title
template, extra = (re.findall('\{\{Graduation work\n(.*?)\}\}(.*)', content, re.DOTALL))[0] template, extra = (re.findall('\{\{Graduation work\n(.*?)\}\}(.*)', content, re.DOTALL))[0]
print 'template', template, keyval = re.findall('\|(.*?)\=(.*?\n)', template, re.DOTALL) # template's key/value pair
print 'extra', extra if extra: #append extra
# template's key/value pair
keyval = re.findall('\|(.*?)\=(.*?\n)', template, re.DOTALL)
if extra:
extra = ('Extra', extra) extra = ('Extra', extra)
keyval.append(extra) #? keyval.append(extra) #?
@ -162,47 +159,37 @@ def parse_work(title, content):
if re.match('\{\{\Graduation work', content): if re.match('\{\{\Graduation work', content):
template, extra = (re.findall('\{\{Graduation work\n(.*?)\}\}(.*)', content, re.DOTALL))[0] template, extra = (re.findall('\{\{Graduation work\n(.*?)\}\}(.*)', content, re.DOTALL))[0]
workdict['Extra'] = extra.encode('utf-8') if extra:
workdict['Extra'] = extra.encode('utf-8')
# template's key/value pair # template's key/value pair
keyval = re.findall('\|(.*?)\=(.*?\n)', template, re.DOTALL) # Note:Extra value is NOT CAPTURED by this regex
keyval = re.findall('\|(.*?)\=(.*?\n)', template, re.DOTALL)
for pair in keyval: for pair in keyval:
key = pair[0] key = pair[0]
val = (pair[1]).replace('\n', '') val = (pair[1]).replace('\n', '')
if 'Creator' in key: if 'Creator' in key:
val = val.replace(', ', '') val = val.replace(', ', '')
elif 'Thumbnail' in key: elif 'Thumbnail' in key:
print 'calling API'
val = api_thumb_url(val) val = api_thumb_url(val)
print 'THUMB', val elif 'Website' in key:
val = urllib.unquote(val)
workdict[key]=val.encode('utf-8') workdict[key]=val.encode('utf-8')
pprint.pprint(workdict) # pprint.pprint(workdict)
return workdict return workdict
# Conversion Modules # Conversion Modules
def pandoc2html(mw_content): def pandoc2html(mw_content):
if mw_content: '''convert individual mw sections to html'''
mw_content = mw_content.encode('utf-8') mw_content = mw_content.encode('utf-8')
# convert from mw to html # convert from mw to html
args_echo =shlex.split( ('echo "{}"'.format(mw_content)) ) args_echo =shlex.split( ('echo "{}"'.format(mw_content)) )
args_pandoc = shlex.split( 'pandoc -f mediawiki -t html5' ) args_pandoc = shlex.split( 'pandoc -f mediawiki -t html5' )
p1 = subprocess.Popen(args_echo, stdout=subprocess.PIPE)
p2 = subprocess.Popen(args_pandoc, stdin=p1.stdout, stdout=subprocess.PIPE)
html = (p2.communicate())[0]
return html
def pandoc(filename, title, creator, date, website, thumbnail, bio, description, extra, template) :
'''pandoc: convert mediawiki syntax to html'''
# mw_content = mw_content.encode('utf-8')
args_echo =shlex.split( ('echo "{}"'.format( extra )) )
args_pandoc = shlex.split( 'pandoc -s -f mediawiki -t html \
--template {template} --variable title="{title}" --variable creator="{creator}" --variable date="{date}" --variable website="{website}" --variable website="{website}" --variable thumbnail="{thumbnail}" --variable bio="""{bio}""" -o {filename}'.format(template=template, title=title, creator=creator, date=date, website=website, thumbnail=thumbnail, bio=bio, description=description, extra=extra, filename=filename) )
print args_pandoc
p1 = subprocess.Popen(args_echo, stdout=subprocess.PIPE) p1 = subprocess.Popen(args_echo, stdout=subprocess.PIPE)
p2 = subprocess.Popen(args_pandoc, stdin=p1.stdout, stdout=subprocess.PIPE) p2 = subprocess.Popen(args_pandoc, stdin=p1.stdout, stdout=subprocess.PIPE)
html = (p2.communicate())[0] html = (p2.communicate())[0]
return html
def img_fullurl(parent): def img_fullurl(parent):
imgs = parent.findall('.//img') imgs = parent.findall('.//img')

@ -32,24 +32,21 @@ endpoint = "http://pzwiki.wdka.nl/mw-mediadesign/api.php?format=json&"
# CREATE INDEX # CREATE INDEX
######## ########
memberpages = api_pagesincategories('Graduation work', '2015') #list, containing dictionary of all page ids. Example: [{u'ns': 0, u'pageid': 15974, u'title': u'Ahhhh'}, {u'ns': 0, u'pageid': 16005, u'title': u'Artyom-graduation-work'}] memberpages = api_pagesincategories('Graduation work', '2015') #list, containing dictionary of all page ids. Example: [{u'ns': 0, u'pageid': 15974, u'title': u'Ahhhh'}, {u'ns': 0, u'pageid': 16005, u'title': u'Artyom-graduation-work'}]
#memberpages = [{u'ns': 0, u'pageid': 16005, u'title': u'Artyom-graduation-work'}]
#memberpages = [{u'ns': 0, u'pageid': 16007, u'title': u'U ntitled'}]
print 'memberpages', memberpages print 'memberpages', memberpages
######## ########
# CREATE PAGE # CREATE PAGE
######## ########
page_template = open("web/page-template.html", "r") page_template = open("web/page-template.html", "r")
page_template = page_template.read() page_template = page_template.read()
for member in memberpages: for member in memberpages:
print member #print member
# download mw work page # download mw work page
pageid=member['pageid'] pageid=member['pageid']
pagetitle=(member['title'].encode('utf-8')) pagetitle=(member['title'].encode('utf-8'))
print pageid
workpage_mw = api_page(pageid, 'content') workpage_mw = api_page(pageid, 'content')
# parse workpage_mw # parse workpage_mw
@ -58,7 +55,7 @@ for member in memberpages:
workdict = parse_work(pagetitle, workpage_mw) # create dictionary workpage_mw template workdict = parse_work(pagetitle, workpage_mw) # create dictionary workpage_mw template
for key in workdict.keys(): # convert Extra, Description, Bio to HTML for key in workdict.keys(): # convert Extra, Description, Bio to HTML
if key in ['Extra', 'Description', 'Bio']: if key in ['Extra', 'Description', 'Bio'] and workdict[key]:
workdict[key] = pandoc2html( (workdict[key].decode('utf-8')) ) workdict[key] = pandoc2html( (workdict[key].decode('utf-8')) )
# fill template with dictionary/mw_page values # fill template with dictionary/mw_page values
@ -79,7 +76,15 @@ for member in memberpages:
newsrc = api_file_url(src) newsrc = api_file_url(src)
if newsrc: if newsrc:
img.set('src', newsrc) img.set('src', newsrc)
website = tree.find('.//div[@id="website"]/a')
print 'website', ET.tostring(website)
if not website.get('href'):
#remove empty .//div[@id="website"]/a
# This can be applied to more fields
website_parent = tree.find('.//div[@id="website"]')
website_parent.remove(website)
# save workpage_html # save workpage_html
workpage_html = ET.tostring(tree) workpage_html = ET.tostring(tree)
creator = workdict['Creator'].decode('ascii', 'ignore') creator = workdict['Creator'].decode('ascii', 'ignore')

Loading…
Cancel
Save