diff --git a/.gitignore b/.gitignore index e645833..61afa13 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,5 @@ *~ -*.pyc \ No newline at end of file +*.pyc +web/20*.html +*.json +\#*\# \ No newline at end of file diff --git a/mmdc_modules.py b/mmdc_modules.py index 7c93b35..00ee1c4 100644 --- a/mmdc_modules.py +++ b/mmdc_modules.py @@ -3,7 +3,7 @@ import urllib2, json, pprint, re import xml.etree.ElementTree as ET -import subprocess, shlex +import subprocess, shlex, urllib sid = '1234' useragent = "Mozilla/5.001 (windows; U; NT4.0; en-US; rv:1.0) Gecko/25250101" endpoint = "http://pzwiki.wdka.nl/mw-mediadesign/api.php?format=json&" @@ -131,11 +131,8 @@ def parse_work_page(title, content): work_dict = {} work_dict['Title']=title template, extra = (re.findall('\{\{Graduation work\n(.*?)\}\}(.*)', content, re.DOTALL))[0] - print 'template', template, - print 'extra', extra - # template's key/value pair - keyval = re.findall('\|(.*?)\=(.*?\n)', template, re.DOTALL) - if extra: + keyval = re.findall('\|(.*?)\=(.*?\n)', template, re.DOTALL) # template's key/value pair + if extra: #append extra extra = ('Extra', extra) keyval.append(extra) #? 
@@ -162,47 +159,37 @@ def parse_work(title, content): if re.match('\{\{\Graduation work', content): template, extra = (re.findall('\{\{Graduation work\n(.*?)\}\}(.*)', content, re.DOTALL))[0] - workdict['Extra'] = extra.encode('utf-8') + if extra: + workdict['Extra'] = extra.encode('utf-8') # template's key/value pair - keyval = re.findall('\|(.*?)\=(.*?\n)', template, re.DOTALL) + # Note:Extra value is NOT CAPTURED by this regex + keyval = re.findall('\|(.*?)\=(.*?\n)', template, re.DOTALL) for pair in keyval: key = pair[0] val = (pair[1]).replace('\n', '') if 'Creator' in key: val = val.replace(', ', '') elif 'Thumbnail' in key: - print 'calling API' val = api_thumb_url(val) - print 'THUMB', val + elif 'Website' in key: + val = urllib.unquote(val) + workdict[key]=val.encode('utf-8') - pprint.pprint(workdict) +# pprint.pprint(workdict) return workdict - - # Conversion Modules def pandoc2html(mw_content): - if mw_content: - mw_content = mw_content.encode('utf-8') - # convert from mw to html - args_echo =shlex.split( ('echo "{}"'.format(mw_content)) ) - args_pandoc = shlex.split( 'pandoc -f mediawiki -t html5' ) - p1 = subprocess.Popen(args_echo, stdout=subprocess.PIPE) - p2 = subprocess.Popen(args_pandoc, stdin=p1.stdout, stdout=subprocess.PIPE) - html = (p2.communicate())[0] - return html - -def pandoc(filename, title, creator, date, website, thumbnail, bio, description, extra, template) : - '''pandoc: convert mediawiki syntax to html''' -# mw_content = mw_content.encode('utf-8') - args_echo =shlex.split( ('echo "{}"'.format( extra )) ) - args_pandoc = shlex.split( 'pandoc -s -f mediawiki -t html \ - --template {template} --variable title="{title}" --variable creator="{creator}" --variable date="{date}" --variable website="{website}" --variable website="{website}" --variable thumbnail="{thumbnail}" --variable bio="""{bio}""" -o {filename}'.format(template=template, title=title, creator=creator, date=date, website=website, thumbnail=thumbnail, bio=bio, 
description=description, extra=extra, filename=filename) ) - print args_pandoc + '''convert individual mw sections to html''' + mw_content = mw_content.encode('utf-8') + # convert from mw to html + args_echo =shlex.split( ('echo "{}"'.format(mw_content)) ) + args_pandoc = shlex.split( 'pandoc -f mediawiki -t html5' ) p1 = subprocess.Popen(args_echo, stdout=subprocess.PIPE) p2 = subprocess.Popen(args_pandoc, stdin=p1.stdout, stdout=subprocess.PIPE) html = (p2.communicate())[0] + return html def img_fullurl(parent): imgs = parent.findall('.//img') diff --git a/prototype_page.py b/prototype_page.py index 0aab6c5..585fb44 100755 --- a/prototype_page.py +++ b/prototype_page.py @@ -32,24 +32,21 @@ endpoint = "http://pzwiki.wdka.nl/mw-mediadesign/api.php?format=json&" # CREATE INDEX ######## memberpages = api_pagesincategories('Graduation work', '2015') #list, containing dictionary of all page ids. Example: [{u'ns': 0, u'pageid': 15974, u'title': u'Ahhhh'}, {u'ns': 0, u'pageid': 16005, u'title': u'Artyom-graduation-work'}] +#memberpages = [{u'ns': 0, u'pageid': 16005, u'title': u'Artyom-graduation-work'}] +#memberpages = [{u'ns': 0, u'pageid': 16007, u'title': u'U ntitled'}] print 'memberpages', memberpages - - - ######## # CREATE PAGE ######## page_template = open("web/page-template.html", "r") page_template = page_template.read() - for member in memberpages: - print member + #print member # download mw work page pageid=member['pageid'] pagetitle=(member['title'].encode('utf-8')) - print pageid workpage_mw = api_page(pageid, 'content') # parse workpage_mw @@ -58,7 +55,7 @@ for member in memberpages: workdict = parse_work(pagetitle, workpage_mw) # create dictionary workpage_mw template for key in workdict.keys(): # convert Extra, Description, Bio to HTML - if key in ['Extra', 'Description', 'Bio']: + if key in ['Extra', 'Description', 'Bio'] and workdict[key]: workdict[key] = pandoc2html( (workdict[key].decode('utf-8')) ) # fill template with dictionary/mw_page values @@ 
-79,7 +76,15 @@ for member in memberpages: newsrc = api_file_url(src) if newsrc: img.set('src', newsrc) - + website = tree.find('.//div[@id="website"]/a') + print 'website', ET.tostring(website) + + if not website.get('href'): + #remove empty .//div[@id="website"]/a + # This can be applied to more fields + website_parent = tree.find('.//div[@id="website"]') + website_parent.remove(website) + # save workpage_html workpage_html = ET.tostring(tree) creator = workdict['Creator'].decode('ascii', 'ignore')