diff --git a/mmdc_modules.py b/mmdc_modules.py index 4de5a5c..f04e168 100644 --- a/mmdc_modules.py +++ b/mmdc_modules.py @@ -57,24 +57,24 @@ def write_html_file(html_tree, filename): edited.close() def parse_work(title, content): - workdict = {'Title':title, 'Creator':'', 'Date':'', 'Website':'', 'Thumbnail':'', 'Bio':'', 'Description':'', 'Extra':''} - if re.match('\{\{\Graduation work', content): - template, extra = (re.findall('\{\{Graduation work\n(.*?)\}\}(.*)', content, re.DOTALL))[0] + workdict = {'Title':title, 'Creator':u'', 'Date':u'', 'Website':u'', 'Thumbnail':u'', 'Bio':u'', 'Description':u'', 'Extra':u''} + if re.match(u'\{\{\Graduation work', content): + template, extra = (re.findall(u'\{\{Graduation work\n(.*?)\}\}(.*)', content, re.DOTALL))[0] if extra: - workdict['Extra'] = extra.encode('utf-8') + workdict['Extra'] = extra # template's key/value pair # Note:Extra value is NOT CAPTURED by this regex - keyval = re.findall('\|(.*?)\=(.*?\n)', template, re.DOTALL) + keyval = re.findall(u'\|(.*?)\=(.*?\n)', template, re.DOTALL) for pair in keyval: key = pair[0] val = (pair[1]).replace('\n', '') if 'Creator' in key: - val = val.replace(', ', '') + val = val.replace(u', ', u'') elif 'Thumbnail' in key: val = mw_singelimg_url(site, val)#api_thumb_url(val) elif 'Website' in key: val = urllib.unquote( val) - workdict[key]=val.encode('utf-8') + workdict[key]=val # pprint.pprint(workdict) return workdict @@ -87,6 +87,7 @@ def pandoc2html(mw_content): p1 = subprocess.Popen(args_echo, stdout=subprocess.PIPE) p2 = subprocess.Popen(args_pandoc, stdin=p1.stdout, stdout=subprocess.PIPE) html = (p2.communicate())[0] + html = html.decode("utf-8") return html gallery_exp=re.compile('(.*?)', re.S) diff --git a/mmdc_wiki2web.py b/mmdc_wiki2web.py index 1f50bdd..d82cf59 100755 --- a/mmdc_wiki2web.py +++ b/mmdc_wiki2web.py @@ -34,22 +34,28 @@ index_container = index_tree.find(".//div[@class='isotope']") #maybe id is impor ######## for member in memberpages: workpage_mw = 
mw_page_text(site, member)#CONTENT + # print workpage_mw.encode("utf-8") + # print "** workpage_mw", type(workpage_mw) workpage_mw = replace_gallery(workpage_mw) workpage_mw = replace_video(workpage_mw) workdict = parse_work(member, workpage_mw) # create dictionary w/ page content workpage_imgs = mw_imgsurl(site, member) + + # print "***", type(workpage_mw), workdict + # for key in workdict: + # print type(workdict[key]), key, workdict[key] print ' member', member # only parse pages with Creator, Title, Thumbnail if len(workdict['Creator'])>1 and len(workdict['Title'])>1 and len(workdict['Thumbnail'])>1: #and len(workdict['Description'])>1 for key in workdict.keys(): # convert Extra, Description, Bio to HTML if key in ['Extra', 'Description', 'Bio'] and workdict[key]: - workdict[key] = pandoc2html( (workdict[key].decode('utf-8'))) + workdict[key] = pandoc2html(workdict[key]) elif key in ['Creator']: workdict[key] = workdict[key].replace(',','' ) for key in workdict.keys(): if type(workdict[key]) is unicode: - workdict[key]=workdict[key].encode('utf-8') + workdict[key]=workdict[key] # print workdict, type(workdict['Creator']) # print workdict['Creator']#.decode('utf-8') @@ -60,17 +66,17 @@ for member in memberpages: page_creator = page_tree.find('.//h2[@id="creator"]') page_creator.text=(workdict['Creator'].decode('utf-8')) page_title_date = page_tree.find('.//p[@id="title"]') - page_title_date.text="{} {}".format(workdict['Title'], workdict['Date']) + page_title_date.text=u"{} {}".format(workdict['Title'], workdict['Date']) page_description = page_tree.find('.//div[@id="description"]') - page_description_el = ET.fromstring('
<div>'+"workdict['Description']"+'</div>
') + page_description_el = ET.fromstring(u'
<div>'+workdict['Description']+u'</div>
') page_description.extend(page_description_el) page_bio = page_tree.find('.//div[@id="bio"]') - page_bio_el = ET.fromstring('
<div>'+workdict['Bio']+'</div>
') + page_bio_el = ET.fromstring(u'
<div>'+workdict['Bio']+u'</div>
') page_bio.extend(page_bio_el) page_sortArea_title = page_tree.find('.//div[@id="sortArea"]/p') page_sortArea_title.text =workdict['Title'] page_extra = page_tree.find('.//div[@id="extra"]') - page_extra_el = ET.fromstring('
<div>'+workdict['Extra']+'</div>
') + page_extra_el = ET.fromstring(u'
<div>'+workdict['Extra']+u'</div>
') page_extra.extend(page_extra_el) page_website = page_tree.find('.//p[@class="hightlightSidebar"]/a') page_website.set('href', workdict['Website'])