From 3192565601a0f7103d6dbc3f372a09e709e4276f Mon Sep 17 00:00:00 2001
From: Castro0o
Date: Tue, 28 Apr 2015 22:48:08 +0200
Subject: [PATCH] creating correct pages with images: src=full_url

---
 mmdc_modules.py   | 57 ++++++++++++++++++++++-----------------
 prototype_page.py | 68 ++++++++++++++++++++++++-----------------------
 2 files changed, 68 insertions(+), 57 deletions(-)

diff --git a/mmdc_modules.py b/mmdc_modules.py
index d96e79b..11cee97 100644
--- a/mmdc_modules.py
+++ b/mmdc_modules.py
@@ -145,29 +145,6 @@ def pandoc(filename, title, creator, date, website, thumbnail, bio, description,
     p1 = subprocess.Popen(args_echo, stdout=subprocess.PIPE)
     p2 = subprocess.Popen(args_pandoc, stdin=p1.stdout, stdout=subprocess.PIPE)
     html = (p2.communicate())[0]
-# return html
-
-# pandoc either reades input from stdin or through input file
-# pandoc DOES NOT convert variables; it has to receive the input from stdin.
-# to create html convert: bio, description, extra, of at time
-# insert them into HTML template by:
-## gerating html in python? and insertion sub elements ?
-## ??
-
- # pandoc = 'pandoc -s -f mediawiki -t html5 \
-# --template template_article.html \
-# --variable title="{title}" \
-# --variable section="{section}" \
-# --variable topics="{topics}" \
-# --variable issueName="{iname}" \
-# --variable issueNumber="{inum}" \
-# "articles/tmp_content.mw" -o "{articlepath}/{htmlfile}.html"'.format(articlepath=path, title=(pagename).replace("_"," "), section=in_section, topics=in_topic, iname=in_issuename, inum=in_issue, htmlfile=pagename)
-# subprocess.call(pandoc, shell=True) # saved in tmp_content.html html
-# html = open('tmp_content.html', 'r') #write mediawiki content to html in tmp_content.html
-# html = html.read()
-# return html
-
-
 def img_fullurl(parent):
     imgs = parent.findall('.//img')
@@ -178,9 +155,41 @@ def img_fullurl(parent):
         print '----- IMG', ET.tostring(img ), src, fullurl
         if fullurl != None:
             img.set('src', fullurl)
-        # fileurl = api_request(src, endpoint)# find url of file
+def replace_gallery(content):
+    gallery_imgs = []
+    gallery_found = re.findall(gallery_exp, content)
+    content = re.sub(gallery_exp, '', content)
+    for gallery in gallery_found: # in case there is more than 1
+        allfiles =re.findall(img_exp, gallery)
+        for imgfile in allfiles:
+            imgfile = imgfile[1]
+            imgsrc = api_file_url(imgfile) # search for original image
+            gallery_imgs.append(imgsrc)
+    print 'gallery_imgs', gallery_imgs
+    # from .* imgs, return list of img ET elements
+    # replace .* with ''
+    return content, gallery_imgs
+
+def replace_video(content):
+    videos = []
+    videos_found = re.findall(video_exp, content)
+    for video in videos_found:
+        video_provider = str(video[0])
+        video_hash = str(video[1])
+        video_src = None
+        if (video_provider.lower()) == 'youtube':
+            video_src="https://www.youtube.com/embed/" + video_hash
+        elif (video_provider.lower()) == 'vimeo':
+            video_src="https://player.vimeo.com/video/" + video_hash
+        if video_src:
+            videos.append(video_src)
+            iframe = "".format(video_src)
+# content = re.sub(video_exp, ' iframe ',
content)
+        else:
+            content = re.sub(video_exp, '', content)
+
diff --git a/prototype_page.py b/prototype_page.py
index d25c8d3..d61b989 100755
--- a/prototype_page.py
+++ b/prototype_page.py
@@ -2,61 +2,63 @@
 # -*- coding: utf-8 -*-
 ###########
-# Testing downloading and converting mw page content to html
+# prototyping downloading and converting mw page content to html
 ###########
-# OVER VIEW:
+# OVERVIEW:
 # * creating one single html page
+# * replace {{youtube/vimeo}} with video tags
+# * replace galleries with rows of images
 # request all the pages
 # build index
 # build all pages
-import pprint
-from mmdc_modules import api_request, api_page, api_thumb_url, pandoc2html, parse_work_page, parse_work
+import xml.etree.ElementTree as ET
+import html5lib, re, pprint
+from mmdc_modules import api_request, api_page, api_thumb_url, pandoc2html, parse_work, api_file_url, replace_gallery, replace_video
+
+gallery_exp=re.compile('.*?')
+file_exp=re.compile('File:(.*?)(?=File:|<\/gallery>)')
+img_exp=re.compile('(File:|Image:)((.*?)\.(gif|jpg|jpeg|png))(?=\||File:|Image:|<\/gallery>)', re.I)
+video_exp=re.compile('\{\{(.*?)\|(.*?)\}\}')
+
 template = open("web/page-template.html", "r")
 template = template.read()
 # download
-pageid='16025'#'15965'#Qq #'15986'Jozeph
+pageid='16025'#'15965'#Qq #'15986'Jozeph #'16025'Mina
 work = 'Mina'#'User:Joak/graduation/catalog1'
-
 workpage_mw = api_page(pageid, 'content')
+
 # parsing workpage_mw
 workdict = parse_work(work, workpage_mw)
 for key in workdict.keys():
     if key in ['Extra', 'Description', 'Bio']:
         workdict[key] = pandoc2html(workdict[key].encode('utf-8'))
-# print key
-# print workdict[key]
-# print '--------------------'
-
-template = template.format(title=workdict['Title'], creator=workdict['Creator'], date=workdict['Date'], website=workdict['Website'], thumbnail=workdict['Thumbnail'], bio=workdict['Bio'],description=workdict['Description'], extra=workdict['Extra'] )
-work_filename = 'web/{}-{}-{}.html'.format(workdict['Date'],
(workdict['Creator'].encode('ascii', 'ignore')).replace(' ','_'), pageid)
-work_file = open(work_filename, "w")
-work_file.write(template)
-work_file.close()
-#template = template.read()
-#print(template)
+workpage_html = template.format(title=workdict['Title'], creator=workdict['Creator'], date=workdict['Date'], website=workdict['Website'], thumbnail=workdict['Thumbnail'], bio=workdict['Bio'],description=workdict['Description'], extra=workdict['Extra'] )
+# Process html
+tree = html5lib.parse(workpage_html, namespaceHTMLElements=False)
+imgs = tree.findall('.//img')
+for img in imgs:
+    src = img.get('src')
+    newsrc = api_file_url(src)
+    print 'new src', newsrc
+    if newsrc:
+        img.set('src', newsrc)
+    #print 'IMG', ET.tostring(img)
-#for section in [extra, description, bio]:
-# section = pandoc2html(section.encode('utf-8'))
-# print section
-# print '------------'
+
+workpage_html = ET.tostring(tree)
+print 'TREE', workpage_html
-#print template
-#for key in workdict.keys():
-# print key, workdict[key].encode('utf-8')
-
-
-
-
-
-#print '----------- html -----'
-#print html_description
-#print html_extra
+
+# # save
+work_filename = 'web/{}-{}-{}.html'.format(workdict['Date'], (workdict['Creator'].encode('ascii', 'ignore')).replace(' ','_'), pageid)
+work_file = open(work_filename, "w")
+work_file.write(workpage_html)
+work_file.close()
-#print template