From 3192565601a0f7103d6dbc3f372a09e709e4276f Mon Sep 17 00:00:00 2001
From: Castro0o <andre@andrecastro.info>
Date: Tue, 28 Apr 2015 22:48:08 +0200
Subject: [PATCH] creating correct pages with images: src=full_url

---
 mmdc_modules.py   | 57 ++++++++++++++++++++++-----------------
 prototype_page.py | 68 ++++++++++++++++++++++++-----------------------
 2 files changed, 68 insertions(+), 57 deletions(-)
diff --git a/mmdc_modules.py b/mmdc_modules.py
index d96e79b..11cee97 100644
--- a/mmdc_modules.py
+++ b/mmdc_modules.py
@@ -145,29 +145,6 @@ def pandoc(filename, title, creator, date, website, thumbnail, bio, description,
     p1 = subprocess.Popen(args_echo, stdout=subprocess.PIPE)
     p2 = subprocess.Popen(args_pandoc, stdin=p1.stdout, stdout=subprocess.PIPE)
     html = (p2.communicate())[0]
-#    return html
-
-# pandoc either reades input from stdin or through input file
-# pandoc DOES NOT convert variables; it has to receive the input from stdin.
-# to create html convert: bio, description, extra, of at time
-# insert them into HTML template by:
-## gerating html in python? and insertion sub elements ?
-## ??
-                               
-                               #     pandoc = 'pandoc -s -f mediawiki -t html5 \
-# --template template_article.html \
-# --variable title="{title}" \
-# --variable section="{section}" \
-# --variable topics="{topics}" \
-# --variable issueName="{iname}" \
-# --variable issueNumber="{inum}" \
-# "articles/tmp_content.mw" -o "{articlepath}/{htmlfile}.html"'.format(articlepath=path, title=(pagename).replace("_"," "), section=in_section, topics=in_topic, iname=in_issuename, inum=in_issue, htmlfile=pagename)
-#     subprocess.call(pandoc, shell=True) # saved in tmp_content.html html
-#     html = open('tmp_content.html', 'r') #write mediawiki content to html in tmp_content.html
-#     html = html.read()
-#     return html
-
-
     
 def img_fullurl(parent):
     imgs = parent.findall('.//img')
@@ -178,9 +155,41 @@ def img_fullurl(parent):
         print '----- IMG', ET.tostring(img ), src, fullurl
         if fullurl != None:            
             img.set('src', fullurl)
-
         #        fileurl = api_request(src, endpoint)# find url of file
 
+def replace_gallery(content):
+    """Strip gallery markup from content and look up the files it listed.
+
+    Returns (content, gallery_imgs): content with every gallery_exp match
+    removed, and the api_file_url() result for each img_exp match inside them.
+    """
+    galleries = re.findall(gallery_exp, content)  # there may be more than one
+    stripped = re.sub(gallery_exp,  '', content)
+    gallery_imgs = []
+    for found in (re.findall(img_exp, block) for block in galleries):
+        for groups in found:
+            gallery_imgs.append(api_file_url(groups[1]))  # look up the original image
+            print 'gallery_imgs', gallery_imgs
+    return stripped, gallery_imgs
+
+def replace_video(content):
+    """Turn video templates matched by video_exp into <iframe> embeds.
+
+    Returns (content, videos): the rewritten text and the list of embed URLs.
+    A template whose provider is neither youtube nor vimeo is removed.
+    """
+    players = {'youtube': "https://www.youtube.com/embed/",
+               'vimeo': "https://player.vimeo.com/video/"}
+    videos = []
+    def embed(match):  # re.sub calls this per match, so each template is handled on its own
+        player = players.get(match.group(1).lower())  # group 1: provider, group 2: video hash
+        if player is None:
+            return ''
+        videos.append(player + match.group(2))
+        return "<iframe src='{}' width='600px' height='450px'></iframe>".format(videos[-1])
+    content = re.sub(video_exp, embed, content)  # NOTE(review): video_exp is not defined in this hunk - confirm it exists in mmdc_modules
+    return content, videos
+
 
 
 
diff --git a/prototype_page.py b/prototype_page.py
index d25c8d3..d61b989 100755
--- a/prototype_page.py
+++ b/prototype_page.py
@@ -2,61 +2,63 @@
 # -*- coding: utf-8 -*-
 
 ###########
-# Testing downloading and converting mw page content to html
+# prototyping downloading and converting mw page content to html
 ###########
 
-# OVER VIEW:
+# OVERVIEW:
 # * creating one single html page
+#     * replace {{youtube/vimeo}} with video tags
+#     * replace galleries with rows of images
 # request  all the pages
 # build index
 # build all pages
 
-import pprint
-from mmdc_modules import api_request, api_page, api_thumb_url, pandoc2html, parse_work_page, parse_work
+import xml.etree.ElementTree as ET
+import html5lib, re, pprint
+from mmdc_modules import api_request, api_page, api_thumb_url, pandoc2html, parse_work, api_file_url, replace_gallery, replace_video
+
+gallery_exp = re.compile(r'<gallery>.*?</gallery>')  # one gallery block; no DOTALL, so '.' stops at a newline
+file_exp = re.compile(r'File:(.*?)(?=File:|<\/gallery>)')  # text after File: up to the next File: or </gallery>
+img_exp = re.compile(r'(File:|Image:)((.*?)\.(gif|jpg|jpeg|png))(?=\||File:|Image:|<\/gallery>)', re.I)  # group 2: name.ext
+video_exp = re.compile(r'\{\{(.*?)\|(.*?)\}\}')  # {{first|second}} template; raw strings keep the backslashes literal
+
 
 template = open("web/page-template.html", "r")
 template =  template.read()
 
 # download
-pageid='16025'#'15965'#Qq   #'15986'Jozeph
+pageid='16025'#'15965'#Qq   #'15986'Jozeph #'16025'Mina
 work = 'Mina'#'User:Joak/graduation/catalog1'
-
 workpage_mw = api_page(pageid, 'content')
+
 # parsing workpage_mw
 workdict = parse_work(work, workpage_mw) 
 for key in workdict.keys():
     if key in ['Extra', 'Description', 'Bio']:        
         workdict[key] =  pandoc2html(workdict[key].encode('utf-8'))
-#    print key
-#    print workdict[key]
-#    print '--------------------'
-
-template =  template.format(title=workdict['Title'], creator=workdict['Creator'], date=workdict['Date'], website=workdict['Website'], thumbnail=workdict['Thumbnail'], bio=workdict['Bio'],description=workdict['Description'], extra=workdict['Extra']  )
-work_filename = 'web/{}-{}-{}.html'.format(workdict['Date'], (workdict['Creator'].encode('ascii', 'ignore')).replace(' ','_'), pageid)
-work_file = open(work_filename, "w")
-work_file.write(template)
-work_file.close()
-#template =  template.read()
 
-#print(template)
+workpage_html =  template.format(title=workdict['Title'], creator=workdict['Creator'], date=workdict['Date'], website=workdict['Website'], thumbnail=workdict['Thumbnail'], bio=workdict['Bio'],description=workdict['Description'], extra=workdict['Extra']  )
 
+# Process html: parse the filled-in template and re-point each <img> src via api_file_url()
+tree = html5lib.parse(workpage_html, namespaceHTMLElements=False)
+imgs = tree.findall('.//img')
+for img in  imgs:
+    src = img.get('src')
+    newsrc = api_file_url(src)
+    print 'new src', newsrc
+    if newsrc:  # a falsy lookup result leaves the original src in place
+        img.set('src', newsrc)                    
+        #print 'IMG', ET.tostring(img)
 
 
-#for section in [extra, description, bio]:
-#   section =  pandoc2html(section.encode('utf-8'))
-#    print section
-#    print '------------'
+        
+workpage_html = ET.tostring(tree)      
+print 'TREE', workpage_html
 
-#print template
-#for key in workdict.keys():
-#    print key, workdict[key].encode('utf-8')
-
-
-
-
-
-#print '----------- html -----'
-#print html_description
-#print html_extra
+    
+# Save the processed page as web/<date>-<creator>-<pageid>.html (creator: non-ASCII dropped, spaces -> '_')
+work_filename = 'web/{}-{}-{}.html'.format(workdict['Date'], (workdict['Creator'].encode('ascii', 'ignore')).replace(' ','_'), pageid)
+# 'with' closes the file even when the write raises
+with open(work_filename, "w") as work_file:
+    work_file.write(workpage_html)
 
-#print template