creating correct pages with images: src=full_url

10 years ago · 3192565601
parent 3a79e7bfe8
commit 3192565601
2 changed files with 68 additions and 57 deletions
--- a/mmdc_modules.py
+++ b/mmdc_modules.py
@ -145,29 +145,6 @@ def pandoc(filename, title, creator, date, website, thumbnail, bio, description,
    p1 = subprocess.Popen(args_echo, stdout=subprocess.PIPE)
    p2 = subprocess.Popen(args_pandoc, stdin=p1.stdout, stdout=subprocess.PIPE)
    html = (p2.communicate())[0]
 #    return html
 # pandoc either reads input from stdin or from an input file
 # pandoc DOES NOT convert variables; it has to receive the input from stdin.
 # to create html, convert bio, description and extra one at a time
 # insert them into the HTML template by:
 ## generating html in python and inserting sub-elements?
 ## ??
                               #     pandoc = 'pandoc -s -f mediawiki -t html5 \
 # --template template_article.html \
 # --variable title="{title}" \
 # --variable section="{section}" \
 # --variable topics="{topics}" \
 # --variable issueName="{iname}" \
 # --variable issueNumber="{inum}" \
 # "articles/tmp_content.mw" -o "{articlepath}/{htmlfile}.html"'.format(articlepath=path, title=(pagename).replace("_"," "), section=in_section, topics=in_topic, iname=in_issuename, inum=in_issue, htmlfile=pagename)
 #     subprocess.call(pandoc, shell=True) # saved in tmp_content.html html
 #     html = open('tmp_content.html', 'r') #write mediawiki content to html in tmp_content.html
 #     html = html.read()
 #     return html
 def img_fullurl(parent):
    imgs = parent.findall('.//img')
@ -178,9 +155,41 @@ def img_fullurl(parent):
        print '----- IMG', ET.tostring(img ), src, fullurl
        if fullurl != None:            
            img.set('src', fullurl)
        #        fileurl = api_request(src, endpoint)# find url of file
 def replace_gallery(content):
    gallery_imgs = []
    gallery_found = re.findall(gallery_exp, content)
    content = re.sub(gallery_exp,  '', content)
    for gallery in gallery_found: # in case there is more than 1 <gallery>
        allfiles =re.findall(img_exp, gallery)
        for imgfile in allfiles:
            imgfile = imgfile[1]
            imgsrc = api_file_url(imgfile) # search for original image
            gallery_imgs.append(imgsrc)
            print 'gallery_imgs', gallery_imgs
    # from <gallery>.*</gallery> imgs, return list of img ET elements
    # replace <gallery>.*</gallery> with ''
    return content, gallery_imgs
 def replace_video(content):
    """Scan ``content`` for video templates and build embed URLs for them.

    Each match of the module-level ``video_exp`` pattern is read as a tuple
    whose element 0 is the provider name and element 1 is the video hash.
    'youtube' and 'vimeo' providers (case-insensitive) are turned into embed
    URLs and appended to ``videos``.

    NOTE(review): ``video_exp`` is defined outside this block; presumably it
    captures ``{{provider|hash}}`` templates -- confirm against its
    definition. No ``return`` statement is visible in this view of the
    function, so what callers receive cannot be confirmed from here.
    """
    videos = []
    videos_found = re.findall(video_exp, content)
    for video in videos_found:
        video_provider =  str(video[0])
        video_hash = str(video[1])
        # Stays None when the provider is neither youtube nor vimeo.
        video_src = None
        if (video_provider.lower()) == 'youtube':            
            video_src="https://www.youtube.com/embed/" + video_hash
        elif (video_provider.lower()) == 'vimeo':            
            video_src="https://player.vimeo.com/video/" + video_hash
        if video_src:
            videos.append(video_src)
            # NOTE(review): iframe is built but never used in the visible
            # code; the substitution that would insert it is commented out.
            iframe = "<iframe src='{}' width='600px' height='450px'></iframe>".format(video_src)
 #            content = re.sub(video_exp, '       iframe    ', content)
        else:
            # NOTE(review): re.sub replaces every match of video_exp, so one
            # unrecognised provider strips all templates from content,
            # including recognised youtube/vimeo ones -- confirm intent.
            content = re.sub(video_exp,  '', content)
--- a/prototype_page.py
+++ b/prototype_page.py
@ -2,61 +2,63 @@
 # -*- coding: utf-8 -*-
 ###########
-# Testing downloading and converting mw page content to html
+# prototyping downloading and converting mw page content to html
 ###########
-# OVER VIEW:
+# OVERVIEW:
 # * creating one single html page
 #     * replace {{youtube/vimeo}} with video tags
 #     * replace galleries with rows of images
 # request  all the pages
 # build index
 # build all pages
-import pprint
+import xml.etree.ElementTree as ET
-from mmdc_modules import api_request, api_page, api_thumb_url, pandoc2html, parse_work_page, parse_work
+import html5lib, re, pprint
 from mmdc_modules import api_request, api_page, api_thumb_url, pandoc2html, parse_work, api_file_url, replace_gallery, replace_video
 gallery_exp=re.compile('<gallery>.*?</gallery>')
 file_exp=re.compile('File:(.*?)(?=File:|<\/gallery>)')
 img_exp=re.compile('(File:|Image:)((.*?)\.(gif|jpg|jpeg|png))(?=\||File:|Image:|<\/gallery>)', re.I)
 video_exp=re.compile('\{\{(.*?)\|(.*?)\}\}')
 template = open("web/page-template.html", "r")
 template =  template.read()
 # download
-pageid='16025'#'15965'#Qq   #'15986'Jozeph
+pageid='16025'#'15965'#Qq   #'15986'Jozeph #'16025'Mina
 work = 'Mina'#'User:Joak/graduation/catalog1'
 workpage_mw = api_page(pageid, 'content')
 # parsing workpage_mw
 workdict = parse_work(work, workpage_mw) 
 for key in workdict.keys():
    if key in ['Extra', 'Description', 'Bio']:        
        workdict[key] =  pandoc2html(workdict[key].encode('utf-8'))
 #    print key
 #    print workdict[key]
 #    print '--------------------'
 template =  template.format(title=workdict['Title'], creator=workdict['Creator'], date=workdict['Date'], website=workdict['Website'], thumbnail=workdict['Thumbnail'], bio=workdict['Bio'],description=workdict['Description'], extra=workdict['Extra']  )
 work_filename = 'web/{}-{}-{}.html'.format(workdict['Date'], (workdict['Creator'].encode('ascii', 'ignore')).replace(' ','_'), pageid)
 work_file = open(work_filename, "w")
 work_file.write(template)
 work_file.close()
 #template =  template.read()
-#print(template)
+workpage_html =  template.format(title=workdict['Title'], creator=workdict['Creator'], date=workdict['Date'], website=workdict['Website'], thumbnail=workdict['Thumbnail'], bio=workdict['Bio'],description=workdict['Description'], extra=workdict['Extra']  )
 # Process html
 tree = html5lib.parse(workpage_html, namespaceHTMLElements=False)
 imgs = tree.findall('.//img')
 for img in  imgs:
    src = img.get('src')
    newsrc = api_file_url(src)
    print 'new src', newsrc
    if newsrc:
        img.set('src', newsrc)                    
        #print 'IMG', ET.tostring(img)
-#for section in [extra, description, bio]:
+        
-#   section =  pandoc2html(section.encode('utf-8'))
+workpage_html = ET.tostring(tree)      
-#    print section
+print 'TREE', workpage_html
 #    print '------------'
-#print template
+    
-#for key in workdict.keys():
+# # save 
-#    print key, workdict[key].encode('utf-8')
+work_filename = 'web/{}-{}-{}.html'.format(workdict['Date'], (workdict['Creator'].encode('ascii', 'ignore')).replace(' ','_'), pageid)
-
+work_file = open(work_filename, "w")
-
+work_file.write(workpage_html)
-
+work_file.close()
 #print '----------- html -----'
 #print html_description
 #print html_extra
 #print template