|
|
@ -2,61 +2,63 @@
|
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
|
|
|
|
|
|
|
|
###########
|
|
|
|
###########
|
|
|
|
# Testing downloading and converting mw page content to html
|
|
|
|
# prototyping downloading and converting mw page content to html
|
|
|
|
###########
|
|
|
|
###########
|
|
|
|
|
|
|
|
|
|
|
|
# OVER VIEW:
|
|
|
|
# OVERVIEW:
|
|
|
|
# * creating one single html page
|
|
|
|
# * creating one single html page
|
|
|
|
|
|
|
|
# * replace {{youtube/vimeo}} with video tags
|
|
|
|
|
|
|
|
# * replace galleries with rows of images
|
|
|
|
# request all the pages
|
|
|
|
# request all the pages
|
|
|
|
# build index
|
|
|
|
# build index
|
|
|
|
# build all pages
|
|
|
|
# build all pages
|
|
|
|
|
|
|
|
|
|
|
|
import pprint
|
|
|
|
import xml.etree.ElementTree as ET
|
|
|
|
from mmdc_modules import api_request, api_page, api_thumb_url, pandoc2html, parse_work_page, parse_work
|
|
|
|
import html5lib, re, pprint
|
|
|
|
|
|
|
|
from mmdc_modules import api_request, api_page, api_thumb_url, pandoc2html, parse_work, api_file_url, replace_gallery, replace_video
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
gallery_exp=re.compile('<gallery>.*?</gallery>')
|
|
|
|
|
|
|
|
file_exp=re.compile('File:(.*?)(?=File:|<\/gallery>)')
|
|
|
|
|
|
|
|
img_exp=re.compile('(File:|Image:)((.*?)\.(gif|jpg|jpeg|png))(?=\||File:|Image:|<\/gallery>)', re.I)
|
|
|
|
|
|
|
|
video_exp=re.compile('\{\{(.*?)\|(.*?)\}\}')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
template = open("web/page-template.html", "r")
|
|
|
|
template = open("web/page-template.html", "r")
|
|
|
|
template = template.read()
|
|
|
|
template = template.read()
|
|
|
|
|
|
|
|
|
|
|
|
# download
|
|
|
|
# download
|
|
|
|
pageid='16025'#'15965'#Qq #'15986'Jozeph
|
|
|
|
pageid='16025'#'15965'#Qq #'15986'Jozeph #'16025'Mina
|
|
|
|
work = 'Mina'#'User:Joak/graduation/catalog1'
|
|
|
|
work = 'Mina'#'User:Joak/graduation/catalog1'
|
|
|
|
|
|
|
|
|
|
|
|
workpage_mw = api_page(pageid, 'content')
|
|
|
|
workpage_mw = api_page(pageid, 'content')
|
|
|
|
|
|
|
|
|
|
|
|
# parsing workpage_mw
|
|
|
|
# parsing workpage_mw
|
|
|
|
workdict = parse_work(work, workpage_mw)
|
|
|
|
workdict = parse_work(work, workpage_mw)
|
|
|
|
for key in workdict.keys():
|
|
|
|
for key in workdict.keys():
|
|
|
|
if key in ['Extra', 'Description', 'Bio']:
|
|
|
|
if key in ['Extra', 'Description', 'Bio']:
|
|
|
|
workdict[key] = pandoc2html(workdict[key].encode('utf-8'))
|
|
|
|
workdict[key] = pandoc2html(workdict[key].encode('utf-8'))
|
|
|
|
# print key
|
|
|
|
|
|
|
|
# print workdict[key]
|
|
|
|
|
|
|
|
# print '--------------------'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
template = template.format(title=workdict['Title'], creator=workdict['Creator'], date=workdict['Date'], website=workdict['Website'], thumbnail=workdict['Thumbnail'], bio=workdict['Bio'],description=workdict['Description'], extra=workdict['Extra'] )
|
|
|
|
|
|
|
|
work_filename = 'web/{}-{}-{}.html'.format(workdict['Date'], (workdict['Creator'].encode('ascii', 'ignore')).replace(' ','_'), pageid)
|
|
|
|
|
|
|
|
work_file = open(work_filename, "w")
|
|
|
|
|
|
|
|
work_file.write(template)
|
|
|
|
|
|
|
|
work_file.close()
|
|
|
|
|
|
|
|
#template = template.read()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#print(template)
|
|
|
|
workpage_html = template.format(title=workdict['Title'], creator=workdict['Creator'], date=workdict['Date'], website=workdict['Website'], thumbnail=workdict['Thumbnail'], bio=workdict['Bio'],description=workdict['Description'], extra=workdict['Extra'] )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Process html
|
|
|
|
|
|
|
|
tree = html5lib.parse(workpage_html, namespaceHTMLElements=False)
|
|
|
|
|
|
|
|
imgs = tree.findall('.//img')
|
|
|
|
|
|
|
|
for img in imgs:
|
|
|
|
|
|
|
|
src = img.get('src')
|
|
|
|
|
|
|
|
newsrc = api_file_url(src)
|
|
|
|
|
|
|
|
print 'new src', newsrc
|
|
|
|
|
|
|
|
if newsrc:
|
|
|
|
|
|
|
|
img.set('src', newsrc)
|
|
|
|
|
|
|
|
#print 'IMG', ET.tostring(img)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#for section in [extra, description, bio]:
|
|
|
|
|
|
|
|
# section = pandoc2html(section.encode('utf-8'))
|
|
|
|
workpage_html = ET.tostring(tree)
|
|
|
|
# print section
|
|
|
|
print 'TREE', workpage_html
|
|
|
|
# print '------------'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#print template
|
|
|
|
|
|
|
|
#for key in workdict.keys():
|
|
|
|
# # save
|
|
|
|
# print key, workdict[key].encode('utf-8')
|
|
|
|
work_filename = 'web/{}-{}-{}.html'.format(workdict['Date'], (workdict['Creator'].encode('ascii', 'ignore')).replace(' ','_'), pageid)
|
|
|
|
|
|
|
|
work_file = open(work_filename, "w")
|
|
|
|
|
|
|
|
work_file.write(workpage_html)
|
|
|
|
|
|
|
|
work_file.close()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#print '----------- html -----'
|
|
|
|
|
|
|
|
#print html_description
|
|
|
|
|
|
|
|
#print html_extra
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#print template
|
|
|
|
|
|
|
|