#! /usr/bin/env python
# -*- coding: utf-8 -*-

###########
# Prototype: download MediaWiki (mw) page content and convert it to HTML.
###########
# OVERVIEW:
# * create one single html page per work
# * replace {{youtube/vimeo}} with video tags
# * replace galleries with rows of images
#
# Steps:
# 1. request all the pages
# 2. build index
# 3. build all pages
#
# NOTE: this is Python 2 code (byte-string .decode() calls below). The
# print calls use the single-argument parenthesised form, which produces
# the same output under Python 2 and also parses under Python 3.

import xml.etree.ElementTree as ET
import pprint
import re

import html5lib

from mmdc_modules import (
    api_request,
    api_page,
    api_thumb_url,
    pandoc2html,
    parse_work,
    api_file_url,
    replace_gallery,
    replace_video,
    gallery_exp,
    video_exp,
    api_pagesincategories,
)

########
# QUERY API
########
# NOTE(review): these three settings are not referenced anywhere else in
# this script; confirm nothing else relies on them before removing.
sid = '1234'
useragent = "Mozilla/5.001 (windows; U; NT4.0; en-US; rv:1.0) Gecko/25250101"
endpoint = "http://pzwiki.wdka.nl/mw-mediadesign/api.php?format=json&"

########
# CREATE INDEX
########
# List of dictionaries, one per member page. Example:
# [{u'ns': 0, u'pageid': 15974, u'title': u'Ahhhh'},
#  {u'ns': 0, u'pageid': 16005, u'title': u'Artyom-graduation-work'}]
memberpages = api_pagesincategories('Graduation work', '2015')
print('memberpages %s' % (memberpages,))

########
# CREATE PAGE
########
# Read the template once; the context manager closes the handle right away.
with open("web/page-template.html", "r") as template_file:
    page_template = template_file.read()

# Fields of the parsed work whose content is converted to HTML via pandoc.
html_fields = ('Extra', 'Description', 'Bio')

for member in memberpages:
    print(member)

    # download mw work page
    pageid = member['pageid']
    pagetitle = member['title'].encode('utf-8')
    print(pageid)
    workpage_mw = api_page(pageid, 'content')

    # parse workpage_mw: rewrite gallery and video markup first, then build
    # the dictionary of template values from the page
    workpage_mw = replace_gallery(workpage_mw)
    workpage_mw = replace_video(workpage_mw)
    workdict = parse_work(pagetitle, workpage_mw)

    # convert Extra, Description, Bio to HTML (only the ones that are present)
    for key in html_fields:
        if key in workdict:
            workdict[key] = pandoc2html(workdict[key].decode('utf-8'))

    # fill template with dictionary/mw_page values
    workpage_html = page_template.format(
        title=workdict['Title'],
        creator=workdict['Creator'],
        date=workdict['Date'],
        website=workdict['Website'],
        thumbnail=workdict['Thumbnail'],
        bio=workdict['Bio'],
        description=workdict['Description'],
        extra=workdict['Extra'],
    )

    # process html: point every <img> at the URL returned by api_file_url;
    # the original src is kept when the lookup yields nothing truthy
    tree = html5lib.parse(workpage_html, namespaceHTMLElements=False)
    for img in tree.findall('.//img'):
        newsrc = api_file_url(img.get('src'))
        if newsrc:
            img.set('src', newsrc)

    # save workpage_html
    workpage_html = ET.tostring(tree)
    # Build the file name from the creator with non-ASCII bytes dropped and
    # spaces turned into underscores.
    creator = workdict['Creator'].decode('ascii', 'ignore').replace(' ', '_')
    work_filename = 'web/{}-{}-{}.html'.format(workdict['Date'], creator, pageid)
    # The context manager guarantees the file is closed even if write() fails.
    with open(work_filename, "w") as work_file:
        work_file.write(workpage_html)