page content: grabbed by mwclient

master
Castro0o 10 years ago
parent 72a840f068
commit 7edbdb60bd

@ -28,17 +28,14 @@ def api_request(action, pagename): #get page: content, metadata, images, imageif
page_content = json_dic.get(page_id) page_content = json_dic.get(page_id)
return page_content return page_content
def api_page(title, query):
    """Fetch one aspect of a wiki page through ``api_request``.

    title -- page title, passed to ``api_request`` together with the
             query-string template.
    query -- selects what is returned:
             'content'  -> the ``'*'`` field of the first entry in the
                           response's ``'revisions'`` list;
             'metadata' -> the ``prop=info`` response, unchanged.

    Raises ValueError for any other ``query`` value. Previously such a call
    reached ``return response`` with ``response`` never assigned and failed
    with an UnboundLocalError.
    """
    if query == 'content':
        api_response = api_request('action=query&titles={}&prop=revisions&rvprop=content', title)
        # Only the first listed revision is used.
        return api_response.get('revisions')[0]['*']
    if query == 'metadata':
        return api_request('action=query&titles={}&prop=info', title)
    raise ValueError(
        "unknown query {!r}: expected 'content' or 'metadata'".format(query))
############################## ##############################
# CATEGORIES, PAGES AND IMAGES # CATEGORIES, PAGES AND IMAGES
############################## ##############################
def mw_page_text(site, page):
    """Return the text of the page called ``page``.

    site -- object whose ``Pages`` attribute is indexed by page name.
    page -- name used as the key into ``site.Pages``.

    The value returned is whatever the looked-up page's ``text()`` gives.
    """
    wiki_page = site.Pages[page]
    return wiki_page.text()
def mw_cats(site, args): def mw_cats(site, args):
last_names = None last_names = None
for cats in args.category: for cats in args.category:

@ -3,7 +3,7 @@
import xml.etree.ElementTree as ET import xml.etree.ElementTree as ET
import html5lib, pprint import html5lib, pprint
from mmdc_modules import api_page, pandoc2html, parse_work, replace_gallery, replace_video, index_addwork, write_html_file, mw_cats, mw_imgsurl, site from mmdc_modules import pandoc2html, parse_work, replace_gallery, replace_video, index_addwork, write_html_file, mw_cats, mw_imgsurl, site, mw_page_text
from argparse import ArgumentParser from argparse import ArgumentParser
from mwclient import Site from mwclient import Site
@ -15,14 +15,7 @@ args = p.parse_args()
print args print args
######## ########
# QUERY API # Index
########
sid = '1234'
useragent = "Mozilla/5.001 (windows; U; NT4.0; en-US; rv:1.0) Gecko/25250101"
endpoint = "http://pzwiki.wdka.nl/mw-mediadesign/api.php?format=json&"
########
# CREATE INDEX
######## ########
memberpages=mw_cats(site, args) memberpages=mw_cats(site, args)
#memberpages['Ctrl-F Reader','As We Speak'] #memberpages['Ctrl-F Reader','As We Speak']
@ -37,20 +30,15 @@ index_tree = html5lib.parse(index_file, namespaceHTMLElements=False)
index_container = index_tree.find(".//div[@class='isotope']") #maybe id is important, to destinguish it index_container = index_tree.find(".//div[@class='isotope']") #maybe id is important, to destinguish it
######## ########
# CREATE PAGE # Create Page
######## ########
for member in memberpages: for member in memberpages:
print ' member', member print ' member', member
workpage_mw = mw_page_text(site, member)#CONTENT
# download mw work page
# pageid=member['pageid']
# pagetitle=(member['title'].encode('utf-8'))
workpage_mw = api_page(member, 'content')
workpage_mw = replace_gallery(workpage_mw) workpage_mw = replace_gallery(workpage_mw)
workpage_mw = replace_video(workpage_mw) workpage_mw = replace_video(workpage_mw)
workdict = parse_work(member, workpage_mw) # create dictionary workpage_mw template workdict = parse_work(member, workpage_mw) # create dictionary w/ page content
workpage_imgs = mw_imgsurl(site, member) workpage_imgs = mw_imgsurl(site, member)
print 'WORKPAGE_IMGS', workpage_imgs
# only parse pages with Creator, Title, Thumbnail # only parse pages with Creator, Title, Thumbnail
if len(workdict['Creator'])>1 and len(workdict['Title'])>1 and len(workdict['Description'])>1 and len(workdict['Thumbnail'])>1: if len(workdict['Creator'])>1 and len(workdict['Title'])>1 and len(workdict['Description'])>1 and len(workdict['Thumbnail'])>1:
@ -58,12 +46,9 @@ for member in memberpages:
if key in ['Extra', 'Description', 'Bio'] and workdict[key]: if key in ['Extra', 'Description', 'Bio'] and workdict[key]:
workdict[key] = pandoc2html( (workdict[key].decode('utf-8'))) workdict[key] = pandoc2html( (workdict[key].decode('utf-8')))
elif key in ['Creator']: elif key in ['Creator']:
workdict[key] = workdict[key].replace(',','' ) #remove comma workdict[key] = workdict[key].replace(',','' )
#replace empty dict values with ' ' # to avoid empty tags
for key in workdict.keys():
if workdict[key] is '':# and key is not 'Thumbnail':
workdict[key] = ' '
for key in workdict.keys():
if type(workdict[key]) is unicode: if type(workdict[key]) is unicode:
workdict[key]=workdict[key].encode('utf-8') workdict[key]=workdict[key].encode('utf-8')
@ -98,16 +83,12 @@ for member in memberpages:
for img in imgs: for img in imgs:
img_class = img.get('class') img_class = img.get('class')
if img_class != 'template': if img_class != 'template':
print 'img_class',img_class
src =unicode(img.get('src')) src =unicode(img.get('src'))
print src print src
for pair in workpage_imgs: for pair in workpage_imgs:
if src.replace("_", " ") in pair[0]: #if img in html match img in workpage_imgs if src.replace("_", " ") in pair[0]:#if img in html matchs img in workpage_imgs
print 'FOUND IMG', pair
img.set('src', pair[1]) img.set('src', pair[1])
# newsrc = api_file_url(src) MOVE FULL URl OPERATION TO MW CONTENT
# if newsrc:
# img.set('src', newsrc)
# save work page # save work page
creator = workdict['Creator']#.decode('ascii', 'ignore') creator = workdict['Creator']#.decode('ascii', 'ignore')
creator = creator.replace(' ','_') creator = creator.replace(' ','_')
@ -115,9 +96,8 @@ for member in memberpages:
write_html_file(page_tree, work_filename) write_html_file(page_tree, work_filename)
####### #######
# INDEX # Insert Work to Index
####### #######
# insert work to index
index_addwork( parent=index_container, index_addwork( parent=index_container,
workid=key, workid=key,
href=work_filename.replace('web/',''), href=work_filename.replace('web/',''),

@ -34,7 +34,7 @@
<div id="bio"><p>Henk-Jelle de Groot is a Rotterdam-based sound designer and musician. After graduating with an Audio / Visual design bachelor Henk-Jelle set up a sound studio in Rotterdam to work in the Audio / Visual industry. After 7 years of working he returned to the Piet Zwart Institute to graduate in a Master of comm design something something. In addition to working in the Audio / Visual industry, he is a musician and builder of electronic instruments.</p> <div id="bio"><p>Henk-Jelle de Groot is a Rotterdam-based sound designer and musician. After graduating with an Audio / Visual design bachelor Henk-Jelle set up a sound studio in Rotterdam to work in the Audio / Visual industry. After 7 years of working he returned to the Piet Zwart Institute to graduate in a Master of comm design something something. In addition to working in the Audio / Visual industry, he is a musician and builder of electronic instruments.</p>
</div> </div>
<p class="hightlightSidebar"><a href=" " target="_blank"> </a></p><!-- {website} --> <p class="hightlightSidebar"><a href="" target="_blank"></a></p><!-- {website} -->
<!-- // --> <!-- // -->

@ -15,7 +15,7 @@
<div id="sidebarInner"> <div id="sidebarInner">
<div id="sideBarDesc"> <div id="sideBarDesc">
<div id="sideBarDescInfo"> <div id="sideBarDescInfo">
<a class="hoverBackA" href="index.html"><img src="./img/arrowBack.svg"></a><p>Tempted by Tomorrow</p> <a class="hoverBackA" href="index.html"><img class="template" src="./img/arrowBack.svg"></a><p>Tempted by Tomorrow</p>
</div> </div>
<div id="sideBarDescInner"> <div id="sideBarDescInner">
@ -40,17 +40,17 @@
</div> </div>
</div> </div>
<div id="logoWrap"><img id="logo" src="./img/black_PZI_logo_p.svg"></div> <div id="logoWrap"><img class="template" id="logo" src="./img/black_PZI_logo_p.svg"></div>
</div> </div>
<div class="zwartArea zwartAreaWhite sidebarBorderLeft" id="section02"> <div class="zwartArea zwartAreaWhite sidebarBorderLeft" id="section02">
<div class="fixedsticky" id="filter" style="top:0;"> <div class="fixedsticky" id="filter" style="top:0;">
<div class="themes" id="sortArea"> <div class="themes" id="sortArea">
<a class="hoverBackB" href="index.html"> <a class="hoverBackB" href="index.html">
<img src="./img/arrowBack.svg"> <img class="template" src="./img/arrowBack.svg">
</a> </a>
<p>User:Joak/graduation/catalog1</p><!--{title}--> <p>User:Joak/graduation/catalog1</p><!--{title}-->
<a class="closeSidebar"><img src="./img/arrowUpW.svg"></a> <a class="closeSidebar"><img class="template" src="./img/arrowUpW.svg"></a>
</div> </div>
</div> </div>

Loading…
Cancel
Save