page content: grabbed by mwclient

master
Castro0o 10 years ago
parent 72a840f068
commit 7edbdb60bd

@ -28,17 +28,14 @@ def api_request(action, pagename): #get page: content, metadata, images, imageif
page_content = json_dic.get(page_id)
return page_content
def api_page(title, query):
    """Fetch one aspect of a wiki page through api_request.

    title -- page title, handed to api_request as the page name.
    query -- 'content' to get the content of the first revision returned,
             'metadata' to get the result of the `prop=info` request.

    Returns the '*' field of the first entry in the 'revisions' list for
    'content', or whatever api_request returns for 'metadata'.
    Raises ValueError for any other query value; previously such a call
    reached `return response` with `response` never assigned and failed
    with an UnboundLocalError.
    """
    if query == 'content':
        api_response = api_request('action=query&titles={}&prop=revisions&rvprop=content', title)
        # the revision content sits under the '*' key of the first revision
        response = api_response.get('revisions')[0]['*']
    elif query == 'metadata':
        response = api_request('action=query&titles={}&prop=info', title)
    else:
        raise ValueError(
            "unknown query {!r}: expected 'content' or 'metadata'".format(query))
    return response
##############################
# CATEGORIES, PAGES AND IMAGES
##############################
def mw_page_text(site, page):
    """Return the text of the wiki page named `page`.

    Looks the name up in `site.Pages` and returns the result of calling
    `.text()` on the entry found there.
    """
    return site.Pages[page].text()
def mw_cats(site, args):
last_names = None
for cats in args.category:

@ -3,7 +3,7 @@
import xml.etree.ElementTree as ET
import html5lib, pprint
from mmdc_modules import api_page, pandoc2html, parse_work, replace_gallery, replace_video, index_addwork, write_html_file, mw_cats, mw_imgsurl, site
from mmdc_modules import pandoc2html, parse_work, replace_gallery, replace_video, index_addwork, write_html_file, mw_cats, mw_imgsurl, site, mw_page_text
from argparse import ArgumentParser
from mwclient import Site
@ -15,14 +15,7 @@ args = p.parse_args()
print args
########
# QUERY API
########
sid = '1234'
useragent = "Mozilla/5.001 (windows; U; NT4.0; en-US; rv:1.0) Gecko/25250101"
endpoint = "http://pzwiki.wdka.nl/mw-mediadesign/api.php?format=json&"
########
# CREATE INDEX
# Index
########
memberpages=mw_cats(site, args)
#memberpages['Ctrl-F Reader','As We Speak']
@ -37,20 +30,15 @@ index_tree = html5lib.parse(index_file, namespaceHTMLElements=False)
# Find the <div class="isotope"> element in the parsed index tree.
index_container = index_tree.find(".//div[@class='isotope']") #maybe the id is important, to distinguish it
########
# CREATE PAGE
# Create Page
########
for member in memberpages:
print ' member', member
# download mw work page
# pageid=member['pageid']
# pagetitle=(member['title'].encode('utf-8'))
workpage_mw = api_page(member, 'content')
workpage_mw = mw_page_text(site, member)#CONTENT
workpage_mw = replace_gallery(workpage_mw)
workpage_mw = replace_video(workpage_mw)
workdict = parse_work(member, workpage_mw) # create dictionary workpage_mw template
workdict = parse_work(member, workpage_mw) # create dictionary w/ page content
workpage_imgs = mw_imgsurl(site, member)
print 'WORKPAGE_IMGS', workpage_imgs
# only parse pages with Creator, Title, Thumbnail
if len(workdict['Creator'])>1 and len(workdict['Title'])>1 and len(workdict['Description'])>1 and len(workdict['Thumbnail'])>1:
@ -58,12 +46,9 @@ for member in memberpages:
if key in ['Extra', 'Description', 'Bio'] and workdict[key]:
workdict[key] = pandoc2html( (workdict[key].decode('utf-8')))
elif key in ['Creator']:
workdict[key] = workdict[key].replace(',','' ) #remove comma
#replace empty dict values with ' ' # to avoid empty tags
for key in workdict.keys():
if workdict[key] is '':# and key is not 'Thumbnail':
workdict[key] = ' '
workdict[key] = workdict[key].replace(',','' )
for key in workdict.keys():
if type(workdict[key]) is unicode:
workdict[key]=workdict[key].encode('utf-8')
@ -98,16 +83,12 @@ for member in memberpages:
for img in imgs:
img_class = img.get('class')
if img_class != 'template':
print 'img_class',img_class
src =unicode(img.get('src'))
print src
for pair in workpage_imgs:
if src.replace("_", " ") in pair[0]: #if the img in the html matches an img in workpage_imgs
print 'FOUND IMG', pair
if src.replace("_", " ") in pair[0]:#if the img in the html matches an img in workpage_imgs
img.set('src', pair[1])
# newsrc = api_file_url(src) MOVE FULL URl OPERATION TO MW CONTENT
# if newsrc:
# img.set('src', newsrc)
# save work page
creator = workdict['Creator']#.decode('ascii', 'ignore')
creator = creator.replace(' ','_')
@ -115,9 +96,8 @@ for member in memberpages:
write_html_file(page_tree, work_filename)
#######
# INDEX
# Insert Work to Index
#######
# insert work to index
index_addwork( parent=index_container,
workid=key,
href=work_filename.replace('web/',''),

@ -15,7 +15,7 @@
<div id="sidebarInner">
<div id="sideBarDesc">
<div id="sideBarDescInfo">
<a class="hoverBackA" href="index.html"><img src="./img/arrowBack.svg"></a><p>Tempted by Tomorrow</p>
<a class="hoverBackA" href="index.html"><img class="template" src="./img/arrowBack.svg"></a><p>Tempted by Tomorrow</p>
</div>
<div id="sideBarDescInner">
@ -40,17 +40,17 @@
</div>
</div>
<div id="logoWrap"><img id="logo" src="./img/black_PZI_logo_p.svg"></div>
<div id="logoWrap"><img class="template" id="logo" src="./img/black_PZI_logo_p.svg"></div>
</div>
<div class="zwartArea zwartAreaWhite sidebarBorderLeft" id="section02">
<div class="fixedsticky" id="filter" style="top:0;">
<div class="themes" id="sortArea">
<a class="hoverBackB" href="index.html">
<img src="./img/arrowBack.svg">
<img class="template" src="./img/arrowBack.svg">
</a>
<p>User:Joak/graduation/catalog1</p><!--{title}-->
<a class="closeSidebar"><img src="./img/arrowUpW.svg"></a>
<a class="closeSidebar"><img class="template" src="./img/arrowUpW.svg"></a>
</div>
</div>

Loading…
Cancel
Save