page content: grabbed by mwclient

10 years ago · 7edbdb60bd
parent 72a840f068
commit 7edbdb60bd
4 changed files with 21 additions and 44 deletions
--- a/mmdc_modules.py
+++ b/mmdc_modules.py
@@ -28,17 +28,14 @@ def api_request(action, pagename): #get page: content, metadata, images, imageif
    page_content = json_dic.get(page_id)
    return page_content

-def api_page(title, query):
-    if query == 'content':
-        api_response = api_request('action=query&titles={}&prop=revisions&rvprop=content', title)
-        response = ((api_response.get('revisions'))[0])['*']
-    elif query == 'metadata':
-        response = api_request('action=query&titles={}&prop=info', title)
-    return response
-
 ##############################
 # CATEGORIES, PAGES AND IMAGES
 ##############################
+def mw_page_text(site, page):
+    page = site.Pages[page]
+    text = page.text()
+    return text
+
 def mw_cats(site, args):
    last_names = None
    for cats in args.category:
--- a/mmdc_wiki2web.py
+++ b/mmdc_wiki2web.py
@@ -3,7 +3,7 @@

 import xml.etree.ElementTree as ET
 import html5lib, pprint
-from mmdc_modules import api_page, pandoc2html, parse_work, replace_gallery, replace_video, index_addwork, write_html_file, mw_cats, mw_imgsurl, site
+from mmdc_modules import pandoc2html, parse_work, replace_gallery, replace_video, index_addwork, write_html_file, mw_cats, mw_imgsurl, site, mw_page_text
 from argparse import ArgumentParser
 from mwclient import Site

@@ -15,14 +15,7 @@ args = p.parse_args()
 print args

 ########
-# QUERY API
-########
-sid = '1234'
-useragent = "Mozilla/5.001 (windows; U; NT4.0; en-US; rv:1.0) Gecko/25250101"
-endpoint = "http://pzwiki.wdka.nl/mw-mediadesign/api.php?format=json&"
-
-########
-# CREATE INDEX
+# Index
 ########
 memberpages=mw_cats(site, args)
 #memberpages['Ctrl-F Reader','As We Speak']
@@ -37,20 +30,15 @@ index_tree = html5lib.parse(index_file, namespaceHTMLElements=False)
 index_container = index_tree.find(".//div[@class='isotope']") #maybe id is important, to destinguish it

 ########
-# CREATE PAGE 
+# Create Page 
 ########
 for member in memberpages:
    print ' member', member
-
-    # download mw work page
-#    pageid=member['pageid']
-#    pagetitle=(member['title'].encode('utf-8'))
-    workpage_mw = api_page(member, 'content')
+    workpage_mw = mw_page_text(site, member)#CONTENT
    workpage_mw = replace_gallery(workpage_mw)
    workpage_mw = replace_video(workpage_mw)
-    workdict = parse_work(member, workpage_mw) # create dictionary workpage_mw template
+    workdict = parse_work(member, workpage_mw) # create dictionary w/ page content
    workpage_imgs = mw_imgsurl(site, member)
-    print 'WORKPAGE_IMGS', workpage_imgs
    
    # only parse pages with Creator, Title, Thumbnail    
    if len(workdict['Creator'])>1 and len(workdict['Title'])>1 and len(workdict['Description'])>1 and len(workdict['Thumbnail'])>1:    
@@ -58,12 +46,9 @@ for member in memberpages:
            if key in ['Extra', 'Description', 'Bio'] and workdict[key]:
                workdict[key] =  pandoc2html( (workdict[key].decode('utf-8')))
            elif key in ['Creator']:
-                workdict[key] =  workdict[key].replace(',','' ) #remove comma        
-        #replace empty dict values with ' ' # to avoid empty tags
-        for key in workdict.keys():
-            if workdict[key] is '':# and key is not 'Thumbnail':
-                workdict[key] = ' '
-    
+                workdict[key] =  workdict[key].replace(',','' )
+
+        for key in workdict.keys():    
            if type(workdict[key]) is unicode:
                workdict[key]=workdict[key].encode('utf-8')

@@ -98,16 +83,12 @@ for member in memberpages:
        for img in  imgs:
           img_class = img.get('class')
           if  img_class != 'template': 
-                print 'img_class',img_class
                src =unicode(img.get('src'))
                print src
                for pair in workpage_imgs: 
-                    if src.replace("_", " ") in pair[0]: #if img in html match img in workpage_imgs
-                        print 'FOUND IMG', pair 
+                    if src.replace("_", " ") in pair[0]:#if img in html matchs img in workpage_imgs
                        img.set('src', pair[1])                    
-            # newsrc = api_file_url(src) MOVE FULL URl OPERATION TO MW CONTENT 
-                # if newsrc:
-                #     img.set('src', newsrc)                    
+
        # save work page
        creator = workdict['Creator']#.decode('ascii', 'ignore')
        creator = creator.replace(' ','_')
@@ -115,9 +96,8 @@ for member in memberpages:
        write_html_file(page_tree, work_filename)

        #######
-        # INDEX
+        # Insert Work to Index        
        #######
-        # insert work to index        
        index_addwork( parent=index_container,
                       workid=key,
                       href=work_filename.replace('web/',''),
--- a/web/2015-Henk-Jelle_de_Groot.html
+++ b/web/2015-Henk-Jelle_de_Groot.html
@@ -34,7 +34,7 @@
                        <div id="bio"><p>Henk-Jelle de Groot is a Rotterdam based sound designer and musician. After graduating with an Audio / Visual design bachelor Henk-Jelle setup a sound studio in Rotterdam to work in the Audio / Visual industry. After 7 years of working he returned to the Piet Zwart Institute to graduate in a Master of comm design something something. In addition to working in the Audio / Visual industry, he is muscian and builder of electronic instruments.</p>
 </div>

-                        <p class="hightlightSidebar"><a href=" " target="_blank"> </a></p><!-- {website} -->
+                        <p class="hightlightSidebar"><a href="" target="_blank"></a></p><!-- {website} -->

                        <!-- // -->

--- a/web/2015-Joseph_Knierzinger.html
+++ b/web/2015-Joseph_Knierzinger.html
@@ -15,7 +15,7 @@
            <div id="sidebarInner">
                <div id="sideBarDesc">
                    <div id="sideBarDescInfo">
-                        <a class="hoverBackA" href="index.html"><img src="./img/arrowBack.svg"></a><p>Tempted by Tomorrow</p>
+                        <a class="hoverBackA" href="index.html"><img class="template" src="./img/arrowBack.svg"></a><p>Tempted by Tomorrow</p>
                    </div>

                    <div id="sideBarDescInner">
@@ -40,17 +40,17 @@
                </div>
            </div>

-            <div id="logoWrap"><img id="logo" src="./img/black_PZI_logo_p.svg"></div>
+            <div id="logoWrap"><img class="template" id="logo" src="./img/black_PZI_logo_p.svg"></div>
        </div>

        <div class="zwartArea zwartAreaWhite sidebarBorderLeft" id="section02">
            <div class="fixedsticky" id="filter" style="top:0;">
              <div class="themes" id="sortArea">
 		<a class="hoverBackB" href="index.html">
-		  <img src="./img/arrowBack.svg">
+		  <img class="template" src="./img/arrowBack.svg">
 		</a>
 		<p>User:Joak/graduation/catalog1</p><!--{title}-->
-		<a class="closeSidebar"><img src="./img/arrowUpW.svg"></a>
+		<a class="closeSidebar"><img class="template" src="./img/arrowUpW.svg"></a>
              </div>
            </div>