cleaning scripts

10 years ago · 4322349886
parent 1b1933aece
commit 4322349886
2 changed files with 4 additions and 79 deletions
--- a/mmdc_modules.py
+++ b/mmdc_modules.py
@ -41,8 +41,6 @@ def api_page(title, query):
 ##############################
 # CATEGORIES AND PAGES
 ################ 
 # * MUST BE REPLACED BY SMARTER CODE (USING PY MD LIB)
 ##############################
 def mw_cats(args):
    site = Site(args.host, path=args.path)
@ -62,37 +60,6 @@ def mw_cats(args):
    return [p.name  for p in results]
 def api_pagecategories(pageid):
    '''Return the category titles attached to the page with id `pageid`.

    Sends a prop=categories query to the wiki API and collects the
    `title` of every returned category, UTF-8 encoded. The 'Category:'
    prefix is not stripped from the titles.
    '''
    url = endpoint + 'action=query&pageids={}&prop=categories'.format(pageid)
    reply = json.loads(urllib2.urlopen(url).read())
    pages = reply['query']['pages']
    # Only the first entry of the 'pages' mapping is inspected.
    first_key = pages.keys()[0]
    titles = []
    for category in pages[first_key][u'categories']:
        titles.append(category[u'title'].encode('utf-8'))
    return titles
 def api_pagesincategories(category, year):
    '''Return the members of `category`, optionally filtered by a year category.

    category -- category name without the 'Category:' prefix; spaces are
                replaced by underscores before querying.
    year     -- when truthy, only members whose own categories include
                'Category:<year>' are returned (one extra API request per
                member); when falsy, every member is returned.

    Returns a list of the member entries found under
    ['query']['categorymembers'] in the API reply.
    '''
    category = category.replace(' ', '_')
    url = endpoint + 'action=query&list=categorymembers&cmlimit=1000&cmtitle=Category:{}'.format(category)
    request = urllib2.urlopen(url)
    jsonp = json.loads(request.read())
    members = jsonp['query']['categorymembers']
    if not year:
        # Previously this branch read an undefined name and raised NameError.
        return members
    wanted = 'Category:{}'.format(year)
    selected = []
    for member in members:
        page_cats = api_pagecategories(member['pageid'])
        if wanted in page_cats:
            # Single-argument call: same output as the old print statement.
            print('{} in {}'.format(year, page_cats))
            selected.append(member)
    return selected
 def api_file_url(filename): # get full urls
    page_content_dict = api_page(filename, 'file')   
    if 'imageinfo' in page_content_dict.keys():
@ -115,31 +82,6 @@ def write_html_file(html_tree, filename):
    edited.write(html)
    edited.close()
 # mw article modules
 def parse_work_page(title, content):
    '''Parse a page that starts with a {{Graduation work ...}} template.

    title   -- page title, stored under the 'Title' key.
    content -- wiki text of the page.

    Returns None when `content` does not start with the template or the
    template cannot be located in full. Otherwise returns a tuple
    (work_dict, extra):
      work_dict -- 'Title' plus one entry per '|key=value' line of the
                   template (newlines removed from values; ', ' removed
                   from values whose key contains 'Creator'). A key
                   containing 'Thumbnail' also adds 'Thumbnail_url' via
                   api_thumb_url. Text after the template is stored under
                   'Extra'.
      extra     -- ('Extra', text) when text follows the template, else ''.
    '''
    if not re.match(r'\{\{Graduation work', content):
        return None
    found = re.search(r'\{\{Graduation work\n(.*?)\}\}(.*)', content, re.DOTALL)
    if found is None:
        # Template opener present but not in the expected multi-line form.
        return None
    template, extra = found.groups()
    keyval = re.findall(r'\|(.*?)\=(.*?\n)', template, re.DOTALL)    # template's key/value pairs
    if extra:
        # Trailing text is handled like one more key/value pair.
        extra = ('Extra', extra)
        keyval.append(extra)
    work_dict = {'Title': title}
    # Runs whether or not trailing text exists, so template-only pages
    # are parsed too instead of yielding None.
    for key, val in keyval:
        val = val.replace('\n', '')
        if 'Creator' in key:
            val = val.replace(', ', '')
        elif 'Thumbnail' in key:
            work_dict['Thumbnail_url'] = api_thumb_url(val)
        work_dict[key] = val
    return work_dict, extra
 # Alternative to parse_work_page
 def parse_work(title, content):
    workdict = {'Title':title, 'Creator':'', 'Date':'', 'Website':'', 'Thumbnail':'', 'Bio':'', 'Description':'', 'Extra':''}    
@ -159,13 +101,10 @@ def parse_work(title, content):
                val = api_thumb_url(val)
            elif 'Website' in key:
                val = urllib.unquote(val)                
            workdict[key]=val.encode('utf-8')
 #    pprint.pprint(workdict)
    return workdict
 # Conversion Modules
 def pandoc2html(mw_content):
    '''convert individual mw sections to html'''
    mw_content = mw_content.encode('utf-8')
@ -177,14 +116,6 @@ def pandoc2html(mw_content):
    html = (p2.communicate())[0]
    return html
 def img_fullurl(parent):
    '''Rewrite the src of every <img> found below `parent`.

    Each image's current src is passed to api_thumb_url; when that call
    returns something other than None the src attribute is replaced by
    the result, otherwise the image is left untouched.
    '''
    for image in parent.findall('.//img'):
        resolved = api_thumb_url(image.get('src'))
        if resolved is None:
            continue
        image.set('src', resolved)
 # Captures the contents of a <gallery>...</gallery> block; re.S lets '.' span newlines.
 gallery_exp = re.compile(r'<gallery>(.*?)</gallery>', re.S)
 # Captures a File:<name>.<ext> reference (gif/jpg/jpeg/png), its name and its extension.
 imgfile_exp = re.compile(r'(File:(.*?)\.(gif|jpg|jpeg|png))')
@ -214,10 +145,4 @@ def index_addwork(parent, workid, href, thumbnail, title, creator, date):
    grandchild_a = ET.SubElement(child_div, 'a', attrib={'href':href, 'class':'work'}) 
    grandgrandchild_img = ET.SubElement(grandchild_a, 'img', attrib={'class':'work', 'src':thumbnail})    
 # TEXT CONTENT ?
 #    grandchild_text = ET.SubElement(child_div, 'div', attrib={'class':'work'}) 
 #    grandchild_text.text=creator
    # need to add css width to div.item
--- a/prototype_page.py
+++ b/prototype_page.py
@ -16,8 +16,8 @@
 # build all pages
 import xml.etree.ElementTree as ET
-import html5lib, re, pprint
+import html5lib, pprint
-from mmdc_modules import api_request, api_page, api_thumb_url, pandoc2html, parse_work, api_file_url, replace_gallery, replace_video, gallery_exp, video_exp, api_pagesincategories, index_addwork, write_html_file, mw_cats
+from mmdc_modules import api_page, pandoc2html, parse_work, api_file_url, replace_gallery, replace_video, index_addwork, write_html_file, mw_cats
 from argparse import ArgumentParser
 p = ArgumentParser()