cleaning scripts

master
Castro0o 9 years ago
parent 1b1933aece
commit 4322349886

@ -41,8 +41,6 @@ def api_page(title, query):
##############################
# CATEGORIES AND PAGES
################
# * MUST BE REPLACED BY SMARTER CODE (USING PY MD LIB)
##############################
def mw_cats(args):
site = Site(args.host, path=args.path)
@ -62,37 +60,6 @@ def mw_cats(args):
return [p.name for p in results]
def api_pagecategories(pageid):
    '''Return the titles of the categories a page belongs to.

    pageid -- numeric MediaWiki page id, interpolated into the query string.

    Returns a list of UTF-8 encoded category titles, including the
    "Category:" prefix. The list is empty when the API response carries no
    'categories' entry for the page (page without categories, or missing
    page) instead of raising KeyError.
    '''
    # cllimit=max: the API default only returns the first 10 categories,
    # which silently truncates the result for heavily categorised pages.
    query = 'action=query&pageids={}&prop=categories&cllimit=max'.format(pageid)
    url = endpoint + query
    request = urllib2.urlopen(url)
    try:
        jsonp = json.loads(request.read())
    finally:
        # Always release the HTTP connection, even if decoding fails.
        request.close()
    json_dic = jsonp['query']['pages']
    # Exactly one page id was requested, so take the single entry without
    # relying on dict.keys() being indexable.
    page = next(iter(json_dic.values()), {})
    page_categories = page.get(u'categories', [])
    return [entry[u'title'].encode('utf-8') for entry in page_categories]
def api_pagesincategories(category, year):
    '''Return the members of a category, optionally restricted to a year.

    category -- category name without the "Category:" prefix; spaces are
                replaced by underscores before querying.
    year     -- when truthy, only members that are also in
                "Category:<year>" are returned; when falsy, every member
                of the category is returned.

    Returns a list of the 'categorymembers' entries from the API response
    (dicts that carry at least 'pageid').
    '''
    category = category.replace(' ', '_')
    apiCatMembers = endpoint + 'action=query&list=categorymembers&cmlimit=1000&cmtitle=Category:{}'.format(category)
    request = urllib2.urlopen(apiCatMembers)
    try:
        jsonp = json.loads(request.read())
    finally:
        # Always release the HTTP connection, even if decoding fails.
        request.close()
    members = jsonp['query']['categorymembers']

    if not year:
        # No year filter requested: every member qualifies. The original
        # code referenced an undefined name here and raised NameError.
        return members

    year_category = 'Category:{}'.format(year)
    intersectCatMembers = []
    for member in members:
        # One extra API request per member to learn its categories.
        page_cats = api_pagecategories(member['pageid'])
        if year_category in page_cats:
            print('{} in {}'.format(year, page_cats))
            intersectCatMembers.append(member)
    return intersectCatMembers
def api_file_url(filename): # get full urls
page_content_dict = api_page(filename, 'file')
if 'imageinfo' in page_content_dict.keys():
@ -115,31 +82,6 @@ def write_html_file(html_tree, filename):
edited.write(html)
edited.close()
# mw article modules
def parse_work_page(title, content):
    '''Parse a "Graduation work" template page into a dictionary.

    title   -- page title, stored under the 'Title' key.
    content -- raw wiki text of the page.

    Returns None when content does not start with "{{Graduation work" or
    when the template cannot be extracted (no newline after the header or
    no closing "}}"). Otherwise returns a tuple (work_dict, extra):

    work_dict -- 'Title' plus one entry per "|key=value" line of the
                 template, newlines stripped from the values. Values of
                 keys containing 'Creator' have every ", " removed; for keys
                 containing 'Thumbnail' the result of api_thumb_url(value)
                 is additionally stored under 'Thumbnail_url'. Text after
                 the template, if any, is stored under 'Extra'.
    extra     -- the tuple ('Extra', text) when text follows the template,
                 otherwise the empty string.
    '''
    # Raw strings: the original pattern contained a stray "\G" escape,
    # which newer versions of re reject as a bad escape.
    if not re.match(r'\{\{Graduation work', content):
        return None
    found = re.search(r'\{\{Graduation work\n(.*?)\}\}(.*)', content, re.DOTALL)
    if found is None:
        # Header present but template malformed; avoid IndexError.
        return None
    template, extra = found.groups()

    # Template's key/value pairs, one per "|key=value\n" line.
    keyval = re.findall(r'\|(.*?)=(.*?\n)', template, re.DOTALL)
    if extra:
        # Trailing text is handled like one more key/value pair.
        extra = ('Extra', extra)
        keyval.append(extra)

    work_dict = {'Title': title}
    for key, val in keyval:
        val = val.replace('\n', '')
        if 'Creator' in key:
            val = val.replace(', ', '')
        elif 'Thumbnail' in key:
            work_dict['Thumbnail_url'] = api_thumb_url(val)
        work_dict[key] = val
    return work_dict, extra
# Alternative to parse_work_page
def parse_work(title, content):
workdict = {'Title':title, 'Creator':'', 'Date':'', 'Website':'', 'Thumbnail':'', 'Bio':'', 'Description':'', 'Extra':''}
@ -158,14 +100,11 @@ def parse_work(title, content):
elif 'Thumbnail' in key:
val = api_thumb_url(val)
elif 'Website' in key:
val = urllib.unquote(val)
val = urllib.unquote(val)
workdict[key]=val.encode('utf-8')
# pprint.pprint(workdict)
return workdict
# Conversion Modules
def pandoc2html(mw_content):
'''convert individual mw sections to html'''
mw_content = mw_content.encode('utf-8')
@ -177,14 +116,6 @@ def pandoc2html(mw_content):
html = (p2.communicate())[0]
return html
def img_fullurl(parent):
    '''Replace the src of every <img> below parent with its full URL.

    parent -- element tree node; all descendant <img> elements are visited
              and modified in place.

    Each src value is passed to api_thumb_url; when that returns None the
    image is left untouched. Returns nothing.
    '''
    for img in parent.findall('.//img'):
        fullurl = api_thumb_url(img.get('src'))
        # Identity comparison is the idiomatic (and safe) None test.
        if fullurl is not None:
            img.set('src', fullurl)
# Body of a <gallery>...</gallery> section; DOTALL so it may span lines.
gallery_exp = re.compile(r'<gallery>(.*?)</gallery>', re.S)
# "File:<name>.<ext>" references to gif/jpg/jpeg/png images.
# Groups: whole reference, base name, extension.
imgfile_exp = re.compile(r'(File:(.*?)\.(gif|jpg|jpeg|png))')
@ -213,11 +144,5 @@ def index_addwork(parent, workid, href, thumbnail, title, creator, date):
'data-date':date})
grandchild_a = ET.SubElement(child_div, 'a', attrib={'href':href, 'class':'work'})
grandgrandchild_img = ET.SubElement(grandchild_a, 'img', attrib={'class':'work', 'src':thumbnail})
# TEXT CONTENT ?
# grandchild_text = ET.SubElement(child_div, 'div', attrib={'class':'work'})
# grandchild_text.text=creator
grandgrandchild_img = ET.SubElement(grandchild_a, 'img', attrib={'class':'work', 'src':thumbnail})
# need to add css width to div.item

@ -16,8 +16,8 @@
# build all pages
import xml.etree.ElementTree as ET
import html5lib, re, pprint
from mmdc_modules import api_request, api_page, api_thumb_url, pandoc2html, parse_work, api_file_url, replace_gallery, replace_video, gallery_exp, video_exp, api_pagesincategories, index_addwork, write_html_file, mw_cats
import html5lib, pprint
from mmdc_modules import api_page, pandoc2html, parse_work, api_file_url, replace_gallery, replace_video, index_addwork, write_html_file, mw_cats
from argparse import ArgumentParser
p = ArgumentParser()

Loading…
Cancel
Save