diff --git a/mmdc_modules.py b/mmdc_modules.py index 96ce5ac..7c93b35 100644 --- a/mmdc_modules.py +++ b/mmdc_modules.py @@ -41,6 +41,60 @@ def api_page(pageid, query): response = api_request("action=query&titles=File:{}&prop=imageinfo&iiprop=url&iiurlwidth=500", pagename) # iiurlwidht dermines with of thumbnail return response +############################## +# CATEGORIES AND PAGES +################ +# * MUST BE REPLACED BY SMARTER CODE (USING PY MD LIB) +############################## +def api_pagecategories(pageid): + '''Find all the categories, and their parent category of a page ''' + query = 'action=query&pageids={}&prop=categories'.format(pageid) + url = endpoint + query + request = urllib2.urlopen(url) + jsonp = json.loads(request.read()) + json_dic = jsonp['query']['pages'] + page_id = json_dic.keys()[0] + page_categories = json_dic[page_id][u'categories'] + all_cats = [ entry[u'title'].encode('utf-8') for entry in page_categories ] #.replace('Category:', '') + return all_cats + + +def api_pagesincategories(category, year): + # Find all pages in category and add to allworks dictionary + category = category.replace(' ', '_') + apiCatMembers = endpoint + 'action=query&list=categorymembers&cmlimit=1000&cmtitle=Category:{}'.format(category) + request = urllib2.urlopen(apiCatMembers) + jsonp = json.loads(request.read()) + graduationWorkMembers = jsonp['query']['categorymembers'] + intersectCatMembers = [] + if year: + for member in graduationWorkMembers: + page_cats = api_pagecategories(member['pageid']) + if ('Category:{}'.format(year)) in page_cats: + print year, 'in', page_cats + intersectCatMembers.append(member)# add member to intersectCatMembers + else: + intersectCatMembers = graduationWorkMembers + return intersectCatMembers + # for page in intersectCatMembers: + # title = ((page['title']).encode('utf-8') ).replace(" ", "_") #snakecase for page titles + # pageid = page['pageid'] + # article = api_page(pageid, 'content') + # # print title + # # 
pprint.pprint(article) + # work = parse_work_page(title, article) + # if work: + # allworks[pageid] = work #dictionary(allworks) entry + # print pprint.pprint( work ) + # # Create work page + # else: + # print 'WORK DOES NOT CONTAIN REQUIRED CONTENT' + # print '-------------' + # print + + + + def api_file_url(filename): # get full urls page_content_dict = api_page(filename, 'file') if 'imageinfo' in page_content_dict.keys(): @@ -60,7 +114,7 @@ def api_thumb_url(filename): # http://pzwiki.wdka.nl/mw-mediadesign/api.php?format=json&action=query&pageids=10603&prop=revisions&rvprop=content # http://pzwiki.wdka.nl/mw-mediadesign/api.php?format=json&action=query&titles=Graduation_Website_Braindump&prop=revisions&rvprop=content -# XML MODULES +# PROCESSING MODULES def write_html_file(html_tree, filename): doctype = "" @@ -77,11 +131,13 @@ def parse_work_page(title, content): work_dict = {} work_dict['Title']=title template, extra = (re.findall('\{\{Graduation work\n(.*?)\}\}(.*)', content, re.DOTALL))[0] + print 'template', template, + print 'extra', extra # template's key/value pair keyval = re.findall('\|(.*?)\=(.*?\n)', template, re.DOTALL) if extra: extra = ('Extra', extra) - keyval.append(extra) + keyval.append(extra) #? # checkkeys = [keyval[i] for i in range(len(keyval)) if keyval[i][0] in mainkeys and len(keyval[i][1])>3] #list mainkeys present, w/ values, in tuples [(key, val),(key, val)...] 
# if len(checkkeys) == 3 : # checkkeys contains all mainkeys and values @@ -106,7 +162,7 @@ def parse_work(title, content): if re.match('\{\{\Graduation work', content): template, extra = (re.findall('\{\{Graduation work\n(.*?)\}\}(.*)', content, re.DOTALL))[0] - workdict['Extra'] = extra#.encode('utf-8') + workdict['Extra'] = extra.encode('utf-8') # template's key/value pair keyval = re.findall('\|(.*?)\=(.*?\n)', template, re.DOTALL) for pair in keyval: @@ -118,7 +174,7 @@ def parse_work(title, content): print 'calling API' val = api_thumb_url(val) print 'THUMB', val - workdict[key]=val + workdict[key]=val.encode('utf-8') pprint.pprint(workdict) return workdict @@ -128,7 +184,7 @@ def parse_work(title, content): # Conversion Modules def pandoc2html(mw_content): if mw_content: - mw_content = mw_content#.encode('utf-8') + mw_content = mw_content.encode('utf-8') # convert from mw to html args_echo =shlex.split( ('echo "{}"'.format(mw_content)) ) args_pandoc = shlex.split( 'pandoc -f mediawiki -t html5' ) diff --git a/prototype_page.py b/prototype_page.py index 1e26abe..0aab6c5 100755 --- a/prototype_page.py +++ b/prototype_page.py @@ -15,41 +15,78 @@ import xml.etree.ElementTree as ET import html5lib, re, pprint -from mmdc_modules import api_request, api_page, api_thumb_url, pandoc2html, parse_work, api_file_url, replace_gallery, replace_video, gallery_exp, video_exp - -template = open("web/page-template.html", "r") -template = template.read() - -# download mw work page -pageid='15965'#Qq #'16025' #'15986'Jozeph #'16025'Mina -work = 'Q' #'Mina'#'User:Joak/graduation/catalog1' -workpage_mw = api_page(pageid, 'content') - -# parse workpage_mw -workpage_mw = replace_gallery(workpage_mw) -workpage_mw = replace_video(workpage_mw) -workdict = parse_work(work, workpage_mw) # create dictionary workpage_mw template -for key in workdict.keys(): - if key in ['Extra', 'Description', 'Bio']: - workdict[key] = pandoc2html(workdict[key].encode('utf-8')) - -# fill template with 
dictionary/mw_page values -workpage_html = template.format(title=workdict['Title'], creator=workdict['Creator'], date=workdict['Date'], website=workdict['Website'], thumbnail=workdict['Thumbnail'], bio=workdict['Bio'],description=workdict['Description'], extra=workdict['Extra'] ) - -# parse workpage_html # process html: img full url -tree = html5lib.parse(workpage_html, namespaceHTMLElements=False) -imgs = tree.findall('.//img') -for img in imgs: - src = img.get('src') - newsrc = api_file_url(src) - if newsrc: - img.set('src', newsrc) - -# save workpage_html -workpage_html = ET.tostring(tree) -work_filename = 'web/{}-{}-{}.html'.format(workdict['Date'], (workdict['Creator'].encode('ascii', 'ignore')).replace(' ','_'), pageid) -work_file = open(work_filename, "w") -work_file.write(workpage_html) -work_file.close() +from mmdc_modules import api_request, api_page, api_thumb_url, pandoc2html, parse_work, api_file_url, replace_gallery, replace_video, gallery_exp, video_exp, api_pagesincategories + + + + +######## +# QUERY API +######## +sid = '1234' +useragent = "Mozilla/5.001 (windows; U; NT4.0; en-US; rv:1.0) Gecko/25250101" +endpoint = "http://pzwiki.wdka.nl/mw-mediadesign/api.php?format=json&" + + +######## +# CREATE INDEX +######## +memberpages = api_pagesincategories('Graduation work', '2015') #list, containing dictionary of all page ids. 
Example: [{u'ns': 0, u'pageid': 15974, u'title': u'Ahhhh'}, {u'ns': 0, u'pageid': 16005, u'title': u'Artyom-graduation-work'}] +print 'memberpages', memberpages + + + + +######## +# CREATE PAGE +######## +page_template = open("web/page-template.html", "r") +page_template = page_template.read() + + +for member in memberpages: + print member + # download mw work page + pageid=member['pageid'] + pagetitle=(member['title'].encode('utf-8')) + print pageid + workpage_mw = api_page(pageid, 'content') + + # parse workpage_mw + workpage_mw = replace_gallery(workpage_mw) + workpage_mw = replace_video(workpage_mw) + workdict = parse_work(pagetitle, workpage_mw) # create dictionary workpage_mw template + + for key in workdict.keys(): # convert Extra, Description, Bio to HTML + if key in ['Extra', 'Description', 'Bio']: + workdict[key] = pandoc2html( (workdict[key].decode('utf-8')) ) + + # fill template with dictionary/mw_page values + workpage_html = page_template.format(title=(workdict['Title']), + creator=(workdict['Creator']), + date=workdict['Date'], + website=workdict['Website'], + thumbnail=workdict['Thumbnail'], + bio=(workdict['Bio']), + description=(workdict['Description']), + extra=(workdict['Extra']) ) + + # parse workpage_html # process html: img full url + tree = html5lib.parse(workpage_html, namespaceHTMLElements=False) + imgs = tree.findall('.//img') + for img in imgs: + src = img.get('src') + newsrc = api_file_url(src) + if newsrc: + img.set('src', newsrc) + + # save workpage_html + workpage_html = ET.tostring(tree) + creator = workdict['Creator'].decode('ascii', 'ignore') + creator = creator.replace(' ','_') + work_filename = 'web/{}-{}-{}.html'.format(workdict['Date'], creator, pageid) + work_file = open(work_filename, "w") + work_file.write(workpage_html) + work_file.close()