You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

104 lines
4.1 KiB

#! /usr/bin/env python
# -*- coding: utf-8 -*-
import urllib2, json, pprint, re
import xml.etree.ElementTree as ET
import subprocess, shlex
# Module-level settings.
# NOTE(review): `sid` and `useragent` are not referenced by any function
# visible in this section of the file -- confirm whether they are still used.
sid = '1234'
useragent = "Mozilla/5.001 (windows; U; NT4.0; en-US; rv:1.0) Gecko/25250101"
# Prefix that api_request() prepends to every query string.
# NOTE(review): it is empty here; presumably it should hold the wiki's API
# URL (ending in '?') -- confirm before running.
endpoint = ""
def api_request(action, pagename):  # get page: content, metadata, images, imageinfo
    """Send one query to the wiki API and return the first page record.

    ``action`` is a query-string template with a single ``{}`` placeholder.
    ``pagename`` (a page id or a title, depending on the template) is
    substituted into it and the result is appended to the module-level
    ``endpoint`` to form the request URL.

    Returns the value stored under the first key of ``query.pages`` in the
    decoded JSON reply.

    NOTE(review): ``pagename`` is inserted verbatim, without URL quoting;
    presumably callers only pass URL-safe values -- confirm.
    """
    url = endpoint + action.format(pagename)
    reply = urllib2.urlopen(url)
    try:
        # json.loads() needs the response body; read it explicitly.
        payload = json.loads(reply.read())
    finally:
        # Always release the connection, even if decoding fails.
        reply.close()
    pages = payload.get('query').get('pages')
    # pprint.pprint( pages )
    # ``pages`` is keyed by page id and only the first entry is wanted.
    # list(...)[0] works on Python 2 and 3 and still raises IndexError on
    # an empty mapping, as ``keys()[0]`` did.
    first_id = list(pages)[0]
    return pages.get(first_id)
def api_page(pageid, query):
    """Fetch one aspect of a wiki page through ``api_request``.

    ``query`` selects what is requested:

    * ``'content'``     -- the ``'*'`` field of the first revision
    * ``'metadata'``    -- the page record with ``prop=info``
    * ``'articleimgs'`` -- the page record with ``prop=images``
    * ``'file'``        -- image info (url) for ``File:<pageid>``; the
      record is also pretty-printed to stdout
    * ``'imageinfo'``   -- image info including a 500px-wide thumbnail for
      ``File:<pageid>``

    For ``'file'`` and ``'imageinfo'`` the ``pageid`` argument is used as a
    file title rather than a numeric id.

    Raises ValueError for any other ``query`` value (previously this fell
    through to an unbound local variable).
    """
    if query == 'content':
        api_response = api_request(
            'action=query&pageids={}&prop=revisions&rvprop=content', pageid)
        return api_response.get('revisions')[0]['*']
    if query == 'metadata':
        return api_request('action=query&pageids={}&prop=info', pageid)
    if query == 'articleimgs':
        return api_request('action=query&pageids={}&prop=images', pageid)
    if query == 'file':
        response = api_request(
            'action=query&titles=File:{}&prop=imageinfo&iiprop=url', pageid)
        # Debug output kept from the original implementation.
        pprint.pprint(response)
        return response
    if query == 'imageinfo':
        pagename = pageid  # in imageinfo, titles are used instead of ids
        # iiurlwidth determines the width of the thumbnail
        return api_request(
            'action=query&titles=File:{}&prop=imageinfo&iiprop=url&iiurlwidth=500',
            pagename)
    raise ValueError('unsupported query type: {!r}'.format(query))
10 years ago
def api_file_url(filename):  # get full urls
    """Return the full URL of the wiki file ``filename``.

    Looks the file up with ``api_page(filename, 'file')`` and returns the
    ``url`` field of the first ``imageinfo`` entry. Returns None when the
    page record carries no ``imageinfo`` key.
    """
    page_record = api_page(filename, 'file')
    if 'imageinfo' not in page_record:
        return None
    first_entry = page_record.get('imageinfo')[0]
    return first_entry.get('url')
def api_thumb_url(filename):
    '''Return the thumbnail URL of the wiki image ``filename``.

    Looks the file up with ``api_page(filename, 'imageinfo')`` and returns
    the ``thumburl`` field of the first ``imageinfo`` entry. Returns None
    when the page record carries no ``imageinfo`` key.
    '''
    page_record = api_page(filename, 'imageinfo')
    if 'imageinfo' not in page_record:
        return None
    first_entry = page_record.get('imageinfo')[0]
    return first_entry.get('thumburl')
# 905.jpg&prop=imageinfo&iiprop=url&iiurlwidth=300
def write_html_file(html_tree, filename):
    """Serialise ``html_tree`` as an HTML document and write it to ``filename``.

    ``html_tree`` is an ElementTree element; it is serialised with the
    ``html`` method as UTF-8 and prefixed with ``<!DOCTYPE HTML>``. Any
    existing file at ``filename`` is overwritten. Returns None.
    """
    doctype = "<!DOCTYPE HTML>"
    markup = ET.tostring(html_tree, encoding='utf-8', method='html')
    # tostring() with an explicit encoding yields encoded bytes, so the
    # doctype is encoded too and the file is written in binary mode.
    document = doctype.encode('utf-8') + markup
    # The context manager guarantees the data is flushed and the handle
    # closed; previously the file was opened but never written or closed.
    with open(filename, 'wb') as out:
        out.write(document)
# Conversion Modules
def pandoc2html(mw_content):
    """Convert MediaWiki markup to HTML5 by piping it through ``pandoc``.

    ``mw_content`` is the wiki markup; text that is not already a byte
    string is encoded as UTF-8 before being sent. Returns pandoc's standard
    output as a byte string, or None when ``mw_content`` is empty or None.

    Requires the ``pandoc`` executable to be on PATH.
    """
    if not mw_content:
        return None
    if not isinstance(mw_content, bytes):
        mw_content = mw_content.encode('utf-8')
    # Feed the markup to pandoc on stdin. Passing it as an argument to
    # ``echo`` (as before) mangled quotes and backslashes when the command
    # line was re-split, and was limited by the maximum argument length.
    pandoc = subprocess.Popen(
        ['pandoc', '-f', 'mediawiki', '-t', 'html5'],
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
    )
    html = pandoc.communicate(mw_content)[0]
    return html
def img_fullurl(parent):
    """Rewrite the ``src`` of every ``<img>`` under ``parent`` in place.

    For each ``<img>`` descendant, the current ``src`` value is passed to
    ``api_thumb_url`` and, when that returns a URL, ``src`` is replaced by
    it. Images without a ``src`` attribute are left untouched and trigger
    no API request. A debug line is printed per image. Returns None.
    """
    imgs = parent.findall('.//img')
    print('len IMG %s' % len(imgs))
    for img in imgs:
        src = img.get('src')
        # Without a src there is nothing to look up; asking the API would
        # query the literal title "File:None".
        fullurl = api_thumb_url(src) if src is not None else None
        print('----- IMG %s %s %s' % (ET.tostring(img), src, fullurl))
        if fullurl is not None:
            img.set('src', fullurl)
# fileurl = api_request(src, endpoint)# find url of file
def replace_youtube(parent, youtube_id):
    """Append a 560x315 ``<iframe>`` for ``youtube_id`` to ``parent``.

    The iframe is added as the last child of ``parent``; the existing
    ``<youtube>`` element is not modified or removed. Raises IndexError
    when ``parent`` has no ``<youtube>`` descendant. Returns None.

    NOTE(review): ``src`` is set to the bare ``youtube_id``; presumably an
    embed URL prefix is expected here -- confirm.
    """
    # Evaluated only so that a missing <youtube> element raises IndexError;
    # the element itself is not used.
    parent.findall('.//youtube')[0]
    iframe_attrs = {
        "width": "560",
        "height": "315",
        "frameborder": "0",
        "allowfullscreen": "allowfullscreen",
        "src": "{}".format(youtube_id),
    }
    ET.SubElement(parent, 'iframe', iframe_attrs)
# def replace_gallery(parent):
# galleries = parent.findall('.//gallery')
# for gallery in galleries:
# print 'GALLERY', gallery.text()