Merge branch 'master' of ssh://git.xpub.nl:2501/XPUB/special-issue-11-wiki2html

master
Michael Murtaugh 5 years ago
commit 4509d1278a

@ -1,8 +1,9 @@
import os import os
from mwclient import Site from mwclient import Site
from pprint import pprint from pprint import pprint
from functions import update_json
from PIL import Image from PIL import Image
from functions import update_json, remove_nonwords
site = Site(host='hub.xpub.nl/sandbox', path='/itchwiki/') site = Site(host='hub.xpub.nl/sandbox', path='/itchwiki/')
wd = os.path.dirname(os.path.abspath(__file__)) # working directory wd = os.path.dirname(os.path.abspath(__file__)) # working directory
@ -28,7 +29,7 @@ for img in site.allimages():
# important img info to dictionary # important img info to dictionary
img_dict = { img_dict = {
'name': img.name, 'name': img.name,
'filename': img.page_title, 'filename': remove_nonwords(img.page_title),
'timestamp': img.imageinfo['timestamp'], 'timestamp': img.imageinfo['timestamp'],
'url': img.imageinfo['url'], 'url': img.imageinfo['url'],
'urldesc': img.imageinfo['descriptionshorturl'], 'urldesc': img.imageinfo['descriptionshorturl'],
@ -37,6 +38,7 @@ for img in site.allimages():
# location of image storage # location of image storage
img_fn = os.path.join(imgdir, img_dict['filename']) img_fn = os.path.join(imgdir, img_dict['filename'])
print(img_fn)
# function updates images.json and returns whether the img needs to be downloaded or not # function updates images.json and returns whether the img needs to be downloaded or not
download = update_json(imgsjson_fn, img_dict, img_fn) download = update_json(imgsjson_fn, img_dict, img_fn)
@ -47,16 +49,18 @@ for img in site.allimages():
img.download(destination=img_file) img.download(destination=img_file)
# resize image # resize image
pilimg = Image.open(img_fn) fn, ext = os.path.splitext(img_fn)
pilimg_dim = list(pilimg._size) if ext.lower() in ['.jpg', '.jpeg', '.gif', '.png']: # only img format
pilimg_dim_sort = sorted(pilimg_dim) # smallest dimension 1st pilimg = Image.open(img_fn)
img_ratio = pilimg_dim_sort[0] / pilimg_dim_sort[1] pilimg_dim = list(pilimg._size)
if pilimg_dim == pilimg_dim_sort: pilimg_dim_sort = sorted(pilimg_dim) # smallest dimension 1st
# if height was largest img_ratio = pilimg_dim_sort[0] / pilimg_dim_sort[1]
new_dim = [(thumbnail_size * img_ratio), thumbnail_size] if pilimg_dim == pilimg_dim_sort:
else: # if height was largest
# if with was largest new_dim = [(thumbnail_size * img_ratio), thumbnail_size]
new_dim = [thumbnail_size,(thumbnail_size * img_ratio)] else:
pilimg.thumbnail(new_dim) # if with was largest
pilimg.save(img_fn) new_dim = [thumbnail_size,(thumbnail_size * img_ratio)]
pilimg.thumbnail(new_dim)
pilimg.save(img_fn)
print('\n') print('\n')

@ -3,6 +3,12 @@ import subprocess
from datetime import datetime from datetime import datetime
def remove_nonwords(imgname):
filename, ext = os.path.splitext(imgname) # split into filename & extension
filename = re.sub(r'\W', '', filename) # remove nonwoders from filename
return f'{filename}{ext}' # join filename & ext'
def pandoc(pwd, content, format_in, format_out): def pandoc(pwd, content, format_in, format_out):
# print('HTML content file:', wiki_content_f.name) # print('HTML content file:', wiki_content_f.name)
@ -37,23 +43,24 @@ def unpack_response(response):
printouts = response['printouts'] printouts = response['printouts']
page = response['fulltext'] page = response['fulltext']
fullurl = response['fullurl'] fullurl = response['fullurl']
d['page'] = page # convert OrderDict to Dict json.dumps json.loads
for prop in printouts: printouts_dumps = json.dumps(printouts)
p_item = response['printouts'][prop] printouts_loads = json.loads(printouts_dumps)
for prop_val in p_item: printouts_loads['page'] = page
if isinstance(prop_val, dict) is False: simplified_printouts = {}
d[prop] = prop_val for k, v in printouts_loads.items():
else: if k == 'Date':
# if len(prop_val) > 0: simplified_printouts[k] = datetime.fromtimestamp(
props = list(prop_val.keys()) int(v[0]['timestamp']))
if 'fulltext' in props: # elif k == 'Title':
val = prop_val.get('fulltext') # simplified_printouts[k] = v[0]['fulltext']
elif 'timestamp' in props: elif k in ['Part', 'Partof', 'page']: # only 1 value for each
val = datetime.fromtimestamp(int(prop_val.get('timestamp'))) simplified_printouts[k] = v
else: else: # Possibly more than 1 value for the rest of properties
val = list(prop_val.values())[0] simplified_printouts[k] = []
d[prop] = val for listitem in v:
return page, d, fullurl simplified_printouts[k].append(listitem['fulltext'])
return page, simplified_printouts, fullurl
def update_json(imgsjson_fn, img_dict, img_fn): def update_json(imgsjson_fn, img_dict, img_fn):
@ -95,6 +102,7 @@ def clean_dir(dirfullpath):
os.remove(f) os.remove(f)
def print_colormsg(msg, level): def print_colormsg(msg, level):
color_cmd = ''
if level == 'fail': if level == 'fail':
color_cmd = Colors.FAIL color_cmd = Colors.FAIL
elif level == 'warning': elif level == 'warning':

@ -1,76 +0,0 @@
import os, json
from mwclient import Site
from pprint import pprint
from jinja2 import Template
from functions import pandoc, page_props
# Render one static HTML page per image listed in images.json.
#
# For every image entry the matching wiki page is fetched, its text is
# converted from MediaWiki markup to HTML with pandoc, and the result is
# written to static_html/<Title>_<Part>.html. Pages for which page_props()
# reports no 'Title' are skipped.

site = Site(host='hub.xpub.nl/sandbox', path='/itchwiki/')
wd = os.path.dirname(os.path.abspath(__file__))  # working directory
imgdir = os.path.join(wd, 'images')
imgsjson_fn = os.path.join(wd, 'images.json')

# Each value in images.json is a dict; the 'name' and 'filename' entries
# are the ones read further down.
with open(imgsjson_fn, 'r') as imgsjson_file:
    images_info = json.load(imgsjson_file)

static_html = os.path.join(wd, 'static_html')
os.makedirs(static_html, exist_ok=True)  # create static_html/ dir

with open(os.path.join(wd, 'login.txt'), 'r') as login:  # read login user & pwd
    loginlines = login.read()
# First line is the user, second line the password. splitlines() copes with
# a trailing newline or CRLF line endings, where split('\n') would yield an
# extra element and make the unpacking fail.
user, pwd = loginlines.splitlines()[:2]
site.login(username=user, password=pwd)  # login to wiki

page_html_template = '''
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<link rel="stylesheet" href="../static/style.css" />
<title>{{title}}</title>
</head>
<body>
<h1>{{ title }}</h1>
<p><time datetime="{{date}}">{{date}}</time></p>
<div id="img">
<img src="{{ imgsrc }}" />
</div>
<div id="content">
{{ content }}
</div>
<footer>
Part {{part}} of {{partof}}
</footer>
</body>
</html>
'''
page_template = Template(page_html_template)

for img_info in images_info.values():
    print(img_info)
    page_name = img_info['name']
    page = site.pages[page_name]
    pagetext = page.text()
    pageproperties = page_props(wikicontent=pagetext)
    print(pageproperties)

    title = pageproperties.get('Title')
    if not title:
        continue  # only pages carrying a Title property are rendered

    part = pageproperties.get('Part')
    pagetext_html = pandoc(pwd=wd, content=pagetext,
                           format_in='mediawiki', format_out='html')
    # NOTE(review): imgsrc is an absolute path on the machine running this
    # script, while the stylesheet link in the template is relative --
    # confirm that this is intended before publishing the pages elsewhere.
    page_html = page_template.render(
        title=title,
        date=pageproperties.get('Date'),
        imgsrc=os.path.join(imgdir, img_info.get('filename')),
        content=pagetext_html,
        part=part,
        partof=pageproperties.get('Partof'))

    # A missing Part falls back to '000' instead of raising AttributeError
    # on None.zfill().
    htmlpage_fn = "{}_{}.html".format(
        title.replace(" ", ""),
        str(part if part is not None else '').zfill(3)
    )
    print(htmlpage_fn)
    # The template declares charset utf-8, so write the file as UTF-8
    # regardless of the locale's default encoding.
    with open(os.path.join(static_html, htmlpage_fn), 'w',
              encoding='utf-8') as htmlfile:
        htmlfile.write(page_html)

@ -2,7 +2,7 @@ import os, json, sys, urllib
from mwclient import Site from mwclient import Site
from pprint import pprint from pprint import pprint
from jinja2 import Template from jinja2 import Template
from functions import pandoc, page_props, unpack_response, clean_dir from functions import pandoc, unpack_response, clean_dir, remove_nonwords
from functions import Colors from functions import Colors
import argparse import argparse
@ -15,7 +15,7 @@ p.add_argument("--conditions", "-c", metavar='',
default='[[File:+]][[Title::+]][[Part::+]][[Date::+]]', default='[[File:+]][[Title::+]][[Part::+]][[Date::+]]',
help='The query conditions') help='The query conditions')
p.add_argument("--printouts", "-p", metavar='', p.add_argument("--printouts", "-p", metavar='',
default='?Title|?Date|?Part|?Partof|?Creator', default='?Title|?Date|?Part|?Partof|?Creator|?Organization|?Format|?Event|?Topic|?Language',
help='Selection of properties to printout') help='Selection of properties to printout')
p.add_argument("--sort", "-s", metavar='', p.add_argument("--sort", "-s", metavar='',
default='Date,Title,Part', default='Date,Title,Part',
@ -23,6 +23,8 @@ p.add_argument("--sort", "-s", metavar='',
p.add_argument("--order", "-o", metavar='', p.add_argument("--order", "-o", metavar='',
default='asc,asc,asc', default='asc,asc,asc',
help='Order of sorting conditions. Should same amount as the --sort properties') help='Order of sorting conditions. Should same amount as the --sort properties')
p.add_argument('--limit', '-l', help='(optional) Limit the number of returned '
'items')
p.add_argument('--dry', '-d', action='store_true', p.add_argument('--dry', '-d', action='store_true',
help='dry-run: will only show the query but not run it') help='dry-run: will only show the query but not run it')
@ -33,8 +35,10 @@ if len(args.sort.split(',')) != len(args.order.split(',')):
Colors.WARNING, '--sort and --order do not have the same amount of elements', Colors.ENDC) Colors.WARNING, '--sort and --order do not have the same amount of elements', Colors.ENDC)
print('Script exiting now') print('Script exiting now')
sys.exit() sys.exit()
query = f'{args.conditions}|{args.printouts}|sort={args.sort}|order={args.order}' query = f'{args.conditions}|{args.printouts}|sort={args.sort}|order={args.order}'
if args.limit:
limit_str = f'|limit={args.limit}'
query += limit_str
print('query:', Colors.GREEN, query, Colors.ENDC) print('query:', Colors.GREEN, query, Colors.ENDC)
query_unquoted = urllib.parse.quote(query) query_unquoted = urllib.parse.quote(query)
query_url = f'https://{args.host}{args.path}api.php?action=ask&query={query_unquoted}&format=json' query_url = f'https://{args.host}{args.path}api.php?action=ask&query={query_unquoted}&format=json'
@ -75,7 +79,7 @@ with open(os.path.join(wd, 'templates/document_part.html')) as document_html:
all_document_parts = '' # to append all content all_document_parts = '' # to append all content
documentslist = [] documentslist = []
for answer in site.ask(query): for answer in site.ask(query):
publication_title = '' # publication_title = ''
# print(answer, answer.keys()) # print(answer, answer.keys())
page, printout_dict, fullurl = unpack_response(answer) page, printout_dict, fullurl = unpack_response(answer)
print(page) print(page)
@ -85,6 +89,7 @@ for answer in site.ask(query):
print(Colors.WARNING, f"{printout_dict['page']} is not is missing from the local downloaded images") print(Colors.WARNING, f"{printout_dict['page']} is not is missing from the local downloaded images")
print(Colors.GREEN, 'run python3 download_imgs.py to fix the issue', Colors.ENDC) print(Colors.GREEN, 'run python3 download_imgs.py to fix the issue', Colors.ENDC)
sys.exit() sys.exit()
page = site.pages[[printout_dict['page']]] # request that page from wiki page = site.pages[[printout_dict['page']]] # request that page from wiki
pagetext = page.text() pagetext = page.text()
pagetext_html = pandoc(pwd=wd, content=pagetext, format_in='mediawiki', format_out='html') pagetext_html = pandoc(pwd=wd, content=pagetext, format_in='mediawiki', format_out='html')
@ -101,10 +106,13 @@ for answer in site.ask(query):
if printout_dict['Part'] == printout_dict['Partof']: if printout_dict['Part'] == printout_dict['Partof']:
# RENDER DOCUMENT # RENDER DOCUMENT
# by passing all_document_parts html to document_template content # by passing all_document_parts html to document_template content
document_html = document_template.render(title=printout_dict.get('Title'), document_html = document_template.render(
date=printout_dict.get('Date'), title=printout_dict.get('Title'),
content=all_document_parts) # render document template date=printout_dict.get('Date'),
htmlpage_fn = "{}.html".format(printout_dict.get('Title').replace(" ", "")) content=all_document_parts) # render document template
htmlpage_fn = "{}.html".format(
remove_nonwords(printout_dict.get('Title')[0])
)
with open(os.path.join(static_html, htmlpage_fn), 'w') as htmlfile: with open(os.path.join(static_html, htmlpage_fn), 'w') as htmlfile:
htmlfile.write(document_html) htmlfile.write(document_html)
all_document_parts = '' # Reset all_document_parts all_document_parts = '' # Reset all_document_parts

@ -3,10 +3,10 @@
<head> <head>
<meta charset="utf-8"> <meta charset="utf-8">
<link rel="stylesheet" href="../static/style.css" /> <link rel="stylesheet" href="../static/style.css" />
<title>{{title}}</title> <title>{{ title[0] }}</title>
</head> </head>
<body> <body>
<h1>{{ title }}</h1> <h1>{{ title[0] }}</h1>
<p><time datetime="{{date}}">{{ date.year }}.{{ date.month }}.{{ date.day }} </time></p> <p><time datetime="{{date}}">{{ date.year }}.{{ date.month }}.{{ date.day }} </time></p>
<div id="content"> <div id="content">
{{ content }} {{ content }}

@ -11,19 +11,21 @@
<div class="metadata"> <div class="metadata">
<h3>Metadata</h3> <h3>Metadata</h3>
{% for key, value in printout_dict.items() %} {% for key, valuelist in printout_dict.items() %}
{% if key == 'Date' %} <div class="metadata_{{key}}">
<div class="metadata_{{key}}"> {% if key == 'Date' %}
<span class="key">{{key}}</span> <span class="key">{{key}}</span>
<span class="value">{{value.year}} {{value.month}} {{value.day}}</span> <span class="value">{{valuelist.year}} {{valuelist.month}} {{valuelist.day}}</span>
</div> {% elif key == 'page' %}
{% else %}
<div class="metadata_{{key}}">
<span class="key">{{key|upper}}</span> <span class="key">{{key|upper}}</span>
<span class="value">{{value}}</span> <span class="value">{{valuelist}}</span>
</div> {% else %}
{% endif %} {% if valuelist|length > 0 %}
<span class="key">{{key|upper}}</span>
<span class="value">{{valuelist | join(", ")}}</span>
{% endif %}
{% endif %}
</div>
{% endfor %} {% endfor %}
</div> </div>
</div> </div>

@ -8,9 +8,9 @@
<h3>Results from query:<br/><code>{{query}}</code></h3> <h3>Results from query:<br/><code>{{query}}</code></h3>
<ul> <ul>
{% for doc in documentslist %} {% for doc in documentslist %}
<li><a href="./{{ doc['file'] }}">{{ doc['title'] }}</a> <li><a href="./{{ doc['file'] }}">{{ doc['title'][0] }}</a>
{{ doc['date'].year }}.{{ doc['date'].month }}.{{ doc['date'].day }} {{ doc['date'].year }}.{{ doc['date'].month }}.{{ doc['date'].day }}
{{doc['creator']}} {{doc['creator'] | join(", ")}}
</li> </li>
{% endfor %} {% endfor %}
</ul> </ul>

Loading…
Cancel
Save