import os, json, sys, urllib.parse
from mwclient import Site
from pprint import pprint
from jinja2 import Template
from functions import pandoc, unpack_response, clean_dir, remove_nonwords
from functions import Colors
import argparse

p = argparse.ArgumentParser(description="From an SMW ask query string, generate HTML pages from the results.",
                            formatter_class=argparse.ArgumentDefaultsHelpFormatter)
p.add_argument("--host", metavar='', default="hub.xpub.nl/sandbox", help='wiki host')
p.add_argument("--path", metavar='', default="/itchwiki/", help="Wiki path. Should end with /")
p.add_argument("--conditions", "-c", metavar='',
               default='[[File:+]][[Title::+]][[Part::+]][[Date::+]]',
               help='The query conditions')
p.add_argument("--printouts", "-p", metavar='',
               default='?Title|?Date|?Part|?Partof|?Creator|?Organization|?Format|?Event|?Topic|?Language',
               help='Selection of properties to print out')
p.add_argument("--sort", "-s", metavar='',
               default='Date,Title,Part',
               help='Properties to sort the results by')
p.add_argument("--order", "-o", metavar='',
               default='asc,asc,asc',
               help='Order of the sort properties. Should have the same number of elements as --sort')
p.add_argument('--limit', '-l', help='(optional) Limit the number of returned '
                                     'items')
# TODO: get --limit to work. Perhaps with a site.raw_api method
p.add_argument('--dry', '-d', action='store_true',
               help='dry-run: only show the query, do not run it')

args = p.parse_args()
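
# Example invocation (illustrative; substitute the actual filename of this script):
#   python3 thisscript.py -s Date,Title -o asc,desc --dry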

if len(args.sort.split(',')) != len(args.order.split(',')):
    print(Colors.FAIL, 'Invalid query:',
          Colors.WARNING, '--sort and --order do not have the same number of elements', Colors.ENDC)
    print('Script exiting now')
    sys.exit()

query = f'{args.conditions}|{args.printouts}|sort={args.sort}|order={args.order}'
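# With the default arguments the assembled ask query is:
#   [[File:+]][[Title::+]][[Part::+]][[Date::+]]|?Title|?Date|?Part|?Partof|?Creator|?Organization|?Format|?Event|?Topic|?Language|sort=Date,Title,Part|order=asc,asc,asc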
if args.limit:
    limit_str = f'|limit={args.limit}'
    query += limit_str
print('query:', Colors.GREEN, query, Colors.ENDC)

query_quoted = urllib.parse.quote(query)
query_url = f'https://{args.host}{args.path}api.php?action=ask&query={query_quoted}&format=json'
print('query URL:', query_url)
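# The printed URL is the equivalent raw api.php ask request; opening it in a
# browser returns the same results as JSON.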

if args.dry is True:
    sys.exit()

# site and login
site = Site(host=args.host, path=args.path)

wd = os.path.dirname(os.path.abspath(__file__))  # working directory
imgdir = os.path.join(wd, 'images')
imgsjson_fn = os.path.join(wd, 'images.json')
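# images.json is expected to map wiki page names to dicts that include at least
# a 'filename' key for the locally downloaded image (see download_imgs.py).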
with open(imgsjson_fn, 'r') as imgsjson_file:
    images_info = json.load(imgsjson_file)

static_html = os.path.join(wd, 'static_html')
os.makedirs(static_html, exist_ok=True)  # create static_html/ dir
clean_dir(static_html)  # if static_html exists and has files or dirs: clean it
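
# login.txt is expected to hold the wiki username on its first line and the
# password on the second.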
with open(os.path.join(wd, 'login.txt'), 'r') as login:  # read login user & pwd
    loginlines = login.read()
    user, pwd = loginlines.strip().split('\n')
site.login(username=user, password=pwd)  # login to wiki

# read template files
with open(os.path.join(wd, 'templates/index.html')) as document_html:
    index_template = Template(document_html.read())

with open(os.path.join(wd, 'templates/document.html')) as document_html:
    document_template = Template(document_html.read())

with open(os.path.join(wd, 'templates/document_part.html')) as document_html:
    document_part_template = Template(document_html.read())
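# index.html lists the generated documents, document.html wraps a complete
# document, and document_part.html renders one part of a document (see the
# render calls below).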

all_document_parts = ''  # to append all content
documentslist = []

for answer in site.ask(query):
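    # Each answer is one result returned by the SMW ask API; unpack_response()
    # (from functions.py) is expected to return the page, a dict of its printout
    # properties, and the page's full URL.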
    # publication_title = ''
    # print(answer, answer.keys())
    page, printout_dict, fullurl = unpack_response(answer)
    print(page)
    try:
        img_info = images_info[printout_dict['page']]  # find corresponding image in images.json
    except KeyError:
        print(Colors.WARNING, f"{printout_dict['page']} is missing from the locally downloaded images")
        print(Colors.GREEN, 'run python3 download_imgs.py to fix the issue', Colors.ENDC)
        sys.exit()

    page = site.pages[printout_dict['page']]  # request that page from wiki
    pagetext = page.text()
    pagetext_html = pandoc(pwd=wd, content=pagetext, format_in='mediawiki', format_out='html')
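    # pandoc() (from functions.py) is assumed to call Pandoc to convert the
    # page's mediawiki markup into an HTML fragment.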
    img_local = os.path.join(imgdir, img_info.get('filename'))

    # RENDER document part
    document_part_html = document_part_template.render(
        printout_dict=printout_dict,
        imgsrc=img_local,
        text=pagetext_html,
        fullurl=fullurl,
    )
    all_document_parts += document_part_html  # append resulting html from document part to the previous parts

    if printout_dict['Part'] == printout_dict['Partof']:
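        # Part == Partof is assumed to mark the last part of a document: once it
        # is reached, all accumulated parts are rendered and written out as a
        # single HTML page.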
        # RENDER DOCUMENT
        # by passing all_document_parts html to document_template content
        document_html = document_template.render(
            title=printout_dict.get('Title'),
            date=printout_dict.get('Date'),
            content=all_document_parts)  # render document template
        htmlpage_fn = "{}.html".format(
            remove_nonwords(printout_dict.get('Title')[0])
        )
        with open(os.path.join(static_html, htmlpage_fn), 'w') as htmlfile:
            htmlfile.write(document_html)
        all_document_parts = ''  # Reset all_document_parts

        # add info to documentslist for index creation
        documentslist.append({'file': htmlpage_fn,
                              'title': printout_dict.get('Title'),
                              'date': printout_dict.get('Date'),
                              'creator': printout_dict.get('Creator')
                              })

# RENDER index.html from documentslist
index_html = index_template.render(index='Index',
                                   query=query,
                                   documentslist=documentslist)
with open(os.path.join(static_html, 'index.html'), 'w') as htmlfile:
    htmlfile.write(index_html)
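
# static_html/ now contains one HTML page per document plus an index.html
# linking to all of them.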