# special-issue-11-wiki2html/functions.py

import os, json, re, shlex
import subprocess
from datetime import datetime


def pandoc(pwd, content, format_in, format_out):
    """Convert *content* between markup formats by running the pandoc CLI.

    The text is written to a scratch file inside *pwd*, pandoc converts it
    into a second scratch file, and that file's content is returned.

    Args:
        pwd: Existing directory that holds the two scratch files
            ``.mediawiki_content`` (input) and ``.html_content`` (output).
        content: Source text to convert.
        format_in: Pandoc input format, passed to ``-f``.
        format_out: Pandoc output format, passed to ``-t``.

    Returns:
        The converted text. An empty string is returned when pandoc wrote
        nothing to the output file (e.g. the conversion failed; pandoc
        reports the reason on stderr).

    Raises:
        OSError: If the scratch files cannot be written/read or the
            ``pandoc`` executable cannot be started.
    """
    mw_tmp_fn = os.path.join(pwd, '.mediawiki_content')
    html_tmp_fn = os.path.join(pwd, '.html_content')

    # Opening in 'w' mode creates the file when it is missing, so the
    # non-portable os.mknod() call is not needed. Pandoc reads UTF-8.
    with open(mw_tmp_fn, 'w', encoding='utf-8') as mw_tmp_file:
        mw_tmp_file.write(content)

    # Reset the output file so that a failed run cannot hand back the
    # result of a previous conversion.
    with open(html_tmp_fn, 'w', encoding='utf-8'):
        pass

    # Pass an argument list instead of a re-split command string so paths
    # containing spaces or quotes reach pandoc intact.
    pandoc_cmd = [
        'pandoc', mw_tmp_fn,
        '-f', format_in,
        '-t', format_out,
        '-o', html_tmp_fn,
    ]
    subprocess.call(pandoc_cmd)

    with open(html_tmp_fn, 'r', encoding='utf-8') as html_tmp_file:
        return html_tmp_file.read()


def page_props(wikicontent):
    """Collect ``|key=value`` pairs found in wiki text into a dictionary.

    A pair starts at a pipe character; the key is the (possibly empty) run
    of word characters up to the first ``=`` and the value is everything
    after that ``=`` up to the end of the line, taken verbatim. When a key
    occurs more than once, the last occurrence wins.

    Args:
        wikicontent: Text to scan.

    Returns:
        A dict mapping each key to its value string.
    """
    prop_pattern = re.compile(r'\|(\w*?)\=(.*)', flags=re.M)
    # findall yields (key, value) tuples, which dict() consumes directly.
    return dict(prop_pattern.findall(wikicontent))


def unpack_response(response):
    """Flatten one query result into ``(page, properties, fullurl)``.

    Args:
        response: Mapping with the keys ``'printouts'``, ``'fulltext'`` and
            ``'fullurl'``. ``'printouts'`` maps property names to lists of
            values.

    Returns:
        A tuple ``(page, d, fullurl)`` where ``page`` and ``fullurl`` are
        taken from ``response`` and ``d`` holds ``'page'`` plus one entry
        per property that has at least one value. If a property has
        several values, only the last one is kept.
    """
    printouts = response['printouts']
    page = response['fulltext']
    fullurl = response['fullurl']

    def _flatten(raw):
        # Non-dict values are stored unchanged.
        if not isinstance(raw, dict):
            return raw
        # Dict values: prefer the 'fulltext' entry, then a 'timestamp'
        # converted to a datetime, otherwise fall back to the first value.
        if 'fulltext' in raw:
            return raw.get('fulltext')
        if 'timestamp' in raw:
            return datetime.fromtimestamp(int(raw.get('timestamp')))
        return list(raw.values())[0]

    d = {'page': page}
    # Index by key rather than using .items(), so any iterable of keys
    # (including an empty list) is handled the same way.
    for name in printouts:
        for raw in printouts[name]:
            d[name] = _flatten(raw)
    return page, d, fullurl


def update_json(imgsjson_fn, img_dict, img_fn):
    """Record an image in the JSON registry and report whether to fetch it.

    The registry file maps image names to their metadata dicts. An image
    needs downloading when it is not in the registry yet, when its local
    file is missing, or when its stored timestamp differs from the one in
    *img_dict*. In those cases the registry entry is replaced by
    *img_dict*. The registry file is rewritten on every call.

    Args:
        imgsjson_fn: Path of the JSON registry; created if it does not exist.
        img_dict: Image metadata with at least ``'name'`` and
            ``'timestamp'`` keys.
        img_fn: Path where the image file is expected locally.

    Returns:
        True if the image should be downloaded, False otherwise.
    """
    # Start from the registry on disk, or from scratch when there is none.
    if os.path.isfile(imgsjson_fn):
        with open(imgsjson_fn, 'r') as registry_file:
            registry = json.load(registry_file)
    else:
        registry = {}

    name = img_dict['name']
    if name in registry:
        # Both checks are always evaluated for known images.
        on_disk = os.path.isfile(img_fn)
        same_timestamp = registry[name]['timestamp'] == img_dict['timestamp']
        download = not (on_disk and same_timestamp)
    else:
        download = True

    if download:
        registry[name] = img_dict

    with open(imgsjson_fn, 'w') as registry_file:
        json.dump(registry, registry_file, indent=4)

    return download


def clean_dir(dirfullpath):
    """Delete every regular file directly inside *dirfullpath*.

    Sub-directories and their contents are left untouched.

    Args:
        dirfullpath: Path of the directory to empty of files.
    """
    for entry_name in os.listdir(dirfullpath):
        candidate = os.path.join(dirfullpath, entry_name)
        if not os.path.isfile(candidate):
            continue  # skip anything that is not a file
        os.remove(candidate)


class Colors:
    """ANSI escape sequences for styling terminal output.

    Prefix text with one of these constants and append ``ENDC`` to restore
    the terminal's default rendering.
    """
    HEADER = '\033[95m'  # SGR 95: bright magenta foreground
    BLUE = '\033[94m'  # SGR 94: bright blue foreground
    GREEN = '\033[92m'  # SGR 92: bright green foreground
    WARNING = '\033[93m'  # SGR 93: bright yellow foreground
    FAIL = '\033[91m'  # SGR 91: bright red foreground
    ENDC = '\033[0m'  # SGR 0: reset all attributes
    BOLD = '\033[1m'  # SGR 1: bold / increased intensity
    UNDERLINE = '\033[4m'  # SGR 4: underline