# special-issue-11-wiki2html/functions.py

import os, json, re, shlex, sys
import subprocess
from datetime import datetime


def remove_nonwords(imgname):
    """Return ``imgname`` with all regex non-word characters dropped from its stem.

    The name is split with ``os.path.splitext``; only the part before the
    extension is cleaned, the extension itself is re-attached unchanged.
    """
    stem, suffix = os.path.splitext(imgname)
    cleaned_stem = re.compile(r'\W').sub('', stem)
    return cleaned_stem + suffix


def pandoc(pwd, content, format_in, format_out):
    """Convert ``content`` from one markup format to another by running pandoc.

    The text is written to ``<pwd>/.mediawiki_content``, pandoc is invoked on
    that file and its result is read back from ``<pwd>/.html_content``.

    Args:
        pwd: directory in which the two temporary files are created.
        content: text to convert.
        format_in: value passed to pandoc's ``-f`` option.
        format_out: value passed to pandoc's ``-t`` option.

    Returns:
        The converted text, or an empty string when pandoc produced no
        output (for example because it exited with an error).

    Raises:
        FileNotFoundError: if the ``pandoc`` executable cannot be found.
    """
    mw_tmp_fn = os.path.join(pwd, '.mediawiki_content')
    html_tmp_fn = os.path.join(pwd, '.html_content')

    # pandoc reads and writes UTF-8, so do not depend on the locale encoding.
    # Opening with 'w' creates the file, which makes os.mknod (Unix-only and
    # privileged on some platforms) unnecessary.
    with open(mw_tmp_fn, 'w', encoding='utf-8') as mw_tmp_file:
        mw_tmp_file.write(content)

    # Create/truncate the output file up front: if pandoc fails we return ''
    # instead of crashing or handing back stale output of a previous call.
    with open(html_tmp_fn, 'w', encoding='utf-8'):
        pass

    # Build the argument list directly so that paths containing spaces or
    # quotes reach pandoc as single arguments. The format values are still
    # tokenised with shlex, so a value carrying extra options is split into
    # separate arguments exactly as before.
    pandoc_cmd = ['pandoc', mw_tmp_fn,
                  '-f', *shlex.split(format_in),
                  '-t', *shlex.split(format_out),
                  '-o', html_tmp_fn]
    returncode = subprocess.call(pandoc_cmd)
    if returncode != 0:
        print(f'pandoc exited with status {returncode}', file=sys.stderr)

    with open(html_tmp_fn, 'r', encoding='utf-8') as html_tmp_file:
        return html_tmp_file.read()


def page_props(wikicontent):
    """Collect ``|key=value`` pairs found in ``wikicontent`` into a dict.

    The key is the run of word characters between ``|`` and ``=``; the value
    is everything after ``=`` up to the end of that line. When a key occurs
    more than once, the last occurrence wins.
    """
    pair_pattern = re.compile(r'\|(\w*?)\=(.*)', flags=re.M)
    props = {}
    for hit in pair_pattern.finditer(wikicontent):
        key, value = hit.groups()
        props[key] = value
    return props


def unpack_response(response):
    """Flatten one query result into plain, easy-to-use Python values.

    Args:
        response: mapping with the keys ``'printouts'``, ``'fulltext'`` and
            ``'fullurl'``. ``'printouts'`` maps property names to lists of
            values (presumably a Semantic MediaWiki "ask" result - confirm
            against the caller).

    Returns:
        A tuple ``(page, simplified_printouts, fullurl)`` where
        ``simplified_printouts`` holds, per property:

        * ``'Date'``: a ``datetime`` built from the first value's
          ``'timestamp'``, or ``None`` when the property has no value;
        * ``'Part'``, ``'Partof'`` and ``'page'``: the value unchanged;
        * anything else: a list with the ``'fulltext'`` of every dict value;
          values that are not dicts are kept as they are.

    Raises:
        KeyError: if a required key is missing from ``response`` or from a
            dict value.
    """
    printouts = response['printouts']
    page = response['fulltext']
    fullurl = response['fullurl']

    # The JSON round trip yields a deep copy made of plain dicts/lists, so
    # adding 'page' below never mutates the caller's response object.
    printouts_loads = json.loads(json.dumps(printouts))
    printouts_loads['page'] = page

    simplified_printouts = {}
    for prop, values in printouts_loads.items():
        if prop == 'Date':
            # A property without a value arrives as an empty list; report
            # None instead of failing with IndexError on values[0].
            simplified_printouts[prop] = (
                datetime.fromtimestamp(int(values[0]['timestamp']))
                if values else None)
        elif prop in ('Part', 'Partof', 'page'):  # only 1 value for each
            simplified_printouts[prop] = values
        else:  # possibly more than 1 value for the remaining properties
            simplified_printouts[prop] = [
                item['fulltext'] if isinstance(item, dict) else item
                for item in values]

    return page, simplified_printouts, fullurl


def update_json(imgsjson_fn, img_dict, img_fn):
    """Record ``img_dict`` in the JSON registry and say whether to download.

    The registry file ``imgsjson_fn`` maps image names to their info dicts;
    it is read if it exists and always written back at the end.

    An image needs downloading when it is not yet in the registry, when
    ``img_fn`` is not a file on disk, or when the stored ``'timestamp'``
    differs from the one in ``img_dict``. In those cases the registry entry
    is (re)set to ``img_dict``.

    Returns:
        True if the image has to be downloaded, False otherwise.
    """
    registry = {}
    if os.path.isfile(imgsjson_fn):
        with open(imgsjson_fn, 'r') as registry_file:
            registry = json.load(registry_file)

    up_to_date = False
    if img_dict['name'] in registry:
        # both checks are always evaluated, in this order
        on_disk = os.path.isfile(img_fn)
        same_timestamp = (
            registry[img_dict['name']]['timestamp'] == img_dict['timestamp'])
        up_to_date = all((on_disk, same_timestamp))

    download = not up_to_date
    if download:
        # store the fresh info under the image's name
        registry[img_dict['name']] = img_dict

    with open(imgsjson_fn, 'w') as registry_file:
        json.dump(registry, registry_file, indent=4)

    return download


def clean_dir(dirfullpath):
    """Delete every regular file directly inside ``dirfullpath``.

    Sub-directories and their contents are left alone.
    """
    candidates = (os.path.join(dirfullpath, entry)
                  for entry in os.listdir(dirfullpath))
    for path in filter(os.path.isfile, candidates):
        os.remove(path)

def print_colormsg(msg, level):
    """Print ``msg`` preceded by the ANSI colour code chosen by ``level``.

    ``'fail'``, ``'warning'`` and ``'ok'`` select ``Colors.FAIL``,
    ``Colors.WARNING`` and ``Colors.BLUE``; any other level prints with an
    empty prefix. ``Colors.ENDC`` is always printed after the message, and
    ``print`` separates the three parts with spaces.
    """
    palette = (
        ('fail', Colors.FAIL),
        ('warning', Colors.WARNING),
        ('ok', Colors.BLUE),
    )
    prefix = next((code for label, code in palette if level == label), '')
    print(prefix, msg, Colors.ENDC)


class Colors:
    """ANSI escape sequences for styling terminal output."""

    # colour codes
    HEADER = '\033[95m'
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'

    # text attributes
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

    # emitted after a message to end the styling started above
    ENDC = '\033[0m'

# image upload function

def listimgs(dir):
    """Return the sorted names of the image entries found in ``dir``.

    An entry counts as an image when its extension, compared
    case-insensitively, is ``.jpg``, ``.jpeg`` or ``.png``.
    """
    image_exts = ('.jpg', '.jpeg', '.png')
    return sorted(
        entry for entry in os.listdir(dir)
        if os.path.splitext(entry)[-1].lower() in image_exts
    )


def reorder_imgs(dir, dry):
    """Zero-pad single-digit image numbers so the files sort in order.

    Every image returned by ``listimgs(dir)`` must be named
    ``<anything><digits><extension>`` (e.g. ``1.jpg``, ``01.jpg``,
    ``something01.jpg``). Images whose number is a single digit are renamed
    so that the number is padded to three digits (``1.jpg`` -> ``001.jpg``);
    all other images keep their name.

    All names are validated before anything is renamed, so a failure never
    leaves the directory half-renamed.

    Args:
        dir: directory holding the images.
        dry: when ``False`` the files are renamed; otherwise the planned
            renames are only printed.

    Returns:
        The sorted image names in ``dir`` after the (possible) renaming.

    Raises:
        SystemExit: with status 1 if a file name does not follow the
            pattern, or if a rename would overwrite an existing file.
    """
    lsimgs = listimgs(dir)
    # names a rename target must not collide with (os.replace would
    # silently overwrite an existing file)
    taken = set(os.listdir(dir))
    renames = []

    for img in lsimgs:
        img_ext = os.path.splitext(img)[1]
        numb_exp = re.compile(
            r'(?P<name>.*?)(?P<num>\d+)(?P<ext>%s)' % re.escape(img_ext))
        # fullmatch: the digits must sit right before the real extension and
        # nothing of the original name may be dropped
        match = numb_exp.fullmatch(img)
        if not match:
            print(f'Image {img}: filename is not suitable for bulk upload. '
                  f'Filename pattern doesn\'t match 1.jpg, 01.jpg or '
                  f'something01.jpg. You have to DO IT MANUALLY.')
            sys.exit(1)

        num = match.group('num')
        if len(num) != 1:  # only single-digit numbers are renamed
            continue

        new_img = match.group('name') + num.zfill(3) + match.group('ext')
        if new_img in taken:
            print(f'Image {img} cannot be renamed to {new_img}: a file with '
                  f'that name already exists. You have to DO IT MANUALLY.')
            sys.exit(1)
        taken.add(new_img)
        renames.append((img, new_img))

    for img, new_img in renames:
        print(f'Renaming: {img} >>>>>  {new_img}')
        # Kept as an equality test on purpose: only False (or 0) enables
        # renaming, so a value such as None still behaves as a dry run.
        if dry == False:
            os.replace(os.path.join(dir, img), os.path.join(dir, new_img))

    return listimgs(dir)   # update list w/ renamed imgs
padding folder files with zeros for correct order 5 years ago			`import os, json, re, shlex, sys`
images2html 5 years ago			`import subprocess`
wip: generate an index via ask 5 years ago			`from datetime import datetime`

renaming: publication --> document 5 years ago
remove non words 5 years ago			`def remove_nonwords(imgname):`
			`filename, ext = os.path.splitext(imgname) # split into filename & extension`
			`filename = re.sub(r'\W', '', filename) # remove nonwoders from filename`
			`return f'{filename}{ext}' # join filename & ext'`


fixed issues with pandoc, using local tmp files 5 years ago			`def pandoc(pwd, content, format_in, format_out):`
			`# print('HTML content file:', wiki_content_f.name)`
images2html 5 years ago
fixed issues with pandoc, using local tmp files 5 years ago			`# tmp files`
			`mw_tmp_fn = os.path.join(pwd, '.mediawiki_content')`
			`html_tmp_fn = os.path.join(pwd, '.html_content') # TODO: join with pw`
			`for fn in [mw_tmp_fn, html_tmp_fn ]:`
			`if os.path.isfile(fn) is False:`
			`os.mknod(fn) # create them if not in dir`
			`with open(mw_tmp_fn, 'w') as mw_tmp_fn_:`
			`mw_tmp_fn_.write(content)`

			`pandoc_cmd = f"pandoc {mw_tmp_fn} -f {format_in} -t {format_out} -o {html_tmp_fn}"`
			`subprocess.call(shlex.split(pandoc_cmd))`

			`with open(html_tmp_fn, 'r') as html_tmp_fn_:`
			`output = html_tmp_fn_.read()`

			`return output`
images2html 5 years ago

			`def page_props(wikicontent):`
			`exp = re.compile(r'\|(\w*?)\=(.*)', flags=re.M)`
			`found = re.findall(exp, wikicontent)`
			`found_dict ={item[0]: item[1] for item in found}`
			`return found_dict`

download_imgs.py script completed 5 years ago
wip: generate an index via ask 5 years ago			`def unpack_response(response):`
			`# printout is ordered dict`
			`d = {}`
			`printouts = response['printouts']`
			`page = response['fulltext']`
links from image to wiki 5 years ago			`fullurl = response['fullurl']`
WIP:multi values properties 5 years ago			`printouts_dumps = json.dumps(printouts)`
			`printouts_loads = json.loads(printouts_dumps)`
			`printouts_loads['page'] = page`
			`# printouts_loads['Date'] = datetime.fromtimestamp(`
			`# int(printouts_loads['Date'][0]['timestamp']))`
			`simplified_printouts = {}`
			`for k, v in printouts_loads.items():`
			`if k == 'Date':`
			`simplified_printouts[k] = datetime.fromtimestamp(`
			`int(v[0]['timestamp']))`
			`# elif k == 'Title':`
			`# simplified_printouts[k] = v[0]['fulltext']`
			`elif k in ['Part', 'Partof', 'page']: # only 1 value for each`
			`simplified_printouts[k] = v`
			`else: # Possibly more than 1 value for the rest of properties`
			`simplified_printouts[k] = []`
			`for listitem in v:`
			`simplified_printouts[k].append(listitem['fulltext'])`

			`# for prop in printouts:`
			`# p_item = response['printouts'][prop]`
			`# for prop_val in p_item:`
			`# if isinstance(prop_val, dict) is False:`
			`# d[prop] = prop_val`
			`# else:`
			`# # if len(prop_val) > 0:`
			`# props = list(prop_val.keys())`
			`# if 'fulltext' in props:`
			`# val = prop_val.get('fulltext')`
			`# elif 'timestamp' in props:`
			`# val = datetime.fromtimestamp(int(prop_val.get('timestamp')))`
			`# else:`
			`# val = list(prop_val.values())[0]`
			`# d[prop] = val`
			`return page, simplified_printouts, fullurl`
wip: generate an index via ask 5 years ago
download_imgs.py script completed 5 years ago
			`def update_json(imgsjson_fn, img_dict, img_fn):`
			`# write img_dict to json file`
			`if os.path.isfile(imgsjson_fn) is True: # if json exists`
			`with open(imgsjson_fn, 'r') as imgsjson_file: # read its content`
			`imgsjson_dict = json.load(imgsjson_file)`
			`# print(imgsjson_dict)`
			`else: # other wise`
			`imgsjson_dict = {} # imgsjson_dict will be an empty dictionary`

			`# is file already in dict`
			`if img_dict['name'] in imgsjson_dict.keys():`
			`# check if`
			`# file is locally stored`
			`img_issaved = os.path.isfile(img_fn)`
			`# timestamp in json is same as in img object`
			`img_samets = imgsjson_dict[img_dict['name']]['timestamp'] == img_dict['timestamp']`
			`if all([img_issaved, img_samets]) is False: # if one or more is False`
			`# ask it to download again`
			`download = True`
			`imgsjson_dict[img_dict['name']] = img_dict # add img_dict to imgsjson_dict under the key of the img.name`
			`else:`
			`download = False`
			`else:`
			`download = True`
			`imgsjson_dict[img_dict['name']] = img_dict # add img_dict to imgsjson_dict under the key of the img.name`

			`with open(imgsjson_fn, 'w') as imgsjson_file:`
			`json.dump(imgsjson_dict, imgsjson_file, indent=4)`

ask broken down into several arguments; --dry run 5 years ago			`return download`


cleaning static_html dir beefore creating new html 5 years ago			`def clean_dir(dirfullpath):`
			`for f in os.listdir(dirfullpath):`
			`f = os.path.join(dirfullpath, f)`
			`if os.path.isfile(f):`
			`os.remove(f)`

warnings 5 years ago			`def print_colormsg(msg, level):`
remove non words 5 years ago			`color_cmd = ''`
warnings 5 years ago			`if level == 'fail':`
print functions 5 years ago			`color_cmd = Colors.FAIL`
warnings 5 years ago			`elif level == 'warning':`
print functions 5 years ago			`color_cmd = Colors.WARNING`
warnings 5 years ago			`elif level == 'ok':`
print functions 5 years ago			`color_cmd = Colors.BLUE`
			`print(color_cmd, msg, Colors.ENDC)`
cleaning static_html dir beefore creating new html 5 years ago

ask broken down into several arguments; --dry run 5 years ago			`class Colors:`
			`HEADER = '\033[95m'`
			`BLUE = '\033[94m'`
			`GREEN = '\033[92m'`
			`WARNING = '\033[93m'`
			`FAIL = '\033[91m'`
			`ENDC = '\033[0m'`
			`BOLD = '\033[1m'`
images reorder 5 years ago			`UNDERLINE = '\033[4m'`

smw_prop_val_ template rendered 5 years ago			`# image upload function`

images reorder 5 years ago			`def listimgs(dir):`
			`lsimgs = [_file for _file in os.listdir(dir) if`
			`(os.path.splitext(_file)[-1]).lower() in`
			`['.jpg', '.jpeg', '.png']]`
			`lsimgs.sort()`
			`return lsimgs`


			`def reorder_imgs(dir, dry):`
padding folder files with zeros for correct order 5 years ago			`# does zero pad file numbers`
small corrections to def reorder_imgs 5 years ago			`# and returns correct order of files`
images reorder 5 years ago			`lsimgs = listimgs(dir)`
			`for img in lsimgs:`
			`img_name, img_ext = os.path.splitext(img)`
			`# does file follow \d{1,}\.img_ext`
			`numb_exp = re.compile(`
			`r'(?P<name>.*?)(?P<num>\d+)(?P<ext>%s)'% re.escape(img_ext))`
			`match = re.search(numb_exp, img)`
			`if not match:`
			`print(f'Image {img} Filename is not suitable for bulk upload.'`
padding folder files with zeros for correct order 5 years ago			`f'Filename pattern dn\'t match 1.jpg 01.jpg something01.jpg'`
images reorder 5 years ago			`f'You have to DO IT MANUALLY')`
padding folder files with zeros for correct order 5 years ago			`sys.exit()`
images reorder 5 years ago			`else:`
			`# only change name of single digit numbers`
			`if len(match.groupdict()['num']) == 1:`
			`name = match.groupdict()['name']`
padding folder files with zeros for correct order 5 years ago			`num = match.groupdict()['num'].zfill(3) # pad with 0s`
images reorder 5 years ago			`ext = match.groupdict()['ext']`
			`new_img = name + num + ext`
padding folder files with zeros for correct order 5 years ago			`src_img = os.path.join(dir, img)`
			`dst_img = os.path.join(dir, new_img)`
			`print(f'Renaming: {img} >>>>> {new_img}')`
images reorder 5 years ago			`if dry == False:`
padding folder files with zeros for correct order 5 years ago			`os.replace(src_img, dst_img)`
small corrections to def reorder_imgs 5 years ago			`return listimgs(dir) # update list w/ renamed imgs`