You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
special-issue-11-wiki2html/functions.py

180 lines
6.2 KiB
Python

import os, json, re, shlex, sys
import subprocess
from datetime import datetime
def remove_nonwords(imgname):
    """Return *imgname* with all non-word characters stripped from its stem.

    The name is split with ``os.path.splitext``; only the part before the
    extension is filtered, the extension itself is returned unchanged.
    """
    stem, suffix = os.path.splitext(imgname)
    # Keep only what the regex engine considers word characters.
    cleaned_stem = re.sub(r'\W', '', stem)
    return cleaned_stem + suffix
def pandoc(pwd, content, format_in, format_out):
    """Convert *content* from *format_in* to *format_out* with the pandoc CLI.

    The text is exchanged with pandoc through two scratch files inside
    *pwd*: ``.mediawiki_content`` (input) and ``.html_content`` (output).

    Args:
        pwd: directory in which the two scratch files are written.
        content: text handed to pandoc.
        format_in: value passed to pandoc's ``-f`` option.
        format_out: value passed to pandoc's ``-t`` option.

    Returns:
        The text pandoc wrote to the output file. This is an empty string
        when pandoc wrote nothing (for instance because the conversion
        failed).
    """
    mw_tmp_fn = os.path.join(pwd, '.mediawiki_content')
    html_tmp_fn = os.path.join(pwd, '.html_content')

    # Opening with 'w' creates the file when it is missing, so the scratch
    # files need no separate os.mknod() call (which is not available on
    # every platform). pandoc reads and writes UTF-8, hence the explicit
    # encoding instead of the locale default.
    with open(mw_tmp_fn, 'w', encoding='utf-8') as mw_tmp_file:
        mw_tmp_file.write(content)

    # Start from an empty output file: if pandoc fails, the result of an
    # earlier conversion must not be returned as if it were the new one.
    with open(html_tmp_fn, 'w', encoding='utf-8'):
        pass

    # Quote the paths so a directory name containing spaces or shell
    # metacharacters survives shlex.split() as a single argument.
    pandoc_cmd = (f"pandoc {shlex.quote(mw_tmp_fn)} -f {format_in} "
                  f"-t {format_out} -o {shlex.quote(html_tmp_fn)}")
    subprocess.call(shlex.split(pandoc_cmd))

    with open(html_tmp_fn, 'r', encoding='utf-8') as html_tmp_file:
        return html_tmp_file.read()
def page_props(wikicontent):
    """Collect the ``|key=value`` pairs found in *wikicontent* into a dict.

    A pair starts at a ``|`` that is directly followed by word characters
    and an ``=``; the value is everything after the ``=`` up to the end of
    that line. When a key occurs more than once, the last value wins.
    """
    prop_pattern = re.compile(r'\|(\w*?)\=(.*)', flags=re.M)
    props = {}
    for match in prop_pattern.finditer(wikicontent):
        key, value = match.groups()
        props[key] = value
    return props
def unpack_response(response):
    """Flatten one query result into ``(page, properties, fullurl)``.

    *response* must provide the keys ``'printouts'``, ``'fulltext'`` and
    ``'fullurl'``. The printouts are simplified per property:

    * ``'Date'``: the ``'timestamp'`` of the first value, converted with
      ``datetime.fromtimestamp``;
    * ``'Part'``, ``'Partof'`` and the added ``'page'`` entry: stored as-is;
    * anything else: the list of the ``'fulltext'`` of each value.

    Returns:
        A tuple of the page name (``response['fulltext']``), the dict of
        simplified properties, and ``response['fullurl']``.
    """
    kept_as_is = ('Part', 'Partof', 'page')  # only 1 value for each

    raw_printouts = response['printouts']
    page = response['fulltext']
    fullurl = response['fullurl']

    # The JSON round-trip yields an independent copy, so adding the 'page'
    # entry below does not touch the caller's response.
    props = json.loads(json.dumps(raw_printouts))
    props['page'] = page

    simplified = {}
    for key, values in props.items():
        if key == 'Date':
            first_timestamp = int(values[0]['timestamp'])
            simplified[key] = datetime.fromtimestamp(first_timestamp)
        elif key in kept_as_is:
            simplified[key] = values
        else:
            # Possibly more than 1 value for the rest of the properties.
            simplified[key] = [entry['fulltext'] for entry in values]
    return page, simplified, fullurl
def update_json(imgsjson_fn, img_dict, img_fn):
    """Record *img_dict* in the JSON file and tell whether to download.

    The JSON file *imgsjson_fn* maps image names to their info dicts; it is
    read when it exists and is always (re)written before returning.

    A download is requested when the image name is not yet in the file, or
    when it is but either *img_fn* is not a local file or the stored
    ``'timestamp'`` differs from ``img_dict['timestamp']``. In those cases
    the entry for ``img_dict['name']`` is set to *img_dict*.

    Returns:
        ``True`` when the image has to be downloaded, ``False`` otherwise.
    """
    if os.path.isfile(imgsjson_fn):
        with open(imgsjson_fn, 'r') as imgsjson_file:
            registry = json.load(imgsjson_file)
    else:
        registry = {}  # no JSON file yet: start from an empty registry

    name = img_dict['name']
    download = True
    if name in registry:
        stored_locally = os.path.isfile(img_fn)
        same_timestamp = registry[name]['timestamp'] == img_dict['timestamp']
        # An up-to-date local copy is the only case that needs no download.
        download = not (stored_locally and same_timestamp)

    if download:
        registry[name] = img_dict

    with open(imgsjson_fn, 'w') as imgsjson_file:
        json.dump(registry, imgsjson_file, indent=4)
    return download
def clean_dir(dirfullpath):
    """Delete every regular file directly inside *dirfullpath*.

    Sub-directories and their contents are left alone.
    """
    entries = (os.path.join(dirfullpath, name)
               for name in os.listdir(dirfullpath))
    for path in filter(os.path.isfile, entries):
        os.remove(path)
def print_colormsg(msg, level):
    """Print *msg* preceded by the colour code that belongs to *level*.

    ``'fail'``, ``'warning'`` and ``'ok'`` select ``Colors.FAIL``,
    ``Colors.WARNING`` and ``Colors.BLUE``; any other level prints the
    message without a colour code. ``Colors.ENDC`` always follows the
    message.
    """
    palette = (
        ('fail', Colors.FAIL),
        ('warning', Colors.WARNING),
        ('ok', Colors.BLUE),
    )
    color_cmd = next((code for name, code in palette if level == name), '')
    print(color_cmd, msg, Colors.ENDC)
class Colors:
    """ANSI escape sequences for styling terminal output.

    Print one of the codes before a piece of text and ``ENDC`` after it.
    """

    # Foreground colours (ANSI "bright" range, codes 91-95).
    HEADER = '\033[95m'
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'

    # Code 0 resets every attribute set so far.
    ENDC = '\033[0m'

    # Text attributes.
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'
# image upload function
def listimgs(dir):
    """Return the sorted names of the image files found in *dir*.

    An entry counts as an image when its extension, compared
    case-insensitively, is ``.jpg``, ``.jpeg`` or ``.png``.
    """
    image_exts = ['.jpg', '.jpeg', '.png']
    return sorted(
        entry for entry in os.listdir(dir)
        if os.path.splitext(entry)[-1].lower() in image_exts
    )
def reorder_imgs(dir, dry):
    """Zero-pad single-digit image numbers so the images sort in order.

    Every image returned by ``listimgs(dir)`` must be named
    ``<anything><digits><ext>`` (``1.jpg``, ``01.jpg``, ``something01.jpg``).
    Images whose number is a single digit are renamed with that number
    padded to three digits (``1.jpg`` -> ``001.jpg``); all other images
    keep their name.

    All filenames are checked before the first rename, so an unsuitable
    filename aborts the run while the directory is still untouched.

    Args:
        dir: directory holding the images.
        dry: pass ``False`` to really rename the files; with any other
            value the planned renames are only printed.

    Returns:
        The sorted image filenames in *dir* after the (possible) renames.

    Raises:
        SystemExit: with exit status 1 when a filename does not end in a
            number, or when a rename would overwrite an existing file.
    """
    # Matched against the whole stem (filename without extension): the lazy
    # 'name' group leaves the complete trailing run of digits to 'num'.
    # Anchoring on the full stem guarantees no part of the name is dropped
    # when the new filename is assembled.
    numbered_stem = re.compile(r'(?P<name>.*?)(?P<num>\d+)')

    # First pass: validate every filename and plan the renames.
    renames = []
    for img in listimgs(dir):
        stem, ext = os.path.splitext(img)
        match = numbered_stem.fullmatch(stem)
        if match is None:
            print(f"Image {img} Filename is not suitable for bulk upload. "
                  f"Filename pattern doesn't match 1.jpg 01.jpg "
                  f"something01.jpg. You have to DO IT MANUALLY")
            sys.exit(1)  # non-zero status: this is a failure
        # only change name of single digit numbers
        if len(match.group('num')) == 1:
            new_img = match.group('name') + match.group('num').zfill(3) + ext
            if os.path.exists(os.path.join(dir, new_img)):
                # os.replace() would silently overwrite the existing file.
                print(f"Image {img} cannot be renamed to {new_img}: "
                      f"a file with that name already exists. "
                      f"You have to DO IT MANUALLY")
                sys.exit(1)
            renames.append((img, new_img))

    # Second pass: report and (unless this is a dry run) apply the renames.
    for img, new_img in renames:
        print(f'Renaming: {img} >>>>> {new_img}')
        # Equality test kept on purpose: only False (or 0) triggers the
        # rename; other values such as None keep the run dry.
        if dry == False:  # noqa: E712
            os.replace(os.path.join(dir, img), os.path.join(dir, new_img))
    return listimgs(dir)  # update list w/ renamed imgs