import json
import os
import re
import shlex
import subprocess
from datetime import datetime

# Matches ``|name=value`` parameters; the value runs to the end of the line.
_PROP_RE = re.compile(r'\|(\w*?)\=(.*)', flags=re.M)


def pandoc(pwd, content, format_in, format_out):
    """Convert *content* between formats by running the ``pandoc`` executable.

    The text is written to ``.mediawiki_content`` inside *pwd*, pandoc is
    invoked on that file, and the result is read back from ``.html_content``
    in the same directory.

    Args:
        pwd: Directory in which the two temporary files are kept.
        content: Text to convert.
        format_in: Value passed to pandoc's ``-f`` option.
        format_out: Value passed to pandoc's ``-t`` option.

    Returns:
        The text pandoc wrote to the output file. pandoc's exit status is
        not checked; when it writes nothing the result is an empty string.
    """
    mw_tmp_fn = os.path.join(pwd, '.mediawiki_content')
    html_tmp_fn = os.path.join(pwd, '.html_content')

    # Opening with 'w' creates the file when it is missing, so no separate
    # (Unix-only) os.mknod step is needed.
    with open(mw_tmp_fn, 'w', encoding='utf-8') as mw_tmp_file:
        mw_tmp_file.write(content)

    # Create or empty the output file up front: a failed run must not hand
    # back the result of an earlier conversion.
    with open(html_tmp_fn, 'w', encoding='utf-8'):
        pass

    # The paths are passed as single arguments so a directory name containing
    # whitespace stays intact. The format values are still shell-split, as
    # before, so any extra flags a caller packed into them keep working.
    pandoc_cmd = [
        'pandoc',
        mw_tmp_fn,
        '-f', *shlex.split(str(format_in)),
        '-t', *shlex.split(str(format_out)),
        '-o', html_tmp_fn,
    ]
    subprocess.call(pandoc_cmd)

    with open(html_tmp_fn, 'r', encoding='utf-8') as html_tmp_file:
        return html_tmp_file.read()


def page_props(wikicontent):
    """Collect ``|name=value`` pairs found in *wikicontent*.

    Args:
        wikicontent: Text to scan.

    Returns:
        A dict mapping each name to the rest of its line after ``=``. When a
        name occurs more than once, the last occurrence wins.
    """
    found = _PROP_RE.findall(wikicontent)
    return dict(found)


def unpack_response(response):
    """Flatten one query result into a plain dict of property values.

    Args:
        response: Mapping with the keys ``'printouts'``, ``'fulltext'`` and
            ``'fullurl'``. Each entry of ``'printouts'`` is an iterable of
            values.

    Returns:
        A ``(page, d, fullurl)`` tuple. ``d`` holds ``'page'`` plus one entry
        per property that has at least one usable value; when a property has
        several values, the last one is kept. Values are reduced as follows:

        * non-dict values are stored unchanged;
        * dicts with a ``'fulltext'`` key yield that entry;
        * dicts with a ``'timestamp'`` key yield a naive local-time
          ``datetime`` built from it;
        * other non-empty dicts yield their first value;
        * empty dicts are skipped.
    """
    printouts = response['printouts']
    page = response['fulltext']
    fullurl = response['fullurl']

    d = {'page': page}
    # Iterate and index rather than calling .items(), so that a list-typed
    # 'printouts' (e.g. an empty list) keeps working as before.
    for prop in printouts:
        for prop_val in printouts[prop]:
            if not isinstance(prop_val, dict):
                d[prop] = prop_val
            elif 'fulltext' in prop_val:
                d[prop] = prop_val['fulltext']
            elif 'timestamp' in prop_val:
                d[prop] = datetime.fromtimestamp(int(prop_val['timestamp']))
            elif prop_val:
                d[prop] = next(iter(prop_val.values()))
            # An empty dict carries no value: leave the property untouched
            # instead of failing with IndexError.
    return page, d, fullurl


def update_json(imgsjson_fn, img_dict, img_fn):
    """Record *img_dict* in the JSON index and report whether to download.

    Args:
        imgsjson_fn: Path of the JSON index file; created when missing.
        img_dict: Image metadata; must contain ``'name'`` and ``'timestamp'``.
        img_fn: Path where the image file is expected on disk.

    Returns:
        ``False`` when the index already lists the image with the same
        timestamp and *img_fn* exists as a file; ``True`` otherwise, in which
        case the index entry is replaced by *img_dict*. The index file is
        rewritten on every call.
    """
    if os.path.isfile(imgsjson_fn):
        with open(imgsjson_fn, 'r', encoding='utf-8') as imgsjson_file:
            imgsjson_dict = json.load(imgsjson_file)
    else:
        imgsjson_dict = {}

    name = img_dict['name']
    up_to_date = (
        name in imgsjson_dict
        and os.path.isfile(img_fn)
        and imgsjson_dict[name]['timestamp'] == img_dict['timestamp']
    )
    download = not up_to_date
    if download:
        imgsjson_dict[name] = img_dict

    with open(imgsjson_fn, 'w', encoding='utf-8') as imgsjson_file:
        json.dump(imgsjson_dict, imgsjson_file, indent=4)

    return download


def clean_dir(dirfullpath):
    """Delete every file directly inside *dirfullpath*.

    Subdirectories and their contents are left in place.
    """
    for entry in os.listdir(dirfullpath):
        path = os.path.join(dirfullpath, entry)
        if os.path.isfile(path):
            os.remove(path)


class Colors:
    """ANSI escape sequences for styling terminal output."""

    HEADER = '\033[95m'
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    ENDC = '\033[0m'  # resets all attributes
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'