import json
import os
import re
import shlex
import subprocess
import sys
from datetime import datetime


def remove_nonwords(imgname):
    """Strip every non-word character from the stem of a file name.

    The extension (as split off by ``os.path.splitext``) is kept untouched.

    Args:
        imgname: File name, e.g. ``'my img-01.JPG'``.

    Returns:
        The cleaned file name, e.g. ``'myimg01.JPG'``.
    """
    filename, ext = os.path.splitext(imgname)
    filename = re.sub(r'\W', '', filename)
    return f'{filename}{ext}'


def pandoc(pwd, content, format_in, format_out):
    """Convert ``content`` between markup formats by shelling out to pandoc.

    The content is written to ``<pwd>/.mediawiki_content``, pandoc is asked to
    write its result to ``<pwd>/.html_content``, and that file is read back.

    Args:
        pwd: Directory in which the two temporary files are kept.
        content: Text to convert.
        format_in: Value passed to pandoc's ``-f`` option.
        format_out: Value passed to pandoc's ``-t`` option.

    Returns:
        The text found in the output file after pandoc has run.
    """
    mw_tmp_fn = os.path.join(pwd, '.mediawiki_content')
    html_tmp_fn = os.path.join(pwd, '.html_content')

    # pandoc reads and writes UTF-8, so do not rely on the locale encoding.
    with open(mw_tmp_fn, 'w', encoding='utf-8') as mw_tmp_file:
        mw_tmp_file.write(content)

    # Make sure the output file exists so the read below cannot fail when
    # pandoc exits without writing it (append mode keeps existing content).
    # NOTE(review): if pandoc fails, whatever a previous run left in this
    # file is returned -- confirm whether callers rely on that.
    with open(html_tmp_fn, 'a', encoding='utf-8'):
        pass

    # An argument list (instead of a string split with shlex) keeps paths
    # containing spaces or backslashes intact.
    pandoc_cmd = [
        'pandoc', mw_tmp_fn,
        '-f', format_in,
        '-t', format_out,
        '-o', html_tmp_fn,
    ]
    subprocess.call(pandoc_cmd)

    with open(html_tmp_fn, 'r', encoding='utf-8') as html_tmp_file:
        return html_tmp_file.read()


def page_props(wikicontent):
    """Collect ``|key=value`` pairs found in wiki text.

    Args:
        wikicontent: Wiki markup to scan.

    Returns:
        Dict mapping each key to the rest of its line. When a key occurs
        more than once, the last occurrence wins.
    """
    exp = re.compile(r'\|(\w*?)\=(.*)', flags=re.M)
    return {key: value for key, value in re.findall(exp, wikicontent)}


def unpack_response(response):
    """Flatten one query result into ``(page, properties, fullurl)``.

    Args:
        response: Mapping with the keys ``'printouts'``, ``'fulltext'`` and
            ``'fullurl'``.

    Returns:
        Tuple ``(page, simplified_printouts, fullurl)`` where
        ``simplified_printouts`` holds:

        * ``'Date'``: a ``datetime`` built from the first item's
          ``'timestamp'`` (local time, via ``datetime.fromtimestamp``);
        * ``'Part'``, ``'Partof'``, ``'page'``: the value as received;
        * any other key: the list of each item's ``'fulltext'``.

    Raises:
        IndexError: If the ``'Date'`` printout is an empty list.
        KeyError, TypeError: If an item lacks the expected key or is not
            a mapping.
    """
    page = response['fulltext']
    fullurl = response['fullurl']

    # The JSON round trip yields a plain-dict deep copy, so adding 'page'
    # below does not modify the caller's response.
    printouts = json.loads(json.dumps(response['printouts']))
    printouts['page'] = page

    simplified_printouts = {}
    for key, value in printouts.items():
        if key == 'Date':
            simplified_printouts[key] = datetime.fromtimestamp(
                int(value[0]['timestamp']))
        elif key in ('Part', 'Partof', 'page'):
            # only one value for each of these
            simplified_printouts[key] = value
        else:
            # possibly more than one value for the remaining properties
            simplified_printouts[key] = [item['fulltext'] for item in value]
    return page, simplified_printouts, fullurl


def update_json(imgsjson_fn, img_dict, img_fn):
    """Record an image in the JSON index and report whether to download it.

    Args:
        imgsjson_fn: Path of the JSON index file; treated as empty when it
            does not exist yet.
        img_dict: Image metadata; must contain ``'name'`` and ``'timestamp'``.
        img_fn: Path where the image file is expected locally.

    Returns:
        ``True`` when the image is not in the index, is missing on disk, or
        its indexed timestamp differs from ``img_dict['timestamp']``;
        ``False`` otherwise. The index entry is replaced only when ``True``
        is returned; the index file is rewritten on every call.
    """
    if os.path.isfile(imgsjson_fn):
        with open(imgsjson_fn, 'r') as imgsjson_file:
            imgsjson_dict = json.load(imgsjson_file)
    else:
        imgsjson_dict = {}

    name = img_dict['name']
    if name in imgsjson_dict:
        img_issaved = os.path.isfile(img_fn)
        img_samets = imgsjson_dict[name]['timestamp'] == img_dict['timestamp']
        download = not (img_issaved and img_samets)
    else:
        download = True

    if download:
        imgsjson_dict[name] = img_dict

    with open(imgsjson_fn, 'w') as imgsjson_file:
        json.dump(imgsjson_dict, imgsjson_file, indent=4)
    return download


def clean_dir(dirfullpath):
    """Delete every regular file directly inside ``dirfullpath``.

    Sub-directories and their contents are left alone.
    """
    for entry in os.listdir(dirfullpath):
        path = os.path.join(dirfullpath, entry)
        if os.path.isfile(path):
            os.remove(path)


def print_colormsg(msg, level):
    """Print ``msg`` wrapped in the ANSI colour matching ``level``.

    Args:
        msg: Text to print.
        level: ``'fail'``, ``'warning'`` or ``'ok'``; any other value prints
            without a colour prefix.
    """
    color_cmd = ''
    if level == 'fail':
        color_cmd = Colors.FAIL
    elif level == 'warning':
        color_cmd = Colors.WARNING
    elif level == 'ok':
        color_cmd = Colors.BLUE
    print(color_cmd, msg, Colors.ENDC)


class Colors:
    """ANSI escape sequences used by ``print_colormsg``."""

    HEADER = '\033[95m'
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    ENDC = '\033[0m'  # resets the terminal attributes
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'


# image upload function
def listimgs(dir):
    """Return the sorted names of the .jpg/.jpeg/.png files in ``dir``.

    The extension check is case-insensitive; entries are not checked for
    being regular files.
    """
    img_exts = ('.jpg', '.jpeg', '.png')
    return sorted(
        _file for _file in os.listdir(dir)
        if os.path.splitext(_file)[-1].lower() in img_exts
    )


def reorder_imgs(dir, dry):
    """Zero-pad single-digit image numbers so the files sort correctly.

    Every image name must look like ``<name><digits><ext>`` (``1.jpg``,
    ``01.jpg``, ``something01.jpg``). Names whose number is one digit long
    are renamed with the number padded to three digits.

    Args:
        dir: Directory holding the images.
        dry: When ``False`` the files are renamed; otherwise the planned
            renames are only printed.

    Returns:
        The image listing of ``dir`` after any renaming.

    Raises:
        SystemExit: If an image name does not match the expected pattern.
    """
    for img in listimgs(dir):
        img_ext = os.path.splitext(img)[1]
        # The named groups are read back below, so they must be spelled out
        # in the pattern.
        numb_exp = re.compile(
            r'(?P<name>.*?)(?P<num>\d+)(?P<ext>%s)' % re.escape(img_ext))
        match = re.search(numb_exp, img)
        if not match:
            print(f'Image {img} Filename is not suitable for bulk upload.'
                  f'Filename pattern dn\'t match 1.jpg 01.jpg something01.jpg'
                  f'You have to DO IT MANUALLY')
            sys.exit()

        # only change the name of single digit numbers
        if len(match.group('num')) != 1:
            continue
        new_img = (match.group('name')
                   + match.group('num').zfill(3)  # pad with 0s
                   + match.group('ext'))
        print(f'Renaming: {img} >>>>> {new_img}')
        # Kept as an equality test so that only False/0 triggers a rename
        # (e.g. dry=None still behaves as a dry run).
        if dry == False:  # noqa: E712
            os.replace(os.path.join(dir, img), os.path.join(dir, new_img))
    return listimgs(dir)  # update list w/ renamed imgs