diff --git a/.gitignore b/.gitignore index a512554..67798e6 100644 --- a/.gitignore +++ b/.gitignore @@ -6,6 +6,7 @@ images.json .mediawiki_content .html_content .DS_Store + # venv dirs & files .idea/ bin/ @@ -14,3 +15,5 @@ lib64 pyvenv.cfg share/ __pycache__/ +*.jpg +*.jpeg diff --git a/README.md b/README.md index f07bb5f..5a2fe60 100644 --- a/README.md +++ b/README.md @@ -39,28 +39,28 @@ Run scripts together with `./run.sh` 1 script at a time: -`python3 download_imgs.py` -* Downloads all images from wiki to `images/` directory +`python3 download_imgs.py` +* Downloads all images from wiki to `images/` directory * and stores each image's metadata to `images.json` `python3 query2html.py` -* with ask API perform a query: +* with ask API perform a query: * help `python3 query2html.py --help` * run dry `python3 query2html.py --dry` only printing request, not executing it * build custom query with arguments `--conditions --printouts --sort --order` * default query is: `[[File:+]][[Title::+]][[Part::+]][[Date::+]]|?Title|?Date|?Part|?Partof|sort=Date,Title,Part|order=asc,asc,asc` - * custom queries + * custom queries * `python3 query2html.py --conditions '[[Date::>=1970/01/01]][[Date::<=1979/12/31]]'` * `python3 query2html.py --conditions '[[Creator::~*task force*]]'` -Note: to avoid confusion or problems is better to leave the `--printouts` `--sort` `--order` arguments as the default. +Note: to avoid confusion or problems is better to leave the `--printouts` `--sort` `--order` arguments as the default. Otherwise document parts will start to get grouped not according to their Title, hence creating documents made from different original parts. ## How does query2html.py work? Based on the query made: -MW API will send back a number of Page titles that match the query conditions, +MW API will send back a number of Page titles that match the query conditions, together with its printouts (metadata proprety::value pairs). 
# Extensions (compared lower-cased) that mark a file as an uploadable image.
IMG_EXTENSIONS = ('.jpg', '.jpeg', '.png')

# A filename stem: an arbitrary (possibly empty) prefix followed by its
# trailing run of digits, e.g. '1', '01' or 'something01'.
_NUMBERED_STEM = re.compile(r'(?P<name>.*?)(?P<num>\d+)')


def print_colormsg(msg, level):
    """Print `msg` wrapped in the terminal color that matches `level`.

    :param msg: text to print
    :param level: 'fail', 'warning' or 'ok'; any other value prints the
        message without a color instead of crashing
    """
    # Built at call time because the Colors class is defined further down
    # in this module.
    colors_by_level = {
        'fail': Colors.FAIL,
        'warning': Colors.WARNING,
        'ok': Colors.BLUE,
    }
    color_cmd = colors_by_level.get(level, '')
    print(color_cmd, msg, Colors.ENDC)


# image upload function

def listimgs(dir):
    """Return the image filenames found in `dir`, sorted alphabetically.

    A file counts as an image when its extension, ignoring case, is one of
    IMG_EXTENSIONS. Only names are returned, not full paths.
    """
    lsimgs = [_file for _file in os.listdir(dir)
              if os.path.splitext(_file)[-1].lower() in IMG_EXTENSIONS]
    lsimgs.sort()
    return lsimgs


def _split_numbered(img):
    """Split 'something01.jpg' into ('something', '01', '.jpg').

    Returns None when the filename does not end in digits + extension.
    """
    stem, ext = os.path.splitext(img)
    match = _NUMBERED_STEM.fullmatch(stem)
    if match is None:
        return None
    return match['name'], match['num'], ext


def _page_order(img):
    """Sort key: prefix first, then the page number compared as an integer."""
    name, num, _ext = _split_numbered(img)
    return name, int(num), img


def reorder_imgs(dir, dry):
    """Zero-pad single-digit image numbers and return the images in order.

    Every image filename in `dir` must look like 1.jpg, 01.jpg or
    something01.jpg. Files whose number is a single digit are renamed so the
    number is padded to three digits (1.jpg -> 001.jpg); other files are left
    untouched.

    :param dir: directory holding the images
    :param dry: when true, only print the planned renames, do not rename
    :return: image filenames currently in `dir`, ordered by prefix and then
        by numeric value of the page number (so 11.jpg comes before 100.jpg)
    :raises SystemExit: with a non-zero status and an explanatory message
        when a filename does not follow the pattern, or when a rename would
        overwrite an existing file. Nothing is renamed in that case.
    """
    # First pass: validate every file and plan the renames, so that a bad
    # filename cannot leave the directory half-renamed.
    renames = []
    for img in listimgs(dir):
        parts = _split_numbered(img)
        if parts is None:
            sys.exit(f'Image {img} Filename is not suitable for bulk upload. '
                     f'Filename pattern doesn\'t match 1.jpg 01.jpg '
                     f'something01.jpg. You have to DO IT MANUALLY')
        name, num, ext = parts
        # only change name of single digit numbers
        if len(num) == 1:
            new_img = name + num.zfill(3) + ext  # pad with 0s
            if os.path.exists(os.path.join(dir, new_img)):
                # os.replace would silently overwrite the existing file
                sys.exit(f'Cannot rename {img} to {new_img}: {new_img} '
                         f'already exists. You have to DO IT MANUALLY')
            renames.append((img, new_img))

    # Second pass: report and, unless this is a dry run, perform the renames.
    for img, new_img in renames:
        print(f'Renaming: {img} >>>>> {new_img}')
        if not dry:
            os.replace(os.path.join(dir, img), os.path.join(dir, new_img))

    # Re-read the directory so the result reflects what is really on disk.
    return sorted(listimgs(dir), key=_page_order)
#!/bin/sh
# Convert a PDF into a folder of JPGs, one image per page.
#
# Usage: ./pdf2jpg.sh "/path/to/PDFname.pdf"
#
# The JPGs are written to a directory named after the PDF without its
# .pdf extension (/path/to/PDFname/00.jpg, 01.jpg, ...).
# Requires ImageMagick's `convert`.

if [ "$#" -ne 1 ]; then
    echo "Usage: $0 /path/to/PDFname.pdf" >&2
    exit 1
fi

PDF="$1"
echo "$PDF"

if [ ! -f "$PDF" ]; then
    echo "Error: $PDF is not a file" >&2
    exit 1
fi

# Strip only a trailing .pdf extension. The previous unquoted
# `sed s/\.pdf//` reached sed as `s/.pdf//`, which removes the first
# "<any char>pdf" anywhere in the path (e.g. inside a directory name).
case "$PDF" in
    *.pdf) DIR="${PDF%.pdf}" ;;
    *.PDF) DIR="${PDF%.PDF}" ;;
    *)
        echo "Error: $PDF does not end in .pdf" >&2
        exit 1
        ;;
esac
echo "$DIR"

# -p: re-running on the same PDF must not fail because the folder exists.
mkdir -p "$DIR" || exit 1

echo "Starting convertion ..."
# Stop here on failure so the success message below is never printed for a
# conversion that did not happen.
convert -quality 100 -density 300 "$PDF" "$DIR/%02d.jpg" || exit 1
echo "PDF converted thanks to Damla aka Imagemagick ninja"
"""Bulk-upload the images of one directory to the wiki.

Every image becomes the wiki file `<directory name>-<image filename>` and
gets a description rendered from templates/smw_infobox_template.jinja with
the metadata given on the command line. Title, date and the other
properties are shared by all images; Part / Partof are derived from the
position of the image in the directory.

Get help: python3 upload_imgs_dir.py --help
"""
import argparse
import os
import re
import sys
from datetime import datetime

from jinja2 import Template
from mwclient import Site, errors

from functions import print_colormsg, reorder_imgs

MULTI_VALUE_HELP = 'Multiple values should be SEPARATED BY COMMA'


def build_parser():
    """Return the command line parser of this script."""
    p = argparse.ArgumentParser(
        description='Upload files from a directory, with metadata values to the wiki.\n'
                    'Note that any VALUES CONTAINING '
                    'SPACES SHOULD BE BETWEEN QUOTATION MARKS',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    # TODO: Add example of command to description
    p.add_argument('--host', default='hub.xpub.nl/sandbox', help='wiki host')
    p.add_argument('--path', default='/itchwiki/',
                   help='Wiki path. Should end with /')
    p.add_argument('--dry', '-d', action='store_true',
                   help='dry-run: will only print the metadata of each file that '
                        'will be upload, but does NOT upload')
    p.add_argument('--dir', required=True,
                   help='Required. Full path of the image directory, that you wish to upload')
    p.add_argument('--title', required=True,
                   help='Required. Must not exist yet in the wiki.')
    p.add_argument('--date', required=True,
                   help='Required. Format: yyyy/mm/dd '
                        'For dates without day or month use 01 as default '
                        'ie. 1986: --date "1986/01/01" '
                        'March 1985: --date "1985/03/01"')
    # Repeatable options: each occurrence is appended to the list, and the
    # list is joined with ', ' when the metadata is rendered.
    p.add_argument('--creator', required=False, action='append', default=[],
                   help=MULTI_VALUE_HELP)
    p.add_argument('--org', required=False, action='append', default=[],
                   help='Organization:' + MULTI_VALUE_HELP)
    p.add_argument('--format', required=False, action='append', default=[],
                   help=MULTI_VALUE_HELP)
    p.add_argument('--event', required=False, action='append', default=[],
                   help=MULTI_VALUE_HELP)
    p.add_argument('--topic', required=False, action='append', default=[],
                   help=MULTI_VALUE_HELP)
    # TODO ADD NEW PROPS
    return p


def fail(msg):
    """Print `msg` as an error and stop the script with exit status 1."""
    print_colormsg(msg, level='fail')
    sys.exit(1)


def is_valid_date(date):
    """Tell whether `date` is a real calendar date written as yyyy/mm/dd."""
    # The regex enforces the zero-padded layout (strptime alone would also
    # accept 1986/1/1); strptime rejects impossible dates such as 1986/13/45.
    if not re.fullmatch(r'[0-9]{4}/[0-9]{2}/[0-9]{2}', date):
        return False
    try:
        datetime.strptime(date, '%Y/%m/%d')
    except ValueError:
        return False
    return True


def login(site, login_path):
    """Log in to `site` with the user and password stored in `login_path`.

    The file holds the user name on its first line and the password on its
    second line; a trailing newline or further lines are ignored.
    """
    with open(login_path, 'r') as login_file:  # read login user & pwd
        credentials = login_file.read().splitlines()
    if len(credentials) < 2:
        fail(f'Error: {login_path} must contain the user on the first line '
             f'and the password on the second line')
    user, pwd = credentials[:2]
    site.login(username=user, password=pwd)  # login to wiki


def main():
    """Validate the arguments, then upload (or, with --dry, preview) the images."""
    args = build_parser().parse_args()

    # metadata checks that need no network come first, so that a typo is
    # reported before the wiki is contacted
    if not os.path.isdir(args.dir):
        fail(f'Error: --dir {args.dir} absolute path cannot be found')
    if not is_valid_date(args.date):
        fail(f'Error: --date {args.date} format should be --date "yyyy/mm/dd"')

    wd = os.path.dirname(os.path.abspath(__file__))  # directory of this script

    # login
    site = Site(host=args.host, path=args.path)
    login(site, os.path.join(wd, 'login.txt'))

    # the title identifies the document, so it must not be in use yet
    if any(True for _ in site.ask(f'[[Title::{args.title}]]')):
        fail(f'Error: --title "{args.title}" already exists in wiki. '
             f'Provide a different one')

    # read template file
    template_path = os.path.join(wd, 'templates', 'smw_infobox_template.jinja')
    with open(template_path) as tmplt:
        smw_propval_template = Template(tmplt.read())

    lsimgs = reorder_imgs(dir=args.dir, dry=args.dry)
    if not lsimgs:
        print_colormsg(f'No images found in {args.dir}', level='warning')

    # normpath drops a trailing slash, which would otherwise leave the
    # directory name (the prefix of every page name) empty
    dirname = os.path.basename(os.path.normpath(args.dir)).replace(' ', '_')
    dirname = re.sub(r'[\W]', '', dirname)  # keep letters, digits and _ only

    for n, _file in enumerate(lsimgs, start=1):
        pagename = f'{dirname}-{_file}'
        print_colormsg(pagename, level='ok')

        # Look up the File: page that would be created, not the bare filename
        # in the main namespace.
        image = site.images[pagename]
        if image.exists:
            url = image.imageinfo['descriptionurl']
            print_colormsg(
                f'Already exists in {url} Will NOT be uploaded',
                level='warning')
            continue

        img_smw_prop_val = smw_propval_template.render(
            title=args.title,
            date=args.date,
            part=n,
            partof=len(lsimgs),
            creator=', '.join(args.creator),
            organization=', '.join(args.org),
            format=', '.join(args.format),
            event=', '.join(args.event),
            topic=', '.join(args.topic),
        )

        if args.dry:
            print(img_smw_prop_val)
            continue

        _file_path = os.path.join(args.dir, _file)
        pageurl = f'https://{args.host}{args.path}index.php/File:{pagename}'
        with open(_file_path, 'rb') as _f:
            try:
                # pass the open file, not its path, so the image bytes are
                # what gets uploaded
                site.upload(file=_f,
                            filename=pagename,
                            description=img_smw_prop_val,
                            ignore=True)
            except errors.APIError as e:
                print_colormsg(f'Error: {e.info}\n'
                               f'It will not be uploaded',
                               level='fail')
            else:
                print(img_smw_prop_val)
                print(f'See image at {pageurl}')


if __name__ == '__main__':
    main()