E.zn 5 years ago
commit 569d59ddc5

3
.gitignore vendored

@ -6,6 +6,7 @@ images.json
.mediawiki_content .mediawiki_content
.html_content .html_content
.DS_Store .DS_Store
# venv dirs & files # venv dirs & files
.idea/ .idea/
bin/ bin/
@ -14,3 +15,5 @@ lib64
pyvenv.cfg pyvenv.cfg
share/ share/
__pycache__/ __pycache__/
*.jpg
*.jpeg

@ -39,28 +39,28 @@ Run scripts together with `./run.sh`
1 script at a time: 1 script at a time:
`python3 download_imgs.py` `python3 download_imgs.py`
* Downloads all images from wiki to `images/` directory * Downloads all images from wiki to `images/` directory
* and stores each image's metadata to `images.json` * and stores each image's metadata to `images.json`
`python3 query2html.py` `python3 query2html.py`
* with ask API perform a query: * with ask API perform a query:
* help `python3 query2html.py --help` * help `python3 query2html.py --help`
* run dry `python3 query2html.py --dry` only printing request, not executing it * run dry `python3 query2html.py --dry` only printing request, not executing it
* build custom query with arguments `--conditions --printouts --sort --order` * build custom query with arguments `--conditions --printouts --sort --order`
* default query is: `[[File:+]][[Title::+]][[Part::+]][[Date::+]]|?Title|?Date|?Part|?Partof|sort=Date,Title,Part|order=asc,asc,asc` * default query is: `[[File:+]][[Title::+]][[Part::+]][[Date::+]]|?Title|?Date|?Part|?Partof|sort=Date,Title,Part|order=asc,asc,asc`
* custom queries * custom queries
* `python3 query2html.py --conditions '[[Date::>=1970/01/01]][[Date::<=1979/12/31]]'` * `python3 query2html.py --conditions '[[Date::>=1970/01/01]][[Date::<=1979/12/31]]'`
* `python3 query2html.py --conditions '[[Creator::~*task force*]]'` * `python3 query2html.py --conditions '[[Creator::~*task force*]]'`
Note: to avoid confusion or problems, it is better to leave the `--printouts`, `--sort` and `--order` arguments at their defaults. Note: to avoid confusion or problems, it is better to leave the `--printouts`, `--sort` and `--order` arguments at their defaults.
Otherwise document parts will start to get grouped not according to their Title, hence creating documents made from different original parts. Otherwise document parts will start to get grouped not according to their Title, hence creating documents made from different original parts.
## How does query2html.py work? ## How does query2html.py work?
Based on the query made: Based on the query made:
MW API will send back a number of Page titles that match the query conditions, MW API will send back a number of Page titles that match the query conditions,
together with its printouts (metadata property::value pairs). together with its printouts (metadata property::value pairs).
For each Page: For each Page:
@ -69,12 +69,31 @@ For each Page:
* a fragment of html (`document_part_html`) is generated based on the `templates/document_part.html` * a fragment of html (`document_part_html`) is generated based on the `templates/document_part.html`
All Pages that *share the same metadata's Title value*, will: All Pages that *share the same metadata's Title value*, will:
* gather all their html fragments in `all_document_parts` * gather all their html fragments in `all_document_parts`
* render `templates/document.html` with the content of `all_document_parts` * render `templates/document.html` with the content of `all_document_parts`
* save the render template to `'static_html/DocumentTitle.html'`, * save the render template to `'static_html/DocumentTitle.html'`,
Each of the saved documents: Each of the saved documents:
* render `templates/index.html` with the info on each document has been saved into `documentslist` * render `templates/index.html` with the info on each document has been saved into `documentslist`
* resulting in `static_html/index.html` * resulting in `static_html/index.html`
# Bulk image upload with upload_imgs_dir.py
Get Help: `python3 upload_imgs_dir.py --help`
**Edit and run via** `.helper-upload_imgs_dir.sh`
# Convert PDFs to folder of JPGs with pdf2jpg.sh
By either:
* running it from this folder and using absolute path to PDF
`./pdf2jpg.sh "/full/path/to/2020_bantayog/PDFname.pdf"`
* copying pdf2jpg.sh to 2020_bantayog/ and running with relative path to PDF
`./pdf2jpg.sh "PDFname.pdf"`
The script runs the following ImageMagick command to convert PDFs to JPGs:
`convert -quality 100 -density 300 [name-of-pdf] %02d.jpg`

@ -1,4 +1,4 @@
import os, json, re, shlex import os, json, re, shlex, sys
import subprocess import subprocess
from datetime import datetime from datetime import datetime
@ -94,6 +94,14 @@ def clean_dir(dirfullpath):
if os.path.isfile(f): if os.path.isfile(f):
os.remove(f) os.remove(f)
def print_colormsg(msg, level):
    """Print ``msg`` preceded by the terminal color code chosen by ``level``.

    Args:
        msg: Text to print.
        level: ``'fail'``, ``'warning'`` or ``'ok'``. Any other value prints
            the message with an empty color code instead of raising
            ``UnboundLocalError`` as the if/elif chain used to.
    """
    # Built at call time because Colors is defined further down the module.
    level_colors = {
        'fail': Colors.FAIL,
        'warning': Colors.WARNING,
        'ok': Colors.BLUE,
    }
    color_cmd = level_colors.get(level, '')
    print(color_cmd, msg, Colors.ENDC)
class Colors: class Colors:
@ -104,4 +112,43 @@ class Colors:
FAIL = '\033[91m' FAIL = '\033[91m'
ENDC = '\033[0m' ENDC = '\033[0m'
BOLD = '\033[1m' BOLD = '\033[1m'
UNDERLINE = '\033[4m' UNDERLINE = '\033[4m'
# image upload function
def listimgs(dir):
    """Return the sorted names of the image files found in ``dir``.

    An entry counts as an image when its extension, compared
    case-insensitively, is ``.jpg``, ``.jpeg`` or ``.png``.
    """
    image_exts = ['.jpg', '.jpeg', '.png']
    found = []
    for entry in os.listdir(dir):
        suffix = os.path.splitext(entry)[-1]
        if suffix.lower() in image_exts:
            found.append(entry)
    return sorted(found)
def reorder_imgs(dir, dry):
    """Zero-pad single-digit image numbers in ``dir`` and return the file list.

    Every image name must end in a number right before its extension
    (``1.jpg``, ``01.jpg``, ``something01.jpg``). Names whose number is a
    single digit are renamed with the number padded to three digits
    (``1.jpg`` -> ``001.jpg``); all other names are left untouched.

    Args:
        dir: Directory holding the images.
        dry: When true, only print the planned renames; no file is touched.

    Returns:
        The image names of ``dir`` as listed by ``listimgs`` after renaming.

    Raises:
        SystemExit: With status 1 when a file name does not end in a number,
            or when a rename would overwrite an existing file.
    """
    # The stem (name without extension) must END in digits. Matching the
    # whole stem avoids matching only a prefix of names such as
    # "img1.jpg.jpg" and silently dropping the rest on rename.
    numb_exp = re.compile(r'(?P<name>.*?)(?P<num>\d+)')
    for img in listimgs(dir):
        img_name, img_ext = os.path.splitext(img)
        match = numb_exp.fullmatch(img_name)
        if not match:
            print(f'Image {img} Filename is not suitable for bulk upload. '
                  f'Filename pattern dn\'t match 1.jpg 01.jpg something01.jpg. '
                  f'You have to DO IT MANUALLY')
            sys.exit(1)  # non-zero status: this is an error, not success
        # only change name of single digit numbers
        # NOTE(review): with 100+ images, names such as "10.jpg" and
        # "100.jpg" still sort before "11.jpg" - confirm whether wider
        # padding is wanted.
        if len(match['num']) != 1:
            continue
        new_img = match['name'] + match['num'].zfill(3) + img_ext
        src_img = os.path.join(dir, img)
        dst_img = os.path.join(dir, new_img)
        if os.path.exists(dst_img):
            # os.replace would silently overwrite the other image
            print(f'Cannot rename {img} to {new_img}: {new_img} already exists')
            sys.exit(1)
        print(f'Renaming: {img} >>>>> {new_img}')
        if not dry:
            os.replace(src_img, dst_img)
    return listimgs(dir)  # update list w/ renamed imgs

@ -0,0 +1,21 @@
#!/bin/sh
# Example invocation of upload_imgs_dir.py: edit the metadata values below,
# then run this script. The commented "# --dry" after the last backslash is
# a toggle: remove the leading "# " to pass --dry.
python3 upload_imgs_dir.py \
--title 'Ang Bayan December 1984' \
--creator 'Central Committee of the Communist Party of the Philippines' \
--date '1984/12/01' \
--org 'Communist Party of the Philippines' \
--format 'Bulletin' \
--topic 'Communism, Armed Struggle' \
--dir '/full/path/to/2020_bantayog/Folder name' \
# --dry
# Note:
# * Replace these values with the ones for your specific upload.
# * --dry can be enabled to show you what will be uploaded and the metadata, without actually uploading it
# * parameters --event --topic can be added
# * \ allows you to continue the command on the next line
#
# Get help: python3 upload_imgs_dir.py --help

@ -0,0 +1,9 @@
#!/bin/sh
# Convert each page of a PDF to a numbered JPG (00.jpg, 01.jpg, ...) inside
# a directory named after the PDF without its .pdf extension.
# Usage: ./pdf2jpg.sh "/path/to/file.pdf"
PDF="$1"
if [ -z "$PDF" ]; then
    echo "Usage: $0 file.pdf" >&2
    exit 1
fi
echo "$PDF"
# Strip only a trailing ".pdf". The previous unquoted `sed s/\.pdf//` reached
# sed as `s/.pdf//`, which removes the FIRST "<any char>pdf" anywhere in the
# path (e.g. inside a parent directory name).
DIR="${PDF%.pdf}"
if [ "$DIR" = "$PDF" ]; then
    echo "Error: '$PDF' does not end in .pdf" >&2
    exit 1
fi
echo "$DIR"
# -p: do not fail when the directory is left over from a previous run
mkdir -p "$DIR" || exit 1
echo "Starting convertion ..."
# Only report success when ImageMagick actually succeeded
if ! convert -quality 100 -density 300 "$PDF" "$DIR/"%02d.jpg; then
    echo "Error: convert failed for '$PDF'" >&2
    exit 1
fi
echo "PDF converted thanks to Damla aka Imagemagick ninja"

@ -1,24 +0,0 @@
"""Log in to the wiki and print details of its first 5 images.

Exploration helper: shows which attributes and methods mwclient image
objects expose.
"""
import os
from itertools import islice
from pprint import pprint

from mwclient import Site

site = Site(host='hub.xpub.nl/sandbox', path='/itchwiki/')
wd = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))  # parent working directory
with open(os.path.join(wd, 'login.txt'), 'r') as login:  # read login user & pwd
    # First line is the user, second the password. splitlines() + [:2]
    # tolerates a trailing newline, which made split('\n') unpacking raise
    # ValueError.
    user, pwd = login.read().splitlines()[:2]
site.login(username=user, password=pwd)  # login to wiki
print(site)

# islice stops after 5 images instead of walking the whole image listing
for img in islice(site.allimages(), 5):
    print(img)
    print('Image attributes:')
    pprint(img.__dict__)  # info contained in each img object
    print('Image object methods:', dir(img))
    # important img info to dictionary
    print(img.name, img.page_title, img.imageinfo['timestamp'], img.imageinfo['url'],
          img.imageinfo['descriptionshorturl'])
    print('\n')

@ -1,9 +1,8 @@
<div class="part"> <div class="part">
<div class="img"> <div class="img">
<a href="https:{{ fullurl }}"> <img src="{{ imgsrc }}" />
<img src="{{ imgsrc }}" /> <a href="https:{{ fullurl }}">{{ fullurl }}</a>
</a>
</div> </div>
<div class="text"> <div class="text">

@ -0,0 +1,12 @@
{{ '{{' }}ImageMetadata
|Title={{ title }}
|Date={{ date }}
|Part={{ part }}
|Partof={{ partof }}
|Creator={{ creator }}
|Organization={{ organization }}
|Format={{ format }}
|Event={{ event }}
|Topic={{ topic }}
{{ '}}' }}
[[Template:ImageMetadata]]

@ -0,0 +1,113 @@
import os, argparse, sys, re
from mwclient import (Site,
errors)
from jinja2 import Template
from functions import (print_colormsg,
reorder_imgs)
# Command line interface of the bulk uploader.
p = argparse.ArgumentParser(
    description='Upload files from a directory, with metadata values to the wiki.\n'
                'Note that any VALUES CONTAINING '
                'SPACES SHOULD BE BETWEEN QUOTATION MARKS',
    formatter_class=argparse.ArgumentDefaultsHelpFormatter)
# TODO: Add example of command to description
p.add_argument('--host', default='hub.xpub.nl/sandbox', help='wiki host')
p.add_argument('--path', default='/itchwiki/', help='Wiki path. Should end with /')
p.add_argument('--dry', '-d', action='store_true',
               help='dry-run: will only print the metadata of each file that '
                    'will be upload, but does NOT upload')
p.add_argument('--dir', required=True,
               help='Required. Full path of the image directory, that you wish to upload')
p.add_argument('--title', required=True,
               help='Required. Must not exist yet in the wiki.')
# The "March 1985" example previously showed 1984/05/01, which is neither
# March nor 1985.
p.add_argument('--date', required=True,
               help='Required. Format: yyyy/mm/dd '
                    'For dates without day or month use 01 as default '
                    'ie. 1986: --date "1986/01/01" '
                    'March 1985: --date "1985/03/01"')
# The list-valued options below start from default=[''] and append each given
# value; the code that renders the metadata skips that first placeholder
# entry with [1:].
p.add_argument('--creator', required=False, action='append', default=[''],
               help='Multiple values should be SEPARATED BY COMMA')
p.add_argument('--org', required=False, action='append', default=[''],
               help='Organization:Multiple values should be SEPARATED BY '
                    'COMMA')
p.add_argument('--format', required=False, action='append', default=[''],
               help='Multiple values should be SEPARATED BY COMMA')
p.add_argument('--event', required=False, action='append', default=[''],
               help='Multiple values should be SEPARATED BY COMMA')
p.add_argument('--topic', required=False, action='append', default=[''],
               help='Multiple values should be SEPARATED BY COMMA')
# TODO ADD NEW PROPS
args = p.parse_args()
# login
site = Site(host=args.host, path=args.path)
wd = os.path.dirname(os.path.abspath(__file__))  # directory containing this script
with open(os.path.join(wd, 'login.txt'), 'r') as login:  # read login user & pwd
    # login.txt holds the user on the first line and the password on the
    # second. splitlines() + [:2] tolerates a trailing newline, which made
    # the previous split('\n') unpacking raise ValueError.
    user, pwd = login.read().splitlines()[:2]
site.login(username=user, password=pwd)  # login to wiki
# metadata checks: abort with a non-zero exit status on the first problem so
# callers (e.g. a shell script) can tell the upload did not happen
if not os.path.isdir(args.dir):
    print_colormsg(f'Error: --dir {args.dir} absolute path cannot be found', level='fail')
    sys.exit(1)
elif not re.fullmatch(r'\d{4}/\d{2}/\d{2}', args.date):
    # fullmatch also rejects trailing garbage such as "1984/12/011"
    print_colormsg(f'Error: --date {args.date} format should be --date "yyyy/mm/dd"', level='fail')
    sys.exit(1)
elif any(True for _ in site.ask(f'[[Title::{args.title}]]')):
    # one result is enough to know the title is taken; no need to fetch all
    print_colormsg(f'Error: --title "{args.title}" already exists in wiki. Provide a different one', level='fail')
    sys.exit(1)
# read template file
with open(os.path.join(wd, 'templates/smw_infobox_template.jinja')) as tmplt:
    smw_propval_template = Template(tmplt.read())

lsimgs = reorder_imgs(dir=args.dir, dry=args.dry)
# normpath drops a trailing slash in --dir, which would otherwise leave an
# empty directory name and page names starting with "-"
dirname = os.path.basename(os.path.normpath(args.dir)).replace(' ', '_')
dirname = re.sub(r'[\W]', '', dirname)  # remove non-word characters
# print('lsimgs:', lsimgs, '\n', dirname)

for n, _file in enumerate(lsimgs):
    pagename = f'{dirname}-{_file}'
    print_colormsg(pagename, level='ok')
    # Check the File page the image is uploaded as (pagename), not a
    # main-namespace page named after the local file.
    page = site.images[pagename]
    if page.exists:
        url = page.imageinfo['descriptionurl']
        print_colormsg(
            f'Already exists in {url} Will NOT be uploaded',
            level='warning')
        continue

    # each list option carries a '' placeholder as first entry, hence [1:]
    img_smw_prop_val = smw_propval_template.render(
        title=args.title,
        date=args.date,
        part=n + 1,
        partof=len(lsimgs),
        creator=', '.join(args.creator[1:]),
        organization=', '.join(args.org[1:]),
        format=', '.join(args.format[1:]),
        event=', '.join(args.event[1:]),
        topic=', '.join(args.topic[1:]),
    )
    if args.dry:
        # dry run: only show the metadata that would be uploaded
        print(img_smw_prop_val)
        continue

    _file_path = os.path.join(args.dir, _file)
    pageurl = f'https://{args.host}{args.path}index.php/File:{pagename}'
    with open(_file_path, 'rb') as _f:
        try:
            # pass the open binary handle; the path string was passed before
            # and the handle was left unused
            site.upload(file=_f,
                        filename=pagename,
                        description=img_smw_prop_val,
                        ignore=True)
            print(img_smw_prop_val)
        except errors.APIError as e:
            print_colormsg(f'Error: {e.info}\n'
                           f'It will not be uploaded',
                           level='fail')
    print(f'See image at {pageurl}')
Loading…
Cancel
Save