pull/3/head
E.zn 4 years ago
commit 569d59ddc5

3
.gitignore vendored

@ -6,6 +6,7 @@ images.json
.mediawiki_content
.html_content
.DS_Store
# venv dirs & files
.idea/
bin/
@ -14,3 +15,5 @@ lib64
pyvenv.cfg
share/
__pycache__/
*.jpg
*.jpeg

@ -39,28 +39,28 @@ Run scripts together with `./run.sh`
1 script at a time:
`python3 download_imgs.py`
* Downloads all images from wiki to `images/` directory
`python3 download_imgs.py`
* Downloads all images from wiki to `images/` directory
* and stores each image's metadata to `images.json`
`python3 query2html.py`
* with ask API perform a query:
* with ask API perform a query:
* help `python3 query2html.py --help`
* run dry `python3 query2html.py --dry` only printing request, not executing it
* build custom query with arguments `--conditions --printouts --sort --order`
* default query is: `[[File:+]][[Title::+]][[Part::+]][[Date::+]]|?Title|?Date|?Part|?Partof|sort=Date,Title,Part|order=asc,asc,asc`
* custom queries
* custom queries
* `python3 query2html.py --conditions '[[Date::>=1970/01/01]][[Date::<=1979/12/31]]'`
* `python3 query2html.py --conditions '[[Creator::~*task force*]]'`
Note: to avoid confusion or problems, it is better to leave the `--printouts` `--sort` `--order` arguments at their defaults.
Note: to avoid confusion or problems, it is better to leave the `--printouts` `--sort` `--order` arguments at their defaults.
Otherwise document parts will start to get grouped not according to their Title, hence creating documents made from different original parts.
## How does query2html.py work?
Based on the query made:
MW API will send back a number of Page titles that match the query conditions,
MW API will send back a number of Page titles that match the query conditions,
together with its printouts (metadata property::value pairs).
For each Page:
@ -69,12 +69,31 @@ For each Page:
* a fragment of html (`document_part_html`) is generated based on the `templates/document_part.html`
All Pages that *share the same metadata's Title value*, will:
* gather all their html fragments in `all_document_parts`
* gather all their html fragments in `all_document_parts`
* render `templates/document.html` with the content of `all_document_parts`
* save the rendered template to `'static_html/DocumentTitle.html'`,
* save the rendered template to `'static_html/DocumentTitle.html'`,
Each of the saved documents:
* render `templates/index.html` with the info on each document that has been saved into `documentslist`
* resulting in `static_html/index.html`
# Bulk image upload upload_imgs_dir.py
Get Help: `python3 upload_imgs_dir.py --help`
**Edit and run via** `.helper-upload_imgs_dir.sh`
# Convert PDFs to folder of JPGs with pdf2jpg.sh
By either:
* running it from this folder and using absolute path to PDF
`./pdf2jpg.sh "/full/path/to/2020_bantayog/PDFname.pdf"`
* copying pdf2jpg.sh to 2020_bantayog/ and running with relative path to PDF
`./pdf2jpg.sh "PDFname.pdf"`
It is
to convert pdfs to jpgs:
convert -quality 100 -density 300 [name-of-pdf] %02d.jpg

@ -1,4 +1,4 @@
import os, json, re, shlex
import os, json, re, shlex, sys
import subprocess
from datetime import datetime
@ -94,6 +94,14 @@ def clean_dir(dirfullpath):
if os.path.isfile(f):
os.remove(f)
def print_colormsg(msg, level):
    """Print ``msg`` wrapped in the ANSI color escape matching ``level``.

    Args:
        msg: Text to print.
        level: ``'fail'``, ``'warning'`` or ``'ok'``; selects ``Colors.FAIL``,
            ``Colors.WARNING`` or ``Colors.BLUE`` respectively. Any other
            value prints ``msg`` without color codes.
    """
    if level == 'fail':
        color_cmd = Colors.FAIL
    elif level == 'warning':
        color_cmd = Colors.WARNING
    elif level == 'ok':
        color_cmd = Colors.BLUE
    else:
        # Without this branch color_cmd would be unbound for an unknown level
        # and the print below would raise UnboundLocalError.
        print(msg)
        return
    print(color_cmd, msg, Colors.ENDC)
class Colors:
@ -104,4 +112,43 @@ class Colors:
FAIL = '\033[91m'
ENDC = '\033[0m'
BOLD = '\033[1m'
UNDERLINE = '\033[4m'
UNDERLINE = '\033[4m'
# image upload function
def listimgs(dir):
    """Return the sorted file names in ``dir`` that have an image extension.

    A name counts as an image when its extension, compared
    case-insensitively, is ``.jpg``, ``.jpeg`` or ``.png``.
    """
    image_exts = ('.jpg', '.jpeg', '.png')
    return sorted(
        entry for entry in os.listdir(dir)
        if os.path.splitext(entry)[-1].lower() in image_exts
    )


def reorder_imgs(dir, dry):
    """Zero-pad single-digit image numbers and return the images in page order.

    Every image in ``dir`` must be named ``<prefix><number><ext>`` (for
    example ``1.jpg``, ``01.jpg`` or ``something01.jpg``).  Files whose number
    is a single digit are renamed so the number is padded to three digits;
    all other files keep their name.

    Args:
        dir: Directory holding the images.
        dry: When true, only print the planned renames; nothing on disk is
            changed and the returned names are the current ones.

    Returns:
        The image file names ordered by prefix and then by numeric value, so
        that ``100.jpg`` comes after ``11.jpg``.

    Raises:
        SystemExit: With status 1 when a file name does not end in a number,
            or when a rename would overwrite an existing file.  All names are
            checked before any rename happens, so a failure leaves the
            directory untouched.
    """
    # Anchored on the whole stem: the lazy prefix leaves the complete trailing
    # digit run in 'num', and nothing after the number can be dropped.
    numbered = re.compile(r'(?P<name>.*?)(?P<num>\d+)')
    renames = []  # (current name, padded name)
    ordered = []  # (prefix, numeric value, name the file ends up with)

    for img in listimgs(dir):
        stem, ext = os.path.splitext(img)
        match = numbered.fullmatch(stem)
        if not match:
            print(f'Image {img} Filename is not suitable for bulk upload. '
                  f'Filename pattern dn\'t match 1.jpg 01.jpg something01.jpg. '
                  f'You have to DO IT MANUALLY')
            sys.exit(1)
        name = match.group('name')
        num = match.group('num')
        new_img = img
        # only change name of single digit numbers
        if len(num) == 1:
            new_img = name + num.zfill(3) + ext  # pad with 0s
            if os.path.exists(os.path.join(dir, new_img)):
                # os.replace would silently overwrite the existing file.
                print(f'Cannot rename {img} to {new_img}: {new_img} already '
                      f'exists. You have to DO IT MANUALLY')
                sys.exit(1)
            renames.append((img, new_img))
        ordered.append((name, int(num), img if dry else new_img))

    for img, new_img in renames:
        print(f'Renaming: {img} >>>>> {new_img}')
        if not dry:
            os.replace(os.path.join(dir, img), os.path.join(dir, new_img))

    # Numeric ordering: a plain string sort would put '100.jpg' before '11.jpg'.
    ordered.sort()
    return [filename for _, _, filename in ordered]

@ -0,0 +1,21 @@
#!/bin/sh
# Example invocation of upload_imgs_dir.py: edit the values below, then run this script.
python3 upload_imgs_dir.py \
--title 'Ang Bayan December 1984' \
--creator 'Central Committee of the Communist Party of the Philippines' \
--date '1984/12/01' \
--org 'Communist Party of the Philippines' \
--format 'Bulletin' \
--topic 'Communism, Armed Struggle' \
--dir '/full/path/to/2020_bantayog/Folder name' \
# --dry
# Note:
# * Replace these values with the ones for your specific upload.
# * --dry can be enabled to show you what will be uploaded and its metadata, without actually uploading it
# * parameters --event --topic can be added
# * \ allows you to continue the command on a different line
#
# Get help: python3 upload_imgs_dir.py --help

@ -0,0 +1,9 @@
#!/bin/sh
# Convert a PDF into a folder of JPGs, one image per page.
# Usage: ./pdf2jpg.sh "/path/to/file.pdf"
# The output folder is the PDF path without its trailing ".pdf".
if [ -z "$1" ]; then
    echo "Usage: $0 file.pdf" >&2
    exit 1
fi
PDF="$1"
echo "$PDF"
# Parameter expansion strips only a trailing ".pdf". An unquoted
# sed s/\.pdf// reaches sed as s/.pdf//, where "." matches any character,
# so it would cut the first "<any char>pdf" found anywhere in the path.
DIR="${PDF%.pdf}"
if [ "$DIR" = "$PDF" ]; then
    echo "Error: $PDF does not end in .pdf" >&2
    exit 1
fi
echo "$DIR"
# -p: do not fail when the folder is left over from an earlier run
mkdir -p "$DIR" || exit 1
echo "Starting convertion ..."
# Stop here on failure so success is not reported for a failed conversion
convert -quality 100 -density 300 "$PDF" "$DIR/%02d.jpg" || exit 1
echo "PDF converted thanks to Damla aka Imagemagick ninja"

@ -1,24 +0,0 @@
import os
from mwclient import Site
from pprint import pprint
# Exploratory script: log in to the wiki and print what each image object
# exposes, for the first five images returned by site.allimages().
site = Site(host='hub.xpub.nl/sandbox', path='/itchwiki/')
wd = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) # parent working directory
with open(os.path.join(wd, 'login.txt'), 'r') as login: # read login user & pwd
    loginlines = login.read()
    # expects exactly two lines: user name, then password
    user, pwd = loginlines.split('\n')
    site.login(username=user, password=pwd) # login to wiki
print(site)
for n, img in enumerate(site.allimages()):
    # NOTE(review): nesting below is reconstructed from flattened text;
    # all prints are assumed to belong to the n < 5 branch -- confirm.
    if n < 5:
        print(img)
        print('Image attributes:')
        pprint(img.__dict__) # info contained in each img object
        print('Image object methods:', dir(img))
        # important img info to dictionary
        print(img.name, img.page_title, img.imageinfo['timestamp'], img.imageinfo['url'],
              img.imageinfo['descriptionshorturl'])
        print('\n')

@ -1,9 +1,8 @@
<div class="part">
<div class="img">
<a href="https:{{ fullurl }}">
<img src="{{ imgsrc }}" />
</a>
<img src="{{ imgsrc }}" />
<a href="https:{{ fullurl }}">{{ fullurl }}</a>
</div>
<div class="text">

@ -0,0 +1,12 @@
{{ '{{' }}ImageMetadata
|Title={{ title }}
|Date={{ date }}
|Part={{ part }}
|Partof={{ partof }}
|Creator={{ creator }}
|Organization={{ organization }}
|Format={{ format }}
|Event={{ event }}
|Topic={{ topic }}
{{ '}}' }}
[[Template:ImageMetadata]]

@ -0,0 +1,113 @@
import os, argparse, sys, re
from mwclient import (Site,
errors)
from jinja2 import Template
from functions import (print_colormsg,
reorder_imgs)
# Command-line interface. Each metadata option feeds one field of the
# ImageMetadata wiki template that is rendered for every uploaded image.
p = argparse.ArgumentParser(description='Upload files from a directory, with metadata values to the wiki.\n'
                                        'Note that any VALUES CONTAINING '
                                        'SPACES SHOULD BE BETWEEN QUOTATION MARKS',
                            formatter_class=argparse.ArgumentDefaultsHelpFormatter)
# TODO: Add example of command to description
p.add_argument('--host', default='hub.xpub.nl/sandbox', help='wiki host')
p.add_argument('--path', default='/itchwiki/', help='Wiki path. Should end with /')
p.add_argument('--dry', '-d', action='store_true',
               help='dry-run: will only print the metadata of each file that '
                    'will be upload, but does NOT upload')
p.add_argument('--dir', required=True,
               help='Required. Full path of the image directory, that you wish to upload')
p.add_argument('--title', required=True,
               help='Required. Must not exist yet in the wiki.')
p.add_argument('--date', required=True,
               help='Required. Format: yyyy/mm/dd '
                    'For dates without day or month use 01 as default '
                    'ie. 1986: --date "1986/01/01" '
                    'March 1985: --date "1985/03/01"')
# The options below use action='append' with default=['']: argparse appends
# user values after the '' placeholder, which is why the values are read
# with [1:] when the template is rendered.
p.add_argument('--creator', required=False, action='append', default=[''],
               help='Multiple values should be SEPARATED BY COMMA')
p.add_argument('--org', required=False, action='append', default=[''],
               help='Organization:Multiple values should be SEPARATED BY '
                    'COMMA')
p.add_argument('--format', required=False, action='append', default=[''],
               help='Multiple values should be SEPARATED BY COMMA')
p.add_argument('--event', required=False, action='append', default=[''],
               help='Multiple values should be SEPARATED BY COMMA')
p.add_argument('--topic', required=False, action='append', default=[''],
               help='Multiple values should be SEPARATED BY COMMA')
# TODO ADD NEW PROPS
args = p.parse_args()

# login
site = Site(host=args.host, path=args.path)
wd = os.path.dirname(os.path.abspath(__file__))  # directory of this script
with open(os.path.join(wd, 'login.txt'), 'r') as login:  # read login user & pwd
    # first line: user, second line: password; splitlines() tolerates a
    # trailing newline, which split('\n') turned into a third empty value
    user, pwd = login.read().splitlines()[:2]
site.login(username=user, password=pwd)  # login to wiki

# metadata checks; every failure exits with a non-zero status
if not os.path.isdir(args.dir):
    print_colormsg(f'Error: --dir {args.dir} absolute path cannot be found', level='fail')
    sys.exit(1)
elif not re.fullmatch(r'\d{4}/\d{2}/\d{2}', args.date):
    # fullmatch also rejects trailing garbage such as "1984/12/011"
    print_colormsg(f'Error: --date {args.date} format should be --date "yyyy/mm/dd"', level='fail')
    sys.exit(1)
elif any(True for _ in site.ask(f'[[Title::{args.title}]]')):
    print_colormsg(f'Error: --title "{args.title}" already exists in wiki. Provide a different one', level='fail')
    sys.exit(1)

# read template file
with open(os.path.join(wd, 'templates/smw_infobox_template.jinja')) as tmplt:
    smw_propval_template = Template(tmplt.read())

lsimgs = reorder_imgs(dir=args.dir, dry=args.dry)
# normpath drops a trailing slash, which would otherwise give an empty name
dirname = os.path.basename(os.path.normpath(args.dir)).replace(' ', '_')
dirname = re.sub(r'[\W]', '', dirname)  # remove non letters or digits
# print('lsimgs:', lsimgs, '\n', dirname)

for n, _file in enumerate(lsimgs):
    pagename = f'{dirname}-{_file}'
    print_colormsg(pagename, level='ok')
    # Look the file up under the name it is uploaded with, in the File
    # namespace; only image objects carry imageinfo.
    image = site.images[pagename]
    if image.exists:
        url = image.imageinfo['descriptionurl']
        print_colormsg(
            f'Already exists in {url} Will NOT be uploaded',
            level='warning')
    else:
        img_smw_prop_val = smw_propval_template.render(
            title=args.title,
            date=args.date,
            part=n + 1,
            partof=len(lsimgs),
            creator=(', ').join(args.creator[1:]),
            organization=(', ').join(args.org[1:]),
            format=(', ').join(args.format[1:]),
            event=(', ').join(args.event[1:]),
            topic=(', ').join(args.topic[1:])
        )
        _file_path = os.path.join(args.dir, _file)
        if not args.dry:
            pageurl = f'https://{args.host}{args.path}index.php/File:{pagename}'
            with open(_file_path, 'rb') as _f:
                try:
                    # hand over the open binary handle, not the path string
                    site.upload(file=_f,
                                filename=pagename,
                                description=img_smw_prop_val,
                                ignore=True)
                    print(img_smw_prop_val)
                except errors.APIError as e:
                    print_colormsg(f'Error: {e.info}\n'
                                   f'It will not be uploaded',
                                   level='fail')
            # NOTE(review): nesting reconstructed from flattened text; this
            # line is assumed to run after every upload attempt -- confirm.
            print(f'See image at {pageurl}')
        else:
            print(img_smw_prop_val)
Loading…
Cancel
Save