Merge branch 'master' of https://git.xpub.nl/XPUB/special-issue-11-wiki2html into ezn

5 years ago · 569d59ddc5
parent 80e346a2c9 a3eb19720c
commit 569d59ddc5
9 changed files with 238 additions and 39 deletions
--- a/.gitignore
+++ b/.gitignore
@ -6,6 +6,7 @@ images.json
 .mediawiki_content
 .html_content
 .DS_Store
+
 # venv dirs & files
 .idea/
 bin/
@ -14,3 +15,5 @@ lib64
 pyvenv.cfg
 share/
 __pycache__/
+*.jpg
+*.jpeg
--- a/README.md
+++ b/README.md
@ -39,28 +39,28 @@ Run scripts together with `./run.sh`

 1 script at a time:

-`python3 download_imgs.py` 
-* Downloads all images from wiki to `images/` directory 
+`python3 download_imgs.py`
+* Downloads all images from wiki to `images/` directory
 * and stores each image's metadata to `images.json`

 `python3 query2html.py`
-* with ask API perform a query: 
+* with ask API perform a query:
    * help `python3 query2html.py --help`
    * run dry `python3 query2html.py --dry` only printing request, not executing it
    * build custom query with arguments `--conditions  --printouts  --sort  --order`
    * default query is: `[[File:+]][[Title::+]][[Part::+]][[Date::+]]|?Title|?Date|?Part|?Partof|sort=Date,Title,Part|order=asc,asc,asc`  
-    * custom queries 
+    * custom queries
        * `python3 query2html.py --conditions '[[Date::>=1970/01/01]][[Date::<=1979/12/31]]'`
        * `python3 query2html.py --conditions '[[Creator::~*task force*]]'`

-Note: to avoid confusion or problems is better to leave the `--printouts` `--sort`  `--order` arguments as the default. 
+Note: to avoid confusion or problems is better to leave the `--printouts` `--sort`  `--order` arguments as the default.
 Otherwise document parts will start to get grouped not according to their Title, hence creating documents made from different original parts.


 ## How does query2html.py work?

 Based on the query made:
-MW API will send back a number of Page titles that match the query conditions, 
+MW API will send back a number of Page titles that match the query conditions,
 together with its printouts (metadata property::value pairs).

 For each Page:
@ -69,12 +69,31 @@ For each Page:
 * a fragment of html (`document_part_html`) is generated based on the `templates/document_part.html`

 All Pages that *share the same metadata's Title value*, will:
-* gather all their html fragments in `all_document_parts` 
+* gather all their html fragments in `all_document_parts`
 * render `templates/document.html` with the content of `all_document_parts`   
-* save the render template to `'static_html/DocumentTitle.html'`, 
-    
+* save the render template to `'static_html/DocumentTitle.html'`,
+
 Each of the saved documents:
 * render `templates/index.html` with the info on each document has been saved into `documentslist`  
 * resulting in `static_html/index.html`
- 

+
+# Bulk image upload upload_imgs_dir.py
+
+Get Help: `python3 upload_imgs_dir.py --help`
+
+**Edit and run via** `./helper-upload_imgs_dir.sh`
+
+
+# Convert PDFs to folder of JPGs with pdf2jpg.sh
+By either:
+* running it from this folder and using absolute path to PDF
+`./pdf2jpg.sh "/full/path/to/2020_bantayog/PDFname.pdf"`
+
+* copying pdf2jpg.sh to 2020_bantayog/ and running with relative path to PDF
+`./pdf2jpg.sh "PDFname.pdf"`
+
+It uses ImageMagick to convert PDFs to JPGs:
+
+`convert -quality 100 -density 300 [name-of-pdf] %02d.jpg`
--- a/functions.py
+++ b/functions.py
@ -1,4 +1,4 @@
-import os, json, re, shlex
+import os, json, re, shlex, sys
 import subprocess
 from datetime import datetime

@ -94,6 +94,14 @@ def clean_dir(dirfullpath):
        if os.path.isfile(f):
            os.remove(f)

+def print_colormsg(msg, level):
+    if level == 'fail':
+        color_cmd = Colors.FAIL
+    elif level == 'warning':
+        color_cmd = Colors.WARNING
+    elif level == 'ok':
+        color_cmd = Colors.BLUE
+    print(color_cmd, msg, Colors.ENDC)


 class Colors:
@ -104,4 +112,43 @@ class Colors:
    FAIL = '\033[91m'
    ENDC = '\033[0m'
    BOLD = '\033[1m'
-    UNDERLINE = '\033[4m'
+    UNDERLINE = '\033[4m'
+
+# image upload function
+
+def listimgs(dir):
+    lsimgs = [_file for _file in os.listdir(dir) if
+              (os.path.splitext(_file)[-1]).lower() in
+              ['.jpg', '.jpeg', '.png']]
+    lsimgs.sort()
+    return lsimgs
+
+
+def reorder_imgs(dir, dry):
+    # does zero pad file numbers
+    # and returns correct order of files
+    lsimgs = listimgs(dir)
+    for img in lsimgs:
+        img_name, img_ext = os.path.splitext(img)
+        # does file follow \d{1,}\.img_ext
+        numb_exp = re.compile(
+            r'(?P<name>.*?)(?P<num>\d+)(?P<ext>%s)'% re.escape(img_ext))
+        match = re.search(numb_exp, img)
+        if not match:
+            print(f'Image {img} Filename is not suitable for bulk upload.'
+                  f'Filename pattern dn\'t match 1.jpg 01.jpg something01.jpg'
+                  f'You have to DO IT MANUALLY')
+            sys.exit()
+        else:
+            # only change name of single digit numbers
+            if len(match.groupdict()['num']) == 1:
+                name = match.groupdict()['name']
+                num = match.groupdict()['num'].zfill(3) # pad with 0s
+                ext = match.groupdict()['ext']
+                new_img = name + num + ext
+                src_img = os.path.join(dir, img)
+                dst_img = os.path.join(dir, new_img)
+                print(f'Renaming: {img} >>>>>  {new_img}')
+                if dry == False:
+                    os.replace(src_img, dst_img)
+    return listimgs(dir)   # update list w/ renamed imgs
--- a/helper-upload_imgs_dir.sh
+++ b/helper-upload_imgs_dir.sh
@ -0,0 +1,21 @@
+#!/bin/sh
+
+python3 upload_imgs_dir.py \
+--title 'Ang Bayan  December 1984' \
+--creator 'Central Committee of the Communist Party of the Philippines' \
+--date '1984/12/01' \
+--org 'Communist Party of the Philippines' \
+--format 'Bulletin' \
+--topic 'Communism, Armed Struggle' \
+--dir '/full/path/to/2020_bantayog/Folder name' \
+# --dry
+
+# Note:
+# * Add this values to you upload specific upload.
+# * --dry can be enabled to show you what will be uploaded and the metadata, without actully uploading it
+# * parameters --event --topic can be added
+# *  \ allow you to continue the command of a different line
+#
+# Get help:  python3 upload_imgs_dir.py --help
+
+
--- a/pdf2jpg.sh
+++ b/pdf2jpg.sh
@ -0,0 +1,9 @@
+#!/bin/sh
+PDF="$1"
+echo "$PDF"
+DIR=`echo "$PDF" | sed s/\.pdf//`
+echo "$DIR"
+mkdir "$DIR"
+echo "Starting convertion ..."
+convert -quality 100 -density 300 "$PDF" "$DIR/"%02d.jpg
+echo "PDF converted thanks to Damla aka Imagemagick ninja"
--- a/sandbox/wiki_images.py
+++ b/sandbox/wiki_images.py
@ -1,24 +0,0 @@
-import os
-from mwclient import Site
-from pprint import pprint
-
-site = Site(host='hub.xpub.nl/sandbox', path='/itchwiki/')
-wd = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))  # parent working directory
-
-with open(os.path.join(wd, 'login.txt'), 'r') as login:  # read login user & pwd
-    loginlines = login.read()
-    user, pwd = loginlines.split('\n')
-    site.login(username=user, password=pwd)  # login to wiki
-
-print(site)
-
-for n, img in enumerate(site.allimages()):
-    if n < 5:
-        print(img)
-        print('Image attributes:')
-        pprint(img.__dict__)  # info contained in each img object
-        print('Image object methods:', dir(img))
-        # important img info to dictionary
-        print(img.name, img.page_title, img.imageinfo['timestamp'], img.imageinfo['url'],
-              img.imageinfo['descriptionshorturl'])
-        print('\n')
--- a/templates/document_part.html
+++ b/templates/document_part.html
@ -1,9 +1,8 @@
 <div class="part">

    <div class="img">
-        <a href="https:{{ fullurl }}">
-          <img src="{{ imgsrc }}" />
-        </a>
+        <img src="{{ imgsrc }}" />
+        <a href="https:{{ fullurl }}">{{ fullurl }}</a>
    </div>

    <div class="text">
--- a/templates/smw_infobox_template.jinja
+++ b/templates/smw_infobox_template.jinja
@ -0,0 +1,12 @@
+{{ '{{' }}ImageMetadata
+|Title={{ title }}
+|Date={{ date }}
+|Part={{ part }}
+|Partof={{ partof }}
+|Creator={{ creator }}
+|Organization={{ organization }}
+|Format={{ format }}
+|Event={{ event }}
+|Topic={{ topic }}
+{{ '}}' }}
+[[Template:ImageMetadata]]
--- a/upload_imgs_dir.py
+++ b/upload_imgs_dir.py
@ -0,0 +1,113 @@
+import os, argparse, sys, re
+from mwclient import (Site,
+                      errors)
+from jinja2 import Template
+from functions import (print_colormsg,
+                       reorder_imgs)
+
+p = argparse.ArgumentParser(description='Upload files from a directory, with metadata values to the wiki.\n'
+                                        'Note that any VALUES CONTAINING '
+                                        'SPACES SHOULD BE BETWEEN QUOTATION MARKS',
+                            formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+# TODO: Add example of command to description
+p.add_argument('--host', default='hub.xpub.nl/sandbox', help='wiki host')
+p.add_argument('--path', default='/itchwiki/', help='Wiki path. Should end with /')
+p.add_argument('--dry', '-d', action='store_true',
+               help='dry-run: will only print the metadata of each file that '
+                    'will be upload, but does NOT upload')
+p.add_argument('--dir', required=True,
+               help='Required. Full path of the image directory, that you wish to upload')
+
+p.add_argument('--title', required=True,
+               help='Required. Must not exist yet in the wiki.')
+p.add_argument('--date', required=True,
+               help='Required. Format: yyyy/mm/dd '
+                    'For dates without day or month use 01 as default '
+                    'ie. 1986: --date "1986/01/01" '
+                    'March 1985: --date "1984/05/01"')
+p.add_argument('--creator', required=False, action='append', default=[''],
+               help='Multiple values should be SEPARATED BY COMMA')
+p.add_argument('--org', required=False, action='append', default=[''],
+               help='Organization:Multiple values should be SEPARATED BY '
+                    'COMMA')
+p.add_argument('--format', required=False, action='append', default=[''],
+               help='Multiple values should be SEPARATED BY COMMA')
+p.add_argument('--event', required=False, action='append', default=[''],
+               help='Multiple values should be SEPARATED BY COMMA')
+p.add_argument('--topic', required=False, action='append', default=[''],
+               help='Multiple values should be SEPARATED BY COMMA')
+
+# TODO ADD  NEW PROPS
+args = p.parse_args()
+
+# login
+site = Site(host=args.host, path=args.path)
+
+wd =os.path.dirname(os.path.abspath(__file__))  # parent working directory
+with open(os.path.join(wd, 'login.txt'), 'r') as login:  # read login user & pwd
+    loginlines = login.read()
+    user, pwd = loginlines.split('\n')
+    site.login(username=user, password=pwd)  # login to wiki
+
+# metadata checks
+if os.path.isdir(args.dir) is False:
+    print_colormsg(f'Error: --dir {args.dir} absolute path cannot be found', level='fail')
+    sys.exit()
+elif not re.match(r'\d{4}\/\d{2}\/\d{2}', args.date):
+    print_colormsg(f'Error:  --date {args.date} format should be --date "yyyy/mm/dd"', level='fail')
+    sys.exit()
+elif len(list(site.ask(f'[[Title::{args.title}]]'))) > 0:
+    print_colormsg(f'Error: --title "{args.title}" already exists in wiki. Provide a different one', level='fail')
+    sys.exit()
+
+# read template file
+with open(os.path.join(wd, 'templates/smw_infobox_template.jinja')) as tmplt:
+    smw_propval_template = Template(tmplt.read())
+
+lsimgs = reorder_imgs(dir=args.dir, dry=args.dry)
+dirname = os.path.split(args.dir)[-1].replace(' ', '_')
+dirname = re.sub(r'[\W]', '', dirname) #remove non letters or digits
+# print('lsimgs:', lsimgs, '\n', dirname)
+
+for n, _file in enumerate(lsimgs):
+    pagename = f'{dirname}-{_file}'
+    print_colormsg(pagename, level='ok')
+    page = site.pages[_file]
+
+    if page.exists:
+        url = page.imageinfo['descriptionurl']
+        print_colormsg(
+            f'Already exists in {url} Will NOT be uploaded',
+            level='warning')
+    else:
+        img_smw_prop_val = smw_propval_template.render(
+            title=args.title,
+            date=args.date,
+            part=n + 1,
+            partof=len(lsimgs),
+            creator=(', ').join(args.creator[1:]),
+            organization=(', ').join(args.org[1:]),
+            format=(', ').join(args.format[1:]),
+            event=(', ').join(args.event[1:]),
+            topic=(', ').join(args.topic[1:])
+        )
+
+        _file_path = os.path.join(args.dir, _file)
+        if not args.dry:
+            pageurl = f'https://{args.host}{args.path}index.php/File:{pagename}'
+            with open(_file_path, 'rb') as _f:
+                try:
+                    site.upload(file=_file_path,
+                                filename=pagename,
+                                description=img_smw_prop_val,
+                                ignore=True)
+                    print(img_smw_prop_val)
+                except errors.APIError as e:
+                    print_colormsg(f'Error: {e.info}\n'
+                                   f'It will not be uploaded',
+                                   level='fail')
+
+                print(f'See image at {pageurl}')
+        else:
+            print(img_smw_prop_val)
+