Merge branch 'master' of https://git.xpub.nl/XPUB/special-issue-11-wiki2html into ezn

5 years ago · 569d59ddc5
parent 80e346a2c9 a3eb19720c
commit 569d59ddc5
9 changed files with 238 additions and 39 deletions
--- a/.gitignore
+++ b/.gitignore
@ -6,6 +6,7 @@ images.json
 .mediawiki_content
 .html_content
 .DS_Store
 # venv dirs & files
 .idea/
 bin/
@ -14,3 +15,5 @@ lib64
 pyvenv.cfg
 share/
 __pycache__/
 *.jpg
 *.jpeg
--- a/README.md
+++ b/README.md
@ -39,28 +39,28 @@ Run scripts together with `./run.sh`
 1 script at a time:
-`python3 download_imgs.py` 
+`python3 download_imgs.py`
-* Downloads all images from wiki to `images/` directory 
+* Downloads all images from wiki to `images/` directory
 * and stores each image's metadata to `images.json`
 `python3 query2html.py`
-* with ask API perform a query: 
+* with ask API perform a query:
    * help `python3 query2html.py --help`
    * run dry `python3 query2html.py --dry` only printing request, not executing it
    * build custom query with arguments `--conditions  --printouts  --sort  --order`
    * default query is: `[[File:+]][[Title::+]][[Part::+]][[Date::+]]|?Title|?Date|?Part|?Partof|sort=Date,Title,Part|order=asc,asc,asc`  
-    * custom queries 
+    * custom queries
        * `python3 query2html.py --conditions '[[Date::>=1970/01/01]][[Date::<=1979/12/31]]'`
        * `python3 query2html.py --conditions '[[Creator::~*task force*]]'`
-Note: to avoid confusion or problems is better to leave the `--printouts` `--sort`  `--order` arguments as the default. 
+Note: to avoid confusion or problems it is better to leave the `--printouts` `--sort`  `--order` arguments at their defaults.
 Otherwise document parts will start to get grouped not according to their Title, hence creating documents made from different original parts.
 ## How does query2html.py work?
 Based on the query made:
-MW API will send back a number of Page titles that match the query conditions, 
+MW API will send back a number of Page titles that match the query conditions,
 together with their printouts (metadata property::value pairs).
 For each Page:
@ -69,12 +69,31 @@ For each Page:
 * a fragment of html (`document_part_html`) is generated based on the `templates/document_part.html`
 All Pages that *share the same metadata's Title value*, will:
-* gather all their html fragments in `all_document_parts` 
+* gather all their html fragments in `all_document_parts`
 * render `templates/document.html` with the content of `all_document_parts`   
-* save the render template to `'static_html/DocumentTitle.html'`, 
+* save the rendered template to `'static_html/DocumentTitle.html'`,
-    
+
 Each of the saved documents:
 * render `templates/index.html` with the info on each document, which has been saved into `documentslist`  
 * resulting in `static_html/index.html`
 # Bulk image upload upload_imgs_dir.py
 Get Help: `python3 upload_imgs_dir.py --help`
 **Edit and run via** `./helper-upload_imgs_dir.sh`
 # Convert PDFs to folder of JPGs with pdf2jpg.sh
 By either:
 * running it from this folder and using absolute path to PDF
 `./pdf2jpg.sh "/full/path/to/2020_bantayog/PDFname.pdf"`
 * copying pdf2jpg.sh to 2020_bantayog/ and running with relative path to PDF
 `./pdf2jpg.sh "PDFname.pdf"`
 It uses ImageMagick's `convert`
 to convert pdfs to jpgs:
 convert -quality 100 -density 300 [name-of-pdf] %02d.jpg
--- a/functions.py
+++ b/functions.py
@ -1,4 +1,4 @@
-import os, json, re, shlex
+import os, json, re, shlex, sys
 import subprocess
 from datetime import datetime
@ -94,6 +94,14 @@ def clean_dir(dirfullpath):
        if os.path.isfile(f):
            os.remove(f)
 def print_colormsg(msg, level):
    """Print msg wrapped in the terminal colour that belongs to level.

    msg: the text to print.
    level: 'fail', 'warning' or 'ok'. Any other value prints the message
        without a colour instead of failing on an unbound variable.
    """
    if level == 'fail':
        color_cmd = Colors.FAIL
    elif level == 'warning':
        color_cmd = Colors.WARNING
    elif level == 'ok':
        color_cmd = Colors.BLUE
    else:
        # unknown level: no colour code, but the message is still shown
        color_cmd = ''
    print(color_cmd, msg, Colors.ENDC)
 class Colors:
@ -104,4 +112,43 @@ class Colors:
    FAIL = '\033[91m'
    ENDC = '\033[0m'
    BOLD = '\033[1m'
-    UNDERLINE = '\033[4m'
+    UNDERLINE = '\033[4m'
 # image upload function
 def listimgs(dir):
    """Return the sorted names of the entries in dir ending in .jpg, .jpeg or .png.

    The extension is compared case-insensitively. Names are returned as
    os.listdir() gives them, without the directory part.
    """
    image_exts = ['.jpg', '.jpeg', '.png']
    found = []
    for entry in os.listdir(dir):
        ext = os.path.splitext(entry)[-1]
        if ext.lower() in image_exts:
            found.append(entry)
    return sorted(found)
 def reorder_imgs(dir, dry):
    """Zero-pad single-digit image numbers in dir and list the images in page order.

    Every image returned by listimgs(dir) must be named <name><number><ext>,
    e.g. 1.jpg, 01.jpg or something01.jpg. All names are checked, and rename
    collisions detected, before any file is touched, so a failure never
    leaves the directory half renamed.

    dir: directory holding the images.
    dry: when true the planned renames are only printed, nothing is renamed.

    Returns the image file names ordered by name prefix and then by the
    numeric value of their number, so 11.jpg comes before 100.jpg. In a dry
    run these are the current (not renamed) names.

    Exits through sys.exit() with an error message and non-zero status when
    a file name does not end in a number or when a rename would overwrite an
    existing file.
    """
    # <name><number>: the number must be the last thing before the extension
    numbered = re.compile(r'(?P<name>.*?)(?P<num>\d+)')
    plan = []  # (name, number, current file name, file name after renaming)
    for img in listimgs(dir):
        img_name, img_ext = os.path.splitext(img)
        match = numbered.fullmatch(img_name)
        if not match:
            sys.exit(f'Image {img} Filename is not suitable for bulk upload. '
                     'Filename pattern doesn\'t match 1.jpg 01.jpg something01.jpg. '
                     'You have to DO IT MANUALLY')
        name, num = match.group('name'), match.group('num')
        new_img = img
        # only change name of single digit numbers
        if len(num) == 1:
            new_img = name + num.zfill(3) + img_ext  # pad with 0s
            if os.path.exists(os.path.join(dir, new_img)):
                # os.replace() would silently overwrite the existing file
                sys.exit(f'Image {img} cannot be renamed to {new_img}: '
                         f'{new_img} already exists. You have to DO IT MANUALLY')
        plan.append((name, int(num), img, new_img))
    for _, _, img, new_img in plan:
        if new_img != img:
            print(f'Renaming: {img} >>>>>  {new_img}')
            if not dry:
                os.replace(os.path.join(dir, img), os.path.join(dir, new_img))
    # sort on the numeric value: plain string sorting puts 100.jpg between
    # 10.jpg and 11.jpg
    ordered = sorted((name, number, img if dry else new_img)
                     for name, number, img, new_img in plan)
    return [img for _, _, img in ordered]
--- a/helper-upload_imgs_dir.sh
+++ b/helper-upload_imgs_dir.sh
@ -0,0 +1,21 @@
 #!/bin/sh
 # Example call of upload_imgs_dir.py: edit the values below, then run this script.
 python3 upload_imgs_dir.py \
 --title 'Ang Bayan  December 1984' \
 --creator 'Central Committee of the Communist Party of the Philippines' \
 --date '1984/12/01' \
 --org 'Communist Party of the Philippines' \
 --format 'Bulletin' \
 --topic 'Communism, Armed Struggle' \
 --dir '/full/path/to/2020_bantayog/Folder name' \
 # --dry
 # Note:
 # * Replace the values above with the ones of your specific upload.
 # * --dry can be enabled to show you what will be uploaded and its metadata, without actually uploading it
 # * parameters --event --topic can be added
 # * a trailing \ lets you continue the command on the next line
 #
 # Get help:  python3 upload_imgs_dir.py --help
--- a/pdf2jpg.sh
+++ b/pdf2jpg.sh
@ -0,0 +1,9 @@
 #!/bin/sh
 # Convert a PDF into a folder of JPGs, one image per page.
 # Usage: ./pdf2jpg.sh "/path/to/PDFname.pdf"
 # The JPGs are written to a folder named like the PDF without its .pdf suffix.
 if [ -z "$1" ]; then
     echo "Usage: $0 \"/path/to/PDFname.pdf\"" >&2
     exit 1
 fi
 PDF="$1"
 echo "$PDF"
 # Strip only the trailing .pdf, so a "pdf" elsewhere in the path is left alone.
 DIR="${PDF%.pdf}"
 echo "$DIR"
 # -p: re-running on the same PDF must not fail because the folder exists
 mkdir -p "$DIR" || exit 1
 echo "Starting convertion ..."
 # stop here on failure instead of reporting a conversion that did not happen
 convert -quality 100 -density 300 "$PDF" "$DIR/"%02d.jpg || exit 1
 echo "PDF converted thanks to Damla aka Imagemagick ninja"
--- a/sandbox/wiki_images.py
+++ b/sandbox/wiki_images.py
@ -1,24 +0,0 @@
 import os
 from mwclient import Site
 from pprint import pprint
 # Sandbox script: log in to the wiki and print details of its first 5 images.
 site = Site(host='hub.xpub.nl/sandbox', path='/itchwiki/')
 wd = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))  # parent working directory
 with open(os.path.join(wd, 'login.txt'), 'r') as login:  # read login user & pwd
    loginlines = login.read()
    # line 1: user, line 2: password; splitlines() tolerates a trailing
    # newline at the end of the file
    user, pwd = loginlines.splitlines()[:2]
    site.login(username=user, password=pwd)  # login to wiki
 print(site)
 for n, img in enumerate(site.allimages()):
    if n >= 5:
        # only the first 5 images are inspected: stop instead of requesting
        # the rest of the wiki's image list
        break
    print(img)
    print('Image attributes:')
    pprint(img.__dict__)  # info contained in each img object
    print('Image object methods:', dir(img))
    # important img info to dictionary
    print(img.name, img.page_title, img.imageinfo['timestamp'], img.imageinfo['url'],
          img.imageinfo['descriptionshorturl'])
    print('\n')
--- a/templates/document_part.html
+++ b/templates/document_part.html
@ -1,9 +1,8 @@
 <div class="part">
    <div class="img">
-        <a href="https:{{ fullurl }}">
+        <img src="{{ imgsrc }}" />
-          <img src="{{ imgsrc }}" />
+        <a href="https:{{ fullurl }}">{{ fullurl }}
        </a>
    </div>
    <div class="text">
--- a/templates/smw_infobox_template.jinja
+++ b/templates/smw_infobox_template.jinja
@ -0,0 +1,12 @@
 {{ '{{' }}ImageMetadata
 |Title={{ title }}
 |Date={{ date }}
 |Part={{ part }}
 |Partof={{ partof }}
 |Creator={{ creator }}
 |Organization={{ organization }}
 |Format={{ format }}
 |Event={{ event }}
 |Topic={{ topic }}
 {{ '}}' }}
 [[Template:ImageMetadata]]
--- a/upload_imgs_dir.py
+++ b/upload_imgs_dir.py
@ -0,0 +1,113 @@
 import os, argparse, sys, re
 from mwclient import (Site,
                      errors)
 from jinja2 import Template
 from functions import (print_colormsg,
                       reorder_imgs)
 # Command line interface of the bulk upload script.
 p = argparse.ArgumentParser(description='Upload files from a directory, with metadata values to the wiki.\n'
                                         'Note that any VALUES CONTAINING '
                                         'SPACES SHOULD BE BETWEEN QUOTATION MARKS',
                             formatter_class=argparse.ArgumentDefaultsHelpFormatter)
 # TODO: Add example of command to description
 p.add_argument('--host', default='hub.xpub.nl/sandbox', help='wiki host')
 p.add_argument('--path', default='/itchwiki/', help='Wiki path. Should end with /')
 p.add_argument('--dry', '-d', action='store_true',
                help='dry-run: will only print the metadata of each file that '
                     'will be upload, but does NOT upload')
 p.add_argument('--dir', required=True,
                help='Required. Full path of the image directory, that you wish to upload')
 p.add_argument('--title', required=True,
                help='Required. Must not exist yet in the wiki.')
 p.add_argument('--date', required=True,
                help='Required. Format: yyyy/mm/dd '
                     'For dates without day or month use 01 as default '
                     'ie. 1986: --date "1986/01/01" '
                     'March 1985: --date "1985/03/01"')
 # The options below collect their values with action='append'. The default
 # [''] is a placeholder first element: given values are appended after it
 # and the upload code reads them with [1:].
 p.add_argument('--creator', required=False, action='append', default=[''],
                help='Multiple values should be SEPARATED BY COMMA')
 p.add_argument('--org', required=False, action='append', default=[''],
                help='Organization:Multiple values should be SEPARATED BY '
                     'COMMA')
 p.add_argument('--format', required=False, action='append', default=[''],
                help='Multiple values should be SEPARATED BY COMMA')
 p.add_argument('--event', required=False, action='append', default=[''],
                help='Multiple values should be SEPARATED BY COMMA')
 p.add_argument('--topic', required=False, action='append', default=[''],
                help='Multiple values should be SEPARATED BY COMMA')
 # TODO ADD  NEW PROPS
 args = p.parse_args()
 # login
 site = Site(host=args.host, path=args.path)
 wd = os.path.dirname(os.path.abspath(__file__))  # directory of this script
 with open(os.path.join(wd, 'login.txt'), 'r') as login:  # read login user & pwd
    loginlines = login.read()
    # line 1: user, line 2: password; splitlines() tolerates a trailing
    # newline at the end of the file
    user, pwd = loginlines.splitlines()[:2]
    site.login(username=user, password=pwd)  # login to wiki
 # metadata checks: stop with a non-zero exit status on the first problem
 if not os.path.isdir(args.dir):
    print_colormsg(f'Error: --dir {args.dir} absolute path cannot be found', level='fail')
    sys.exit(1)
 # fullmatch: the whole value must be yyyy/mm/dd, trailing characters are rejected
 elif not re.fullmatch(r'\d{4}/\d{2}/\d{2}', args.date):
    print_colormsg(f'Error:  --date {args.date} format should be --date "yyyy/mm/dd"', level='fail')
    sys.exit(1)
 # any result for this Title means the document is already in the wiki
 elif len(list(site.ask(f'[[Title::{args.title}]]'))) > 0:
    print_colormsg(f'Error: --title "{args.title}" already exists in wiki. Provide a different one', level='fail')
    sys.exit(1)
 # read template file
 with open(os.path.join(wd, 'templates/smw_infobox_template.jinja')) as tmplt:
    smw_propval_template = Template(tmplt.read())
 # zero-pads the file names (unless --dry) and returns the images to upload
 lsimgs = reorder_imgs(dir=args.dir, dry=args.dry)
 # normpath drops a trailing slash, which would otherwise leave dirname empty
 dirname = os.path.basename(os.path.normpath(args.dir)).replace(' ', '_')
 dirname = re.sub(r'[\W]', '', dirname)  # keep only letters, digits and _
 # print('lsimgs:', lsimgs, '\n', dirname)
 # Upload every image as File:<dirname>-<file>, with its metadata as description.
 for n, _file in enumerate(lsimgs, start=1):
    pagename = f'{dirname}-{_file}'
    print_colormsg(pagename, level='ok')
    # check the File: page this image is uploaded to, not a page named
    # after the local file
    page = site.images[pagename]
    if page.exists:
        url = page.imageinfo['descriptionurl']
        print_colormsg(
            f'Already exists in {url} Will NOT be uploaded',
            level='warning')
        continue
    img_smw_prop_val = smw_propval_template.render(
        title=args.title,
        date=args.date,
        part=n,
        partof=len(lsimgs),
        # [1:] skips the '' placeholder each of these lists starts with
        creator=(', ').join(args.creator[1:]),
        organization=(', ').join(args.org[1:]),
        format=(', ').join(args.format[1:]),
        event=(', ').join(args.event[1:]),
        topic=(', ').join(args.topic[1:])
    )
    if args.dry:
        # dry-run: only show the metadata that would be uploaded
        print(img_smw_prop_val)
        continue
    _file_path = os.path.join(args.dir, _file)
    pageurl = f'https://{args.host}{args.path}index.php/File:{pagename}'
    with open(_file_path, 'rb') as _f:
        try:
            # pass the open file object, not the path string
            site.upload(file=_f,
                        filename=pagename,
                        description=img_smw_prop_val,
                        ignore=True)
        except errors.APIError as e:
            print_colormsg(f'Error: {e.info}\n'
                           f'It will not be uploaded',
                           level='fail')
        else:
            # only point to the page when the upload went through
            print(img_smw_prop_val)
            print(f'See image at {pageurl}')