From 94116713328c4fccb24b309437b01fc4e57f69d2 Mon Sep 17 00:00:00 2001 From: Castro0o Date: Tue, 3 Mar 2020 14:07:12 +0100 Subject: [PATCH 1/6] removing deprecated images2html.py --- images2html.py | 76 -------------------------------------------------- 1 file changed, 76 deletions(-) delete mode 100644 images2html.py diff --git a/images2html.py b/images2html.py deleted file mode 100644 index b6de07a..0000000 --- a/images2html.py +++ /dev/null @@ -1,76 +0,0 @@ -import os, json -from mwclient import Site -from pprint import pprint -from jinja2 import Template -from functions import pandoc, page_props - -site = Site(host='hub.xpub.nl/sandbox', path='/itchwiki/') -wd = os.path.dirname(os.path.abspath(__file__)) # working directory -imgdir = os.path.join(wd, 'images') -imgsjson_fn = os.path.join(wd, 'images.json') -with open(imgsjson_fn, 'r') as imgsjson_file: - images_info = json.load(imgsjson_file) - -static_html = os.path.join(wd, 'static_html') -os.makedirs(static_html, exist_ok=True) # create images/ dir - -with open(os.path.join(wd, 'login.txt'), 'r') as login: # read login user & pwd - loginlines = login.read() - user, pwd = loginlines.split('\n') - site.login(username=user, password=pwd) # login to wiki - - -page_html_template = ''' - - - - - - {{title}} - - -

{{ title }}

-

-
- -
-
- {{ content }} -
- - - -''' -page_template = Template(page_html_template) - - -for img_info in images_info.values(): - print(img_info) - page_name = img_info['name'] - page = site.pages[page_name] - # print(page) - # pprint(page.__dict__) - # print(dir(page)) - pagetext = page.text() - pageproperties = page_props(wikicontent=pagetext) - print(pageproperties) - - if pageproperties.get('Title'): - pagetext_html = pandoc(pwd=wd ,content=pagetext, format_in='mediawiki', format_out='html') - # print('pagetext', pagetext) - # print('pagetext_html', pagetext_html) - page_html = page_template.render(title=pageproperties.get('Title'), - date=pageproperties.get('Date'), - imgsrc=os.path.join(imgdir, img_info.get('filename')), - content=pagetext_html, - part=pageproperties.get('Part'), - partof=pageproperties.get('Partof')) - htmlpage_fn = "{}_{}.html".format( - pageproperties.get('Title').replace(" ", ""), - pageproperties.get('Part').zfill(3) - ) - print(htmlpage_fn) - with open(os.path.join(static_html, htmlpage_fn), 'w') as htmlfile: - htmlfile.write(page_html) From b441df8ba0d95bc934297187199d4998537779ab Mon Sep 17 00:00:00 2001 From: Castro0o Date: Tue, 3 Mar 2020 16:55:57 +0100 Subject: [PATCH 2/6] downloading images with clean filenames and resizing:jpg,jpeg,png --- download_imgs.py | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/download_imgs.py b/download_imgs.py index 5ba671a..a8af106 100644 --- a/download_imgs.py +++ b/download_imgs.py @@ -1,8 +1,9 @@ import os from mwclient import Site from pprint import pprint -from functions import update_json from PIL import Image +from functions import update_json, remove_nonwords + site = Site(host='hub.xpub.nl/sandbox', path='/itchwiki/') wd = os.path.dirname(os.path.abspath(__file__)) # working directory @@ -28,7 +29,7 @@ for img in site.allimages(): # important img info to dictionary img_dict = { 'name': img.name, - 'filename': img.page_title, + 'filename': remove_nonwords(img.page_title), 'timestamp': img.imageinfo['timestamp'], 'url': img.imageinfo['url'], 'urldesc': img.imageinfo['descriptionshorturl'], @@ -37,6 +38,7 @@ for img in site.allimages(): # location of image storage img_fn = os.path.join(imgdir, img_dict['filename']) + print(img_fn) # function updates images.json and returns whether the img needs to be downloaded or not download = update_json(imgsjson_fn, img_dict, img_fn) @@ -47,16 +49,18 @@ for img in site.allimages(): img.download(destination=img_file) # resize image - pilimg = Image.open(img_fn) - pilimg_dim = list(pilimg._size) - pilimg_dim_sort = sorted(pilimg_dim) # smallest dimension 1st - img_ratio = pilimg_dim_sort[0] / pilimg_dim_sort[1] - if pilimg_dim == pilimg_dim_sort: - # if height was largest - new_dim = [(thumbnail_size * img_ratio), thumbnail_size] - else: - # if with was largest - new_dim = [thumbnail_size,(thumbnail_size * img_ratio)] - pilimg.thumbnail(new_dim) - pilimg.save(img_fn) + fn, ext = os.path.splitext(img_fn) + if ext.lower() in ['.jpg', '.jpeg', '.gif', '.png']: # only img format + pilimg = Image.open(img_fn) + pilimg_dim = list(pilimg._size) + pilimg_dim_sort = sorted(pilimg_dim) # smallest dimension 1st + img_ratio = pilimg_dim_sort[0] / pilimg_dim_sort[1] + if pilimg_dim == pilimg_dim_sort: + # if height was largest + new_dim = [(thumbnail_size * img_ratio), thumbnail_size] + else: + # if with was largest + new_dim = [thumbnail_size,(thumbnail_size * img_ratio)] + pilimg.thumbnail(new_dim) + pilimg.save(img_fn) print('\n') From 69fc3f2ec5dbaff0606febb714d751e4c194b0ab Mon Sep 17 00:00:00 2001 From: Castro0o Date: Wed, 4 Mar 2020 08:53:52 +0100 Subject: [PATCH 3/6] remove non words --- functions.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/functions.py b/functions.py index 4c6b438..5414223 100644 --- a/functions.py +++ b/functions.py @@ -3,6 +3,12 @@ import subprocess from datetime import datetime +def remove_nonwords(imgname): + filename, ext = os.path.splitext(imgname) # split into filename & extension + filename = re.sub(r'\W', '', filename) # remove nonwoders from filename + return f'{filename}{ext}' # join filename & ext' + + def pandoc(pwd, content, format_in, format_out): # print('HTML content file:', wiki_content_f.name) @@ -95,6 +101,7 @@ def clean_dir(dirfullpath): os.remove(f) def print_colormsg(msg, level): + color_cmd = '' if level == 'fail': color_cmd = Colors.FAIL elif level == 'warning': From 0d9ed8a2d4113c70546746f75d9ce0485dca3b1d Mon Sep 17 00:00:00 2001 From: Castro0o Date: Wed, 4 Mar 2020 11:28:20 +0100 Subject: [PATCH 4/6] WIP:multi values properties --- functions.py | 52 ++++++++++++++++++++++++------------ query2html.py | 30 ++++++++++++++++----- templates/document.html | 4 +-- templates/document_part.html | 25 +++++++++++++---- templates/index.html | 2 +- 5 files changed, 81 insertions(+), 32 deletions(-) diff --git a/functions.py b/functions.py index 5414223..29aac39 100644 --- a/functions.py +++ b/functions.py @@ -43,23 +43,41 @@ def unpack_response(response): printouts = response['printouts'] page = response['fulltext'] fullurl = response['fullurl'] - d['page'] = page - for prop in printouts: - p_item = response['printouts'][prop] - for prop_val in p_item: - if isinstance(prop_val, dict) is False: - d[prop] = prop_val - else: - # if len(prop_val) > 0: - props = list(prop_val.keys()) - if 'fulltext' in props: - val = prop_val.get('fulltext') - elif 'timestamp' in props: - val = datetime.fromtimestamp(int(prop_val.get('timestamp'))) - else: - val = list(prop_val.values())[0] - d[prop] = val - return page, d, fullurl + printouts_dumps = json.dumps(printouts) + printouts_loads = json.loads(printouts_dumps) + printouts_loads['page'] = page + # printouts_loads['Date'] = datetime.fromtimestamp( + # int(printouts_loads['Date'][0]['timestamp'])) + simplified_printouts = {} + for k, v in printouts_loads.items(): + if k == 'Date': + simplified_printouts[k] = datetime.fromtimestamp( + int(v[0]['timestamp'])) + # elif k == 'Title': + # simplified_printouts[k] = v[0]['fulltext'] + elif k in ['Part', 'Partof', 'page']: # only 1 value for each + simplified_printouts[k] = v + else: # Possibly more than 1 value for the rest of properties + simplified_printouts[k] = [] + for listitem in v: + simplified_printouts[k].append(listitem['fulltext']) + + # for prop in printouts: + # p_item = response['printouts'][prop] + # for prop_val in p_item: + # if isinstance(prop_val, dict) is False: + # d[prop] = prop_val + # else: + # # if len(prop_val) > 0: + # props = list(prop_val.keys()) + # if 'fulltext' in props: + # val = prop_val.get('fulltext') + # elif 'timestamp' in props: + # val = datetime.fromtimestamp(int(prop_val.get('timestamp'))) + # else: + # val = list(prop_val.values())[0] + # d[prop] = val + return page, simplified_printouts, fullurl def update_json(imgsjson_fn, img_dict, img_fn): diff --git a/query2html.py b/query2html.py index 018eb9a..9e234d9 100644 --- a/query2html.py +++ b/query2html.py @@ -15,7 +15,7 @@ p.add_argument("--conditions", "-c", metavar='', default='[[File:+]][[Title::+]][[Part::+]][[Date::+]]', help='The query conditions') p.add_argument("--printouts", "-p", metavar='', - default='?Title|?Date|?Part|?Partof|?Creator', + default='?Title|?Date|?Part|?Partof|?Creator|?Organization|?Format|?Event|?Topic|?Language', help='Selection of properties to printout') p.add_argument("--sort", "-s", metavar='', default='Date,Title,Part', @@ -23,6 +23,8 @@ p.add_argument("--sort", "-s", metavar='', p.add_argument("--order", "-o", metavar='', default='asc,asc,asc', help='Order of sorting conditions. Should same amount as the --sort properties') +p.add_argument('--limit', '-l', help='(optional) Limit the number of returned ' + 'items') p.add_argument('--dry', '-d', action='store_true', help='dry-run: will only show the query but not run it') @@ -33,8 +35,10 @@ if len(args.sort.split(',')) != len(args.order.split(',')): Colors.WARNING, '--sort and --order do not have the same amount of elements', Colors.ENDC) print('Script exiting now') sys.exit() - query = f'{args.conditions}|{args.printouts}|sort={args.sort}|order={args.order}' +if args.limit: + limit_str = f'|limit={args.limit}' + query += limit_str print('query:', Colors.GREEN, query, Colors.ENDC) query_unquoted = urllib.parse.quote(query) query_url = f'https://{args.host}{args.path}api.php?action=ask&query={query_unquoted}&format=json' @@ -75,7 +79,7 @@ with open(os.path.join(wd, 'templates/document_part.html')) as document_html: all_document_parts = '' # to append all content documentslist = [] for answer in site.ask(query): - publication_title = '' + # publication_title = '' # print(answer, answer.keys()) page, printout_dict, fullurl = unpack_response(answer) print(page) @@ -85,6 +89,13 @@ for answer in site.ask(query): print(Colors.WARNING, f"{printout_dict['page']} is not is missing from the local downloaded images") print(Colors.GREEN, 'run python3 download_imgs.py to fix the issue', Colors.ENDC) sys.exit() + # + # # TODO: EXTRACT PROPERTIES THROUGH THE FOLLOWING ASK QUERY + # ask_page_props = f'[[File:{printout_dict["page"]}]]|?Title|?Date|?Part|?Partof|?Creator|?Organization|?Format|?Event|?Topic|?Language' + # print(ask_page_props) + # page_props = site.ask(ask_page_props) + # print(page_props) + # import pdb; pdb.set_trace() page = site.pages[[printout_dict['page']]] # request that page from wiki pagetext = page.text() pagetext_html = pandoc(pwd=wd, content=pagetext, format_in='mediawiki', format_out='html') @@ -101,10 +112,15 @@ for answer in site.ask(query): if printout_dict['Part'] == printout_dict['Partof']: # RENDER DOCUMENT # by passing all_document_parts html to document_template content - document_html = document_template.render(title=printout_dict.get('Title'), - date=printout_dict.get('Date'), - content=all_document_parts) # render document template - htmlpage_fn = "{}.html".format(printout_dict.get('Title').replace(" ", "")) + + # TODO: EXPAND PROPERTIES IN TEMPLATE + + document_html = document_template.render( + title=printout_dict.get('Title'), + date=printout_dict.get('Date'), + content=all_document_parts) # render document template + htmlpage_fn = "{}.html".format( + printout_dict.get('Title')[0].replace(" ", "")) with open(os.path.join(static_html, htmlpage_fn), 'w') as htmlfile: htmlfile.write(document_html) all_document_parts = '' # Reset all_document_parts diff --git a/templates/document.html b/templates/document.html index 4748c77..72687fd 100644 --- a/templates/document.html +++ b/templates/document.html @@ -3,10 +3,10 @@ - {{title}} + {{ title[0] }} -

{{ title }}

+

{{ title[0] }}

{{ content }} diff --git a/templates/document_part.html b/templates/document_part.html index c435c5a..feb365a 100644 --- a/templates/document_part.html +++ b/templates/document_part.html @@ -11,17 +11,32 @@ diff --git a/templates/index.html b/templates/index.html index 55628bc..af2286e 100644 --- a/templates/index.html +++ b/templates/index.html @@ -8,7 +8,7 @@

Results from query:
{{query}}

    {% for doc in documentslist %} -
  • {{ doc['title'] }} +
  • {{ doc['title'][0] }} {{ doc['date'].year }}.{{ doc['date'].month }}.{{ doc['date'].day }} {{doc['creator']}}
  • From 5ba753199bc41baca73c0ffe64a52b74d0f08a87 Mon Sep 17 00:00:00 2001 From: Castro0o Date: Wed, 4 Mar 2020 11:56:15 +0100 Subject: [PATCH 5/6] correct document_part template --- templates/document_part.html | 33 ++++++++++----------------------- 1 file changed, 10 insertions(+), 23 deletions(-) diff --git a/templates/document_part.html b/templates/document_part.html index feb365a..18bcebc 100644 --- a/templates/document_part.html +++ b/templates/document_part.html @@ -12,33 +12,20 @@ -
From f4b9fae02076edbcdf6a98435ab146de2d33b4b3 Mon Sep 17 00:00:00 2001 From: Castro0o Date: Wed, 4 Mar 2020 12:11:47 +0100 Subject: [PATCH 6/6] removing non-word chars from html filename; creator in index as string; cleaned old code form functions.py --- functions.py | 19 +------------------ query2html.py | 16 ++++------------ templates/index.html | 2 +- 3 files changed, 6 insertions(+), 31 deletions(-) diff --git a/functions.py b/functions.py index 29aac39..ce6d4a2 100644 --- a/functions.py +++ b/functions.py @@ -43,11 +43,10 @@ def unpack_response(response): printouts = response['printouts'] page = response['fulltext'] fullurl = response['fullurl'] + # convert OrderDict to Dict json.dumps json.loads printouts_dumps = json.dumps(printouts) printouts_loads = json.loads(printouts_dumps) printouts_loads['page'] = page - # printouts_loads['Date'] = datetime.fromtimestamp( - # int(printouts_loads['Date'][0]['timestamp'])) simplified_printouts = {} for k, v in printouts_loads.items(): if k == 'Date': @@ -61,22 +60,6 @@ def unpack_response(response): simplified_printouts[k] = [] for listitem in v: simplified_printouts[k].append(listitem['fulltext']) - - # for prop in printouts: - # p_item = response['printouts'][prop] - # for prop_val in p_item: - # if isinstance(prop_val, dict) is False: - # d[prop] = prop_val - # else: - # # if len(prop_val) > 0: - # props = list(prop_val.keys()) - # if 'fulltext' in props: - # val = prop_val.get('fulltext') - # elif 'timestamp' in props: - # val = datetime.fromtimestamp(int(prop_val.get('timestamp'))) - # else: - # val = list(prop_val.values())[0] - # d[prop] = val return page, simplified_printouts, fullurl diff --git a/query2html.py b/query2html.py index 9e234d9..457f5f5 100644 --- a/query2html.py +++ b/query2html.py @@ -2,7 +2,7 @@ import os, json, sys, urllib from mwclient import Site from pprint import pprint from jinja2 import Template -from functions import pandoc, page_props, unpack_response, clean_dir +from functions import pandoc, unpack_response, clean_dir, remove_nonwords from functions import Colors import argparse @@ -89,13 +89,7 @@ for answer in site.ask(query): print(Colors.WARNING, f"{printout_dict['page']} is not is missing from the local downloaded images") print(Colors.GREEN, 'run python3 download_imgs.py to fix the issue', Colors.ENDC) sys.exit() - # - # # TODO: EXTRACT PROPERTIES THROUGH THE FOLLOWING ASK QUERY - # ask_page_props = f'[[File:{printout_dict["page"]}]]|?Title|?Date|?Part|?Partof|?Creator|?Organization|?Format|?Event|?Topic|?Language' - # print(ask_page_props) - # page_props = site.ask(ask_page_props) - # print(page_props) - # import pdb; pdb.set_trace() + page = site.pages[[printout_dict['page']]] # request that page from wiki pagetext = page.text() pagetext_html = pandoc(pwd=wd, content=pagetext, format_in='mediawiki', format_out='html') @@ -112,15 +106,13 @@ for answer in site.ask(query): if printout_dict['Part'] == printout_dict['Partof']: # RENDER DOCUMENT # by passing all_document_parts html to document_template content - - # TODO: EXPAND PROPERTIES IN TEMPLATE - document_html = document_template.render( title=printout_dict.get('Title'), date=printout_dict.get('Date'), content=all_document_parts) # render document template htmlpage_fn = "{}.html".format( - printout_dict.get('Title')[0].replace(" ", "")) + remove_nonwords(printout_dict.get('Title')[0]) + ) with open(os.path.join(static_html, htmlpage_fn), 'w') as htmlfile: htmlfile.write(document_html) all_document_parts = '' # Reset all_document_parts diff --git a/templates/index.html b/templates/index.html index af2286e..efe90e2 100644 --- a/templates/index.html +++ b/templates/index.html @@ -10,7 +10,7 @@ {% for doc in documentslist %}
  • {{ doc['title'][0] }} {{ doc['date'].year }}.{{ doc['date'].month }}.{{ doc['date'].day }} - {{doc['creator']}} + {{doc['creator'] | join(", ")}}
  • {% endfor %}