From bf66a2d572890ea1e539b9542ba4ce6139628467 Mon Sep 17 00:00:00 2001 From: Castro0o Date: Wed, 8 Apr 2020 10:36:59 +0200 Subject: [PATCH] images: remote, local, archoive --- README.md | 9 +++++++-- dumpwiki.py | 54 +++++++++++++++++++++++++++++++++++++++-------------- 2 files changed, 47 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index f5df694..78cabea 100644 --- a/README.md +++ b/README.md @@ -39,9 +39,14 @@ mypassword ### locally on your own machine: create archive folder: `mkdir archive` -run script outputting to archive folder and displaying the images from the wiki: +run script outputting to archive folder and **displaying the images from the wiki**: + +`python3 dumpwiki.py --output archive --local --imgsrc remote` + +run script outputting to archive folder and **displaying the images from local ../images**: +* requires running `download_imgs.py` +`python3 dumpwiki.py --output archive --local --imgsrc local` -`python3 dumpwiki.py --output archive --local` ### Categories and Templates: diff --git a/dumpwiki.py b/dumpwiki.py index 4152757..54c24bd 100644 --- a/dumpwiki.py +++ b/dumpwiki.py @@ -1,8 +1,6 @@ -import os, json, sys, urllib +import os, json, sys from mwclient import Site -from pprint import pprint from jinja2 import Template -from functions import unpack_response, clean_dir, remove_nonwords import html5lib from functions import Colors import argparse @@ -17,20 +15,25 @@ p.add_argument("--path", metavar='', default="/itchwiki/", help="Wiki path. Shou p.add_argument("--output", default="/var/www/html/archive", help="Output path for pages") p.add_argument("--one", default=False, action="store_true", help="Output one page from each category only") p.add_argument("--local", default=False, action="store_true", help="When creating a local archives. Add full URL to images") +p.add_argument("--imgsrc", default='archive', + choices=['archive', 'local', 'remote'], + help="What is the source of the images?") args = p.parse_args() print(args) # site and login site = Site(host=args.host, path=args.path) +wd = os.path.dirname(os.path.abspath(__file__)) # working directory with open('login.txt', 'r') as login: # read login user & pwd loginlines = login.read() user, pwd = loginlines.split('\n') site.login(username=user, password=pwd) # login to wiki -# read template files - +imgsjson_fn = os.path.join(wd, 'images.json') # read images.json file +with open(imgsjson_fn, 'r') as imgsjson_file: + images_info = json.load(imgsjson_file) SLASH = "\u2044" @@ -61,20 +64,43 @@ def rewritelinks (html): if href.startswith("/sandbox/itchwiki/index.php/"): new_href = filenameforlink(href) a.attrib['href'] = new_href - if args.local is True: - for img in t.findall(".//img[@src]"): - src = img.attrib.get("src") - if not src.startswith('http'): - img.attrib['src'] = 'https://hub.xpub.nl' + src - html = ET.tostring(t, method="html", encoding="unicode") return html def rewriteimgs(html): t = html5lib.parseFragment(html, treebuilder = "etree", namespaceHTMLElements = False) - # remove the srcset value as it prevents images from displaying - for img in t.findall(".//img[@srcset]"): - img.attrib['srcset'] = "" + + # replace images url with local image in ../images + for img in t.findall(".//img[@src]"): + # imgsrc can be: + # remote: url remains + # archive f' images/{img_filename}' + # local: f'../../images/{img_filename}' + + if args.imgsrc == 'remote': + src = img.attrib.get("src") + if not src.startswith('http'): + img.attrib['src'] = 'https://hub.xpub.nl' + src + else: # local / archive imgsrc + img_alt = img.attrib.get("alt") # alt property has filename + img_page = f'File:{img_alt}' # find image it images.json + try: + # get its filename + img_filename = images_info[img_page]['filename'] + except KeyError: + print(Colors.WARNING, f"{printout_dict['page']} is not is missing from the local downloaded images") + print(Colors.GREEN, 'run python3 download_imgs.py to fix the issue', Colors.ENDC) + sys.exit() + if args.imgsrc == 'local': + # 2 dirs above HTML files dir: archive/ + img.attrib['src'] = f'../../images/{img_filename}' + if args.imgsrc == 'archive': + # same dir as HTML files: archive/ + img.attrib['src'] = f'./images/{img_filename}' + + img.attrib['srcset'] = "" # rm srcset value:it prevent imgs displaying + img.attrib['width'] = "" + img.attrib['height'] = "" html = ET.tostring(t, method="html", encoding="unicode") return html