@@ -1,8 +1,6 @@
import os, json, sys, urllib
import os, json, sys
from mwclient import Site
from pprint import pprint
from jinja2 import Template
from functions import unpack_response, clean_dir, remove_nonwords
import html5lib
from functions import Colors
import argparse
@@ -17,20 +15,25 @@ p.add_argument("--path", metavar='', default="/itchwiki/", help="Wiki path. Shou
p.add_argument("--output", default="/var/www/html/archive", help="Output path for pages")
p.add_argument("--one", default=False, action="store_true", help="Output one page from each category only")
p.add_argument("--local", default=False, action="store_true", help="When creating a local archive, add the full URL to images")
p.add_argument("--imgsrc", default='archive',
               choices=['archive', 'local', 'remote'],
               help="What is the source of the images?")

args = p.parse_args()
print(args)

# site and login
site = Site(host=args.host, path=args.path)
wd = os.path.dirname(os.path.abspath(__file__))  # working directory

with open('login.txt', 'r') as login:  # read login user & pwd
    loginlines = login.read()
    user, pwd = loginlines.split('\n')
    site.login(username=user, password=pwd)  # login to wiki
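
# login.txt is expected to contain two lines: the wiki username on the first
# line and the password on the second (the split('\n') above relies on this).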

# read template files

imgsjson_fn = os.path.join(wd, 'images.json')  # read images.json file
with open(imgsjson_fn, 'r') as imgsjson_file:
    images_info = json.load(imgsjson_file)
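
# Assumed structure of images.json, inferred from the lookups in rewriteimgs()
# below: a mapping from wiki file pages to image metadata, e.g.
# {"File:Example.jpg": {"filename": "Example.jpg", ...}}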

SLASH = "\u2044"
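# "\u2044" is U+2044 FRACTION SLASH; presumably used elsewhere in this script to
# replace "/" in page titles so that they can be used as filenames.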

@@ -61,20 +64,43 @@ def rewritelinks (html):
        if href.startswith("/sandbox/itchwiki/index.php/"):
            new_href = filenameforlink(href)
            a.attrib['href'] = new_href
    if args.local is True:
        for img in t.findall(".//img[@src]"):
            src = img.attrib.get("src")
            if not src.startswith('http'):
                img.attrib['src'] = 'https://hub.xpub.nl' + src
    html = ET.tostring(t, method="html", encoding="unicode")
    return html
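
# In rewritelinks() (the start of the function is above this hunk), hrefs under
# /sandbox/itchwiki/index.php/ are passed through filenameforlink(), defined
# elsewhere in the script, which presumably returns the matching local archive
# filename; with --local, relative <img> src URLs are also made absolute.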


def rewriteimgs(html):
    t = html5lib.parseFragment(html, treebuilder="etree", namespaceHTMLElements=False)

    # remove the srcset value as it prevents images from displaying
    for img in t.findall(".//img[@srcset]"):
        img.attrib['srcset'] = ""

    # replace image URLs with the local images in ../images
    for img in t.findall(".//img[@src]"):
        # imgsrc can be:
        # remote: the wiki URL remains
        # archive: f'./images/{img_filename}'
        # local: f'../../images/{img_filename}'

        if args.imgsrc == 'remote':
            src = img.attrib.get("src")
            if not src.startswith('http'):
                img.attrib['src'] = 'https://hub.xpub.nl' + src
        else:  # local / archive imgsrc
            img_alt = img.attrib.get("alt")  # the alt attribute holds the filename
            img_page = f'File:{img_alt}'  # find the image in images.json
            try:
                # get its filename
                img_filename = images_info[img_page]['filename']
            except KeyError:
                print(Colors.WARNING, f"{printout_dict['page']} is missing from the locally downloaded images")
                print(Colors.GREEN, 'run python3 download_imgs.py to fix the issue', Colors.ENDC)
                sys.exit()
            if args.imgsrc == 'local':
                # 2 dirs above the HTML files dir: archive/
                img.attrib['src'] = f'../../images/{img_filename}'
            if args.imgsrc == 'archive':
                # same dir as the HTML files: archive/
                img.attrib['src'] = f'./images/{img_filename}'

        img.attrib['srcset'] = ""  # remove the srcset value: it prevents images from displaying
        img.attrib['width'] = ""
        img.attrib['height'] = ""
    html = ET.tostring(t, method="html", encoding="unicode")
    return html
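
# Hypothetical usage further down the script (not part of this diff): each page's
# rendered HTML is presumably passed through both rewriters before being saved,
# e.g. html = rewriteimgs(rewritelinks(html))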