Merge branch 'localimages' of XPUB/special-issue-11-wiki2html into master

master
acastro 5 years ago committed by Gitea
commit f975fcaaff

@ -39,9 +39,14 @@ mypassword
### locally on your own machine: ### locally on your own machine:
create archive folder: `mkdir archive` create archive folder: `mkdir archive`
run script outputting to archive folder and displaying the images from the wiki: run script outputting to archive folder and **displaying the images from the wiki**:
`python3 dumpwiki.py --imgsrc remote`
run script outputting to archive folder and **displaying the images from local ../archive/images**:
* requires running `download_imgs.py`
`python3 dumpwiki.py`
`python3 dumpwiki.py --output archive --local`
### Categories and Templates: ### Categories and Templates:
@ -83,7 +88,7 @@ Run scripts together with `./run.sh`
1 script at a time: 1 script at a time:
`python3 download_imgs.py` `python3 download_imgs.py`
* Downloads all images from wiki to `images/` directory * Downloads all images from wiki to `../archive/images/` directory
* and stores each image's metadata to `images.json` * and stores each image's metadata to `images.json`
`python3 query2html.py` `python3 query2html.py`

@ -6,9 +6,9 @@ from functions import update_json, remove_nonwords
site = Site(host='hub.xpub.nl/sandbox', path='/itchwiki/') site = Site(host='hub.xpub.nl/sandbox', path='/itchwiki/')
wd = os.path.dirname(os.path.abspath(__file__)) # working directory wd = os.path.dirname(os.path.abspath(__file__)) # working directory
parent_d = os.path.dirname(wd) # parent directory
imgdir = os.path.join(wd, 'images') imgdir = os.path.join(parent_d, 'archive/images')
os.makedirs(imgdir, exist_ok=True) # create images/ dir os.makedirs(imgdir, exist_ok=True) # create images/ dir
imgsjson_fn = os.path.join(wd, 'images.json') imgsjson_fn = os.path.join(wd, 'images.json')

@ -1,8 +1,6 @@
import os, json, sys, urllib import os, json, sys
from mwclient import Site from mwclient import Site
from pprint import pprint
from jinja2 import Template from jinja2 import Template
from functions import unpack_response, clean_dir, remove_nonwords
import html5lib import html5lib
from functions import Colors from functions import Colors
import argparse import argparse
@ -14,23 +12,27 @@ p = argparse.ArgumentParser(description="Dump wiki files to html",
formatter_class=argparse.ArgumentDefaultsHelpFormatter) formatter_class=argparse.ArgumentDefaultsHelpFormatter)
p.add_argument("--host", metavar='', default="hub.xpub.nl/sandbox", help='wiki host') p.add_argument("--host", metavar='', default="hub.xpub.nl/sandbox", help='wiki host')
p.add_argument("--path", metavar='', default="/itchwiki/", help="Wiki path. Should end with /") p.add_argument("--path", metavar='', default="/itchwiki/", help="Wiki path. Should end with /")
p.add_argument("--output", default="/var/www/html/archive", help="Output path for pages") p.add_argument("--output", default="../archive", help="Output path for pages")
p.add_argument("--one", default=False, action="store_true", help="Output one page from each category only") p.add_argument("--one", default=False, action="store_true", help="Output one page from each category only")
p.add_argument("--local", default=False, action="store_true", help="When creating a local archives. Add full URL to images") p.add_argument("--imgsrc", default='archive',
choices=['archive', 'remote'],
help="What is the source of the images?")
args = p.parse_args() args = p.parse_args()
print(args) print(args)
# site and login # site and login
site = Site(host=args.host, path=args.path) site = Site(host=args.host, path=args.path)
wd = os.path.dirname(os.path.abspath(__file__)) # working directory
wd_name = os.path.split(wd)[-1] # name of dir running script
with open('login.txt', 'r') as login: # read login user & pwd with open('login.txt', 'r') as login: # read login user & pwd
loginlines = login.read() loginlines = login.read()
user, pwd = loginlines.split('\n') user, pwd = loginlines.split('\n')
site.login(username=user, password=pwd) # login to wiki site.login(username=user, password=pwd) # login to wiki
# read template files imgsjson_fn = os.path.join(wd, 'images.json') # read images.json file
with open(imgsjson_fn, 'r') as imgsjson_file:
images_info = json.load(imgsjson_file)
SLASH = "\u2044" SLASH = "\u2044"
@ -61,20 +63,39 @@ def rewritelinks (html):
if href.startswith("/sandbox/itchwiki/index.php/"): if href.startswith("/sandbox/itchwiki/index.php/"):
new_href = filenameforlink(href) new_href = filenameforlink(href)
a.attrib['href'] = new_href a.attrib['href'] = new_href
if args.local is True:
for img in t.findall(".//img[@src]"):
src = img.attrib.get("src")
if not src.startswith('http'):
img.attrib['src'] = 'https://hub.xpub.nl' + src
html = ET.tostring(t, method="html", encoding="unicode")
return html return html
def rewriteimgs(html): def rewriteimgs(html):
t = html5lib.parseFragment(html, treebuilder = "etree", namespaceHTMLElements = False) t = html5lib.parseFragment(html, treebuilder = "etree", namespaceHTMLElements = False)
# remove the srcset value as it prevents images from displaying
for img in t.findall(".//img[@srcset]"): # replace images url with local image in ../images
img.attrib['srcset'] = "" for img in t.findall(".//img[@src]"):
# imgsrc can be:
# remote: url remains
# archive: f'./images/{img_filename}'
# local: f'../../images/{img_filename}'
if args.imgsrc == 'remote':
src = img.attrib.get("src")
if not src.startswith('http'):
img.attrib['src'] = 'https://hub.xpub.nl' + src
else: # local / archive imgsrc
img_alt = img.attrib.get("alt") # alt property has filename
img_page = f'File:{img_alt}' # find image in images.json
try:
# get its filename
img_filename = images_info[img_page]['filename']
except KeyError:
print(Colors.WARNING, f"{printout_dict['page']} is not is missing from the local downloaded images")
print(Colors.GREEN, 'run python3 download_imgs.py to fix the issue', Colors.ENDC)
sys.exit()
# same dir as HTML files: archive/
img.attrib['src'] = f'./images/{img_filename}'
img.attrib['srcset'] = "" # rm srcset value: it prevents imgs from displaying
img.attrib['width'] = ""
img.attrib['height'] = ""
html = ET.tostring(t, method="html", encoding="unicode") html = ET.tostring(t, method="html", encoding="unicode")
return html return html
@ -96,10 +117,9 @@ for cat in publish.members():
htmlsrc = rewritelinks(htmlsrc) htmlsrc = rewritelinks(htmlsrc)
htmlsrc = rewriteimgs(htmlsrc) htmlsrc = rewriteimgs(htmlsrc)
if args.local is True: # TODO: Andre structure of archive: from ./archive/0 to: ./archive ./0
html = template.render(page=p, body=htmlsrc, staticpath='..')
else: html = template.render(page=p, body=htmlsrc, staticpath=f'../{wd_name}')
html = template.render(page=p, body=htmlsrc, staticpath='0')
with open(os.path.join(args.output, filenameforpage(p)), 'w') as f: with open(os.path.join(args.output, filenameforpage(p)), 'w') as f:
f.write(html) f.write(html)

Loading…
Cancel
Save