images2html

andre
Castro0o 5 years ago
parent 73d4df942a
commit a76313c251

7
.gitignore vendored

@ -1,5 +1,8 @@
login.txt
images/ images/
static_html/
login.txt
imgs_info.py
images.json
# venv dirs & files # venv dirs & files
.idea/ .idea/
@ -8,4 +11,4 @@ lib/
lib64 lib64
pyvenv.cfg pyvenv.cfg
share/ share/
images.json __pycache__/

@ -10,6 +10,14 @@
* [mwclient](https://mwclient.readthedocs.io/en/latest/index.html) Python library * [mwclient](https://mwclient.readthedocs.io/en/latest/index.html) Python library
* Install: * Install:
* `pip3 install mwclient` * `pip3 install mwclient`
* [jinja2](https://jinja.palletsprojects.com/en/2.11.x/) Python library
* Install:
* `pip3 install jinja2`
* [pandoc](https://pandoc.org/)
* Install:
* Debian/Ubuntu: `sudo apt install pandoc`
* Mac: `brew install pandoc`
## login.txt ## login.txt
`login.txt` is a secret file (ignored by git) where you place your itch wiki username and password, on separate lines. `login.txt` is a secret file (ignored by git) where you place your itch wiki username and password, on separate lines.
@ -22,4 +30,14 @@ mypassword
## Run ## Run
* `python3 download_imgs.py` * `python3 download_imgs.py`
* Downloads all images from wiki to `images/` directory
* and stores each image's metadata to `images.json`
* `python3 images2html.py`
* cycles through the items in `images.json`
* queries the wiki's `File:` page for each image
* if it contains essential metadata: Title, Page, Total Pages
* retrieves its text content
* generates a HTML page from it
* saves page onto `static_html/` directory

@ -1,4 +1,19 @@
import os, json import os, json, re
import subprocess
def pandoc(content, format_in, format_out):
    """Convert ``content`` from one markup format to another with pandoc.

    The text is piped to pandoc's standard input and the command is passed as
    an argument list, so no shell is involved: quotes, backslashes and other
    shell metacharacters in ``content`` reach pandoc unchanged instead of
    breaking (or being executed as part of) the command line.

    Args:
        content: Text to convert (str).
        format_in: pandoc reader name given to ``-f`` (e.g. ``'mediawiki'``).
        format_out: pandoc writer name given to ``-t`` (e.g. ``'html'``).

    Returns:
        pandoc's standard output decoded as UTF-8.

    Raises:
        subprocess.CalledProcessError: if pandoc exits with a non-zero status.
        FileNotFoundError: if the ``pandoc`` executable cannot be found.
    """
    pandoc_cmd = ['pandoc', '-f', format_in, '-t', format_out]
    output = subprocess.check_output(pandoc_cmd, input=content.encode('utf8'))
    return output.decode('utf8')
def page_props(wikicontent):
    """Collect ``|key=value`` pairs found in wiki text into a dict.

    Every occurrence of ``|<word characters>=<rest of line>`` contributes one
    entry; the value is everything after ``=`` up to the end of that line.
    When a key appears more than once, the last occurrence wins.

    Args:
        wikicontent: Raw wiki text to scan.

    Returns:
        Dict mapping each key to its (unstripped) value string.
    """
    pattern = re.compile(r'\|(\w*?)\=(.*)', flags=re.M)
    props = {}
    for match in pattern.finditer(wikicontent):
        key, value = match.groups()
        props[key] = value
    return props
def update_json(imgsjson_fn, img_dict, img_fn): def update_json(imgsjson_fn, img_dict, img_fn):

@ -0,0 +1,76 @@
import os, json
from mwclient import Site
from pprint import pprint
from jinja2 import Template
from functions import pandoc, page_props
# Script setup: wiki connection, local paths, image metadata and login.
site = Site(host='hub.xpub.nl/sandbox', path='/itchwiki/')
wd = os.path.dirname(os.path.abspath(__file__))  # working directory
imgdir = os.path.join(wd, 'images')
imgsjson_fn = os.path.join(wd, 'images.json')

# Metadata for every image, as stored in images.json.
with open(imgsjson_fn, 'r') as imgsjson_file:
    images_info = json.load(imgsjson_file)

static_html = os.path.join(wd, 'static_html')
os.makedirs(static_html, exist_ok=True)  # create static_html/ dir

with open(os.path.join(wd, 'login.txt'), 'r') as login:  # read login user & pwd
    loginlines = login.read()
# splitlines() copes with a trailing newline and CRLF line endings, which
# made a plain split('\n') fail to unpack into exactly two values.
# Only the first two lines (username, password) are used.
user, pwd = loginlines.splitlines()[:2]
site.login(username=user, password=pwd)  # login to wiki
# Jinja2 source for one generated HTML page.
# Placeholders filled in by page_template.render(): title, date, imgsrc,
# content, part and partof.
# The stylesheet is linked relative to the generated page (../static/style.css).
page_html_template = '''
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<link rel="stylesheet" href="../static/style.css" />
<title>{{title}}</title>
</head>
<body>
<h1>{{ title }}</h1>
<p><time datetime="{{date}}">{{date}}</time></p>
<div id="img">
<img src="{{ imgsrc }}" />
</div>
<div id="content">
{{ content }}
</div>
<footer>
Part {{part}} of {{partof}}
</footer>
</body>
</html>
'''
# Build the template object once; it is rendered once per page further down.
page_template = Template(page_html_template)
# Generate one static HTML page for every image whose wiki page carries the
# properties needed to title and name the output file.
for img_info in images_info.values():
    print(img_info)
    page_name = img_info['name']
    page = site.pages[page_name]
    pagetext = page.text()
    pageproperties = page_props(wikicontent=pagetext)
    print(pageproperties)

    title = pageproperties.get('Title')
    part = pageproperties.get('Part')
    # Title and Part are both needed to build the output filename; skip pages
    # lacking either instead of crashing on None.zfill().
    if not title or part is None:
        continue

    pagetext_html = pandoc(content=pagetext, format_in='mediawiki',
                           format_out='html')
    page_html = page_template.render(
        title=title,
        date=pageproperties.get('Date'),
        imgsrc=os.path.join(imgdir, img_info.get('filename')),
        content=pagetext_html,
        part=part,
        partof=pageproperties.get('Partof'))

    # e.g. Title "My Book", Part "1" -> "MyBook_001.html"
    htmlpage_fn = "{}_{}.html".format(title.replace(" ", ""), part.zfill(3))
    print(htmlpage_fn)
    # The template declares <meta charset="utf-8">, so write UTF-8 explicitly
    # rather than relying on the locale's default encoding.
    with open(os.path.join(static_html, htmlpage_fn), 'w',
              encoding='utf-8') as htmlfile:
        htmlfile.write(page_html)

@ -0,0 +1,70 @@
import os, json
from mwclient import Site
from pprint import pprint
from functions import update_json
# Script setup: wiki connection, local paths and login.
site = Site(host='hub.xpub.nl/sandbox', path='/itchwiki/')
wd = os.path.dirname(os.path.abspath(__file__))  # working directory
imgdir = os.path.join(wd, 'images')
os.makedirs(imgdir, exist_ok=True)  # create images/ dir
imgsjson_fn = os.path.join(wd, 'images.json')

with open(os.path.join(wd, 'login.txt'), 'r') as login:  # read login user & pwd
    loginlines = login.read()
# splitlines() copes with a trailing newline and CRLF line endings, which
# made a plain split('\n') fail to unpack into exactly two values.
# Only the first two lines (username, password) are used.
user, pwd = loginlines.splitlines()[:2]
site.login(username=user, password=pwd)  # login to wiki
# Exploratory script body: inspect a single wiki page and dump what the API
# returns for it.
# NOTE(review): indentation is not visible in this view, so the nesting of the
# loops below cannot be confirmed here.

# Load the image metadata stored in images.json.
with open(imgsjson_fn, 'r') as imgsjson_file:
images_info = json.load(imgsjson_file)
# Pick one hard-coded entry to experiment with.
img_info = images_info["File:CCF 003017.jpg"]
print(img_info)
page_name = img_info['name']
page = site.pages[page_name]
# Dump the page object, its attributes and its available members.
print(page)
pprint(page.__dict__)
print(dir(page))
text = page.text()
# Materialize whatever imageusage() yields so it can be printed.
used_in = list(page.imageusage())
print(text, used_in)
# response = site.api(action='browsebysubject', subject=page_name)
# for q in response['query']['data']:
# print(q['property'], q['dataitem'])
# print(q)
# print('result:', response['query']['data'][0]['item'])
# print('keys', response['query'].keys())
# Query via site.ask for the Date property.
# NOTE(review): presumably a Semantic MediaWiki "ask" query - confirm against
# the mwclient documentation.
response = site.ask(query='[[Date::+]]|?Date', title=page_name)
print(response)
# import pdb; pdb.set_trace()
# Print every result, then each key/value pair inside it.
for r in response:
print('response', r)
for title, data in r.items():
print(data)
# import pdb; pdb.set_trace()
print(type(data))
# Only values that are neither str nor int are expanded further.
if type(data) not in [str, int]:
# NOTE(review): item[k] indexes each value with its own key - verify that this
# is intended.
for k, item in data.items():
print('item', k, item[k])
for subitem in data.items():
print('subitem', subitem, )
# for k, v in data['printouts']:
# print(k,v)
# action=smwbrowse&browse=page&params={ "limit": 10, "offset": 0, "search": "Main", "fullText": true, "fullURL": true }
# for page in site.pages(page_name):
# print(page)
#
# for img_key, img_info in images_info.items():
# if n < 4:
# print(img_info)
# page_name = img_info['name']
# page = site.pages(page_name)
# print(page)
# print('\n')

@ -0,0 +1,3 @@
/* Base text size for the generated pages. */
body {
  font-size: 12pt;
}

/* Make the image span the full width of its #img container. */
div#img img {
  width: 100%;
}
Loading…
Cancel
Save