From a76313c2519b59caadc4ea418ef4be38085201e0 Mon Sep 17 00:00:00 2001 From: Castro0o Date: Sun, 2 Feb 2020 21:03:53 +0100 Subject: [PATCH] images2html --- .gitignore | 7 +++-- README.md | 20 ++++++++++++- functions.py | 17 ++++++++++- images2html.py | 76 ++++++++++++++++++++++++++++++++++++++++++++++++ imgs_info.py | 70 ++++++++++++++++++++++++++++++++++++++++++++ static/style.css | 3 ++ 6 files changed, 189 insertions(+), 4 deletions(-) create mode 100644 images2html.py create mode 100644 imgs_info.py create mode 100644 static/style.css diff --git a/.gitignore b/.gitignore index ee6496f..6a5eccb 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,8 @@ -login.txt images/ +static_html/ +login.txt +imgs_info.py +images.json # venv dirs & files .idea/ @@ -8,4 +11,4 @@ lib/ lib64 pyvenv.cfg share/ -images.json +__pycache__/ diff --git a/README.md b/README.md index aea5422..32dd762 100644 --- a/README.md +++ b/README.md @@ -10,6 +10,14 @@ * [mwclient](https://mwclient.readthedocs.io/en/latest/index.html) Python library * Install: * `pip3 install mwclient` +* [jinja2](https://jinja.palletsprojects.com/en/2.11.x/) Python library + * Install: + * `pip3 install jinja2` +* [pandoc](https://pandoc.org/) + * Install: + * Debian/Ubuntu: `sudo apt install pandoc` + * Mac: `brew install pandoc` + ## login.txt `login.txt` is a secrete file (ignored by git) where you place you itch wiki username and password, in separate lines. 
@@ -22,4 +30,14 @@ mypassword ## Run -* `python3 download_imgs.py` \ No newline at end of file +* `python3 download_imgs.py` + * Downloads all images from wiki to `images/` directory + * and stores each image's metadata to `images.json` +* `python3 images2html.py` + * cycles through the items in `images.json` + * querying the File: wiki for image + * if it contains essential metadata: Title, Page, Total Pages + * retrieves its text content + * generates a HTML page from it + * saves page onto `static_html/` directory + \ No newline at end of file diff --git a/functions.py b/functions.py index eb9e9b7..c99623f 100644 --- a/functions.py +++ b/functions.py @@ -1,4 +1,19 @@ -import os, json +import os, json, re +import subprocess + +def pandoc(content, format_in, format_out): + pandoc_cmd = "echo '{}' | pandoc -f {} -t {}".format( + content, format_in, format_out) + output = subprocess.check_output(pandoc_cmd, shell=True) + return output.decode('utf8') + + +def page_props(wikicontent): + exp = re.compile(r'\|(\w*?)\=(.*)', flags=re.M) + found = re.findall(exp, wikicontent) + found_dict ={item[0]: item[1] for item in found} + return found_dict + def update_json(imgsjson_fn, img_dict, img_fn): diff --git a/images2html.py b/images2html.py new file mode 100644 index 0000000..d195687 --- /dev/null +++ b/images2html.py @@ -0,0 +1,76 @@ +import os, json +from mwclient import Site +from pprint import pprint +from jinja2 import Template +from functions import pandoc, page_props + +site = Site(host='hub.xpub.nl/sandbox', path='/itchwiki/') +wd = os.path.dirname(os.path.abspath(__file__)) # working directory +imgdir = os.path.join(wd, 'images') +imgsjson_fn = os.path.join(wd, 'images.json') +with open(imgsjson_fn, 'r') as imgsjson_file: + images_info = json.load(imgsjson_file) + +static_html = os.path.join(wd, 'static_html') +os.makedirs(static_html, exist_ok=True) # create static_html/ dir + +with open(os.path.join(wd, 'login.txt'), 'r') as login: # read login user & pwd + loginlines 
= login.read() + user, pwd = loginlines.split('\n') + site.login(username=user, password=pwd) # login to wiki + + +page_html_template = ''' + + + + + + {{title}} + + +

{{ title }}

+

+
+ +
+
+ {{ content }} +
+ + + +''' +page_template = Template(page_html_template) + + +for img_info in images_info.values(): + print(img_info) + page_name = img_info['name'] + page = site.pages[page_name] + # print(page) + # pprint(page.__dict__) + # print(dir(page)) + pagetext = page.text() + pageproperties = page_props(wikicontent=pagetext) + print(pageproperties) + + if pageproperties.get('Title'): + pagetext_html = pandoc(content=pagetext, format_in='mediawiki', format_out='html') + # print('pagetext', pagetext) + # print('pagetext_html', pagetext_html) + page_html = page_template.render(title=pageproperties.get('Title'), + date=pageproperties.get('Date'), + imgsrc=os.path.join(imgdir, img_info.get('filename')), + content=pagetext_html, + part=pageproperties.get('Part'), + partof=pageproperties.get('Partof')) + htmlpage_fn = "{}_{}.html".format( + pageproperties.get('Title').replace(" ", ""), + pageproperties.get('Part').zfill(3) + ) + print(htmlpage_fn) + with open(os.path.join(static_html, htmlpage_fn), 'w') as htmlfile: + htmlfile.write(page_html) diff --git a/imgs_info.py b/imgs_info.py new file mode 100644 index 0000000..2ce3b9c --- /dev/null +++ b/imgs_info.py @@ -0,0 +1,70 @@ +import os, json +from mwclient import Site +from pprint import pprint +from functions import update_json + +site = Site(host='hub.xpub.nl/sandbox', path='/itchwiki/') +wd = os.path.dirname(os.path.abspath(__file__)) # working directory + +imgdir = os.path.join(wd, 'images') +os.makedirs(imgdir, exist_ok=True) # create images/ dir + +imgsjson_fn = os.path.join(wd, 'images.json') + + + +with open(os.path.join(wd, 'login.txt'), 'r') as login: # read login user & pwd + loginlines = login.read() + user, pwd = loginlines.split('\n') + site.login(username=user, password=pwd) # login to wiki + +with open(imgsjson_fn, 'r') as imgsjson_file: + images_info = json.load(imgsjson_file) + +img_info = images_info["File:CCF 003017.jpg"] + +print(img_info) +page_name = img_info['name'] +page = site.pages[page_name] 
+print(page) +pprint(page.__dict__) +print(dir(page)) +text = page.text() +used_in = list(page.imageusage()) +print(text, used_in) +# response = site.api(action='browsebysubject', subject=page_name) +# for q in response['query']['data']: +# print(q['property'], q['dataitem']) +# print(q) +# print('result:', response['query']['data'][0]['item']) +# print('keys', response['query'].keys()) + +response = site.ask(query='[[Date::+]]|?Date', title=page_name) +print(response) +# import pdb; pdb.set_trace() +for r in response: + print('response', r) + for title, data in r.items(): + print(data) + # import pdb; pdb.set_trace() + print(type(data)) + if type(data) not in [str, int]: + for k, item in data.items(): + print('item', k, item[k]) + for subitem in data.items(): + print('subitem', subitem, ) + # for k, v in data['printouts']: + # print(k,v) +# action=smwbrowse&browse=page&params={ "limit": 10, "offset": 0, "search": "Main", "fullText": true, "fullURL": true } +# for page in site.pages(page_name): +# print(page) + + +# +# for img_key, img_info in images_info.items(): +# if n < 4: +# print(img_info) +# page_name = img_info['name'] +# page = site.pages(page_name) +# print(page) +# print('\n') \ No newline at end of file diff --git a/static/style.css b/static/style.css new file mode 100644 index 0000000..74ec965 --- /dev/null +++ b/static/style.css @@ -0,0 +1,3 @@ +body{font-size: 12pt;} + +div#img img {width: 100%;} \ No newline at end of file