images2html

5 years ago · a76313c251
parent 73d4df942a
commit a76313c251
6 changed files with 189 additions and 4 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,5 +1,8 @@
-login.txt
 images/
+static_html/
+login.txt
+imgs_info.py
+images.json

 # venv dirs & files
 .idea/
@ -8,4 +11,4 @@ lib/
 lib64
 pyvenv.cfg
 share/
-images.json
+__pycache__/
--- a/README.md
+++ b/README.md
@ -10,6 +10,14 @@
 * [mwclient](https://mwclient.readthedocs.io/en/latest/index.html) Python library
    * Install:
        * `pip3 install mwclient`
+* [jinja2](https://jinja.palletsprojects.com/en/2.11.x/) Python library
+    * Install:
+        * `pip3 install jinja2`
+* [pandoc](https://pandoc.org/)
+    * Install:
+        * Debian/Ubuntu: `sudo apt install pandoc`
+        * Mac: `brew install pandoc`
+

 ## login.txt
 `login.txt` is a secret file (ignored by git) where you place your itch wiki username and password, on separate lines.
@ -22,4 +30,14 @@ mypassword


 ## Run
-* `python3 download_imgs.py`
+* `python3 download_imgs.py` 
+    * Downloads all images from wiki to `images/` directory 
+    * and stores each image's metadata to `images.json`
+* `python3 images2html.py`
+    * cycles through the items in `images.json`,
+    * querying the wiki for each image's `File:` page
+    * if the page contains the essential metadata (`Title`, `Part`, `Partof`):
+        * retrieves its text content
+        * generates an HTML page from it
+        * saves the page to the `static_html/` directory
+        
--- a/functions.py
+++ b/functions.py
@ -1,4 +1,19 @@
-import os, json
+import os, json, re
+import subprocess
+
+def pandoc(content, format_in, format_out):
+    """Convert ``content`` from one markup format to another with pandoc.
+
+    The text is piped to pandoc's stdin and the command is passed as an
+    argument list (no shell), so quotes -- e.g. MediaWiki ''italic'' and
+    '''bold''' markup -- and shell metacharacters in ``content`` reach
+    pandoc unchanged instead of breaking or injecting into a command line.
+
+    :param content: source text to convert (str)
+    :param format_in: pandoc reader name, e.g. 'mediawiki'
+    :param format_out: pandoc writer name, e.g. 'html'
+    :return: pandoc's output decoded as UTF-8
+    :raises subprocess.CalledProcessError: if pandoc exits with an error
+    """
+    output = subprocess.check_output(
+        ['pandoc', '-f', format_in, '-t', format_out],
+        input=content.encode('utf8'))
+    return output.decode('utf8')
+
+
+def page_props(wikicontent):
+    """Collect ``|Name=value`` template parameters found in wiki text.
+
+    Every ``|Name=value`` fragment contributes one entry; the value runs to
+    the end of its line. If a name occurs several times, the last one wins.
+
+    :param wikicontent: raw wiki text of a page
+    :return: dict mapping parameter names to their (string) values
+    """
+    pattern = re.compile(r'\|(\w*?)\=(.*)', flags=re.M)
+    props = {}
+    for match in pattern.finditer(wikicontent):
+        key, value = match.groups()
+        props[key] = value
+    return props
+


 def update_json(imgsjson_fn, img_dict, img_fn):
--- a/images2html.py
+++ b/images2html.py
@ -0,0 +1,76 @@
+import os, json
+from mwclient import Site
+from pprint import pprint
+from jinja2 import Template
+from functions import pandoc, page_props
+
+site = Site(host='hub.xpub.nl/sandbox', path='/itchwiki/')
+wd = os.path.dirname(os.path.abspath(__file__)) # working directory
+imgdir = os.path.join(wd, 'images')
+imgsjson_fn = os.path.join(wd, 'images.json')
+# images.json holds the metadata of every image, keyed per image
+with open(imgsjson_fn, 'r') as imgsjson_file:
+    images_info = json.load(imgsjson_file)
+
+static_html = os.path.join(wd, 'static_html')
+os.makedirs(static_html, exist_ok=True) # create static_html/ dir
+
+with open(os.path.join(wd, 'login.txt'), 'r') as login:  # read login user & pwd
+    # splitlines() + slice: a trailing newline or extra lines at the end of
+    # login.txt no longer break the two-value unpacking
+    user, pwd = login.read().splitlines()[:2]
+site.login(username=user, password=pwd)  # login to wiki
+
+
+# Jinja2 template for one static page. A bare Template() does not autoescape,
+# and the metadata values come straight from wiki text, so they are escaped
+# explicitly with the `e` filter. `content` is already HTML (pandoc output)
+# and is inserted as-is.
+page_html_template = '''
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="utf-8">
+    <link rel="stylesheet" href="../static/style.css" />
+    <title>{{ title|e }}</title>
+</head>
+<body>
+    <h1>{{ title|e }}</h1>
+    <p><time datetime="{{ date|e }}">{{ date|e }}</time></p>
+    <div id="img">
+        <img src="{{ imgsrc|e }}" />
+    </div>
+    <div id="content">
+        {{ content }}
+    </div>
+    <footer>
+        Part {{ part|e }} of {{ partof|e }}
+    </footer>
+</body>
+</html>
+'''
+page_template = Template(page_html_template)
+
+
+# Render one static HTML page per image whose wiki page carries the
+# metadata needed to name and fill the page.
+for img_info in images_info.values():
+    print(img_info)
+    page = site.pages[img_info['name']]
+    pagetext = page.text()
+    pageproperties = page_props(wikicontent=pagetext)
+    print(pageproperties)
+
+    # Values run to the end of the wiki line, so trim stray whitespace
+    title = (pageproperties.get('Title') or '').strip()
+    part = (pageproperties.get('Part') or '').strip()
+    # Both Title and Part are needed to build the output file name; skip
+    # pages lacking either instead of crashing on a missing Part.
+    if not title or not part:
+        continue
+
+    pagetext_html = pandoc(content=pagetext, format_in='mediawiki', format_out='html')
+    page_html = page_template.render(title=title,
+                                     date=pageproperties.get('Date'),
+                                     imgsrc=os.path.join(imgdir, img_info.get('filename')),
+                                     content=pagetext_html,
+                                     part=part,
+                                     partof=pageproperties.get('Partof'))
+    # e.g. Title "My Book", Part "2" -> MyBook_002.html
+    htmlpage_fn = "{}_{}.html".format(title.replace(" ", ""), part.zfill(3))
+    print(htmlpage_fn)
+    # The page declares charset utf-8, so write it as UTF-8 regardless of
+    # the platform's default encoding
+    with open(os.path.join(static_html, htmlpage_fn), 'w', encoding='utf-8') as htmlfile:
+        htmlfile.write(page_html)
--- a/imgs_info.py
+++ b/imgs_info.py
@ -0,0 +1,70 @@
+import os, json
+from mwclient import Site
+from pprint import pprint
+from functions import update_json
+
+site = Site(host='hub.xpub.nl/sandbox', path='/itchwiki/')
+wd = os.path.dirname(os.path.abspath(__file__)) # working directory
+
+imgdir = os.path.join(wd, 'images')
+os.makedirs(imgdir, exist_ok=True) # create images/ dir
+
+imgsjson_fn = os.path.join(wd, 'images.json')
+
+with open(os.path.join(wd, 'login.txt'), 'r') as login:  # read login user & pwd
+    # splitlines() + slice: a trailing newline or extra lines at the end of
+    # login.txt no longer break the two-value unpacking
+    user, pwd = login.read().splitlines()[:2]
+site.login(username=user, password=pwd)  # login to wiki
+
+with open(imgsjson_fn, 'r') as imgsjson_file:
+    images_info = json.load(imgsjson_file)
+
+# Exploration script: inspect a single, hard-coded image page and the
+# result of a semantic `ask` query for its Date property.
+img_info = images_info["File:CCF 003017.jpg"]
+
+print(img_info)
+page_name = img_info['name']
+page = site.pages[page_name]
+print(page)
+pprint(page.__dict__)
+print(dir(page))
+text = page.text()
+used_in = list(page.imageusage())
+print(text, used_in)
+# response = site.api(action='browsebysubject', subject=page_name)
+# for q in response['query']['data']:
+#     print(q['property'], q['dataitem'])
+#     print(q)
+# print('result:', response['query']['data'][0]['item'])
+# print('keys', response['query'].keys())
+
+response = site.ask(query='[[Date::+]]|?Date', title=page_name)
+print(response)
+# import pdb; pdb.set_trace()
+for r in response:
+    print('response', r)
+    for title, data in r.items():
+        print(data)
+        # import pdb; pdb.set_trace()
+        print(type(data))
+        # Only mappings can be walked with .items(); scalars and lists are
+        # already printed above
+        if isinstance(data, dict):
+            for k, item in data.items():
+                # `item` is already data[k]; indexing it with k again failed
+                print('item', k, item)
+                for subitem in data.items():
+                    print('subitem', subitem, )
+       # for k, v in data['printouts']:
+       #     print(k,v)
+# action=smwbrowse&browse=page&params={ "limit": 10, "offset": 0, "search": "Main", "fullText": true, "fullURL": true }
+# for page in site.pages(page_name):
+#     print(page)
+
+
+#
+# for img_key, img_info in images_info.items():
+#     if n < 4:
+#         print(img_info)
+#         page_name = img_info['name']
+#         page = site.pages(page_name)
+#         print(page)
+#         print('\n')
--- a/static/style.css
+++ b/static/style.css
@ -0,0 +1,3 @@
+/* Base text size for the page */
+body{font-size: 12pt;}
+
+/* Stretch an image inside the #img container to the container's full width */
+div#img img {width: 100%;}