images2html

5 years ago · a76313c251
parent 73d4df942a
commit a76313c251
6 changed files with 189 additions and 4 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,5 +1,8 @@
 login.txt
 images/
 static_html/
 login.txt
 imgs_info.py
 images.json
 # venv dirs & files
 .idea/
@ -8,4 +11,4 @@ lib/
 lib64
 pyvenv.cfg
 share/
-images.json
+__pycache__/
--- a/README.md
+++ b/README.md
@ -10,6 +10,14 @@
 * [mwclient](https://mwclient.readthedocs.io/en/latest/index.html) Python library
    * Install:
        * `pip3 install mwclient`
 * [jinja2](https://jinja.palletsprojects.com/en/2.11.x/) Python library
    * Install:
        * `pip3 install jinja2`
 * [pandoc](https://pandoc.org/)
    * Install:
        * Debian/Ubuntu: `sudo apt install pandoc`
        * Mac: `brew install pandoc`
 ## login.txt
 `login.txt` is a secret file (ignored by git) where you place your itch wiki username and password, on separate lines.
@ -22,4 +30,14 @@ mypassword
 ## Run
-* `python3 download_imgs.py`
+* `python3 download_imgs.py` 
    * Downloads all images from wiki to `images/` directory 
    * and stores each image's metadata to `images.json`
 * `python3 images2html.py`
    * cycles through the items in `images.json`
    * querying the File: wiki for image
    * if it contains essential metadata: Title, Page, Total Pages
        * retrieves its text content
        * generates a HTML page from it
        * saves page onto `static_html/` directory
--- a/functions.py
+++ b/functions.py
@ -1,4 +1,19 @@
-import os, json
+import os, json, re
 import subprocess
 def pandoc(content, format_in, format_out):
    """Convert ``content`` from one markup format to another using pandoc.

    The text is piped to the ``pandoc`` executable on stdin instead of being
    interpolated into a shell command line, so quotes, backslashes and other
    shell metacharacters in ``content`` can neither break nor hijack the
    command.

    Args:
        content: Text to convert (str).
        format_in: pandoc reader name passed to ``-f`` (e.g. 'mediawiki').
        format_out: pandoc writer name passed to ``-t`` (e.g. 'html').

    Returns:
        The converted document, decoded as UTF-8.

    Raises:
        subprocess.CalledProcessError: if pandoc exits with a non-zero status.
        FileNotFoundError: if the pandoc executable is not installed.
    """
    pandoc_cmd = ['pandoc', '-f', format_in, '-t', format_out]
    output = subprocess.check_output(pandoc_cmd, input=content.encode('utf8'))
    return output.decode('utf8')
 def page_props(wikicontent):
    """Extract ``|Key=Value`` template properties from wiki markup.

    Every occurrence of ``|<word>=<rest of line>`` in ``wikicontent`` becomes
    one entry of the returned dict. Values are stripped of surrounding
    whitespace so that a trailing space or a ``\\r`` from CRLF line endings
    does not end up in filenames or zero-padded part numbers built from them.
    When a key appears more than once, the last occurrence wins.

    Args:
        wikicontent: Raw wiki text of a page (str).

    Returns:
        dict mapping property name to its stripped string value; empty when
        no property is found.
    """
    exp = re.compile(r'\|(\w*?)\=(.*)', flags=re.M)
    return {key: value.strip() for key, value in exp.findall(wikicontent)}
 def update_json(imgsjson_fn, img_dict, img_fn):
--- a/images2html.py
+++ b/images2html.py
@ -0,0 +1,76 @@
 import os, json
 from mwclient import Site
 from pprint import pprint
 from jinja2 import Template
 from functions import pandoc, page_props
 # Wiki client plus the input/output locations, all resolved relative to this script.
 site = Site(host='hub.xpub.nl/sandbox', path='/itchwiki/')
 wd = os.path.dirname(os.path.abspath(__file__)) # directory containing this script
 imgdir = os.path.join(wd, 'images')
 imgsjson_fn = os.path.join(wd, 'images.json')
 # Load the per-image metadata; the file must already exist or open() raises.
 with open(imgsjson_fn, 'r') as imgsjson_file:
    images_info = json.load(imgsjson_file)
 static_html = os.path.join(wd, 'static_html')
 os.makedirs(static_html, exist_ok=True) # create static_html/ output dir
 with open(os.path.join(wd, 'login.txt'), 'r') as login:  # read login user & pwd
    loginlines = login.read()
 # login.txt holds the username on the first line and the password on the
 # second. splitlines() copes with a trailing newline and with CRLF endings,
 # both of which made the former split('\n') unpacking raise ValueError.
 # A file with fewer than two lines still raises ValueError here.
 user, pwd = loginlines.splitlines()[:2]
 site.login(username=user, password=pwd)  # login to wiki
 # Jinja2 source for one generated page. Placeholders filled at render time:
 # title, date, imgsrc, content, part, partof.
 page_html_template = '''
 <!DOCTYPE html>
 <html lang="en">
 <head>
    <meta charset="utf-8">
    <link rel="stylesheet" href="../static/style.css" />
    <title>{{title}}</title>
 </head>
 <body>
    <h1>{{ title }}</h1>
    <p><time datetime="{{date}}">{{date}}</time></p>
    <div id="img">
        <img src="{{ imgsrc }}" />
    </div>
    <div id="content">
        {{ content }}
    </div>
    <footer>
        Part {{part}} of {{partof}}
    </footer>
 </body>
 </html>
 '''
 # Compile once; the template object is reused for every page rendered below.
 page_template = Template(page_html_template)
 # Generate one static HTML page for every image whose wiki page carries the
 # metadata needed to name and render it.
 for img_info in images_info.values():
    print(img_info)
    page_name = img_info['name']
    page = site.pages[page_name]
    pagetext = page.text()
    pageproperties = page_props(wikicontent=pagetext)
    print(pageproperties)
    title = pageproperties.get('Title')
    if not title:
        # No Title property: this page is not meant to be published.
        continue
    part = pageproperties.get('Part')
    if part is None:
        # Part is required for the output filename; without this guard a page
        # with a Title but no Part crashed on None.zfill().
        print('skipping', page_name, '- no Part property')
        continue
    pagetext_html = pandoc(content=pagetext, format_in='mediawiki', format_out='html')
    page_html = page_template.render(title=title,
                                     date=pageproperties.get('Date'),
                                     imgsrc=os.path.join(imgdir, img_info.get('filename')),
                                     content=pagetext_html,
                                     part=part,
                                     partof=pageproperties.get('Partof'))
    # Filename: title without spaces + zero-padded part number, e.g. Foo_001.html
    htmlpage_fn = "{}_{}.html".format(title.replace(" ", ""), part.zfill(3))
    print(htmlpage_fn)
    # The template declares charset utf-8, so write the file as UTF-8
    # regardless of the platform's default encoding.
    with open(os.path.join(static_html, htmlpage_fn), 'w', encoding='utf-8') as htmlfile:
        htmlfile.write(page_html)
--- a/imgs_info.py
+++ b/imgs_info.py
@ -0,0 +1,70 @@
 import os, json
 from mwclient import Site
 from pprint import pprint
 from functions import update_json
 # Wiki client plus local paths, all resolved relative to this script.
 site = Site(host='hub.xpub.nl/sandbox', path='/itchwiki/')
 wd = os.path.dirname(os.path.abspath(__file__)) # directory containing this script
 imgdir = os.path.join(wd, 'images')
 os.makedirs(imgdir, exist_ok=True) # create images/ dir
 imgsjson_fn = os.path.join(wd, 'images.json')
 with open(os.path.join(wd, 'login.txt'), 'r') as login:  # read login user & pwd
    loginlines = login.read()
 # login.txt holds the username on the first line and the password on the
 # second. splitlines() copes with a trailing newline and with CRLF endings,
 # both of which made the former split('\n') unpacking raise ValueError.
 user, pwd = loginlines.splitlines()[:2]
 site.login(username=user, password=pwd)  # login to wiki
 # Load the per-image metadata; the file must already exist or open() raises.
 with open(imgsjson_fn, 'r') as imgsjson_file:
    images_info = json.load(imgsjson_file)
 # Exploratory inspection of a single, hard-coded image entry: dump the
 # metadata record, the wiki page object, its text and the pages that use it.
 img_info = images_info["File:CCF 003017.jpg"]  # raises KeyError if this entry is absent
 print(img_info)
 page_name = img_info['name']
 page = site.pages[page_name]
 print(page)
 pprint(page.__dict__)
 print(dir(page))
 text = page.text()
 used_in = list(page.imageusage())
 print(text, used_in)
 # response = site.api(action='browsebysubject', subject=page_name)
 # for q in response['query']['data']:
 #     print(q['property'], q['dataitem'])
 #     print(q)
 # print('result:', response['query']['data'][0]['item'])
 # print('keys', response['query'].keys())
 # Run a semantic query for the Date property and dump whatever comes back.
 # NOTE(review): the shape of each result depends on the mwclient version in
 # use; the loops below only print, so confirm the structure before relying on it.
 response = site.ask(query='[[Date::+]]|?Date', title=page_name)
 print(response)
 # import pdb; pdb.set_trace()
 for r in response:
    print('response', r)
    for title, data in r.items():
        print(data)
        # import pdb; pdb.set_trace()
        print(type(data))
        # Only descend into values that are neither str nor int (assumed to be dicts).
        if type(data) not in [str, int]:
            for k, item in data.items():
                # NOTE(review): item[k] indexes each value by its own key, which
                # looks unintended and may raise KeyError/TypeError - confirm.
                print('item', k, item[k])
                for subitem in data.items():
                    print('subitem', subitem, )
       # for k, v in data['printouts']:
       #     print(k,v)
 # action=smwbrowse&browse=page&params={ "limit": 10, "offset": 0, "search": "Main", "fullText": true, "fullURL": true }
 # for page in site.pages(page_name):
 #     print(page)
 #
 # for img_key, img_info in images_info.items():
 #     if n < 4:
 #         print(img_info)
 #         page_name = img_info['name']
 #         page = site.pages(page_name)
 #         print(page)
 #         print('\n')
--- a/static/style.css
+++ b/static/style.css
@ -0,0 +1,3 @@
 body{font-size: 12pt;}
 div#img img {width: 100%;}
		`@ -0,0 +1,3 @@`
							`body{font-size: 12pt;}`

							`div#img img {width: 100%;}`