xpub.nl/scripts/html5tidy

#!/usr/bin/python
from __future__ import print_function
from html5lib import parse
import os, sys
from argparse import ArgumentParser
from xml.etree import ElementTree as ET 


def etree_indent(elem, level=0):
    """Pretty-print an ElementTree element in place.

    Rewrites whitespace-only (or missing) ``text``/``tail`` values of
    *elem* and its descendants so that each nesting level is indented by
    two spaces. Text and tails containing non-whitespace are left alone.
    *level* is the nesting depth of *elem*; returns None.
    """
    pad = "\n" + "  " * level

    def blank(value):
        # True for None, "" and whitespace-only strings.
        return not value or not value.strip()

    # Leaf element: only its tail may need adjusting, and never for the root.
    if not len(elem):
        if level and blank(elem.tail):
            elem.tail = pad
        return

    if blank(elem.text):
        elem.text = pad + "  "
    if blank(elem.tail):
        elem.tail = pad
    for child in elem:
        etree_indent(child, level + 1)
    # The last child's tail precedes this element's closing tag, so it is
    # pulled back to this element's own indentation.
    if blank(child.tail):
        child.tail = pad

def get_link_type (url):
    """Guess a MIME type from the (case-insensitive) suffix of *url*.

    Returns the MIME type string for a recognised suffix, or None when the
    suffix is not one of the known ones.
    """
    lowered = url.lower()
    suffix_table = (
        ((".html", ".htm"), "text/html"),
        ((".txt",), "text/plain"),
        ((".rss",), "application/rss+xml"),
        ((".atom",), "application/atom+xml"),
        ((".json",), "application/json"),
        ((".js", ".jsonp"), "text/javascript"),
    )
    for suffixes, mimetype in suffix_table:
        if lowered.endswith(suffixes):
            return mimetype
    return None

def pluralize (x):
    """Return *x* unchanged if it is exactly a list or tuple, else ``(x,)``."""
    # Exact type comparison: instances of list/tuple subclasses get wrapped.
    return x if type(x) in (list, tuple) else (x,)

def html5tidy (doc, charset="utf-8", title=None, scripts=None, links=None, indent=False):
    """Ensure common <head> elements exist in a parsed HTML tree (in place).

    doc     -- ElementTree element (or tree) containing a <head> element.
    charset -- if truthy and no <meta> with a charset attribute exists,
               a <meta charset=...> with this value is appended to <head>.
    title   -- if not None, the text of <title> is set to this value; the
               element is created in <head> when missing.
    scripts -- a src string or a list/tuple of them; a <script src=...> is
               appended to <head> for each src not already in the document.
    links   -- iterable of dicts with keys "href" and "rel" and optional
               "type" and "title". A <link> whose href already exists is
               updated, otherwise a new one is appended to <head>.
    indent  -- if true, re-indent the whole tree with etree_indent().

    Returns doc.
    """
    # New elements are always appended to <head>; look it up once.
    head = doc.find(".//head")

    if scripts:
        script_srcs = [x.attrib.get("src") for x in doc.findall(".//script")]
        for src in pluralize(scripts):
            if src not in script_srcs:
                ET.SubElement(head, "script", src=src)
                script_srcs.append(src)

    if links:
        existinglinks = {}
        for elt in doc.findall(".//link"):
            href = elt.attrib.get("href")
            if href:
                existinglinks[href] = elt
        for link in links:
            href = link["href"]
            linktype = link.get("type") or get_link_type(href)
            # Compare against None: a childless Element is falsy.
            elt = existinglinks.get(href)
            if elt is None:
                elt = ET.SubElement(head, "link", href=href, rel=link["rel"])
                # Remember it so a repeated href updates instead of duplicating.
                existinglinks[href] = elt
            else:
                elt.attrib["rel"] = link["rel"]
            if linktype:
                elt.attrib["type"] = linktype
            if "title" in link:
                elt.attrib["title"] = link["title"]

    if charset:
        has_charset = any(
            x.attrib.get("charset") is not None for x in doc.findall(".//meta"))
        if not has_charset:
            # Use the parameter, not the command-line globals, so the function
            # also works when imported as a library.
            ET.SubElement(head, "meta", charset=charset)

    if title is not None:
        titleelt = doc.find(".//title")
        # "is None", not truthiness: an existing <title> has no child
        # elements and would therefore test as false.
        if titleelt is None:
            titleelt = ET.SubElement(head, "title")
        titleelt.text = title

    if indent:
        etree_indent(doc)
    return doc


def _add_links (links, items, rel, _type=None):
    """Append one link dict per entry of *items* to *links*.

    Each entry is either an href string or a list of 1 to 3 strings
    (href, title, type). Lists of any other length are skipped.
    """
    for item in items:
        d = {"rel": rel}
        if _type:
            d["type"] = _type
        if isinstance(item, list):
            if not 1 <= len(item) <= 3:
                continue
            d["href"] = item[0]
            if len(item) >= 2:
                d["title"] = item[1]
            if len(item) == 3:
                # An explicit type on the command line overrides _type.
                d["type"] = item[2]
        else:
            d["href"] = item
        links.append(d)


def _main (argv=None):
    """Command-line entry point: parse HTML, tidy its <head>, write it out."""
    p = ArgumentParser("")
    p.add_argument("input", nargs="?", default=None)
    p.add_argument("--indent", default=False, action="store_true")
    p.add_argument("--mogrify", default=False, action="store_true", help="modify file in place")
    p.add_argument("--method", default="html", help="method, default: html, values: html, xml, text")
    p.add_argument("--output", default=None, help="")
    p.add_argument("--title", default=None, help="ensure/add title tag in head")
    p.add_argument("--charset", default="utf-8", help="ensure/add meta tag with charset")
    p.add_argument("--script", action="append", default=[], help="ensure/add script tag")
    # <link>s, see https://www.w3.org/TR/html5/links.html#links
    p.add_argument("--stylesheet", action="append", default=[], help="ensure/add style link")
    p.add_argument("--alternate", action="append", default=[], nargs="+", help="ensure/add alternate links (optionally followed by a title and type)")
    p.add_argument("--next", action="append", default=[], nargs="+", help="ensure/add next link")
    p.add_argument("--prev", action="append", default=[], nargs="+", help="ensure/add prev link")
    p.add_argument("--search", action="append", default=[], nargs="+", help="ensure/add search link")
    p.add_argument("--rss", action="append", default=[], nargs="+", help="ensure/add alternate link of type application/rss+xml")
    p.add_argument("--atom", action="append", default=[], nargs="+", help="ensure/add alternate link of type application/atom+xml")

    args = p.parse_args(argv)
    # --output takes precedence over --mogrify; in-place editing needs a path.
    if args.mogrify and not args.output and not args.input:
        p.error("--mogrify requires an input file")

    links = []
    for rel in ("stylesheet", "alternate", "next", "prev", "search"):
        _add_links(links, getattr(args, rel), rel)
    # args.rss / args.atom are lists of [href, title?, type?] lists; pass the
    # outer list so each inner list becomes exactly one link.
    _add_links(links, args.rss, rel="alternate", _type="application/rss+xml")
    _add_links(links, args.atom, rel="alternate", _type="application/atom+xml")

    # INPUT
    if args.input:
        with open(args.input) as fin:
            doc = parse(fin, namespaceHTMLElements=False)
    else:
        doc = parse(sys.stdin, namespaceHTMLElements=False)
    html5tidy(doc, charset=args.charset, scripts=args.script, links=links, title=args.title, indent=args.indent)

    # OUTPUT
    markup = ET.tostring(doc, method=args.method)
    if not isinstance(markup, str):
        # Python 3: tostring() returns ASCII bytes by default (non-ASCII
        # characters become character references); decode so that print
        # writes the markup rather than a b'...' repr.
        markup = markup.decode("ascii")

    tmppath = None
    if args.output:
        outpath = args.output
    elif args.mogrify:
        outpath = tmppath = args.input + ".tmp"
    else:
        outpath = None

    if outpath is None:
        print(markup, file=sys.stdout)
    else:
        with open(outpath, "w") as fout:
            print(markup, file=fout)

    if tmppath:
        # Keep the original as "<input>~" and move the new file into place.
        os.rename(args.input, args.input + "~")
        os.rename(tmppath, args.input)


if __name__ == "__main__":
    _main()
new site 8 years ago			`#!/usr/bin/python`
			`from __future__ import print_function`
			`from html5lib import parse`
			`import os, sys`
			`from argparse import ArgumentParser`
			`from xml.etree import ElementTree as ET`


			`def etree_indent(elem, level=0):`
			`i = "\n" + level*" "`
			`if len(elem):`
			`if not elem.text or not elem.text.strip():`
			`elem.text = i + " "`
			`if not elem.tail or not elem.tail.strip():`
			`elem.tail = i`
			`for elem in elem:`
			`etree_indent(elem, level+1)`
			`if not elem.tail or not elem.tail.strip():`
			`elem.tail = i`
			`else:`
			`if level and (not elem.tail or not elem.tail.strip()):`
			`elem.tail = i`

			`def get_link_type (url):`
			`lurl = url.lower()`
			`if lurl.endswith(".html") or lurl.endswith(".htm"):`
			`return "text/html"`
			`elif lurl.endswith(".txt"):`
			`return "text/plain"`
			`elif lurl.endswith(".rss"):`
			`return "application/rss+xml"`
			`elif lurl.endswith(".atom"):`
			`return "application/atom+xml"`
			`elif lurl.endswith(".json"):`
			`return "application/json"`
			`elif lurl.endswith(".js") or lurl.endswith(".jsonp"):`
			`return "text/javascript"`

			`def pluralize (x):`
			`if type(x) == list or type(x) == tuple:`
			`return x`
			`else:`
			`return (x,)`

			`def html5tidy (doc, charset="utf-8", title=None, scripts=None, links=None, indent=False):`
			`if scripts:`
			`script_srcs = [x.attrib.get("src") for x in doc.findall(".//script")]`
			`for src in pluralize(scripts):`
			`if src not in script_srcs:`
			`script = ET.SubElement(doc.find(".//head"), "script", src=src)`
			`script_srcs.append(src)`

			`if links:`
			`existinglinks = {}`
			`for elt in doc.findall(".//link"):`
			`href = elt.attrib.get("href")`
			`if href:`
			`existinglinks[href] = elt`
			`for link in links:`
			`linktype = link.get("type") or get_link_type(link["href"])`
			`if link["href"] in existinglinks:`
			`elt = existinglinks[link["href"]]`
			`elt.attrib["rel"] = link["rel"]`
			`else:`
			`elt = ET.SubElement(doc.find(".//head"), "link", href=link["href"], rel=link["rel"])`
			`if linktype:`
			`elt.attrib["type"] = linktype`
			`if "title" in link:`
			`elt.attrib["title"] = link["title"]`

			`if charset:`
			`meta_charsets = [x.attrib.get("charset") for x in doc.findall(".//meta") if x.attrib.get("charset") != None]`
			`if not meta_charsets:`
			`meta = ET.SubElement(doc.find(".//head"), "meta", charset=args.charset)`

			`if title != None:`
			`titleelt = doc.find(".//title")`
			`if not titleelt:`
			`titleelt = ET.SubElement(doc.find(".//head"), "title")`
			`titleelt.text = title`

			`if indent:`
			`etree_indent(doc)`
			`return doc`


			`if __name__ == "__main__":`
			`p = ArgumentParser("")`
			`p.add_argument("input", nargs="?", default=None)`
			`p.add_argument("--indent", default=False, action="store_true")`
			`p.add_argument("--mogrify", default=False, action="store_true", help="modify file in place")`
			`p.add_argument("--method", default="html", help="method, default: html, values: html, xml, text")`
			`p.add_argument("--output", default=None, help="")`
			`p.add_argument("--title", default=None, help="ensure/add title tag in head")`
			`p.add_argument("--charset", default="utf-8", help="ensure/add meta tag with charset")`
			`p.add_argument("--script", action="append", default=[], help="ensure/add script tag")`
			`# <link>s, see https://www.w3.org/TR/html5/links.html#links`
			`p.add_argument("--stylesheet", action="append", default=[], help="ensure/add style link")`
			`p.add_argument("--alternate", action="append", default=[], nargs="+", help="ensure/add alternate links (optionally followed by a title and type)")`
			`p.add_argument("--next", action="append", default=[], nargs="+", help="ensure/add alternate link")`
			`p.add_argument("--prev", action="append", default=[], nargs="+", help="ensure/add alternate link")`
			`p.add_argument("--search", action="append", default=[], nargs="+", help="ensure/add search link")`
			`p.add_argument("--rss", action="append", default=[], nargs="+", help="ensure/add alternate link of type application/rss+xml")`
			`p.add_argument("--atom", action="append", default=[], nargs="+", help="ensure/add alternate link of type application/atom+xml")`

			`args = p.parse_args()`
			`links = []`
			`def add_links (links, items, rel, _type=None):`
			`for href in items:`
			`d = {}`
			`d["rel"] = rel`
			`if _type:`
			`d["type"] = _type`

			`if type(href) == list:`
			`if len(href) == 1:`
			`d["href"] = href[0]`
			`elif len(href) == 2:`
			`d["href"] = href[0]`
			`d["title"] = href[1]`
			`elif len(href) == 3:`
			`d["href"] = href[0]`
			`d["title"] = href[1]`
			`d["type"] = href[2]`
			`else:`
			`continue`
			`else:`
			`d["href"] = href`

			`links.append(d)`
			`for rel in ("stylesheet", "alternate", "next", "prev", "search"):`
			`add_links(links, getattr(args, rel), rel)`
			`for item in args.rss:`
			`add_links(links, item, rel="alternate", _type="application/rss+xml")`
			`for item in args.atom:`
			`add_links(links, item, rel="alternate", _type="application/atom+xml")`

			`# INPUT`
			`if args.input:`
			`fin = open(args.input)`
			`else:`
			`fin = sys.stdin`

			`doc = parse(fin, namespaceHTMLElements=False)`
			`if fin != sys.stdin:`
			`fin.close()`
			`html5tidy(doc, scripts=args.script, links=links, title=args.title, indent=args.indent)`

			`# OUTPUT`
			`tmppath = None`
			`if args.output:`
			`fout = open(args.output, "w")`
			`elif args.mogrify:`
			`tmppath = args.input+".tmp"`
			`fout = open(tmppath, "w")`
			`else:`
			`fout = sys.stdout`

			`print (ET.tostring(doc, method=args.method), file=fout)`

			`if fout != sys.stdout:`
			`fout.close()`

			`if tmppath:`
			`os.rename(args.input, args.input+"~")`
			`os.rename(tmppath, args.input)`