# project.xpub.nl/scripts/insert_rdfa.py

import html5lib
from xml.etree import ElementTree as ET
import datetime


def patch_project_div (div):
    """Add RDFa attributes to a single project entry, in place.

    The div itself is marked as an ``xpub:project`` that is a ``dc:hasPart``
    of its enclosing resource. Its direct ``<p>`` children are then treated
    as positional columns:

    * 1st ``<p>``: student name, tagged ``xpub:student``.
    * 2nd ``<p>``: project title, tagged ``dc:title``. The last path segment
      of its link's ``href`` becomes the div's ``id``, and ``/<id>/`` becomes
      its ``resource``.
    * 3rd ``<p>``: up to two links; the first is tagged ``xpub:thesis``, the
      second is left unannotated.

    Args:
        div: The ElementTree element of the project entry.

    Raises:
        Exception: If the title column has no link with an ``href``, if the
            third column holds more than two links, or if the div has more
            than three ``<p>`` children.
    """
    div.attrib['typeof'] = "xpub:project"
    div.attrib['property'] = "dc:hasPart"
    student, title = None, None
    for i, p in enumerate(div.findall("./p")):
        if i == 0:
            p.attrib['property'] = "xpub:student"
            student = p.text
        elif i == 1:
            p.attrib['property'] = "dc:title"
            project_link = p.find("./a")
            # Fail with a message that identifies the entry instead of an
            # opaque AttributeError/KeyError on a missing link or href.
            if project_link is None or 'href' not in project_link.attrib:
                raise Exception(f"{student}: no project link in title column")
            title = project_link.text
            # The project identifier is the last path segment of the link,
            # ignoring any trailing slash.
            project_id = project_link.attrib['href'].rstrip("/").split("/")[-1]
            div.attrib['id'] = project_id
            div.attrib['resource'] = "/" + project_id + "/"
        elif i == 2:
            for ai, pdf_link in enumerate(p.findall("./a")):
                if ai == 0:
                    pdf_link.attrib['property'] = "xpub:thesis"
                elif ai == 1:
                    # Second link is accepted but deliberately not annotated.
                    pass
                else:
                    raise Exception (f"{title}: too many links in PDF columns")
        else:
            raise Exception(f"{title}: too many p columns")

# English month names mapped to calendar month numbers, used to turn the
# "Month YYYY" text of a year row into an ISO date.
_MONTH_NUMBERS = {
    "January": 1, "February": 2, "March": 3, "April": 4,
    "May": 5, "June": 6, "July": 7, "August": 8,
    "September": 9, "October": 10, "November": 11, "December": 12,
}


def patch_year_div(div):
    """Add RDFa attributes to one year row and its project entries, in place.

    A row is only processed when it contains a ``div.col1`` holding an
    ``img.logo``; any other row is left untouched. For a processed row:

    * the row is typed ``xpub:gradshow`` and the logo tagged ``dc:image``;
    * the 1st ``<p>`` in ``col1`` is tagged ``dc:title``; its text, with
      spaces replaced by dashes, becomes the row ``id``, and ``#<id>`` the
      row ``resource``;
    * the 2nd ``<p>`` in ``col1`` is expected to read "Month YYYY"; its text
      is moved into a child element carrying ``property="dc:date"`` and a
      ``datetime`` attribute set to the first day of that month;
    * every ``div.subrow`` below the row is passed to ``patch_project_div``.

    Args:
        div: The ElementTree element of the year row.

    Returns:
        ``True`` if the row was patched, ``None`` if it was skipped.

    Raises:
        ValueError: If the date paragraph does not consist of exactly two
            whitespace-separated tokens or the year is not an integer.
    """
    col1 = div.find('.//div[@class="col1"]')
    if col1 is None:
        # Rows without a first column are not year rows; skip them the same
        # way rows without a logo are skipped, instead of crashing below.
        return
    logo = col1.find('.//img[@class="logo"]')
    if logo is None:
        return
    div.attrib["typeof"] = "xpub:gradshow"
    logo.attrib["property"] = "dc:image"
    for pi, p in enumerate(col1.findall("./p")):
        print (f"{pi}: {p}: {p.text}")
        if pi == 0:
            p.attrib['property'] = "dc:title"
            div.attrib["id"] = p.text.replace(" ", "-")
            div.attrib["resource"] = "#"+div.attrib["id"]
        elif pi == 1:
            # split() without an argument tolerates leading, trailing and
            # repeated whitespace around the two tokens.
            month_name, year_text = p.text.split()
            year = int(year_text)
            # Unrecognised month names keep the historical fallback of July.
            month = _MONTH_NUMBERS.get(month_name, 7)
            print (year, month)
            # NOTE(review): "datetime" is not a standard HTML element name;
            # <time> may have been intended. Kept as is so that the
            # generated markup does not change - confirm.
            dt = ET.SubElement(p, "datetime")
            # Move the visible text into the new child element.
            dt.text = p.text
            p.text = ""
            dt.attrib['datetime'] = datetime.date(year, month, 1).isoformat()
            dt.attrib['property'] = "dc:date"
    for project_div in div.findall('.//div[@class="subrow"]'):
        patch_project_div(project_div)
    return True

def patch_page(t):
    """Run the year-row RDFa patch over every ``div`` with ``class="row"``.

    Matching divs are searched for anywhere below *t* and each one is handed
    to ``patch_year_div``, which modifies it in place. Only divs whose
    ``class`` attribute is exactly ``row`` match. Nothing is returned.
    """
    year_rows = t.findall('.//div[@class="row"]')
    for year_row in year_rows:
        patch_year_div(year_row)


# Parse the source page into an element tree whose tags carry no XHTML
# namespace, so that plain paths such as ".//body" match.
with open("index.html") as fin:
    t = html5lib.parse(fin.read(), namespaceHTMLElements=False)

# Declare the vocabulary prefixes used by the RDFa attributes on <body>.
body = t.find(".//body")
body.set('xmlns:xpub', "http://xpub.nl/terms/")
body.set('xmlns:dc', "http://purl.org/dc/terms/")

# Annotate every year row (and the projects inside it) in place.
patch_page(t)

# Serialise the patched tree as HTML, followed by a trailing newline.
with open("index.rdfa.html", "w") as fout:
    fout.write(ET.tostring(t, method="html", encoding="unicode") + "\n")