# project.xpub.nl/scripts/insert_rdfa.py

import html5lib
from xml.etree import ElementTree as ET


# Parse the source page into an ElementTree element tree.
# namespaceHTMLElements=False asks html5lib not to put the XHTML namespace on
# tag names, so plain paths such as ".//body" and ".//div" match.
# The encoding is passed explicitly so decoding does not depend on the
# platform's locale default (assumes index.html is UTF-8 -- confirm if the
# page is ever produced in another encoding).
with open("index.html", encoding="utf-8") as fin:
    t = html5lib.parse(fin.read(), namespaceHTMLElements=False)

# Declare on <body> the prefixes ("xpub:", "dc:") used by the RDFa attribute
# values this script writes further down.
body = t.find(".//body")
body.attrib['xmlns:xpub'] = "http://xpub.nl/terms/"
body.attrib['xmlns:dc'] = "http://purl.org/dc/terms/"

# Annotate every <div class="subrow"> as one xpub:project resource.
# The direct <p> children of a row are interpreted by position:
#   0 -> student name          (property xpub:student)
#   1 -> title + project link  (property dc:title; the <a> gets xpub:project)
#   2 -> up to two links       (xpub:thesis, then xpub:thesis_image)
# Any further <p> or <a> is treated as a markup error and aborts the run.
count = 0
for div in t.findall('.//div[@class="subrow"]'):
    count += 1
    div.attrib['typeof'] = "xpub:project"
    # student/title are only used to label error messages for this row.
    student, title = None, None
    for i, p in enumerate(div.findall("./p")):
        if i == 0:
            p.attrib['property'] = "xpub:student"
            student = p.text
        elif i == 1:
            p.attrib['property'] = "dc:title"
            project_link = p.find("./a")
            # find() returns None when there is no <a>; fail with a message
            # that names the row instead of an opaque AttributeError.
            if project_link is None:
                raise ValueError(f"{student}: no project link in title column")
            href = project_link.get('href')
            if href is None:
                raise ValueError(f"{student}: project link has no href")
            project_link.attrib['property'] = "xpub:project"
            title = project_link.text
            # The row id is the last path segment of the project URL
            # (trailing slashes ignored); resource points at that same id.
            div.attrib['id'] = href.rstrip("/").split("/")[-1]
            div.attrib['resource'] = "#"+div.attrib['id']
        elif i == 2:
            for ai, pdf_link in enumerate(p.findall("./a")):
                if ai == 0:
                    pdf_link.attrib['property'] = "xpub:thesis"
                elif ai == 1:
                    pdf_link.attrib['property'] = "xpub:thesis_image"
                else:
                    raise Exception (f"{title}: too many links in PDF columns")
        else:
            raise Exception(f"{title}: too many p columns")

# Report how many rows were annotated, then serialise the modified tree.
print(f"count: {count}")
# encoding="unicode" makes ET.tostring return a str rather than bytes; the
# file is opened with an explicit UTF-8 encoding so writing non-ASCII text
# does not depend on (or fail under) the platform's locale default.
# print() is kept so the output file still ends with a trailing newline.
with open("index.rdfa.html", "w", encoding="utf-8") as fout:
    print(ET.tostring(t, method="html", encoding="unicode"), file=fout)