project.xpub.nl/scripts/insert_rdfa.py

import html5lib
from xml.etree import ElementTree as ET


# Add RDFa attributes to index.html and write the result to index.rdfa.html.
#
# Every <div class="subrow"> is tagged as an xpub:project. Its direct <p>
# children are annotated by position:
#   0 -> property="xpub:student"
#   1 -> property="dc:title"; its first <a> supplies the div's id/resource
#   2 -> up to two <a> links: xpub:thesis, then xpub:thesis_image
# Any further <p> or <a> is treated as an error in the input.

# Read as UTF-8 explicitly so decoding does not depend on the platform's
# locale encoding.
with open("index.html", encoding="utf-8") as fin:
    t = html5lib.parse(fin.read(), namespaceHTMLElements=False)

# Declare the prefixes used by the typeof/property values set below.
body = t.find(".//body")
body.attrib['xmlns:xpub'] = "http://xpub.nl/terms/"
body.attrib['xmlns:dc'] = "http://purl.org/dc/terms/"

count = 0
for div in t.findall('.//div[@class="subrow"]'):
    count += 1
    div.attrib['typeof'] = "xpub:project"
    # Only used to make the error messages below identify the offending row.
    title = None
    for i, p in enumerate(div.findall("./p")):
        if i == 0:
            p.attrib['property'] = "xpub:student"
        elif i == 1:
            p.attrib['property'] = "dc:title"
            project_link = p.find("./a")
            # find() returns None when there is no <a>; fail with a message
            # that names the row instead of a bare AttributeError/KeyError.
            if project_link is None or 'href' not in project_link.attrib:
                raise Exception(f"subrow {count}: no project link with href in title column")
            project_link.attrib['property'] = "xpub:project"
            title = project_link.text
            # The id is the last path segment of the project URL, ignoring
            # any trailing slash.
            div.attrib['id'] = project_link.attrib['href'].rstrip("/").split("/")[-1]
            div.attrib['resource'] = "#"+div.attrib['id']
        elif i == 2:
            for ai, pdf_link in enumerate(p.findall("./a")):
                if ai == 0:
                    pdf_link.attrib['property'] = "xpub:thesis"
                elif ai == 1:
                    pdf_link.attrib['property'] = "xpub:thesis_image"
                else:
                    raise Exception (f"{title}: too many links in PDF columns")
        else:
            raise Exception(f"{title}: too many p columns")

print (f"count: {count}")
# Write as UTF-8 explicitly: the serialized document is a str and may contain
# characters the locale's default encoding cannot represent.
with open("index.rdfa.html", "w", encoding="utf-8") as fout:
    print (ET.tostring(t, method="html", encoding="unicode"), file=fout)
scripts 7 months ago			`import html5lib`
			`from xml.etree import ElementTree as ET`


			`with open("index.html") as fin:`
			`t = html5lib.parse(fin.read(), namespaceHTMLElements=False)`

			`body = t.find(".//body")`
			`body.attrib['xmlns:xpub'] = "http://xpub.nl/terms/"`
			`body.attrib['xmlns:dc'] = "http://purl.org/dc/terms/"`

			`count = 0`
			`for div in t.findall('.//div[@class="subrow"]'):`
			`count += 1`
			`div.attrib['typeof'] = "xpub:project"`
			`student, title = None, None`
			`for i, p in enumerate(div.findall("./p")):`
			`# print (f"{i}: p")`
			`if i == 0:`
			`p.attrib['property'] = "xpub:student"`
			`student = p.text`
			`elif i == 1:`
			`p.attrib['property'] = "dc:title"`
			`project_link = p.find("./a")`
			`project_link.attrib['property'] = "xpub:project"`
			`title = project_link.text`
fixed pdf urls in index 7 months ago			`div.attrib['id'] = project_link.attrib['href'].rstrip("/").split("/")[-1]`
			`div.attrib['resource'] = "#"+div.attrib['id']`
scripts 7 months ago			`elif i == 2:`
			`for ai, pdf_link in enumerate(p.findall("./a")):`
			`if ai == 0:`
			`pdf_link.attrib['property'] = "xpub:thesis"`
			`elif ai == 1:`
			`pdf_link.attrib['property'] = "xpub:thesis_image"`
			`else:`
			`raise Exception (f"{title}: too many links in PDF columns")`
			`else:`
			`raise Exception(f"{title}: too many p columns")`

			`print (f"count: {count}")`
			`with open("index.rdfa.html", "w") as fout:`
			`print (ET.tostring(t, method="html", encoding="unicode"), file=fout)`