"""Annotate the project listing in ``index.html`` with RDFa attributes.

Reads ``index.html`` from the current directory, marks every
``<div class="subrow">`` as an ``xpub:project`` resource, tags the ``<p>``
columns inside it (student, title/project link, thesis links), and writes
the result to ``index.rdfa.html``. The number of annotated rows is printed
to stdout.
"""
from xml.etree import ElementTree as ET

import html5lib

# RDFa ``property`` values for the links in the third column, by position.
PDF_LINK_PROPERTIES = ("xpub:thesis", "xpub:thesis_image")


def _annotate_project(div):
    """Add RDFa attributes to one ``subrow`` div and its ``<p>`` columns.

    The direct ``<p>`` children are interpreted by position:

    0. student name   -> ``property="xpub:student"``
    1. project title  -> ``property="dc:title"``; its ``<a>`` gets
       ``property="xpub:project"`` and the last path segment of the link's
       ``href`` becomes the div's ``id`` (and ``resource="#<id>"``).
    2. thesis links   -> the ``<a>`` children get the properties listed in
       ``PDF_LINK_PROPERTIES``, in order.

    Raises:
        Exception: if the title column has no link or the link has no
            ``href``, if the third column holds more links than
            ``PDF_LINK_PROPERTIES`` covers, or if there are more than
            three ``<p>`` columns.
    """
    div.attrib['typeof'] = "xpub:project"
    student, title = None, None
    for i, p in enumerate(div.findall("./p")):
        if i == 0:
            p.attrib['property'] = "xpub:student"
            student = p.text
        elif i == 1:
            p.attrib['property'] = "dc:title"
            project_link = p.find("./a")
            # find() returns None when there is no <a>; fail with a message
            # that identifies the row instead of an opaque AttributeError.
            if project_link is None:
                raise Exception(f"{student}: no project link in title column")
            href = project_link.attrib.get('href')
            if href is None:
                raise Exception(f"{student}: project link has no href")
            project_link.attrib['property'] = "xpub:project"
            title = project_link.text
            # The id is the final path segment of the project URL; a
            # trailing slash is dropped first so it is not an empty string.
            div.attrib['id'] = href.rstrip("/").split("/")[-1]
            div.attrib['resource'] = "#" + div.attrib['id']
        elif i == 2:
            for ai, pdf_link in enumerate(p.findall("./a")):
                if ai >= len(PDF_LINK_PROPERTIES):
                    raise Exception(f"{title}: too many links in PDF columns")
                pdf_link.attrib['property'] = PDF_LINK_PROPERTIES[ai]
        else:
            raise Exception(f"{title}: too many p columns")


# Explicit encoding: the platform default is not UTF-8 everywhere, which
# would break on non-ASCII names/titles.
# NOTE(review): assumes index.html is UTF-8 encoded - confirm.
with open("index.html", encoding="utf-8") as fin:
    t = html5lib.parse(fin.read(), namespaceHTMLElements=False)

# Declare the prefixes used by the property/typeof values set on the rows.
body = t.find(".//body")
body.attrib['xmlns:xpub'] = "http://xpub.nl/terms/"
body.attrib['xmlns:dc'] = "http://purl.org/dc/terms/"

count = 0
# The predicate compares the whole class attribute, so only divs whose
# class is exactly "subrow" are selected.
for div in t.findall('.//div[@class="subrow"]'):
    count += 1
    _annotate_project(div)

print(f"count: {count}")

with open("index.rdfa.html", "w", encoding="utf-8") as fout:
    print(ET.tostring(t, method="html", encoding="unicode"), file=fout)