"""Annotate the grad-show index page with RDFa-style attributes.

Run as a script, this reads ``index.html``, adds ``typeof`` / ``property`` /
``resource`` / ``id`` attributes to the year rows and their project sub-rows,
and writes the result to ``index.rdfa.html``.
"""

import datetime
from xml.etree import ElementTree as ET

# English month names mapped to their 1-based month number.
_MONTH_NUMBERS = {
    name: number
    for number, name in enumerate(
        (
            "January",
            "February",
            "March",
            "April",
            "May",
            "June",
            "July",
            "August",
            "September",
            "October",
            "November",
            "December",
        ),
        start=1,
    )
}

# Month used when the month name is not recognised. The script historically
# treated every month other than "June" as July, so July stays the fallback.
_FALLBACK_MONTH = 7


def patch_project_div(div):
    """Annotate one project sub-row in place.

    The direct ``<p>`` children of *div* are interpreted by position:

    0. student name  -> ``property="xpub:student"``
    1. project title -> ``property="dc:title"``; its ``<a href>`` supplies the
       ``id`` (last path segment of the href) and ``resource`` of *div*
    2. PDF links     -> the first ``<a>`` gets ``property="xpub:thesis"``

    Raises:
        ValueError: if the title paragraph has no ``<a>`` with an ``href``,
            if the third paragraph holds more than two links, or if *div* has
            more than three ``<p>`` children. Attributes set before the
            problem was detected are left in place.
    """
    div.attrib['typeof'] = "xpub:project"
    div.attrib['property'] = "dc:hasPart"
    student, title = None, None
    for i, p in enumerate(div.findall("./p")):
        if i == 0:
            p.attrib['property'] = "xpub:student"
            student = p.text
        elif i == 1:
            p.attrib['property'] = "dc:title"
            project_link = p.find("./a")
            if project_link is None or 'href' not in project_link.attrib:
                # Fail with a message that names the row instead of an
                # opaque AttributeError/KeyError.
                raise ValueError(
                    f"{student}: project title has no link with an href"
                )
            title = project_link.text
            # "/2019/some-project/" -> "some-project"
            slug = project_link.attrib['href'].rstrip("/").split("/")[-1]
            div.attrib['id'] = slug
            div.attrib['resource'] = "/" + slug + "/"
        elif i == 2:
            for ai, pdf_link in enumerate(p.findall("./a")):
                if ai == 0:
                    pdf_link.attrib['property'] = "xpub:thesis"
                elif ai > 1:
                    raise ValueError(
                        f"{title}: too many links in PDF columns"
                    )
                # A second link is tolerated but left unannotated.
        else:
            raise ValueError(f"{title}: too many p columns")


def patch_year_div(div):
    """Annotate one year row and all project sub-rows inside it, in place.

    A row is only treated as a grad show when it contains a ``div.col1`` that
    in turn contains an ``img.logo``; any other row is left untouched.

    Within ``col1`` the direct ``<p>`` children are interpreted by position:
    the first is the title (also used, with spaces replaced by dashes, as the
    row ``id``), the second is a "<Month> <Year>" date whose text is moved
    into a child element carrying an ISO ``datetime`` attribute for the first
    day of that month.

    Returns:
        True when the row was annotated, None when it was skipped.

    Raises:
        ValueError: if the date paragraph is not exactly two
            whitespace-separated tokens ending in an integer year, or if a
            project sub-row is malformed (see ``patch_project_div``).
    """
    col1 = div.find('.//div[@class="col1"]')
    if col1 is None:
        # Rows without a first column are skipped like rows without a logo.
        return None
    logo = col1.find('.//img[@class="logo"]')
    if logo is None:
        return None

    div.attrib["typeof"] = "xpub:gradshow"
    logo.attrib["property"] = "dc:image"

    for pi, p in enumerate(col1.findall("./p")):
        print(f"{pi}: {p}: {p.text}")
        if pi == 0:
            p.attrib['property'] = "dc:title"
            div.attrib["id"] = p.text.replace(" ", "-")
            div.attrib["resource"] = "#" + div.attrib["id"]
        elif pi == 1:
            # split() rather than split(" ") so stray or doubled whitespace
            # around the two tokens does not break the unpacking.
            month_name, year_text = p.text.split()
            year = int(year_text)
            month = _MONTH_NUMBERS.get(month_name, _FALLBACK_MONTH)
            print(year, month)
            # NOTE(review): HTML defines the `datetime` attribute on <time>;
            # <datetime> is not a standard element. The tag is kept as-is so
            # the generated markup does not change - confirm whether <time>
            # was intended.
            dt = ET.SubElement(p, "datetime")
            dt.text = p.text
            p.text = ""
            dt.attrib['datetime'] = datetime.date(year, month, 1).isoformat()
            dt.attrib['property'] = "dc:date"

    for project_div in div.findall('.//div[@class="subrow"]'):
        patch_project_div(project_div)
    return True


def patch_page(t):
    """Annotate every ``div.row`` found anywhere below element *t*."""
    for div in t.findall('.//div[@class="row"]'):
        patch_year_div(div)


def main(source="index.html", target="index.rdfa.html"):
    """Parse *source*, annotate it and write the serialised HTML to *target*."""
    # Imported here so the patch_* helpers above stay importable (and
    # testable) on machines without the third-party html5lib parser.
    import html5lib

    # Explicit UTF-8 so the result does not depend on the platform locale.
    with open(source, encoding="utf-8") as fin:
        t = html5lib.parse(fin.read(), namespaceHTMLElements=False)

    body = t.find(".//body")
    body.attrib['xmlns:xpub'] = "http://xpub.nl/terms/"
    body.attrib['xmlns:dc'] = "http://purl.org/dc/terms/"
    patch_page(t)

    with open(target, "w", encoding="utf-8") as fout:
        print(ET.tostring(t, method="html", encoding="unicode"), file=fout)


if __name__ == "__main__":
    main()