You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

85 lines
3.0 KiB
Python

1 year ago
import html5lib
from xml.etree import ElementTree as ET
1 year ago
import datetime
1 year ago
1 year ago
def patch_project_div(div):
    """Annotate one project ``div`` with RDFa attributes, in place.

    The div itself is tagged ``typeof="xpub:project"`` and
    ``property="dc:hasPart"``.  Its direct ``<p>`` children are interpreted
    by position:

    0. student name  -> ``property="xpub:student"``
    1. project title -> ``property="dc:title"``; the first ``<a>`` inside it
       supplies the div's ``id`` (last path segment of its ``href``, ignoring
       trailing slashes) and ``resource`` (``"/<id>/"``).
    2. PDF links     -> the first ``<a>`` gets ``property="xpub:thesis"``; a
       second link is accepted but left unannotated.

    Args:
        div: ElementTree-style element (``attrib``/``find``/``findall``)
            holding one project.

    Raises:
        ValueError: if the title paragraph has no ``<a>`` child, or that link
            has no ``href`` attribute.
        Exception: if there are more than three ``<p>`` children, or more
            than two links in the third one.
    """
    div.attrib['typeof'] = "xpub:project"
    div.attrib['property'] = "dc:hasPart"
    student, title = None, None
    for i, p in enumerate(div.findall("./p")):
        if i == 0:
            p.attrib['property'] = "xpub:student"
            student = p.text
        elif i == 1:
            p.attrib['property'] = "dc:title"
            project_link = p.find("./a")
            href = None if project_link is None else project_link.attrib.get('href')
            if href is None:
                # Fail with a message that identifies the offending project
                # instead of an AttributeError/KeyError on the missing link.
                raise ValueError(
                    f"{student}: title paragraph has no link with an href"
                )
            title = project_link.text
            # The id is the last path segment of the project URL.
            div.attrib['id'] = href.rstrip("/").split("/")[-1]
            div.attrib['resource'] = "/" + div.attrib['id'] + "/"
        elif i == 2:
            for ai, pdf_link in enumerate(p.findall("./a")):
                if ai == 0:
                    pdf_link.attrib['property'] = "xpub:thesis"
                elif ai > 1:
                    # A second link (ai == 1) is tolerated and left as-is.
                    raise Exception(f"{title}: too many links in PDF columns")
        else:
            raise Exception(f"{title}: too many p columns")
1 year ago
def patch_year_div(div):
    """Annotate one graduation-show row ``div`` with RDFa attributes, in place.

    A row is only processed when it contains a ``div.col1`` that in turn
    contains an ``img.logo``; any other row is left untouched.  For a
    processed row:

    * the row gets ``typeof="xpub:gradshow"`` and the logo
      ``property="dc:image"``;
    * the first direct ``<p>`` of ``col1`` becomes the ``dc:title`` and its
      text (spaces replaced by dashes) becomes the row's ``id``, with
      ``resource`` set to ``"#<id>"``;
    * the second ``<p>`` is read as ``"<Month> <year>"``; its text is moved
      into a new ``<datetime>`` child carrying ``property="dc:date"`` and an
      ISO ``datetime`` attribute for the first day of that month;
    * every ``div.subrow`` below the row is passed to ``patch_project_div``.

    Args:
        div: ElementTree-style element for one row.

    Returns:
        ``True`` when the row was annotated, ``None`` when it was skipped.

    Raises:
        ValueError: if the date paragraph is not of the form
            ``"<Month> <year>"`` or the year is not an integer.
    """
    month_names = (
        "January", "February", "March", "April", "May", "June",
        "July", "August", "September", "October", "November", "December",
    )

    col1 = div.find('.//div[@class="col1"]')
    if col1 is None:
        # Treat a row without a first column like a row without a logo.
        return
    logo = col1.find('.//img[@class="logo"]')
    if logo is None:
        return

    div.attrib["typeof"] = "xpub:gradshow"
    logo.attrib["property"] = "dc:image"

    for pi, p in enumerate(col1.findall("./p")):
        # Progress trace on stdout, kept from the original script.
        print(f"{pi}: {p}: {p.text}")
        if pi == 0:
            p.attrib['property'] = "dc:title"
            div.attrib["id"] = p.text.replace(" ", "-")
            div.attrib["resource"] = "#" + div.attrib["id"]
        elif pi == 1:
            parts = (p.text or "").split()
            if len(parts) != 2:
                raise ValueError(
                    f"expected a '<Month> <year>' date, got {p.text!r}"
                )
            month_name, year_text = parts
            year = int(year_text)
            if month_name in month_names:
                month = month_names.index(month_name) + 1
            else:
                # Unrecognised names keep the script's historical fallback
                # (everything that was not "June" used to be dated July).
                month = 7
            print(year, month)
            # NOTE(review): HTML's element for machine-readable dates is
            # <time>; the <datetime> tag is kept because existing consumers
            # of the output may select on it -- confirm before renaming.
            dt = ET.SubElement(p, "datetime")
            dt.text = p.text
            p.text = ""
            dt.attrib['datetime'] = datetime.date(year, month, 1).isoformat()
            dt.attrib['property'] = "dc:date"

    for project_div in div.findall('.//div[@class="subrow"]'):
        patch_project_div(project_div)
    return True
def patch_page(t):
    """Apply ``patch_year_div`` to every ``<div class="row">`` beneath *t*.

    Args:
        t: parsed document (or any element) searched with
            ``findall('.//div[@class="row"]')``.
    """
    year_rows = t.findall('.//div[@class="row"]')
    for year_row in year_rows:
        patch_year_div(year_row)
# Parse the source page. The encoding is pinned rather than left to the
# platform's locale default; assumes index.html is UTF-8 -- confirm.
with open("index.html", encoding="utf-8") as fin:
    t = html5lib.parse(fin.read(), namespaceHTMLElements=False)

# Declare the prefixes used by the typeof/property values added below.
body = t.find(".//body")
body.attrib['xmlns:xpub'] = "http://xpub.nl/terms/"
body.attrib['xmlns:dc'] = "http://purl.org/dc/terms/"
patch_page(t)

# Serialise the annotated tree next to the input, again as UTF-8 so that
# non-ASCII text cannot fail to encode under a narrower locale encoding.
with open("index.rdfa.html", "w", encoding="utf-8") as fout:
    print(ET.tostring(t, method="html", encoding="unicode"), file=fout)