"""Add RDFa annotations to index.html and write the result to index.rdfa.html."""

import html5lib
from xml.etree import ElementTree as ET
# datetime.date builds the ISO-format value stored in the dc:date annotation.
import datetime
def patch_project_div(div):
    """Annotate one project ``div`` with RDFa attributes, in place.

    The div is marked ``typeof="xpub:project"`` / ``property="dc:hasPart"``.
    Its direct ``<p>`` children are interpreted by position:

    * 0 -- student name; gets ``property="xpub:student"``.
    * 1 -- project title wrapping the project link; gets
      ``property="dc:title"``. The last path segment of the link's ``href``
      becomes the div's ``id`` and ``"/<id>/"`` its ``resource``.
    * 2 -- PDF links; the first ``<a>`` gets ``property="xpub:thesis"``, a
      second one is accepted but left unannotated.

    Args:
        div: ElementTree element of a single project.

    Raises:
        Exception: if the title paragraph contains no ``<a>``, if the third
            paragraph contains more than two links, or if the div has more
            than three direct ``<p>`` children.
    """
    div.attrib['typeof'] = "xpub:project"
    div.attrib['property'] = "dc:hasPart"
    # Kept only to label error messages with whatever has been seen so far.
    student, title = None, None
    for i, p in enumerate(div.findall("./p")):
        if i == 0:
            p.attrib['property'] = "xpub:student"
            student = p.text
        elif i == 1:
            p.attrib['property'] = "dc:title"
            project_link = p.find("./a")
            if project_link is None:
                # Fail with a readable message instead of an AttributeError
                # on None further down.
                raise Exception(f"{student}: project title has no link")
            title = project_link.text
            # "/a/b/slug/" -> "slug": trailing slash dropped, last segment kept.
            project_id = project_link.attrib['href'].rstrip("/").split("/")[-1]
            div.attrib['id'] = project_id
            div.attrib['resource'] = "/" + project_id + "/"
        elif i == 2:
            for ai, pdf_link in enumerate(p.findall("./a")):
                if ai == 0:
                    pdf_link.attrib['property'] = "xpub:thesis"
                elif ai > 1:
                    raise Exception(f"{title}: too many links in PDF columns")
                # ai == 1: second link is deliberately left unannotated.
        else:
            raise Exception(f"{title}: too many p columns")
def patch_year_div(div):
    """Annotate one year ``row`` div as a graduation show, in place.

    Looks up the ``col1`` div and its ``logo`` image inside ``div``. When
    either is missing the row is left untouched. Otherwise:

    * the row gets ``typeof="xpub:gradshow"`` and the logo
      ``property="dc:image"``;
    * the first direct ``<p>`` of ``col1`` becomes the ``dc:title``; its text
      with spaces replaced by dashes becomes the row ``id`` and ``"#<id>"``
      the row ``resource``;
    * the second ``<p>`` is expected to read ``"<month> <year>"``; its text is
      moved into a child ``<datetime>`` element carrying the ISO date of the
      first day of that month and ``property="dc:date"``;
    * every descendant ``div`` with ``class="subrow"`` is passed to
      ``patch_project_div``.

    Args:
        div: ElementTree element of a year row.

    Returns:
        ``True`` when the row was annotated, ``None`` when it was skipped.

    Raises:
        ValueError: if the date paragraph is not exactly two
            whitespace-separated tokens or the year is not an integer.
    """
    col1 = div.find('.//div[@class="col1"]')
    if col1 is None:
        # Treat a row without a col1 div like a row without a logo: skip it.
        return
    logo = col1.find('.//img[@class="logo"]')
    if logo is None:
        return
    div.attrib["typeof"] = "xpub:gradshow"
    logo.attrib["property"] = "dc:image"
    for pi, p in enumerate(col1.findall("./p")):
        print(f"{pi}: {p}: {p.text}")
        if pi == 0:
            p.attrib['property'] = "dc:title"
            div.attrib["id"] = p.text.replace(" ", "-")
            div.attrib["resource"] = "#" + div.attrib["id"]
        elif pi == 1:
            # split() rather than split(" "): parsed HTML text often carries
            # surrounding or repeated whitespace.
            month_name, year_text = p.text.split()
            year = int(year_text)
            # NOTE(review): every month other than "June" is treated as July;
            # presumably the shows only happen in June or July -- confirm.
            month = 6 if month_name == "June" else 7
            print(year, month)
            # NOTE(review): "datetime" is not a standard HTML element; the
            # tag name is kept because the generated page may be consumed
            # elsewhere -- confirm whether <time> was intended.
            dt = ET.SubElement(p, "datetime")
            dt.text = p.text
            p.text = ""
            dt.attrib['datetime'] = datetime.date(year, month, 1).isoformat()
            dt.attrib['property'] = "dc:date"
    for project_div in div.findall('.//div[@class="subrow"]'):
        patch_project_div(project_div)
    return True
def patch_page(t):
    """Annotate every year row of a parsed page, in place.

    Args:
        t: root ElementTree element of the page; each descendant ``div``
            with ``class="row"`` is passed to ``patch_year_div``.
    """
    for div in t.findall('.//div[@class="row"]'):
        patch_year_div(div)
# Parse the source page. namespaceHTMLElements=False is passed so element
# tags stay un-namespaced, matching the plain tag names used in the XPath
# queries of the patch_* functions.
# NOTE(review): the input is assumed to be UTF-8 -- confirm against the
# page's declared charset.
with open("index.html", encoding="utf-8") as fin:
    t = html5lib.parse(fin.read(), namespaceHTMLElements=False)

# Declare the prefixes used by the RDFa property/typeof values on <body>.
body = t.find(".//body")
body.attrib['xmlns:xpub'] = "http://xpub.nl/terms/"
body.attrib['xmlns:dc'] = "http://purl.org/dc/terms/"

patch_page(t)

# Explicit encoding so non-ASCII text serializes the same on every platform.
with open("index.rdfa.html", "w", encoding="utf-8") as fout:
    print(ET.tostring(t, method="html", encoding="unicode"), file=fout)