project.xpub.nl/scripts/insert_rdfa.py

import html5lib
from xml.etree import ElementTree as ET
import datetime


def patch_project_div (div):
    """Annotate one project ``div`` and its paragraphs with RDFa attributes.

    The div is tagged ``typeof="xpub:project"`` and ``property="dc:hasPart"``.
    Its direct ``p`` children are then handled by position:

    * 1st ``p``: gets ``property="xpub:student"``.
    * 2nd ``p``: gets ``property="dc:title"``. The ``href`` of its first
      direct ``a`` child supplies the div's ``id`` (last path segment, trailing
      slashes ignored) and ``resource`` (``/<id>/``).
    * 3rd ``p``: its first direct ``a`` child gets ``property="xpub:thesis"``;
      a second link is tolerated and left unannotated.

    The element is modified in place and nothing is returned.

    Raises:
        Exception: if the 2nd ``p`` has no direct ``a`` child, if that link has
            no ``href``, if the 3rd ``p`` holds more than two links, or if the
            div has more than three direct ``p`` children.
    """
    div.attrib['typeof'] = "xpub:project"
    div.attrib['property'] = "dc:hasPart"
    student, title = None, None
    for i, p in enumerate(div.findall("./p")):
        if i == 0:
            p.attrib['property'] = "xpub:student"
            # Kept only to identify the entry in error messages below.
            student = p.text
        elif i == 1:
            p.attrib['property'] = "dc:title"
            project_link = p.find("./a")
            if project_link is None:
                # find() returns None when there is no match; fail with a
                # message that names the entry instead of an AttributeError.
                raise Exception(f"{student}: title column has no project link")
            href = project_link.attrib.get('href')
            if href is None:
                raise Exception(f"{student}: project link has no href")
            title = project_link.text
            # The project id is the last path segment of the link target.
            project_id = href.rstrip("/").split("/")[-1]
            div.attrib['id'] = project_id
            div.attrib['resource'] = "/" + project_id + "/"
        elif i == 2:
            for ai, pdf_link in enumerate(p.findall("./a")):
                if ai == 0:
                    pdf_link.attrib['property'] = "xpub:thesis"
                elif ai > 1:
                    raise Exception (f"{title}: too many links in PDF columns")
                # ai == 1: second link is accepted but not annotated.
        else:
            raise Exception(f"{title}: too many p columns")

# English month name -> month number, used to build the ISO date below.
_MONTH_NUMBERS = {
    name: number
    for number, name in enumerate(
        (
            "January", "February", "March", "April", "May", "June", "July",
            "August", "September", "October", "November", "December",
        ),
        start=1,
    )
}


def patch_year_div(div):
    """Annotate one year ``div`` (class ``row``) as an ``xpub:gradshow``.

    The row is processed only if it contains a ``div`` with class ``col1``
    that in turn contains an ``img`` with class ``logo``; otherwise the row is
    left untouched and ``None`` is returned.

    For a processed row:

    * the row gets ``typeof="xpub:gradshow"`` and the logo ``property="dc:image"``;
    * the 1st direct ``p`` of ``col1`` gets ``property="dc:title"``; its text,
      with spaces replaced by ``-``, becomes the row's ``id`` and (prefixed
      with ``#``) its ``resource``;
    * the 2nd direct ``p`` is read as ``"<Month> <year>"``; its text is moved
      into a new ``datetime`` child element carrying ``property="dc:date"``
      and a ``datetime`` attribute holding the first day of that month in
      ISO format;
    * every descendant ``div`` with class ``subrow`` is passed to
      ``patch_project_div``.

    Progress lines are printed to stdout while processing.

    Returns:
        ``True`` if the row was annotated, ``None`` if it was skipped.

    Raises:
        ValueError: if the 2nd paragraph is not exactly two whitespace-separated
            tokens or the year is not an integer.
    """
    col1 = div.find('.//div[@class="col1"]')
    if col1 is None:
        # find() returns None when nothing matches; a row without a col1 is
        # skipped the same way as a row without a logo.
        return
    logo = col1.find('.//img[@class="logo"]')
    if logo is None:
        return
    div.attrib["typeof"] = "xpub:gradshow"
    logo.attrib["property"] = "dc:image"
    for pi, p in enumerate(col1.findall("./p")):
        print (f"{pi}: {p}: {p.text}")
        if pi == 0:
            p.attrib['property'] = "dc:title"
            div.attrib["id"] = p.text.replace(" ", "-")
            div.attrib["resource"] = "#"+div.attrib["id"]
        elif pi == 1:
            # split() without an argument tolerates padding, newlines and
            # repeated spaces around the two tokens.
            month_name, year_text = p.text.split()
            year = int(year_text)
            # Unrecognised month names fall back to July (7), which is what
            # every non-"June" value was mapped to previously.
            month = _MONTH_NUMBERS.get(month_name, 7)
            print (year, month)
            # Move the visible text into a child element that carries the
            # machine-readable date.
            # NOTE(review): HTML defines <time datetime="...">, not <datetime>;
            # the tag name is kept as-is because it changes the emitted markup.
            dt = ET.SubElement(p, "datetime")
            dt.text = p.text
            p.text = ""
            dt.attrib['datetime'] = datetime.date(year, month, 1).isoformat()
            dt.attrib['property'] = "dc:date"
    for project_div in div.findall('.//div[@class="subrow"]'):
        patch_project_div(project_div)
    return True

def patch_page(t):
    """Run ``patch_year_div`` on every ``div`` with class ``row`` found under *t*.

    *t* is searched with ``findall``; matching elements are patched in place
    and nothing is returned.
    """
    year_rows = t.findall('.//div[@class="row"]')
    for year_row in year_rows:
        patch_year_div(year_row)


# Script body: parse index.html, add the RDFa annotations, and write the
# result to index.rdfa.html in the current working directory.
#
# Both files are opened with an explicit UTF-8 encoding so the result does not
# depend on the platform's default locale encoding.
# NOTE(review): assumes index.html is UTF-8 encoded -- confirm.
with open("index.html", encoding="utf-8") as fin:
    t = html5lib.parse(fin.read(), namespaceHTMLElements=False)
body = t.find(".//body")
# Declare the xpub: and dc: prefixes used by the typeof/property values.
body.attrib['xmlns:xpub'] = "http://xpub.nl/terms/"
body.attrib['xmlns:dc'] = "http://purl.org/dc/terms/"
patch_page(t)
with open("index.rdfa.html", "w", encoding="utf-8") as fout:
    print (ET.tostring(t, method="html", encoding="unicode"), file=fout)
scripts 1 year ago			`import html5lib`
			`from xml.etree import ElementTree as ET`
more rdfa 1 year ago			`import datetime`
scripts 1 year ago

more rdfa 1 year ago			`def patch_project_div (div):`
scripts 1 year ago			`div.attrib['typeof'] = "xpub:project"`
rdfa 1 year ago			`div.attrib['property'] = "dc:hasPart"`
scripts 1 year ago			`student, title = None, None`
			`for i, p in enumerate(div.findall("./p")):`
			`# print (f"{i}: p")`
			`if i == 0:`
			`p.attrib['property'] = "xpub:student"`
			`student = p.text`
			`elif i == 1:`
			`p.attrib['property'] = "dc:title"`
			`project_link = p.find("./a")`
rdfa 1 year ago			`# project_link.attrib['property'] = "xpub:project"`
scripts 1 year ago			`title = project_link.text`
fixed pdf urls in index 1 year ago			`div.attrib['id'] = project_link.attrib['href'].rstrip("/").split("/")[-1]`
rdfa 1 year ago			`div.attrib['resource'] = "/"+div.attrib['id']+"/"`
scripts 1 year ago			`elif i == 2:`
			`for ai, pdf_link in enumerate(p.findall("./a")):`
			`if ai == 0:`
			`pdf_link.attrib['property'] = "xpub:thesis"`
			`elif ai == 1:`
more rdfa work 1 year ago			`# pdf_link.attrib['property'] = "xpub:thesis_image"`
			`pass`
scripts 1 year ago			`else:`
			`raise Exception (f"{title}: too many links in PDF columns")`
			`else:`
			`raise Exception(f"{title}: too many p columns")`

more rdfa 1 year ago			`def patch_year_div(div):`
			`col1 = div.find('.//div[@class="col1"]')`
			`logo = col1.find('.//img[@class="logo"]')`
			`if logo is None:`
			`return`
rdfa 1 year ago			`div.attrib["typeof"] = "xpub:gradshow"`
more rdfa 1 year ago			`logo.attrib["property"] = "dc:image"`
rdfa 1 year ago			`year = None`
more rdfa 1 year ago			`for pi, p in enumerate(col1.findall("./p")):`
			`print (f"{pi}: {p}: {p.text}")`
			`if pi == 0:`
			`p.attrib['property'] = "dc:title"`
rdfa 1 year ago			`div.attrib["id"] = p.text.replace(" ", "-")`
			`div.attrib["resource"] = "#"+div.attrib["id"]`
more rdfa 1 year ago			`elif pi == 1:`
			`# p.attrib['property'] = "dc:date"`
			`month, year = p.text.split(" ")`
			`year = int(year)`
			`if month == "June":`
			`month = 6`
			`else:`
			`month = 7`
			`print (year, month)`
			`dt = ET.SubElement(p, "datetime")`
			`dt.text = p.text`
			`p.text = ""`
			`dt.attrib['datetime'] = datetime.date(year, month, 1).isoformat()`
			`dt.attrib['property'] = "dc:date"`
			`# p_gradshow = col1.find('.//p[@class="gradShow"]')`
			`# p_gradshow.attrib["property"] = "dc:title"`
			`# title = p_gradshow.text`
			`# print (f"patch_gradshow: {title}")`
			`for project_div in div.findall('.//div[@class="subrow"]'):`
			`patch_project_div(project_div)`
			`return True`

			`def patch_page (t):`
			`for div in t.findall('.//div[@class="row"]'):`
			`patch_year_div(div)`


			`with open("index.html") as fin:`
			`t = html5lib.parse(fin.read(), namespaceHTMLElements=False)`
			`body = t.find(".//body")`
			`body.attrib['xmlns:xpub'] = "http://xpub.nl/terms/"`
			`body.attrib['xmlns:dc'] = "http://purl.org/dc/terms/"`
			`patch_page(t)`
			`# print (f"count: {count}")`
scripts 1 year ago			`with open("index.rdfa.html", "w") as fout:`
			`print (ET.tostring(t, method="html", encoding="unicode"), file=fout)`