From 7de5d3a23142326fea86094487ad7222e7bec87f Mon Sep 17 00:00:00 2001 From: Michael Murtaugh Date: Sun, 10 Dec 2023 12:26:31 +0100 Subject: [PATCH] scripts --- index.html | 2 +- index.rdfa.html | 3 +-- scripts/extract_rdf.py | 16 ++++++++++++++++ scripts/insert_rdfa.py | 41 +++++++++++++++++++++++++++++++++++++++++ 4 files changed, 59 insertions(+), 3 deletions(-) create mode 100644 scripts/extract_rdf.py create mode 100644 scripts/insert_rdfa.py diff --git a/index.html b/index.html index 65f2130..eee6a99 100644 --- a/index.html +++ b/index.html @@ -404,7 +404,7 @@

Clara Noseda

CONSIDER DISASTER, DESIRE REVOLUTION: A repository for astropolitical research

-

PDF

+

PDF

Mika Motskobili

diff --git a/index.rdfa.html b/index.rdfa.html index a737cf8..faaf6c7 100644 --- a/index.rdfa.html +++ b/index.rdfa.html @@ -1,4 +1,3 @@ - @@ -404,7 +403,7 @@

Mika Motskobili

diff --git a/scripts/extract_rdf.py b/scripts/extract_rdf.py
new file mode 100644
index 0000000..57ea3a2
--- /dev/null
+++ b/scripts/extract_rdf.py
@@ -0,0 +1,16 @@
+"""Parse an RDFa-annotated page and print the RDF graph it embeds."""
+import sys
+
+from rdflib import Graph
+from rdflib import RDF, URIRef, Namespace
+
+# Same namespace URI that insert_rdfa.py declares for the xpub: prefix.
+XPUB = Namespace("http://xpub.nl/terms/")
+# Parsed when no argument is given; pass another URL or file path as argv[1].
+DEFAULT_SOURCE = "https://project.xpub.nl/index.rdfa.html"
+
+source = sys.argv[1] if len(sys.argv) > 1 else DEFAULT_SOURCE
+g = Graph()
+g.parse(source, format="rdfa")
+
+print(g.serialize())
diff --git a/scripts/insert_rdfa.py b/scripts/insert_rdfa.py
new file mode 100644
index 0000000..fc00c9a
--- /dev/null
+++ b/scripts/insert_rdfa.py
@@ -0,0 +1,41 @@
+"""Annotate index.html with RDFa attributes and save it as index.rdfa.html."""
+import html5lib
+from xml.etree import ElementTree as ET
+
+# Assumes index.html is UTF-8; open() alone would use the locale's default.
+with open("index.html", encoding="utf-8") as fin:
+    t = html5lib.parse(fin.read(), namespaceHTMLElements=False)
+
+# Declare the prefixes used by the typeof/property values set below.
+body = t.find(".//body")
+body.attrib['xmlns:xpub'] = "http://xpub.nl/terms/"
+body.attrib['xmlns:dc'] = "http://purl.org/dc/terms/"
+
+count = 0
+for div in t.findall('.//div[@class="subrow"]'):
+    count += 1
+    div.attrib['typeof'] = "xpub:project"
+    student, title = None, None
+    # Cells are told apart by position: 0 student, 1 title link, 2 PDF links.
+    for i, p in enumerate(div.findall("./p")):
+        if i == 0:
+            p.attrib['property'] = "xpub:student"
+            student = p.text
+        elif i == 1:
+            p.attrib['property'] = "dc:title"
+            project_link = p.find("./a")
+            if project_link is None:  # name the row instead of AttributeError
+                raise Exception(f"{student}: no link in title column")
+            project_link.attrib['property'] = "xpub:project"
+            title = project_link.text
+        elif i == 2:
+            for ai, pdf_link in enumerate(p.findall("./a")):
+                if ai > 1:
+                    raise Exception (f"{title}: too many links in PDF columns")
+                pdf_link.attrib['property'] = ("xpub:thesis", "xpub:thesis_image")[ai]
+        else:
+            raise Exception(f"{title}: too many p columns")
+
+print (f"count: {count}")
+with open("index.rdfa.html", "w", encoding="utf-8") as fout:
+    print (ET.tostring(t, method="html", encoding="unicode"), file=fout)