From 7de5d3a23142326fea86094487ad7222e7bec87f Mon Sep 17 00:00:00 2001
From: Michael Murtaugh <mm@automatist.org>
Date: Sun, 10 Dec 2023 12:26:31 +0100
Subject: [PATCH] scripts

---
 index.html             |  2 +-
 index.rdfa.html        |  3 +--
 scripts/extract_rdf.py | 16 ++++++++++++++++
 scripts/insert_rdfa.py | 41 +++++++++++++++++++++++++++++++++++++++++
 4 files changed, 59 insertions(+), 3 deletions(-)
 create mode 100644 scripts/extract_rdf.py
 create mode 100644 scripts/insert_rdfa.py
diff --git a/index.html b/index.html
index 65f2130..eee6a99 100644
--- a/index.html
+++ b/index.html
@@ -404,7 +404,7 @@
  <div class="subrow">
  <p class="subcl1">Clara Noseda</p>
  <p class="subcl2"><a href="desire_revolution/" target="_blank" class="ext">CONSIDER DISASTER, DESIRE REVOLUTION: A repository for astropolitical research</a></p>
-  <p class="subcl3"><a href="desire_revolution/pdf/A SAILORS GUIDE TO EARTH_Clara Noseda.pdf" target="_blank" class="ext">PDF</a></p>
+  <p class="subcl3"><a href="desire_revolution/pdf/A%20SAILORS%20GUIDE%20TO%20EARTH_Clara%20Noseda.pdf" target="_blank" class="ext">PDF</a></p>
  </div>
  <div class="subrow">
  <p class="subcl1">Mika Motskobili</p>
diff --git a/index.rdfa.html b/index.rdfa.html
index a737cf8..faaf6c7 100644
--- a/index.rdfa.html
+++ b/index.rdfa.html
@@ -1,4 +1,3 @@
-<!DOCTYPE html>
 <html lang="en"><head>
   <meta charset="utf-8">
 
@@ -404,7 +403,7 @@
  <div class="subrow" typeof="xpub:project">
  <p class="subcl1" property="xpub:student">Clara Noseda</p>
  <p class="subcl2" property="dc:title"><a href="desire_revolution/" target="_blank" class="ext" property="xpub:project">CONSIDER DISASTER, DESIRE REVOLUTION: A repository for astropolitical research</a></p>
-  <p class="subcl3"><a href="desire_revolution/pdf/A SAILORS GUIDE TO EARTH_Clara Noseda.pdf" target="_blank" class="ext" property="xpub:thesis">PDF</a></p>
+  <p class="subcl3"><a href="desire_revolution/pdf/A%20SAILORS%20GUIDE%20TO%20EARTH_Clara%20Noseda.pdf" target="_blank" class="ext" property="xpub:thesis">PDF</a></p>
  </div>
  <div class="subrow" typeof="xpub:project">
  <p class="subcl1" property="xpub:student">Mika Motskobili</p>
diff --git a/scripts/extract_rdf.py b/scripts/extract_rdf.py
new file mode 100644
index 0000000..57ea3a2
--- /dev/null
+++ b/scripts/extract_rdf.py
@@ -0,0 +1,16 @@
+"""Print the RDF graph extracted from an RDFa-annotated HTML document."""
+import sys
+
+from rdflib import RDF, Graph, Namespace, URIRef
+
+XPUB = Namespace("http://xpub.nl/terms/")
+# Published index page, parsed when no source is given on the command line.
+DEFAULT_SOURCE = "https://project.xpub.nl/index.rdfa.html"
+
+# Optional first argument: another source to parse, e.g. a local index.rdfa.html.
+source = sys.argv[1] if len(sys.argv) > 1 else DEFAULT_SOURCE
+
+g = Graph()
+g.parse(source, format="rdfa")
+
+print(g.serialize())
diff --git a/scripts/insert_rdfa.py b/scripts/insert_rdfa.py
new file mode 100644
index 0000000..fc00c9a
--- /dev/null
+++ b/scripts/insert_rdfa.py
@@ -0,0 +1,41 @@
+"""Add RDFa attributes to index.html and write the result to index.rdfa.html."""
+from xml.etree import ElementTree as ET
+
+import html5lib
+
+with open("index.html", encoding="utf-8") as fin:  # charset declared by the page
+    t = html5lib.parse(fin.read(), namespaceHTMLElements=False)
+
+body = t.find(".//body")
+body.attrib['xmlns:xpub'] = "http://xpub.nl/terms/"
+body.attrib['xmlns:dc'] = "http://purl.org/dc/terms/"
+
+count = 0
+for div in t.findall('.//div[@class="subrow"]'):
+    count += 1
+    div.attrib['typeof'] = "xpub:project"
+    student = title = None
+    for i, p in enumerate(div.findall("./p")):
+        if i == 0:
+            p.attrib['property'] = "xpub:student"
+            student = p.text
+        elif i == 1:
+            p.attrib['property'] = "dc:title"
+            project_link = p.find("./a")
+            if project_link is None:  # report which row is broken
+                raise Exception(f"{student}: no project link in title column")
+            project_link.attrib['property'] = "xpub:project"
+            title = project_link.text
+        elif i == 2:
+            for ai, pdf_link in enumerate(p.findall("./a")):
+                if ai > 1:
+                    raise Exception(f"{title}: too many links in PDF columns")
+                pdf_link.attrib['property'] = ("xpub:thesis", "xpub:thesis_image")[ai]
+        else:
+            raise Exception(f"{title}: too many p columns")
+
+print(f"count: {count}")
+with open("index.rdfa.html", "w", encoding="utf-8") as fout:
+    # The serialized tree has no doctype; emit it so browsers use standards mode.
+    print("<!DOCTYPE html>", file=fout)
+    print(ET.tostring(t, method="html", encoding="unicode"), file=fout)