scripts
parent
87352c6eed
commit
7de5d3a231
@ -0,0 +1,16 @@
|
|||||||
|
from rdflib import Graph
|
||||||
|
from rdflib import RDF, URIRef, Namespace
|
||||||
|
XPUB = Namespace("http://xpub.nl/terms/")
|
||||||
|
import sys
|
||||||
|
|
||||||
|
# url = "https://gallery.constantvzw.org/index.php/Maison-des-arts-expo/"
|
||||||
|
|
||||||
|
# g = Graph()
|
||||||
|
# with open("index.rdfa.html") as fin:
|
||||||
|
# data = fin.read()
|
||||||
|
# g.parse(publicID="index.html", data=data, format="rdfa")
|
||||||
|
# # g.parse(file=fin, format="rdfa", override_encoding="utf-8")
|
||||||
|
# Location of the RDFa-annotated index page to load.
SOURCE_URL = "https://project.xpub.nl/index.rdfa.html"

# Build an empty graph and fill it from the page using the "rdfa" parser.
g = Graph()
g.parse(SOURCE_URL, format="rdfa")

# Serialize the whole graph (no arguments, so the library defaults apply)
# and write the result to stdout.
serialized = g.serialize()
print(serialized)
|
@ -0,0 +1,41 @@
|
|||||||
|
import html5lib
|
||||||
|
from xml.etree import ElementTree as ET
|
||||||
|
|
||||||
|
|
||||||
|
# Annotate the project listing in index.html with RDFa attributes and write
# the result to index.rdfa.html.
#
# Every <div class="subrow"> is marked as an xpub:project. Its direct <p>
# children are interpreted by position:
#   0 -> student name           (property="xpub:student")
#   1 -> project title          (property="dc:title"), whose <a> child gets
#                               property="xpub:project"
#   2 -> up to two <a> links:   thesis (xpub:thesis), then
#                               thesis image (xpub:thesis_image)
# Any additional <p> or link raises, so unexpected markup is noticed instead
# of being silently left unannotated.

# Read explicitly as UTF-8 so decoding does not depend on the locale's
# default encoding.
with open("index.html", encoding="utf-8") as fin:
    # namespaceHTMLElements=False: per the html5lib docs this turns off
    # namespacing of HTML elements, which the unqualified paths below rely on.
    t = html5lib.parse(fin.read(), namespaceHTMLElements=False)

# Declare the prefixes used by the typeof/property values added below.
body = t.find(".//body")
body.attrib['xmlns:xpub'] = "http://xpub.nl/terms/"
body.attrib['xmlns:dc'] = "http://purl.org/dc/terms/"

count = 0
for div in t.findall('.//div[@class="subrow"]'):
    count += 1
    div.attrib['typeof'] = "xpub:project"
    # student/title are only used to make error messages identifiable.
    student, title = None, None
    for i, p in enumerate(div.findall("./p")):
        if i == 0:
            p.attrib['property'] = "xpub:student"
            student = p.text
        elif i == 1:
            p.attrib['property'] = "dc:title"
            project_link = p.find("./a")
            if project_link is None:
                # Fail with context rather than an AttributeError on None.
                raise ValueError(f"{student}: no project link in title column")
            project_link.attrib['property'] = "xpub:project"
            title = project_link.text
        elif i == 2:
            for ai, pdf_link in enumerate(p.findall("./a")):
                if ai == 0:
                    pdf_link.attrib['property'] = "xpub:thesis"
                elif ai == 1:
                    pdf_link.attrib['property'] = "xpub:thesis_image"
                else:
                    raise ValueError(f"{title}: too many links in PDF columns")
        else:
            raise ValueError(f"{title}: too many p columns")

# Report how many project rows were annotated.
print(f"count: {count}")

# Write explicitly as UTF-8 so non-ASCII text serializes regardless of locale.
with open("index.rdfa.html", "w", encoding="utf-8") as fout:
    print(ET.tostring(t, method="html", encoding="unicode"), file=fout)
|
||||||
|
|
Loading…
Reference in New Issue