From d4ab1c1b654b403b48bc8b90054875254d79cf00 Mon Sep 17 00:00:00 2001
From: Michael Murtaugh <mm@automatist.org>
Date: Tue, 14 Apr 2020 13:05:34 +0200
Subject: [PATCH] use markdown + html5lib to structure the text

---
 readfrompad.py | 27 ++++++++++++++++++++++++++-
 1 file changed, 26 insertions(+), 1 deletion(-)

diff --git a/readfrompad.py b/readfrompad.py
index 9b23a85..e48abaa 100644
--- a/readfrompad.py
+++ b/readfrompad.py
@@ -5,6 +5,15 @@ import markdown, html5lib
 sample_text = """
 # https://hub.xpub.nl/bootleglibrary/book/374
 This is an annotation of Mrs. Gersande's Binding Index
+
+
+# https://hub.xpub.nl/bootleglibrary/book/348
+Telegraph Telephone Teletype
+
+More TEXT HERERERE!!!!!!!!!!!!!!
+
+# unrelated
+Stuff now
 """
 
 pad_url = "https://pad.xpub.nl/p/boring_old_tomato_sandwiches"
@@ -25,5 +34,21 @@ print ()
 # Turn html text in an elementtree
 t = html5lib.parseFragment(html, namespaceHTMLElements=False)
 print (t)
+
+# create a "database" of paragraphs keyed by the text of the H1 above them (usually a book URL)
+paragraphs_by_header = {}
+current_header = None
+
 for elt in t:
-    print (elt)
+    if elt.tag == "h1":
+        # itertext() also sees text nested in children (e.g. a linked URL),
+        # so every H1 starts a new section, even when elt.text is None
+        current_header = "".join(elt.itertext()).strip()
+        print ("HEADER", current_header)
+    elif elt.tag == "p" and current_header:
+        # paragraphs before the first (or after an empty) H1 are skipped
+        paragraphs_by_header.setdefault(current_header, []).append(elt)
+
+from pprint import pprint
+pprint(paragraphs_by_header)
+