use markdown + html5lib to structure the text

5 years ago · d4ab1c1b65
parent 2b9cf12140
commit d4ab1c1b65
1 changed file with 26 additions and 1 deletion
--- a/readfrompad.py
+++ b/readfrompad.py
@@ -5,6 +5,15 @@ import markdown, html5lib
 sample_text = """
 # https://hub.xpub.nl/bootleglibrary/book/374
 This is an annotation of Mrs. Gersande's Binding Index
+
+
+# https://hub.xpub.nl/bootleglibrary/book/348
+Telegraph Telephone Teletype
+
+More TEXT HERERERE!!!!!!!!!!!!!!
+
+# unrelated
+Stuff now
 """

 pad_url = "https://pad.xpub.nl/p/boring_old_tomato_sandwiches"
@@ -25,5 +34,21 @@ print ()
 # Turn html text in an elementtree
 t = html5lib.parseFragment(html, namespaceHTMLElements=False)
 print (t)
+
+# create a "database" of paragraphs associated with each URL given in an H1
+paragraphs_by_header = {}
+current_header = None
+
 for elt in t:
-    print (elt)
+    if elt.tag == "h1" and elt.text is not None:
+        print (("HEADER"), elt.text)
+        current_header = elt.text.strip()
+    elif elt.tag == "p":
+        if current_header:
+            if current_header not in paragraphs_by_header:
+                paragraphs_by_header[current_header] = []
+            paragraphs_by_header[current_header].append(elt)
+
+from pprint import pprint
+pprint(paragraphs_by_header)
+