use markdown + html5lib to structure the text

5 years ago · d4ab1c1b65
parent 2b9cf12140
commit d4ab1c1b65
1 changed files with 26 additions and 1 deletions
--- a/readfrompad.py
+++ b/readfrompad.py
@@ -5,6 +5,15 @@ import markdown, html5lib
 sample_text = """
 # https://hub.xpub.nl/bootleglibrary/book/374
 This is an annotation of Mrs. Gersande's Binding Index
 # https://hub.xpub.nl/bootleglibrary/book/348
 Telegraph Telephone Teletype
 More TEXT HERERERE!!!!!!!!!!!!!!
 # unrelated
 Stuff now
 """
 pad_url = "https://pad.xpub.nl/p/boring_old_tomato_sandwiches"
@@ -25,5 +34,21 @@ print ()
 # Turn the html text into an element tree
 t = html5lib.parseFragment(html, namespaceHTMLElements=False)
 print (t)
 # create a "database" of paragraphs associated with each URL given in an H1
 paragraphs_by_header = {}
 current_header = None
 for elt in t:
-    print (elt)
+    if elt.tag == "h1" and elt.text is not None:
        print (("HEADER"), elt.text)
        current_header = elt.text.strip()
    elif elt.tag == "p":
        if current_header:
            if current_header not in paragraphs_by_header:
                paragraphs_by_header[current_header] = []
            paragraphs_by_header[current_header].append(elt)
 from pprint import pprint
 pprint(paragraphs_by_header)