From d4ab1c1b654b403b48bc8b90054875254d79cf00 Mon Sep 17 00:00:00 2001
From: Michael Murtaugh
Date: Tue, 14 Apr 2020 13:05:34 +0200
Subject: [PATCH] use markdown + html5lib to structure the text

---
Review notes (text between the "---" line and the diff is not part of the
commit message):
- The line structure and the 4-space Python indentation were reconstructed
  from a copy of this patch that had been flattened onto one line. The
  indentation of the removed "print (elt)" line is an assumption; confirm it
  against readfrompad.py before applying.
- Revised in review: an <h1> whose .text is None now clears current_header
  instead of leaving the previous heading active, so paragraphs under such a
  heading are no longer filed under the wrong URL. Grouping uses
  dict.setdefault. The diffstat and the second hunk header were updated to
  match; the post-image blob id on the "index" line still refers to the
  original commit.

 readfrompad.py | 31 ++++++++++++++++++++++++++++++-
 1 file changed, 30 insertions(+), 1 deletion(-)

diff --git a/readfrompad.py b/readfrompad.py
index 9b23a85..e48abaa 100644
--- a/readfrompad.py
+++ b/readfrompad.py
@@ -5,6 +5,15 @@ import markdown, html5lib
 sample_text = """
 # https://hub.xpub.nl/bootleglibrary/book/374
 This is an annotation of Mrs. Gersande's Binding Index
+
+
+# https://hub.xpub.nl/bootleglibrary/book/348
+Telegraph Telephone Teletype
+
+More TEXT HERERERE!!!!!!!!!!!!!!
+
+# unrelated
+Stuff now
 """
 
 pad_url = "https://pad.xpub.nl/p/boring_old_tomato_sandwiches"
@@ -25,5 +34,25 @@ print ()
 # Turn html text in an elementtree
 t = html5lib.parseFragment(html, namespaceHTMLElements=False)
 print (t)
+
+# Create a "database" of paragraphs associated with each URL given in an H1:
+# maps the stripped H1 text to the list of <p> elements that follow it.
+paragraphs_by_header = {}
+current_header = None
+
 for elt in t:
-    print (elt)
+    if elt.tag == "h1":
+        # Every H1 starts a new section. An H1 with no direct text (e.g. one
+        # that only wraps inline markup) clears current_header, so the
+        # paragraphs below it are not filed under the previous heading.
+        if elt.text is None:
+            current_header = None
+        else:
+            print (("HEADER"), elt.text)
+            current_header = elt.text.strip()
+    elif elt.tag == "p" and current_header:
+        paragraphs_by_header.setdefault(current_header, []).append(elt)
+
+from pprint import pprint
+pprint(paragraphs_by_header)
+