use markdown + html5lib to structure the text

master
Michael Murtaugh 5 years ago
parent 2b9cf12140
commit d4ab1c1b65

@ -5,6 +5,15 @@ import markdown, html5lib
sample_text = """ sample_text = """
# https://hub.xpub.nl/bootleglibrary/book/374 # https://hub.xpub.nl/bootleglibrary/book/374
This is an annotation of Mrs. Gersande's Binding Index This is an annotation of Mrs. Gersande's Binding Index
# https://hub.xpub.nl/bootleglibrary/book/348
Telegraph Telephone Teletype
More TEXT HERERERE!!!!!!!!!!!!!!
# unrelated
Stuff now
""" """
pad_url = "https://pad.xpub.nl/p/boring_old_tomato_sandwiches" pad_url = "https://pad.xpub.nl/p/boring_old_tomato_sandwiches"
@ -25,5 +34,21 @@ print ()
# Turn the HTML text into an element tree # Turn the HTML text into an element tree
t = html5lib.parseFragment(html, namespaceHTMLElements=False) t = html5lib.parseFragment(html, namespaceHTMLElements=False)
print (t) print (t)
# create a "database" of paragraphs associated with each URL given in an H1
paragraphs_by_header = {}
current_header = None
for elt in t: for elt in t:
print (elt) if elt.tag == "h1" and elt.text is not None:
print (("HEADER"), elt.text)
current_header = elt.text.strip()
elif elt.tag == "p":
if current_header:
if current_header not in paragraphs_by_header:
paragraphs_by_header[current_header] = []
paragraphs_by_header[current_header].append(elt)
from pprint import pprint
pprint(paragraphs_by_header)

Loading…
Cancel
Save