|
|
@ -2,53 +2,48 @@ from urllib.request import urlopen
|
|
|
|
import markdown, html5lib
|
|
|
|
import markdown, html5lib
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def parse_pad(pad_text):
    """Group the paragraphs of a Markdown pad under the H1 heading they follow.

    The pad text is rendered to HTML with ``markdown`` and parsed into an
    element tree with ``html5lib``. The top-level elements are then walked in
    document order: every ``<h1>`` that has text becomes the current key, and
    every ``<p>`` that follows is appended to that key's list.

    Args:
        pad_text: Markdown source of the pad, as a string.

    Returns:
        A dict mapping the stripped text of each H1 to the list of ``<p>``
        elements found after it. Paragraphs that appear before the first
        heading (or after a heading whose text is empty once stripped) are
        dropped. A heading with no paragraphs gets no entry.
    """
    # Turn pad text into html text
    html = markdown.markdown(pad_text)

    # Turn html text into an element tree. namespaceHTMLElements=False is
    # passed so the tags can be compared as plain "h1" / "p" below.
    fragment = html5lib.parseFragment(html, namespaceHTMLElements=False)

    # Create a "database" of paragraphs associated with each URL given in an H1
    paragraphs_by_header = {}
    current_header = None

    for elt in fragment:
        if elt.tag == "h1" and elt.text is not None:
            # An h1 without leading text (elt.text is None) does not change
            # the current header; paragraphs keep going to the previous one.
            current_header = elt.text.strip()
        elif elt.tag == "p" and current_header:
            paragraphs_by_header.setdefault(current_header, []).append(elt)

    return paragraphs_by_header
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def curl(url):
    """Fetch *url* and return the response body decoded as UTF-8 text.

    Args:
        url: Address to download; anything ``urllib.request.urlopen`` accepts.

    Returns:
        The full response body as a ``str``.

    Raises:
        urllib.error.URLError: If the resource cannot be opened.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    # Open the URL that was passed in. Reading the module-level
    # ``pad_text_url`` here would ignore the argument and fail with a
    # NameError whenever this module is imported rather than run directly.
    with urlopen(url) as response:
        return response.read().decode('utf-8')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # THIS CODE ONLY HAPPENS WHEN YOU RUN THE SCRIPT DIRECTLY
    from pprint import pprint

    pad_url = "https://pad.xpub.nl/p/boring_old_tomato_sandwiches"
    pad_text_url = pad_url + "/export/txt"
    pad_text = curl(pad_text_url)

    # Offline sample in the same format as the pad. It is not used by
    # default; pass it to parse_pad() instead of pad_text to test without
    # network access.
    sample_text = """
# https://hub.xpub.nl/bootleglibrary/book/374
This is an annotation of Mrs. Gersande's Binding Index

# https://hub.xpub.nl/bootleglibrary/book/348
Telegraph Telephone Teletype

More TEXT HERERERE!!!!!!!!!!!!!!

# unrelated
Stuff now
"""

    # Parse the downloaded pad and show the heading -> paragraphs mapping once.
    paragraphs_by_header = parse_pad(pad_text)
    pprint(paragraphs_by_header)
|
|
|
|
|
|
|
|
|
|
|
|