reorganized readfrompad with functions

master
Michael Murtaugh 5 years ago
parent d4ab1c1b65
commit 2a23d871de

@ -2,11 +2,38 @@ from urllib.request import urlopen
import markdown, html5lib
sample_text = """
def parse_pad(pad_text):
    """Group a pad's paragraphs under the level-1 heading that precedes them.

    The Markdown in *pad_text* is rendered to HTML with ``markdown.markdown``
    and parsed with ``html5lib.parseFragment``. The top-level elements of the
    parsed fragment are then walked in order.

    Args:
        pad_text: Markdown source of the pad. Each ``# heading`` is expected
            to carry the URL that the following paragraphs annotate.

    Returns:
        A dict mapping the stripped text of each ``h1`` to the list of ``p``
        elements (parsed element objects, not strings) that follow it.
        Paragraphs that appear before any non-empty heading are dropped.
    """
    fragment = html5lib.parseFragment(
        markdown.markdown(pad_text), namespaceHTMLElements=False
    )

    grouped = {}
    heading = None
    for node in fragment:
        # An h1 without direct text leaves the previous heading in effect.
        if node.tag == "h1" and node.text is not None:
            heading = node.text.strip()
            continue
        # Only paragraphs are collected, and only once a heading is active.
        if node.tag != "p" or not heading:
            continue
        grouped.setdefault(heading, []).append(node)
    return grouped
def curl(url):
    """Fetch *url* and return the response body as text.

    Args:
        url: Address to open; anything ``urllib.request.urlopen`` accepts.

    Returns:
        The full response body decoded as UTF-8.

    Raises:
        urllib.error.URLError: If the resource cannot be opened.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    # Open the address passed in by the caller rather than a module-level
    # variable, so the function also works when this module is imported.
    # The ``with`` block closes the connection once the body has been read.
    with urlopen(url) as response:
        return response.read().decode('utf-8')
if __name__== "__main__":
# THIS CODE ONLY HAPPENS WHEN YOU RUN THE SCRIPT DIRECTLY
pad_url = "https://pad.xpub.nl/p/boring_old_tomato_sandwiches"
pad_text_url = pad_url + "/export/txt"
pad_text = curl(pad_text_url)
sample_text = """
# https://hub.xpub.nl/bootleglibrary/book/374
This is an annotation of Mrs. Gersande's Binding Index
# https://hub.xpub.nl/bootleglibrary/book/348
Telegraph Telephone Teletype
@ -16,39 +43,7 @@ More TEXT HERERERE!!!!!!!!!!!!!!
Stuff now
"""
pad_url = "https://pad.xpub.nl/p/boring_old_tomato_sandwiches"
pad_text_url = pad_url + "/export/txt"
f = urlopen(pad_text_url)
pad_text = f.read().decode('utf-8')
pad_text = sample_text
print (pad_text)
print ()
# print (pad_text)
# Turn pad text into html text
html = markdown.markdown(pad_text)
print (html)
print ()
# Turn html text in an elementtree
t = html5lib.parseFragment(html, namespaceHTMLElements=False)
print (t)
# create a "database" of paragraphs associated with each URL given in an H1
paragraphs_by_header = {}
current_header = None
for elt in t:
if elt.tag == "h1" and elt.text is not None:
print (("HEADER"), elt.text)
current_header = elt.text.strip()
elif elt.tag == "p":
if current_header:
if current_header not in paragraphs_by_header:
paragraphs_by_header[current_header] = []
paragraphs_by_header[current_header].append(elt)
from pprint import pprint
pprint(paragraphs_by_header)
paragraphs_by_header = parse_pad(pad_text)
from pprint import pprint
pprint(paragraphs_by_header)

Loading…
Cancel
Save