From 2a23d871de0207c81120b9a66d2cfaa6c0dc33eb Mon Sep 17 00:00:00 2001
From: Michael Murtaugh
Date: Tue, 14 Apr 2020 13:11:45 +0200
Subject: [PATCH] reorganized readfrompad with functions

---
Review notes (this section is not part of the commit message):
 * curl() fetched the module-level name pad_text_url instead of its own
   `url` argument. That name is only assigned inside the __main__ block, so
   calling curl() after importing readfrompad raised NameError. curl() now
   fetches `url` and closes the HTTP response through a `with` block.
 * The change replaces three added lines with three added lines, so the hunk
   line counts and the diffstat below are unchanged. The blob ids on the
   `index` line still describe the original version of the patch.
 * NOTE(review): line breaks and indentation were restored from the hunk
   line counts. The position of the blank context lines (in particular the
   trailing blank context line of the second hunk) should be confirmed
   against the base file before applying.

 readfrompad.py | 69 +++++++++++++++++++----------------------
 1 file changed, 32 insertions(+), 37 deletions(-)

diff --git a/readfrompad.py b/readfrompad.py
index e48abaa..5e59807 100644
--- a/readfrompad.py
+++ b/readfrompad.py
@@ -2,11 +2,38 @@ from urllib.request import urlopen
 import markdown, html5lib
 
-sample_text = """
+def parse_pad (pad_text):
+    html = markdown.markdown(pad_text)
+    t = html5lib.parseFragment(html, namespaceHTMLElements=False)
+    # create a "database" of paragraphs associated with each URL given in an H1
+    paragraphs_by_header = {}
+    current_header = None
+    for elt in t:
+        if elt.tag == "h1" and elt.text is not None:
+            # print (("HEADER"), elt.text)
+            current_header = elt.text.strip()
+        elif elt.tag == "p":
+            if current_header:
+                if current_header not in paragraphs_by_header:
+                    paragraphs_by_header[current_header] = []
+                paragraphs_by_header[current_header].append(elt)
+    return paragraphs_by_header
+
+def curl (url):
+    with urlopen(url) as f:
+        return f.read().decode('utf-8')
+
+
+if __name__== "__main__":
+    # THIS CODE ONLY HAPPENS WHEN YOU RUN THE SCRIPT DIRECTLY
+    pad_url = "https://pad.xpub.nl/p/boring_old_tomato_sandwiches"
+    pad_text_url = pad_url + "/export/txt"
+    pad_text = curl(pad_text_url)
+
+    sample_text = """
 # https://hub.xpub.nl/bootleglibrary/book/374
 
 This is an annotation of Mrs. Gersande's Binding Index
 
-
 # https://hub.xpub.nl/bootleglibrary/book/348
 
 Telegraph Telephone Teletype
@@ -16,39 +43,7 @@ More TEXT HERERERE!!!!!!!!!!!!!!
 Stuff now
 """
 
-pad_url = "https://pad.xpub.nl/p/boring_old_tomato_sandwiches"
-pad_text_url = pad_url + "/export/txt"
-f = urlopen(pad_text_url)
-
-pad_text = f.read().decode('utf-8')
-pad_text = sample_text
-print (pad_text)
-print ()
-
-# print (pad_text)
-# Turn pad text into html text
-html = markdown.markdown(pad_text)
-print (html)
-print ()
-
-# Turn html text in an elementtree
-t = html5lib.parseFragment(html, namespaceHTMLElements=False)
-print (t)
-
-# create a "database" of paragraphs associated with each URL given in an H1
-paragraphs_by_header = {}
-current_header = None
-
-for elt in t:
-    if elt.tag == "h1" and elt.text is not None:
-        print (("HEADER"), elt.text)
-        current_header = elt.text.strip()
-    elif elt.tag == "p":
-        if current_header:
-            if current_header not in paragraphs_by_header:
-                paragraphs_by_header[current_header] = []
-            paragraphs_by_header[current_header].append(elt)
-
-from pprint import pprint
-pprint(paragraphs_by_header)
+    paragraphs_by_header = parse_pad(pad_text)
+    from pprint import pprint
+    pprint(paragraphs_by_header)
 