from urllib.request import urlopen
import markdown, html5lib

def parse_pad(pad_text):
    print("parse", pad_text)
    html = markdown.markdown(pad_text)
    t = html5lib.parseFragment(html, namespaceHTMLElements=False)
    # create a "database" of paragraphs associated with each URL given in an H1
    paragraphs_by_header = {}
    current_header = None
    for elt in t:
        if elt.tag == "h1" and elt.text is not None:
            # print (("HEADER"), elt.text)
            current_header = elt.text.strip()
        elif elt.tag == "p":
            if current_header:
                if current_header not in paragraphs_by_header:
                    paragraphs_by_header[current_header] = []
                paragraphs_by_header[current_header].append(elt)
    return paragraphs_by_header

def curl(url):
    # fetch a URL and return its contents as text
    return urlopen(url).read().decode('utf-8')

if __name__ == "__main__":
    # THIS CODE ONLY HAPPENS WHEN YOU RUN THE SCRIPT DIRECTLY
    pad_url = "https://pad.xpub.nl/p/boring_old_tomato_sandwiches"
    pad_text_url = pad_url + "/export/txt"
    pad_text = curl(pad_text_url)

    # offline sample of the pad format (not used below, but handy for testing
    # parse_pad without network access)
    sample_text = """
# https://hub.xpub.nl/bootleglibrary/book/374

This is an annotation of Mrs. Gersande's Binding Index

# https://hub.xpub.nl/bootleglibrary/book/348

Telegraph
Telephone
Teletype

More TEXT HERERERE!!!!!!!!!!!!!!

# unrelated

Stuff now
"""

    paragraphs_by_header = parse_pad(pad_text)

    from pprint import pprint
    pprint(paragraphs_by_header)
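
    # Illustrative sketch (not part of the original script): the dictionary values
    # are ElementTree <p> elements (assuming html5lib's default "etree" treebuilder),
    # so itertext() can recover their plain text for friendlier display than pprint's
    # Element repr.
    for header, paragraphs in paragraphs_by_header.items():
        print(header)
        for p in paragraphs:
            print("   ", "".join(p.itertext()))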