# bootleg/readfrompad.py

import logging
from urllib.request import urlopen

import html5lib
import markdown


def parse_pad(pad_text):
    """Group the paragraphs of a Markdown pad under the H1 they follow.

    The pad text is rendered from Markdown to HTML and parsed into a
    fragment tree. Only the top-level elements of that fragment are
    examined: each ``<h1>`` starts a new section (typically a URL), and
    every following ``<p>`` is filed under the most recent heading.

    Args:
        pad_text: Markdown source of the pad, as a string.

    Returns:
        A dict mapping each heading's stripped text to the list of ``<p>``
        element objects (from the html5lib parse tree) that follow it, in
        document order. Paragraphs that appear before the first heading,
        or after a heading with no text, are left out.
    """
    logging.getLogger(__name__).debug("parse %s", pad_text)
    html = markdown.markdown(pad_text)
    # namespaceHTMLElements=False keeps tags as plain "h1"/"p" instead of
    # namespace-qualified names, so the comparisons below work.
    t = html5lib.parseFragment(html, namespaceHTMLElements=False)
    # create a "database" of paragraphs associated with each URL given in an H1
    paragraphs_by_header = {}
    current_header = None
    for elt in t:
        if elt.tag == "h1":
            # Collect the text of the whole heading, not just elt.text: a
            # heading written as a link (e.g. "# <https://...>") keeps its
            # text inside a child <a>, leaving elt.text as None. Ignoring
            # such a heading would file its paragraphs under the previous
            # one. An empty result is falsy and closes the current section.
            current_header = "".join(elt.itertext()).strip()
        elif elt.tag == "p" and current_header:
            paragraphs_by_header.setdefault(current_header, []).append(elt)
    return paragraphs_by_header

def curl(url):
    """Fetch *url* and return the response body decoded as UTF-8 text.

    Args:
        url: Any URL (or Request object) accepted by ``urlopen``.

    Returns:
        The full response body as a ``str``.

    Raises:
        urllib.error.URLError: If the resource cannot be retrieved.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    # The context manager closes the underlying connection even when
    # reading or decoding fails, instead of leaving it to the garbage
    # collector.
    with urlopen(url) as response:
        return response.read().decode('utf-8')


if __name__ == "__main__":
    # Demo entry point: this branch runs only when the file is executed
    # directly, never when it is imported as a module.
    from pprint import pprint

    # Small hand-written pad in the same "# URL" + paragraphs layout.
    # It is not used below; presumably kept as an offline sample that can
    # be handed to parse_pad instead of the downloaded text.
    sample_text = """
# https://hub.xpub.nl/bootleglibrary/book/374
This is an annotation of Mrs. Gersande's Binding Index

# https://hub.xpub.nl/bootleglibrary/book/348
Telegraph Telephone Teletype

More TEXT HERERERE!!!!!!!!!!!!!!

# unrelated
Stuff now
"""

    # The plain-text version of the pad is requested from <pad_url>/export/txt.
    pad_url = "https://pad.xpub.nl/p/boring_old_tomato_sandwiches"
    pad_text = curl(pad_url + "/export/txt")

    # Show the header -> paragraphs mapping built from the live pad.
    pprint(parse_pad(pad_text))