From 5806ed6e90d93d7b4871f950957b39cc97674a90 Mon Sep 17 00:00:00 2001 From: Michael Murtaugh Date: Tue, 14 Apr 2020 12:48:35 +0200 Subject: [PATCH] use markdown + html5lib to structure the text --- readfrompad.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/readfrompad.py b/readfrompad.py index 8158413..f4e803e 100644 --- a/readfrompad.py +++ b/readfrompad.py @@ -1,11 +1,14 @@ from urllib.request import urlopen +import markdown, html5lib pad_url = "https://pad.xpub.nl/p/boring_old_tomato_sandwiches" pad_text_url = pad_url + "/export/txt" - f = urlopen(pad_text_url) pad_text = f.read().decode('utf-8') - -print (pad_text) - +# print (pad_text) +html = markdown.markdown(pad_text) +t = html5lib.parseFragment(html, namespaceHTMLElements=False) +print (t) +for elt in t: + print (elt)