federated-publishing-protot.../xmpp-channels/jargon.py

#!/usr/bin/env python3
import os, requests, urllib, subprocess
from bs4 import BeautifulSoup

def pandoc(src, fro="markdown", to="html5"):
    """Convert the text *src* by piping it through the ``pandoc`` program.

    Args:
        src: Text to convert; it is sent to pandoc's stdin as UTF-8.
        fro: Value passed to pandoc's ``--from`` option.
        to: Value passed to pandoc's ``--to`` option.

    Returns:
        Whatever pandoc wrote to stdout, decoded as UTF-8. pandoc's stderr
        and exit status are captured but not inspected.
    """
    command = ["pandoc", "--from", fro, "--to", to, "--section-divs"]
    completed = subprocess.run(
        command,
        input=src.encode("utf-8"),
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    return completed.stdout.decode("utf-8")


def read_jargon_dictionary(url):
	"""Download a markdown jargon file and return its definitions as a dict.

	The document at *url* is fetched, decoded as UTF-8, converted from
	markdown to HTML with ``pandoc`` and parsed with BeautifulSoup. Every
	``<section class="level1">`` element in the result becomes one entry.

	Args:
		url: Address of the markdown text to download.

	Returns:
		A dict keyed by the upper-cased ``<h1>`` text of each section. Each
		value is ``{'name': <heading text>, 'text': <definition text>}``,
		where the definition is the text of the first ``<p>`` found in the
		section, or of the first ``<ul>`` when there is no ``<p>``, or ``''``
		when the section contains neither. Sections without an ``<h1>`` are
		skipped.
	"""
	# Imported explicitly: a bare ``import urllib`` does not load the
	# ``urllib.request`` submodule, so relying on it only works when some
	# other module happens to have imported it already.
	import urllib.request

	# The context manager closes the HTTP response once the body is read.
	with urllib.request.urlopen(url) as response:
		markdown_text = response.read().decode("utf-8")

	html = pandoc(markdown_text) # convert from markdown to html
	soup = BeautifulSoup(html, features="html5lib")

	defs = {} # maps WORD (upper case) -> {'name': ..., 'text': ...}
	for section in soup.find_all('section', {"class": "level1"}):
		heading = section.find("h1")
		if heading is None:
			# No heading means no key to store the entry under.
			continue
		word = heading.text # e.g. TCP

		# Prefer the first paragraph; fall back to the first list.
		definition = section.find("p")
		if definition is None:
			definition = section.find("ul")
		# A heading with no paragraph or list still gets an (empty) entry
		# instead of crashing on ``None.text``.
		text = definition.text if definition is not None else ''

		defs[word.upper()] = {'name': word, 'text': text}

	return defs

if __name__ == "__main__":
	# Runs only when this file is executed directly as a script,
	# NOT when it is imported as a module.
	from pprint import pprint

	url = "https://pad.xpub.nl/p/jargon-file.test/export/txt"
	defs = read_jargon_dictionary(url)

	# Dump the whole dictionary under a banner line.
	print ("OUR JARGON DICTIONARY")
	pprint(defs)

# cmd = '#TCP'
# query = cmd.replace('#','')
# for header in soup('h1'):
#         if query in header.string:
#                 print(header.string)
#                 print('---')

# cmd = '#UDP'
# query = cmd.replace('#','')
# for header in soup('h1'):
#         if query in header.string:
#                 print(header.string)
#                 print('---')