#!/usr/bin/env python3
"""Build a jargon dictionary from a markdown document fetched over HTTP.

The document is downloaded, converted from markdown to HTML with the
``pandoc`` command-line tool, and every top-level section (an ``<h1>``
heading followed by its first paragraph or list) becomes one entry.
"""

# NOTE: ``os`` and ``requests`` are not used by the code in this file; they
# are kept so that anything relying on them being imported keeps working.
import os
import subprocess
import urllib.request

import requests
from bs4 import BeautifulSoup


def pandoc(src, fro="markdown", to="html5"):
    """Convert *src* between markup formats by piping it through ``pandoc``.

    Args:
        src: The document text to convert.
        fro: Value passed to pandoc's ``--from`` option.
        to: Value passed to pandoc's ``--to`` option.

    Returns:
        The converted document as a string (pandoc's stdout decoded as UTF-8).

    Raises:
        FileNotFoundError: If the ``pandoc`` executable cannot be found.
        subprocess.CalledProcessError: If pandoc exits with a non-zero
            status; the exception carries pandoc's stderr output.
    """
    # --section-divs is required by read_jargon_dictionary(), which looks
    # for the <section class="level1"> wrappers this option produces.
    result = subprocess.run(
        ["pandoc", "--from", fro, "--to", to, "--section-divs"],
        input=src.encode("utf-8"),
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        check=True,  # fail loudly instead of silently returning ""
    )
    return result.stdout.decode("utf-8")


def read_jargon_dictionary(url):
    """Download a markdown jargon file and return its definitions.

    Args:
        url: Address of the markdown document to download.

    Returns:
        A dict keyed by the upper-cased heading text. Each value is a dict
        with ``'name'`` (the heading text as written) and ``'text'`` (the
        text of the section's first ``<p>``, or of its first ``<ul>`` when
        the section has no paragraph). Sections that lack an ``<h1>`` or
        have neither a paragraph nor a list are skipped.

    Raises:
        urllib.error.URLError: If the document cannot be fetched.
        subprocess.CalledProcessError: If the pandoc conversion fails.
    """
    # Close the HTTP response deterministically once the body is read.
    with urllib.request.urlopen(url) as response:
        markdown_text = response.read().decode("utf-8")

    html = pandoc(markdown_text)  # markdown -> html5
    soup = BeautifulSoup(html, features="html5lib")

    defs = {}
    for section in soup.find_all("section", {"class": "level1"}):
        heading = section.find("h1")
        if heading is None:
            continue

        # Prefer the first paragraph; fall back to the first bullet list.
        definition = section.find("p") or section.find("ul")
        if definition is None:
            continue

        word = heading.text
        defs[word.upper()] = {"name": word, "text": definition.text}
    return defs


def main():
    """Fetch the test jargon file and pretty-print the resulting dictionary."""
    from pprint import pprint

    url = "https://pad.xpub.nl/p/jargon-file.test/export/txt"
    defs = read_jargon_dictionary(url)
    print("OUR JARGON DICTIONARY")
    pprint(defs)


if __name__ == "__main__":
    # Runs only when the file is executed as a script, not when imported.
    main()