You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
65 lines
2.0 KiB
Python
65 lines
2.0 KiB
Python
#!/usr/bin/env python3
|
|
import os, requests, urllib, subprocess
|
|
from bs4 import BeautifulSoup
|
|
|
|
def pandoc(src, fro="markdown", to="html5"):
    """Convert *src* between markup formats by piping it through pandoc.

    Args:
        src: Text to convert; it is sent to pandoc's stdin encoded as UTF-8.
        fro: Value handed to pandoc's ``--from`` option.
        to: Value handed to pandoc's ``--to`` option.

    Returns:
        pandoc's standard output decoded as UTF-8.  Standard error is
        captured and discarded, and the exit status is not inspected.
    """
    # --section-divs makes pandoc wrap every heading and its content in a
    # <section> element.
    command = ["pandoc", "--from", fro, "--to", to, "--section-divs"]
    completed = subprocess.run(
        command,
        input=src.encode("utf-8"),
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    return completed.stdout.decode("utf-8")
|
|
|
|
|
|
def read_jargon_dictionary(url):
    """Download a markdown document and build a glossary from its headings.

    The document at *url* is fetched, decoded as UTF-8, converted from
    markdown to HTML with ``pandoc`` and parsed with BeautifulSoup.  Every
    ``<section class="level1">`` in the result becomes one glossary entry.

    Args:
        url: Address of the markdown source to download.

    Returns:
        A dict keyed by the upper-cased ``<h1>`` text of each section.  Each
        value is ``{'name': <heading text>, 'text': <definition text>}``,
        where the definition is the text of the section's first ``<p>`` or,
        when it has none, of its first ``<ul>``.  Sections lacking a heading
        or a definition element are skipped.
    """
    # A bare ``import urllib`` does not guarantee that the ``request``
    # submodule is loaded, so bring it in explicitly here.
    from urllib.request import urlopen

    # The context manager closes the HTTP response once the body is read.
    with urlopen(url) as response:
        markdown_source = response.read().decode("utf-8")

    html = pandoc(markdown_source)  # convert from markdown to html
    soup = BeautifulSoup(html, features="html5lib")

    defs = {}
    for section in soup.find_all('section', {"class": "level1"}):
        heading = section.find("h1")
        body = section.find("p")
        if body is None:
            # No paragraph: fall back to a bullet list as the definition.
            body = section.find("ul")
        if heading is None or body is None:
            # Nothing usable to store; skip instead of failing on ``.text``.
            continue
        word = heading.text
        defs[word.upper()] = {'name': word, 'text': body.text}

    return defs
|
|
|
|
if __name__ == "__main__":
    # Executed only when this file is run as a script, NOT when it is
    # imported as a module.
    from pprint import pprint

    url = "https://pad.xpub.nl/p/jargon-file.test/export/txt"
    defs = read_jargon_dictionary(url)

    print("OUR JARGON DICTIONARY")
    pprint(defs)
    # print(defs["UDP"])

    # Earlier experiment, kept for reference (tried with '#TCP' and '#UDP'):
    # look a "#WORD" command up directly among the <h1> headings of a soup.
    # cmd = '#TCP'
    # query = cmd.replace('#','')
    # for header in soup('h1'):
    #     if query in header.string:
    #         print(header.string)
    #         print('---')