You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
6.8 KiB
6.8 KiB
Scrap System¶
This is how I scraped my website to build a database, so that I can derive more relationships between entries and build a better search box.
[
{
"Word#": "3",
"title": "Property",
"properties": [
"word",
"proposition",
"logic"
],
"voices": [
{
"voice": "⤷ An attribute, characteristic, or quality",
"link": "link"
},
{
"voice": "⤷ From etymology the word comes from propert",
"link":"link"
}
]
}
]
In [94]:
from bs4 import BeautifulSoup import json
In [95]:
# Parse the exported page into a BeautifulSoup tree with the stdlib HTML parser.
# The encoding is passed explicitly because the page is expected to contain
# non-ASCII glyphs (the "⤴" arrow is stripped from the text later on) and the
# platform default encoding is not guaranteed to be UTF-8.
# NOTE(review): assumes 2.0/index.html is saved as UTF-8 - confirm.
with open("2.0/index.html", encoding="utf-8") as file_in:
    soup = BeautifulSoup(file_in, "html.parser")
Define My Glossary Bag¶
In [96]:
# Top-level container for everything exported to glossary.json; the cells below
# append the page title, the property legend and the parsed words to it.
glossary_bag: list = []
TITLE
In [97]:
# Page heading: the text content of the element whose id attribute is "title".
title = soup.find(id="title").get_text()
PROPERTIES¶
In [98]:
# (title, symbol) pairs for every known word property, in the order they are
# listed in master_properties.
# NOTE(review): "proposition" and "tool" share the symbol "T" - confirm this is intended.
_PROPERTY_SYMBOLS = (
    ("action", "A"),
    ("situation", "S"),
    ("logic", "C"),
    ("proposition", "T"),
    ("hyperlink", "N"),
    ("process", "P"),
    ("language", "G"),
    ("agent", "E"),
    ("tool", "T"),
    ("form", "Y"),
)

# Legend of word properties. Each entry's colour is the CSS custom property
# named after its title, i.e. "var(--<title>-color)".
master_properties = [
    {"title": name, "symbol": symbol, "color": f"var(--{name}-color)"}
    for name, symbol in _PROPERTY_SYMBOLS
]
In [99]:
# Wrap the page title and the property legend each in a single-item list of
# one dict, then add both sections to the export container in that order.
title1 = [{"title": title}]
properties = [{"properties": master_properties}]
glossary_bag.extend([title1, properties])
WORDS
In [100]:
def _word_properties(css_classes, properties_by_title):
    """Return the legend records for the CSS classes that name a known property.

    Classes with no entry in *properties_by_title* are skipped. Each match
    yields a fresh dict with the keys "title", "symbol" and "color".
    """
    return [
        {
            "title": css_class,
            "symbol": properties_by_title[css_class]["symbol"],
            "color": properties_by_title[css_class]["color"],
        }
        for css_class in css_classes
        if css_class in properties_by_title
    ]


def _word_voices(paragraphs):
    """Return one record per paragraph: its text and, if present, its links.

    "voice" is the paragraph text with every "⤴" removed. "link" is only set
    when the paragraph contains at least one <a>, and then holds the href of
    each anchor (None for an anchor without an href).
    """
    voices = []
    for paragraph in paragraphs:
        record = {"voice": paragraph.text.replace("⤴", "")}
        hrefs = [anchor.get("href") for anchor in paragraph.find_all("a")]
        if hrefs:
            record["link"] = hrefs
        voices.append(record)
    return voices


# Index the legend by title so each CSS class is resolved with one lookup.
# Titles in master_properties are assumed to be unique.
_properties_by_title = {entry["title"]: entry for entry in master_properties}

# Build one glossary entry per <div class="word">, numbered from 1 in document
# order. Loop-local names are kept distinct from the page-level `title` and
# `properties` so those earlier values are not overwritten.
glossary = []
for word_no, word_div in enumerate(
    soup.find_all("div", attrs={"class": "word"}), start=1
):
    glossary.append(
        {
            "Word#": str(word_no),
            "title": word_div.find("h1").text,
            "properties": _word_properties(
                word_div.get("class"), _properties_by_title
            ),
            "voices": _word_voices(word_div.find_all("p")),
        }
    )

# Same wrapping as the other sections: a single-item list holding one dict.
words = [{"words": glossary}]
glossary_bag.append(words)
In [101]:
# Serialise the whole container as UTF-8 JSON, indented by 5 spaces, keeping
# non-ASCII characters as-is instead of \u escapes.
with open('glossary.json', 'w+', encoding='utf-8') as out_file:
    out_file.write(json.dumps(glossary_bag, indent=5, ensure_ascii=False))
In [ ]:
# Notebook-only: evaluating the bare name echoes the assembled structure as the cell output.
glossary_bag