You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
3.9 KiB
3.9 KiB
Scrap System¶
This is how I scraped my website to build a database, so that I could then model more relationships and build a better search box.
glossary = {
    'title': f'{title}',
    'properties': {
    },
    'words': {
        'reflection': {
            'voices': ['', '', ''],
            'properties': ['', '', ''],
        },
    },
    'version': '0.1',
}
In [10]:
# Stdlib first, then third-party (PEP 8 import grouping).
import json

from bs4 import BeautifulSoup
In [6]:
with open("2.0/index.html") as file_in: soup = BeautifulSoup(file_in, 'html.parser')
TITLE
In [7]:
# First element in the document whose id attribute is "title"
# (attrs-dict form is equivalent to the id= keyword shortcut).
title = soup.find(attrs={"id": "title"})
WORDS
In [8]:
# Build one record per <div class="word">: its <h1> heading, its CSS
# classes ("properties"), the text of its <p> children ("voices"), and
# the href of every <a> inside it.
glossary_bag = []
words = soup.find_all('div', attrs={'class': 'word'})
# enumerate replaces the original hand-rolled word_no counter; starting
# at 1 keeps the 'Word#' values identical to before.
for word_no, word in enumerate(words, start=1):
    entry = {
        'Word#': str(word_no),
        # NOTE: the original assigned this to the module-level name
        # `title`, silently clobbering the page title found in the
        # earlier cell; kept local here instead.
        'title': word.find('h1').text,
        'properties': word.get('class'),
        'voices': [voice.text for voice in word.find_all('p')],
        'links': [link.get('href') for link in word.find_all('a')],
    }
    glossary_bag.append(entry)
In [159]:
In [9]:
# Persist the scraped records; ensure_ascii=False keeps accented /
# non-Latin characters human-readable in the JSON file.
with open('glossary.json', 'w+', encoding='utf-8') as handle:
    serialized = json.dumps(glossary_bag, indent=5, ensure_ascii=False)
    handle.write(serialized)
In [140]:
Out[140]:
<_io.TextIOWrapper name='glossary.json' mode='w' encoding='utf-8'>
In [ ]: