
Scrap System

This is how I scraped my website to build a database, so that I can later add more relationships between entries and build a better search box. The target structure of the glossary looks roughly like this:

glossary = {
    'title': f'{title}',
    'properties': {},
    'words': {
        'reflection': {
            'voices': ['', '', ''],
            'properties': ['', '', ''],
        },
    },
    'version': '0.1',
}
In [10]:
from bs4 import BeautifulSoup
import json
In [6]:
with open("2.0/index.html") as file_in:
    soup = BeautifulSoup(file_in, 'html.parser')
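For context, here is a minimal, assumed example of the markup the cells below expect: a #title element, plus one div with class "word" per entry holding an h1 title, p voices and a links, with its CSS classes used as properties. The tags and file layout are guesses based on the parsing code, not a copy of the real index.html.

In [ ]:
sample_html = """
<div id="title">Glossary</div>
<div class="word reflection">
    <h1>example</h1>
    <p>first voice</p>
    <p>second voice</p>
    <a href="https://example.org">source</a>
</div>
"""
# Parsed the same way as the real file:
sample_soup = BeautifulSoup(sample_html, 'html.parser')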

TITLE

In [7]:
title = soup.find(id="title")

WORDS

In [8]:
glossary_bag = []
word_no = 1
words = soup.find_all('div', attrs={'class': 'word'})

for word in words:
    # Each entry is a <div class="word"> holding an <h1> title,
    # <p> voices, <a> links, and its CSS classes as properties.
    title = word.find('h1').text
    voices = word.find_all('p')
    links = word.find_all('a')
    properties = word.get('class')

    li_voices = []
    li_links = []

    for voice in voices:
        li_voices.append(voice.text)

    for link in links:
        url = link.get('href')
        li_links.append(url)

    entry = {
        'Word#': str(word_no),
        'title': title,
        'properties': properties,
        'voices': li_voices,
        'links': li_links
    }

    glossary_bag.append(entry)
    word_no += 1
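With markup like the assumed sample above, a single record in glossary_bag would look something like this (the values are illustrative, not taken from the real site):

{
    'Word#': '1',
    'title': 'example',
    'properties': ['word', 'reflection'],
    'voices': ['first voice', 'second voice'],
    'links': ['https://example.org']
}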
In [9]:
with open('glossary.json', 'w+', encoding='utf-8') as f:
    json.dump(glossary_bag, f, indent=5, ensure_ascii=False)
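Since the point of the database is a better search box, here is a minimal sketch of how glossary.json could be queried once it is written. The matching rule (case-insensitive substring match over titles and voices) is my assumption, not part of the original notebook.

In [ ]:
with open('glossary.json', encoding='utf-8') as f:
    glossary = json.load(f)

def search(query):
    # Assumed rule: case-insensitive substring match on title or any voice.
    q = query.lower()
    return [w for w in glossary
            if q in w['title'].lower()
            or any(q in v.lower() for v in w['voices'])]

search('reflection')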