You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

6.7 KiB

Scrap System

This is how I scraped my website to build a database, so that I can then establish more relationships between entries and make a better search box.

[
{
   "Word#": "3",
   "title": "Property",
   "properties": [
        "word",
        "proposition",
        "logic"
     ],
    "voices": [
        {
           "voice": "⤷ An attribute, characteristic, or quality",
           "link": "link"
        },
       {
           "voice": "⤷ From etymology the word comes from propert",
              "link":"link"
       }
    ]
}
]
In [1]:
from bs4 import BeautifulSoup
import json
In [2]:
# Parse the scraped page into a BeautifulSoup tree.
# The encoding is given explicitly so the page is decoded the same way on
# every platform (the default depends on the locale) and matches the UTF-8
# used when the glossary is written back out.
# NOTE(review): assumes index-data.html is UTF-8 encoded — confirm.
with open("index-data.html", encoding="utf-8") as file_in:
    soup = BeautifulSoup(file_in, 'html.parser')

Define My Glossary Bag

In [3]:
# Top-level container that is eventually dumped to glossary.json; the cells
# below append the title, the property definitions and the words to it.
glossary_bag = [] 

TITLE

In [4]:
# Text content of the element whose id is "title".
# NOTE(review): soup.find returns None when nothing matches, in which case
# .text raises AttributeError — confirm the page always has this element.
title = soup.find(id="title").text

PROPERTIES

In [5]:
# Master list of the properties a word can carry. Each entry has a title,
# a one-letter symbol and a CSS color variable named after the title.
# NOTE(review): 'proposition' and 'tool' share the symbol 'T' — confirm
# that this is intended.
master_properties = [
    {'title': name, 'symbol': symbol, 'color': f'var(--{name}-color)'}
    for name, symbol in (
        ('action', 'A'),
        ('situation', 'S'),
        ('logic', 'C'),
        ('proposition', 'T'),
        ('hyperlink', 'N'),
        ('process', 'P'),
        ('language', 'G'),
        ('agent', 'E'),
        ('tool', 'T'),
        ('form', 'Y'),
    )
]
In [6]:
# Wrap the page title and the master property list each in a
# single-element list of one-key dicts, then add both to the bag in order.
title1 = [dict(title=title)]
properties = [dict(properties=master_properties)]

glossary_bag.extend((title1, properties))

WORDS

In [7]:
# Build one glossary entry per <div class="word"> element and append the
# whole collection to glossary_bag.
#
# Loop-local names are kept distinct from the module-level `title` and
# `properties` set by earlier cells, so re-running those cells after this
# one still sees the page title and the master property list.
word_tags = soup.find_all('div', attrs={'class': 'word'})
glossary = []

# Entries are numbered from 1; the number is stored as a string in 'Word#'.
for word_no, word_tag in enumerate(word_tags, start=1):
    # Every CSS class on the element that equals a master property title
    # contributes that property's title, symbol and color. Classes with no
    # matching master property are ignored.
    li_properties = []
    for css_class in word_tag.get('class'):
        li_properties.extend(
            {
                'title': css_class,
                'symbol': m_prop['symbol'],
                'color': m_prop['color'],
            }
            for m_prop in master_properties
            if m_prop['title'] == css_class
        )

    # Each <p> inside the word is one "voice". The "⤴" marker is stripped
    # from its text, and the hrefs of any anchors inside the paragraph are
    # collected under 'link' (the key is omitted when there are none).
    li_voices = []
    for voice in word_tag.find_all('p'):
        sentence = {'voice': voice.text.replace("⤴", "")}
        hrefs = [link.get('href') for link in voice.find_all('a')]
        if hrefs:
            sentence['link'] = hrefs
        li_voices.append(sentence)

    glossary.append({
        'Word#': str(word_no),
        'title': word_tag.find('h1').text,
        'properties': li_properties,
        'voices': li_voices,
    })

words = [{'words': glossary}]

glossary_bag.append(words)
In [8]:
# Write the assembled bag to glossary.json as UTF-8. ensure_ascii=False
# keeps non-ASCII characters as-is instead of \u escapes.
with open('glossary.json', 'w+', encoding='utf-8') as out_file:
    json.dump(glossary_bag, out_file, ensure_ascii=False, indent=5)
In [ ]:
# Bare expression: displays the final structure as the notebook cell's output.
glossary_bag