You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

9.1 KiB

Scraping System

This is how I scraped my website to build a database, so that I can then define more relationships between entries and make a better search box.

[
    {
        "Word#": "3",
        "title": "Property",
        "properties": [
            "word",
            "proposition",
            "logic"
        ],
        "voices": [
            {
                "voice": "⤷ An attribute, characteristic, or quality",
                "link": "link"
            },
            {
                "voice": "⤷ From etymology the word comes from propert",
                "link": "link"
            }
        ]
    }
]
In [2]:
from bs4 import BeautifulSoup
import json
In [3]:
# Parse the exported page. The encoding is given explicitly so the non-ASCII
# glyphs in the page (e.g. "⤴") decode the same way on every platform instead
# of depending on the locale's default encoding.
with open("2.0/index.html", encoding="utf-8") as file_in:
    soup = BeautifulSoup(file_in, 'html.parser')

TITLE

In [7]:
# Look up the first element whose id attribute is "title".
title = soup.find(attrs={"id": "title"})

WORDS

In [13]:
# Build one JSON-serialisable record per glossary entry found in the page.
# Each record has the keys 'Word#', 'title', 'properties' and 'voices'.
glossary_bag = []
word_no = 1
words = soup.find_all('div', attrs={'class': 'word'})

for word in words:
    title = word.find('h1').text

    # Every <p> inside the entry becomes one "voice" of the record.
    voices = word.find_all('p')

    # The value of the div's class attribute is stored as the properties.
    properties = word.get('class')

    li_voices = []

    for voice in voices:
        links = voice.find_all('a')
        print(links)

        # Strip the "⤴" glyph from the stored text.
        sentence = {"voice": voice.text.replace("⤴", "")}

        # The "link" key is only present when the voice contains an <a>.
        if links:
            sentence["link"] = [link.get('href') for link in links]

        li_voices.append(sentence)

    # Bound to its own name so the loop variable `word` is not overwritten.
    entry = {
        'Word#': str(word_no),
        'title': title,
        'properties': properties,
        'voices': li_voices,
    }

    glossary_bag.append(entry)

    word_no += 1
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[<a href="https://issue.xpub.nl/13/ATATA/">⤴</a>]
[]
[]
[]
[]
[]
[]
[]
[<a href="https://files.cargocollective.com/c1032387/publicacion-independiente-PRINT.pdf">⤴</a>]
[]
[]
[]
[<a href="http://postgrowth.art/">⤴</a>]
[]
[<a href="https://tdingsun.github.io/reading-machines/">⤴</a>]
[<a href="https://en.wikipedia.org/wiki/Teleology">⤴</a>]
[]
[]
[<a href="https://soulellis.com/writing/urgentcraft2/">⤴</a>]
[<a href="https://soulellis.com/work/urgentcraft/index.html">⤴</a>]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[<a href="https://docs.google.com/spreadsheets/d/1UkgIsDpFMuA0_hvqf5f4ytPTKX4wfPp8ByRs3Uymvag/edit#gid=0">⤴</a>]
[<a href="https://networkcultures.org/digitalpublishing/">⤴</a>]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[<a href="https://odotoo.com/risograph/">⤴</a>]
[]
[]
[]
[]
[]
[]
[<a href="https://constantvzw.org/wefts/webpublications.en.html">⤴</a>]
[]
[]
[]
[]
[]
[<a href="https://files.cargocollective.com/c1032387/publicacion-independiente-PRINT.pdf#page=12">⤴</a>]
[<a href="https://files.cargocollective.com/c1032387/publicacion-independiente-PRINT.pdf#page=12">⤴</a>]
[<a href="https://files.cargocollective.com/c1032387/publicacion-independiente-PRINT.pdf#page=12">⤴</a>]
[]
[]
[]
[]
[]
[]
[]
[<a href="https://daringfireball.net/projects/markdown/syntax#overview">⤴</a>]
[]
[<a href="https://hub.xpub.nl/soupboat/pad/p/camilo_glossary">⤴</a>]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[<a href="https://ia800509.us.archive.org/15/items/p-dpa_booklet/p-dpa_booklet.pdf">⤴</a>]
[]
[]
[]
[<a href="https://thenewbridgeproject.com/product/why-publish-noise-miekal-and/">⤴</a>]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[<a href="https://www.livingbooksaboutlife.org/">⤴</a>]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
In [159]:
 
In [14]:
# Write the collected glossary records to disk as indented UTF-8 JSON.
with open('glossary.json', mode='w+', encoding='utf-8') as json_out:
    # ensure_ascii=False keeps non-ASCII characters as-is instead of \u escapes.
    json.dump(glossary_bag, json_out, ensure_ascii=False, indent=5)
In [140]:
 
Out[140]:
<_io.TextIOWrapper name='glossary.json' mode='w' encoding='utf-8'>
In [ ]: