Permeable-Glossary/Database-making.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Scrap System"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    " This is how I scrapped my website to make a database and then perform more relationships and make a better shearch box."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "```\n",
    "[\n",
    "{\n",
    "   \"Word#\": \"3\",\n",
    "   \"title\": \"Property\",\n",
    "   \"properties\": [\n",
    "        \"word\",\n",
    "        \"proposition\",\n",
    "        \"logic\"\n",
    "     ],\n",
    "    \"voices\": [\n",
    "        {\n",
    "           \"voice\": \"⤷ An attribute, characteristic, or quality\",\n",
    "           \"link\": \"link\"\n",
    "        },\n",
    "       {\n",
    "           \"voice\": \"⤷ From etymology the word comes from propert\",\n",
    "              \"link\":\"link\"\n",
    "       }\n",
    "    ]\n",
    "}\n",
    "]\n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from bs4 import BeautifulSoup\n",
    "import json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(\"index-data.html\") as file_in:\n",
    "    soup = BeautifulSoup(file_in, 'html.parser')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Define My GLossary Bag"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "glossary_bag = [] "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "jupyter": {
     "outputs_hidden": true
    },
    "tags": []
   },
   "source": [
    "TITLE"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "title = soup.find(id=\"title\").text"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### PROPERTIES"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "master_properties = [\n",
    "    {\n",
    "      'title':'action',\n",
    "      'symbol':'A',\n",
    "      'color': 'var(--action-color)'\n",
    "    },\n",
    "    {\n",
    "      'title':'situation',\n",
    "      'symbol':'S',\n",
    "      'color': 'var(--situation-color)'\n",
    "    },\n",
    "    {\n",
    "      'title':'logic',\n",
    "      'symbol':'C',\n",
    "      'color': 'var(--logic-color)'\n",
    "    },\n",
    "    {\n",
    "      'title':'proposition',\n",
    "      'symbol':'T',\n",
    "      'color': 'var(--proposition-color)'\n",
    "    },\n",
    "    {\n",
    "      'title':'hyperlink',\n",
    "      'symbol':'N',\n",
    "      'color': 'var(--hyperlink-color)'\n",
    "    },\n",
    "    {\n",
    "      'title':'process',\n",
    "      'symbol':'P',\n",
    "      'color': 'var(--process-color)'\n",
    "    },\n",
    "    {\n",
    "      'title':'language',\n",
    "      'symbol':'G',\n",
    "      'color': 'var(--language-color)'\n",
    "    },\n",
    "    {\n",
    "      'title':'agent',\n",
    "      'symbol':'E',\n",
    "      'color': 'var(--agent-color)'\n",
    "    },\n",
    "    {\n",
    "      'title':'tool',\n",
    "      'symbol':'T',\n",
    "      'color': 'var(--tool-color)'\n",
    "    },\n",
    "    {\n",
    "      'title':'form',\n",
    "      'symbol':'Y',\n",
    "      'color': 'var(--form-color)'\n",
    "    }\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "title1 = [ {'title': title } ]\n",
    "properties = [ {'properties' : master_properties } ]\n",
    "\n",
    "glossary_bag.append(title1)\n",
    "glossary_bag.append(properties)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "tags": []
   },
   "source": [
    "WORDS"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "\n",
    "word_no = 1\n",
    "words = soup.find_all('div',attrs={'class':'word'})\n",
    "glossary = []\n",
    "\n",
    "\n",
    "for word in words:\n",
    "    \n",
    "    title = word.find('h1').text\n",
    "\n",
    "    voices = word.find_all('p')\n",
    "    \n",
    "    links = word.find_all('a')\n",
    "    \n",
    "    properties = word.get('class')\n",
    "\n",
    "    li_properties = []\n",
    "\n",
    "    for prop in properties:\n",
    "        title_p = prop\n",
    "        for m_prop in master_properties:\n",
    "            if title_p == m_prop['title']:\n",
    "                symb = m_prop['symbol']\n",
    "                color = m_prop['color']\n",
    "                propert = {}\n",
    "                propert[\"title\"] = title_p\n",
    "                propert[\"symbol\"] = symb\n",
    "                propert[\"color\"] = color\n",
    "\n",
    "                li_properties.append(propert)\n",
    "            \n",
    "    li_voices = []\n",
    "    \n",
    "    for voice in voices:\n",
    "        links = voice.find_all('a')\n",
    "        sentence = {}\n",
    "        sentence[\"voice\"]= voice.text.replace(\"⤴\",\"\")\n",
    "        if len(links) > 0:\n",
    "            sentence[\"link\"]= []\n",
    "        \n",
    "            for link in links:\n",
    "                url = link.get('href')\n",
    "                sentence[\"link\"].append(url)\n",
    "            \n",
    "        li_voices.append(sentence)\n",
    "        \n",
    "    word = {\n",
    "        'Word#': str(word_no), \n",
    "        'title': title, \n",
    "        'properties': li_properties,\n",
    "        'voices': li_voices,\n",
    "    }\n",
    "    \n",
    "    glossary.append(word)\n",
    "    \n",
    "    word_no += 1\n",
    "\n",
    "words = [ { 'words' : glossary } ]\n",
    "\n",
    "glossary_bag.append(words)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('glossary.json', 'w+', encoding='utf-8') as f:\n",
    "        json.dump(glossary_bag, f, indent=5, ensure_ascii=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "glossary_bag"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}