{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Scrape System"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This is how I scraped my website to build a database, so I can then model more relationships and build a better search box."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Each glossary entry is exported with this shape:\n",
    "```\n",
    "[\n",
    "{\n",
    " \"Word#\": \"3\",\n",
    " \"title\": \"Property\",\n",
    " \"properties\": [\n",
    " \"word\",\n",
    " \"proposition\",\n",
    " \"logic\"\n",
    " ],\n",
    " \"voices\": [\n",
    " {\n",
    " \"voice\": \"⤷ An attribute, characteristic, or quality\",\n",
    " \"link\": \"link\"\n",
    " },\n",
    " {\n",
    " \"voice\": \"⤷ From etymology the word comes from propert\",\n",
    " \"link\": \"link\"\n",
    " }\n",
    " ]\n",
    "}\n",
    "]\n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from bs4 import BeautifulSoup\n",
    "import json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Parse the site's main page once; all scraping below works off this soup.\n",
    "with open(\"2.0/index.html\") as file_in:\n",
    "    soup = BeautifulSoup(file_in, 'html.parser')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Title"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Page-level title element (kept for reference; not used by the export below).\n",
    "page_title = soup.find(id=\"title\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Words"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build one record per <div class=\"word\">:\n",
    "#   Word#      - 1-based position of the word on the page (as a string)\n",
    "#   title      - the word's <h1> text\n",
    "#   properties - the div's full class list (includes 'word' itself)\n",
    "#   voices     - one entry per <p>; the \\u2934 glyph is stripped from the text\n",
    "#                and any hrefs inside the paragraph are collected under 'link'\n",
    "#                (the 'link' key is only present when the paragraph has links)\n",
    "glossary_bag = []\n",
    "words = soup.find_all('div', attrs={'class': 'word'})\n",
    "\n",
    "for word_no, word in enumerate(words, start=1):\n",
    "    title = word.find('h1').text\n",
    "    properties = word.get('class')\n",
    "\n",
    "    li_voices = []\n",
    "    for voice in word.find_all('p'):\n",
    "        sentence = {\"voice\": voice.text.replace(\"⤴\", \"\")}\n",
    "        links = voice.find_all('a')\n",
    "        if links:\n",
    "            sentence[\"link\"] = [link.get('href') for link in links]\n",
    "        li_voices.append(sentence)\n",
    "\n",
    "    glossary_bag.append({\n",
    "        'Word#': str(word_no),\n",
    "        'title': title,\n",
    "        'properties': properties,\n",
    "        'voices': li_voices,\n",
    "    })"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Export the scraped glossary; ensure_ascii=False keeps the ⤷ glyphs readable.\n",
    "with open('glossary.json', 'w', encoding='utf-8') as f:\n",
    "    json.dump(glossary_bag, f, indent=5, ensure_ascii=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}