Permeable-Glossary/Database-making.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Scraping System"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This is how I scraped my website to build a database, which I can then use to define more relationships and make a better search box."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "```\n",
    "glossary = {\n",
    "    'title': f'{title}',\n",
    "    'properties': {\n",
    "    },\n",
    "    'words': {\n",
    "        'reflection': {\n",
    "            'voices': ['','',''],\n",
    "            'properties': ['','',''],\n",
    "        },\n",
    "    },\n",
    "    'version': '0.1',\n",
    "}\n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "from bs4 import BeautifulSoup\n",
    "import json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Decode the page explicitly as UTF-8 instead of the platform's default\n",
    "# encoding, matching the encoding used when glossary.json is written.\n",
    "# NOTE(review): assumes 2.0/index.html is saved as UTF-8 - confirm.\n",
    "with open(\"2.0/index.html\", encoding=\"utf-8\") as file_in:\n",
    "    soup = BeautifulSoup(file_in, 'html.parser')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "jupyter": {
     "outputs_hidden": true
    },
    "tags": []
   },
   "source": [
    "TITLE"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Page-level title: the first element whose id attribute is 'title'.\n",
    "title = soup.find(attrs={\"id\": \"title\"})"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "tags": []
   },
   "source": [
    "WORDS"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Build one JSON-serialisable record per <div class='word'> element.\n",
    "words = soup.find_all('div', attrs={'class': 'word'})\n",
    "glossary_bag = []\n",
    "\n",
    "# Loop-local names are kept distinct from `title` (the page title looked up\n",
    "# in an earlier cell) and from the tag being read, so neither is overwritten.\n",
    "for word_no, word_tag in enumerate(words, start=1):\n",
    "    heading = word_tag.find('h1')\n",
    "    glossary_bag.append({\n",
    "        'Word#': str(word_no),\n",
    "        # A word without an <h1> gets an empty title instead of raising AttributeError.\n",
    "        'title': heading.text if heading is not None else '',\n",
    "        'properties': word_tag.get('class'),  # the div's class attribute value\n",
    "        'voices': [voice.text for voice in word_tag.find_all('p')],\n",
    "        'links': [link.get('href') for link in word_tag.find_all('a')],\n",
    "    })\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 159,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the collected records to glossary.json as UTF-8; ensure_ascii=False\n",
    "# keeps non-ASCII characters as-is instead of escaping them as \\uXXXX.\n",
    "# Plain 'w' is enough here: the file is only written, never read back.\n",
    "with open('glossary.json', 'w', encoding='utf-8') as f:\n",
    "    json.dump(glossary_bag, f, indent=5, ensure_ascii=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 140,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<_io.TextIOWrapper name='glossary.json' mode='w' encoding='utf-8'>"
      ]
     },
     "execution_count": 140,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
BackUp and ready to start 2 years ago			`{`
			`"cells": [`
Scrapping and data based attempts 2 years ago			`{`
			`"cell_type": "markdown",`
			`"metadata": {},`
			`"source": [`
			`"# Scrap System"`
			`]`
			`},`
			`{`
			`"cell_type": "markdown",`
			`"metadata": {},`
			`"source": [`
			`" This is how I scrapped my website to make a database and then perform more relationships and make a better shearch box."`
			`]`
			`},`
			`{`
			`"cell_type": "markdown",`
			`"metadata": {},`
			`"source": [`
			"```\n",
			`"glossary = {\n",`
			`" 'title': f'{title}',\n",`
			`" 'properties': {\n",`
			`" }\n",`
			`" 'words':{\n",`
			`" 'reflection': {\n",`
			`" 'voices': ['','',''],\n",`
			`" 'properties': ['','',''],\n",`
			`" }\n",`
			`" 'version': '0.1',\n",`
			`"}\n",`
			"```"
			`]`
			`},`
BackUp and ready to start 2 years ago			`{`
			`"cell_type": "code",`
Scrapping and data based attempts 2 years ago			`"execution_count": 10,`
BackUp and ready to start 2 years ago			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"from bs4 import BeautifulSoup\n",`
			`"import json"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
Scrapping and data based attempts 2 years ago			`"execution_count": 6,`
BackUp and ready to start 2 years ago			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
Scrapping and data based attempts 2 years ago			`"with open(\"2.0/index.html\") as file_in:\n",`
BackUp and ready to start 2 years ago			`" soup = BeautifulSoup(file_in, 'html.parser')"`
			`]`
			`},`
			`{`
			`"cell_type": "markdown",`
			`"metadata": {`
			`"jupyter": {`
			`"outputs_hidden": true`
			`},`
			`"tags": []`
			`},`
			`"source": [`
			`"TITLE"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
Scrapping and data based attempts 2 years ago			`"execution_count": 7,`
BackUp and ready to start 2 years ago			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"title = soup.find(id=\"title\")"`
			`]`
			`},`
			`{`
			`"cell_type": "markdown",`
			`"metadata": {`
			`"tags": []`
			`},`
			`"source": [`
			`"WORDS"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
Scrapping and data based attempts 2 years ago			`"execution_count": 8,`
BackUp and ready to start 2 years ago			`"metadata": {`
			`"tags": []`
			`},`
			`"outputs": [],`
			`"source": [`
			`"glossary_bag = [] \n",`
			`"word_no = 1\n",`
			`"words = soup.find_all('div',attrs={'class':'word'})\n",`
			`"\n",`
			`"for word in words:\n",`
			`" \n",`
			`" title = word.find('h1').text\n",`
			`" \n",`
			`" voices = word.find_all('p')\n",`
			`" \n",`
			`" links = word.find_all('a')\n",`
			`" \n",`
			`" properties = word.get('class')\n",`
			`" \n",`
			`" li_voices = []\n",`
			`" \n",`
			`" li_links = []\n",`
			`" \n",`
			`" for voice in voices:\n",`
			`" li_voices.append(voice.text)\n",`
			`" \n",`
			`" for link in links:\n",`
			`" url = link.get('href')\n",`
			`" li_links.append(url)\n",`
			`" \n",`
			`" word = {\n",`
			`" 'Word#': str(word_no), \n",`
			`" 'title': title, \n",`
			`" 'properties': properties,\n",`
			`" 'voices': li_voices,\n",`
			`" 'links': li_links\n",`
			`" }\n",`
			`" \n",`
			`" glossary_bag.append(word)\n",`
			`" \n",`
			`" word_no += 1\n"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 159,`
			`"metadata": {`
			`"tags": []`
			`},`
			`"outputs": [],`
			`"source": []`
			`},`
			`{`
			`"cell_type": "code",`
Scrapping and data based attempts 2 years ago			`"execution_count": 9,`
BackUp and ready to start 2 years ago			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`" with open('glossary.json', 'w+', encoding='utf-8') as f:\n",`
			`" json.dump(glossary_bag, f, indent=5, ensure_ascii=False)"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 140,`
			`"metadata": {},`
			`"outputs": [`
			`{`
			`"data": {`
			`"text/plain": [`
			`"<_io.TextIOWrapper name='glossary.json' mode='w' encoding='utf-8'>"`
			`]`
			`},`
			`"execution_count": 140,`
			`"metadata": {},`
			`"output_type": "execute_result"`
			`}`
			`],`
			`"source": []`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": null,`
			`"metadata": {},`
			`"outputs": [],`
			`"source": []`
			`}`
			`],`
			`"metadata": {`
			`"kernelspec": {`
			`"display_name": "Python 3",`
			`"language": "python",`
			`"name": "python3"`
			`},`
			`"language_info": {`
			`"codemirror_mode": {`
			`"name": "ipython",`
			`"version": 3`
			`},`
			`"file_extension": ".py",`
			`"mimetype": "text/x-python",`
			`"name": "python",`
			`"nbconvert_exporter": "python",`
			`"pygments_lexer": "ipython3",`
			`"version": "3.7.3"`
			`}`
			`},`
			`"nbformat": 4,`
			`"nbformat_minor": 4`
Scrapping and data based attempts 2 years ago			`}`