{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Scrap System"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
" This is how I scrapped my website to make a database and then perform more relationships and make a better shearch box."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"```\n",
"[\n",
"{\n",
" \"Word#\": \"3\",\n",
" \"title\": \"Property\",\n",
" \"properties\": [\n",
" \"word\",\n",
" \"proposition\",\n",
" \"logic\"\n",
" ],\n",
" \"voices\": [\n",
" {\n",
" \"voice\": \"⤷ An attribute, characteristic, or quality\",\n",
" \"link\": \"link\"\n",
" },\n",
" {\n",
" \"voice\": \"⤷ From etymology the word comes from propert\",\n",
" \"link\":\"link\"\n",
" }\n",
" ]\n",
"}\n",
"]\n",
"```"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"from bs4 import BeautifulSoup\n",
"import json"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# Parse the site's HTML once; every later cell queries this `soup` object.\n",
"with open(\"2.0/index.html\") as file_in:\n",
"    soup = BeautifulSoup(file_in, 'html.parser')"
]
},
{
"cell_type": "markdown",
"metadata": {
"jupyter": {
"outputs_hidden": true
},
"tags": []
},
"source": [
"TITLE"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"# Grab the page's #title element.\n",
"# NOTE(review): this `title` is never used afterwards and is shadowed by\n",
"# `title = word.find('h1').text` inside the scraping loop below — consider\n",
"# renaming it (e.g. page_title) or removing this cell.\n",
"title = soup.find(id=\"title\")"
]
},
{
"cell_type": "markdown",
"metadata": {
"tags": []
},
"source": [
"WORDS"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": true,
"jupyter": {
"outputs_hidden": true
},
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[⤴]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[⤴]\n",
"[]\n",
"[]\n",
"[]\n",
"[⤴]\n",
"[]\n",
"[⤴]\n",
"[⤴]\n",
"[]\n",
"[]\n",
"[⤴]\n",
"[⤴]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[⤴]\n",
"[⤴]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[⤴]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[⤴]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[⤴]\n",
"[⤴]\n",
"[⤴]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[⤴]\n",
"[]\n",
"[⤴]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[⤴]\n",
"[]\n",
"[]\n",
"[]\n",
"[⤴]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[⤴]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n",
"[]\n"
]
}
],
"source": [
"# Build glossary_bag: one record per <div class=\"word\"> in the parsed page.\n",
"# Record shape: {'Word#', 'title', 'properties', 'voices'} — matches the JSON\n",
"# example shown at the top of the notebook.\n",
"glossary_bag = []\n",
"words = soup.find_all('div', attrs={'class': 'word'})\n",
"\n",
"# enumerate replaces the manual word_no counter (started at 1).\n",
"for word_no, word in enumerate(words, start=1):\n",
"    # The <h1> holds the word itself; the div's CSS classes act as its tags.\n",
"    word_title = word.find('h1').text\n",
"    properties = word.get('class')\n",
"\n",
"    # Each <p> is one definition (a \"voice\"); collect its text plus any hrefs.\n",
"    li_voices = []\n",
"    for voice in word.find_all('p'):\n",
"        sentence = {\"voice\": voice.text.replace(\"⤴\", \"\")}\n",
"        links = voice.find_all('a')\n",
"        if len(links) > 0:\n",
"            # Only add a \"link\" key when the voice actually contains anchors,\n",
"            # mirroring the original conditional behavior.\n",
"            sentence[\"link\"] = [link.get('href') for link in links]\n",
"        li_voices.append(sentence)\n",
"\n",
"    # Fixed: the original also appended hrefs to an undefined `li_links`\n",
"    # (NameError on a fresh kernel) and its result was never used — removed.\n",
"    # Also removed the debug print(links) that flooded the cell output.\n",
"    glossary_bag.append({\n",
"        'Word#': str(word_no),\n",
"        'title': word_title,\n",
"        'properties': properties,\n",
"        'voices': li_voices,\n",
"    })"
]
},
{
"cell_type": "code",
"execution_count": 159,
"metadata": {
"tags": []
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"# Persist the scraped records. Mode 'w' (the original 'w+' requested read\n",
"# access that was never used); ensure_ascii=False keeps the ⤷/⤴ glyphs\n",
"# human-readable in the output file.\n",
"with open('glossary.json', 'w', encoding='utf-8') as f:\n",
"    json.dump(glossary_bag, f, indent=5, ensure_ascii=False)"
]
},
{
"cell_type": "code",
"execution_count": 140,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<_io.TextIOWrapper name='glossary.json' mode='w' encoding='utf-8'>"
]
},
"execution_count": 140,
"metadata": {},
"output_type": "execute_result"
}
],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}