You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
451 lines
9.1 KiB
Plaintext
451 lines
9.1 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Scrape System"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"This is how I scraped my website to build a database, so that I can derive more relationships between the entries and make a better search box."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"```\n",
|
|
"[\n",
|
|
"{\n",
|
|
" \"Word#\": \"3\",\n",
|
|
" \"title\": \"Property\",\n",
|
|
" \"properties\": [\n",
|
|
" \"word\",\n",
|
|
" \"proposition\",\n",
|
|
" \"logic\"\n",
|
|
" ],\n",
|
|
" \"voices\": [\n",
|
|
" {\n",
|
|
" \"voice\": \"⤷ An attribute, characteristic, or quality\",\n",
|
|
" \"link\": \"link\"\n",
|
|
" },\n",
|
|
" {\n",
|
|
" \"voice\": \"⤷ From etymology the word comes from propert\",\n",
|
|
" \"link\":\"link\"\n",
|
|
" }\n",
|
|
" ]\n",
|
|
"}\n",
|
|
"]\n",
|
|
"```"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 2,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"from bs4 import BeautifulSoup\n",
|
|
"import json"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 3,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"with open(\"2.0/index.html\") as file_in:\n",
|
|
" soup = BeautifulSoup(file_in, 'html.parser')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {
|
|
"jupyter": {
|
|
"outputs_hidden": true
|
|
},
|
|
"tags": []
|
|
},
|
|
"source": [
|
|
"TITLE"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 7,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"title = soup.find(id=\"title\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {
|
|
"tags": []
|
|
},
|
|
"source": [
|
|
"WORDS"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 13,
|
|
"metadata": {
|
|
"collapsed": true,
|
|
"jupyter": {
|
|
"outputs_hidden": true
|
|
},
|
|
"tags": []
|
|
},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[<a href=\"https://issue.xpub.nl/13/ATATA/\">⤴</a>]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[<a href=\"https://files.cargocollective.com/c1032387/publicacion-independiente-PRINT.pdf\">⤴</a>]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[<a href=\"http://postgrowth.art/\">⤴</a>]\n",
|
|
"[]\n",
|
|
"[<a href=\"https://tdingsun.github.io/reading-machines/\">⤴</a>]\n",
|
|
"[<a href=\"https://en.wikipedia.org/wiki/Teleology\">⤴</a>]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[<a href=\"https://soulellis.com/writing/urgentcraft2/\">⤴</a>]\n",
|
|
"[<a href=\"https://soulellis.com/work/urgentcraft/index.html\">⤴</a>]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[<a href=\"https://docs.google.com/spreadsheets/d/1UkgIsDpFMuA0_hvqf5f4ytPTKX4wfPp8ByRs3Uymvag/edit#gid=0\">⤴</a>]\n",
|
|
"[<a href=\"https://networkcultures.org/digitalpublishing/\">⤴</a>]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[<a href=\"https://odotoo.com/risograph/\">⤴</a>]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[<a href=\"https://constantvzw.org/wefts/webpublications.en.html\">⤴</a>]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[<a href=\"https://files.cargocollective.com/c1032387/publicacion-independiente-PRINT.pdf#page=12\">⤴</a>]\n",
|
|
"[<a href=\"https://files.cargocollective.com/c1032387/publicacion-independiente-PRINT.pdf#page=12\">⤴</a>]\n",
|
|
"[<a href=\"https://files.cargocollective.com/c1032387/publicacion-independiente-PRINT.pdf#page=12\">⤴</a>]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[<a href=\"https://daringfireball.net/projects/markdown/syntax#overview\">⤴</a>]\n",
|
|
"[]\n",
|
|
"[<a href=\"https://hub.xpub.nl/soupboat/pad/p/camilo_glossary\">⤴</a>]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[<a href=\"https://ia800509.us.archive.org/15/items/p-dpa_booklet/p-dpa_booklet.pdf\">⤴</a>]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[<a href=\"https://thenewbridgeproject.com/product/why-publish-noise-miekal-and/\">⤴</a>]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[<a href=\"https://www.livingbooksaboutlife.org/\">⤴</a>]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n",
|
|
"[]\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"glossary_bag = [] \n",
|
|
"word_no = 1\n",
|
|
"words = soup.find_all('div',attrs={'class':'word'})\n",
|
|
"\n",
|
|
"for word in words:\n",
|
|
" \n",
|
|
" title = word.find('h1').text\n",
|
|
"\n",
|
|
" voices = word.find_all('p')\n",
|
|
" \n",
|
|
" links = word.find_all('a')\n",
|
|
" \n",
|
|
" properties = word.get('class')\n",
|
|
" \n",
|
|
" li_voices = []\n",
|
|
" \n",
|
|
" for voice in voices:\n",
|
|
" links = voice.find_all('a')\n",
|
|
" print(links)\n",
|
|
" sentence = {}\n",
|
|
" sentence[\"voice\"]= voice.text.replace(\"⤴\",\"\")\n",
|
|
" if len(links) > 0:\n",
|
|
" sentence[\"link\"]= []\n",
|
|
" \n",
|
|
" for link in links:\n",
|
|
" url = link.get('href')\n",
|
|
" sentence[\"link\"].append(url)\n",
|
|
" \n",
|
|
" li_voices.append(sentence)\n",
|
|
" \n",
|
|
" for link in links:\n",
|
|
" url = link.get('href')\n",
|
|
" li_links.append(url)\n",
|
|
" \n",
|
|
" word = {\n",
|
|
" 'Word#': str(word_no), \n",
|
|
" 'title': title, \n",
|
|
" 'properties': properties,\n",
|
|
" 'voices': li_voices,\n",
|
|
" }\n",
|
|
" \n",
|
|
" glossary_bag.append(word)\n",
|
|
" \n",
|
|
" word_no += 1\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 159,
|
|
"metadata": {
|
|
"tags": []
|
|
},
|
|
"outputs": [],
|
|
"source": []
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 14,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"with open('glossary.json', 'w+', encoding='utf-8') as f:\n",
|
|
" json.dump(glossary_bag, f, indent=5, ensure_ascii=False)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 140,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"<_io.TextIOWrapper name='glossary.json' mode='w' encoding='utf-8'>"
|
|
]
|
|
},
|
|
"execution_count": 140,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": []
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": []
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.7.3"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 4
|
|
}
|