{
"cells": [
{
"cell_type": "code",
"execution_count": 179,
"id": "3c7f546e-2a12-456d-be2b-a01df62c32f4",
"metadata": {},
"outputs": [],
"source": [
"image = 'think-classify.jpg'\n",
"image2 = 'think-classify2.jpg'\n",
"image3 = 'think-classify3.jpg'\n",
"image4 = 'think-classify4.jpg'\n",
"image5 = 'think-classify5.jpg'\n",
"image6 = 'think-classify6.jpg'\n",
"image7 = 'think-classify7.jpg'\n",
"image8 = 'think-classify8.jpg'\n",
"image9 = 'think-classify9.jpg'\n",
"image10 = 'think-classify10.jpg'\n",
"image11 = 'think-classify11.jpg'\n",
"image12 = 'think-classify12.jpg'\n",
"image13 = 'think-classify13.jpg'\n",
"image14 = 'think-classify14.jpg'\n"
]
},
{
"cell_type": "code",
"execution_count": 198,
"id": "42b71c63-544e-4e39-9182-f5d349eb5389",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[['to', 'meet,', 'but', 'also', 'to', 'collect.', 'are', 'you', 'gathering', 'people', 'or', 'a', 'bulk', 'of', 'leaves', 'in', 'your', 'garden', 'when', 'winter', 'is', 'coming?'], ['to', 'gather', 'friends', 'inside', 'your', 'place,', 'to', 'gather', 'information', 'for', 'the', 'police,', 'to', 'gather', 'food', 'for', 'the', 'homeless,', 'to', 'gather', 'objects', 'inside', 'your', 'bag,', 'to', 'gather', 'your', 'drunk', 'friend,', 'to', 'gather', 'a', 'molotof', 'cocktail', 'for', 'a', 'revolutionary', 'party'], ['it', 'is', 'the', 'opposite', 'of', 'dividing', 'because', 'the', 'basic', 'idea', 'is', 'that', 'you', 'have', 'a', 'lot', 'of', 'objects', 'far', 'away', 'from', 'each', 'other,', 'in', 'different', 'rooms,', 'in', 'different', 'building,', 'in', 'different', 'cities', 'and', 'you', 'want', 'to', 'have', 'all', 'of', 'them', 'on', 'your', 'bed', 'so', 'you', 'just', 'go', 'and', 'pick', 'them', 'up', 'from', 'where', 'they', 'are', 'you', 'put', 'them', 'in', 'a', 'bag', 'and', 'go', 'back', 'home'], ['my', 'favorite', 'activity', 'since', '200000000', 'years,', 'to', 'gather', 'food,', 'to', 'find', 'something', 'and', 'to', 'collect', 'it.', 'since', 'im', 'a', 'racoon', 'i', 'like', 'to', 'gather', 'things', 'around', 'from', 'the', 'street.', 'my', 'flatmate', 'is', 'desperate', 'about', 'it.', 'but', 'i', 'always', 'find', 'nice', 'things:', 'a', 'table,', 'a', 'confy', 'armchair,', 'some', 'baskets,', 'some', 'vases.']]\n"
]
}
],
"source": [
"from nltk.corpus import stopwords\n",
"sw = stopwords.words(\"english\")\n",
"from urllib.request import urlopen\n",
"import json\n",
"\n",
"resultSentences = []\n",
"labels_corpus = []\n",
"\n",
"url = f\"https://hub.xpub.nl/soupboat/generic-labels/get-labels/?image=think-classify7.jpg\"\n",
"response = urlopen(url)\n",
"data_json = json.loads(response.read()) \n",
"\n",
"labels = data_json['labels']\n",
"\n",
"\n",
"for label in labels:\n",
" sent = label['text'].split()\n",
" labels_corpus.append(sent)\n",
" \n",
" \n",
"print(labels_corpus)"
]
},
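{
"cell_type": "code",
"execution_count": null,
"id": "f0a1b2c3-d4e5-4f60-8a71-b2c3d4e5f601",
"metadata": {},
"outputs": [],
"source": [
"# The cell above only fetches the labels for think-classify7.jpg.\n",
"# A minimal sketch of how the whole set of images could be gathered into one corpus,\n",
"# assuming the generic-labels endpoint accepts every filename defined at the top of the notebook\n",
"# (this cell was not part of the original run; 'all_images' and 'full_corpus' are illustrative names).\n",
"all_images = [image, image2, image3, image4, image5, image6, image7,\n",
"              image8, image9, image10, image11, image12, image13, image14]\n",
"\n",
"full_corpus = []\n",
"for name in all_images:\n",
"    with urlopen(f'https://hub.xpub.nl/soupboat/generic-labels/get-labels/?image={name}') as res:\n",
"        for label in json.loads(res.read())['labels']:\n",
"            full_corpus.append(label['text'].split())\n",
"\n",
"print(len(full_corpus), 'label texts collected')"
]
},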
{
"cell_type": "code",
"execution_count": null,
"id": "387f722e-bb19-45cc-a0f9-8a1186febd56",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 199,
"id": "b3d40ae4-cfe8-4346-9d9f-23e2dbc50c0d",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"punctuation = ['.', ',', ';', '(', ')', ':']\n",
"\n",
"# since .split(' ') does not split a word from any punctuation, \n",
"# this function search for any string which last character (word[-1]]) is in the variable 'punctuation';\n",
"# if that is the case, the function will remove the last charachter, else it will leave it as it is.\n",
"def clean_word(word):\n",
" for character in word:\n",
" if word[-1] in punctuation:\n",
" return word[0:-1]\n",
" if word[0] in punctuation:\n",
" return word[1:]\n",
" else:\n",
" return word"
]
},
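{
"cell_type": "code",
"execution_count": null,
"id": "c4d5e6f7-0819-4a2b-9c3d-4e5f60718293",
"metadata": {},
"outputs": [],
"source": [
"# A quick sanity check of clean_word() on a few made-up tokens (not part of the original run).\n",
"# Note that only the characters listed in 'punctuation' are stripped,\n",
"# so 'coming?' keeps its question mark.\n",
"for w in ['place,', '(maybe', 'coming?', 'home']:\n",
"    print(w, '->', clean_word(w))"
]
},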
{
"cell_type": "code",
"execution_count": 200,
"id": "5c37d124-eaa8-4275-9556-473a966fdcb7",
"metadata": {},
"outputs": [],
"source": [
"# The arguments in this functions are 2 texts (text_a and text_b) an index for where the text_a starts and an index for where it ends.\n",
"def bridge(text_a, text_b, start_a, isLast):\n",
" \n",
" matchFound = 0\n",
" start_next = 0\n",
" \n",
" # for index i in text_a from a given index until the end of text_a\n",
" for i in range(start_a, len(text_a)):\n",
" if matchFound:\n",
" break\n",
" \n",
" # we name word_a the index i in text_a\n",
" word_a = text_a[i]\n",
" # if word_a is not in the given list of stopwords:\n",
" if word_a not in sw:\n",
" # for index j in the entire text_b:\n",
" for j in range(0, len(text_b)):\n",
" \n",
" # we name word_b the word with index j in text_b\n",
" word_b = text_b[j]\n",
" \n",
" # if word_a equals to word_b:\n",
" if clean_word(word_a) == clean_word(word_b):\n",
" \n",
" # resultSentences is a list to which the following informations will add up:\n",
" resultSentences.append({\n",
" 'text': text_a,\n",
" 'start': start_a,\n",
" 'end': i,\n",
" 'hasMatch': 1\n",
" })\n",
" \n",
" # if the text in position text_a is the last text to be compared:\n",
" # the same informations as above will be added, except that there will be no index for its end.\n",
" if isLast:\n",
" resultSentences.append({\n",
" 'text': text_b,\n",
" 'start': j,\n",
" 'end': None,\n",
" 'hasMatch': 1\n",
" })\n",
" \n",
" # after the match is found between the 2 texts, the function will break\n",
" matchFound = 1 \n",
" start_next = j\n",
" break\n",
" \n",
" if matchFound == 0:\n",
" resultSentences.append({\n",
" 'text': text_a,\n",
" 'start': start_a,\n",
" 'end': None,\n",
" 'hasMatch': 0\n",
" })\n",
" \n",
" if isLast:\n",
" resultSentences.append({\n",
" 'text': text_b,\n",
" 'start': 0,\n",
" 'end': None,\n",
" 'hasMatch': 0\n",
" })\n",
"\n",
" \n",
" # the function returns the index of the 'same word' in the text_b\n",
" return start_next"
]
},
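{
"cell_type": "code",
"execution_count": null,
"id": "a9b8c7d6-e5f4-4321-8091-a2b3c4d5e6f7",
"metadata": {},
"outputs": [],
"source": [
"# A tiny illustrative run of bridge() on two made-up word lists, to show what gets stored\n",
"# in resultSentences (demo_a and demo_b are invented examples, not part of the corpus).\n",
"# bridge() appends to the shared resultSentences list, so the demo entries are removed again\n",
"# afterwards to leave the real run below untouched.\n",
"before = len(resultSentences)\n",
"\n",
"demo_a = ['we', 'gather', 'leaves']\n",
"demo_b = ['leaves', 'fall', 'slowly']\n",
"bridge(demo_a, demo_b, 0, True)\n",
"\n",
"for entry in resultSentences[before:]:\n",
"    print(entry)\n",
"\n",
"# clean up the demo entries\n",
"del resultSentences[before:]"
]
},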
{
"cell_type": "code",
"execution_count": 201,
"id": "d4862edf-c8cf-44dc-bd41-33549da33c0b",
"metadata": {},
"outputs": [],
"source": [
"def bridge_list(corpus):\n",
" start_a = 0\n",
" result = \"\"\n",
" \n",
" #for all texts indexes within the corpus to be compared:\n",
" for text_index in range(0, len(corpus)-1):\n",
"\n",
" # the last text_a to be compared has to be the text indexed as corpus[-2];\n",
" # the last text_b will then be the last text of the corpus (corpus[-1]).\n",
" isLast = text_index == len(corpus)-2\n",
" # text_a is a given index of the corpus and text_b is the following index\n",
" text_a = corpus[text_index]\n",
" text_b = corpus[text_index + 1]\n",
" \n",
" \n",
" #start_a is the index (in text_b) of the first 'common word' between text_a and text_b;\n",
" #start_a is the starting point to compare a text and its following (in index order within the corpus); \n",
"\n",
" start_next = bridge(text_a, text_b, start_a, isLast)\n",
" start_a = start_next\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 202,
"id": "4363e385-7772-422b-b7b5-7f425e79878d",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"\n",
"def render_sentence(sentence, highlightNext):\n",
" result = ''\n",
" start = 0\n",
" end = len(sentence['text'])\n",
" if(sentence['start']):\n",
" start = sentence['start']\n",
" if(sentence['end']):\n",
" end = sentence['end']\n",
" \n",
" text = sentence['text']\n",
" \n",
" highlight = highlightNext\n",
" \n",
" for index in range(start, end):\n",
" word = text[index]\n",
" \n",
" if(highlight == 1):\n",
" result = result + '<span class=\"highlighit\">' + word + '</span>'\n",
" highlight = 0;\n",
" continue\n",
" else:\n",
" if index == end -1 and sentence['hasMatch']:\n",
" highlight = 1\n",
"\n",
" result = result + \" \" + word\n",
" \n",
" return result, highlight\n",
" \n",
" "
]
},
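{
"cell_type": "code",
"execution_count": null,
"id": "0b1c2d3e-4f50-4617-8283-94a5b6c7d8e9",
"metadata": {},
"outputs": [],
"source": [
"# A small check of render_sentence() on a hypothetical entry (made-up data, not from the corpus):\n",
"# the segment covers the words before index 'end' = 4 and has a match, so the returned flag is 1,\n",
"# meaning the first word of the next rendered segment would be wrapped in a <span>.\n",
"demo_entry = {'text': ['stones', 'gathered', 'in', 'a', 'jar'], 'start': 0, 'end': 4, 'hasMatch': 1}\n",
"html, carry = render_sentence(demo_entry, 0)\n",
"print(html)\n",
"print(carry)"
]
},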
{
"cell_type": "code",
"execution_count": 203,
"id": "d30e2363-37e7-4437-af7a-572eb56d9266",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" to meet, but also to collect. are you gathering people or a bulk of leaves in your garden when winter is coming? to gather friends inside your place, to gather information for the police, to gather food for the homeless, to gather <span class=\"highlighit\">objects</span> far away from each other, in different rooms, in different building, in different cities and you want to have all of them on your bed so you just go and pick them up from where they are you put them in a bag and go back home my favorite activity since 200000000 years, to gather food, to find something and to collect it. since im a racoon i like to gather things around from the street. my flatmate is desperate about it. but i always find nice things: a table, a confy armchair, some baskets, some vases.\n"
]
}
],
"source": [
"bridge_list(labels_corpus)\n",
"\n",
"endResult = ''\n",
"\n",
"highlightNext = 0\n",
"\n",
"for i in range(0, len(resultSentences)):\n",
" sentence = resultSentences[i]\n",
" start = sentence['start']\n",
" end = sentence['end']\n",
" sentenceText = sentence['text']\n",
" \n",
" sentence, highlight = render_sentence(sentence, highlightNext)\n",
" highlightNext = highlight\n",
" \n",
" endResult = endResult + \" \" + sentence\n",
"\n",
"print(endResult)"
]
},
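{
"cell_type": "code",
"execution_count": null,
"id": "1f2e3d4c-5b6a-4798-8190-a1b2c3d4e5f6",
"metadata": {},
"outputs": [],
"source": [
"# A sketch of one way to look at endResult outside the notebook: wrap it in a minimal HTML page\n",
"# and give the span class produced by render_sentence() some styling.\n",
"# The CSS rule and the output filename are assumptions; the notebook itself defines no styling.\n",
"html_page = (\n",
"    '<!DOCTYPE html><html><head><meta charset=\"utf-8\">'\n",
"    '<style>.highlighit { background: yellow; }</style>'\n",
"    '</head><body><p>' + endResult + '</p></body></html>'\n",
")\n",
"\n",
"with open('bridged-labels.html', 'w') as f:\n",
"    f.write(html_page)"
]
},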
{
"cell_type": "code",
"execution_count": null,
"id": "e5e13d80-deef-423d-82f7-be71e69ee902",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "3014a45f-d18f-4fef-b683-45eb50abed03",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "1602a233-c977-4e8c-87f4-133f0792d181",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 5
}