You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

265 lines
6.0 KiB
Plaintext

{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# NLTK - Frequency Distribution"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Reference: NLTK book, chapter 1, section on frequency distributions: https://www.nltk.org/book/ch01.html#frequency-distributions"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import nltk\n",
"import random"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The symbols of computer control languages inevitably do have semantic connotations simply because there exist no symbols with which humans would not associate some meaning.\n",
"\n"
]
}
],
"source": [
"# Read the text as a list of lines. The `with` block closes the file handle\n",
"# as soon as reading is done, and the encoding is named explicitly so the\n",
"# result does not depend on the OS default\n",
"# (assumes txt/language.txt is UTF-8 -- TODO confirm).\n",
"with open('txt/language.txt', encoding='utf-8') as text_file:\n",
"    lines = text_file.readlines()\n",
"\n",
"# Pick one line at random. No seed is set, so a re-run may pick another line.\n",
"# The line keeps its trailing newline, hence the blank line after the print.\n",
"sentence = random.choice(lines)\n",
"print(sentence)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Tokens"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['The', 'symbols', 'of', 'computer', 'control', 'languages', 'inevitably', 'do', 'have', 'semantic', 'connotations', 'simply', 'because', 'there', 'exist', 'no', 'symbols', 'with', 'which', 'humans', 'would', 'not', 'associate', 'some', 'meaning', '.']\n"
]
}
],
"source": [
"# Split the chosen sentence into a list of tokens.\n",
"# Punctuation such as the final '.' becomes a token of its own.\n",
"tokens = nltk.word_tokenize(sentence)\n",
"print(tokens)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Frequency Distribution"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<FreqDist with 26 samples and 173 outcomes>\n"
]
}
],
"source": [
"# Character frequencies: FreqDist is given the raw string, so every single\n",
"# character is counted, including spaces and the trailing newline.\n",
"fd = nltk.FreqDist(sentence)\n",
"print(fd)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[(' ', 24), ('o', 15), ('e', 14), ('s', 14), ('n', 12), ('t', 11), ('a', 11), ('i', 10), ('m', 8), ('h', 7), ('l', 7), ('c', 7), ('u', 5), ('y', 4), ('b', 4), ('r', 3), ('g', 3), ('w', 3), ('p', 2), ('v', 2), ('d', 2), ('T', 1), ('f', 1), ('x', 1), ('.', 1), ('\\n', 1)]\n"
]
}
],
"source": [
"# List up to 50 (character, count) pairs, most frequent first.\n",
"print(fd.most_common(50))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<FreqDist with 25 samples and 26 outcomes>\n"
]
}
],
"source": [
"# Word frequencies: FreqDist is given the token list this time, so each\n",
"# distinct token (words and punctuation alike) is one sample.\n",
"fd = nltk.FreqDist(tokens)\n",
"print(fd)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[('symbols', 2), ('The', 1), ('of', 1), ('computer', 1), ('control', 1), ('languages', 1), ('inevitably', 1), ('do', 1), ('have', 1), ('semantic', 1), ('connotations', 1), ('simply', 1), ('because', 1), ('there', 1), ('exist', 1), ('no', 1), ('with', 1), ('which', 1), ('humans', 1), ('would', 1), ('not', 1), ('associate', 1), ('some', 1), ('meaning', 1), ('.', 1)]\n"
]
}
],
"source": [
"# List up to 50 (token, count) pairs, most frequent first.\n",
"print(fd.most_common(50))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<FreqDist with 944 samples and 2835 outcomes>\n"
]
}
],
"source": [
"# frequency of a text\n",
"# Read the whole file inside a `with` block so the handle is closed right\n",
"# after reading, and name the encoding instead of relying on the OS default\n",
"# (assumes txt/language.txt is UTF-8 -- TODO confirm).\n",
"with open('txt/language.txt', encoding='utf-8') as text_file:\n",
"    txt = text_file.read()\n",
"\n",
"# NOTE: `tokens` and `fd` are rebound here; from this cell on they describe\n",
"# the whole text, no longer the single sentence from the cells above.\n",
"tokens = nltk.word_tokenize(txt)\n",
"fd = nltk.FreqDist(tokens)\n",
"print(fd)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[(',', 172), ('.', 93), ('the', 88), ('of', 88), ('”', 66), ('“', 65), ('and', 61), ('a', 61), ('is', 58), ('languages', 54), ('in', 51), ('language', 47), ('to', 41), ('as', 37), ('computer', 32), ('that', 29), ('programming', 25), ('control', 23), ('are', 22), ('for', 21), ('', 21), ('The', 18), ('can', 17), ('be', 16), ('it', 16), ('machine', 16), ('human', 15), ('not', 15), ('software', 14), ('formal', 14), ('or', 14), ('symbols', 14), ('s', 12), ('with', 12), (':', 11), ('its', 11), ('this', 11), ('common', 11), ('their', 10), ('example', 9), (';', 9), ('operations', 9), ('such', 9), ('from', 8), ('through', 8), ('code', 8), ('since', 7), ('different', 7), ('In', 7), ('like', 7)]\n"
]
}
],
"source": [
"# The 50 most frequent tokens of the whole text, most frequent first.\n",
"# Punctuation and differently-cased words ('the' / 'The') count separately.\n",
"print(fd.most_common(50))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"47\n"
]
}
],
"source": [
"# Look up the count of one specific token by indexing the distribution.\n",
"# The match is exact: 'language' and 'languages' are separate entries.\n",
"print(fd['language'])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}