You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
SI13/LIQUID/MANIFESTO/nltk-frequency-distribution...

265 lines
6.5 KiB
Plaintext

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# NLTK - Frequency Distribution"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Reference: [NLTK Book, chapter 1 - Frequency Distributions](https://www.nltk.org/book/ch01.html#frequency-distributions)"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import nltk\n",
"import random"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"LIQUID BODIES ARE POLITICAL AGENTS. THEY RE-DEFINE THE BOUNDARIES AND CONDITIONS FOR EXISTENCE IN THE CONTEXT OF DYNAMIC, UNRULY ENVIRONMENTS. THEY PROPOSE ALTERNATIVE MODES OF LIVING THAT ARE RADICALLY TRANSFORMED, MONSTROUS, COHERENT, RAW AND SELECTIVELY PERMEATED BY THEIR NURTURING MEDIA.\n",
"\n"
]
}
],
"source": [
"# Read the manifesto line by line; the context manager closes the file handle\n",
"# as soon as the lines have been loaded.\n",
"with open('manifesto.txt') as manifesto_file:\n",
"    lines = manifesto_file.readlines()\n",
"\n",
"# Pick one line at random as the sample sentence.\n",
"# The choice is unseeded, so a different line may be selected on every run.\n",
"sentence = random.choice(lines)\n",
"print(sentence)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Tokens"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['LIQUID', 'BODIES', 'ARE', 'POLITICAL', 'AGENTS', '.', 'THEY', 'RE-DEFINE', 'THE', 'BOUNDARIES', 'AND', 'CONDITIONS', 'FOR', 'EXISTENCE', 'IN', 'THE', 'CONTEXT', 'OF', 'DYNAMIC', ',', 'UNRULY', 'ENVIRONMENTS', '.', 'THEY', 'PROPOSE', 'ALTERNATIVE', 'MODES', 'OF', 'LIVING', 'THAT', 'ARE', 'RADICALLY', 'TRANSFORMED', ',', 'MONSTROUS', ',', 'COHERENT', ',', 'RAW', '', 'AND', 'SELECTIVELY', 'PERMEATED', 'BY', 'THEIR', 'NURTURING', 'MEDIA', '.']\n"
]
}
],
"source": [
"# Break the sampled sentence into word and punctuation tokens.\n",
"tokens = nltk.word_tokenize(sentence)\n",
"print(tokens)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Frequency Distribution"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<FreqDist with 29 samples and 295 outcomes>\n"
]
}
],
"source": [
"# Character-level distribution: iterating over a string yields single\n",
"# characters, so every character of the sentence is counted\n",
"# (spaces and the trailing newline included).\n",
"fd = nltk.FreqDist(sentence)\n",
"print(fd)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[(' ', 40), ('E', 33), ('T', 22), ('N', 22), ('I', 21), ('R', 19), ('O', 17), ('A', 17), ('D', 13), ('S', 12), ('L', 10), ('C', 8), ('U', 7), ('H', 7), ('Y', 7), ('M', 7), ('F', 5), ('P', 4), (',', 4), ('V', 4), ('B', 3), ('G', 3), ('.', 3), ('X', 2), ('Q', 1), ('-', 1), ('W', 1), ('', 1), ('\\n', 1)]\n"
]
}
],
"source": [
"# most_common(50) returns at most 50 (sample, count) pairs, highest count first.\n",
"most_frequent = fd.most_common(50)\n",
"print(most_frequent)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<FreqDist with 38 samples and 48 outcomes>\n"
]
}
],
"source": [
"# Word-level distribution: count how often each token of the sentence occurs.\n",
"# Punctuation marks are tokens too, so they are counted as well.\n",
"fd = nltk.FreqDist(tokens)\n",
"print(fd)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[(',', 4), ('.', 3), ('ARE', 2), ('THEY', 2), ('THE', 2), ('AND', 2), ('OF', 2), ('LIQUID', 1), ('BODIES', 1), ('POLITICAL', 1), ('AGENTS', 1), ('RE-DEFINE', 1), ('BOUNDARIES', 1), ('CONDITIONS', 1), ('FOR', 1), ('EXISTENCE', 1), ('IN', 1), ('CONTEXT', 1), ('DYNAMIC', 1), ('UNRULY', 1), ('ENVIRONMENTS', 1), ('PROPOSE', 1), ('ALTERNATIVE', 1), ('MODES', 1), ('LIVING', 1), ('THAT', 1), ('RADICALLY', 1), ('TRANSFORMED', 1), ('MONSTROUS', 1), ('COHERENT', 1), ('RAW', 1), ('', 1), ('SELECTIVELY', 1), ('PERMEATED', 1), ('BY', 1), ('THEIR', 1), ('NURTURING', 1), ('MEDIA', 1)]\n"
]
}
],
"source": [
"# most_common(50) returns at most 50 (sample, count) pairs, highest count first.\n",
"most_frequent = fd.most_common(50)\n",
"print(most_frequent)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<FreqDist with 660 samples and 1463 outcomes>\n"
]
}
],
"source": [
"# Frequency of every token in the whole text.\n",
"# The context manager closes the file handle once the text has been read.\n",
"with open('manifesto.txt') as manifesto_file:\n",
"    txt = manifesto_file.read()\n",
"\n",
"# Tokenize the full text and count each token; this replaces the\n",
"# sentence-level `tokens` and `fd` defined in earlier cells.\n",
"tokens = nltk.word_tokenize(txt)\n",
"fd = nltk.FreqDist(tokens)\n",
"print(fd)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[(',', 100), ('.', 69), ('OF', 68), ('THE', 57), ('AND', 50), ('LIQUID', 25), ('A', 25), ('ARE', 19), ('THAT', 19), ('TO', 18), ('ITS', 17), ('IN', 15), ('THEY', 15), ('IS', 14), ('THEIR', 14), ('LIFE', 11), ('', 11), ('BODIES', 11), ('IT', 10), ('AS', 9), ('WHICH', 9), ('THESE', 9), ('', 8), ('', 8), ('OUR', 7), ('THROUGH', 7), ('MATTER', 7), ('NOT', 6), ('CAN', 6), ('INTO', 6), ('FROM', 6), ('WITH', 6), ('BEING', 6), ('LIKE', 5), ('ON', 5), ('AN', 4), ('OR', 4), ('WE', 4), ('LIVING', 4), ('BE', 4), ('METABOLIC', 4), ('CHEMICAL', 4), ('FOR', 4), ('OWN', 4), ('US', 4), ('ACTS', 4), ('REALM', 3), ('YET', 3), ('THEM', 3), ('RECOGNISE', 3)]\n"
]
}
],
"source": [
"# most_common(50) returns at most 50 (sample, count) pairs, highest count first.\n",
"most_frequent = fd.most_common(50)\n",
"print(most_frequent)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"25\n"
]
}
],
"source": [
"# Requesting the frequency of a specific word.\n",
"# FreqDist keys are case-sensitive and the manifesto tokens are upper-case,\n",
"# so the lookup uses the upper-case form; a key that never occurred yields 0\n",
"# instead of raising an error.\n",
"print(fd['LIQUID'])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}