SI13/LIQUID/MANIFESTO/nltk-frequency-distribution...

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# NLTK - Frequency Distribution"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "https://www.nltk.org/book/ch01.html#frequency-distributions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import nltk\n",
    "import random"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "LIQUID BODIES ARE POLITICAL AGENTS. THEY RE-DEFINE THE BOUNDARIES AND CONDITIONS FOR EXISTENCE IN THE CONTEXT OF DYNAMIC, UNRULY ENVIRONMENTS. THEY PROPOSE ALTERNATIVE MODES OF LIVING THAT ARE RADICALLY TRANSFORMED, MONSTROUS, COHERENT, RAW – AND SELECTIVELY PERMEATED BY THEIR NURTURING MEDIA.\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Read every line of the manifesto. The `with` block closes the file handle\n",
    "# as soon as the lines are in memory instead of leaving it open.\n",
    "with open('manifesto.txt') as manifesto_file:\n",
    "    lines = manifesto_file.readlines()\n",
    "\n",
    "# Pick one line at random; readlines() keeps the line break on each entry,\n",
    "# which is why an empty line follows the printed sentence.\n",
    "# NOTE: no random seed is set, so every run may select a different line.\n",
    "sentence = random.choice(lines)\n",
    "print(sentence)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Tokens"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['LIQUID', 'BODIES', 'ARE', 'POLITICAL', 'AGENTS', '.', 'THEY', 'RE-DEFINE', 'THE', 'BOUNDARIES', 'AND', 'CONDITIONS', 'FOR', 'EXISTENCE', 'IN', 'THE', 'CONTEXT', 'OF', 'DYNAMIC', ',', 'UNRULY', 'ENVIRONMENTS', '.', 'THEY', 'PROPOSE', 'ALTERNATIVE', 'MODES', 'OF', 'LIVING', 'THAT', 'ARE', 'RADICALLY', 'TRANSFORMED', ',', 'MONSTROUS', ',', 'COHERENT', ',', 'RAW', '–', 'AND', 'SELECTIVELY', 'PERMEATED', 'BY', 'THEIR', 'NURTURING', 'MEDIA', '.']\n"
     ]
    }
   ],
   "source": [
    "# Split the chosen sentence into tokens; punctuation marks become\n",
    "# tokens of their own next to the words.\n",
    "tokens = nltk.word_tokenize(text=sentence)\n",
    "print(tokens)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Frequency Distribution"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<FreqDist with 29 samples and 295 outcomes>\n"
     ]
    }
   ],
   "source": [
    "# frequency of characters\n",
    "# A string is iterated character by character, so every character of the\n",
    "# sentence (spaces, punctuation and the line break included) is counted.\n",
    "fd = nltk.FreqDist(samples=sentence)\n",
    "print(fd)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[(' ', 40), ('E', 33), ('T', 22), ('N', 22), ('I', 21), ('R', 19), ('O', 17), ('A', 17), ('D', 13), ('S', 12), ('L', 10), ('C', 8), ('U', 7), ('H', 7), ('Y', 7), ('M', 7), ('F', 5), ('P', 4), (',', 4), ('V', 4), ('B', 3), ('G', 3), ('.', 3), ('X', 2), ('Q', 1), ('-', 1), ('W', 1), ('–', 1), ('\\n', 1)]\n"
     ]
    }
   ],
   "source": [
    "# List up to 50 (character, count) pairs, most frequent first;\n",
    "# a distribution with fewer samples is simply listed in full.\n",
    "top_characters = fd.most_common(50)\n",
    "print(top_characters)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<FreqDist with 38 samples and 48 outcomes>\n"
     ]
    }
   ],
   "source": [
    "# frequency of words\n",
    "# Rebinds `fd`: from here on it counts the tokens of the sentence\n",
    "# (punctuation tokens included) instead of its characters.\n",
    "fd = nltk.FreqDist(samples=tokens)\n",
    "print(fd)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[(',', 4), ('.', 3), ('ARE', 2), ('THEY', 2), ('THE', 2), ('AND', 2), ('OF', 2), ('LIQUID', 1), ('BODIES', 1), ('POLITICAL', 1), ('AGENTS', 1), ('RE-DEFINE', 1), ('BOUNDARIES', 1), ('CONDITIONS', 1), ('FOR', 1), ('EXISTENCE', 1), ('IN', 1), ('CONTEXT', 1), ('DYNAMIC', 1), ('UNRULY', 1), ('ENVIRONMENTS', 1), ('PROPOSE', 1), ('ALTERNATIVE', 1), ('MODES', 1), ('LIVING', 1), ('THAT', 1), ('RADICALLY', 1), ('TRANSFORMED', 1), ('MONSTROUS', 1), ('COHERENT', 1), ('RAW', 1), ('–', 1), ('SELECTIVELY', 1), ('PERMEATED', 1), ('BY', 1), ('THEIR', 1), ('NURTURING', 1), ('MEDIA', 1)]\n"
     ]
    }
   ],
   "source": [
    "# Rank the tokens of the sentence by how often they occur, listing\n",
    "# at most 50 (token, count) pairs with the most frequent first.\n",
    "top_sentence_tokens = fd.most_common(50)\n",
    "print(top_sentence_tokens)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<FreqDist with 660 samples and 1463 outcomes>\n"
     ]
    }
   ],
   "source": [
    "# frequency of a text\n",
    "# Read the whole manifesto as one string; the `with` block closes the\n",
    "# file handle once the content has been read.\n",
    "with open('manifesto.txt') as manifesto_file:\n",
    "    txt = manifesto_file.read()\n",
    "\n",
    "# NOTE: `tokens` and `fd` are rebound here and now describe the full text,\n",
    "# no longer the single sentence used in the cells above.\n",
    "tokens = nltk.word_tokenize(txt)\n",
    "fd = nltk.FreqDist(tokens)\n",
    "print(fd)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[(',', 100), ('.', 69), ('OF', 68), ('THE', 57), ('AND', 50), ('LIQUID', 25), ('A', 25), ('ARE', 19), ('THAT', 19), ('TO', 18), ('ITS', 17), ('IN', 15), ('THEY', 15), ('IS', 14), ('THEIR', 14), ('LIFE', 11), ('’', 11), ('BODIES', 11), ('IT', 10), ('AS', 9), ('WHICH', 9), ('THESE', 9), ('‘', 8), ('–', 8), ('OUR', 7), ('THROUGH', 7), ('MATTER', 7), ('NOT', 6), ('CAN', 6), ('INTO', 6), ('FROM', 6), ('WITH', 6), ('BEING', 6), ('LIKE', 5), ('ON', 5), ('AN', 4), ('OR', 4), ('WE', 4), ('LIVING', 4), ('BE', 4), ('METABOLIC', 4), ('CHEMICAL', 4), ('FOR', 4), ('OWN', 4), ('US', 4), ('ACTS', 4), ('REALM', 3), ('YET', 3), ('THEM', 3), ('RECOGNISE', 3)]\n"
     ]
    }
   ],
   "source": [
    "# Rank the 50 most frequent tokens of the whole manifesto,\n",
    "# as (token, count) pairs with the most frequent first.\n",
    "top_text_tokens = fd.most_common(50)\n",
    "print(top_text_tokens)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "25\n"
     ]
    }
   ],
   "source": [
    "# Requesting the frequency of a specific word.\n",
    "# Lookups are case-sensitive and the tokens listed above are upper-case,\n",
    "# so the key has to be spelled exactly as it occurs in the text.\n",
    "print(fd['LIQUID'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}