{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# NLTK - Part of Speech" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import nltk\n", "import random" ] }, { "cell_type": "code", "execution_count": 49, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "LIQUID BODIES ARE PLURI-PONTENT. THEY ARE CAPABLE OF MANY ACTS OF TRANSFORMATION. THEY DE-SIMPLIFY THE MATTER OF BEING A BODY THROUGH THEIR VISCERAL ENTANGLEMENTS. WHILE THE BÊTE MACHINE DEPENDS ON AN ABSTRACTED UNDERSTANDING OF ANATOMY FOUNDED UPON GENERALIZATIONS AND IDEALS, LIQUID BODIES RESIST THESE TROPES. LIQUID BODIES DISCUSS A MODE OF EXISTENCE THAT IS CONSTANTLY CHANGING – NOT AS THE CUMULATIVE OUTCOMES OF ‘ERROR’ – BUT AS A HIGHLY CHOREOGRAPHED AND CONTINUOUS SPECTRUM STREAM OF EVENTS THAT ARISE FROM THE PHYSICAL INTERACTIONS OF MATTER. THEY INTERNALIZE OTHER BODIES AS MANIFOLDS WITHIN THEIR SUBSTANCE AND ASSERT THEIR IDENTITY THROUGH THEIR ENVIRONMENTAL CONTEXTS. SUCH ENTANGLEMENTS INVOKE MARGINAL RELATIONS BETWEEN MULTIPLE AGENCIES AND EXCEED THE CLASSICAL LOGIC OF OBJECTS. 
THEY ARE INSEPARABLE FROM THEIR CONTEXT AND OFFER WAYS OF THINKING AND EXPERIMENTING WITH THE CONVENTIONS OF MAKING AND BEING EMBODIED.\n", "\n" ] } ], "source": [ "lines = open('manifesto.txt').readlines()\n", "sentence = random.choice(lines)\n", "print(sentence)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Tokens" ] }, { "cell_type": "code", "execution_count": 50, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['LIQUID', 'BODIES', 'ARE', 'PLURI-PONTENT', '.', 'THEY', 'ARE', 'CAPABLE', 'OF', 'MANY', 'ACTS', 'OF', 'TRANSFORMATION', '.', 'THEY', 'DE-SIMPLIFY', 'THE', 'MATTER', 'OF', 'BEING', 'A', 'BODY', 'THROUGH', 'THEIR', 'VISCERAL', 'ENTANGLEMENTS', '.', 'WHILE', 'THE', 'BÊTE', 'MACHINE', 'DEPENDS', 'ON', 'AN', 'ABSTRACTED', 'UNDERSTANDING', 'OF', 'ANATOMY', 'FOUNDED', 'UPON', 'GENERALIZATIONS', 'AND', 'IDEALS', ',', 'LIQUID', 'BODIES', 'RESIST', 'THESE', 'TROPES', '.', 'LIQUID', 'BODIES', 'DISCUSS', 'A', 'MODE', 'OF', 'EXISTENCE', 'THAT', 'IS', 'CONSTANTLY', 'CHANGING', '–', 'NOT', 'AS', 'THE', 'CUMULATIVE', 'OUTCOMES', 'OF', '‘', 'ERROR', '’', '–', 'BUT', 'AS', 'A', 'HIGHLY', 'CHOREOGRAPHED', 'AND', 'CONTINUOUS', 'SPECTRUM', 'STREAM', 'OF', 'EVENTS', 'THAT', 'ARISE', 'FROM', 'THE', 'PHYSICAL', 'INTERACTIONS', 'OF', 'MATTER', '.', 'THEY', 'INTERNALIZE', 'OTHER', 'BODIES', 'AS', 'MANIFOLDS', 'WITHIN', 'THEIR', 'SUBSTANCE', 'AND', 'ASSERT', 'THEIR', 'IDENTITY', 'THROUGH', 'THEIR', 'ENVIRONMENTAL', 'CONTEXTS', '.', 'SUCH', 'ENTANGLEMENTS', 'INVOKE', 'MARGINAL', 'RELATIONS', 'BETWEEN', 'MULTIPLE', 'AGENCIES', 'AND', 'EXCEED', 'THE', 'CLASSICAL', 'LOGIC', 'OF', 'OBJECTS', '.', 'THEY', 'ARE', 'INSEPARABLE', 'FROM', 'THEIR', 'CONTEXT', 'AND', 'OFFER', 'WAYS', 'OF', 'THINKING', 'AND', 'EXPERIMENTING', 'WITH', 'THE', 'CONVENTIONS', 'OF', 'MAKING', 'AND', 'BEING', 'EMBODIED', '.']\n" ] } ], "source": [ "tokens = nltk.word_tokenize(sentence)\n", "print(tokens)" ] }, { "cell_type": "markdown", "metadata": {}, 
"source": [ "## Part of Speech \"tags\"" ] }, { "cell_type": "code", "execution_count": 51, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[('LIQUID', 'JJ'), ('BODIES', 'NNP'), ('ARE', 'NNP'), ('PLURI-PONTENT', 'NNP'), ('.', '.'), ('THEY', 'NNP'), ('ARE', 'NNP'), ('CAPABLE', 'NNP'), ('OF', 'NNP'), ('MANY', 'NNP'), ('ACTS', 'NNP'), ('OF', 'NNP'), ('TRANSFORMATION', 'NNP'), ('.', '.'), ('THEY', 'NNP'), ('DE-SIMPLIFY', 'VBP'), ('THE', 'NNP'), ('MATTER', 'NNP'), ('OF', 'NNP'), ('BEING', 'NNP'), ('A', 'NNP'), ('BODY', 'NNP'), ('THROUGH', 'NNP'), ('THEIR', 'NNP'), ('VISCERAL', 'NNP'), ('ENTANGLEMENTS', 'NNP'), ('.', '.'), ('WHILE', 'IN'), ('THE', 'DT'), ('BÊTE', 'NNP'), ('MACHINE', 'NNP'), ('DEPENDS', 'NNP'), ('ON', 'NNP'), ('AN', 'NNP'), ('ABSTRACTED', 'NNP'), ('UNDERSTANDING', 'NN'), ('OF', 'NNP'), ('ANATOMY', 'NNP'), ('FOUNDED', 'NNP'), ('UPON', 'NNP'), ('GENERALIZATIONS', 'NNP'), ('AND', 'NNP'), ('IDEALS', 'NNP'), (',', ','), ('LIQUID', 'NNP'), ('BODIES', 'NNP'), ('RESIST', 'NNP'), ('THESE', 'NNP'), ('TROPES', 'NNP'), ('.', '.'), ('LIQUID', 'NNP'), ('BODIES', 'NNP'), ('DISCUSS', 'NNP'), ('A', 'NNP'), ('MODE', 'NNP'), ('OF', 'NNP'), ('EXISTENCE', 'NNP'), ('THAT', 'NNP'), ('IS', 'VBZ'), ('CONSTANTLY', 'NNP'), ('CHANGING', 'NNP'), ('–', 'NNP'), ('NOT', 'NNP'), ('AS', 'IN'), ('THE', 'NNP'), ('CUMULATIVE', 'NNP'), ('OUTCOMES', 'NNP'), ('OF', 'NNP'), ('‘', 'NNP'), ('ERROR', 'NNP'), ('’', 'NNP'), ('–', 'NNP'), ('BUT', 'NNP'), ('AS', 'IN'), ('A', 'NNP'), ('HIGHLY', 'NNP'), ('CHOREOGRAPHED', 'NNP'), ('AND', 'NNP'), ('CONTINUOUS', 'NNP'), ('SPECTRUM', 'NNP'), ('STREAM', 'NNP'), ('OF', 'NNP'), ('EVENTS', 'NNP'), ('THAT', 'NNP'), ('ARISE', 'NNP'), ('FROM', 'NNP'), ('THE', 'NNP'), ('PHYSICAL', 'NNP'), ('INTERACTIONS', 'NNP'), ('OF', 'NNP'), ('MATTER', 'NNP'), ('.', '.'), ('THEY', 'NNP'), ('INTERNALIZE', 'NNP'), ('OTHER', 'NNP'), ('BODIES', 'NNP'), ('AS', 'NNP'), ('MANIFOLDS', 'NNP'), ('WITHIN', 'NNP'), ('THEIR', 'NNP'), ('SUBSTANCE', 'NNP'), 
('AND', 'NNP'), ('ASSERT', 'NNP'), ('THEIR', 'NNP'), ('IDENTITY', 'NNP'), ('THROUGH', 'NNP'), ('THEIR', 'NNP'), ('ENVIRONMENTAL', 'NNP'), ('CONTEXTS', 'NNP'), ('.', '.'), ('SUCH', 'JJ'), ('ENTANGLEMENTS', 'NNP'), ('INVOKE', 'NNP'), ('MARGINAL', 'NNP'), ('RELATIONS', 'NNP'), ('BETWEEN', 'NNP'), ('MULTIPLE', 'NNP'), ('AGENCIES', 'NNP'), ('AND', 'NNP'), ('EXCEED', 'NNP'), ('THE', 'NNP'), ('CLASSICAL', 'NNP'), ('LOGIC', 'NNP'), ('OF', 'NNP'), ('OBJECTS', 'NNP'), ('.', '.'), ('THEY', 'NNP'), ('ARE', 'VBP'), ('INSEPARABLE', 'NNP'), ('FROM', 'NNP'), ('THEIR', 'NNP'), ('CONTEXT', 'NNP'), ('AND', 'NNP'), ('OFFER', 'NNP'), ('WAYS', 'NNP'), ('OF', 'NNP'), ('THINKING', 'NNP'), ('AND', 'NNP'), ('EXPERIMENTING', 'NNP'), ('WITH', 'NNP'), ('THE', 'NNP'), ('CONVENTIONS', 'NNP'), ('OF', 'NNP'), ('MAKING', 'NNP'), ('AND', 'NNP'), ('BEING', 'NNP'), ('EMBODIED', 'NNP'), ('.', '.')]\n" ] } ], "source": [ "tagged = nltk.pos_tag(tokens)\n", "print(tagged)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now you could select, for example, all the **nouns** (every word whose tag contains `NN`):" ] }, { "cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['LIQUID', 'BODIES', 'INVITE', 'US', 'TO', 'ARTICULATE', 'THE', 'FUZZINESS', 'PARADOXES', 'AND', 'UNCERTAINTIES', 'THE', 'LIVING', 'REALM', 'THEY', 'ARE', 'STILL', 'INSTANTLY', 'RECOGNIZABLE', 'CAN', 'BE', 'NAMED', 'AS', 'TORNADO', 'CIRRUS', 'SOIL', 'EMBRYO', 'OR', 'BIOFILM', 'THESE', 'CONTRADICTIONS', '–', 'OF', 'FORM', 'AND', 'CONSTANCY', '–', 'ENCOURAGE', 'ALTERNATIVE', 'READINGS', 'OF', 'HOW', 'WE', 'ORDER', 'AND', 'SORT', 'THE', 'WORLD', 'WHOSE', 'MAIN', 'METHODOLOGY', 'IS', 'THROUGH', 'RELATING', 'ONE', 'BODY', 'TO', 'ANOTHER', 'INDEED', 'PROTEAN', 'LIQUID', 'BODIES', 'HELP', 'US', 'UNDERSTAND', 'THAT', 'WHILE', 'UNIVERSALISMS', 'AVERAGES', 'AND', 'GENERALIZATIONS', 'ARE', 'USEFUL', 'IN', 'PRODUCING', 'MAPS', 'OUR', 'BEING', 'IN', 'THE', 'WORLD', 'THEY', 'NEGLECT', 'SPECIFIC', 
'DETAILS', 'WHICH', '‘', 'BRING', 'FORTH', 'THE', 'MATERIALITY', 'THE', 'ENVIRONMENT']\n" ] } ], "source": [ "selection = []\n", "\n", "for word, tag in tagged:\n", " if 'NN' in tag:\n", " selection.append(word)\n", "\n", "print(selection)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Where do these tags come from?" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "> An off-the-shelf tagger is available for English. It uses the Penn Treebank tagset.\n", "\n", "From: http://www.nltk.org/api/nltk.tag.html#module-nltk.tag" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "> NLTK provides documentation for each tag, which can be queried using the tag, e.g. nltk.help.upenn_tagset('RB').\n", "\n", "From: http://www.nltk.org/book_1ed/ch05.html" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "ename": "LookupError", "evalue": "\n**********************************************************************\n Resource \u001b[93mtagsets\u001b[0m not found.\n Please use the NLTK Downloader to obtain the resource:\n\n \u001b[31m>>> import nltk\n >>> nltk.download('tagsets')\n \u001b[0m\n For more information see: https://www.nltk.org/data.html\n\n Attempted to load \u001b[93mhelp/tagsets/PY3/upenn_tagset.pickle\u001b[0m\n\n Searched in:\n - '/home/kendalb/nltk_data'\n - '/usr/nltk_data'\n - '/usr/share/nltk_data'\n - '/usr/lib/nltk_data'\n - '/usr/share/nltk_data'\n - '/usr/local/share/nltk_data'\n - '/usr/lib/nltk_data'\n - '/usr/local/lib/nltk_data'\n - ''\n**********************************************************************\n", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mLookupError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m 
\u001b[0mnltk\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhelp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mupenn_tagset\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'PRP'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[0;32m/usr/local/lib/python3.7/dist-packages/nltk/help.py\u001b[0m in \u001b[0;36mupenn_tagset\u001b[0;34m(tagpattern)\u001b[0m\n\u001b[1;32m 25\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 26\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mupenn_tagset\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtagpattern\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 27\u001b[0;31m \u001b[0m_format_tagset\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"upenn_tagset\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtagpattern\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 28\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 29\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/usr/local/lib/python3.7/dist-packages/nltk/help.py\u001b[0m in \u001b[0;36m_format_tagset\u001b[0;34m(tagset, tagpattern)\u001b[0m\n\u001b[1;32m 44\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 45\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_format_tagset\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtagset\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtagpattern\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 46\u001b[0;31m \u001b[0mtagdict\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mload\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"help/tagsets/\"\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mtagset\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m\".pickle\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 47\u001b[0m 
\u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mtagpattern\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 48\u001b[0m \u001b[0m_print_entries\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msorted\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtagdict\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtagdict\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/usr/local/lib/python3.7/dist-packages/nltk/data.py\u001b[0m in \u001b[0;36mload\u001b[0;34m(resource_url, format, cache, verbose, logic_parser, fstruct_reader, encoding)\u001b[0m\n\u001b[1;32m 750\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 751\u001b[0m \u001b[0;31m# Load the resource.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 752\u001b[0;31m \u001b[0mopened_resource\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_open\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresource_url\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 753\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 754\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mformat\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m\"raw\"\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/usr/local/lib/python3.7/dist-packages/nltk/data.py\u001b[0m in \u001b[0;36m_open\u001b[0;34m(resource_url)\u001b[0m\n\u001b[1;32m 875\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 876\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mprotocol\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0mprotocol\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlower\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m\"nltk\"\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 877\u001b[0;31m \u001b[0;32mreturn\u001b[0m 
\u001b[0mfind\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpath_\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpath\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m\"\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 878\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0mprotocol\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlower\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m\"file\"\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 879\u001b[0m \u001b[0;31m# urllib might not use mode='rb', so handle this one ourselves:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/usr/local/lib/python3.7/dist-packages/nltk/data.py\u001b[0m in \u001b[0;36mfind\u001b[0;34m(resource_name, paths)\u001b[0m\n\u001b[1;32m 583\u001b[0m \u001b[0msep\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"*\"\u001b[0m \u001b[0;34m*\u001b[0m \u001b[0;36m70\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 584\u001b[0m \u001b[0mresource_not_found\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"\\n%s\\n%s\\n%s\\n\"\u001b[0m \u001b[0;34m%\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0msep\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmsg\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msep\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 585\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mLookupError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresource_not_found\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 586\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 587\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mLookupError\u001b[0m: 
\n**********************************************************************\n Resource \u001b[93mtagsets\u001b[0m not found.\n Please use the NLTK Downloader to obtain the resource:\n\n \u001b[31m>>> import nltk\n >>> nltk.download('tagsets')\n \u001b[0m\n For more information see: https://www.nltk.org/data.html\n\n Attempted to load \u001b[93mhelp/tagsets/PY3/upenn_tagset.pickle\u001b[0m\n\n Searched in:\n - '/home/kendalb/nltk_data'\n - '/usr/nltk_data'\n - '/usr/share/nltk_data'\n - '/usr/lib/nltk_data'\n - '/usr/share/nltk_data'\n - '/usr/local/share/nltk_data'\n - '/usr/lib/nltk_data'\n - '/usr/local/lib/nltk_data'\n - ''\n**********************************************************************\n" ] } ], "source": [ "nltk.help.upenn_tagset('PRP')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "------------" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "An alphabetical list of part-of-speech tags used in the Penn Treebank Project ([link](https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html)):\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", 
" \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
\n", "
Number
\n", "
\n", "
Tag
\n", "
\n", "
Description
\n", "
1. CC Coordinating conjunction
2. CD Cardinal number
3. DT Determiner
4. EX Existential there
5. FW Foreign word
6. IN Preposition or subordinating conjunction
7. JJ Adjective
8. JJR Adjective, comparative
9. JJS Adjective, superlative
10. LS List item marker
11. MD Modal
12. NN Noun, singular or mass
13. NNS Noun, plural
14. NNP Proper noun, singular
15. NNPS Proper noun, plural
16. PDT Predeterminer
17. POS Possessive ending
18. PRP Personal pronoun
19. PRP\\$ Possessive pronoun
20. RB Adverb
21. RBR Adverb, comparative
22. RBS Adverb, superlative
23. RP Particle
24. SYM Symbol
25. TO to
26. UH Interjection
27. VB Verb, base form
28. VBD Verb, past tense
29. VBG Verb, gerund or present participle
30. VBN Verb, past participle
31. VBP Verb, non-3rd person singular present
32. VBZ Verb, 3rd person singular present
33. WDT Wh-determiner
34. WP Wh-pronoun
35. WP\\$ Possessive wh-pronoun
36. WRB Wh-adverb \n", "
" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## A telling/tricky case\n", "It's important to realize that POS tagging is not a fixed property of a word -- but depends on the context of each word. The NLTK book gives an example of [homonyms](http://www.nltk.org/book_1ed/ch05.html#using-a-tagger) -- words that are written the same, but are actually pronounced differently and have different meanings depending on their use." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "text = nltk.word_tokenize(\"They refuse to permit us to obtain the refuse permit\")\n", "nltk.pos_tag(text)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "From the book:\n", "\n", "> Notice that refuse and permit both appear as a present tense verb (VBP) and a noun (NN). E.g. refUSE is a verb meaning \"deny,\" while REFuse is a noun meaning \"trash\" (i.e. they are not homophones). Thus, we need to know which word is being used in order to pronounce the text correctly. 
(For this reason, text-to-speech systems usually perform POS-tagging.)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Applying to an entire text" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [], "source": [ "language = open('../txt/language.txt').read()\n", "tokens = nltk.word_tokenize(language)\n", "tagged = nltk.pos_tag(tokens)" ] }, { "cell_type": "code", "execution_count": 25, "metadata": { "collapsed": true, "jupyter": { "outputs_hidden": true } }, "outputs": [ { "data": { "text/plain": [ "[('Language', 'NN'),\n", " ('Florian', 'JJ'),\n", " ('Cramer', 'NNP'),\n", " ('Software', 'NNP'),\n", " ('and', 'CC'),\n", " ('language', 'NN'),\n", " ('are', 'VBP'),\n", " ('intrinsically', 'RB'),\n", " ('related', 'VBN'),\n", " (',', ','),\n", " ('since', 'IN'),\n", " ('software', 'NN'),\n", " ('may', 'MD'),\n", " ('process', 'VB'),\n", " ('language', 'NN'),\n", " (',', ','),\n", " ('and', 'CC'),\n", " ('is', 'VBZ'),\n", " ('constructed', 'VBN'),\n", " ('in', 'IN'),\n", " ('language', 'NN'),\n", " ('.', '.'),\n", " ('Yet', 'CC'),\n", " ('language', 'NN'),\n", " ('means', 'VBZ'),\n", " ('different', 'JJ'),\n", " ('things', 'NNS'),\n", " ('in', 'IN'),\n", " ('the', 'DT'),\n", " ('context', 'NN'),\n", " ('of', 'IN'),\n", " ('computing', 'VBG'),\n", " (':', ':'),\n", " ('formal', 'JJ'),\n", " ('languages', 'NNS'),\n", " ('in', 'IN'),\n", " ('which', 'WDT'),\n", " ('algorithms', 'EX'),\n", " ('are', 'VBP'),\n", " ('expressed', 'VBN'),\n", " ('and', 'CC'),\n", " ('software', 'NN'),\n", " ('is', 'VBZ'),\n", " ('implemented', 'VBN'),\n", " (',', ','),\n", " ('and', 'CC'),\n", " ('in', 'IN'),\n", " ('so-called', 'JJ'),\n", " ('“', 'NNP'),\n", " ('natural', 'JJ'),\n", " ('”', 'NNP'),\n", " ('spoken', 'NN'),\n", " ('languages', 'NNS'),\n", " ('.', '.'),\n", " ('There', 'EX'),\n", " ('are', 'VBP'),\n", " ('at', 'IN'),\n", " ('least', 'JJS'),\n", " ('two', 'CD'),\n", " ('layers', 'NNS'),\n", " ('of', 'IN'),\n", " ('formal', 'JJ'),\n", 
" ('language', 'NN'),\n", " ('in', 'IN'),\n", " ('software', 'NN'),\n", " (':', ':'),\n", " ('programming', 'NN'),\n", " ('language', 'NN'),\n", " ('in', 'IN'),\n", " ('which', 'WDT'),\n", " ('the', 'DT'),\n", " ('software', 'NN'),\n", " ('is', 'VBZ'),\n", " ('written', 'VBN'),\n", " (',', ','),\n", " ('and', 'CC'),\n", " ('the', 'DT'),\n", " ('language', 'NN'),\n", " ('implemented', 'VBD'),\n", " ('within', 'IN'),\n", " ('the', 'DT'),\n", " ('software', 'NN'),\n", " ('as', 'IN'),\n", " ('its', 'PRP$'),\n", " ('symbolic', 'JJ'),\n", " ('controls', 'NNS'),\n", " ('.', '.'),\n", " ('In', 'IN'),\n", " ('the', 'DT'),\n", " ('case', 'NN'),\n", " ('of', 'IN'),\n", " ('compilers', 'NNS'),\n", " (',', ','),\n", " ('shells', 'NNS'),\n", " (',', ','),\n", " ('and', 'CC'),\n", " ('macro', 'NN'),\n", " ('languages', 'NNS'),\n", " (',', ','),\n", " ('for', 'IN'),\n", " ('example', 'NN'),\n", " (',', ','),\n", " ('these', 'DT'),\n", " ('layers', 'NNS'),\n", " ('can', 'MD'),\n", " ('overlap', 'VB'),\n", " ('.', '.'),\n", " ('“', 'VB'),\n", " ('Natural', 'NNP'),\n", " ('”', 'NNP'),\n", " ('language', 'NN'),\n", " ('is', 'VBZ'),\n", " ('what', 'WP'),\n", " ('can', 'MD'),\n", " ('be', 'VB'),\n", " ('processed', 'VBN'),\n", " ('as', 'IN'),\n", " ('data', 'NNS'),\n", " ('by', 'IN'),\n", " ('software', 'NN'),\n", " (';', ':'),\n", " ('since', 'IN'),\n", " ('this', 'DT'),\n", " ('processing', 'NN'),\n", " ('is', 'VBZ'),\n", " ('formal', 'JJ'),\n", " (',', ','),\n", " ('however', 'RB'),\n", " (',', ','),\n", " ('it', 'PRP'),\n", " ('is', 'VBZ'),\n", " ('restricted', 'VBN'),\n", " ('to', 'TO'),\n", " ('syntactical', 'JJ'),\n", " ('operations', 'NNS'),\n", " ('.', '.'),\n", " ('While', 'IN'),\n", " ('differentiation', 'NN'),\n", " ('of', 'IN'),\n", " ('computer', 'NN'),\n", " ('programming', 'VBG'),\n", " ('languages', 'NNS'),\n", " ('as', 'IN'),\n", " ('“', 'JJ'),\n", " ('artificial', 'JJ'),\n", " ('languages', 'NNS'),\n", " ('”', 'VBP'),\n", " ('from', 'IN'),\n", " ('languages', 
'NNS'),\n", " ('like', 'VBP'),\n", " ('English', 'NNP'),\n", " ('as', 'IN'),\n", " ('“', 'NNP'),\n", " ('natural', 'JJ'),\n", " ('languages', 'NNS'),\n", " ('”', 'VBP'),\n", " ('is', 'VBZ'),\n", " ('conceptually', 'RB'),\n", " ('important', 'JJ'),\n", " ('and', 'CC'),\n", " ('undisputed', 'JJ'),\n", " (',', ','),\n", " ('it', 'PRP'),\n", " ('remains', 'VBZ'),\n", " ('problematic', 'JJ'),\n", " ('in', 'IN'),\n", " ('its', 'PRP$'),\n", " ('pure', 'NN'),\n", " ('terminology', 'NN'),\n", " (':', ':'),\n", " ('There', 'EX'),\n", " ('is', 'VBZ'),\n", " ('nothing', 'NN'),\n", " ('“', 'JJ'),\n", " ('natural', 'JJ'),\n", " ('”', 'NN'),\n", " ('about', 'IN'),\n", " ('spoken', 'JJ'),\n", " ('language', 'NN'),\n", " (';', ':'),\n", " ('it', 'PRP'),\n", " ('is', 'VBZ'),\n", " ('a', 'DT'),\n", " ('cultural', 'JJ'),\n", " ('construct', 'NN'),\n", " ('and', 'CC'),\n", " ('thus', 'RB'),\n", " ('just', 'RB'),\n", " ('as', 'IN'),\n", " ('“', 'JJ'),\n", " ('artificial', 'JJ'),\n", " ('”', 'NN'),\n", " ('as', 'IN'),\n", " ('any', 'DT'),\n", " ('formal', 'JJ'),\n", " ('machine', 'NN'),\n", " ('control', 'NN'),\n", " ('language', 'NN'),\n", " ('.', '.'),\n", " ('To', 'TO'),\n", " ('call', 'VB'),\n", " ('programming', 'NN'),\n", " ('languages', 'NNS'),\n", " ('“', 'VBP'),\n", " ('machine', 'NN'),\n", " ('languages', 'NNS'),\n", " ('”', 'VBP'),\n", " ('doesn', 'JJ'),\n", " ('’', 'NNP'),\n", " ('t', 'NN'),\n", " ('solve', 'VBP'),\n", " ('the', 'DT'),\n", " ('problem', 'NN'),\n", " ('either', 'RB'),\n", " (',', ','),\n", " ('as', 'IN'),\n", " ('it', 'PRP'),\n", " ('obscures', 'VBZ'),\n", " ('that', 'IN'),\n", " ('“', 'FW'),\n", " ('machine', 'NN'),\n", " ('languages', 'NNS'),\n", " ('”', 'VBP'),\n", " ('are', 'VBP'),\n", " ('human', 'JJ'),\n", " ('creations', 'NNS'),\n", " ('.', '.'),\n", " ('High-level', 'JJ'),\n", " ('machine-independent', 'JJ'),\n", " ('programming', 'NN'),\n", " ('languages', 'NNS'),\n", " ('such', 'JJ'),\n", " ('as', 'IN'),\n", " ('Fortran', 'NNP'),\n", " (',', 
','),\n", " ('C', 'NNP'),\n", " (',', ','),\n", " ('Java', 'NNP'),\n", " (',', ','),\n", " ('and', 'CC'),\n", " ('Basic', 'NNP'),\n", " ('are', 'VBP'),\n", " ('not', 'RB'),\n", " ('even', 'RB'),\n", " ('direct', 'JJ'),\n", " ('mappings', 'NNS'),\n", " ('of', 'IN'),\n", " ('machine', 'NN'),\n", " ('logic', 'NN'),\n", " ('.', '.'),\n", " ('If', 'IN'),\n", " ('programming', 'JJ'),\n", " ('languages', 'NNS'),\n", " ('are', 'VBP'),\n", " ('human', 'JJ'),\n", " ('languages', 'NNS'),\n", " ('for', 'IN'),\n", " ('machine', 'NN'),\n", " ('control', 'NN'),\n", " (',', ','),\n", " ('they', 'PRP'),\n", " ('could', 'MD'),\n", " ('be', 'VB'),\n", " ('called', 'VBN'),\n", " ('cybernetic', 'JJ'),\n", " ('languages', 'NNS'),\n", " ('.', '.'),\n", " ('But', 'CC'),\n", " ('these', 'DT'),\n", " ('languages', 'NNS'),\n", " ('can', 'MD'),\n", " ('also', 'RB'),\n", " ('be', 'VB'),\n", " ('used', 'VBN'),\n", " ('outside', 'JJ'),\n", " ('machines—in', 'NN'),\n", " ('programming', 'VBG'),\n", " ('handbooks', 'NNS'),\n", " (',', ','),\n", " ('for', 'IN'),\n", " ('example', 'NN'),\n", " (',', ','),\n", " ('in', 'IN'),\n", " ('programmer', 'NN'),\n", " ('’', 'NNP'),\n", " ('s', 'NN'),\n", " ('dinner', 'NN'),\n", " ('table', 'JJ'),\n", " ('jokes', 'NNS'),\n", " (',', ','),\n", " ('or', 'CC'),\n", " ('as', 'IN'),\n", " ('abstract', 'JJ'),\n", " ('formal', 'JJ'),\n", " ('languages', 'NNS'),\n", " ('for', 'IN'),\n", " ('expressing', 'VBG'),\n", " ('logical', 'JJ'),\n", " ('constructs', 'NNS'),\n", " (',', ','),\n", " ('such', 'JJ'),\n", " ('as', 'IN'),\n", " ('in', 'IN'),\n", " ('Hugh', 'NNP'),\n", " ('Kenner', 'NNP'),\n", " ('’', 'NNP'),\n", " ('s', 'NN'),\n", " ('use', 'NN'),\n", " ('of', 'IN'),\n", " ('the', 'DT'),\n", " ('Pascal', 'NNP'),\n", " ('programming', 'NN'),\n", " ('language', 'NN'),\n", " ('to', 'TO'),\n", " ('explain', 'VB'),\n", " ('aspects', 'NNS'),\n", " ('of', 'IN'),\n", " ('the', 'DT'),\n", " ('structure', 'NN'),\n", " ('of', 'IN'),\n", " ('Samuel', 'NNP'),\n", " ('Beckett', 
'NNP'),\n", " ('’', 'NNP'),\n", " ('s', 'VBD'),\n", " ('writing.1', 'NN'),\n", " ('In', 'IN'),\n", " ('this', 'DT'),\n", " ('sense', 'NN'),\n", " (',', ','),\n", " ('computer', 'NN'),\n", " ('control', 'NN'),\n", " ('languages', 'NNS'),\n", " ('could', 'MD'),\n", " ('be', 'VB'),\n", " ('more', 'RBR'),\n", " ('broadly', 'RB'),\n", " ('defined', 'VBN'),\n", " ('as', 'IN'),\n", " ('syntactical', 'JJ'),\n", " ('languages', 'NNS'),\n", " ('as', 'IN'),\n", " ('opposed', 'VBN'),\n", " ('to', 'TO'),\n", " ('semantic', 'JJ'),\n", " ('languages', 'NNS'),\n", " ('.', '.'),\n", " ('But', 'CC'),\n", " ('this', 'DT'),\n", " ('terminology', 'NN'),\n", " ('is', 'VBZ'),\n", " ('not', 'RB'),\n", " ('without', 'IN'),\n", " ('its', 'PRP$'),\n", " ('problems', 'NNS'),\n", " ('either', 'DT'),\n", " ('.', '.'),\n", " ('Common', 'JJ'),\n", " ('languages', 'NNS'),\n", " ('like', 'IN'),\n", " ('English', 'NNP'),\n", " ('are', 'VBP'),\n", " ('both', 'DT'),\n", " ('formal', 'JJ'),\n", " ('and', 'CC'),\n", " ('semantic', 'JJ'),\n", " (';', ':'),\n", " ('although', 'IN'),\n", " ('their', 'PRP$'),\n", " ('scope', 'NN'),\n", " ('extends', 'VBZ'),\n", " ('beyond', 'IN'),\n", " ('the', 'DT'),\n", " ('formal', 'JJ'),\n", " (',', ','),\n", " ('anything', 'NN'),\n", " ('that', 'WDT'),\n", " ('can', 'MD'),\n", " ('be', 'VB'),\n", " ('expressed', 'VBN'),\n", " ('in', 'IN'),\n", " ('a', 'DT'),\n", " ('computer', 'NN'),\n", " ('control', 'NN'),\n", " ('language', 'NN'),\n", " ('can', 'MD'),\n", " ('also', 'RB'),\n", " ('be', 'VB'),\n", " ('expressed', 'VBN'),\n", " ('in', 'IN'),\n", " ('common', 'JJ'),\n", " ('language', 'NN'),\n", " ('.', '.'),\n", " ('It', 'PRP'),\n", " ('follows', 'VBZ'),\n", " ('that', 'IN'),\n", " ('computer', 'NN'),\n", " ('control', 'NN'),\n", " ('languages', 'NNS'),\n", " ('are', 'VBP'),\n", " ('a', 'DT'),\n", " ('formal', 'JJ'),\n", " ('(', '('),\n", " ('and', 'CC'),\n", " ('as', 'IN'),\n", " ('such', 'JJ'),\n", " ('rather', 'RB'),\n", " ('primitive', 'JJ'),\n", " (')', ')'),\n", 
" ('subset', 'NN'),\n", " ('of', 'IN'),\n", " ('common', 'JJ'),\n", " ('human', 'JJ'),\n", " ('languages', 'NNS'),\n", " ('.', '.'),\n", " ('To', 'TO'),\n", " ('complicate', 'VB'),\n", " ('things', 'NNS'),\n", " ('even', 'RB'),\n", " ('further', 'RB'),\n", " (',', ','),\n", " ('computer', 'NN'),\n", " ('science', 'NN'),\n", " ('has', 'VBZ'),\n", " ('its', 'PRP$'),\n", " ('own', 'JJ'),\n", " ('understanding', 'NN'),\n", " ('of', 'IN'),\n", " ('“', 'NNP'),\n", " ('operational', 'JJ'),\n", " ('semantics', 'NNS'),\n", " ('”', 'VBP'),\n", " ('in', 'IN'),\n", " ('programming', 'NN'),\n", " ('languages', 'NNS'),\n", " (',', ','),\n", " ('for', 'IN'),\n", " ('example', 'NN'),\n", " ('in', 'IN'),\n", " ('the', 'DT'),\n", " ('construction', 'NN'),\n", " ('of', 'IN'),\n", " ('a', 'DT'),\n", " ('programming', 'JJ'),\n", " ('language', 'NN'),\n", " ('interpreter', 'NN'),\n", " ('or', 'CC'),\n", " ('compiler', 'NN'),\n", " ('.', '.'),\n", " ('Just', 'RB'),\n", " ('as', 'IN'),\n", " ('this', 'DT'),\n", " ('interpreter', 'NN'),\n", " ('doesn', 'NN'),\n", " ('’', 'NNP'),\n", " ('t', 'NN'),\n", " ('perform', 'NN'),\n", " ('“', 'NNP'),\n", " ('interpretations', 'NNS'),\n", " ('”', 'VBP'),\n", " ('in', 'IN'),\n", " ('a', 'DT'),\n", " ('hermeneutic', 'JJ'),\n", " ('sense', 'NN'),\n", " ('of', 'IN'),\n", " ('semantic', 'JJ'),\n", " ('text', 'NN'),\n", " ('explication', 'NN'),\n", " (',', ','),\n", " ('the', 'DT'),\n", " ('computer', 'NN'),\n", " ('science', 'NN'),\n", " ('notion', 'NN'),\n", " ('of', 'IN'),\n", " ('“', 'JJ'),\n", " ('semantics', 'NNS'),\n", " ('”', 'JJ'),\n", " ('defies', 'NNS'),\n", " ('linguistic', 'JJ'),\n", " ('and', 'CC'),\n", " ('common', 'JJ'),\n", " ('sense', 'NN'),\n", " ('understanding', 'NN'),\n", " ('of', 'IN'),\n", " ('the', 'DT'),\n", " ('word', 'NN'),\n", " (',', ','),\n", " ('since', 'IN'),\n", " ('compiler', 'NN'),\n", " ('construction', 'NN'),\n", " ('is', 'VBZ'),\n", " ('purely', 'RB'),\n", " ('syntactical', 'JJ'),\n", " (',', ','),\n", " ('and', 
'CC'),\n", " ('programming', 'VBG'),\n", " ('languages', 'NNS'),\n", " ('denote', 'VBP'),\n", " ('nothing', 'NN'),\n", " ('but', 'CC'),\n", " ('syntactical', 'JJ'),\n", " ('manipulations', 'NNS'),\n", " ('of', 'IN'),\n", " ('symbols', 'NNS'),\n", " ('.', '.'),\n", " ('What', 'WP'),\n", " ('might', 'MD'),\n", " ('more', 'JJR'),\n", " ('suitably', 'RB'),\n", " ('be', 'VB'),\n", " ('called', 'VBN'),\n", " ('the', 'DT'),\n", " ('semantics', 'NNS'),\n", " ('of', 'IN'),\n", " ('computer', 'NN'),\n", " ('control', 'NN'),\n", " ('languages', 'VBZ'),\n", " ('resides', 'NNS'),\n", " ('in', 'IN'),\n", " ('the', 'DT'),\n", " ('symbols', 'NNS'),\n", " ('with', 'IN'),\n", " ('which', 'WDT'),\n", " ('those', 'DT'),\n", " ('operations', 'NNS'),\n", " ('are', 'VBP'),\n", " ('denoted', 'VBN'),\n", " ('in', 'IN'),\n", " ('most', 'JJS'),\n", " ('programming', 'JJ'),\n", " ('languages', 'NNS'),\n", " (':', ':'),\n", " ('English', 'JJ'),\n", " ('words', 'NNS'),\n", " ('like', 'IN'),\n", " ('“', 'NN'),\n", " ('if', 'IN'),\n", " (',', ','),\n", " ('”', 'FW'),\n", " ('“', 'FW'),\n", " ('then', 'RB'),\n", " (',', ','),\n", " ('”', 'NNP'),\n", " ('“', 'NNP'),\n", " ('else', 'RB'),\n", " (',', ','),\n", " ('”', 'NNP'),\n", " ('“', 'NNP'),\n", " ('for', 'IN'),\n", " (',', ','),\n", " ('”', 'NNP'),\n", " ('“', 'NNP'),\n", " ('while', 'IN'),\n", " (',', ','),\n", " ('”', 'FW'),\n", " ('“', 'NNP'),\n", " ('goto', 'NN'),\n", " (',', ','),\n", " ('”', 'NNP'),\n", " ('and', 'CC'),\n", " ('“', 'NNP'),\n", " ('print', 'NN'),\n", " (',', ','),\n", " ('”', 'NN'),\n", " ('in', 'IN'),\n", " ('conjunction', 'NN'),\n", " ('with', 'IN'),\n", " ('arithmetical', 'JJ'),\n", " ('and', 'CC'),\n", " ('punctuation', 'NN'),\n", " ('symbols', 'NNS'),\n", " (';', ':'),\n", " ('in', 'IN'),\n", " ('alphabetic', 'JJ'),\n", " ('software', 'NN'),\n", " ('controls', 'NNS'),\n", " (',', ','),\n", " ('words', 'NNS'),\n", " ('like', 'IN'),\n", " ('“', 'NNP'),\n", " ('list', 'NN'),\n", " (',', ','),\n", " ('”', 'NNP'),\n", " 
('“', 'NNP'),\n", " ('move', 'NN'),\n", " (',', ','),\n", " ('”', 'NNP'),\n", " ('“', 'NNP'),\n", " ('copy', 'NN'),\n", " (',', ','),\n", " ('”', 'NN'),\n", " ('and', 'CC'),\n", " ('“', 'NNP'),\n", " ('paste', 'NN'),\n", " ('”', 'NN'),\n", " (';', ':'),\n", " ('in', 'IN'),\n", " ('graphical', 'JJ'),\n", " ('software', 'NN'),\n", " ('controls', 'NNS'),\n", " (',', ','),\n", " ('such', 'JJ'),\n", " ('as', 'IN'),\n", " ('symbols', 'NNS'),\n", " ('like', 'IN'),\n", " ('the', 'DT'),\n", " ('trash', 'NN'),\n", " ('can', 'MD'),\n", " ('.', '.'),\n", " ('Ferdinand', 'NNP'),\n", " ('de', 'IN'),\n", " ('Saussure', 'NNP'),\n", " ('states', 'VBZ'),\n", " ('that', 'IN'),\n", " ('the', 'DT'),\n", " ('signs', 'NNS'),\n", " ('of', 'IN'),\n", " ('common', 'JJ'),\n", " ('human', 'JJ'),\n", " ('language', 'NN'),\n", " ('are', 'VBP'),\n", " ('arbitrary2', 'RB'),\n", " ('because', 'IN'),\n", " ('it', 'PRP'),\n", " ('’', 'VBZ'),\n", " ('s', 'JJ'),\n", " ('purely', 'RB'),\n", " ('a', 'DT'),\n", " ('cultural-social', 'JJ'),\n", " ('convention', 'NN'),\n", " ('that', 'IN'),\n", " ('assigns', 'VBZ'),\n", " ('phonemes', 'NNS'),\n", " ('to', 'TO'),\n", " ('concepts', 'NNS'),\n", " ('.', '.'),\n", " ('Likewise', 'NNP'),\n", " (',', ','),\n", " ('it', 'PRP'),\n", " ('’', 'VBZ'),\n", " ('s', 'JJ'),\n", " ('purely', 'RB'),\n", " ('a', 'DT'),\n", " ('cultural', 'JJ'),\n", " ('convention', 'NN'),\n", " ('to', 'TO'),\n", " ('assign', 'VB'),\n", " ('symbols', 'NNS'),\n", " ('to', 'TO'),\n", " ('machine', 'NN'),\n", " ('operations', 'NNS'),\n", " ('.', '.'),\n", " ('But', 'CC'),\n", " ('just', 'RB'),\n", " ('as', 'IN'),\n", " ('the', 'DT'),\n", " ('cultural', 'JJ'),\n", " ('choice', 'NN'),\n", " ('of', 'IN'),\n", " ('phonemes', 'NNS'),\n", " ('in', 'IN'),\n", " ('spoken', 'JJ'),\n", " ('language', 'NN'),\n", " ('is', 'VBZ'),\n", " ('restrained', 'VBN'),\n", " ('by', 'IN'),\n", " ('what', 'WP'),\n", " ('the', 'DT'),\n", " ('human', 'JJ'),\n", " ('voice', 'NN'),\n", " ('can', 'MD'),\n", " ('pronounce', 
'VB'),\n", " (',', ','),\n", " ('the', 'DT'),\n", " ('assignment', 'NN'),\n", " ('of', 'IN'),\n", " ('symbols', 'NNS'),\n", " ('to', 'TO'),\n", " ('machine', 'NN'),\n", " ('operations', 'NNS'),\n", " ('is', 'VBZ'),\n", " ('limited', 'VBN'),\n", " ('to', 'TO'),\n", " ('what', 'WP'),\n", " ('can', 'MD'),\n", " ('be', 'VB'),\n", " ('efficiently', 'RB'),\n", " ('processed', 'VBN'),\n", " ('by', 'IN'),\n", " ('the', 'DT'),\n", " ('machine', 'NN'),\n", " ('and', 'CC'),\n", " ('of', 'IN'),\n", " ('good', 'JJ'),\n", " ('use', 'NN'),\n", " ('to', 'TO'),\n", " ('humans.3', 'VB'),\n", " ('This', 'DT'),\n", " ('compromise', 'NN'),\n", " ('between', 'IN'),\n", " ('operability', 'NN'),\n", " ('and', 'CC'),\n", " ('usability', 'NN'),\n", " ('is', 'VBZ'),\n", " ('obvious', 'JJ'),\n", " ('in', 'IN'),\n", " (',', ','),\n", " ('for', 'IN'),\n", " ('example', 'NN'),\n", " (',', ','),\n", " ('Unix', 'NNP'),\n", " ('commands', 'VBZ'),\n", " ('.', '.'),\n", " ('Originally', 'RB'),\n", " ('used', 'VBN'),\n", " ('on', 'IN'),\n", " ('teletype', 'NN'),\n", " ('terminals', 'NNS'),\n", " (',', ','),\n", " ('the', 'DT'),\n", " ('operation', 'NN'),\n", " ('“', 'NNP'),\n", " ('copy', 'NN'),\n", " ('”', 'NN'),\n", " ('was', 'VBD'),\n", " ('abbreviated', 'VBN'),\n", " ('to', 'TO'),\n", " ('the', 'DT'),\n", " ('command', 'NN'),\n", " ('“', 'NNP'),\n", " ('cp', 'NN'),\n", " (',', ','),\n", " ('”', 'NNP'),\n", " ('“', 'NNP'),\n", " ('move', 'NN'),\n", " ('”', 'NN'),\n", " ('to', 'TO'),\n", " ('“', 'VB'),\n", " ('mv', 'NN'),\n", " (',', ','),\n", " ('”', 'NNP'),\n", " ('“', 'NNP'),\n", " ('list', 'NN'),\n", " ('”', 'NN'),\n", " ('to', 'TO'),\n", " ('“', 'VB'),\n", " ('ls', 'NN'),\n", " (',', ','),\n", " ('”', 'NNP'),\n", " ('etc.', 'NN'),\n", " (',', ','),\n", " ('in', 'IN'),\n", " ('order', 'NN'),\n", " ('to', 'TO'),\n", " ('cut', 'VB'),\n", " ('down', 'RP'),\n", " ('machine', 'NN'),\n", " ('memory', 'NN'),\n", " ('use', 'NN'),\n", " (',', ','),\n", " ('teletype', 'JJ'),\n", " ('paper', 'NN'),\n", " 
('consumption', 'NN'),\n", " (',', ','),\n", " ('and', 'CC'),\n", " ('human', 'JJ'),\n", " ('typing', 'VBG'),\n", " ('effort', 'NN'),\n", " ('at', 'IN'),\n", " ('the', 'DT'),\n", " ('same', 'JJ'),\n", " ('time', 'NN'),\n", " ('.', '.'),\n", " ('Any', 'DT'),\n", " ('computer', 'NN'),\n", " ('control', 'NN'),\n", " ('language', 'NN'),\n", " ('is', 'VBZ'),\n", " ('thus', 'RB'),\n", " ('a', 'DT'),\n", " ('cultural', 'JJ'),\n", " ('compromise', 'NN'),\n", " ('between', 'IN'),\n", " ('the', 'DT'),\n", " ('constraints', 'NNS'),\n", " ('of', 'IN'),\n", " ('machine', 'NN'),\n", " ('design—which', 'NN'),\n", " ('is', 'VBZ'),\n", " ('far', 'RB'),\n", " ('from', 'IN'),\n", " ('objective', 'JJ'),\n", " (',', ','),\n", " ('but', 'CC'),\n", " ('based', 'VBN'),\n", " ('on', 'IN'),\n", " ('human', 'JJ'),\n", " ('choices', 'NNS'),\n", " (',', ','),\n", " ('culture', 'NN'),\n", " (',', ','),\n", " ('and', 'CC'),\n", " ('thinking', 'VBG'),\n", " ('style', 'NN'),\n", " ('itself', 'PRP'),\n", " ('4—and', 'CD'),\n", " ('the', 'DT'),\n", " ('equally', 'RB'),\n", " ('subjective', 'JJ'),\n", " ('user', 'NN'),\n", " ('preferences', 'NNS'),\n", " (',', ','),\n", " ('involving', 'VBG'),\n", " ('fuzzy', 'JJ'),\n", " ('factors', 'NNS'),\n", " ('like', 'IN'),\n", " ('readability', 'NN'),\n", " (',', ','),\n", " ('elegance', 'NN'),\n", " (',', ','),\n", " ('and', 'CC'),\n", " ('usage', 'JJ'),\n", " ('efficiency', 'NN'),\n", " ('.', '.'),\n", " ('The', 'DT'),\n", " ('symbols', 'NNS'),\n", " ('of', 'IN'),\n", " ('computer', 'NN'),\n", " ('control', 'NN'),\n", " ('languages', 'VBZ'),\n", " ('inevitably', 'RB'),\n", " ('do', 'VBP'),\n", " ('have', 'VB'),\n", " ('semantic', 'JJ'),\n", " ('connotations', 'NNS'),\n", " ('simply', 'RB'),\n", " ('because', 'IN'),\n", " ('there', 'EX'),\n", " ('exist', 'VBP'),\n", " ('no', 'DT'),\n", " ('symbols', 'NNS'),\n", " ('with', 'IN'),\n", " ('which', 'WDT'),\n", " ('humans', 'NNS'),\n", " ('would', 'MD'),\n", " ('not', 'RB'),\n", " ('associate', 'VB'),\n", " 
('some', 'DT'),\n", " ('meaning', 'NN'),\n", " ('.', '.'),\n", " ('But', 'CC'),\n", " ('symbols', 'NNS'),\n", " ('can', 'MD'),\n", " ('’', 'VB'),\n", " ('t', 'JJ'),\n", " ('denote', 'NN'),\n", " ('any', 'DT'),\n", " ('semantic', 'JJ'),\n", " ('statements', 'NNS'),\n", " (',', ','),\n", " ('that', 'DT'),\n", " ('is', 'VBZ'),\n", " (',', ','),\n", " ('they', 'PRP'),\n", " ('do', 'VBP'),\n", " ('not', 'RB'),\n", " ('express', 'VB'),\n", " ('meaning', 'VBG'),\n", " ('in', 'IN'),\n", " ('their', 'PRP$'),\n", " ('own', 'JJ'),\n", " ('terms', 'NNS'),\n", " (';', ':'),\n", " ('humans', 'NNS'),\n", " ('metaphorically', 'RB'),\n", " ('read', 'VB'),\n", " ('meaning', 'VBG'),\n", " ('into', 'IN'),\n", " ('them', 'PRP'),\n", " ('through', 'IN'),\n", " ('associations', 'NNS'),\n", " ('they', 'PRP'),\n", " ('make', 'VBP'),\n", " ('.', '.'),\n", " ('Languages', 'NNS'),\n", " ('without', 'IN'),\n", " ('semantic', 'JJ'),\n", " ('denotation', 'NN'),\n", " ('are', 'VBP'),\n", " ('not', 'RB'),\n", " ('historically', 'RB'),\n", " ('new', 'JJ'),\n", " ('phenomena', 'NNS'),\n", " (';', ':'),\n", " ('mathematical', 'JJ'),\n", " ('formulas', 'NNS'),\n", " ('are', 'VBP'),\n", " ('their', 'PRP$'),\n", " ('oldest', 'JJS'),\n", " ('example', 'NN'),\n", " ('.', '.'),\n", " ('In', 'IN'),\n", " ('comparison', 'NN'),\n", " ('to', 'TO'),\n", " ('common', 'JJ'),\n", " ('human', 'JJ'),\n", " ('languages', 'NNS'),\n", " (',', ','),\n", " ('the', 'DT'),\n", " ('multitude', 'NN'),\n", " ('of', 'IN'),\n", " ('programming', 'VBG'),\n", " ('languages', 'NNS'),\n", " ('is', 'VBZ'),\n", " ('of', 'IN'),\n", " ('lesser', 'JJR'),\n", " ('significance', 'NN'),\n", " ('.', '.'),\n", " ('The', 'DT'),\n", " ('criterion', 'NN'),\n", " ('of', 'IN'),\n", " ('Turing', 'NNP'),\n", " ('completeness', 'NN'),\n", " ('of', 'IN'),\n", " ('a', 'DT'),\n", " ('programming', 'NN'),\n", " ('language', 'NN'),\n", " (',', ','),\n", " ('that', 'WDT'),\n", " ('is', 'VBZ'),\n", " (',', ','),\n", " ('that', 'IN'),\n", " ('any', 
'DT'),\n", " ('computation', 'NN'),\n", " ('can', 'MD'),\n", " ('be', 'VB'),\n", " ('expressed', 'VBN'),\n", " ('in', 'IN'),\n", " ('it', 'PRP'),\n", " (',', ','),\n", " ('means', 'VBZ'),\n", " ('that', 'IN'),\n", " ('every', 'DT'),\n", " ('programming', 'NN'),\n", " ('language', 'NN'),\n", " ('is', 'VBZ'),\n", " (',', ','),\n", " ('formally', 'RB'),\n", " ('speaking', 'VBG'),\n", " (',', ','),\n", " ('just', 'RB'),\n", " ('a', 'DT'),\n", " ('riff', 'NN'),\n", " ('on', 'IN'),\n", " ('every', 'DT'),\n", " ('other', 'JJ'),\n", " ('programming', 'NN'),\n", " ('language', 'NN'),\n", " ('.', '.'),\n", " ('Nothing', 'NN'),\n", " ('can', 'MD'),\n", " ('be', 'VB'),\n", " ('expressed', 'VBN'),\n", " ('in', 'IN'),\n", " ('a', 'DT'),\n", " ('Turingcomplete', 'JJ'),\n", " ('language', 'NN'),\n", " ('such', 'JJ'),\n", " ('as', 'IN'),\n", " ('C', 'NNP'),\n", " ('that', 'IN'),\n", " ('couldn', 'NN'),\n", " ('’', 'NNP'),\n", " ('t', 'NN'),\n", " ('also', 'RB'),\n", " ('be', 'VB'),\n", " ('expressed', 'VBN'),\n", " ('in', 'IN'),\n", " ('another', 'DT'),\n", " ('Turingcomplete', 'NNP'),\n", " ('language', 'NN'),\n", " ('such', 'JJ'),\n", " ('as', 'IN'),\n", " ('Lisp', 'NNP'),\n", " ('(', '('),\n", " ('or', 'CC'),\n", " ('Fortran', 'NNP'),\n", " (',', ','),\n", " ('Smalltalk', 'NNP'),\n", " (',', ','),\n", " ('Java', 'NNP'),\n", " ('...', ':'),\n", " (')', ')'),\n", " ('and', 'CC'),\n", " ('vice', 'NN'),\n", " ('versa', 'NN'),\n", " ('.', '.'),\n", " ('This', 'DT'),\n", " ('ultimately', 'JJ'),\n", " ('proves', 'VBZ'),\n", " ('the', 'DT'),\n", " ...]" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tagged" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [], "source": [ "words = \"in the beginning was heaven and earth and the time of the whatever\".split()" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['in',\n", " 'the',\n", " 'beginning',\n", " 
'was',\n",
       " 'heaven',\n",
       " 'and',\n",
       " 'earth',\n",
       " 'and',\n",
       " 'the',\n",
       " 'time',\n",
       " 'of',\n",
       " 'the',\n",
       " 'whatever']"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "words"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# list.index() reports only the position of the first match\n",
    "words.index(\"the\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "IN\n",
      "1 the\n",
      "BEGINNING\n",
      "WAS\n",
      "HEAVEN\n",
      "AND\n",
      "EARTH\n",
      "AND\n",
      "8 the\n",
      "TIME\n",
      "OF\n",
      "11 the\n",
      "WHATEVER\n"
     ]
    }
   ],
   "source": [
    "# enumerate() yields every position, so all occurrences of \"the\" are found;\n",
    "# each \"the\" is printed with its index, every other word in upper case.\n",
    "for i, word in enumerate(words):\n",
    "    if word == \"the\":\n",
    "        print(i, word)\n",
    "    else:\n",
    "        print(word.upper())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'in'"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# `random` and `nltk` come from the import cell at the top of the notebook.\n",
    "# The dict gets its own name so the `words` list used by the cells above is\n",
    "# not overwritten and those cells can still be re-run.\n",
    "words_by_tag = {}\n",
    "words_by_tag[\"VB\"] = []\n",
    "\n",
    "# NOTE(review): every token is filed under \"VB\" regardless of its actual\n",
    "# part of speech - confirm whether nltk.pos_tag() should pick the key.\n",
    "for word in nltk.word_tokenize(\"in the beginning was heaven and earth and the time of the whatever\"):\n",
    "    words_by_tag[\"VB\"].append(word)\n",
    "\n",
    "# Pick one of the collected words at random (unseeded, so the result varies)\n",
    "random.choice(words_by_tag[\"VB\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}