You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
SI13/LIQUID/MANIFESTO/nltk-pos-tagger-Copy1.ipynb

1600 lines
57 KiB
Plaintext

{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# NLTK - Part of Speech"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import nltk\n",
"import random"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"LIQUID BODIES ARE PLURI-PONTENT. THEY ARE CAPABLE OF MANY ACTS OF TRANSFORMATION. THEY DE-SIMPLIFY THE MATTER OF BEING A BODY THROUGH THEIR VISCERAL ENTANGLEMENTS. WHILE THE BÊTE MACHINE DEPENDS ON AN ABSTRACTED UNDERSTANDING OF ANATOMY FOUNDED UPON GENERALIZATIONS AND IDEALS, LIQUID BODIES RESIST THESE TROPES. LIQUID BODIES DISCUSS A MODE OF EXISTENCE THAT IS CONSTANTLY CHANGING NOT AS THE CUMULATIVE OUTCOMES OF ERROR BUT AS A HIGHLY CHOREOGRAPHED AND CONTINUOUS SPECTRUM STREAM OF EVENTS THAT ARISE FROM THE PHYSICAL INTERACTIONS OF MATTER. THEY INTERNALIZE OTHER BODIES AS MANIFOLDS WITHIN THEIR SUBSTANCE AND ASSERT THEIR IDENTITY THROUGH THEIR ENVIRONMENTAL CONTEXTS. SUCH ENTANGLEMENTS INVOKE MARGINAL RELATIONS BETWEEN MULTIPLE AGENCIES AND EXCEED THE CLASSICAL LOGIC OF OBJECTS. THEY ARE INSEPARABLE FROM THEIR CONTEXT AND OFFER WAYS OF THINKING AND EXPERIMENTING WITH THE CONVENTIONS OF MAKING AND BEING EMBODIED.\n",
"\n"
]
}
],
"source": [
"# Read the manifesto inside a context manager so the file handle is closed\n",
"# again. NOTE(review): assumes manifesto.txt is UTF-8 encoded -- confirm.\n",
"with open('manifesto.txt', encoding='utf-8') as manifesto:\n",
"    # Skip blank lines so that random.choice() never picks an empty sentence.\n",
"    lines = [line for line in manifesto if line.strip()]\n",
"\n",
"sentence = random.choice(lines)\n",
"print(sentence)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Tokens"
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['LIQUID', 'BODIES', 'ARE', 'PLURI-PONTENT', '.', 'THEY', 'ARE', 'CAPABLE', 'OF', 'MANY', 'ACTS', 'OF', 'TRANSFORMATION', '.', 'THEY', 'DE-SIMPLIFY', 'THE', 'MATTER', 'OF', 'BEING', 'A', 'BODY', 'THROUGH', 'THEIR', 'VISCERAL', 'ENTANGLEMENTS', '.', 'WHILE', 'THE', 'BÊTE', 'MACHINE', 'DEPENDS', 'ON', 'AN', 'ABSTRACTED', 'UNDERSTANDING', 'OF', 'ANATOMY', 'FOUNDED', 'UPON', 'GENERALIZATIONS', 'AND', 'IDEALS', ',', 'LIQUID', 'BODIES', 'RESIST', 'THESE', 'TROPES', '.', 'LIQUID', 'BODIES', 'DISCUSS', 'A', 'MODE', 'OF', 'EXISTENCE', 'THAT', 'IS', 'CONSTANTLY', 'CHANGING', '', 'NOT', 'AS', 'THE', 'CUMULATIVE', 'OUTCOMES', 'OF', '', 'ERROR', '', '', 'BUT', 'AS', 'A', 'HIGHLY', 'CHOREOGRAPHED', 'AND', 'CONTINUOUS', 'SPECTRUM', 'STREAM', 'OF', 'EVENTS', 'THAT', 'ARISE', 'FROM', 'THE', 'PHYSICAL', 'INTERACTIONS', 'OF', 'MATTER', '.', 'THEY', 'INTERNALIZE', 'OTHER', 'BODIES', 'AS', 'MANIFOLDS', 'WITHIN', 'THEIR', 'SUBSTANCE', 'AND', 'ASSERT', 'THEIR', 'IDENTITY', 'THROUGH', 'THEIR', 'ENVIRONMENTAL', 'CONTEXTS', '.', 'SUCH', 'ENTANGLEMENTS', 'INVOKE', 'MARGINAL', 'RELATIONS', 'BETWEEN', 'MULTIPLE', 'AGENCIES', 'AND', 'EXCEED', 'THE', 'CLASSICAL', 'LOGIC', 'OF', 'OBJECTS', '.', 'THEY', 'ARE', 'INSEPARABLE', 'FROM', 'THEIR', 'CONTEXT', 'AND', 'OFFER', 'WAYS', 'OF', 'THINKING', 'AND', 'EXPERIMENTING', 'WITH', 'THE', 'CONVENTIONS', 'OF', 'MAKING', 'AND', 'BEING', 'EMBODIED', '.']\n"
]
}
],
"source": [
"# Split the chosen sentence into word and punctuation tokens.\n",
"tokens = nltk.tokenize.word_tokenize(sentence)\n",
"print(tokens)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Part of Speech \"tags\""
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[('LIQUID', 'JJ'), ('BODIES', 'NNP'), ('ARE', 'NNP'), ('PLURI-PONTENT', 'NNP'), ('.', '.'), ('THEY', 'NNP'), ('ARE', 'NNP'), ('CAPABLE', 'NNP'), ('OF', 'NNP'), ('MANY', 'NNP'), ('ACTS', 'NNP'), ('OF', 'NNP'), ('TRANSFORMATION', 'NNP'), ('.', '.'), ('THEY', 'NNP'), ('DE-SIMPLIFY', 'VBP'), ('THE', 'NNP'), ('MATTER', 'NNP'), ('OF', 'NNP'), ('BEING', 'NNP'), ('A', 'NNP'), ('BODY', 'NNP'), ('THROUGH', 'NNP'), ('THEIR', 'NNP'), ('VISCERAL', 'NNP'), ('ENTANGLEMENTS', 'NNP'), ('.', '.'), ('WHILE', 'IN'), ('THE', 'DT'), ('BÊTE', 'NNP'), ('MACHINE', 'NNP'), ('DEPENDS', 'NNP'), ('ON', 'NNP'), ('AN', 'NNP'), ('ABSTRACTED', 'NNP'), ('UNDERSTANDING', 'NN'), ('OF', 'NNP'), ('ANATOMY', 'NNP'), ('FOUNDED', 'NNP'), ('UPON', 'NNP'), ('GENERALIZATIONS', 'NNP'), ('AND', 'NNP'), ('IDEALS', 'NNP'), (',', ','), ('LIQUID', 'NNP'), ('BODIES', 'NNP'), ('RESIST', 'NNP'), ('THESE', 'NNP'), ('TROPES', 'NNP'), ('.', '.'), ('LIQUID', 'NNP'), ('BODIES', 'NNP'), ('DISCUSS', 'NNP'), ('A', 'NNP'), ('MODE', 'NNP'), ('OF', 'NNP'), ('EXISTENCE', 'NNP'), ('THAT', 'NNP'), ('IS', 'VBZ'), ('CONSTANTLY', 'NNP'), ('CHANGING', 'NNP'), ('', 'NNP'), ('NOT', 'NNP'), ('AS', 'IN'), ('THE', 'NNP'), ('CUMULATIVE', 'NNP'), ('OUTCOMES', 'NNP'), ('OF', 'NNP'), ('', 'NNP'), ('ERROR', 'NNP'), ('', 'NNP'), ('', 'NNP'), ('BUT', 'NNP'), ('AS', 'IN'), ('A', 'NNP'), ('HIGHLY', 'NNP'), ('CHOREOGRAPHED', 'NNP'), ('AND', 'NNP'), ('CONTINUOUS', 'NNP'), ('SPECTRUM', 'NNP'), ('STREAM', 'NNP'), ('OF', 'NNP'), ('EVENTS', 'NNP'), ('THAT', 'NNP'), ('ARISE', 'NNP'), ('FROM', 'NNP'), ('THE', 'NNP'), ('PHYSICAL', 'NNP'), ('INTERACTIONS', 'NNP'), ('OF', 'NNP'), ('MATTER', 'NNP'), ('.', '.'), ('THEY', 'NNP'), ('INTERNALIZE', 'NNP'), ('OTHER', 'NNP'), ('BODIES', 'NNP'), ('AS', 'NNP'), ('MANIFOLDS', 'NNP'), ('WITHIN', 'NNP'), ('THEIR', 'NNP'), ('SUBSTANCE', 'NNP'), ('AND', 'NNP'), ('ASSERT', 'NNP'), ('THEIR', 'NNP'), ('IDENTITY', 'NNP'), ('THROUGH', 'NNP'), ('THEIR', 'NNP'), ('ENVIRONMENTAL', 'NNP'), ('CONTEXTS', 'NNP'), ('.', '.'), ('SUCH', 
'JJ'), ('ENTANGLEMENTS', 'NNP'), ('INVOKE', 'NNP'), ('MARGINAL', 'NNP'), ('RELATIONS', 'NNP'), ('BETWEEN', 'NNP'), ('MULTIPLE', 'NNP'), ('AGENCIES', 'NNP'), ('AND', 'NNP'), ('EXCEED', 'NNP'), ('THE', 'NNP'), ('CLASSICAL', 'NNP'), ('LOGIC', 'NNP'), ('OF', 'NNP'), ('OBJECTS', 'NNP'), ('.', '.'), ('THEY', 'NNP'), ('ARE', 'VBP'), ('INSEPARABLE', 'NNP'), ('FROM', 'NNP'), ('THEIR', 'NNP'), ('CONTEXT', 'NNP'), ('AND', 'NNP'), ('OFFER', 'NNP'), ('WAYS', 'NNP'), ('OF', 'NNP'), ('THINKING', 'NNP'), ('AND', 'NNP'), ('EXPERIMENTING', 'NNP'), ('WITH', 'NNP'), ('THE', 'NNP'), ('CONVENTIONS', 'NNP'), ('OF', 'NNP'), ('MAKING', 'NNP'), ('AND', 'NNP'), ('BEING', 'NNP'), ('EMBODIED', 'NNP'), ('.', '.')]\n"
]
}
],
"source": [
"# Pair every token with its part-of-speech tag.\n",
"tagged = nltk.tag.pos_tag(tokens)\n",
"print(tagged)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Now, you could select for example all the types of **nouns** (every tag that contains `NN`):"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['LIQUID', 'BODIES', 'INVITE', 'US', 'TO', 'ARTICULATE', 'THE', 'FUZZINESS', 'PARADOXES', 'AND', 'UNCERTAINTIES', 'THE', 'LIVING', 'REALM', 'THEY', 'ARE', 'STILL', 'INSTANTLY', 'RECOGNIZABLE', 'CAN', 'BE', 'NAMED', 'AS', 'TORNADO', 'CIRRUS', 'SOIL', 'EMBRYO', 'OR', 'BIOFILM', 'THESE', 'CONTRADICTIONS', '', 'OF', 'FORM', 'AND', 'CONSTANCY', '', 'ENCOURAGE', 'ALTERNATIVE', 'READINGS', 'OF', 'HOW', 'WE', 'ORDER', 'AND', 'SORT', 'THE', 'WORLD', 'WHOSE', 'MAIN', 'METHODOLOGY', 'IS', 'THROUGH', 'RELATING', 'ONE', 'BODY', 'TO', 'ANOTHER', 'INDEED', 'PROTEAN', 'LIQUID', 'BODIES', 'HELP', 'US', 'UNDERSTAND', 'THAT', 'WHILE', 'UNIVERSALISMS', 'AVERAGES', 'AND', 'GENERALIZATIONS', 'ARE', 'USEFUL', 'IN', 'PRODUCING', 'MAPS', 'OUR', 'BEING', 'IN', 'THE', 'WORLD', 'THEY', 'NEGLECT', 'SPECIFIC', 'DETAILS', 'WHICH', '', 'BRING', 'FORTH', 'THE', 'MATERIALITY', 'THE', 'ENVIRONMENT']\n"
]
}
],
"source": [
"# Keep only the words whose tag contains 'NN'\n",
"# (this matches NN, NNS, NNP and NNPS).\n",
"selection = [word for word, tag in tagged if 'NN' in tag]\n",
"\n",
"print(selection)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Where do these tags come from?"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"> An off-the-shelf tagger is available for English. It uses the Penn Treebank tagset.\n",
"\n",
"From: http://www.nltk.org/api/nltk.tag.html#module-nltk.tag"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"> NLTK provides documentation for each tag, which can be queried using the tag, e.g. nltk.help.upenn_tagset('RB').\n",
"\n",
"From: http://www.nltk.org/book_1ed/ch05.html"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"ename": "LookupError",
"evalue": "\n**********************************************************************\n Resource \u001b[93mtagsets\u001b[0m not found.\n Please use the NLTK Downloader to obtain the resource:\n\n \u001b[31m>>> import nltk\n >>> nltk.download('tagsets')\n \u001b[0m\n For more information see: https://www.nltk.org/data.html\n\n Attempted to load \u001b[93mhelp/tagsets/PY3/upenn_tagset.pickle\u001b[0m\n\n Searched in:\n - '/home/kendalb/nltk_data'\n - '/usr/nltk_data'\n - '/usr/share/nltk_data'\n - '/usr/lib/nltk_data'\n - '/usr/share/nltk_data'\n - '/usr/local/share/nltk_data'\n - '/usr/lib/nltk_data'\n - '/usr/local/lib/nltk_data'\n - ''\n**********************************************************************\n",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mLookupError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-10-b694f07a3ba6>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mnltk\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhelp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mupenn_tagset\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'PRP'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;32m/usr/local/lib/python3.7/dist-packages/nltk/help.py\u001b[0m in \u001b[0;36mupenn_tagset\u001b[0;34m(tagpattern)\u001b[0m\n\u001b[1;32m 25\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 26\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mupenn_tagset\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtagpattern\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 27\u001b[0;31m \u001b[0m_format_tagset\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"upenn_tagset\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtagpattern\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 28\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 29\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/usr/local/lib/python3.7/dist-packages/nltk/help.py\u001b[0m in \u001b[0;36m_format_tagset\u001b[0;34m(tagset, tagpattern)\u001b[0m\n\u001b[1;32m 44\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 45\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_format_tagset\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtagset\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtagpattern\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 46\u001b[0;31m \u001b[0mtagdict\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mload\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"help/tagsets/\"\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mtagset\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m\".pickle\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 47\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mtagpattern\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 48\u001b[0m \u001b[0m_print_entries\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msorted\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtagdict\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtagdict\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/usr/local/lib/python3.7/dist-packages/nltk/data.py\u001b[0m in \u001b[0;36mload\u001b[0;34m(resource_url, format, cache, verbose, logic_parser, fstruct_reader, encoding)\u001b[0m\n\u001b[1;32m 750\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 751\u001b[0m \u001b[0;31m# Load the resource.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 752\u001b[0;31m \u001b[0mopened_resource\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_open\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresource_url\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 753\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 754\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mformat\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m\"raw\"\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/usr/local/lib/python3.7/dist-packages/nltk/data.py\u001b[0m in \u001b[0;36m_open\u001b[0;34m(resource_url)\u001b[0m\n\u001b[1;32m 875\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 876\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mprotocol\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0mprotocol\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlower\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m\"nltk\"\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 877\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mfind\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpath_\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpath\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m\"\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 878\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0mprotocol\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlower\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m\"file\"\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 879\u001b[0m \u001b[0;31m# urllib might not use mode='rb', so handle this one ourselves:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/usr/local/lib/python3.7/dist-packages/nltk/data.py\u001b[0m in \u001b[0;36mfind\u001b[0;34m(resource_name, paths)\u001b[0m\n\u001b[1;32m 583\u001b[0m \u001b[0msep\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"*\"\u001b[0m \u001b[0;34m*\u001b[0m \u001b[0;36m70\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 584\u001b[0m \u001b[0mresource_not_found\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"\\n%s\\n%s\\n%s\\n\"\u001b[0m \u001b[0;34m%\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0msep\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmsg\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msep\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 585\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mLookupError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresource_not_found\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 586\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 587\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mLookupError\u001b[0m: \n**********************************************************************\n Resource \u001b[93mtagsets\u001b[0m not found.\n Please use the NLTK Downloader to obtain the resource:\n\n \u001b[31m>>> import nltk\n >>> nltk.download('tagsets')\n \u001b[0m\n For more information see: https://www.nltk.org/data.html\n\n Attempted to load \u001b[93mhelp/tagsets/PY3/upenn_tagset.pickle\u001b[0m\n\n Searched in:\n - '/home/kendalb/nltk_data'\n - '/usr/nltk_data'\n - '/usr/share/nltk_data'\n - '/usr/lib/nltk_data'\n - '/usr/share/nltk_data'\n - '/usr/local/share/nltk_data'\n - '/usr/lib/nltk_data'\n - '/usr/local/lib/nltk_data'\n - ''\n**********************************************************************\n"
]
}
],
"source": [
"# The tag documentation ships as a separate NLTK data package ('tagsets').\n",
"# Fetch it first (skipped when already up to date); without it the help\n",
"# lookup below raises a LookupError.\n",
"nltk.download('tagsets', quiet=True)\n",
"nltk.help.upenn_tagset('PRP')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"------------"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"An alphabetical list of part-of-speech tags used in the Penn Treebank Project ([link](https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html)):\n",
"\n",
"<table cellspacing=\"2\" cellpadding=\"2\" border=\"0\">\n",
" <tbody><tr bgcolor=\"#DFDFFF\" align=\"none\"> \n",
" <td align=\"none\"> \n",
" <div align=\"left\">Number</div>\n",
" </td>\n",
" <td> \n",
" <div align=\"left\">Tag</div>\n",
" </td>\n",
" <td> \n",
" <div align=\"left\">Description</div>\n",
" </td>\n",
" </tr>\n",
" <tr bgcolor=\"#FFFFCA\"> \n",
" <td align=\"none\"> 1. </td>\n",
" <td>CC </td>\n",
" <td>Coordinating conjunction </td>\n",
" </tr>\n",
" <tr bgcolor=\"#FFFFCA\"> \n",
" <td align=\"none\"> 2. </td>\n",
" <td>CD </td>\n",
" <td>Cardinal number </td>\n",
" </tr>\n",
" <tr bgcolor=\"#FFFFCA\"> \n",
" <td align=\"none\"> 3. </td>\n",
" <td>DT </td>\n",
" <td>Determiner </td>\n",
" </tr>\n",
" <tr bgcolor=\"#FFFFCA\"> \n",
" <td align=\"none\"> 4. </td>\n",
" <td>EX </td>\n",
" <td>Existential <i>there</i> </td>\n",
" </tr>\n",
" <tr bgcolor=\"#FFFFCA\"> \n",
" <td align=\"none\"> 5. </td>\n",
" <td>FW </td>\n",
" <td>Foreign word </td>\n",
" </tr>\n",
" <tr bgcolor=\"#FFFFCA\"> \n",
" <td align=\"none\"> 6. </td>\n",
" <td>IN </td>\n",
" <td>Preposition or subordinating conjunction </td>\n",
" </tr>\n",
" <tr bgcolor=\"#FFFFCA\"> \n",
" <td align=\"none\"> 7. </td>\n",
" <td>JJ </td>\n",
" <td>Adjective </td>\n",
" </tr>\n",
" <tr bgcolor=\"#FFFFCA\"> \n",
" <td align=\"none\"> 8. </td>\n",
" <td>JJR </td>\n",
" <td>Adjective, comparative </td>\n",
" </tr>\n",
" <tr bgcolor=\"#FFFFCA\"> \n",
" <td align=\"none\"> 9. </td>\n",
" <td>JJS </td>\n",
" <td>Adjective, superlative </td>\n",
" </tr>\n",
" <tr bgcolor=\"#FFFFCA\"> \n",
" <td align=\"none\"> 10. </td>\n",
" <td>LS </td>\n",
" <td>List item marker </td>\n",
" </tr>\n",
" <tr bgcolor=\"#FFFFCA\"> \n",
" <td align=\"none\"> 11. </td>\n",
" <td>MD </td>\n",
" <td>Modal </td>\n",
" </tr>\n",
" <tr bgcolor=\"#FFFFCA\"> \n",
" <td align=\"none\"> 12. </td>\n",
" <td>NN </td>\n",
" <td>Noun, singular or mass </td>\n",
" </tr>\n",
" <tr bgcolor=\"#FFFFCA\"> \n",
" <td align=\"none\"> 13. </td>\n",
" <td>NNS </td>\n",
" <td>Noun, plural </td>\n",
" </tr>\n",
" <tr bgcolor=\"#FFFFCA\"> \n",
" <td align=\"none\"> 14. </td>\n",
" <td>NNP </td>\n",
" <td>Proper noun, singular </td>\n",
" </tr>\n",
" <tr bgcolor=\"#FFFFCA\"> \n",
" <td align=\"none\"> 15. </td>\n",
" <td>NNPS </td>\n",
" <td>Proper noun, plural </td>\n",
" </tr>\n",
" <tr bgcolor=\"#FFFFCA\"> \n",
" <td align=\"none\"> 16. </td>\n",
" <td>PDT </td>\n",
" <td>Predeterminer </td>\n",
" </tr>\n",
" <tr bgcolor=\"#FFFFCA\"> \n",
" <td align=\"none\"> 17. </td>\n",
" <td>POS </td>\n",
" <td>Possessive ending </td>\n",
" </tr>\n",
" <tr bgcolor=\"#FFFFCA\"> \n",
" <td align=\"none\"> 18. </td>\n",
" <td>PRP </td>\n",
" <td>Personal pronoun </td>\n",
" </tr>\n",
" <tr bgcolor=\"#FFFFCA\"> \n",
" <td align=\"none\"> 19. </td>\n",
" <td>PRP\\$ </td>\n",
" <td>Possessive pronoun </td>\n",
" </tr>\n",
" <tr bgcolor=\"#FFFFCA\"> \n",
" <td align=\"none\"> 20. </td>\n",
" <td>RB </td>\n",
" <td>Adverb </td>\n",
" </tr>\n",
" <tr bgcolor=\"#FFFFCA\"> \n",
" <td align=\"none\"> 21. </td>\n",
" <td>RBR </td>\n",
" <td>Adverb, comparative </td>\n",
" </tr>\n",
" <tr bgcolor=\"#FFFFCA\"> \n",
" <td align=\"none\"> 22. </td>\n",
" <td>RBS </td>\n",
" <td>Adverb, superlative </td>\n",
" </tr>\n",
" <tr bgcolor=\"#FFFFCA\"> \n",
" <td align=\"none\"> 23. </td>\n",
" <td>RP </td>\n",
" <td>Particle </td>\n",
" </tr>\n",
" <tr bgcolor=\"#FFFFCA\"> \n",
" <td align=\"none\"> 24. </td>\n",
" <td>SYM </td>\n",
" <td>Symbol </td>\n",
" </tr>\n",
" <tr bgcolor=\"#FFFFCA\"> \n",
" <td align=\"none\"> 25. </td>\n",
" <td>TO </td>\n",
" <td><i>to</i> </td>\n",
" </tr>\n",
" <tr bgcolor=\"#FFFFCA\"> \n",
" <td align=\"none\"> 26. </td>\n",
" <td>UH </td>\n",
" <td>Interjection </td>\n",
" </tr>\n",
" <tr bgcolor=\"#FFFFCA\"> \n",
" <td align=\"none\"> 27. </td>\n",
" <td>VB </td>\n",
" <td>Verb, base form </td>\n",
" </tr>\n",
" <tr bgcolor=\"#FFFFCA\"> \n",
" <td align=\"none\"> 28. </td>\n",
" <td>VBD </td>\n",
" <td>Verb, past tense </td>\n",
" </tr>\n",
" <tr bgcolor=\"#FFFFCA\"> \n",
" <td align=\"none\"> 29. </td>\n",
" <td>VBG </td>\n",
" <td>Verb, gerund or present participle </td>\n",
" </tr>\n",
" <tr bgcolor=\"#FFFFCA\"> \n",
" <td align=\"none\"> 30. </td>\n",
" <td>VBN </td>\n",
" <td>Verb, past participle </td>\n",
" </tr>\n",
" <tr bgcolor=\"#FFFFCA\"> \n",
" <td align=\"none\"> 31. </td>\n",
" <td>VBP </td>\n",
" <td>Verb, non-3rd person singular present </td>\n",
" </tr>\n",
" <tr bgcolor=\"#FFFFCA\"> \n",
" <td align=\"none\"> 32. </td>\n",
" <td>VBZ </td>\n",
" <td>Verb, 3rd person singular present </td>\n",
" </tr>\n",
" <tr bgcolor=\"#FFFFCA\"> \n",
" <td align=\"none\"> 33. </td>\n",
" <td>WDT </td>\n",
" <td>Wh-determiner </td>\n",
" </tr>\n",
" <tr bgcolor=\"#FFFFCA\"> \n",
" <td align=\"none\"> 34. </td>\n",
" <td>WP </td>\n",
" <td>Wh-pronoun </td>\n",
" </tr>\n",
" <tr bgcolor=\"#FFFFCA\"> \n",
" <td align=\"none\"> 35. </td>\n",
" <td>WP$ </td>\n",
" <td>Possessive wh-pronoun </td>\n",
" </tr>\n",
" <tr bgcolor=\"#FFFFCA\"> \n",
" <td align=\"none\"> 36. </td>\n",
" <td>WRB </td>\n",
" <td>Wh-adverb \n",
"</td></tr></tbody></table>"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## A telling/tricky case\n",
"It's important to realize that a POS tag is not a fixed property of a word -- it depends on the context in which the word is used. The NLTK book gives an example of [homographs](http://www.nltk.org/book_1ed/ch05.html#using-a-tagger) -- words that are written the same, but are pronounced differently and have different meanings depending on their use."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# 'refuse' and 'permit' each occur twice in this sentence.\n",
"text = nltk.word_tokenize(\n",
"    \"They refuse to permit us to obtain the refuse permit\"\n",
")\n",
"nltk.pos_tag(text)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"From the book:\n",
"\n",
"> Notice that refuse and permit both appear as a present tense verb (VBP) and a noun (NN). E.g. refUSE is a verb meaning \"deny,\" while REFuse is a noun meaning \"trash\" (i.e. they are not homophones). Thus, we need to know which word is being used in order to pronounce the text correctly. (For this reason, text-to-speech systems usually perform POS-tagging.)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Applying to an entire text"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"# Read the whole text inside a context manager so the file handle is closed.\n",
"# NOTE(review): assumes language.txt is UTF-8 encoded -- confirm.\n",
"with open('../txt/language.txt', encoding='utf-8') as language_file:\n",
"    language = language_file.read()\n",
"\n",
"# Tokenize and tag the full text. This rebinds `tokens` and `tagged`, which\n",
"# the single-sentence cells earlier in the notebook also assign.\n",
"tokens = nltk.word_tokenize(language)\n",
"tagged = nltk.pos_tag(tokens)"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {
"collapsed": true,
"jupyter": {
"outputs_hidden": true
}
},
"outputs": [
{
"data": {
"text/plain": [
"[('Language', 'NN'),\n",
" ('Florian', 'JJ'),\n",
" ('Cramer', 'NNP'),\n",
" ('Software', 'NNP'),\n",
" ('and', 'CC'),\n",
" ('language', 'NN'),\n",
" ('are', 'VBP'),\n",
" ('intrinsically', 'RB'),\n",
" ('related', 'VBN'),\n",
" (',', ','),\n",
" ('since', 'IN'),\n",
" ('software', 'NN'),\n",
" ('may', 'MD'),\n",
" ('process', 'VB'),\n",
" ('language', 'NN'),\n",
" (',', ','),\n",
" ('and', 'CC'),\n",
" ('is', 'VBZ'),\n",
" ('constructed', 'VBN'),\n",
" ('in', 'IN'),\n",
" ('language', 'NN'),\n",
" ('.', '.'),\n",
" ('Yet', 'CC'),\n",
" ('language', 'NN'),\n",
" ('means', 'VBZ'),\n",
" ('different', 'JJ'),\n",
" ('things', 'NNS'),\n",
" ('in', 'IN'),\n",
" ('the', 'DT'),\n",
" ('context', 'NN'),\n",
" ('of', 'IN'),\n",
" ('computing', 'VBG'),\n",
" (':', ':'),\n",
" ('formal', 'JJ'),\n",
" ('languages', 'NNS'),\n",
" ('in', 'IN'),\n",
" ('which', 'WDT'),\n",
" ('algorithms', 'EX'),\n",
" ('are', 'VBP'),\n",
" ('expressed', 'VBN'),\n",
" ('and', 'CC'),\n",
" ('software', 'NN'),\n",
" ('is', 'VBZ'),\n",
" ('implemented', 'VBN'),\n",
" (',', ','),\n",
" ('and', 'CC'),\n",
" ('in', 'IN'),\n",
" ('so-called', 'JJ'),\n",
" ('“', 'NNP'),\n",
" ('natural', 'JJ'),\n",
" ('”', 'NNP'),\n",
" ('spoken', 'NN'),\n",
" ('languages', 'NNS'),\n",
" ('.', '.'),\n",
" ('There', 'EX'),\n",
" ('are', 'VBP'),\n",
" ('at', 'IN'),\n",
" ('least', 'JJS'),\n",
" ('two', 'CD'),\n",
" ('layers', 'NNS'),\n",
" ('of', 'IN'),\n",
" ('formal', 'JJ'),\n",
" ('language', 'NN'),\n",
" ('in', 'IN'),\n",
" ('software', 'NN'),\n",
" (':', ':'),\n",
" ('programming', 'NN'),\n",
" ('language', 'NN'),\n",
" ('in', 'IN'),\n",
" ('which', 'WDT'),\n",
" ('the', 'DT'),\n",
" ('software', 'NN'),\n",
" ('is', 'VBZ'),\n",
" ('written', 'VBN'),\n",
" (',', ','),\n",
" ('and', 'CC'),\n",
" ('the', 'DT'),\n",
" ('language', 'NN'),\n",
" ('implemented', 'VBD'),\n",
" ('within', 'IN'),\n",
" ('the', 'DT'),\n",
" ('software', 'NN'),\n",
" ('as', 'IN'),\n",
" ('its', 'PRP$'),\n",
" ('symbolic', 'JJ'),\n",
" ('controls', 'NNS'),\n",
" ('.', '.'),\n",
" ('In', 'IN'),\n",
" ('the', 'DT'),\n",
" ('case', 'NN'),\n",
" ('of', 'IN'),\n",
" ('compilers', 'NNS'),\n",
" (',', ','),\n",
" ('shells', 'NNS'),\n",
" (',', ','),\n",
" ('and', 'CC'),\n",
" ('macro', 'NN'),\n",
" ('languages', 'NNS'),\n",
" (',', ','),\n",
" ('for', 'IN'),\n",
" ('example', 'NN'),\n",
" (',', ','),\n",
" ('these', 'DT'),\n",
" ('layers', 'NNS'),\n",
" ('can', 'MD'),\n",
" ('overlap', 'VB'),\n",
" ('.', '.'),\n",
" ('“', 'VB'),\n",
" ('Natural', 'NNP'),\n",
" ('”', 'NNP'),\n",
" ('language', 'NN'),\n",
" ('is', 'VBZ'),\n",
" ('what', 'WP'),\n",
" ('can', 'MD'),\n",
" ('be', 'VB'),\n",
" ('processed', 'VBN'),\n",
" ('as', 'IN'),\n",
" ('data', 'NNS'),\n",
" ('by', 'IN'),\n",
" ('software', 'NN'),\n",
" (';', ':'),\n",
" ('since', 'IN'),\n",
" ('this', 'DT'),\n",
" ('processing', 'NN'),\n",
" ('is', 'VBZ'),\n",
" ('formal', 'JJ'),\n",
" (',', ','),\n",
" ('however', 'RB'),\n",
" (',', ','),\n",
" ('it', 'PRP'),\n",
" ('is', 'VBZ'),\n",
" ('restricted', 'VBN'),\n",
" ('to', 'TO'),\n",
" ('syntactical', 'JJ'),\n",
" ('operations', 'NNS'),\n",
" ('.', '.'),\n",
" ('While', 'IN'),\n",
" ('differentiation', 'NN'),\n",
" ('of', 'IN'),\n",
" ('computer', 'NN'),\n",
" ('programming', 'VBG'),\n",
" ('languages', 'NNS'),\n",
" ('as', 'IN'),\n",
" ('“', 'JJ'),\n",
" ('artificial', 'JJ'),\n",
" ('languages', 'NNS'),\n",
" ('”', 'VBP'),\n",
" ('from', 'IN'),\n",
" ('languages', 'NNS'),\n",
" ('like', 'VBP'),\n",
" ('English', 'NNP'),\n",
" ('as', 'IN'),\n",
" ('“', 'NNP'),\n",
" ('natural', 'JJ'),\n",
" ('languages', 'NNS'),\n",
" ('”', 'VBP'),\n",
" ('is', 'VBZ'),\n",
" ('conceptually', 'RB'),\n",
" ('important', 'JJ'),\n",
" ('and', 'CC'),\n",
" ('undisputed', 'JJ'),\n",
" (',', ','),\n",
" ('it', 'PRP'),\n",
" ('remains', 'VBZ'),\n",
" ('problematic', 'JJ'),\n",
" ('in', 'IN'),\n",
" ('its', 'PRP$'),\n",
" ('pure', 'NN'),\n",
" ('terminology', 'NN'),\n",
" (':', ':'),\n",
" ('There', 'EX'),\n",
" ('is', 'VBZ'),\n",
" ('nothing', 'NN'),\n",
" ('“', 'JJ'),\n",
" ('natural', 'JJ'),\n",
" ('”', 'NN'),\n",
" ('about', 'IN'),\n",
" ('spoken', 'JJ'),\n",
" ('language', 'NN'),\n",
" (';', ':'),\n",
" ('it', 'PRP'),\n",
" ('is', 'VBZ'),\n",
" ('a', 'DT'),\n",
" ('cultural', 'JJ'),\n",
" ('construct', 'NN'),\n",
" ('and', 'CC'),\n",
" ('thus', 'RB'),\n",
" ('just', 'RB'),\n",
" ('as', 'IN'),\n",
" ('“', 'JJ'),\n",
" ('artificial', 'JJ'),\n",
" ('”', 'NN'),\n",
" ('as', 'IN'),\n",
" ('any', 'DT'),\n",
" ('formal', 'JJ'),\n",
" ('machine', 'NN'),\n",
" ('control', 'NN'),\n",
" ('language', 'NN'),\n",
" ('.', '.'),\n",
" ('To', 'TO'),\n",
" ('call', 'VB'),\n",
" ('programming', 'NN'),\n",
" ('languages', 'NNS'),\n",
" ('“', 'VBP'),\n",
" ('machine', 'NN'),\n",
" ('languages', 'NNS'),\n",
" ('”', 'VBP'),\n",
" ('doesn', 'JJ'),\n",
" ('', 'NNP'),\n",
" ('t', 'NN'),\n",
" ('solve', 'VBP'),\n",
" ('the', 'DT'),\n",
" ('problem', 'NN'),\n",
" ('either', 'RB'),\n",
" (',', ','),\n",
" ('as', 'IN'),\n",
" ('it', 'PRP'),\n",
" ('obscures', 'VBZ'),\n",
" ('that', 'IN'),\n",
" ('“', 'FW'),\n",
" ('machine', 'NN'),\n",
" ('languages', 'NNS'),\n",
" ('”', 'VBP'),\n",
" ('are', 'VBP'),\n",
" ('human', 'JJ'),\n",
" ('creations', 'NNS'),\n",
" ('.', '.'),\n",
" ('High-level', 'JJ'),\n",
" ('machine-independent', 'JJ'),\n",
" ('programming', 'NN'),\n",
" ('languages', 'NNS'),\n",
" ('such', 'JJ'),\n",
" ('as', 'IN'),\n",
" ('Fortran', 'NNP'),\n",
" (',', ','),\n",
" ('C', 'NNP'),\n",
" (',', ','),\n",
" ('Java', 'NNP'),\n",
" (',', ','),\n",
" ('and', 'CC'),\n",
" ('Basic', 'NNP'),\n",
" ('are', 'VBP'),\n",
" ('not', 'RB'),\n",
" ('even', 'RB'),\n",
" ('direct', 'JJ'),\n",
" ('mappings', 'NNS'),\n",
" ('of', 'IN'),\n",
" ('machine', 'NN'),\n",
" ('logic', 'NN'),\n",
" ('.', '.'),\n",
" ('If', 'IN'),\n",
" ('programming', 'JJ'),\n",
" ('languages', 'NNS'),\n",
" ('are', 'VBP'),\n",
" ('human', 'JJ'),\n",
" ('languages', 'NNS'),\n",
" ('for', 'IN'),\n",
" ('machine', 'NN'),\n",
" ('control', 'NN'),\n",
" (',', ','),\n",
" ('they', 'PRP'),\n",
" ('could', 'MD'),\n",
" ('be', 'VB'),\n",
" ('called', 'VBN'),\n",
" ('cybernetic', 'JJ'),\n",
" ('languages', 'NNS'),\n",
" ('.', '.'),\n",
" ('But', 'CC'),\n",
" ('these', 'DT'),\n",
" ('languages', 'NNS'),\n",
" ('can', 'MD'),\n",
" ('also', 'RB'),\n",
" ('be', 'VB'),\n",
" ('used', 'VBN'),\n",
" ('outside', 'JJ'),\n",
" ('machines—in', 'NN'),\n",
" ('programming', 'VBG'),\n",
" ('handbooks', 'NNS'),\n",
" (',', ','),\n",
" ('for', 'IN'),\n",
" ('example', 'NN'),\n",
" (',', ','),\n",
" ('in', 'IN'),\n",
" ('programmer', 'NN'),\n",
" ('', 'NNP'),\n",
" ('s', 'NN'),\n",
" ('dinner', 'NN'),\n",
" ('table', 'JJ'),\n",
" ('jokes', 'NNS'),\n",
" (',', ','),\n",
" ('or', 'CC'),\n",
" ('as', 'IN'),\n",
" ('abstract', 'JJ'),\n",
" ('formal', 'JJ'),\n",
" ('languages', 'NNS'),\n",
" ('for', 'IN'),\n",
" ('expressing', 'VBG'),\n",
" ('logical', 'JJ'),\n",
" ('constructs', 'NNS'),\n",
" (',', ','),\n",
" ('such', 'JJ'),\n",
" ('as', 'IN'),\n",
" ('in', 'IN'),\n",
" ('Hugh', 'NNP'),\n",
" ('Kenner', 'NNP'),\n",
" ('', 'NNP'),\n",
" ('s', 'NN'),\n",
" ('use', 'NN'),\n",
" ('of', 'IN'),\n",
" ('the', 'DT'),\n",
" ('Pascal', 'NNP'),\n",
" ('programming', 'NN'),\n",
" ('language', 'NN'),\n",
" ('to', 'TO'),\n",
" ('explain', 'VB'),\n",
" ('aspects', 'NNS'),\n",
" ('of', 'IN'),\n",
" ('the', 'DT'),\n",
" ('structure', 'NN'),\n",
" ('of', 'IN'),\n",
" ('Samuel', 'NNP'),\n",
" ('Beckett', 'NNP'),\n",
" ('', 'NNP'),\n",
" ('s', 'VBD'),\n",
" ('writing.1', 'NN'),\n",
" ('In', 'IN'),\n",
" ('this', 'DT'),\n",
" ('sense', 'NN'),\n",
" (',', ','),\n",
" ('computer', 'NN'),\n",
" ('control', 'NN'),\n",
" ('languages', 'NNS'),\n",
" ('could', 'MD'),\n",
" ('be', 'VB'),\n",
" ('more', 'RBR'),\n",
" ('broadly', 'RB'),\n",
" ('defined', 'VBN'),\n",
" ('as', 'IN'),\n",
" ('syntactical', 'JJ'),\n",
" ('languages', 'NNS'),\n",
" ('as', 'IN'),\n",
" ('opposed', 'VBN'),\n",
" ('to', 'TO'),\n",
" ('semantic', 'JJ'),\n",
" ('languages', 'NNS'),\n",
" ('.', '.'),\n",
" ('But', 'CC'),\n",
" ('this', 'DT'),\n",
" ('terminology', 'NN'),\n",
" ('is', 'VBZ'),\n",
" ('not', 'RB'),\n",
" ('without', 'IN'),\n",
" ('its', 'PRP$'),\n",
" ('problems', 'NNS'),\n",
" ('either', 'DT'),\n",
" ('.', '.'),\n",
" ('Common', 'JJ'),\n",
" ('languages', 'NNS'),\n",
" ('like', 'IN'),\n",
" ('English', 'NNP'),\n",
" ('are', 'VBP'),\n",
" ('both', 'DT'),\n",
" ('formal', 'JJ'),\n",
" ('and', 'CC'),\n",
" ('semantic', 'JJ'),\n",
" (';', ':'),\n",
" ('although', 'IN'),\n",
" ('their', 'PRP$'),\n",
" ('scope', 'NN'),\n",
" ('extends', 'VBZ'),\n",
" ('beyond', 'IN'),\n",
" ('the', 'DT'),\n",
" ('formal', 'JJ'),\n",
" (',', ','),\n",
" ('anything', 'NN'),\n",
" ('that', 'WDT'),\n",
" ('can', 'MD'),\n",
" ('be', 'VB'),\n",
" ('expressed', 'VBN'),\n",
" ('in', 'IN'),\n",
" ('a', 'DT'),\n",
" ('computer', 'NN'),\n",
" ('control', 'NN'),\n",
" ('language', 'NN'),\n",
" ('can', 'MD'),\n",
" ('also', 'RB'),\n",
" ('be', 'VB'),\n",
" ('expressed', 'VBN'),\n",
" ('in', 'IN'),\n",
" ('common', 'JJ'),\n",
" ('language', 'NN'),\n",
" ('.', '.'),\n",
" ('It', 'PRP'),\n",
" ('follows', 'VBZ'),\n",
" ('that', 'IN'),\n",
" ('computer', 'NN'),\n",
" ('control', 'NN'),\n",
" ('languages', 'NNS'),\n",
" ('are', 'VBP'),\n",
" ('a', 'DT'),\n",
" ('formal', 'JJ'),\n",
" ('(', '('),\n",
" ('and', 'CC'),\n",
" ('as', 'IN'),\n",
" ('such', 'JJ'),\n",
" ('rather', 'RB'),\n",
" ('primitive', 'JJ'),\n",
" (')', ')'),\n",
" ('subset', 'NN'),\n",
" ('of', 'IN'),\n",
" ('common', 'JJ'),\n",
" ('human', 'JJ'),\n",
" ('languages', 'NNS'),\n",
" ('.', '.'),\n",
" ('To', 'TO'),\n",
" ('complicate', 'VB'),\n",
" ('things', 'NNS'),\n",
" ('even', 'RB'),\n",
" ('further', 'RB'),\n",
" (',', ','),\n",
" ('computer', 'NN'),\n",
" ('science', 'NN'),\n",
" ('has', 'VBZ'),\n",
" ('its', 'PRP$'),\n",
" ('own', 'JJ'),\n",
" ('understanding', 'NN'),\n",
" ('of', 'IN'),\n",
" ('“', 'NNP'),\n",
" ('operational', 'JJ'),\n",
" ('semantics', 'NNS'),\n",
" ('”', 'VBP'),\n",
" ('in', 'IN'),\n",
" ('programming', 'NN'),\n",
" ('languages', 'NNS'),\n",
" (',', ','),\n",
" ('for', 'IN'),\n",
" ('example', 'NN'),\n",
" ('in', 'IN'),\n",
" ('the', 'DT'),\n",
" ('construction', 'NN'),\n",
" ('of', 'IN'),\n",
" ('a', 'DT'),\n",
" ('programming', 'JJ'),\n",
" ('language', 'NN'),\n",
" ('interpreter', 'NN'),\n",
" ('or', 'CC'),\n",
" ('compiler', 'NN'),\n",
" ('.', '.'),\n",
" ('Just', 'RB'),\n",
" ('as', 'IN'),\n",
" ('this', 'DT'),\n",
" ('interpreter', 'NN'),\n",
" ('doesn', 'NN'),\n",
" ('', 'NNP'),\n",
" ('t', 'NN'),\n",
" ('perform', 'NN'),\n",
" ('“', 'NNP'),\n",
" ('interpretations', 'NNS'),\n",
" ('”', 'VBP'),\n",
" ('in', 'IN'),\n",
" ('a', 'DT'),\n",
" ('hermeneutic', 'JJ'),\n",
" ('sense', 'NN'),\n",
" ('of', 'IN'),\n",
" ('semantic', 'JJ'),\n",
" ('text', 'NN'),\n",
" ('explication', 'NN'),\n",
" (',', ','),\n",
" ('the', 'DT'),\n",
" ('computer', 'NN'),\n",
" ('science', 'NN'),\n",
" ('notion', 'NN'),\n",
" ('of', 'IN'),\n",
" ('“', 'JJ'),\n",
" ('semantics', 'NNS'),\n",
" ('”', 'JJ'),\n",
" ('defies', 'NNS'),\n",
" ('linguistic', 'JJ'),\n",
" ('and', 'CC'),\n",
" ('common', 'JJ'),\n",
" ('sense', 'NN'),\n",
" ('understanding', 'NN'),\n",
" ('of', 'IN'),\n",
" ('the', 'DT'),\n",
" ('word', 'NN'),\n",
" (',', ','),\n",
" ('since', 'IN'),\n",
" ('compiler', 'NN'),\n",
" ('construction', 'NN'),\n",
" ('is', 'VBZ'),\n",
" ('purely', 'RB'),\n",
" ('syntactical', 'JJ'),\n",
" (',', ','),\n",
" ('and', 'CC'),\n",
" ('programming', 'VBG'),\n",
" ('languages', 'NNS'),\n",
" ('denote', 'VBP'),\n",
" ('nothing', 'NN'),\n",
" ('but', 'CC'),\n",
" ('syntactical', 'JJ'),\n",
" ('manipulations', 'NNS'),\n",
" ('of', 'IN'),\n",
" ('symbols', 'NNS'),\n",
" ('.', '.'),\n",
" ('What', 'WP'),\n",
" ('might', 'MD'),\n",
" ('more', 'JJR'),\n",
" ('suitably', 'RB'),\n",
" ('be', 'VB'),\n",
" ('called', 'VBN'),\n",
" ('the', 'DT'),\n",
" ('semantics', 'NNS'),\n",
" ('of', 'IN'),\n",
" ('computer', 'NN'),\n",
" ('control', 'NN'),\n",
" ('languages', 'VBZ'),\n",
" ('resides', 'NNS'),\n",
" ('in', 'IN'),\n",
" ('the', 'DT'),\n",
" ('symbols', 'NNS'),\n",
" ('with', 'IN'),\n",
" ('which', 'WDT'),\n",
" ('those', 'DT'),\n",
" ('operations', 'NNS'),\n",
" ('are', 'VBP'),\n",
" ('denoted', 'VBN'),\n",
" ('in', 'IN'),\n",
" ('most', 'JJS'),\n",
" ('programming', 'JJ'),\n",
" ('languages', 'NNS'),\n",
" (':', ':'),\n",
" ('English', 'JJ'),\n",
" ('words', 'NNS'),\n",
" ('like', 'IN'),\n",
" ('“', 'NN'),\n",
" ('if', 'IN'),\n",
" (',', ','),\n",
" ('”', 'FW'),\n",
" ('“', 'FW'),\n",
" ('then', 'RB'),\n",
" (',', ','),\n",
" ('”', 'NNP'),\n",
" ('“', 'NNP'),\n",
" ('else', 'RB'),\n",
" (',', ','),\n",
" ('”', 'NNP'),\n",
" ('“', 'NNP'),\n",
" ('for', 'IN'),\n",
" (',', ','),\n",
" ('”', 'NNP'),\n",
" ('“', 'NNP'),\n",
" ('while', 'IN'),\n",
" (',', ','),\n",
" ('”', 'FW'),\n",
" ('“', 'NNP'),\n",
" ('goto', 'NN'),\n",
" (',', ','),\n",
" ('”', 'NNP'),\n",
" ('and', 'CC'),\n",
" ('“', 'NNP'),\n",
" ('print', 'NN'),\n",
" (',', ','),\n",
" ('”', 'NN'),\n",
" ('in', 'IN'),\n",
" ('conjunction', 'NN'),\n",
" ('with', 'IN'),\n",
" ('arithmetical', 'JJ'),\n",
" ('and', 'CC'),\n",
" ('punctuation', 'NN'),\n",
" ('symbols', 'NNS'),\n",
" (';', ':'),\n",
" ('in', 'IN'),\n",
" ('alphabetic', 'JJ'),\n",
" ('software', 'NN'),\n",
" ('controls', 'NNS'),\n",
" (',', ','),\n",
" ('words', 'NNS'),\n",
" ('like', 'IN'),\n",
" ('“', 'NNP'),\n",
" ('list', 'NN'),\n",
" (',', ','),\n",
" ('”', 'NNP'),\n",
" ('“', 'NNP'),\n",
" ('move', 'NN'),\n",
" (',', ','),\n",
" ('”', 'NNP'),\n",
" ('“', 'NNP'),\n",
" ('copy', 'NN'),\n",
" (',', ','),\n",
" ('”', 'NN'),\n",
" ('and', 'CC'),\n",
" ('“', 'NNP'),\n",
" ('paste', 'NN'),\n",
" ('”', 'NN'),\n",
" (';', ':'),\n",
" ('in', 'IN'),\n",
" ('graphical', 'JJ'),\n",
" ('software', 'NN'),\n",
" ('controls', 'NNS'),\n",
" (',', ','),\n",
" ('such', 'JJ'),\n",
" ('as', 'IN'),\n",
" ('symbols', 'NNS'),\n",
" ('like', 'IN'),\n",
" ('the', 'DT'),\n",
" ('trash', 'NN'),\n",
" ('can', 'MD'),\n",
" ('.', '.'),\n",
" ('Ferdinand', 'NNP'),\n",
" ('de', 'IN'),\n",
" ('Saussure', 'NNP'),\n",
" ('states', 'VBZ'),\n",
" ('that', 'IN'),\n",
" ('the', 'DT'),\n",
" ('signs', 'NNS'),\n",
" ('of', 'IN'),\n",
" ('common', 'JJ'),\n",
" ('human', 'JJ'),\n",
" ('language', 'NN'),\n",
" ('are', 'VBP'),\n",
" ('arbitrary2', 'RB'),\n",
" ('because', 'IN'),\n",
" ('it', 'PRP'),\n",
" ('', 'VBZ'),\n",
" ('s', 'JJ'),\n",
" ('purely', 'RB'),\n",
" ('a', 'DT'),\n",
" ('cultural-social', 'JJ'),\n",
" ('convention', 'NN'),\n",
" ('that', 'IN'),\n",
" ('assigns', 'VBZ'),\n",
" ('phonemes', 'NNS'),\n",
" ('to', 'TO'),\n",
" ('concepts', 'NNS'),\n",
" ('.', '.'),\n",
" ('Likewise', 'NNP'),\n",
" (',', ','),\n",
" ('it', 'PRP'),\n",
" ('', 'VBZ'),\n",
" ('s', 'JJ'),\n",
" ('purely', 'RB'),\n",
" ('a', 'DT'),\n",
" ('cultural', 'JJ'),\n",
" ('convention', 'NN'),\n",
" ('to', 'TO'),\n",
" ('assign', 'VB'),\n",
" ('symbols', 'NNS'),\n",
" ('to', 'TO'),\n",
" ('machine', 'NN'),\n",
" ('operations', 'NNS'),\n",
" ('.', '.'),\n",
" ('But', 'CC'),\n",
" ('just', 'RB'),\n",
" ('as', 'IN'),\n",
" ('the', 'DT'),\n",
" ('cultural', 'JJ'),\n",
" ('choice', 'NN'),\n",
" ('of', 'IN'),\n",
" ('phonemes', 'NNS'),\n",
" ('in', 'IN'),\n",
" ('spoken', 'JJ'),\n",
" ('language', 'NN'),\n",
" ('is', 'VBZ'),\n",
" ('restrained', 'VBN'),\n",
" ('by', 'IN'),\n",
" ('what', 'WP'),\n",
" ('the', 'DT'),\n",
" ('human', 'JJ'),\n",
" ('voice', 'NN'),\n",
" ('can', 'MD'),\n",
" ('pronounce', 'VB'),\n",
" (',', ','),\n",
" ('the', 'DT'),\n",
" ('assignment', 'NN'),\n",
" ('of', 'IN'),\n",
" ('symbols', 'NNS'),\n",
" ('to', 'TO'),\n",
" ('machine', 'NN'),\n",
" ('operations', 'NNS'),\n",
" ('is', 'VBZ'),\n",
" ('limited', 'VBN'),\n",
" ('to', 'TO'),\n",
" ('what', 'WP'),\n",
" ('can', 'MD'),\n",
" ('be', 'VB'),\n",
" ('efficiently', 'RB'),\n",
" ('processed', 'VBN'),\n",
" ('by', 'IN'),\n",
" ('the', 'DT'),\n",
" ('machine', 'NN'),\n",
" ('and', 'CC'),\n",
" ('of', 'IN'),\n",
" ('good', 'JJ'),\n",
" ('use', 'NN'),\n",
" ('to', 'TO'),\n",
" ('humans.3', 'VB'),\n",
" ('This', 'DT'),\n",
" ('compromise', 'NN'),\n",
" ('between', 'IN'),\n",
" ('operability', 'NN'),\n",
" ('and', 'CC'),\n",
" ('usability', 'NN'),\n",
" ('is', 'VBZ'),\n",
" ('obvious', 'JJ'),\n",
" ('in', 'IN'),\n",
" (',', ','),\n",
" ('for', 'IN'),\n",
" ('example', 'NN'),\n",
" (',', ','),\n",
" ('Unix', 'NNP'),\n",
" ('commands', 'VBZ'),\n",
" ('.', '.'),\n",
" ('Originally', 'RB'),\n",
" ('used', 'VBN'),\n",
" ('on', 'IN'),\n",
" ('teletype', 'NN'),\n",
" ('terminals', 'NNS'),\n",
" (',', ','),\n",
" ('the', 'DT'),\n",
" ('operation', 'NN'),\n",
" ('“', 'NNP'),\n",
" ('copy', 'NN'),\n",
" ('”', 'NN'),\n",
" ('was', 'VBD'),\n",
" ('abbreviated', 'VBN'),\n",
" ('to', 'TO'),\n",
" ('the', 'DT'),\n",
" ('command', 'NN'),\n",
" ('“', 'NNP'),\n",
" ('cp', 'NN'),\n",
" (',', ','),\n",
" ('”', 'NNP'),\n",
" ('“', 'NNP'),\n",
" ('move', 'NN'),\n",
" ('”', 'NN'),\n",
" ('to', 'TO'),\n",
" ('“', 'VB'),\n",
" ('mv', 'NN'),\n",
" (',', ','),\n",
" ('”', 'NNP'),\n",
" ('“', 'NNP'),\n",
" ('list', 'NN'),\n",
" ('”', 'NN'),\n",
" ('to', 'TO'),\n",
" ('“', 'VB'),\n",
" ('ls', 'NN'),\n",
" (',', ','),\n",
" ('”', 'NNP'),\n",
" ('etc.', 'NN'),\n",
" (',', ','),\n",
" ('in', 'IN'),\n",
" ('order', 'NN'),\n",
" ('to', 'TO'),\n",
" ('cut', 'VB'),\n",
" ('down', 'RP'),\n",
" ('machine', 'NN'),\n",
" ('memory', 'NN'),\n",
" ('use', 'NN'),\n",
" (',', ','),\n",
" ('teletype', 'JJ'),\n",
" ('paper', 'NN'),\n",
" ('consumption', 'NN'),\n",
" (',', ','),\n",
" ('and', 'CC'),\n",
" ('human', 'JJ'),\n",
" ('typing', 'VBG'),\n",
" ('effort', 'NN'),\n",
" ('at', 'IN'),\n",
" ('the', 'DT'),\n",
" ('same', 'JJ'),\n",
" ('time', 'NN'),\n",
" ('.', '.'),\n",
" ('Any', 'DT'),\n",
" ('computer', 'NN'),\n",
" ('control', 'NN'),\n",
" ('language', 'NN'),\n",
" ('is', 'VBZ'),\n",
" ('thus', 'RB'),\n",
" ('a', 'DT'),\n",
" ('cultural', 'JJ'),\n",
" ('compromise', 'NN'),\n",
" ('between', 'IN'),\n",
" ('the', 'DT'),\n",
" ('constraints', 'NNS'),\n",
" ('of', 'IN'),\n",
" ('machine', 'NN'),\n",
" ('design—which', 'NN'),\n",
" ('is', 'VBZ'),\n",
" ('far', 'RB'),\n",
" ('from', 'IN'),\n",
" ('objective', 'JJ'),\n",
" (',', ','),\n",
" ('but', 'CC'),\n",
" ('based', 'VBN'),\n",
" ('on', 'IN'),\n",
" ('human', 'JJ'),\n",
" ('choices', 'NNS'),\n",
" (',', ','),\n",
" ('culture', 'NN'),\n",
" (',', ','),\n",
" ('and', 'CC'),\n",
" ('thinking', 'VBG'),\n",
" ('style', 'NN'),\n",
" ('itself', 'PRP'),\n",
" ('4—and', 'CD'),\n",
" ('the', 'DT'),\n",
" ('equally', 'RB'),\n",
" ('subjective', 'JJ'),\n",
" ('user', 'NN'),\n",
" ('preferences', 'NNS'),\n",
" (',', ','),\n",
" ('involving', 'VBG'),\n",
" ('fuzzy', 'JJ'),\n",
" ('factors', 'NNS'),\n",
" ('like', 'IN'),\n",
" ('readability', 'NN'),\n",
" (',', ','),\n",
" ('elegance', 'NN'),\n",
" (',', ','),\n",
" ('and', 'CC'),\n",
" ('usage', 'JJ'),\n",
" ('efficiency', 'NN'),\n",
" ('.', '.'),\n",
" ('The', 'DT'),\n",
" ('symbols', 'NNS'),\n",
" ('of', 'IN'),\n",
" ('computer', 'NN'),\n",
" ('control', 'NN'),\n",
" ('languages', 'VBZ'),\n",
" ('inevitably', 'RB'),\n",
" ('do', 'VBP'),\n",
" ('have', 'VB'),\n",
" ('semantic', 'JJ'),\n",
" ('connotations', 'NNS'),\n",
" ('simply', 'RB'),\n",
" ('because', 'IN'),\n",
" ('there', 'EX'),\n",
" ('exist', 'VBP'),\n",
" ('no', 'DT'),\n",
" ('symbols', 'NNS'),\n",
" ('with', 'IN'),\n",
" ('which', 'WDT'),\n",
" ('humans', 'NNS'),\n",
" ('would', 'MD'),\n",
" ('not', 'RB'),\n",
" ('associate', 'VB'),\n",
" ('some', 'DT'),\n",
" ('meaning', 'NN'),\n",
" ('.', '.'),\n",
" ('But', 'CC'),\n",
" ('symbols', 'NNS'),\n",
" ('can', 'MD'),\n",
" ('', 'VB'),\n",
" ('t', 'JJ'),\n",
" ('denote', 'NN'),\n",
" ('any', 'DT'),\n",
" ('semantic', 'JJ'),\n",
" ('statements', 'NNS'),\n",
" (',', ','),\n",
" ('that', 'DT'),\n",
" ('is', 'VBZ'),\n",
" (',', ','),\n",
" ('they', 'PRP'),\n",
" ('do', 'VBP'),\n",
" ('not', 'RB'),\n",
" ('express', 'VB'),\n",
" ('meaning', 'VBG'),\n",
" ('in', 'IN'),\n",
" ('their', 'PRP$'),\n",
" ('own', 'JJ'),\n",
" ('terms', 'NNS'),\n",
" (';', ':'),\n",
" ('humans', 'NNS'),\n",
" ('metaphorically', 'RB'),\n",
" ('read', 'VB'),\n",
" ('meaning', 'VBG'),\n",
" ('into', 'IN'),\n",
" ('them', 'PRP'),\n",
" ('through', 'IN'),\n",
" ('associations', 'NNS'),\n",
" ('they', 'PRP'),\n",
" ('make', 'VBP'),\n",
" ('.', '.'),\n",
" ('Languages', 'NNS'),\n",
" ('without', 'IN'),\n",
" ('semantic', 'JJ'),\n",
" ('denotation', 'NN'),\n",
" ('are', 'VBP'),\n",
" ('not', 'RB'),\n",
" ('historically', 'RB'),\n",
" ('new', 'JJ'),\n",
" ('phenomena', 'NNS'),\n",
" (';', ':'),\n",
" ('mathematical', 'JJ'),\n",
" ('formulas', 'NNS'),\n",
" ('are', 'VBP'),\n",
" ('their', 'PRP$'),\n",
" ('oldest', 'JJS'),\n",
" ('example', 'NN'),\n",
" ('.', '.'),\n",
" ('In', 'IN'),\n",
" ('comparison', 'NN'),\n",
" ('to', 'TO'),\n",
" ('common', 'JJ'),\n",
" ('human', 'JJ'),\n",
" ('languages', 'NNS'),\n",
" (',', ','),\n",
" ('the', 'DT'),\n",
" ('multitude', 'NN'),\n",
" ('of', 'IN'),\n",
" ('programming', 'VBG'),\n",
" ('languages', 'NNS'),\n",
" ('is', 'VBZ'),\n",
" ('of', 'IN'),\n",
" ('lesser', 'JJR'),\n",
" ('significance', 'NN'),\n",
" ('.', '.'),\n",
" ('The', 'DT'),\n",
" ('criterion', 'NN'),\n",
" ('of', 'IN'),\n",
" ('Turing', 'NNP'),\n",
" ('completeness', 'NN'),\n",
" ('of', 'IN'),\n",
" ('a', 'DT'),\n",
" ('programming', 'NN'),\n",
" ('language', 'NN'),\n",
" (',', ','),\n",
" ('that', 'WDT'),\n",
" ('is', 'VBZ'),\n",
" (',', ','),\n",
" ('that', 'IN'),\n",
" ('any', 'DT'),\n",
" ('computation', 'NN'),\n",
" ('can', 'MD'),\n",
" ('be', 'VB'),\n",
" ('expressed', 'VBN'),\n",
" ('in', 'IN'),\n",
" ('it', 'PRP'),\n",
" (',', ','),\n",
" ('means', 'VBZ'),\n",
" ('that', 'IN'),\n",
" ('every', 'DT'),\n",
" ('programming', 'NN'),\n",
" ('language', 'NN'),\n",
" ('is', 'VBZ'),\n",
" (',', ','),\n",
" ('formally', 'RB'),\n",
" ('speaking', 'VBG'),\n",
" (',', ','),\n",
" ('just', 'RB'),\n",
" ('a', 'DT'),\n",
" ('riff', 'NN'),\n",
" ('on', 'IN'),\n",
" ('every', 'DT'),\n",
" ('other', 'JJ'),\n",
" ('programming', 'NN'),\n",
" ('language', 'NN'),\n",
" ('.', '.'),\n",
" ('Nothing', 'NN'),\n",
" ('can', 'MD'),\n",
" ('be', 'VB'),\n",
" ('expressed', 'VBN'),\n",
" ('in', 'IN'),\n",
" ('a', 'DT'),\n",
" ('Turingcomplete', 'JJ'),\n",
" ('language', 'NN'),\n",
" ('such', 'JJ'),\n",
" ('as', 'IN'),\n",
" ('C', 'NNP'),\n",
" ('that', 'IN'),\n",
" ('couldn', 'NN'),\n",
" ('', 'NNP'),\n",
" ('t', 'NN'),\n",
" ('also', 'RB'),\n",
" ('be', 'VB'),\n",
" ('expressed', 'VBN'),\n",
" ('in', 'IN'),\n",
" ('another', 'DT'),\n",
" ('Turingcomplete', 'NNP'),\n",
" ('language', 'NN'),\n",
" ('such', 'JJ'),\n",
" ('as', 'IN'),\n",
" ('Lisp', 'NNP'),\n",
" ('(', '('),\n",
" ('or', 'CC'),\n",
" ('Fortran', 'NNP'),\n",
" (',', ','),\n",
" ('Smalltalk', 'NNP'),\n",
" (',', ','),\n",
" ('Java', 'NNP'),\n",
" ('...', ':'),\n",
" (')', ')'),\n",
" ('and', 'CC'),\n",
" ('vice', 'NN'),\n",
" ('versa', 'NN'),\n",
" ('.', '.'),\n",
" ('This', 'DT'),\n",
" ('ultimately', 'JJ'),\n",
" ('proves', 'VBZ'),\n",
" ('the', 'DT'),\n",
" ...]"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tagged"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"words = \"in the beginning was heaven and earth and the time of the whatever\".split()"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['in',\n",
" 'the',\n",
" 'beginning',\n",
" 'was',\n",
" 'heaven',\n",
" 'and',\n",
" 'earth',\n",
" 'and',\n",
" 'the',\n",
" 'time',\n",
" 'of',\n",
" 'the',\n",
" 'whatever']"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"words"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"words.index(\"the\")"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"IN\n",
"1 the\n",
"BEGINNING\n",
"WAS\n",
"HEAVEN\n",
"AND\n",
"EARTH\n",
"AND\n",
"8 the\n",
"TIME\n",
"OF\n",
"11 the\n",
"WHATEVER\n"
]
}
],
"source": [
"for i, word in enumerate(words):\n",
" if word == \"the\":\n",
" print (i, word)\n",
" else:\n",
" print (word.upper())"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'in'"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import random \n",
"\n",
"words = {}\n",
"words[\"VB\"] = []\n",
"\n",
"for word in nltk.word_tokenize(\"in the beginning was heaven and earth and the time of the whatever\"):\n",
" words[\"VB\"].append(word)\n",
" \n",
"random.choice(words[\"VB\"])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}