You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
511 lines
13 KiB
Plaintext
511 lines
13 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"# NLTK - Part of Speech"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import nltk\n",
|
|
"import random"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"lines = open('../txt/language.txt').readlines()\n",
|
|
"sentence = random.choice(lines)\n",
|
|
"print(sentence)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Tokens"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"tokens = nltk.word_tokenize(sentence)\n",
|
|
"print(tokens)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Part of Speech \"tags\""
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"tagged = nltk.pos_tag(tokens)\n",
|
|
"print(tagged)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"Now you could select, for example, all the types of **verbs**:"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"selection = []\n",
|
|
"\n",
|
|
"for word, tag in tagged:\n",
|
|
" if 'VB' in tag:\n",
|
|
" selection.append(word)\n",
|
|
"\n",
|
|
"print(selection)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Where do these tags come from?"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"> An off-the-shelf tagger is available for English. It uses the Penn Treebank tagset.\n",
|
|
"\n",
|
|
"From: http://www.nltk.org/api/nltk.tag.html#module-nltk.tag"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"> NLTK provides documentation for each tag, which can be queried using the tag, e.g. nltk.help.upenn_tagset('RB').\n",
|
|
"\n",
|
|
"From: http://www.nltk.org/book_1ed/ch05.html"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"nltk.help.upenn_tagset('PRP')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"------------"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"An alphabetical list of part-of-speech tags used in the Penn Treebank Project ([link](https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html)):\n",
|
|
"\n",
|
|
"<table cellspacing=\"2\" cellpadding=\"2\" border=\"0\">\n",
|
|
" <tbody><tr bgcolor=\"#DFDFFF\" align=\"none\"> \n",
|
|
" <td align=\"none\"> \n",
|
|
" <div align=\"left\">Number</div>\n",
|
|
" </td>\n",
|
|
" <td> \n",
|
|
" <div align=\"left\">Tag</div>\n",
|
|
" </td>\n",
|
|
" <td> \n",
|
|
" <div align=\"left\">Description</div>\n",
|
|
" </td>\n",
|
|
" </tr>\n",
|
|
" <tr bgcolor=\"#FFFFCA\"> \n",
|
|
" <td align=\"none\"> 1. </td>\n",
|
|
" <td>CC </td>\n",
|
|
" <td>Coordinating conjunction </td>\n",
|
|
" </tr>\n",
|
|
" <tr bgcolor=\"#FFFFCA\"> \n",
|
|
" <td align=\"none\"> 2. </td>\n",
|
|
" <td>CD </td>\n",
|
|
" <td>Cardinal number </td>\n",
|
|
" </tr>\n",
|
|
" <tr bgcolor=\"#FFFFCA\"> \n",
|
|
" <td align=\"none\"> 3. </td>\n",
|
|
" <td>DT </td>\n",
|
|
" <td>Determiner </td>\n",
|
|
" </tr>\n",
|
|
" <tr bgcolor=\"#FFFFCA\"> \n",
|
|
" <td align=\"none\"> 4. </td>\n",
|
|
" <td>EX </td>\n",
|
|
" <td>Existential <i>there</i> </td>\n",
|
|
" </tr>\n",
|
|
" <tr bgcolor=\"#FFFFCA\"> \n",
|
|
" <td align=\"none\"> 5. </td>\n",
|
|
" <td>FW </td>\n",
|
|
" <td>Foreign word </td>\n",
|
|
" </tr>\n",
|
|
" <tr bgcolor=\"#FFFFCA\"> \n",
|
|
" <td align=\"none\"> 6. </td>\n",
|
|
" <td>IN </td>\n",
|
|
" <td>Preposition or subordinating conjunction </td>\n",
|
|
" </tr>\n",
|
|
" <tr bgcolor=\"#FFFFCA\"> \n",
|
|
" <td align=\"none\"> 7. </td>\n",
|
|
" <td>JJ </td>\n",
|
|
" <td>Adjective </td>\n",
|
|
" </tr>\n",
|
|
" <tr bgcolor=\"#FFFFCA\"> \n",
|
|
" <td align=\"none\"> 8. </td>\n",
|
|
" <td>JJR </td>\n",
|
|
" <td>Adjective, comparative </td>\n",
|
|
" </tr>\n",
|
|
" <tr bgcolor=\"#FFFFCA\"> \n",
|
|
" <td align=\"none\"> 9. </td>\n",
|
|
" <td>JJS </td>\n",
|
|
" <td>Adjective, superlative </td>\n",
|
|
" </tr>\n",
|
|
" <tr bgcolor=\"#FFFFCA\"> \n",
|
|
" <td align=\"none\"> 10. </td>\n",
|
|
" <td>LS </td>\n",
|
|
" <td>List item marker </td>\n",
|
|
" </tr>\n",
|
|
" <tr bgcolor=\"#FFFFCA\"> \n",
|
|
" <td align=\"none\"> 11. </td>\n",
|
|
" <td>MD </td>\n",
|
|
" <td>Modal </td>\n",
|
|
" </tr>\n",
|
|
" <tr bgcolor=\"#FFFFCA\"> \n",
|
|
" <td align=\"none\"> 12. </td>\n",
|
|
" <td>NN </td>\n",
|
|
" <td>Noun, singular or mass </td>\n",
|
|
" </tr>\n",
|
|
" <tr bgcolor=\"#FFFFCA\"> \n",
|
|
" <td align=\"none\"> 13. </td>\n",
|
|
" <td>NNS </td>\n",
|
|
" <td>Noun, plural </td>\n",
|
|
" </tr>\n",
|
|
" <tr bgcolor=\"#FFFFCA\"> \n",
|
|
" <td align=\"none\"> 14. </td>\n",
|
|
" <td>NNP </td>\n",
|
|
" <td>Proper noun, singular </td>\n",
|
|
" </tr>\n",
|
|
" <tr bgcolor=\"#FFFFCA\"> \n",
|
|
" <td align=\"none\"> 15. </td>\n",
|
|
" <td>NNPS </td>\n",
|
|
" <td>Proper noun, plural </td>\n",
|
|
" </tr>\n",
|
|
" <tr bgcolor=\"#FFFFCA\"> \n",
|
|
" <td align=\"none\"> 16. </td>\n",
|
|
" <td>PDT </td>\n",
|
|
" <td>Predeterminer </td>\n",
|
|
" </tr>\n",
|
|
" <tr bgcolor=\"#FFFFCA\"> \n",
|
|
" <td align=\"none\"> 17. </td>\n",
|
|
" <td>POS </td>\n",
|
|
" <td>Possessive ending </td>\n",
|
|
" </tr>\n",
|
|
" <tr bgcolor=\"#FFFFCA\"> \n",
|
|
" <td align=\"none\"> 18. </td>\n",
|
|
" <td>PRP </td>\n",
|
|
" <td>Personal pronoun </td>\n",
|
|
" </tr>\n",
|
|
" <tr bgcolor=\"#FFFFCA\"> \n",
|
|
" <td align=\"none\"> 19. </td>\n",
|
|
" <td>PRP\\$ </td>\n",
|
|
" <td>Possessive pronoun </td>\n",
|
|
" </tr>\n",
|
|
" <tr bgcolor=\"#FFFFCA\"> \n",
|
|
" <td align=\"none\"> 20. </td>\n",
|
|
" <td>RB </td>\n",
|
|
" <td>Adverb </td>\n",
|
|
" </tr>\n",
|
|
" <tr bgcolor=\"#FFFFCA\"> \n",
|
|
" <td align=\"none\"> 21. </td>\n",
|
|
" <td>RBR </td>\n",
|
|
" <td>Adverb, comparative </td>\n",
|
|
" </tr>\n",
|
|
" <tr bgcolor=\"#FFFFCA\"> \n",
|
|
" <td align=\"none\"> 22. </td>\n",
|
|
" <td>RBS </td>\n",
|
|
" <td>Adverb, superlative </td>\n",
|
|
" </tr>\n",
|
|
" <tr bgcolor=\"#FFFFCA\"> \n",
|
|
" <td align=\"none\"> 23. </td>\n",
|
|
" <td>RP </td>\n",
|
|
" <td>Particle </td>\n",
|
|
" </tr>\n",
|
|
" <tr bgcolor=\"#FFFFCA\"> \n",
|
|
" <td align=\"none\"> 24. </td>\n",
|
|
" <td>SYM </td>\n",
|
|
" <td>Symbol </td>\n",
|
|
" </tr>\n",
|
|
" <tr bgcolor=\"#FFFFCA\"> \n",
|
|
" <td align=\"none\"> 25. </td>\n",
|
|
" <td>TO </td>\n",
|
|
" <td><i>to</i> </td>\n",
|
|
" </tr>\n",
|
|
" <tr bgcolor=\"#FFFFCA\"> \n",
|
|
" <td align=\"none\"> 26. </td>\n",
|
|
" <td>UH </td>\n",
|
|
" <td>Interjection </td>\n",
|
|
" </tr>\n",
|
|
" <tr bgcolor=\"#FFFFCA\"> \n",
|
|
" <td align=\"none\"> 27. </td>\n",
|
|
" <td>VB </td>\n",
|
|
" <td>Verb, base form </td>\n",
|
|
" </tr>\n",
|
|
" <tr bgcolor=\"#FFFFCA\"> \n",
|
|
" <td align=\"none\"> 28. </td>\n",
|
|
" <td>VBD </td>\n",
|
|
" <td>Verb, past tense </td>\n",
|
|
" </tr>\n",
|
|
" <tr bgcolor=\"#FFFFCA\"> \n",
|
|
" <td align=\"none\"> 29. </td>\n",
|
|
" <td>VBG </td>\n",
|
|
" <td>Verb, gerund or present participle </td>\n",
|
|
" </tr>\n",
|
|
" <tr bgcolor=\"#FFFFCA\"> \n",
|
|
" <td align=\"none\"> 30. </td>\n",
|
|
" <td>VBN </td>\n",
|
|
" <td>Verb, past participle </td>\n",
|
|
" </tr>\n",
|
|
" <tr bgcolor=\"#FFFFCA\"> \n",
|
|
" <td align=\"none\"> 31. </td>\n",
|
|
" <td>VBP </td>\n",
|
|
" <td>Verb, non-3rd person singular present </td>\n",
|
|
" </tr>\n",
|
|
" <tr bgcolor=\"#FFFFCA\"> \n",
|
|
" <td align=\"none\"> 32. </td>\n",
|
|
" <td>VBZ </td>\n",
|
|
" <td>Verb, 3rd person singular present </td>\n",
|
|
" </tr>\n",
|
|
" <tr bgcolor=\"#FFFFCA\"> \n",
|
|
" <td align=\"none\"> 33. </td>\n",
|
|
" <td>WDT </td>\n",
|
|
" <td>Wh-determiner </td>\n",
|
|
" </tr>\n",
|
|
" <tr bgcolor=\"#FFFFCA\"> \n",
|
|
" <td align=\"none\"> 34. </td>\n",
|
|
" <td>WP </td>\n",
|
|
" <td>Wh-pronoun </td>\n",
|
|
" </tr>\n",
|
|
" <tr bgcolor=\"#FFFFCA\"> \n",
|
|
" <td align=\"none\"> 35. </td>\n",
|
|
" <td>WP\\$ </td>\n",
|
|
" <td>Possessive wh-pronoun </td>\n",
|
|
" </tr>\n",
|
|
" <tr bgcolor=\"#FFFFCA\"> \n",
|
|
" <td align=\"none\"> 36. </td>\n",
|
|
" <td>WRB </td>\n",
|
|
" <td>Wh-adverb \n",
|
|
"</td></tr></tbody></table>"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## A telling/tricky case\n",
|
|
"It's important to realize that a POS tag is not a fixed property of a word -- it depends on the context in which the word is used. The NLTK book gives an example of [homonyms](http://www.nltk.org/book_1ed/ch05.html#using-a-tagger) -- words that are written the same but are pronounced differently and have different meanings depending on their use."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"text = nltk.word_tokenize(\"They refuse to permit us to obtain the refuse permit\")\n",
|
|
"nltk.pos_tag(text)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"From the book:\n",
|
|
"\n",
|
|
"> Notice that refuse and permit both appear as a present tense verb (VBP) and a noun (NN). E.g. refUSE is a verb meaning \"deny,\" while REFuse is a noun meaning \"trash\" (i.e. they are not homophones). Thus, we need to know which word is being used in order to pronounce the text correctly. (For this reason, text-to-speech systems usually perform POS-tagging.)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Applying to an entire text"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"language = open('../txt/language.txt').read()\n",
|
|
"tokens = nltk.word_tokenize(language)\n",
|
|
"tagged = nltk.pos_tag(tokens)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"tagged"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 1,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"words = \"in the beginning was heaven and earth and the time of the whatever\".split()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"words"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 2,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"1"
|
|
]
|
|
},
|
|
"execution_count": 2,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"words.index(\"the\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 3,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"IN\n",
|
|
"1 the\n",
|
|
"BEGINNING\n",
|
|
"WAS\n",
|
|
"HEAVEN\n",
|
|
"AND\n",
|
|
"EARTH\n",
|
|
"AND\n",
|
|
"8 the\n",
|
|
"TIME\n",
|
|
"OF\n",
|
|
"11 the\n",
|
|
"WHATEVER\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"for i, word in enumerate(words):\n",
|
|
" if word == \"the\":\n",
|
|
" print (i, word)\n",
|
|
" else:\n",
|
|
" print (word.upper())"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 8,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"'VB'"
|
|
]
|
|
},
|
|
"execution_count": 8,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"import random \n",
|
|
"\n",
|
|
"words = {}\n",
|
|
"words[\"VB\"] = []\n",
|
|
"\n",
|
|
"for word in nltk.word_tokenize(\"in the beginning was heaven and earth and the time of the whatever\"):\n",
|
|
" words[\"VB\"].append(word)\n",
|
|
" \n",
|
|
"random.choice(words[\"VB\"])"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": []
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.7.3"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 4
|
|
}
|