{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# NLTK - Part of Speech" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import nltk\n", "import random" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "To complicate things even further, computer science has its own understanding of “operational semantics” in programming languages, for example in the construction of a programming language interpreter or compiler.\n", "\n" ] } ], "source": [ "# The text contains non-ASCII characters (e.g. curly quotes), so read it with an explicit\n", "# encoding (assumed to be UTF-8) instead of the platform default; the `with` block\n", "# closes the file handle as soon as the lines have been read.\n", "with open('txt/language.txt', encoding='utf-8') as language_file:\n", "    lines = language_file.readlines()\n", "sentence = random.choice(lines)\n", "print(sentence)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Tokens" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['To', 'complicate', 'things', 'even', 'further', ',', 'computer', 'science', 'has', 'its', 'own', 'understanding', 'of', '“', 'operational', 'semantics', '”', 'in', 'programming', 'languages', ',', 'for', 'example', 'in', 'the', 'construction', 'of', 'a', 'programming', 'language', 'interpreter', 'or', 'compiler', '.']\n" ] } ], "source": [ "tokens = nltk.word_tokenize(sentence)\n", "print(tokens)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Part of Speech \"tags\"" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[('To', 'TO'), ('complicate', 'VB'), ('things', 'NNS'), ('even', 'RB'), ('further', 'RB'), (',', ','), ('computer', 'NN'), ('science', 'NN'), ('has', 'VBZ'), ('its', 'PRP$'), ('own', 'JJ'), ('understanding', 'NN'), ('of', 'IN'), ('“', 'NNP'), ('operational', 'JJ'), ('semantics', 'NNS'), ('”', 'VBP'), ('in', 'IN'), ('programming', 'NN'), ('languages', 'NNS'), (',', ','), ('for', 'IN'), ('example', 'NN'), ('in', 'IN'), ('the', 'DT'), ('construction', 'NN'), 
('of', 'IN'), ('a', 'DT'), ('programming', 'JJ'), ('language', 'NN'), ('interpreter', 'NN'), ('or', 'CC'), ('compiler', 'NN'), ('.', '.')]\n" ] } ], "source": [ "tagged = nltk.pos_tag(tokens)\n", "print(tagged)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now you could select, for example, all types of **verbs**:" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['complicate', 'has', '”']\n" ] } ], "source": [ "# Keep every token whose tag is one of the verb tags (VB, VBD, VBG, VBN, VBP, VBZ).\n", "# The tagger can mis-tag punctuation (here ” comes back as VBP), so such tokens\n", "# also end up in the selection.\n", "selection = [word for word, tag in tagged if tag.startswith('VB')]\n", "\n", "print(selection)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Where do these tags come from?" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "> An off-the-shelf tagger is available for English. It uses the Penn Treebank tagset.\n", "\n", "From: http://www.nltk.org/api/nltk.tag.html#module-nltk.tag" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "> NLTK provides documentation for each tag, which can be queried using the tag, e.g. nltk.help.upenn_tagset('RB').\n", "\n", "From: http://www.nltk.org/book_1ed/ch05.html" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "PRP: pronoun, personal\n", " hers herself him himself hisself it itself me myself one oneself ours\n", " ourselves ownself self she thee theirs them themselves they thou thy us\n" ] } ], "source": [ "nltk.help.upenn_tagset('PRP')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "------------" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "An alphabetical list of part-of-speech tags used in the Penn Treebank Project ([link](https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html)):\n", "\n", "
\n",
" Number \n",
" | \n",
" \n",
" Tag \n",
" | \n",
" \n",
" Description \n",
" | \n",
"
1. | \n", "CC | \n", "Coordinating conjunction | \n", "
2. | \n", "CD | \n", "Cardinal number | \n", "
3. | \n", "DT | \n", "Determiner | \n", "
4. | \n", "EX | \n", "Existential there | \n", "
5. | \n", "FW | \n", "Foreign word | \n", "
6. | \n", "IN | \n", "Preposition or subordinating conjunction | \n", "
7. | \n", "JJ | \n", "Adjective | \n", "
8. | \n", "JJR | \n", "Adjective, comparative | \n", "
9. | \n", "JJS | \n", "Adjective, superlative | \n", "
10. | \n", "LS | \n", "List item marker | \n", "
11. | \n", "MD | \n", "Modal | \n", "
12. | \n", "NN | \n", "Noun, singular or mass | \n", "
13. | \n", "NNS | \n", "Noun, plural | \n", "
14. | \n", "NNP | \n", "Proper noun, singular | \n", "
15. | \n", "NNPS | \n", "Proper noun, plural | \n", "
16. | \n", "PDT | \n", "Predeterminer | \n", "
17. | \n", "POS | \n", "Possessive ending | \n", "
18. | \n", "PRP | \n", "Personal pronoun | \n", "
19. | \n", "PRP\\$ | \n", "Possessive pronoun | \n", "
20. | \n", "RB | \n", "Adverb | \n", "
21. | \n", "RBR | \n", "Adverb, comparative | \n", "
22. | \n", "RBS | \n", "Adverb, superlative | \n", "
23. | \n", "RP | \n", "Particle | \n", "
24. | \n", "SYM | \n", "Symbol | \n", "
25. | \n", "TO | \n", "to | \n", "
26. | \n", "UH | \n", "Interjection | \n", "
27. | \n", "VB | \n", "Verb, base form | \n", "
28. | \n", "VBD | \n", "Verb, past tense | \n", "
29. | \n", "VBG | \n", "Verb, gerund or present participle | \n", "
30. | \n", "VBN | \n", "Verb, past participle | \n", "
31. | \n", "VBP | \n", "Verb, non-3rd person singular present | \n", "
32. | \n", "VBZ | \n", "Verb, 3rd person singular present | \n", "
33. | \n", "WDT | \n", "Wh-determiner | \n", "
34. | \n", "WP | \n", "Wh-pronoun | \n", "
35. | \n", "WP\\$ | \n", "Possessive wh-pronoun | \n", "
36. | \n", "WRB | \n", "Wh-adverb \n", " |