{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# NLTK - Part of Speech"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import nltk\n",
    "import random"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the text file inside a `with` block so the handle is closed again,\n",
    "# and keep only non-blank lines so the random pick is never an empty sentence.\n",
    "with open('txt/language.txt') as corpus:\n",
    "    lines = [line.strip() for line in corpus if line.strip()]\n",
    "\n",
    "# Pick one line at random; every run may show a different sentence.\n",
    "sentence = random.choice(lines)\n",
    "print(sentence)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Tokens"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Split the sentence into word and punctuation tokens.\n",
    "tokens = nltk.word_tokenize(sentence)\n",
    "print(tokens)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Part of Speech \"tags\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Pair every token with its part-of-speech tag: a list of (word, tag) tuples.\n",
    "tagged = nltk.pos_tag(tokens)\n",
    "print(tagged)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Now you could select, for example, all the types of **verbs**:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Every Penn Treebank verb tag starts with 'VB' (VB, VBD, VBG, VBN, VBP, VBZ),\n",
    "# so a prefix test keeps all verb forms and nothing else.\n",
    "selection = [word for word, tag in tagged if tag.startswith('VB')]\n",
    "\n",
    "print(selection)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Where do these tags come from?"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "> An off-the-shelf tagger is available for English. It uses the Penn Treebank tagset.\n",
    "\n",
    "From: http://www.nltk.org/api/nltk.tag.html#module-nltk.tag"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "> NLTK provides documentation for each tag, which can be queried using the tag, e.g. `nltk.help.upenn_tagset('RB')`.\n",
    "\n",
    "From: http://www.nltk.org/book_1ed/ch05.html"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Print the built-in documentation for the PRP tag.\n",
    "nltk.help.upenn_tagset('PRP')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "------------"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "An alphabetical list of part-of-speech tags used in the Penn Treebank Project ([link](https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html)):\n",
    "\n",
    "
\n",
" Number \n",
" | \n",
" \n",
" Tag \n",
" | \n",
" \n",
" Description \n",
" | \n",
"
1. | \n", "CC | \n", "Coordinating conjunction | \n", "
2. | \n", "CD | \n", "Cardinal number | \n", "
3. | \n", "DT | \n", "Determiner | \n", "
4. | \n", "EX | \n", "Existential there | \n", "
5. | \n", "FW | \n", "Foreign word | \n", "
6. | \n", "IN | \n", "Preposition or subordinating conjunction | \n", "
7. | \n", "JJ | \n", "Adjective | \n", "
8. | \n", "JJR | \n", "Adjective, comparative | \n", "
9. | \n", "JJS | \n", "Adjective, superlative | \n", "
10. | \n", "LS | \n", "List item marker | \n", "
11. | \n", "MD | \n", "Modal | \n", "
12. | \n", "NN | \n", "Noun, singular or mass | \n", "
13. | \n", "NNS | \n", "Noun, plural | \n", "
14. | \n", "NNP | \n", "Proper noun, singular | \n", "
15. | \n", "NNPS | \n", "Proper noun, plural | \n", "
16. | \n", "PDT | \n", "Predeterminer | \n", "
17. | \n", "POS | \n", "Possessive ending | \n", "
18. | \n", "PRP | \n", "Personal pronoun | \n", "
19. | \n", "PRP\\$ | \n", "Possessive pronoun | \n", "
20. | \n", "RB | \n", "Adverb | \n", "
21. | \n", "RBR | \n", "Adverb, comparative | \n", "
22. | \n", "RBS | \n", "Adverb, superlative | \n", "
23. | \n", "RP | \n", "Particle | \n", "
24. | \n", "SYM | \n", "Symbol | \n", "
25. | \n", "TO | \n", "to | \n", "
26. | \n", "UH | \n", "Interjection | \n", "
27. | \n", "VB | \n", "Verb, base form | \n", "
28. | \n", "VBD | \n", "Verb, past tense | \n", "
29. | \n", "VBG | \n", "Verb, gerund or present participle | \n", "
30. | \n", "VBN | \n", "Verb, past participle | \n", "
31. | \n", "VBP | \n", "Verb, non-3rd person singular present | \n", "
32. | \n", "VBZ | \n", "Verb, 3rd person singular present | \n", "
33. | \n", "WDT | \n", "Wh-determiner | \n", "
34. | \n", "WP | \n", "Wh-pronoun | \n", "
35. | \n", "WP\\$ | \n", "Possessive wh-pronoun | \n", "
36. | \n", "WRB | \n", "Wh-adverb \n", " |