From 5c402761981785d2ac0b19961d6b739423fc6471 Mon Sep 17 00:00:00 2001 From: manetta Date: Mon, 26 Oct 2020 14:06:25 +0100 Subject: [PATCH] adding 3 nltk notebooks --- nltk-frequency-distribution.ipynb | 191 ++++++++++++++++ nltk-pos-tagger.ipynb | 350 ++++++++++++++++++++++++++++++ nltk-similar-words.ipynb | 165 ++++++++++++++ 3 files changed, 706 insertions(+) create mode 100644 nltk-frequency-distribution.ipynb create mode 100644 nltk-pos-tagger.ipynb create mode 100644 nltk-similar-words.ipynb diff --git a/nltk-frequency-distribution.ipynb b/nltk-frequency-distribution.ipynb new file mode 100644 index 0000000..b4cfafe --- /dev/null +++ b/nltk-frequency-distribution.ipynb @@ -0,0 +1,191 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# NLTK - Frequency Distribution" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "https://www.nltk.org/book/ch01.html#frequency-distributions" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import nltk\n", + "import random" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "lines = open('txt/language.txt').readlines()\n", + "sentence = random.choice(lines)\n", + "print(sentence)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Tokens" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "tokens = nltk.word_tokenize(sentence)\n", + "print(tokens)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Frequency Distribution" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# frequency of characters\n", + "fd = nltk.FreqDist(sentence)\n", + "print(fd)" + ] + 
}, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(fd.most_common(50))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# frequency of words\n", + "fd = nltk.FreqDist(tokens)\n", + "print(fd)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(fd.most_common(50))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# frequency of a text\n", + "txt = open('txt/language.txt').read()\n", + "tokens = nltk.word_tokenize(txt)\n", + "fd = nltk.FreqDist(tokens)\n", + "print(fd)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(fd.most_common(50))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Requesting the frequency of a specific word\n", + "print(fd['language'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/nltk-pos-tagger.ipynb b/nltk-pos-tagger.ipynb new 
file mode 100644 index 0000000..75f769e --- /dev/null +++ b/nltk-pos-tagger.ipynb @@ -0,0 +1,350 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# NLTK - Part of Speech" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import nltk\n", + "import random" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "lines = open('txt/language.txt').readlines()\n", + "sentence = random.choice(lines)\n", + "print(sentence)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Tokens" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "tokens = nltk.word_tokenize(sentence)\n", + "print(tokens)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Part of Speech \"tags\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "tagged = nltk.pos_tag(tokens)\n", + "print(tagged)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now you could select, for example, all the types of **verbs**:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "selection = []\n", + "\n", + "for word, tag in tagged:\n", + "    if tag.startswith('VB'):  # matches VB, VBD, VBG, VBN, VBP and VBZ\n", + "        selection.append(word)\n", + "\n", + "print(selection)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Where do these tags come from?" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "> An off-the-shelf tagger is available for English. 
It uses the Penn Treebank tagset.\n", + "\n", + "From: http://www.nltk.org/api/nltk.tag.html#module-nltk.tag" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "> NLTK provides documentation for each tag, which can be queried using the tag, e.g. nltk.help.upenn_tagset('RB').\n", + "\n", + "From: http://www.nltk.org/book_1ed/ch05.html" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "nltk.help.upenn_tagset('PRP')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "------------" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "An alphabetical list of part-of-speech tags used in the Penn Treebank Project ([link](https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html)):\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", 
+ " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + "
Number
\n", + "
\n", + "
Tag
\n", + "
\n", + "
Description
\n", + "
1. CC Coordinating conjunction
2. CD Cardinal number
3. DT Determiner
4. EX Existential there
5. FW Foreign word
6. IN Preposition or subordinating conjunction
7. JJ Adjective
8. JJR Adjective, comparative
9. JJS Adjective, superlative
10. LS List item marker
11. MD Modal
12. NN Noun, singular or mass
13. NNS Noun, plural
14. NNP Proper noun, singular
15. NNPS Proper noun, plural
16. PDT Predeterminer
17. POS Possessive ending
18. PRP Personal pronoun
19. PRP\\$ Possessive pronoun
20. RB Adverb
21. RBR Adverb, comparative
22. RBS Adverb, superlative
23. RP Particle
24. SYM Symbol
25. TO to
26. UH Interjection
27. VB Verb, base form
28. VBD Verb, past tense
29. VBG Verb, gerund or present participle
30. VBN Verb, past participle
31. VBP Verb, non-3rd person singular present
32. VBZ Verb, 3rd person singular present
33. WDT Wh-determiner
34. WP Wh-pronoun
35. WP\\$ Possessive wh-pronoun
36. WRB Wh-adverb \n", + "
" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/nltk-similar-words.ipynb b/nltk-similar-words.ipynb new file mode 100644 index 0000000..17e5b95 --- /dev/null +++ b/nltk-similar-words.ipynb @@ -0,0 +1,165 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# NLTK - Similar Words" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "https://www.nltk.org/book/ch01.html#searching-text" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import nltk" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "txt = open('txt/language.txt').read()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Tokens" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "tokens = nltk.word_tokenize(txt)\n", + "print(tokens)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## NLTK Text object" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "text = nltk.Text(tokens)\n", + "print(text)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## concordance" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# This is what you did with Michael before the break ...\n", + "text.concordance(\"language\")  # prints the matches itself and returns None" + ] + }, + { + "cell_type": 
"markdown", + "metadata": {}, + "source": [ + "## similarities" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# With a small next step ...\n", + "text.similar(\"language\")  # prints words used in similar contexts; returns None" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# And searching for contexts ...\n", + "text.common_contexts([\"language\"])  # prints the shared contexts; returns None" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "----------------" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Read on" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "https://www.nltk.org/book/ch01.html#searching-text (recommended!)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}