From 42fe36dbb3d66e02ca98c0c2ddae22d70368c7f4 Mon Sep 17 00:00:00 2001 From: Michael Murtaugh Date: Wed, 14 Oct 2020 07:42:56 +0000 Subject: [PATCH] NLTKing --- NLTKing.ipynb | 495 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 495 insertions(+) create mode 100644 NLTKing.ipynb diff --git a/NLTKing.ipynb b/NLTKing.ipynb new file mode 100644 index 0000000..6217d7d --- /dev/null +++ b/NLTKing.ipynb @@ -0,0 +1,495 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "https://www.nltk.org/book/\n", + "\n", + "https://www.nltk.org/book/ch00.html#natural-language-toolkit-nltk\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import nltk" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "nltk.download(\"book\", download_dir=\"/usr/local/share/nltk_data\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from nltk.book import *" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "text1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "type(text1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from nltk.text import Text" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "Text?" 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Reading Words for the Future texts\n", + "\n", + "[Chapter 3 of the NLTK book](https://www.nltk.org/book/ch03.html) discusses using your own texts using urlopen and the nltk.text.Text class.\n", + "\n", + "We can use [urllib.request.urlopen](https://docs.python.org/3/library/urllib.request.html?highlight=urlopen#urllib.request.urlopen) + pull the \"raw\" URLs of materials from the [SI13 materials on git.xpub.nl](https://git.xpub.nl/XPUB/S13-Words-for-the-Future-materials)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "url = \"https://git.xpub.nl/XPUB/S13-Words-for-the-Future-materials/raw/branch/master/txt-essays/RESURGENCE%20Isabelle%20Stengers.txt\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from urllib.request import urlopen" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "text = urlopen(url).read().decode()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from nltk import word_tokenize" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "tokens = word_tokenize(text)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "stengers = nltk.text.Text(tokens)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "stengers" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "stengers.concordance(\"power\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "stengers.similar(\"power\")" + ] + }, + { + "cell_type": 
"code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "stengers.common_contexts([\"power\", \"victims\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "stengers.dispersion_plot([\"power\", \"freedom\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "freq = FreqDist(stengers)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "freq" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "freq['power']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "freq.plot(50)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "freq.plot(50, cumulative=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Counting Vocabulary\n", + "\n", + "## Making a function\n", + "Investigating a text as a list of words, we discover that we can compare the count of the total number of words, with the number of unique words. 
If we compare the two by dividing the total by the number of unique words, we get the average number of times each word is used." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "len(stengers)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "len(set(stengers))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def lexical_diversity(text):\n", + " return len(text) / len(set(text))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "lexical_diversity(stengers)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def percentage (count, total):\n", + " return 100 * count / total" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "percentage(4, 5)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "NB: BE CAREFUL RUNNING THE FOLLOWING LINE ... IT'S REALLY SLOW...\n", + "Not all code is equal, and just because two different methods produce the same result\n", + "doesn't mean they're equally usable in practice\n", + "\n", + "Why? because text1 (Moby Dick) is a list\n", + "and checking if (x not in text1)\n", + "has to scan the whole list of words\n", + "AND THEN this scan is done FOR EVERY WORD in the stengers text\n", + "The result is called \"order n squared\" execution: as the number of words in each text increases\n", + "the time to perform the code gets QUADRATICALLY slower (doubling both texts makes it take about four times as long)\n", + "it's basically the phenomenon of nested loops on large lists.... 
SSSSSSSSSLLLLLLLLLOOOOOOOOOOOWWWWWWWWWWW" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# stengers_unique = []\n", + "# for word in stengers.tokens:\n", + "# if word not in text1:\n", + "# stengers_unique.append(word)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# stengers_unique = [x for x in stengers.tokens if x not in text1]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "FIX: make a set based on the Moby Dick text; checking if something is in a set is VERY FAST compared to scanning a list (on average constant time, Order 1, instead of Order n)..." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "moby = set(text1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\"the\" in moby" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Rather than n\\*n (n squared), the following is just Order n (one constant-time set lookup per word), which stays fast as n gets big" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "stengers_unique = []\n", + "for word in stengers.tokens:\n", + " if word not in moby:\n", + " stengers_unique.append(word)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The above can also be expressed using the more compact form of a list comprehension" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "stengers_unique = [word for word in stengers.tokens if word not in moby]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "len(stengers_unique)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + 
"stengers_unique" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "stengers_unique_text = Text(stengers_unique)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "freq = FreqDist(stengers_unique)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "freq.plot(50)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "stengers_unique_text.concordance(\"witches\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Increasing the default figure size" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from IPython.core.pylabtools import figsize" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "figsize(20.0,4.8)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}