From 981258bace3ebcf32ac51f282ab0a281cd23e8b8 Mon Sep 17 00:00:00 2001 From: Michael Murtaugh Date: Wed, 14 Oct 2020 10:12:17 +0000 Subject: [PATCH] updated NLTKing --- NLTKing.ipynb | 196 ++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 189 insertions(+), 7 deletions(-) diff --git a/NLTKing.ipynb b/NLTKing.ipynb index 6217d7d..524a5ca 100644 --- a/NLTKing.ipynb +++ b/NLTKing.ipynb @@ -69,7 +69,26 @@ "metadata": {}, "outputs": [], "source": [ - "Text?" + "nltk.text.Text" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for line in text1.concordance_list(\"whale\"):\n", + " print (line.left_print, line.query, line.right_print)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "text5.tokens" ] }, { @@ -92,6 +111,15 @@ "url = \"https://git.xpub.nl/XPUB/S13-Words-for-the-Future-materials/raw/branch/master/txt-essays/RESURGENCE%20Isabelle%20Stengers.txt\"" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "url" + ] + }, { "cell_type": "code", "execution_count": null, @@ -101,6 +129,33 @@ "from urllib.request import urlopen" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "r = urlopen(url)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "rawtext = r.read()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "text = rawtext.decode()" + ] + }, { "cell_type": "code", "execution_count": null, @@ -110,6 +165,51 @@ "text = urlopen(url).read().decode()" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "len(text)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "words = text.split?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "words = text.split" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "words = text.split()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "len(words)" + ] + }, { "cell_type": "code", "execution_count": null, @@ -134,7 +234,52 @@ "metadata": {}, "outputs": [], "source": [ - "stengers = nltk.text.Text(tokens)" + "tokens = word_tokenize" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "tokens = word_tokenize" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "len(tokens)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "len(tokens)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "tokens[-10:]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "stengers = Text(tokens)" ] }, { @@ -152,7 +297,16 @@ "metadata": {}, "outputs": [], "source": [ - "stengers.concordance(\"power\")" + "stengers.concordance(\"the\", width=82, lines=74)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "stengers.concordance?" ] }, { @@ -161,7 +315,19 @@ "metadata": {}, "outputs": [], "source": [ - "stengers.similar(\"power\")" + "for line in stengers.concordance_list(\"the\", width=82, lines=74):\n", + " print (line.left_print, line.query, line.right_print)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "with open (\"patches/stengers_the.txt\", \"w\") as output:\n", + " for line in stengers.concordance_list(\"the\", width=82, lines=74):\n", + " print (line.left_print, line.query, line.right_print, file=output)" ] }, { @@ -179,7 +345,16 @@ "metadata": {}, "outputs": [], "source": [ - "stengers.dispersion_plot([\"power\", \"freedom\"])" + "stengers.dispersion_plot([\"power\", \"the\", \"victims\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from nltk.probability import FreqDist" ] }, { @@ -197,7 +372,7 @@ "metadata": {}, "outputs": [], "source": [ - "freq" + "freq[\"WHALE\"]" ] }, { @@ -467,8 +642,15 @@ "metadata": {}, "outputs": [], "source": [ - "figsize(20.0,4.8)" + "figsize(20.0,20.0)" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": {