From 9185be341aeaa42922f1d61a737c11a01ff175b7 Mon Sep 17 00:00:00 2001 From: Michael Murtaugh Date: Wed, 14 Oct 2020 11:21:32 +0000 Subject: [PATCH] updated NLTKing --- NLTKing.ipynb | 145 +++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 132 insertions(+), 13 deletions(-) diff --git a/NLTKing.ipynb b/NLTKing.ipynb index 524a5ca..b934d87 100644 --- a/NLTKing.ipynb +++ b/NLTKing.ipynb @@ -198,7 +198,7 @@ "metadata": {}, "outputs": [], "source": [ - "words = text.split()" + "words = text.split" ] }, { @@ -207,7 +207,7 @@ "metadata": {}, "outputs": [], "source": [ - "len(words)" + "words = text.split" ] }, { @@ -216,7 +216,7 @@ "metadata": {}, "outputs": [], "source": [ - "from nltk import word_tokenize" + "words = text.split" ] }, { @@ -225,7 +225,16 @@ "metadata": {}, "outputs": [], "source": [ - "tokens = word_tokenize(text)" + "words = text.split()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "len(words)" ] }, { @@ -234,7 +243,7 @@ "metadata": {}, "outputs": [], "source": [ - "tokens = word_tokenize" + "from nltk import word_tokenize" ] }, { @@ -243,7 +252,7 @@ "metadata": {}, "outputs": [], "source": [ - "tokens = word_tokenize" + "tokens = word_tokenize(text)" ] }, { @@ -288,7 +297,7 @@ "metadata": {}, "outputs": [], "source": [ - "stengers" + "stengers.concordance(\"the\", width=82, lines=74)" ] }, { @@ -297,7 +306,8 @@ "metadata": {}, "outputs": [], "source": [ - "stengers.concordance(\"the\", width=82, lines=74)" + "for line in stengers.concordance_list(\"the\", width=82, lines=74):\n", + " print (line.left_print, line.query, line.right_print)" ] }, { @@ -306,7 +316,9 @@ "metadata": {}, "outputs": [], "source": [ - "stengers.concordance?" 
+ "with open (\"patches/stengers_the.txt\", \"w\") as output:\n", + " for line in stengers.concordance_list(\"the\", width=82, lines=74):\n", + " print (line.left_print, line.query, line.right_print, file=output)" ] }, { @@ -316,7 +328,7 @@ "outputs": [], "source": [ "for line in stengers.concordance_list(\"the\", width=82, lines=74):\n", - " print (line.left_print, line.query, line.right_print)" + " print (line.query)" ] }, { @@ -325,9 +337,7 @@ "metadata": {}, "outputs": [], "source": [ - "with open (\"patches/stengers_the.txt\", \"w\") as output:\n", - " for line in stengers.concordance_list(\"the\", width=82, lines=74):\n", - " print (line.left_print, line.query, line.right_print, file=output)" + "stengers.concordance(\"the\", width=3)\n" ] }, { @@ -645,6 +655,115 @@ "figsize(20.0,20.0)" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "stengers" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "stengers" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Nami asks: How do I get concordances of just words ending \"ity\"?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "t = stengers" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ity = []\n", + "for w in stengers:\n", + " if w.endswith(\"ity\"):\n", + " # print (w)\n", + " ity.append(w.lower())\n", + "ity = set(ity)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for word in ity:\n", + " stengers.concordance(word)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\"Objectivity\".lower" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ +
"set(ity)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Clara asks, what about lines that are shorter than the width you give?\n", + "\n", + "https://www.peterbe.com/plog/how-to-pad-fill-string-by-variable-python\n", + "\n", + "cwidth is how much \"padding\" is needed for each side\n", + "it's (our page width - the length of the word) divided by 2\n", + "`//` in python means \"integer\" (whole number) division" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for line in stengers.concordance_list(\"resurgence\", width=82, lines=74):\n", + " cwidth = (82 - len(\"resurgence\")) // 2\n", + " # print (cwidth)\n", + " print ( line.left_print.rjust(cwidth), line.query, line.right_print.ljust(cwidth) )\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, { "cell_type": "code", "execution_count": null,