From 76a7f198bbcee94a40a3c70757af08f58a5f9ea0 Mon Sep 17 00:00:00 2001 From: Michael Murtaugh Date: Tue, 27 Oct 2020 21:18:12 +0100 Subject: [PATCH] updated/simplified weasyprint notebook --- json-making-datasets.ipynb | 59 +- nltk-frequency-distribution.ipynb | 111 ++- nltk-pos-tagger.ipynb | 1110 ++++++++++++++++++++++++++++- weasyprint.ipynb | 192 ++--- 4 files changed, 1315 insertions(+), 157 deletions(-) diff --git a/json-making-datasets.ipynb b/json-making-datasets.ipynb index 501dd46..61be990 100644 --- a/json-making-datasets.ipynb +++ b/json-making-datasets.ipynb @@ -74,9 +74,21 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'dataset' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# Adding a new key to the dictionary, assigning a string as value:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mdataset\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'new'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m'NEW WORD'\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0;31m# or assigning a number as value:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mdataset\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'new'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m10\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mNameError\u001b[0m: name 'dataset' is not defined" + ] + } + ], "source": [ "# Adding a 
new key to the dictionary, assigning a string as value:\n", "dataset['new'] = 'NEW WORD'\n", @@ -113,19 +125,27 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "# This is sample data, a list of words and POS tags:\n", - "data = [('Common', 'JJ'), ('languages', 'NNS'), ('like', 'IN'), ('English', 'NNP'), ('are', 'VBP'), ('both', 'DT'), ('formal', 'JJ'), ('and', 'CC'), ('semantic', 'JJ'), (';', ':'), ('although', 'IN'), ('their', 'PRP$'), ('scope', 'NN'), ('extends', 'VBZ'), ('beyond', 'IN'), ('the', 'DT'), ('formal', 'JJ'), (',', ','), ('anything', 'NN'), ('that', 'WDT'), ('can', 'MD'), ('be', 'VB'), ('expressed', 'VBN'), ('in', 'IN'), ('a', 'DT'), ('computer', 'NN'), ('control', 'NN'), ('language', 'NN'), ('can', 'MD'), ('also', 'RB'), ('be', 'VB'), ('expressed', 'VBN'), ('in', 'IN'), ('common', 'JJ'), ('language', 'NN'), ('.', '.')]" + "dataset = [('Common', 'JJ'), ('languages', 'NNS'), ('like', 'IN'), ('English', 'NNP'), ('are', 'VBP'), ('both', 'DT'), ('formal', 'JJ'), ('and', 'CC'), ('semantic', 'JJ'), (';', ':'), ('although', 'IN'), ('their', 'PRP$'), ('scope', 'NN'), ('extends', 'VBZ'), ('beyond', 'IN'), ('the', 'DT'), ('formal', 'JJ'), (',', ','), ('anything', 'NN'), ('that', 'WDT'), ('can', 'MD'), ('be', 'VB'), ('expressed', 'VBN'), ('in', 'IN'), ('a', 'DT'), ('computer', 'NN'), ('control', 'NN'), ('language', 'NN'), ('can', 'MD'), ('also', 'RB'), ('be', 'VB'), ('expressed', 'VBN'), ('in', 'IN'), ('common', 'JJ'), ('language', 'NN'), ('.', '.')]" ] }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'are': 'VBP', 'extends': 'VBZ', 'be': 'VB', 'expressed': 'VBN'}\n" + ] + } + ], "source": [ "# Making a dataset with only verbs\n", "dataset = {}\n", @@ -146,7 +166,7 @@ }, { "cell_type": "code", - "execution_count": null, + 
"execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -155,9 +175,22 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"are\": \"VBP\",\n", + " \"extends\": \"VBZ\",\n", + " \"be\": \"VB\",\n", + " \"expressed\": \"VBN\"\n", + "}\n" + ] + } + ], "source": [ "out = json.dumps(dataset, indent=4)\n", "print(out)" @@ -165,7 +198,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ diff --git a/nltk-frequency-distribution.ipynb b/nltk-frequency-distribution.ipynb index b4cfafe..ef1c143 100644 --- a/nltk-frequency-distribution.ipynb +++ b/nltk-frequency-distribution.ipynb @@ -16,7 +16,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -26,9 +26,18 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The symbols of computer control languages inevitably do have semantic connotations simply because there exist no symbols with which humans would not associate some meaning.\n", + "\n" + ] + } + ], "source": [ "lines = open('txt/language.txt').readlines()\n", "sentence = random.choice(lines)\n", @@ -44,9 +53,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['The', 'symbols', 'of', 'computer', 'control', 'languages', 'inevitably', 'do', 'have', 'semantic', 'connotations', 'simply', 'because', 'there', 'exist', 'no', 'symbols', 'with', 'which', 'humans', 'would', 'not', 'associate', 'some', 'meaning', '.']\n" + ] + } + ], "source": [ "tokens = 
nltk.word_tokenize(sentence)\n", "print(tokens)" @@ -68,9 +85,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], "source": [ "# frequency of characters\n", "fd = nltk.FreqDist(sentence)\n", @@ -79,9 +104,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[(' ', 24), ('o', 15), ('e', 14), ('s', 14), ('n', 12), ('t', 11), ('a', 11), ('i', 10), ('m', 8), ('h', 7), ('l', 7), ('c', 7), ('u', 5), ('y', 4), ('b', 4), ('r', 3), ('g', 3), ('w', 3), ('p', 2), ('v', 2), ('d', 2), ('T', 1), ('f', 1), ('x', 1), ('.', 1), ('\\n', 1)]\n" + ] + } + ], "source": [ "print(fd.most_common(50))" ] @@ -95,9 +128,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], "source": [ "# frequency of words\n", "fd = nltk.FreqDist(tokens)\n", @@ -106,9 +147,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[('symbols', 2), ('The', 1), ('of', 1), ('computer', 1), ('control', 1), ('languages', 1), ('inevitably', 1), ('do', 1), ('have', 1), ('semantic', 1), ('connotations', 1), ('simply', 1), ('because', 1), ('there', 1), ('exist', 1), ('no', 1), ('with', 1), ('which', 1), ('humans', 1), ('would', 1), ('not', 1), ('associate', 1), ('some', 1), ('meaning', 1), ('.', 1)]\n" + ] + } + ], "source": [ "print(fd.most_common(50))" ] @@ -122,9 +171,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + 
"output_type": "stream", + "text": [ + "\n" + ] + } + ], "source": [ "# frequency of a text\n", "txt = open('txt/language.txt').read()\n", @@ -135,9 +192,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[(',', 172), ('.', 93), ('the', 88), ('of', 88), ('”', 66), ('“', 65), ('and', 61), ('a', 61), ('is', 58), ('languages', 54), ('in', 51), ('language', 47), ('to', 41), ('as', 37), ('computer', 32), ('that', 29), ('programming', 25), ('control', 23), ('are', 22), ('for', 21), ('’', 21), ('The', 18), ('can', 17), ('be', 16), ('it', 16), ('machine', 16), ('human', 15), ('not', 15), ('software', 14), ('formal', 14), ('or', 14), ('symbols', 14), ('s', 12), ('with', 12), (':', 11), ('its', 11), ('this', 11), ('common', 11), ('their', 10), ('example', 9), (';', 9), ('operations', 9), ('such', 9), ('from', 8), ('through', 8), ('code', 8), ('since', 7), ('different', 7), ('In', 7), ('like', 7)]\n" + ] + } + ], "source": [ "print(fd.most_common(50))" ] @@ -151,9 +216,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "47\n" + ] + } + ], "source": [ "# Requesting the frequency of a specific word\n", "print(fd['language'])" diff --git a/nltk-pos-tagger.ipynb b/nltk-pos-tagger.ipynb index 75f769e..a5bf536 100644 --- a/nltk-pos-tagger.ipynb +++ b/nltk-pos-tagger.ipynb @@ -9,7 +9,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -19,9 +19,18 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "To complicate things even further, computer science has its own understanding of 
“operational semantics” in programming languages, for example in the construction of a programming language interpreter or compiler.\n", + "\n" + ] + } + ], "source": [ "lines = open('txt/language.txt').readlines()\n", "sentence = random.choice(lines)\n", @@ -37,9 +46,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['To', 'complicate', 'things', 'even', 'further', ',', 'computer', 'science', 'has', 'its', 'own', 'understanding', 'of', '“', 'operational', 'semantics', '”', 'in', 'programming', 'languages', ',', 'for', 'example', 'in', 'the', 'construction', 'of', 'a', 'programming', 'language', 'interpreter', 'or', 'compiler', '.']\n" + ] + } + ], "source": [ "tokens = nltk.word_tokenize(sentence)\n", "print(tokens)" @@ -54,9 +71,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[('To', 'TO'), ('complicate', 'VB'), ('things', 'NNS'), ('even', 'RB'), ('further', 'RB'), (',', ','), ('computer', 'NN'), ('science', 'NN'), ('has', 'VBZ'), ('its', 'PRP$'), ('own', 'JJ'), ('understanding', 'NN'), ('of', 'IN'), ('“', 'NNP'), ('operational', 'JJ'), ('semantics', 'NNS'), ('”', 'VBP'), ('in', 'IN'), ('programming', 'NN'), ('languages', 'NNS'), (',', ','), ('for', 'IN'), ('example', 'NN'), ('in', 'IN'), ('the', 'DT'), ('construction', 'NN'), ('of', 'IN'), ('a', 'DT'), ('programming', 'JJ'), ('language', 'NN'), ('interpreter', 'NN'), ('or', 'CC'), ('compiler', 'NN'), ('.', '.')]\n" + ] + } + ], "source": [ "tagged = nltk.pos_tag(tokens)\n", "print(tagged)" @@ -71,9 +96,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['complicate', 'has', 
'”']\n" + ] + } + ], "source": [ "selection = []\n", "\n", @@ -111,9 +144,19 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "PRP: pronoun, personal\n", + " hers herself him himself hisself it itself me myself one oneself ours\n", + " ourselves ownself self she thee theirs them themselves they thou thy us\n" + ] + } + ], "source": [ "nltk.help.upenn_tagset('PRP')" ] @@ -324,6 +367,1051 @@ " Wh-adverb \n", "" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Applying to an entire text" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "language = open('txt/language.txt').read()\n", + "tokens = nltk.word_tokenize(language)\n", + "tagged = nltk.pos_tag(tokens)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[('Language', 'NN'),\n", + " ('Florian', 'JJ'),\n", + " ('Cramer', 'NNP'),\n", + " ('Software', 'NNP'),\n", + " ('and', 'CC'),\n", + " ('language', 'NN'),\n", + " ('are', 'VBP'),\n", + " ('intrinsically', 'RB'),\n", + " ('related', 'VBN'),\n", + " (',', ','),\n", + " ('since', 'IN'),\n", + " ('software', 'NN'),\n", + " ('may', 'MD'),\n", + " ('process', 'VB'),\n", + " ('language', 'NN'),\n", + " (',', ','),\n", + " ('and', 'CC'),\n", + " ('is', 'VBZ'),\n", + " ('constructed', 'VBN'),\n", + " ('in', 'IN'),\n", + " ('language', 'NN'),\n", + " ('.', '.'),\n", + " ('Yet', 'CC'),\n", + " ('language', 'NN'),\n", + " ('means', 'VBZ'),\n", + " ('different', 'JJ'),\n", + " ('things', 'NNS'),\n", + " ('in', 'IN'),\n", + " ('the', 'DT'),\n", + " ('context', 'NN'),\n", + " ('of', 'IN'),\n", + " ('computing', 'VBG'),\n", + " (':', ':'),\n", + " ('formal', 'JJ'),\n", + " ('languages', 'NNS'),\n", + " ('in', 'IN'),\n", + " ('which', 'WDT'),\n", + " ('algorithms', 
'EX'),\n", + " ('are', 'VBP'),\n", + " ('expressed', 'VBN'),\n", + " ('and', 'CC'),\n", + " ('software', 'NN'),\n", + " ('is', 'VBZ'),\n", + " ('implemented', 'VBN'),\n", + " (',', ','),\n", + " ('and', 'CC'),\n", + " ('in', 'IN'),\n", + " ('so-called', 'JJ'),\n", + " ('“', 'NNP'),\n", + " ('natural', 'JJ'),\n", + " ('”', 'NNP'),\n", + " ('spoken', 'NN'),\n", + " ('languages', 'NNS'),\n", + " ('.', '.'),\n", + " ('There', 'EX'),\n", + " ('are', 'VBP'),\n", + " ('at', 'IN'),\n", + " ('least', 'JJS'),\n", + " ('two', 'CD'),\n", + " ('layers', 'NNS'),\n", + " ('of', 'IN'),\n", + " ('formal', 'JJ'),\n", + " ('language', 'NN'),\n", + " ('in', 'IN'),\n", + " ('software', 'NN'),\n", + " (':', ':'),\n", + " ('programming', 'NN'),\n", + " ('language', 'NN'),\n", + " ('in', 'IN'),\n", + " ('which', 'WDT'),\n", + " ('the', 'DT'),\n", + " ('software', 'NN'),\n", + " ('is', 'VBZ'),\n", + " ('written', 'VBN'),\n", + " (',', ','),\n", + " ('and', 'CC'),\n", + " ('the', 'DT'),\n", + " ('language', 'NN'),\n", + " ('implemented', 'VBD'),\n", + " ('within', 'IN'),\n", + " ('the', 'DT'),\n", + " ('software', 'NN'),\n", + " ('as', 'IN'),\n", + " ('its', 'PRP$'),\n", + " ('symbolic', 'JJ'),\n", + " ('controls', 'NNS'),\n", + " ('.', '.'),\n", + " ('In', 'IN'),\n", + " ('the', 'DT'),\n", + " ('case', 'NN'),\n", + " ('of', 'IN'),\n", + " ('compilers', 'NNS'),\n", + " (',', ','),\n", + " ('shells', 'NNS'),\n", + " (',', ','),\n", + " ('and', 'CC'),\n", + " ('macro', 'NN'),\n", + " ('languages', 'NNS'),\n", + " (',', ','),\n", + " ('for', 'IN'),\n", + " ('example', 'NN'),\n", + " (',', ','),\n", + " ('these', 'DT'),\n", + " ('layers', 'NNS'),\n", + " ('can', 'MD'),\n", + " ('overlap', 'VB'),\n", + " ('.', '.'),\n", + " ('“', 'VB'),\n", + " ('Natural', 'NNP'),\n", + " ('”', 'NNP'),\n", + " ('language', 'NN'),\n", + " ('is', 'VBZ'),\n", + " ('what', 'WP'),\n", + " ('can', 'MD'),\n", + " ('be', 'VB'),\n", + " ('processed', 'VBN'),\n", + " ('as', 'IN'),\n", + " ('data', 'NNS'),\n", + " ('by', 
'IN'),\n", + " ('software', 'NN'),\n", + " (';', ':'),\n", + " ('since', 'IN'),\n", + " ('this', 'DT'),\n", + " ('processing', 'NN'),\n", + " ('is', 'VBZ'),\n", + " ('formal', 'JJ'),\n", + " (',', ','),\n", + " ('however', 'RB'),\n", + " (',', ','),\n", + " ('it', 'PRP'),\n", + " ('is', 'VBZ'),\n", + " ('restricted', 'VBN'),\n", + " ('to', 'TO'),\n", + " ('syntactical', 'JJ'),\n", + " ('operations', 'NNS'),\n", + " ('.', '.'),\n", + " ('While', 'IN'),\n", + " ('differentiation', 'NN'),\n", + " ('of', 'IN'),\n", + " ('computer', 'NN'),\n", + " ('programming', 'VBG'),\n", + " ('languages', 'NNS'),\n", + " ('as', 'IN'),\n", + " ('“', 'JJ'),\n", + " ('artificial', 'JJ'),\n", + " ('languages', 'NNS'),\n", + " ('”', 'VBP'),\n", + " ('from', 'IN'),\n", + " ('languages', 'NNS'),\n", + " ('like', 'VBP'),\n", + " ('English', 'NNP'),\n", + " ('as', 'IN'),\n", + " ('“', 'NNP'),\n", + " ('natural', 'JJ'),\n", + " ('languages', 'NNS'),\n", + " ('”', 'VBP'),\n", + " ('is', 'VBZ'),\n", + " ('conceptually', 'RB'),\n", + " ('important', 'JJ'),\n", + " ('and', 'CC'),\n", + " ('undisputed', 'JJ'),\n", + " (',', ','),\n", + " ('it', 'PRP'),\n", + " ('remains', 'VBZ'),\n", + " ('problematic', 'JJ'),\n", + " ('in', 'IN'),\n", + " ('its', 'PRP$'),\n", + " ('pure', 'NN'),\n", + " ('terminology', 'NN'),\n", + " (':', ':'),\n", + " ('There', 'EX'),\n", + " ('is', 'VBZ'),\n", + " ('nothing', 'NN'),\n", + " ('“', 'JJ'),\n", + " ('natural', 'JJ'),\n", + " ('”', 'NN'),\n", + " ('about', 'IN'),\n", + " ('spoken', 'JJ'),\n", + " ('language', 'NN'),\n", + " (';', ':'),\n", + " ('it', 'PRP'),\n", + " ('is', 'VBZ'),\n", + " ('a', 'DT'),\n", + " ('cultural', 'JJ'),\n", + " ('construct', 'NN'),\n", + " ('and', 'CC'),\n", + " ('thus', 'RB'),\n", + " ('just', 'RB'),\n", + " ('as', 'IN'),\n", + " ('“', 'JJ'),\n", + " ('artificial', 'JJ'),\n", + " ('”', 'NN'),\n", + " ('as', 'IN'),\n", + " ('any', 'DT'),\n", + " ('formal', 'JJ'),\n", + " ('machine', 'NN'),\n", + " ('control', 'NN'),\n", + " ('language', 
'NN'),\n", + " ('.', '.'),\n", + " ('To', 'TO'),\n", + " ('call', 'VB'),\n", + " ('programming', 'NN'),\n", + " ('languages', 'NNS'),\n", + " ('“', 'VBP'),\n", + " ('machine', 'NN'),\n", + " ('languages', 'NNS'),\n", + " ('”', 'VBP'),\n", + " ('doesn', 'JJ'),\n", + " ('’', 'NNP'),\n", + " ('t', 'NN'),\n", + " ('solve', 'VBP'),\n", + " ('the', 'DT'),\n", + " ('problem', 'NN'),\n", + " ('either', 'RB'),\n", + " (',', ','),\n", + " ('as', 'IN'),\n", + " ('it', 'PRP'),\n", + " ('obscures', 'VBZ'),\n", + " ('that', 'IN'),\n", + " ('“', 'FW'),\n", + " ('machine', 'NN'),\n", + " ('languages', 'NNS'),\n", + " ('”', 'VBP'),\n", + " ('are', 'VBP'),\n", + " ('human', 'JJ'),\n", + " ('creations', 'NNS'),\n", + " ('.', '.'),\n", + " ('High-level', 'JJ'),\n", + " ('machine-independent', 'JJ'),\n", + " ('programming', 'NN'),\n", + " ('languages', 'NNS'),\n", + " ('such', 'JJ'),\n", + " ('as', 'IN'),\n", + " ('Fortran', 'NNP'),\n", + " (',', ','),\n", + " ('C', 'NNP'),\n", + " (',', ','),\n", + " ('Java', 'NNP'),\n", + " (',', ','),\n", + " ('and', 'CC'),\n", + " ('Basic', 'NNP'),\n", + " ('are', 'VBP'),\n", + " ('not', 'RB'),\n", + " ('even', 'RB'),\n", + " ('direct', 'JJ'),\n", + " ('mappings', 'NNS'),\n", + " ('of', 'IN'),\n", + " ('machine', 'NN'),\n", + " ('logic', 'NN'),\n", + " ('.', '.'),\n", + " ('If', 'IN'),\n", + " ('programming', 'JJ'),\n", + " ('languages', 'NNS'),\n", + " ('are', 'VBP'),\n", + " ('human', 'JJ'),\n", + " ('languages', 'NNS'),\n", + " ('for', 'IN'),\n", + " ('machine', 'NN'),\n", + " ('control', 'NN'),\n", + " (',', ','),\n", + " ('they', 'PRP'),\n", + " ('could', 'MD'),\n", + " ('be', 'VB'),\n", + " ('called', 'VBN'),\n", + " ('cybernetic', 'JJ'),\n", + " ('languages', 'NNS'),\n", + " ('.', '.'),\n", + " ('But', 'CC'),\n", + " ('these', 'DT'),\n", + " ('languages', 'NNS'),\n", + " ('can', 'MD'),\n", + " ('also', 'RB'),\n", + " ('be', 'VB'),\n", + " ('used', 'VBN'),\n", + " ('outside', 'JJ'),\n", + " ('machines—in', 'NN'),\n", + " ('programming', 
'VBG'),\n", + " ('handbooks', 'NNS'),\n", + " (',', ','),\n", + " ('for', 'IN'),\n", + " ('example', 'NN'),\n", + " (',', ','),\n", + " ('in', 'IN'),\n", + " ('programmer', 'NN'),\n", + " ('’', 'NNP'),\n", + " ('s', 'NN'),\n", + " ('dinner', 'NN'),\n", + " ('table', 'JJ'),\n", + " ('jokes', 'NNS'),\n", + " (',', ','),\n", + " ('or', 'CC'),\n", + " ('as', 'IN'),\n", + " ('abstract', 'JJ'),\n", + " ('formal', 'JJ'),\n", + " ('languages', 'NNS'),\n", + " ('for', 'IN'),\n", + " ('expressing', 'VBG'),\n", + " ('logical', 'JJ'),\n", + " ('constructs', 'NNS'),\n", + " (',', ','),\n", + " ('such', 'JJ'),\n", + " ('as', 'IN'),\n", + " ('in', 'IN'),\n", + " ('Hugh', 'NNP'),\n", + " ('Kenner', 'NNP'),\n", + " ('’', 'NNP'),\n", + " ('s', 'NN'),\n", + " ('use', 'NN'),\n", + " ('of', 'IN'),\n", + " ('the', 'DT'),\n", + " ('Pascal', 'NNP'),\n", + " ('programming', 'NN'),\n", + " ('language', 'NN'),\n", + " ('to', 'TO'),\n", + " ('explain', 'VB'),\n", + " ('aspects', 'NNS'),\n", + " ('of', 'IN'),\n", + " ('the', 'DT'),\n", + " ('structure', 'NN'),\n", + " ('of', 'IN'),\n", + " ('Samuel', 'NNP'),\n", + " ('Beckett', 'NNP'),\n", + " ('’', 'NNP'),\n", + " ('s', 'VBD'),\n", + " ('writing.1', 'NN'),\n", + " ('In', 'IN'),\n", + " ('this', 'DT'),\n", + " ('sense', 'NN'),\n", + " (',', ','),\n", + " ('computer', 'NN'),\n", + " ('control', 'NN'),\n", + " ('languages', 'NNS'),\n", + " ('could', 'MD'),\n", + " ('be', 'VB'),\n", + " ('more', 'RBR'),\n", + " ('broadly', 'RB'),\n", + " ('defined', 'VBN'),\n", + " ('as', 'IN'),\n", + " ('syntactical', 'JJ'),\n", + " ('languages', 'NNS'),\n", + " ('as', 'IN'),\n", + " ('opposed', 'VBN'),\n", + " ('to', 'TO'),\n", + " ('semantic', 'JJ'),\n", + " ('languages', 'NNS'),\n", + " ('.', '.'),\n", + " ('But', 'CC'),\n", + " ('this', 'DT'),\n", + " ('terminology', 'NN'),\n", + " ('is', 'VBZ'),\n", + " ('not', 'RB'),\n", + " ('without', 'IN'),\n", + " ('its', 'PRP$'),\n", + " ('problems', 'NNS'),\n", + " ('either', 'DT'),\n", + " ('.', '.'),\n", + " 
('Common', 'JJ'),\n", + " ('languages', 'NNS'),\n", + " ('like', 'IN'),\n", + " ('English', 'NNP'),\n", + " ('are', 'VBP'),\n", + " ('both', 'DT'),\n", + " ('formal', 'JJ'),\n", + " ('and', 'CC'),\n", + " ('semantic', 'JJ'),\n", + " (';', ':'),\n", + " ('although', 'IN'),\n", + " ('their', 'PRP$'),\n", + " ('scope', 'NN'),\n", + " ('extends', 'VBZ'),\n", + " ('beyond', 'IN'),\n", + " ('the', 'DT'),\n", + " ('formal', 'JJ'),\n", + " (',', ','),\n", + " ('anything', 'NN'),\n", + " ('that', 'WDT'),\n", + " ('can', 'MD'),\n", + " ('be', 'VB'),\n", + " ('expressed', 'VBN'),\n", + " ('in', 'IN'),\n", + " ('a', 'DT'),\n", + " ('computer', 'NN'),\n", + " ('control', 'NN'),\n", + " ('language', 'NN'),\n", + " ('can', 'MD'),\n", + " ('also', 'RB'),\n", + " ('be', 'VB'),\n", + " ('expressed', 'VBN'),\n", + " ('in', 'IN'),\n", + " ('common', 'JJ'),\n", + " ('language', 'NN'),\n", + " ('.', '.'),\n", + " ('It', 'PRP'),\n", + " ('follows', 'VBZ'),\n", + " ('that', 'IN'),\n", + " ('computer', 'NN'),\n", + " ('control', 'NN'),\n", + " ('languages', 'NNS'),\n", + " ('are', 'VBP'),\n", + " ('a', 'DT'),\n", + " ('formal', 'JJ'),\n", + " ('(', '('),\n", + " ('and', 'CC'),\n", + " ('as', 'IN'),\n", + " ('such', 'JJ'),\n", + " ('rather', 'RB'),\n", + " ('primitive', 'JJ'),\n", + " (')', ')'),\n", + " ('subset', 'NN'),\n", + " ('of', 'IN'),\n", + " ('common', 'JJ'),\n", + " ('human', 'JJ'),\n", + " ('languages', 'NNS'),\n", + " ('.', '.'),\n", + " ('To', 'TO'),\n", + " ('complicate', 'VB'),\n", + " ('things', 'NNS'),\n", + " ('even', 'RB'),\n", + " ('further', 'RB'),\n", + " (',', ','),\n", + " ('computer', 'NN'),\n", + " ('science', 'NN'),\n", + " ('has', 'VBZ'),\n", + " ('its', 'PRP$'),\n", + " ('own', 'JJ'),\n", + " ('understanding', 'NN'),\n", + " ('of', 'IN'),\n", + " ('“', 'NNP'),\n", + " ('operational', 'JJ'),\n", + " ('semantics', 'NNS'),\n", + " ('”', 'VBP'),\n", + " ('in', 'IN'),\n", + " ('programming', 'NN'),\n", + " ('languages', 'NNS'),\n", + " (',', ','),\n", + " ('for', 
'IN'),\n", + " ('example', 'NN'),\n", + " ('in', 'IN'),\n", + " ('the', 'DT'),\n", + " ('construction', 'NN'),\n", + " ('of', 'IN'),\n", + " ('a', 'DT'),\n", + " ('programming', 'JJ'),\n", + " ('language', 'NN'),\n", + " ('interpreter', 'NN'),\n", + " ('or', 'CC'),\n", + " ('compiler', 'NN'),\n", + " ('.', '.'),\n", + " ('Just', 'RB'),\n", + " ('as', 'IN'),\n", + " ('this', 'DT'),\n", + " ('interpreter', 'NN'),\n", + " ('doesn', 'NN'),\n", + " ('’', 'NNP'),\n", + " ('t', 'NN'),\n", + " ('perform', 'NN'),\n", + " ('“', 'NNP'),\n", + " ('interpretations', 'NNS'),\n", + " ('”', 'VBP'),\n", + " ('in', 'IN'),\n", + " ('a', 'DT'),\n", + " ('hermeneutic', 'JJ'),\n", + " ('sense', 'NN'),\n", + " ('of', 'IN'),\n", + " ('semantic', 'JJ'),\n", + " ('text', 'NN'),\n", + " ('explication', 'NN'),\n", + " (',', ','),\n", + " ('the', 'DT'),\n", + " ('computer', 'NN'),\n", + " ('science', 'NN'),\n", + " ('notion', 'NN'),\n", + " ('of', 'IN'),\n", + " ('“', 'JJ'),\n", + " ('semantics', 'NNS'),\n", + " ('”', 'JJ'),\n", + " ('defies', 'NNS'),\n", + " ('linguistic', 'JJ'),\n", + " ('and', 'CC'),\n", + " ('common', 'JJ'),\n", + " ('sense', 'NN'),\n", + " ('understanding', 'NN'),\n", + " ('of', 'IN'),\n", + " ('the', 'DT'),\n", + " ('word', 'NN'),\n", + " (',', ','),\n", + " ('since', 'IN'),\n", + " ('compiler', 'NN'),\n", + " ('construction', 'NN'),\n", + " ('is', 'VBZ'),\n", + " ('purely', 'RB'),\n", + " ('syntactical', 'JJ'),\n", + " (',', ','),\n", + " ('and', 'CC'),\n", + " ('programming', 'VBG'),\n", + " ('languages', 'NNS'),\n", + " ('denote', 'VBP'),\n", + " ('nothing', 'NN'),\n", + " ('but', 'CC'),\n", + " ('syntactical', 'JJ'),\n", + " ('manipulations', 'NNS'),\n", + " ('of', 'IN'),\n", + " ('symbols', 'NNS'),\n", + " ('.', '.'),\n", + " ('What', 'WP'),\n", + " ('might', 'MD'),\n", + " ('more', 'JJR'),\n", + " ('suitably', 'RB'),\n", + " ('be', 'VB'),\n", + " ('called', 'VBN'),\n", + " ('the', 'DT'),\n", + " ('semantics', 'NNS'),\n", + " ('of', 'IN'),\n", + " ('computer', 
'NN'),\n", + " ('control', 'NN'),\n", + " ('languages', 'VBZ'),\n", + " ('resides', 'NNS'),\n", + " ('in', 'IN'),\n", + " ('the', 'DT'),\n", + " ('symbols', 'NNS'),\n", + " ('with', 'IN'),\n", + " ('which', 'WDT'),\n", + " ('those', 'DT'),\n", + " ('operations', 'NNS'),\n", + " ('are', 'VBP'),\n", + " ('denoted', 'VBN'),\n", + " ('in', 'IN'),\n", + " ('most', 'JJS'),\n", + " ('programming', 'JJ'),\n", + " ('languages', 'NNS'),\n", + " (':', ':'),\n", + " ('English', 'JJ'),\n", + " ('words', 'NNS'),\n", + " ('like', 'IN'),\n", + " ('“', 'NN'),\n", + " ('if', 'IN'),\n", + " (',', ','),\n", + " ('”', 'FW'),\n", + " ('“', 'FW'),\n", + " ('then', 'RB'),\n", + " (',', ','),\n", + " ('”', 'NNP'),\n", + " ('“', 'NNP'),\n", + " ('else', 'RB'),\n", + " (',', ','),\n", + " ('”', 'NNP'),\n", + " ('“', 'NNP'),\n", + " ('for', 'IN'),\n", + " (',', ','),\n", + " ('”', 'NNP'),\n", + " ('“', 'NNP'),\n", + " ('while', 'IN'),\n", + " (',', ','),\n", + " ('”', 'FW'),\n", + " ('“', 'NNP'),\n", + " ('goto', 'NN'),\n", + " (',', ','),\n", + " ('”', 'NNP'),\n", + " ('and', 'CC'),\n", + " ('“', 'NNP'),\n", + " ('print', 'NN'),\n", + " (',', ','),\n", + " ('”', 'NN'),\n", + " ('in', 'IN'),\n", + " ('conjunction', 'NN'),\n", + " ('with', 'IN'),\n", + " ('arithmetical', 'JJ'),\n", + " ('and', 'CC'),\n", + " ('punctuation', 'NN'),\n", + " ('symbols', 'NNS'),\n", + " (';', ':'),\n", + " ('in', 'IN'),\n", + " ('alphabetic', 'JJ'),\n", + " ('software', 'NN'),\n", + " ('controls', 'NNS'),\n", + " (',', ','),\n", + " ('words', 'NNS'),\n", + " ('like', 'IN'),\n", + " ('“', 'NNP'),\n", + " ('list', 'NN'),\n", + " (',', ','),\n", + " ('”', 'NNP'),\n", + " ('“', 'NNP'),\n", + " ('move', 'NN'),\n", + " (',', ','),\n", + " ('”', 'NNP'),\n", + " ('“', 'NNP'),\n", + " ('copy', 'NN'),\n", + " (',', ','),\n", + " ('”', 'NN'),\n", + " ('and', 'CC'),\n", + " ('“', 'NNP'),\n", + " ('paste', 'NN'),\n", + " ('”', 'NN'),\n", + " (';', ':'),\n", + " ('in', 'IN'),\n", + " ('graphical', 'JJ'),\n", + " ('software', 
'NN'),\n", + " ('controls', 'NNS'),\n", + " (',', ','),\n", + " ('such', 'JJ'),\n", + " ('as', 'IN'),\n", + " ('symbols', 'NNS'),\n", + " ('like', 'IN'),\n", + " ('the', 'DT'),\n", + " ('trash', 'NN'),\n", + " ('can', 'MD'),\n", + " ('.', '.'),\n", + " ('Ferdinand', 'NNP'),\n", + " ('de', 'IN'),\n", + " ('Saussure', 'NNP'),\n", + " ('states', 'VBZ'),\n", + " ('that', 'IN'),\n", + " ('the', 'DT'),\n", + " ('signs', 'NNS'),\n", + " ('of', 'IN'),\n", + " ('common', 'JJ'),\n", + " ('human', 'JJ'),\n", + " ('language', 'NN'),\n", + " ('are', 'VBP'),\n", + " ('arbitrary2', 'RB'),\n", + " ('because', 'IN'),\n", + " ('it', 'PRP'),\n", + " ('’', 'VBZ'),\n", + " ('s', 'JJ'),\n", + " ('purely', 'RB'),\n", + " ('a', 'DT'),\n", + " ('cultural-social', 'JJ'),\n", + " ('convention', 'NN'),\n", + " ('that', 'IN'),\n", + " ('assigns', 'VBZ'),\n", + " ('phonemes', 'NNS'),\n", + " ('to', 'TO'),\n", + " ('concepts', 'NNS'),\n", + " ('.', '.'),\n", + " ('Likewise', 'NNP'),\n", + " (',', ','),\n", + " ('it', 'PRP'),\n", + " ('’', 'VBZ'),\n", + " ('s', 'JJ'),\n", + " ('purely', 'RB'),\n", + " ('a', 'DT'),\n", + " ('cultural', 'JJ'),\n", + " ('convention', 'NN'),\n", + " ('to', 'TO'),\n", + " ('assign', 'VB'),\n", + " ('symbols', 'NNS'),\n", + " ('to', 'TO'),\n", + " ('machine', 'NN'),\n", + " ('operations', 'NNS'),\n", + " ('.', '.'),\n", + " ('But', 'CC'),\n", + " ('just', 'RB'),\n", + " ('as', 'IN'),\n", + " ('the', 'DT'),\n", + " ('cultural', 'JJ'),\n", + " ('choice', 'NN'),\n", + " ('of', 'IN'),\n", + " ('phonemes', 'NNS'),\n", + " ('in', 'IN'),\n", + " ('spoken', 'JJ'),\n", + " ('language', 'NN'),\n", + " ('is', 'VBZ'),\n", + " ('restrained', 'VBN'),\n", + " ('by', 'IN'),\n", + " ('what', 'WP'),\n", + " ('the', 'DT'),\n", + " ('human', 'JJ'),\n", + " ('voice', 'NN'),\n", + " ('can', 'MD'),\n", + " ('pronounce', 'VB'),\n", + " (',', ','),\n", + " ('the', 'DT'),\n", + " ('assignment', 'NN'),\n", + " ('of', 'IN'),\n", + " ('symbols', 'NNS'),\n", + " ('to', 'TO'),\n", + " ('machine', 
'NN'),\n", + " ('operations', 'NNS'),\n", + " ('is', 'VBZ'),\n", + " ('limited', 'VBN'),\n", + " ('to', 'TO'),\n", + " ('what', 'WP'),\n", + " ('can', 'MD'),\n", + " ('be', 'VB'),\n", + " ('efficiently', 'RB'),\n", + " ('processed', 'VBN'),\n", + " ('by', 'IN'),\n", + " ('the', 'DT'),\n", + " ('machine', 'NN'),\n", + " ('and', 'CC'),\n", + " ('of', 'IN'),\n", + " ('good', 'JJ'),\n", + " ('use', 'NN'),\n", + " ('to', 'TO'),\n", + " ('humans.3', 'VB'),\n", + " ('This', 'DT'),\n", + " ('compromise', 'NN'),\n", + " ('between', 'IN'),\n", + " ('operability', 'NN'),\n", + " ('and', 'CC'),\n", + " ('usability', 'NN'),\n", + " ('is', 'VBZ'),\n", + " ('obvious', 'JJ'),\n", + " ('in', 'IN'),\n", + " (',', ','),\n", + " ('for', 'IN'),\n", + " ('example', 'NN'),\n", + " (',', ','),\n", + " ('Unix', 'NNP'),\n", + " ('commands', 'VBZ'),\n", + " ('.', '.'),\n", + " ('Originally', 'RB'),\n", + " ('used', 'VBN'),\n", + " ('on', 'IN'),\n", + " ('teletype', 'NN'),\n", + " ('terminals', 'NNS'),\n", + " (',', ','),\n", + " ('the', 'DT'),\n", + " ('operation', 'NN'),\n", + " ('“', 'NNP'),\n", + " ('copy', 'NN'),\n", + " ('”', 'NN'),\n", + " ('was', 'VBD'),\n", + " ('abbreviated', 'VBN'),\n", + " ('to', 'TO'),\n", + " ('the', 'DT'),\n", + " ('command', 'NN'),\n", + " ('“', 'NNP'),\n", + " ('cp', 'NN'),\n", + " (',', ','),\n", + " ('”', 'NNP'),\n", + " ('“', 'NNP'),\n", + " ('move', 'NN'),\n", + " ('”', 'NN'),\n", + " ('to', 'TO'),\n", + " ('“', 'VB'),\n", + " ('mv', 'NN'),\n", + " (',', ','),\n", + " ('”', 'NNP'),\n", + " ('“', 'NNP'),\n", + " ('list', 'NN'),\n", + " ('”', 'NN'),\n", + " ('to', 'TO'),\n", + " ('“', 'VB'),\n", + " ('ls', 'NN'),\n", + " (',', ','),\n", + " ('”', 'NNP'),\n", + " ('etc.', 'NN'),\n", + " (',', ','),\n", + " ('in', 'IN'),\n", + " ('order', 'NN'),\n", + " ('to', 'TO'),\n", + " ('cut', 'VB'),\n", + " ('down', 'RP'),\n", + " ('machine', 'NN'),\n", + " ('memory', 'NN'),\n", + " ('use', 'NN'),\n", + " (',', ','),\n", + " ('teletype', 'JJ'),\n", + " ('paper', 
'NN'),\n", + " ('consumption', 'NN'),\n", + " (',', ','),\n", + " ('and', 'CC'),\n", + " ('human', 'JJ'),\n", + " ('typing', 'VBG'),\n", + " ('effort', 'NN'),\n", + " ('at', 'IN'),\n", + " ('the', 'DT'),\n", + " ('same', 'JJ'),\n", + " ('time', 'NN'),\n", + " ('.', '.'),\n", + " ('Any', 'DT'),\n", + " ('computer', 'NN'),\n", + " ('control', 'NN'),\n", + " ('language', 'NN'),\n", + " ('is', 'VBZ'),\n", + " ('thus', 'RB'),\n", + " ('a', 'DT'),\n", + " ('cultural', 'JJ'),\n", + " ('compromise', 'NN'),\n", + " ('between', 'IN'),\n", + " ('the', 'DT'),\n", + " ('constraints', 'NNS'),\n", + " ('of', 'IN'),\n", + " ('machine', 'NN'),\n", + " ('design—which', 'NN'),\n", + " ('is', 'VBZ'),\n", + " ('far', 'RB'),\n", + " ('from', 'IN'),\n", + " ('objective', 'JJ'),\n", + " (',', ','),\n", + " ('but', 'CC'),\n", + " ('based', 'VBN'),\n", + " ('on', 'IN'),\n", + " ('human', 'JJ'),\n", + " ('choices', 'NNS'),\n", + " (',', ','),\n", + " ('culture', 'NN'),\n", + " (',', ','),\n", + " ('and', 'CC'),\n", + " ('thinking', 'VBG'),\n", + " ('style', 'NN'),\n", + " ('itself', 'PRP'),\n", + " ('4—and', 'CD'),\n", + " ('the', 'DT'),\n", + " ('equally', 'RB'),\n", + " ('subjective', 'JJ'),\n", + " ('user', 'NN'),\n", + " ('preferences', 'NNS'),\n", + " (',', ','),\n", + " ('involving', 'VBG'),\n", + " ('fuzzy', 'JJ'),\n", + " ('factors', 'NNS'),\n", + " ('like', 'IN'),\n", + " ('readability', 'NN'),\n", + " (',', ','),\n", + " ('elegance', 'NN'),\n", + " (',', ','),\n", + " ('and', 'CC'),\n", + " ('usage', 'JJ'),\n", + " ('efficiency', 'NN'),\n", + " ('.', '.'),\n", + " ('The', 'DT'),\n", + " ('symbols', 'NNS'),\n", + " ('of', 'IN'),\n", + " ('computer', 'NN'),\n", + " ('control', 'NN'),\n", + " ('languages', 'VBZ'),\n", + " ('inevitably', 'RB'),\n", + " ('do', 'VBP'),\n", + " ('have', 'VB'),\n", + " ('semantic', 'JJ'),\n", + " ('connotations', 'NNS'),\n", + " ('simply', 'RB'),\n", + " ('because', 'IN'),\n", + " ('there', 'EX'),\n", + " ('exist', 'VBP'),\n", + " ('no', 'DT'),\n", + " 
('symbols', 'NNS'),\n", + " ('with', 'IN'),\n", + " ('which', 'WDT'),\n", + " ('humans', 'NNS'),\n", + " ('would', 'MD'),\n", + " ('not', 'RB'),\n", + " ('associate', 'VB'),\n", + " ('some', 'DT'),\n", + " ('meaning', 'NN'),\n", + " ('.', '.'),\n", + " ('But', 'CC'),\n", + " ('symbols', 'NNS'),\n", + " ('can', 'MD'),\n", + " ('’', 'VB'),\n", + " ('t', 'JJ'),\n", + " ('denote', 'NN'),\n", + " ('any', 'DT'),\n", + " ('semantic', 'JJ'),\n", + " ('statements', 'NNS'),\n", + " (',', ','),\n", + " ('that', 'DT'),\n", + " ('is', 'VBZ'),\n", + " (',', ','),\n", + " ('they', 'PRP'),\n", + " ('do', 'VBP'),\n", + " ('not', 'RB'),\n", + " ('express', 'VB'),\n", + " ('meaning', 'VBG'),\n", + " ('in', 'IN'),\n", + " ('their', 'PRP$'),\n", + " ('own', 'JJ'),\n", + " ('terms', 'NNS'),\n", + " (';', ':'),\n", + " ('humans', 'NNS'),\n", + " ('metaphorically', 'RB'),\n", + " ('read', 'VB'),\n", + " ('meaning', 'VBG'),\n", + " ('into', 'IN'),\n", + " ('them', 'PRP'),\n", + " ('through', 'IN'),\n", + " ('associations', 'NNS'),\n", + " ('they', 'PRP'),\n", + " ('make', 'VBP'),\n", + " ('.', '.'),\n", + " ('Languages', 'NNS'),\n", + " ('without', 'IN'),\n", + " ('semantic', 'JJ'),\n", + " ('denotation', 'NN'),\n", + " ('are', 'VBP'),\n", + " ('not', 'RB'),\n", + " ('historically', 'RB'),\n", + " ('new', 'JJ'),\n", + " ('phenomena', 'NNS'),\n", + " (';', ':'),\n", + " ('mathematical', 'JJ'),\n", + " ('formulas', 'NNS'),\n", + " ('are', 'VBP'),\n", + " ('their', 'PRP$'),\n", + " ('oldest', 'JJS'),\n", + " ('example', 'NN'),\n", + " ('.', '.'),\n", + " ('In', 'IN'),\n", + " ('comparison', 'NN'),\n", + " ('to', 'TO'),\n", + " ('common', 'JJ'),\n", + " ('human', 'JJ'),\n", + " ('languages', 'NNS'),\n", + " (',', ','),\n", + " ('the', 'DT'),\n", + " ('multitude', 'NN'),\n", + " ('of', 'IN'),\n", + " ('programming', 'VBG'),\n", + " ('languages', 'NNS'),\n", + " ('is', 'VBZ'),\n", + " ('of', 'IN'),\n", + " ('lesser', 'JJR'),\n", + " ('significance', 'NN'),\n", + " ('.', '.'),\n", + " ('The', 
'DT'),\n", + " ('criterion', 'NN'),\n", + " ('of', 'IN'),\n", + " ('Turing', 'NNP'),\n", + " ('completeness', 'NN'),\n", + " ('of', 'IN'),\n", + " ('a', 'DT'),\n", + " ('programming', 'NN'),\n", + " ('language', 'NN'),\n", + " (',', ','),\n", + " ('that', 'WDT'),\n", + " ('is', 'VBZ'),\n", + " (',', ','),\n", + " ('that', 'IN'),\n", + " ('any', 'DT'),\n", + " ('computation', 'NN'),\n", + " ('can', 'MD'),\n", + " ('be', 'VB'),\n", + " ('expressed', 'VBN'),\n", + " ('in', 'IN'),\n", + " ('it', 'PRP'),\n", + " (',', ','),\n", + " ('means', 'VBZ'),\n", + " ('that', 'IN'),\n", + " ('every', 'DT'),\n", + " ('programming', 'NN'),\n", + " ('language', 'NN'),\n", + " ('is', 'VBZ'),\n", + " (',', ','),\n", + " ('formally', 'RB'),\n", + " ('speaking', 'VBG'),\n", + " (',', ','),\n", + " ('just', 'RB'),\n", + " ('a', 'DT'),\n", + " ('riff', 'NN'),\n", + " ('on', 'IN'),\n", + " ('every', 'DT'),\n", + " ('other', 'JJ'),\n", + " ('programming', 'NN'),\n", + " ('language', 'NN'),\n", + " ('.', '.'),\n", + " ('Nothing', 'NN'),\n", + " ('can', 'MD'),\n", + " ('be', 'VB'),\n", + " ('expressed', 'VBN'),\n", + " ('in', 'IN'),\n", + " ('a', 'DT'),\n", + " ('Turingcomplete', 'JJ'),\n", + " ('language', 'NN'),\n", + " ('such', 'JJ'),\n", + " ('as', 'IN'),\n", + " ('C', 'NNP'),\n", + " ('that', 'IN'),\n", + " ('couldn', 'NN'),\n", + " ('’', 'NNP'),\n", + " ('t', 'NN'),\n", + " ('also', 'RB'),\n", + " ('be', 'VB'),\n", + " ('expressed', 'VBN'),\n", + " ('in', 'IN'),\n", + " ('another', 'DT'),\n", + " ('Turingcomplete', 'NNP'),\n", + " ('language', 'NN'),\n", + " ('such', 'JJ'),\n", + " ('as', 'IN'),\n", + " ('Lisp', 'NNP'),\n", + " ('(', '('),\n", + " ('or', 'CC'),\n", + " ('Fortran', 'NNP'),\n", + " (',', ','),\n", + " ('Smalltalk', 'NNP'),\n", + " (',', ','),\n", + " ('Java', 'NNP'),\n", + " ('...', ':'),\n", + " (')', ')'),\n", + " ('and', 'CC'),\n", + " ('vice', 'NN'),\n", + " ('versa', 'NN'),\n", + " ('.', '.'),\n", + " ('This', 'DT'),\n", + " ('ultimately', 'JJ'),\n", + " ('proves', 
'VBZ'),\n", + " ('the', 'DT'),\n", + " ...]" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tagged" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { diff --git a/weasyprint.ipynb b/weasyprint.ipynb index cc4b028..dcab8ed 100644 --- a/weasyprint.ipynb +++ b/weasyprint.ipynb @@ -9,7 +9,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -26,7 +26,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -43,7 +43,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -55,166 +55,130 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "or ..." + "or in this case let's use python + nltk to make a custom HTML page with parts of speech used as CSS classes..." ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 42, "metadata": {}, "outputs": [], "source": [ - "# making an HTML object using our mini-datasets\n", - "import json\n", - "\n", - "f = open('json-dataset.json').read()\n", - "dataset = json.loads(f)\n", - "print(dataset)\n", - "\n", - "content = ''\n", + "import nltk\n", "\n", - "for word, value in dataset.items():\n", - " content += f'{ word } ({ value })
'\n", - " \n", - "html = HTML(string=content)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "or ..." + "txt = open('txt/language.txt').read()\n", + "words = nltk.word_tokenize(txt)\n", + "tagged_words = nltk.pos_tag(words)" ] }, { "cell_type": "code", - "execution_count": 83, + "execution_count": 23, "metadata": {}, "outputs": [], "source": [ - "# making an HTML object using our mini-datasets to insert a layer into a text\n", - "import json, nltk\n", - "\n", - "f = open('json-dataset.json').read()\n", - "dataset = json.loads(f)\n", - "#print(dataset)\n", - "\n", - "txt = open('txt/language.txt').read()\n", - "words = nltk.word_tokenize(txt)\n", - "#print(words)\n", - "\n", "content = ''\n", - "\n", "content += '

Language and Software Studies, by Florian Cramer

'\n", "\n", - "for word in words:\n", - " if word in dataset:\n", - " content += f'{ word } ({ value }) '\n", - " else:\n", - " content += f' { word } '\n", + "for word, tag in tagged_words:\n", + " content += f'{ word } '\n", "\n", - "html = HTML(string=content)" + "with open(\"txt/language.html\", \"w\") as f:\n", + " f.write(f\"\"\"\n", + "\n", + "\n", + " \n", + " \n", + " \n", + "\n", + "\n", + "{content}\n", + "\n", + "\"\"\")\n", + "\n", + "html = HTML(\"txt/language.html\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## CSS" + "Saved to [language.html](txt/language.html). Fun fact: jupyter filters HTML pages that are displayed in the notebook. To see the HTML unfiltered, use an iframe (as below), or right-click and select Open in New Tab in the file list.\n", + "\n", + "Maybe useful evt. https://stackoverflow.com/questions/23358444/how-can-i-use-word-tokenize-in-nltk-and-keep-the-spaces" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "NB: The above HTML refers to the stylesheet [language.css](txt/language.css) (notice that the path is relative to the HTML page, so no need to say txt in the link)." 
] }, { "cell_type": "code", - "execution_count": 115, + "execution_count": 34, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "css = CSS(string='''\n", - " @page{\n", - " size: A4;\n", - " margin: 15mm;\n", - " background-color: lightgrey;\n", - " font-family: monospace;\n", - " font-size: 8pt;\n", - " color: red;\n", - " border:1px dotted red;\n", - " \n", - " @top-left{\n", - " content: \"natural\";\n", - " }\n", - " @top-center{\n", - " content: \"language\";\n", - " }\n", - " @top-right{\n", - " content: \"artificial\";\n", - " }\n", - " @top-middle{\n", - " content: \"\"\n", - " }\n", - " @left-top{\n", - " content: \"computer control\";\n", - " }\n", - " @right-top{\n", - " content: \"markup\";\n", - " }\n", - " @bottom-left{\n", - " content: \"formal\";\n", - " }\n", - " @bottom-center{\n", - " content: \"programming\";\n", - " }\n", - " @bottom-right{\n", - " content: \"machine\";\n", - " }\n", - " }\n", - " body{\n", - " font-family: serif;\n", - " font-size: 12pt;\n", - " line-height: 1.4;\n", - " color: magenta;\n", - " }\n", - " h1{\n", - " width: 100%;\n", - " text-align: center;\n", - " font-size: 250%;\n", - " line-height: 1.25;\n", - " color: orange;\n", - " }\n", - " strong{\n", - " color: blue;\n", - " }\n", - " em{\n", - " color: green;\n", - " }\n", - "''', font_config=font_config)" + "from IPython.display import IFrame\n", + "IFrame(\"txt/language.html\", width=1024, height=600)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## PDF" + "## Generating the PDF!\n", + "\n", + "Now let's let weasyprint do its stuff!
write_pdf actually calculates the layout, behaving like a web browser to render the HTML visibly and following the CSS guidelines for page media (notice the special rules in the CSS that weasyprint recognizes and uses that the browser does not). Notice that the CSS file gets mentioned again explicitly (and here we need to refer to its path relative to this folder)." ] }, { "cell_type": "code", - "execution_count": 116, + "execution_count": 39, "metadata": {}, "outputs": [], "source": [ - "html.write_pdf('weasyprint-test.pdf', stylesheets=[css], font_config=font_config)" + "## If we had not linked the CSS in the HTML, you could specify it in this way\n", + "# css = CSS(\"txt/language.css\", font_config=font_config)\n", + "# html.write_pdf('txt/language.pdf', stylesheets=[css], font_config=font_config)" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": 40, "metadata": {}, + "outputs": [], "source": [ - "## Previewing the PDF" + "html.write_pdf('txt/language.pdf', font_config=font_config)" ] }, { "cell_type": "code", - "execution_count": 117, + "execution_count": 41, "metadata": {}, "outputs": [ { @@ -224,24 +188,24 @@ " \n", " " ], "text/plain": [ - "" + "" ] }, - "execution_count": 117, + "execution_count": 41, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from IPython.display import IFrame\n", - "IFrame(\"weasyprint-test.pdf\", width=1024, height=600)" + "IFrame(\"txt/language.pdf\", width=1024, height=600)" ] }, {