updated/simplified weasyprint notebook

master
Michael Murtaugh 4 years ago
parent ebac31033e
commit 76a7f198bb

@ -74,9 +74,21 @@
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"execution_count": 1,
"metadata": {},
"outputs": [
{
"ename": "NameError",
"evalue": "name 'dataset' is not defined",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-1-f8b69ae2bb04>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# Adding a new key to the dictionary, assigning a string as value:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mdataset\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'new'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m'NEW WORD'\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0;31m# or assigning a number as value:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mdataset\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'new'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m10\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mNameError\u001b[0m: name 'dataset' is not defined"
]
}
],
"source": [
"# Adding a new key to the dictionary, assigning a string as value:\n",
"dataset['new'] = 'NEW WORD'\n",
@ -113,19 +125,27 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# This is sample data, a list of words and POS tags:\n",
"data = [('Common', 'JJ'), ('languages', 'NNS'), ('like', 'IN'), ('English', 'NNP'), ('are', 'VBP'), ('both', 'DT'), ('formal', 'JJ'), ('and', 'CC'), ('semantic', 'JJ'), (';', ':'), ('although', 'IN'), ('their', 'PRP$'), ('scope', 'NN'), ('extends', 'VBZ'), ('beyond', 'IN'), ('the', 'DT'), ('formal', 'JJ'), (',', ','), ('anything', 'NN'), ('that', 'WDT'), ('can', 'MD'), ('be', 'VB'), ('expressed', 'VBN'), ('in', 'IN'), ('a', 'DT'), ('computer', 'NN'), ('control', 'NN'), ('language', 'NN'), ('can', 'MD'), ('also', 'RB'), ('be', 'VB'), ('expressed', 'VBN'), ('in', 'IN'), ('common', 'JJ'), ('language', 'NN'), ('.', '.')]"
"dataset = [('Common', 'JJ'), ('languages', 'NNS'), ('like', 'IN'), ('English', 'NNP'), ('are', 'VBP'), ('both', 'DT'), ('formal', 'JJ'), ('and', 'CC'), ('semantic', 'JJ'), (';', ':'), ('although', 'IN'), ('their', 'PRP$'), ('scope', 'NN'), ('extends', 'VBZ'), ('beyond', 'IN'), ('the', 'DT'), ('formal', 'JJ'), (',', ','), ('anything', 'NN'), ('that', 'WDT'), ('can', 'MD'), ('be', 'VB'), ('expressed', 'VBN'), ('in', 'IN'), ('a', 'DT'), ('computer', 'NN'), ('control', 'NN'), ('language', 'NN'), ('can', 'MD'), ('also', 'RB'), ('be', 'VB'), ('expressed', 'VBN'), ('in', 'IN'), ('common', 'JJ'), ('language', 'NN'), ('.', '.')]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'are': 'VBP', 'extends': 'VBZ', 'be': 'VB', 'expressed': 'VBN'}\n"
]
}
],
"source": [
"# Making a dataset with only verbs\n",
"dataset = {}\n",
@ -146,7 +166,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
@ -155,9 +175,22 @@
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{\n",
" \"are\": \"VBP\",\n",
" \"extends\": \"VBZ\",\n",
" \"be\": \"VB\",\n",
" \"expressed\": \"VBN\"\n",
"}\n"
]
}
],
"source": [
"out = json.dumps(dataset, indent=4)\n",
"print(out)"
@ -165,7 +198,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [

@ -16,7 +16,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
@ -26,9 +26,18 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 2,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The symbols of computer control languages inevitably do have semantic connotations simply because there exist no symbols with which humans would not associate some meaning.\n",
"\n"
]
}
],
"source": [
"lines = open('txt/language.txt').readlines()\n",
"sentence = random.choice(lines)\n",
@ -44,9 +53,17 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 3,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['The', 'symbols', 'of', 'computer', 'control', 'languages', 'inevitably', 'do', 'have', 'semantic', 'connotations', 'simply', 'because', 'there', 'exist', 'no', 'symbols', 'with', 'which', 'humans', 'would', 'not', 'associate', 'some', 'meaning', '.']\n"
]
}
],
"source": [
"tokens = nltk.word_tokenize(sentence)\n",
"print(tokens)"
@ -68,9 +85,17 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 4,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<FreqDist with 26 samples and 173 outcomes>\n"
]
}
],
"source": [
"# frequency of characters\n",
"fd = nltk.FreqDist(sentence)\n",
@ -79,9 +104,17 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 5,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[(' ', 24), ('o', 15), ('e', 14), ('s', 14), ('n', 12), ('t', 11), ('a', 11), ('i', 10), ('m', 8), ('h', 7), ('l', 7), ('c', 7), ('u', 5), ('y', 4), ('b', 4), ('r', 3), ('g', 3), ('w', 3), ('p', 2), ('v', 2), ('d', 2), ('T', 1), ('f', 1), ('x', 1), ('.', 1), ('\\n', 1)]\n"
]
}
],
"source": [
"print(fd.most_common(50))"
]
@ -95,9 +128,17 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 6,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<FreqDist with 25 samples and 26 outcomes>\n"
]
}
],
"source": [
"# frequency of words\n",
"fd = nltk.FreqDist(tokens)\n",
@ -106,9 +147,17 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 7,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[('symbols', 2), ('The', 1), ('of', 1), ('computer', 1), ('control', 1), ('languages', 1), ('inevitably', 1), ('do', 1), ('have', 1), ('semantic', 1), ('connotations', 1), ('simply', 1), ('because', 1), ('there', 1), ('exist', 1), ('no', 1), ('with', 1), ('which', 1), ('humans', 1), ('would', 1), ('not', 1), ('associate', 1), ('some', 1), ('meaning', 1), ('.', 1)]\n"
]
}
],
"source": [
"print(fd.most_common(50))"
]
@ -122,9 +171,17 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 8,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<FreqDist with 944 samples and 2835 outcomes>\n"
]
}
],
"source": [
"# frequency of a text\n",
"txt = open('txt/language.txt').read()\n",
@ -135,9 +192,17 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 9,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[(',', 172), ('.', 93), ('the', 88), ('of', 88), ('”', 66), ('“', 65), ('and', 61), ('a', 61), ('is', 58), ('languages', 54), ('in', 51), ('language', 47), ('to', 41), ('as', 37), ('computer', 32), ('that', 29), ('programming', 25), ('control', 23), ('are', 22), ('for', 21), ('', 21), ('The', 18), ('can', 17), ('be', 16), ('it', 16), ('machine', 16), ('human', 15), ('not', 15), ('software', 14), ('formal', 14), ('or', 14), ('symbols', 14), ('s', 12), ('with', 12), (':', 11), ('its', 11), ('this', 11), ('common', 11), ('their', 10), ('example', 9), (';', 9), ('operations', 9), ('such', 9), ('from', 8), ('through', 8), ('code', 8), ('since', 7), ('different', 7), ('In', 7), ('like', 7)]\n"
]
}
],
"source": [
"print(fd.most_common(50))"
]
@ -151,9 +216,17 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 10,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"47\n"
]
}
],
"source": [
"# Requesting the frequency of a specific word\n",
"print(fd['language'])"

File diff suppressed because it is too large.

@ -9,7 +9,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
@ -26,7 +26,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
@ -43,7 +43,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
@ -55,166 +55,130 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"or ..."
"or in this case let's use python + nltk to make a custom HTML page with parts of speech used as CSS classes..."
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 42,
"metadata": {},
"outputs": [],
"source": [
"# making an HTML object using our mini-datasets\n",
"import json\n",
"\n",
"f = open('json-dataset.json').read()\n",
"dataset = json.loads(f)\n",
"print(dataset)\n",
"\n",
"content = ''\n",
"import nltk\n",
"\n",
"for word, value in dataset.items():\n",
" content += f'<em>{ word }</em> (<strong>{ value }</strong>)<br />'\n",
" \n",
"html = HTML(string=content)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"or ..."
"txt = open('txt/language.txt').read()\n",
"words = nltk.word_tokenize(txt)\n",
"tagged_words = nltk.pos_tag(words)"
]
},
{
"cell_type": "code",
"execution_count": 83,
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"# making an HTML object using our mini-datasets to insert a layer into a text\n",
"import json, nltk\n",
"\n",
"f = open('json-dataset.json').read()\n",
"dataset = json.loads(f)\n",
"#print(dataset)\n",
"\n",
"txt = open('txt/language.txt').read()\n",
"words = nltk.word_tokenize(txt)\n",
"#print(words)\n",
"\n",
"content = ''\n",
"\n",
"content += '<h1>Language and Software Studies, by Florian Cramer</h1>'\n",
"\n",
"for word in words:\n",
" if word in dataset:\n",
" content += f'<em>{ word }</em> (<strong>{ value }</strong>) '\n",
" else:\n",
" content += f' { word } '\n",
"for word, tag in tagged_words:\n",
" content += f'<span class=\"{tag}\">{ word }</span> '\n",
"\n",
"html = HTML(string=content)"
"with open(\"txt/language.html\", \"w\") as f:\n",
" f.write(f\"\"\"<!DOCTYPE html>\n",
"<html>\n",
"<head>\n",
" <meta charset=\"utf-8\">\n",
" <link rel=\"stylesheet\" type=\"text/css\" href=\"language.css\">\n",
" <title></title>\n",
"</head>\n",
"<body>\n",
"{content}\n",
"</body>\n",
"\"\"\")\n",
"\n",
"html = HTML(\"txt/language.html\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## CSS"
"Saved to [language.html](txt/language.html). Fun fact: jupyter filters HTML pages that are displayed in the notebook. To see the HTML unfiltered, use an iframe (as below), or right-click and select Open in New Tab in the file list.\n",
"\n",
"Maybe useful evt. https://stackoverflow.com/questions/23358444/how-can-i-use-word-tokenize-in-nltk-and-keep-the-spaces"
]
},
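The Stack Overflow link above points at exactly the limitation of this cell: `nltk.word_tokenize` discards the original whitespace, so the rebuilt HTML joins every token with a single space. A minimal sketch of one workaround, splitting on whitespace with a capturing regex so the spacing survives (tagging chunk-by-chunk loses sentence context, so the tags will be rougher; assumes the `averaged_perceptron_tagger` NLTK data is installed):

```python
import re
import nltk

txt = "Common languages like English are both formal and semantic."

# re.split with a capturing group keeps the whitespace chunks,
# so the original spacing can be passed through untouched.
chunks = re.split(r'(\s+)', txt)

content = ''
for chunk in chunks:
    if chunk.strip():  # a word (possibly with attached punctuation): tag it
        tag = nltk.pos_tag([chunk])[0][1]
        content += f'<span class="{tag}">{chunk}</span>'
    else:              # whitespace: keep as-is
        content += chunk

print(content)
```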
{
"cell_type": "markdown",
"metadata": {},
"source": [
"NB: The above HTML refers to the stylesheet [language.css](txt/language.css) (notice that the path is relative to the HTML page, so no need to say txt in the link)."
]
},
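The stylesheet itself is not part of this diff. Since the spans generated above carry Penn Treebank tags as class names, language.css presumably styles those classes; a hypothetical sketch (selectors match the emitted tags, all values invented for illustration):

```python
# Hypothetical sketch of txt/language.css -- the real file is not shown
# in this diff; class names match the Penn Treebank tags emitted above.
css_rules = """
body { font-family: serif; font-size: 12pt; line-height: 1.4; }
.NN, .NNS, .NNP { color: blue; }        /* nouns */
.VB, .VBP, .VBZ, .VBN { color: green; } /* verbs */
.JJ { color: orange; }                  /* adjectives */
"""
with open("txt/language.css", "w") as f:  # path assumes the notebook's folder layout
    f.write(css_rules)
```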
{
"cell_type": "code",
"execution_count": 115,
"execution_count": 34,
"metadata": {},
"outputs": [],
"outputs": [
{
"data": {
"text/html": [
"\n",
" <iframe\n",
" width=\"1024\"\n",
" height=\"600\"\n",
" src=\"txt/language.html\"\n",
" frameborder=\"0\"\n",
" allowfullscreen\n",
" ></iframe>\n",
" "
],
"text/plain": [
"<IPython.lib.display.IFrame at 0x7f0bc93b9668>"
]
},
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"css = CSS(string='''\n",
" @page{\n",
" size: A4;\n",
" margin: 15mm;\n",
" background-color: lightgrey;\n",
" font-family: monospace;\n",
" font-size: 8pt;\n",
" color: red;\n",
" border:1px dotted red;\n",
" \n",
" @top-left{\n",
" content: \"natural\";\n",
" }\n",
" @top-center{\n",
" content: \"language\";\n",
" }\n",
" @top-right{\n",
" content: \"artificial\";\n",
" }\n",
" @top-middle{\n",
" content: \"\"\n",
" }\n",
" @left-top{\n",
" content: \"computer control\";\n",
" }\n",
" @right-top{\n",
" content: \"markup\";\n",
" }\n",
" @bottom-left{\n",
" content: \"formal\";\n",
" }\n",
" @bottom-center{\n",
" content: \"programming\";\n",
" }\n",
" @bottom-right{\n",
" content: \"machine\";\n",
" }\n",
" }\n",
" body{\n",
" font-family: serif;\n",
" font-size: 12pt;\n",
" line-height: 1.4;\n",
" color: magenta;\n",
" }\n",
" h1{\n",
" width: 100%;\n",
" text-align: center;\n",
" font-size: 250%;\n",
" line-height: 1.25;\n",
" color: orange;\n",
" }\n",
" strong{\n",
" color: blue;\n",
" }\n",
" em{\n",
" color: green;\n",
" }\n",
"''', font_config=font_config)"
"from IPython.display import IFrame\n",
"IFrame(\"txt/language.html\", width=1024, height=600)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## PDF"
"## Generating the PDF!\n",
"\n",
"Now let's let weasyprint do it's stuff! Write_pdf actually calculates the layout, behaving like a web browser to render the HTML visibly and following the CSS guidelines for page media (notice the special rules in the CSS that weasy print recognizes and uses that the browser does not). Notice that the CSS file gets mentioned again explicitly (and here we need to refer to its path relative to this folder)."
]
},
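For reference, the paged-media rules this refers to are the kind the commit removes from the notebook's old CSS cell: an @page size and margin, plus margin boxes like @top-center whose generated content becomes running headers. A condensed, self-contained sketch adapted from that removed CSS (output filename illustrative):

```python
from weasyprint import HTML, CSS

# @page rules are honored by WeasyPrint but ignored by browsers;
# the values here are adapted from the CSS this commit removes.
css = CSS(string='''
    @page {
        size: A4;
        margin: 15mm;
        @top-center { content: "language"; }
        @bottom-center { content: "programming"; }
    }
    body { font-family: serif; font-size: 12pt; }
''')

html = HTML(string="<h1>Paged media test</h1><p>Rendered by WeasyPrint.</p>")
html.write_pdf('paged-media-test.pdf', stylesheets=[css])
```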
{
"cell_type": "code",
"execution_count": 116,
"execution_count": 39,
"metadata": {},
"outputs": [],
"source": [
"html.write_pdf('weasyprint-test.pdf', stylesheets=[css], font_config=font_config)"
"## If we had not linked the CSS in the HTML, you could specify it in this way\n",
"# css = CSS(\"txt/language.css\", font_config=font_config)\n",
"# html.write_pdf('txt/language.pdf', stylesheets=[css], font_config=font_config)"
]
},
{
"cell_type": "markdown",
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [],
"source": [
"## Previewing the PDF"
"html.write_pdf('txt/language.pdf', font_config=font_config)"
]
},
{
"cell_type": "code",
"execution_count": 117,
"execution_count": 41,
"metadata": {},
"outputs": [
{
@ -224,24 +188,24 @@
" <iframe\n",
" width=\"1024\"\n",
" height=\"600\"\n",
" src=\"weasyprint-test.pdf\"\n",
" src=\"txt/language.pdf\"\n",
" frameborder=\"0\"\n",
" allowfullscreen\n",
" ></iframe>\n",
" "
],
"text/plain": [
"<IPython.lib.display.IFrame at 0x7f2e5dcdcb38>"
"<IPython.lib.display.IFrame at 0x7f0bcbe67630>"
]
},
"execution_count": 117,
"execution_count": 41,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from IPython.display import IFrame\n",
"IFrame(\"weasyprint-test.pdf\", width=1024, height=600)"
"IFrame(\"txt/language.pdf\", width=1024, height=600)"
]
},
{
