{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# NLTK pos-tagged HTML → PDF\n",
    "\n",
    "Tokenize a text with NLTK, tag every token with its part of speech, wrap each token in a `<span>` whose class is the tag, and render the styled HTML to a PDF with WeasyPrint.\n",
    "\n",
    "`nltk.word_tokenize` and `nltk.pos_tag` need NLTK's tokenizer and tagger data. If they raise a `LookupError`, run the `nltk.download(...)` call printed in the error message and re-run the notebook."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from html import escape\n",
    "\n",
    "import nltk\n",
    "from IPython.display import IFrame, display\n",
    "from weasyprint import HTML, CSS\n",
    "\n",
    "# FontConfiguration lives in weasyprint.fonts in older WeasyPrint releases\n",
    "# and in weasyprint.text.fonts in newer ones; accept either.\n",
    "try:\n",
    "    from weasyprint.fonts import FontConfiguration\n",
    "except ImportError:\n",
    "    from weasyprint.text.fonts import FontConfiguration"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Tokenize and tag the text"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# open the input file (closed again as soon as it has been read)\n",
    "# NOTE(review): assumes the text file is UTF-8 encoded - confirm\n",
    "with open('../txt/language.txt', encoding='utf-8') as f:\n",
    "    txt = f.read()\n",
    "\n",
    "words = nltk.word_tokenize(txt)\n",
    "tagged_words = nltk.pos_tag(words)  # list of (word, tag) pairs"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Build the HTML"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# collect all the pieces of HTML\n",
    "pieces = ['<h1>Language and Software Studies, by Florian Cramer</h1>']\n",
    "\n",
    "for word, tag in tagged_words:\n",
    "    # the POS tag becomes the CSS class, so language.css can colour by tag;\n",
    "    # escape both so tokens such as < or & cannot break the markup\n",
    "    pieces.append(f'<span class=\"{ escape(tag) }\">{ escape(word) }</span> ')\n",
    "\n",
    "# join once instead of growing a string inside the loop\n",
    "content = ''.join(pieces)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# write the HTML file (UTF-8, matching the charset declared in <head>)\n",
    "with open(\"language.html\", \"w\", encoding=\"utf-8\") as f:\n",
    "    f.write(f\"\"\"<!DOCTYPE html>\n",
    "<html>\n",
    "<head>\n",
    "    <meta charset=\"utf-8\">\n",
    "    <link rel=\"stylesheet\" type=\"text/css\" href=\"language.css\">\n",
    "    <title>Language and Software Studies</title>\n",
    "</head>\n",
    "<body>\n",
    "{ content }\n",
    "</body>\n",
    "</html>\n",
    "\"\"\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Stylesheet\n",
    "\n",
    "Each selector below is a part-of-speech tag used as a class name on the word spans."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# write a CSS file\n",
    "with open(\"language.css\", \"w\", encoding=\"utf-8\") as f:\n",
    "    f.write(\"\"\"\n",
    "\n",
    "@page{\n",
    "    size:A4;\n",
    "    background-color:lightgrey;\n",
    "    margin:10mm;\n",
    "}\n",
    ".JJ{\n",
    "    color:red;\n",
    "}\n",
    ".VB,\n",
    ".VBG{\n",
    "    color:magenta;\n",
    "}\n",
    ".NN,\n",
    ".NNP{\n",
    "    color:green;\n",
    "}\n",
    ".EX{\n",
    "    color: blue;\n",
    "}\n",
    "    \"\"\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Render the PDF"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# If you use @font-face in your stylesheet, you would need Weasyprint's FontConfiguration()\n",
    "font_config = FontConfiguration()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# collect all the files and write the PDF\n",
    "html = HTML(\"language.html\")\n",
    "# the same font_config goes to both CSS() and write_pdf() so that\n",
    "# @font-face rules in the stylesheet are picked up\n",
    "css = CSS(\"language.css\", font_config=font_config)\n",
    "html.write_pdf('language.pdf', stylesheets=[css], font_config=font_config)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Preview your PDF in the notebook!\n",
    "IFrame(\"language.pdf\", width=900, height=600)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}