{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# NLTK pos-tagged HTML → PDF" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "import nltk\n", "from weasyprint import HTML, CSS" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "# Read the whole input file into memory; the with-block closes the file handle\n", "# as soon as the read finishes instead of leaving it to the garbage collector.\n", "with open('manifesto1.txt') as manifesto_file:\n", "    txt = manifesto_file.read()\n", "\n", "# Split the raw text into tokens, then pair every token with its part-of-speech tag.\n", "words = nltk.word_tokenize(txt)\n", "tagged_words = nltk.pos_tag(words)" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "A0Grid_30opac-01.png manifesto.pdf\n", "language.css\t nltk-frequency-distribution-Copy1.ipynb\n", "manifesto1.txt\t nltk-pos-tagger-Copy1.ipynb\n", "manifesto.css\t nltk-pos-tagging-and-weasyprint.ipynb\n", "manifesto.html\t nltk-similar-words-Copy1.ipynb\n", "manifestonltk.ipynb pattern-search-Copy1.ipynb\n" ] } ], "source": [ "!ls" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "# collect all the pieces of HTML\n", "content = ''\n", "content += '