{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# NLTK pos-tagged HTML → PDF" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import nltk\n", "from weasyprint import HTML, CSS" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# open the input file\n", "txt = open('../txt/language.txt').read()\n", "words = nltk.word_tokenize(txt)\n", "tagged_words = nltk.pos_tag(words)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# collect all the pieces of HTML\n", "content = ''\n", "content += '