You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
SI13/LIQUID/MANIFESTO/nltk-pos-tagging-and-weasyp...

208 lines
4.5 KiB
Plaintext

{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# NLTK POS-tagged HTML → PDF"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"from html import escape\n",
"\n",
"import nltk\n",
"from weasyprint import HTML, CSS"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# open the input file\n",
"# The `with` block closes the file handle deterministically once the text\n",
"# has been read, instead of leaving that to garbage collection.\n",
"with open('manifesto1.txt') as manifesto_file:\n",
"    txt = manifesto_file.read()\n",
"\n",
"# split the raw text into tokens ...\n",
"words = nltk.word_tokenize(txt)\n",
"# ... and tag them; later cells unpack each item as a (word, tag) pair\n",
"tagged_words = nltk.pos_tag(words)"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"A0Grid_30opac-01.png manifesto.pdf\n",
"language.css\t nltk-frequency-distribution-Copy1.ipynb\n",
"manifesto1.txt\t nltk-pos-tagger-Copy1.ipynb\n",
"manifesto.css\t nltk-pos-tagging-and-weasyprint.ipynb\n",
"manifesto.html\t nltk-similar-words-Copy1.ipynb\n",
"manifestonltk.ipynb pattern-search-Copy1.ipynb\n"
]
}
],
"source": [
"# list the working directory to see which input and output files exist\n",
"!ls"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# collect all the pieces of HTML\n",
"# Each token becomes a <span> whose class is its part-of-speech tag.\n",
"# Both values are HTML-escaped first: a token such as '&' or '<' would\n",
"# otherwise be parsed as markup instead of being shown as text.\n",
"spans = [\n",
"    f'<span class=\"{ escape(tag) }\">{ escape(word) }</span> '\n",
"    for word, tag in tagged_words\n",
"]\n",
"\n",
"# join once instead of growing the string with += for every token\n",
"content = '<h1>A Liquid Manifesto</h1>' + ''.join(spans)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"# write the HTML file\n",
"# The encoding is set explicitly so the bytes on disk always match the\n",
"# <meta charset=\"utf-8\"> declaration, whatever the platform default is.\n",
"# The <title> repeats the page heading (an empty title is not valid HTML),\n",
"# and the document is closed with </html> to match the opening tag.\n",
"with open(\"manifesto.html\", \"w\", encoding=\"utf-8\") as f:\n",
"    f.write(f\"\"\"<!DOCTYPE html>\n",
"<html>\n",
"<head>\n",
" <meta charset=\"utf-8\">\n",
" <link rel=\"stylesheet\" type=\"text/css\" href=\"manifesto.css\">\n",
" <title>A Liquid Manifesto</title>\n",
"</head>\n",
"<body>\n",
"{ content }\n",
"</body>\n",
"</html>\n",
"\"\"\")"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"# write a CSS file\n",
"# The class selectors below are part-of-speech tags; the HTML cell emits\n",
"# the tag of every token as the class of its <span>.\n",
"# NOTE(review): this cell writes language.css, while the HTML template links\n",
"# manifesto.css and the PDF cell loads manifesto.css -- confirm which\n",
"# stylesheet is meant to be used.\n",
"css_rules = \"\"\"\n",
"\n",
"@page{\n",
" size:A4;\n",
" background-color:lightgrey;\n",
" margin:10mm;\n",
"}\n",
".JJ{\n",
" color:red;\n",
"}\n",
".VB,\n",
".VBG{\n",
" color:magenta;\n",
"}\n",
".NN,\n",
".NNP{\n",
" color:green;\n",
"}\n",
".EX{\n",
" color: blue;\n",
"}\n",
" \"\"\"\n",
"\n",
"with open(\"language.css\", \"w\") as css_file:\n",
"    css_file.write(css_rules)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"# WeasyPrint needs a FontConfiguration object when a stylesheet uses @font-face.\n",
"from weasyprint.fonts import FontConfiguration\n",
"\n",
"# passed to write_pdf() in the PDF cell\n",
"font_config = FontConfiguration()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# collect all the files and write the PDF\n",
"html = HTML(filename=\"manifesto.html\")\n",
"\n",
"# The stylesheet is created with the same font_config that write_pdf()\n",
"# receives: WeasyPrint only registers the @font-face rules of a CSS object\n",
"# that was given a FontConfiguration.\n",
"css = CSS(filename=\"manifesto.css\", font_config=font_config)\n",
"\n",
"html.write_pdf('manifesto.pdf', stylesheets=[css], font_config=font_config)"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
" <iframe\n",
" width=\"900\"\n",
" height=\"600\"\n",
" src=\".pdf\"\n",
" frameborder=\"0\"\n",
" allowfullscreen\n",
" ></iframe>\n",
" "
],
"text/plain": [
"<IPython.lib.display.IFrame at 0xb3d3b310>"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Preview your PDF in the notebook!\n",
"from IPython.display import IFrame, display\n",
"\n",
"# the frame has to point at the file produced by write_pdf()\n",
"IFrame(\"manifesto.pdf\", width=900, height=600)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}