You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
208 lines
4.5 KiB
Plaintext
208 lines
4.5 KiB
Plaintext
4 years ago
|
{
|
||
|
"cells": [
|
||
|
{
|
||
|
"cell_type": "markdown",
|
||
|
"metadata": {},
|
||
|
"source": [
|
||
|
"# NLTK pos-tagged HTML → PDF"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 4,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"import nltk\n",
|
||
|
"from weasyprint import HTML, CSS"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 5,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"# read the input file, split it into word tokens and tag each token with its part of speech\n",
|
||
|
"txt = open('manifesto1.txt').read()\n",
|
||
|
"words = nltk.word_tokenize(txt)\n",
|
||
|
"tagged_words = nltk.pos_tag(words)"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 22,
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"name": "stdout",
|
||
|
"output_type": "stream",
|
||
|
"text": [
|
||
|
"A0Grid_30opac-01.png manifesto.pdf\n",
|
||
|
"language.css\t nltk-frequency-distribution-Copy1.ipynb\n",
|
||
|
"manifesto1.txt\t nltk-pos-tagger-Copy1.ipynb\n",
|
||
|
"manifesto.css\t nltk-pos-tagging-and-weasyprint.ipynb\n",
|
||
|
"manifesto.html\t nltk-similar-words-Copy1.ipynb\n",
|
||
|
"manifestonltk.ipynb pattern-search-Copy1.ipynb\n"
|
||
|
]
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"!ls"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 6,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"# collect all the pieces of HTML\n",
|
||
|
"content = ''\n",
|
||
|
"content += '<h1>A Liquid Manifesto</h1>'\n",
|
||
|
"\n",
|
||
|
"for word, tag in tagged_words:\n",
|
||
|
" content += f'<span class=\"{ tag }\">{ word }</span> '"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 7,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"# write the HTML file\n",
|
||
|
"with open(\"manifesto.html\", \"w\") as f:\n",
|
||
|
" f.write(f\"\"\"<!DOCTYPE html>\n",
|
||
|
"<html>\n",
|
||
|
"<head>\n",
|
||
|
" <meta charset=\"utf-8\">\n",
|
||
|
" <link rel=\"stylesheet\" type=\"text/css\" href=\"manifesto.css\">\n",
|
||
|
" <title></title>\n",
|
||
|
"</head>\n",
|
||
|
"<body>\n",
|
||
|
"{ content }\n",
|
||
|
"</body>\n",
|
||
|
"\"\"\")"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 8,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"# write a CSS file (note: this writes language.css, while the HTML template and the PDF step load manifesto.css)\n",
|
||
|
"with open(\"language.css\", \"w\") as f:\n",
|
||
|
" f.write(\"\"\"\n",
|
||
|
"\n",
|
||
|
"@page{\n",
|
||
|
" size:A4;\n",
|
||
|
" background-color:lightgrey;\n",
|
||
|
" margin:10mm;\n",
|
||
|
"}\n",
|
||
|
".JJ{\n",
|
||
|
" color:red;\n",
|
||
|
"}\n",
|
||
|
".VB,\n",
|
||
|
".VBG{\n",
|
||
|
" color:magenta;\n",
|
||
|
"}\n",
|
||
|
".NN,\n",
|
||
|
".NNP{\n",
|
||
|
" color:green;\n",
|
||
|
"}\n",
|
||
|
".EX{\n",
|
||
|
" color: blue;\n",
|
||
|
"}\n",
|
||
|
" \"\"\")"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 9,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"# If your stylesheet uses @font-face rules, WeasyPrint needs a FontConfiguration() object\n",
|
||
|
"from weasyprint.fonts import FontConfiguration\n",
|
||
|
"\n",
|
||
|
"font_config = FontConfiguration()"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": null,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"# collect all the files and write the PDF\n",
|
||
|
"html = HTML(\"manifesto.html\")\n",
|
||
|
"css = CSS(\"manifesto.css\")\n",
|
||
|
"html.write_pdf('manifesto.pdf', stylesheets=[css], font_config=font_config)"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 1,
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"data": {
|
||
|
"text/html": [
|
||
|
"\n",
|
||
|
" <iframe\n",
|
||
|
" width=\"900\"\n",
|
||
|
" height=\"600\"\n",
|
||
|
" src=\".pdf\"\n",
|
||
|
" frameborder=\"0\"\n",
|
||
|
" allowfullscreen\n",
|
||
|
" ></iframe>\n",
|
||
|
" "
|
||
|
],
|
||
|
"text/plain": [
|
||
|
"<IPython.lib.display.IFrame at 0xb3d3b310>"
|
||
|
]
|
||
|
},
|
||
|
"execution_count": 1,
|
||
|
"metadata": {},
|
||
|
"output_type": "execute_result"
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"# Preview your PDF in the notebook! (note: the path below is '.pdf'; the file written above is 'manifesto.pdf')\n",
|
||
|
"from IPython.display import IFrame, display\n",
|
||
|
"IFrame(\".pdf\", width=900, height=600)"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": null,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": []
|
||
|
}
|
||
|
],
|
||
|
"metadata": {
|
||
|
"kernelspec": {
|
||
|
"display_name": "Python 3",
|
||
|
"language": "python",
|
||
|
"name": "python3"
|
||
|
},
|
||
|
"language_info": {
|
||
|
"codemirror_mode": {
|
||
|
"name": "ipython",
|
||
|
"version": 3
|
||
|
},
|
||
|
"file_extension": ".py",
|
||
|
"mimetype": "text/x-python",
|
||
|
"name": "python",
|
||
|
"nbconvert_exporter": "python",
|
||
|
"pygments_lexer": "ipython3",
|
||
|
"version": "3.7.3"
|
||
|
}
|
||
|
},
|
||
|
"nbformat": 4,
|
||
|
"nbformat_minor": 4
|
||
|
}
|