You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

164 lines
3.5 KiB
Plaintext

{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# NLTK pos-tagged HTML → PDF"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import nltk\n",
"from weasyprint import HTML, CSS"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# open the input file\n",
"txt = open('../txt/language.txt').read()\n",
"words = nltk.word_tokenize(txt)\n",
"tagged_words = nltk.pos_tag(words)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# collect all the pieces of HTML\n",
"content = ''\n",
"content += '<h1>Language and Software Studies, by Florian Cramer</h1>'\n",
"\n",
"for word, tag in tagged_words:\n",
" content += f'<span class=\"{ tag }\">{ word }</span> '"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# write the HTML file\n",
"with open(\"language.html\", \"w\") as f:\n",
" f.write(f\"\"\"<!DOCTYPE html>\n",
"<html>\n",
"<head>\n",
" <meta charset=\"utf-8\">\n",
" <link rel=\"stylesheet\" type=\"text/css\" href=\"language.css\">\n",
" <title></title>\n",
"</head>\n",
"<body>\n",
"{ content }\n",
"</body>\n",
"\"\"\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# write a CSS file\n",
"with open(\"language.css\", \"w\") as f:\n",
" f.write(\"\"\"\n",
"\n",
"@page{\n",
" size:A4;\n",
" background-color:lightgrey;\n",
" margin:10mm;\n",
"}\n",
".JJ{\n",
" color:red;\n",
"}\n",
".VB,\n",
".VBG{\n",
" color:magenta;\n",
"}\n",
".NN,\n",
".NNP{\n",
" color:green;\n",
"}\n",
".EX{\n",
" color: blue;\n",
"}\n",
" \"\"\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# If you use @font-face in your stylesheet, you would need Weasyprint's FontConfiguration()\n",
"from weasyprint.fonts import FontConfiguration\n",
"\n",
"font_config = FontConfiguration()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# collect all the files and write the PDF\n",
"html = HTML(\"language.html\")\n",
"css = CSS(\"language.css\")\n",
"html.write_pdf('language.pdf', stylesheets=[css], font_config=font_config)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Preview your PDF in the notebook!\n",
"from IPython.display import IFrame, display\n",
"IFrame(\"language.pdf\", width=900, height=600)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}