You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

3.5 KiB

NLTK pos-tagged HTML → PDF

In [ ]:
import nltk
from weasyprint import HTML, CSS
In [ ]:
# Read the source text. The context manager closes the file handle as soon as
# the text is in memory, and the explicit encoding makes the result independent
# of the platform's default locale encoding.
# NOTE(review): assumes ../txt/language.txt is UTF-8 encoded — confirm.
with open('../txt/language.txt', encoding='utf-8') as f:
    txt = f.read()

# Split the text into tokens and attach a part-of-speech tag to each token;
# the result is consumed downstream as (word, tag) pairs.
words = nltk.word_tokenize(txt)
tagged_words = nltk.pos_tag(words)
In [ ]:
# Build the HTML body: a heading followed by one <span> per token, with the
# part-of-speech tag as the span's class so the stylesheet can colour it.
# Only `escape` is imported (not the `html` module) because a later cell
# rebinds the name `html`.
from html import escape

heading = '<h1>Language and Software Studies, by Florian Cramer</h1>'

# Escape both pieces: a token such as "<" or "&" would otherwise be parsed as
# markup, and the tag is placed inside a double-quoted attribute value.
spans = [
    f'<span class="{ escape(tag) }">{ escape(word, quote=False) }</span> '
    for word, tag in tagged_words
]

# Join once instead of growing the string inside the loop.
content = heading + ''.join(spans)
In [ ]:
# Write the HTML document that wraps the generated markup.
# The file is encoded as UTF-8 explicitly so the bytes on disk match the
# <meta charset="utf-8"> declaration regardless of the platform default,
# and the document is closed with the </html> end tag.
with open("language.html", "w", encoding="utf-8") as f:
    f.write(f"""<!DOCTYPE html>
<html>
<head>
    <meta charset="utf-8">
    <link rel="stylesheet" type="text/css" href="language.css">
    <title></title>
</head>
<body>
{ content }
</body>
</html>
""")
In [ ]:
# Stylesheet text: page setup plus one colour per part-of-speech class.
stylesheet = """

@page{
    size:A4;
    background-color:lightgrey;
    margin:10mm;
}
.JJ{
    color:red;
}
.VB,
.VBG{
    color:magenta;
}
.NN,
.NNP{
    color:green;
}
.EX{
    color: blue;
}
    """

# Save it under the name the HTML file's <link> element points to.
with open("language.css", "w") as css_file:
    css_file.write(stylesheet)
In [ ]:
# If you use @font-face in your stylesheet, you need WeasyPrint's FontConfiguration().
# Newer WeasyPrint releases (version 53 onwards, per its changelog) moved the
# class from weasyprint.fonts to weasyprint.text.fonts, so try the current
# location first and fall back to the old one for older installations.
try:
    from weasyprint.text.fonts import FontConfiguration
except ImportError:
    from weasyprint.fonts import FontConfiguration

font_config = FontConfiguration()
In [ ]:
# Collect the HTML and CSS files and render them to a PDF.
# font_config is handed to CSS() as well as to write_pdf(): @font-face rules
# are registered on the configuration the stylesheet is parsed with, so it has
# to be the same object that is used for rendering.
html = HTML(filename="language.html")
css = CSS(filename="language.css", font_config=font_config)
html.write_pdf('language.pdf', stylesheets=[css], font_config=font_config)
In [ ]:
# Embed the generated PDF in the notebook output.
from IPython.display import IFrame, display

# The bare expression on the last line is what the notebook renders.
pdf_preview = IFrame(src="language.pdf", width=900, height=600)
pdf_preview
In [ ]: