You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
SI13/LIQUID/MANIFESTO/nltk-pos-tagging-and-weasyp...

4.5 KiB

NLTK POS-tagged HTML → PDF

In [4]:
from html import escape

import nltk
from weasyprint import HTML, CSS
In [5]:
# open the input file
# A context manager closes the file handle as soon as the text is read.
# NOTE(review): UTF-8 is assumed because the generated HTML declares
# charset="utf-8" -- confirm manifesto1.txt is saved in that encoding.
with open('manifesto1.txt', encoding='utf-8') as source_file:
    txt = source_file.read()

# Split the text into tokens, then pair every token with its POS tag.
words = nltk.word_tokenize(txt)
tagged_words = nltk.pos_tag(words)
In [22]:
# list the working directory to see which input and output files are present
!ls
A0Grid_30opac-01.png  manifesto.pdf
language.css	      nltk-frequency-distribution-Copy1.ipynb
manifesto1.txt	      nltk-pos-tagger-Copy1.ipynb
manifesto.css	      nltk-pos-tagging-and-weasyprint.ipynb
manifesto.html	      nltk-similar-words-Copy1.ipynb
manifestonltk.ipynb   pattern-search-Copy1.ipynb
In [6]:
# collect all the pieces of HTML
# Every token becomes a <span> whose class is its POS tag, so the stylesheet
# can colour words by part of speech. Both values are HTML-escaped: tokens may
# contain &, < or >, and a tag can itself be a punctuation/quote character,
# either of which would otherwise break the markup or the class attribute.
content = '<h1>A Liquid Manifesto</h1>'
content += ''.join(
    f'<span class="{ escape(tag) }">{ escape(word) }</span> '
    for word, tag in tagged_words
)
In [7]:
# write the HTML file
# The file is written as UTF-8 so the bytes on disk match the charset declared
# in the <meta> tag; the document is also closed with </html>, and <title> is
# filled with the same text as the page heading instead of being left empty.
with open("manifesto.html", "w", encoding="utf-8") as f:
    f.write(f"""<!DOCTYPE html>
<html>
<head>
    <meta charset="utf-8">
    <link rel="stylesheet" type="text/css" href="manifesto.css">
    <title>A Liquid Manifesto</title>
</head>
<body>
{ content }
</body>
</html>
""")
In [8]:
# write a CSS file
# The stylesheet text is kept in a variable first: an A4 page setup followed
# by one colour rule per group of POS-tag classes.
stylesheet_text = """

@page{
    size:A4;
    background-color:lightgrey;
    margin:10mm;
}
.JJ{
    color:red;
}
.VB,
.VBG{
    color:magenta;
}
.NN,
.NNP{
    color:green;
}
.EX{
    color: blue;
}
    """

with open("language.css", "w") as f:
    f.write(stylesheet_text)
In [9]:
# If you use @font-face in your stylesheet, you would need Weasyprint's FontConfiguration()
# Older WeasyPrint releases expose the class in weasyprint.fonts; newer ones
# moved it to weasyprint.text.fonts. Try the original location first so the
# behaviour is unchanged where it exists, then fall back to the new one.
try:
    from weasyprint.fonts import FontConfiguration
except ImportError:
    from weasyprint.text.fonts import FontConfiguration

font_config = FontConfiguration()
In [ ]:
# collect all the files and write the PDF
# The same font_config is handed to both the stylesheet and write_pdf():
# @font-face rules are registered while the CSS is parsed, so passing it only
# to write_pdf() would leave fonts declared in the stylesheet unavailable.
# NOTE(review): the CSS cell in this notebook writes language.css, but the
# stylesheet used here (and linked from the HTML) is manifesto.css -- confirm
# which file is meant to style the PDF.
html = HTML("manifesto.html")
css = CSS("manifesto.css", font_config=font_config)
html.write_pdf('manifesto.pdf', stylesheets=[css], font_config=font_config)
In [1]:
# Preview your PDF in the notebook!
from IPython.display import IFrame, display

# Point the frame at the file produced by write_pdf(); the previous path
# ".pdf" did not name any file, so the preview was empty.
IFrame("manifesto.pdf", width=900, height=600)
Out[1]:
In [ ]: