You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
4.5 KiB
4.5 KiB
NLTK POS-tagged HTML → PDF
In [4]:
import nltk from weasyprint import HTML, CSS
In [5]:
# Read the input text, then tokenize it and attach a part-of-speech tag to
# every token. `tagged_words` is consumed later as (word, tag) pairs.
# The context manager closes the file handle as soon as the text is read,
# and the explicit encoding keeps the result independent of the platform
# default (the HTML written later declares utf-8).
with open('manifesto1.txt', encoding='utf-8') as f:
    txt = f.read()

words = nltk.word_tokenize(txt)
tagged_words = nltk.pos_tag(words)
In [22]:
# Shell escape: list the files in the current working directory.
!ls
A0Grid_30opac-01.png manifesto.pdf language.css nltk-frequency-distribution-Copy1.ipynb manifesto1.txt nltk-pos-tagger-Copy1.ipynb manifesto.css nltk-pos-tagging-and-weasyprint.ipynb manifesto.html nltk-similar-words-Copy1.ipynb manifestonltk.ipynb pattern-search-Copy1.ipynb
In [6]:
# Collect all the pieces of HTML: a heading followed by one <span> per token,
# whose CSS class is the token's part-of-speech tag.
from html import escape

# Tokens and tags come straight from the input text, so characters such as
# '<', '&' or quotes must be escaped before being placed in markup; otherwise
# they would be parsed as HTML instead of being shown as text.
# Fragments are gathered in a list and joined once instead of growing a
# string with repeated `+=`.
pieces = ['<h1>A Liquid Manifesto</h1>']
for word, tag in tagged_words:
    pieces.append(f'<span class="{ escape(tag) }">{ escape(word) }</span> ')

content = ''.join(pieces)
In [7]:
# Write the HTML file: wrap the generated `content` in a minimal page that
# links the manifesto.css stylesheet.
# The page declares charset utf-8, so the file is written as UTF-8 explicitly
# rather than with the platform's default encoding. The closing </html> tag
# is included so the document is well-formed.
with open("manifesto.html", "w", encoding="utf-8") as f:
    f.write(f"""<!DOCTYPE html>
<html>
<head>
    <meta charset="utf-8">
    <link rel="stylesheet" type="text/css" href="manifesto.css">
    <title></title>
</head>
<body>
{ content }
</body>
</html>
""")
In [8]:
# Write a CSS file: A4 page setup plus one colour per group of POS-tag classes.
# NOTE(review): this writes language.css, while the HTML page and the PDF cell
# both reference manifesto.css — confirm which stylesheet is intended.
stylesheet_rules = """
@page{
    size:A4;
    background-color:lightgrey;
    margin:10mm;
}
.JJ{
    color:red;
}
.VB, .VBG{
    color:magenta;
}
.NN, .NNP{
    color:green;
}
.EX{
    color: blue;
}
"""

with open("language.css", "w") as css_file:
    css_file.write(stylesheet_rules)
In [9]:
# If you use @font-face in your stylesheet, you need WeasyPrint's
# FontConfiguration(). Its module path differs between WeasyPrint releases
# (newer: weasyprint.text.fonts, older: weasyprint.fonts), so try the newer
# location first and fall back to the older one.
try:
    from weasyprint.text.fonts import FontConfiguration
except ImportError:
    from weasyprint.fonts import FontConfiguration

font_config = FontConfiguration()
In [ ]:
# Gather the HTML page and its stylesheet, then render them to manifesto.pdf
# using the font configuration created earlier in the notebook.
html = HTML(filename="manifesto.html")
css = CSS(filename="manifesto.css")

html.write_pdf(
    'manifesto.pdf',
    stylesheets=[css],
    font_config=font_config,
)
In [1]:
# Preview your PDF in the notebook!
from IPython.display import IFrame, display

# The frame source must be the file produced by write_pdf(); the previous
# value ".pdf" named no file, so the preview had nothing to show.
# Leaving the IFrame as the cell's last expression makes the notebook render it.
IFrame("manifesto.pdf", width=900, height=600)
Out[1]:
In [ ]: