You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
3.5 KiB
3.5 KiB
NLTK POS-tagged HTML → PDF
In [ ]:
import nltk from weasyprint import HTML, CSS
In [ ]:
# Open the input file and part-of-speech tag its contents.
# The context manager closes the file handle deterministically; the bare
# open().read() it replaces left that to the garbage collector.
# NOTE(review): assumes the text file is UTF-8, matching the charset the
# generated HTML declares -- confirm against the actual file.
with open('../txt/language.txt', encoding='utf-8') as f:
    txt = f.read()

# Split the text into tokens, then pair every token with its POS tag.
words = nltk.word_tokenize(txt)
tagged_words = nltk.pos_tag(words)  # iterated later as (word, tag) pairs
In [ ]:
# Collect all the pieces of HTML: a heading followed by one <span> per token,
# with the POS tag as the span's class so a stylesheet can target it.
# `escape` is imported by name because `html` is rebound to a WeasyPrint
# document further down in this notebook.
from html import escape

content = '<h1>Language and Software Studies, by Florian Cramer</h1>'

# Both values are escaped: a token taken from the source text can be '<', '>'
# or '&', and the tag lands inside a double-quoted attribute, so unescaped
# values could corrupt the markup. str.join builds the string in one pass
# instead of growing it with repeated +=.
content += ''.join(
    f'<span class="{escape(tag)}">{escape(word)}</span> '
    for word, tag in tagged_words
)
In [ ]:
# Write the HTML file.
# encoding="utf-8" makes the bytes on disk agree with the
# <meta charset="utf-8"> declaration instead of depending on the platform's
# default encoding. The template now also closes the <html> element.
with open("language.html", "w", encoding="utf-8") as f:
    f.write(f"""<!DOCTYPE html>
<html>
<head>
    <meta charset="utf-8">
    <link rel="stylesheet" type="text/css" href="language.css">
    <title></title>
</head>
<body>
{ content }
</body>
</html>
""")
In [ ]:
# Write a CSS file.
# The @page rule configures the printed sheet (size, background, margin);
# each class selector below is named after a POS tag and sets the text
# colour for elements carrying that class.
stylesheet = """
@page{
    size:A4;
    background-color:lightgrey;
    margin:10mm;
}
.JJ{
    color:red;
}
.VB, .VBG{
    color:magenta;
}
.NN, .NNP{
    color:green;
}
.EX{
    color: blue;
}
"""

with open("language.css", "w") as css_file:
    css_file.write(stylesheet)
In [ ]:
# If you use @font-face in your stylesheet, you need WeasyPrint's
# FontConfiguration().
# WeasyPrint 53 moved the class to weasyprint.text.fonts; the fallback keeps
# this cell working on older releases that only have weasyprint.fonts.
try:
    from weasyprint.text.fonts import FontConfiguration
except ImportError:
    from weasyprint.fonts import FontConfiguration

font_config = FontConfiguration()
In [ ]:
# Collect all the files and write the PDF.
html = HTML("language.html")

# The stylesheet receives the same font_config that write_pdf() gets, so any
# @font-face rules in language.css are registered with that configuration.
css = CSS("language.css", font_config=font_config)

html.write_pdf('language.pdf', stylesheets=[css], font_config=font_config)
In [ ]:
# Preview your PDF in the notebook!
from IPython.display import IFrame, display

# Build the embedded viewer, then leave it as the cell's final expression so
# the notebook shows it as the cell's result.
pdf_preview = IFrame("language.pdf", width=900, height=600)
pdf_preview
In [ ]: