# from __future__ import division
import base64
import codecs
import html

import nltk
from nltk import sent_tokenize, word_tokenize, pos_tag
from nltk.corpus import stopwords
from nltk.probability import FreqDist
# Make sure the NLTK stop-word corpus is available before it is read below.
nltk.download('stopwords')

# Read the whole treaty and split it into paragraphs on blank lines.
with open('treaty_file/russia-estonia.txt', 'r') as russia_file:
    russia_text = russia_file.read()
russia_text_list = russia_text.split("\n\n")

# Stop words: NLTK's English list plus the project's own list (one per line).
t_default_stopwords = set(stopwords.words('english'))
# Plain open() inside a context manager so the file handle is closed; with no
# encoding argument this reads the file the same way codecs.open() did.
with open('t_stopwords.txt', 'r') as t_stopwords_file:
    t_custom_stopwords = set(t_stopwords_file.read().splitlines())
t_all_stopwords = t_default_stopwords | t_custom_stopwords
# Document prologue: doctype, <head> (jQuery, the two style sheets, the
# highlight script, charset, empty title) and the opening <body> tag.
# The doctype must name "html"; a bare <!DOCTYPE> puts browsers in quirks mode.
print('''<!DOCTYPE html>
<html>
<head>
<script src="https://code.jquery.com/jquery-3.5.0.min.js"></script>
<link rel="stylesheet" href="estonia.css">
<link rel="stylesheet" href="legend.css">
<script src="highlight.js"></script>
<meta charset="utf-8">
<title></title>
</head>
<body>''')
# t_wrapper (second wrapper): open the page wrapper and the intro section.
print('<div class="t_wrapper"><div class="t_intro">')

# Embed the treaty image inline as a base64 data URI. The file is read inside
# a context manager so the handle is closed, and the URI declares the real
# media type (image/jpeg) rather than the file path.
with open('img/tartu.jpeg', 'rb') as t_image_file:
    img_url = base64.b64encode(t_image_file.read()).decode('utf-8')
t_image = (
    '<div class="t_img">Peace Treaty of Tartu, Estonia<br>'
    '<img class="t_image" src="data:image/jpeg;base64,{}"></div>'
).format(img_url)
print(t_image)

# t_info box: one row per (title, content) pair.
print('<div class = "t_info">')
t_infotext = [
    ('Name of Treaty', 'Peace Treaty of Tartu'),
    ('Country of Origin', 'Russia'),
    ('Signed', 'February 2, 1920'),
    ('Location', 'Tartu, Estonia'),
    ('Word Counts', '2,104'),
    ('Type', 'bilateral peace treaty'),
    ('Original Source', '<a href="https://en.wikipedia.org/wiki/Treaty_of_Tartu_(Russian-Estonian)" target="_blank">link</a>'),
    ('Description', 'The Tartu Peace Treaty or Treaty of Tartu is a peace treaty between Estonia and Russian Soviet Federative Socialist Republic signed on 2 February 1920, ending the Estonian War of Independence.'),
]
# The content column is trusted, hand-written markup (it contains a link), so
# it is inserted verbatim rather than escaped.
t_info_row = (
    '<div class="t_info-{0}"><div class="info_t_title"><b>{0}</b></div>'
    '<div class="t_info_content">{1}</div></div><br>'
)
for t_title, t_info in t_infotext:
    print(t_info_row.format(t_title, t_info))
# Close t_info and t_intro; t_wrapper stays open for the sections that follow.
print('</div></div>')
# Colour legend for the part-of-speech highlighting: one entry per CSS swatch
# class and the label shown next to it.
# NOTE(review): the <li> items sit directly inside a <div> (no <ul>/<ol>
# parent); left as-is because the style sheets may depend on this structure.
legend_entries = (
    ('legendverb', 'verb'),
    ('legendnoun', 'noun'),
    ('legendadjective', 'adjective'),
    ('legendadverb', 'adverb'),
    ('legendhide', 'stopwords'),
)
legend_items = '\n'.join(
    '<li><span class="{}"></span>{}</li>'.format(css_class, label)
    for css_class, label in legend_entries
)
print('\n<div class="legend">\n{}\n</div>\n'.format(legend_items))
# Treaty text: every paragraph becomes a <p>, every token a <span> whose CSS
# classes are its part-of-speech tag and its lower-cased form.
print('<div class="t_paragraph">')

# Punctuation in a tag or token is spelled out so it yields a word-like class
# name. One translate() pass replaces the chained .replace() calls; the result
# is the same because no replacement text contains a mapped character.
t_class_names = str.maketrans({
    '.': 'dot',
    ',': 'comma',
    '(': 'marks',
    ')': 'marks',
    ':': 'marks',
    ';': 'marks',
})

t_tokenized_all = []
for t_paragraph in russia_text_list:
    t_tokenized = word_tokenize(t_paragraph)
    t_tokenized_all += t_tokenized  # collected for the frequency list below
    t_tagged = pos_tag(t_tokenized)
    print('<p>')
    for t_word, t_pos in t_tagged:
        # Escape everything interpolated into the markup so a literal &, < or >
        # (or a quote inside the class attribute) cannot break the page.
        print('<span class="{0} {1}">{2}</span>'.format(
            html.escape(t_pos.translate(t_class_names)),
            html.escape(t_word.translate(t_class_names).lower()),
            html.escape(t_word, quote=False),
        ))
    print('</p>')
print('</div>')
# Treaty top-words list: the 20 most frequent lower-cased tokens that are not
# in the custom stop-word set, each shown with its count.
print('<div class="t_top_words"><div class="t_top_words_title"><b>Frequent words</b></div>')
# NOTE(review): only t_custom_stopwords is filtered out here; the combined
# t_all_stopwords set is built earlier but not used -- confirm which is meant.
t_tokens_without_stopwords = nltk.FreqDist(
    t_token.lower()
    for t_token in t_tokenized_all
    if t_token.lower() not in t_custom_stopwords
)
# Copy of the distribution above; nothing in this script reads it afterwards.
t_frequency_word = FreqDist(t_tokens_without_stopwords)
t_top_words = t_tokens_without_stopwords.most_common(20)
for t_chosen_words, t_frequency in t_top_words:
    print('<div class="t_chosen_words">{} ({})</div>'.format(
        html.escape(t_chosen_words, quote=False), t_frequency))
# Only t_top_words and t_wrapper are still open at this point, so exactly two
# closing tags are emitted (the script previously printed three stray ones).
print('</div></div>')
print('''</body></html>''')