# from __future__ import division
import base64
import codecs
import html

import nltk
from nltk import pos_tag, sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist
# Fetch the NLTK stopword corpus so stopwords.words() below can load it.
# NOTE(review): word_tokenize / pos_tag, used further down, rely on other NLTK
# resources that are not downloaded here -- presumably already installed; confirm.
nltk.download('stopwords')

# Read the treaty text explicitly as UTF-8 rather than the platform default:
# the script later matches the non-ASCII character ’ in tokens and the
# generated page declares charset utf-8.
with open('treaty_file/uk-korea.txt', 'r', encoding='utf-8') as russia_file:
    russia_text = russia_file.read()

# Paragraphs are the chunks separated by an empty line.
russia_text_list = russia_text.split("\n\n")

# Built-in English stopwords plus the project's own list (one entry per line).
t_default_stopwords = set(stopwords.words('english'))
# The with-block closes the handle, which the former codecs.open() call leaked.
with open('t_stopwords.txt', 'r', encoding='utf-8') as t_stopwords_file:
    t_custom_stopwords = set(t_stopwords_file.read().splitlines())
t_all_stopwords = t_default_stopwords | t_custom_stopwords
# Document prologue: everything from the doctype up to the opening <body>.
# The tags are written without stray spaces ("<html>", not "< html >") because
# a "<" followed by whitespace is rendered as text instead of opening an element.
# NOTE(review): "<!DOCTYPE>" carries no "html" name, so browsers use quirks
# mode; left unchanged because the stylesheets may have been tuned for it.
# NOTE(review): jQuery is included twice (3.5.0 and 3.5.1); the second include
# takes over the global -- kept as in the original.
t_html_head = '''<!DOCTYPE>
<html>
<head>
<script src="https://code.jquery.com/jquery-3.5.0.min.js"></script>
<script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
<link rel="stylesheet" href="estonia.css">
<link rel="stylesheet" href="legend.css">
<link rel="stylesheet" href="highlight.css">
<script src="highlight.js"></script>
<meta charset="utf-8">
<title></title>
</head>
<body>'''
print(t_html_head)
# t_wrapper (second wrapper): opens the wrapper and the intro column.
print('<div class="t_wrapper"><div class="t_intro">')

# Embed the treaty image inline as a base64 data URI. The with-block closes
# the file handle, and the URI now names a media type (image/png) instead of
# the file path that used to sit in that position.
with open('img/uk-korea.png', 'rb') as t_image_file:
    img_url = base64.b64encode(t_image_file.read()).decode('utf-8')
t_image = (
    '<div class="t_img" style="position: fixed; background-color: gainsboro;">'
    'United Kingdom–Korea <br>Treaty of 1883</div><br>'
    '<img class="t_image" src="data:image/png;base64,{}">'
).format(img_url)
print(t_image)

# t_info box: one titled row per (title, content) pair.
print('<div class = "t_info">')
# The content values are trusted literals and may contain markup (see the
# link row), so they are written to the page without escaping.
t_infotext = [
    ('Name of Treaty', 'United Kingdom–Korea Treaty of 1883'),
    ('Country of Origin', 'United Kingdom'),
    ('Signed', 'November, 1883'),
    ('Location', 'Hanyang, Korea'),
    ('Word Counts', '3,357'),
    ('Type', 'unilateral treaty'),
    ('Original Source', '<a href="https://en.wikisource.org/wiki/United_Kingdom–Korea_Treaty_of_1883" target="_blank">link</a>'),
    ('Description', 'Under the treaty, Great Britain obtained extraterritorial rights in Korea and from 1883 to 1910, British subjects in Korea were not subject to the jurisdiction of Korean courts.'),
]
# NOTE(review): titles containing spaces yield several class tokens
# (e.g. "t_info-Name", "of", "Treaty"); kept because the CSS may rely on it.
t_info_row = (
    '<div class="t_info-{0}"><div class="info_t_title"><b>{0}</b></div>'
    '<div class="t_info_content">{1}</div></div><br>'
)
for t_title, t_info in t_infotext:
    print(t_info_row.format(t_title, t_info))

# Close t_info and t_intro; t_wrapper stays open for the sections that follow.
print('</div></div>')
# Colour legend: one entry per highlighted word category.
# The tags are written without stray spaces ("<li ...>", "</li>") so that they
# are parsed as elements; class names and label text are kept exactly as the
# stylesheets and scripts know them (including their original spelling).
t_legend_html = '''
<div class="legend">
<li class="legendhide eachlegend">stopwords</li>
<li class="legendadjective eachlegend">adjective</li>
<li class="legendverb eachlegend">verb</li>
<li class="legendnoun eachlegend">noun</li>
<li class="legendpropernoun eachlegend">proper noun</li>
<li class="legendadverb eachlegend">adverb</li>
<li class="legendpossesivepronoun eachlegend">possesive pronoun</li>
<li class="legendpresentparticiple eachlegend">present participle</li>
<li class="legendadjectivesuperlative eachlegend">adjective superlative</li>
<li class="legendadverb-comparative-superative eachlegend">adverb comparative + superative</li>
</div>
'''
print(t_legend_html)
# Treaty text: every paragraph becomes a <p>, every token a <span> whose
# classes are the token's POS tag and a CSS-friendly form of the token itself.
print('<div class="t_paragraph">')

# Punctuation cannot be used directly in the class names, so it is spelled
# out. One translate() pass replaces the former chain of .replace() calls;
# the result is identical because no replacement text contains a mapped char.
t_punctuation_names = {
    '.': 'dot',
    ',': 'comma',
    '(': 'marks',
    ')': 'marks',
    ':': 'marks',
    ';': 'marks',
}
t_pos_class_table = str.maketrans(t_punctuation_names)
t_word_class_names = dict(t_punctuation_names)
t_word_class_names['’'] = 'apostrophe'
t_word_class_table = str.maketrans(t_word_class_names)

t_span_template = '<span class="{0} {1} eachwords">{2}</span>'

t_tokenized_all = []
for t_paragraph in russia_text_list:
    t_tokenized = word_tokenize(t_paragraph)
    t_tokenized_all += t_tokenized  # collected for the frequency list later
    t_tagged = pos_tag(t_tokenized)
    print('<p>')
    for t_word, t_pos in t_tagged:
        # "PRP$" is renamed as a whole before single characters are mapped.
        t_pos_class = t_pos.replace('PRP$', 'PRPS').translate(t_pos_class_table)
        t_word_class = t_word.translate(t_word_class_table).lower()
        # Escape what goes into the markup so tokens such as "&" or "<"
        # cannot break the attribute or the element content.
        print(t_span_template.format(
            html.escape(t_pos_class),
            html.escape(t_word_class),
            html.escape(t_word, quote=False),
        ))
    print('</p>')
print('</div>')
# treaty colonial top words list: the 20 most frequent lower-cased tokens
# that are not in the custom stopword list.
print('<div class="t_top_words"><div class="t_top_words_title"><b>Frequent words</b></div>')

# NOTE(review): only t_custom_stopwords is filtered out here, not the combined
# t_all_stopwords set built earlier -- kept as in the original; confirm intent.
t_tokens_without_stopwords = nltk.FreqDist(
    t_token
    for t_token in (words.lower() for words in t_tokenized_all)  # lower once
    if t_token not in t_custom_stopwords
)
# Copy of the distribution; not read anywhere in this section.
t_frequency_word = FreqDist(t_tokens_without_stopwords)
t_top_words = t_tokens_without_stopwords.most_common(20)
for t_chosen_words, t_frequency in t_top_words:
    # Escape the token so characters such as "&" or "<" stay literal text.
    print('<div class="t_chosen_words">{} ({})</div>'.format(
        html.escape(t_chosen_words, quote=False), t_frequency))

# NOTE(review): this section opens one <div> (t_top_words) and t_wrapper is
# still open from earlier, yet five </div> are emitted below -- three look
# surplus. Left unchanged; browsers ignore unmatched closing tags.
print('</div></div></div>')
# at the end of wrapper
print('</div>')
print('</div>')
print('''</body></html>''')