|
|
# from __future__ import division
|
|
|
from nltk import sent_tokenize, word_tokenize, pos_tag
|
|
|
from nltk.probability import FreqDist
|
|
|
from nltk.corpus import stopwords
|
|
|
import nltk
|
|
|
import codecs
|
|
|
import base64
|
|
|
|
|
|
|
|
|
# Fetch every NLTK resource this script actually uses: 'stopwords' for the
# frequency filter, 'punkt' for word_tokenize(), and the perceptron tagger
# for pos_tag().  The original downloaded only 'stopwords', so a first run
# on a fresh machine crashed inside tokenization/tagging.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
|
|
|
|
|
|
# Read the treaty text and split it into paragraphs.
# NOTE(review): the variables are named "russia_*" although the file is the
# Japan-Korea treaty - the names are kept because later code references them.
# An explicit encoding is given so the read does not depend on the locale.
with open('treaty_file/japan-korea.txt', 'r', encoding='utf-8') as russia_file:
    russia_text = russia_file.read()

# Paragraphs are separated by a blank line (two consecutive newlines).
russia_text_list = russia_text.split("\n\n")
|
|
|
|
|
|
# Combine NLTK's built-in English stopwords with the project's custom list.
t_default_stopwords = set(stopwords.words('english'))

# The original used codecs.open() and never closed the handle; a context
# manager releases the file deterministically, and utf-8 is stated explicitly
# instead of depending on the locale default.
with open('t_stopwords.txt', 'r', encoding='utf-8') as t_stopwords_file:
    t_custom_stopwords = set(t_stopwords_file.read().splitlines())

t_all_stopwords = t_default_stopwords | t_custom_stopwords
|
|
|
|
|
|
|
|
|
|
|
|
# Emit the static HTML head.  Two fixes to the original markup:
#  * '<!DOCTYPE>' was missing the root element name ('html'), which puts
#    browsers into quirks mode;
#  * jQuery was loaded twice (3.5.0, then 3.5.1) - the second load clobbers
#    the first, so only the 3.5.1 build is kept.
print('''<!DOCTYPE html>
<html>
<head>
<script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
<link rel="stylesheet" href="estonia.css">
<link rel="stylesheet" href="legend.css">
<link rel="stylesheet" href="highlight.css">
<script src="highlight.js"></script>
<meta charset="utf-8">

<title></title>

</head>
<body>''')
|
|
|
|
|
|
|
|
|
#t_wrapper (second wrapper)
# Open the outer treaty wrapper plus the intro section in one shot
# (adjacent string literals are concatenated by the compiler).
print('<div class="t_wrapper">'
      '<div class="t_intro">')
|
|
|
|
|
|
# Embed the treaty image inline as a base64 data URI so the page is
# self-contained.  Fixes to the original:
#  * the image file handle was opened and never closed - 'with' releases it;
#  * the data URI declared the bogus MIME type 'img/japankorea.jpg'; a JPEG
#    must be labelled 'image/jpeg' (it only rendered via browser sniffing).
with open('img/japankorea.jpg', 'rb') as t_image_file:
    img_url = base64.b64encode(t_image_file.read()).decode('utf-8')

t_image = ('<div class="t_img" style="position: fixed; background-color: gainsboro;">'
           'Japan-Korea Agreement of 1905</div><br>'
           '<img class="t_image" src="data:image/jpeg;base64,{}">').format(img_url)

print(t_image)
|
|
|
|
|
|
|
|
|
#t_info box
# Metadata panel: one row per (label, value) pair describing the treaty.
print('<div class ="t_info">')

t_infotext = [
    ('Name of Treaty', 'japan–korea-agreement-of-1905'),
    ('Country of Origin', 'Japan'),
    ('Signed', ' August, 1905'),
    ('Location', 'Seoul, Korea'),
    ('Word Counts', '547'),
    ('Type', 'Unequal treaty'),
    ('Original Source', '<a href="https://en.wikipedia.org/wiki/Japan–Korea_Agreement_of_April_1905" target="_blank">link</a>'),
    ('Description', 'The Japan–Korea Treaty of 1905, also known as the Eulsa Unwilling Treaty, was made between the Empire of Japan and the Korean Empire. The treaty deprived Korea of its diplomatic sovereignty and made Korea a protectorate of Imperial Japan.'),
]

for t_title, t_info in t_infotext:
    print(f'<div class="t_info-{t_title}"><div class="info_t_title" ><b>{t_title}</b></div>'
          f'<div class="t_info_content">{t_info}</div></div><br>')

print('</div></div>')
|
|
|
|
|
|
# Legend for the part-of-speech highlighting.  The CSS class names (e.g.
# "legendpossesivepronoun", "...-superative") are kept byte-for-byte because
# they must keep matching the selectors in legend.css / highlight.js, but the
# *visible* labels had spelling errors ("possesive", "superative") which are
# corrected here.
print('''
<div class="legend">
<li class="legendhide eachlegend">stopwords</li>
<li class="legendadjective eachlegend">adjective</li>
<li class="legendverb eachlegend">verb</li>
<li class="legendnoun eachlegend">noun</li>
<li class="legendpropernoun eachlegend">proper noun</li>
<li class="legendadverb eachlegend">adverb</li>
<li class="legendpossesivepronoun eachlegend">possessive pronoun</li>
<li class="legendpresentparticiple eachlegend">present participle</li>
<li class="legendadjectivesuperlative eachlegend">adjective superlative</li>
<li class="legendadverb-comparative-superative eachlegend">adverb comparative + superlative</li>
</div>
''')
|
|
|
|
|
|
|
|
|
|
|
|
#Treaty text
# Render every paragraph of the treaty.  Each token is wrapped in a <span>
# whose classes encode (a) its POS tag and (b) the token itself, both
# sanitised into legal CSS class names (presumably consumed by the
# stylesheets / highlight.js - TODO confirm against those files).
# The original repeated the same six .replace() calls in two places; they
# are factored into one helper so the mapping lives in a single spot.

# Characters illegal in a CSS class name, with their replacement words.
_CSS_SAFE_MARKS = [('.', 'dot'), (',', 'comma'), ('(', 'marks'),
                   (')', 'marks'), (':', 'marks'), (';', 'marks')]

def _css_safe(text):
    """Replace punctuation that may not appear inside a CSS class name."""
    for t_mark, t_replacement in _CSS_SAFE_MARKS:
        text = text.replace(t_mark, t_replacement)
    return text

print('<div class="t_paragraph">')

t_tokenized_all = []  # every token of the whole treaty, for the frequency list

for t_paragraph in russia_text_list:
    t_tokenized = word_tokenize(t_paragraph)
    t_tokenized_all += t_tokenized  # add to the tokenized_all
    t_tagged = pos_tag(t_tokenized)

    print('<p>')
    for t_word, t_pos in t_tagged:
        # 'PRP$' -> 'PRPS' because '$' is not legal in a class name either.
        t_pos_class = _css_safe(t_pos.replace('PRP$', 'PRPS'))
        t_word_class = _css_safe(t_word.replace('’', 'apostrophe')).lower()
        print('<span class="{0} {1} eachwords">{2}</span>'.format(
            t_pos_class, t_word_class, t_word))
    print('</p>')

print('</div>')
|
|
|
|
|
|
|
|
|
#treaty colonial top words list
# Frequency list of the 20 most common words, stopwords excluded.
# Fixes to the original:
#  * it filtered only against t_custom_stopwords even though t_all_stopwords
#    (custom + NLTK English) is built above and was never used anywhere -
#    the combined set is clearly what was intended;
#  * the unused copy 't_frequency_word = FreqDist(...)' was dead code and
#    has been removed.
print('<div class="t_top_words"><div class="t_top_words_title"><b>Frequent words</b></div>')

t_tokens_without_stopwords = nltk.FreqDist(
    words.lower() for words in t_tokenized_all
    if words.lower() not in t_all_stopwords)

t_top_words = t_tokens_without_stopwords.most_common(20)

for t_chosen_words, t_frequency in t_top_words:
    print('<div class="t_chosen_words" > {} ({}) </div>'.format(t_chosen_words, t_frequency))

print('</div></div></div>')
|
|
|
|
|
|
|
|
|
# at the end of wrapper
# Close the two <div>s still open (t_wrapper and t_paragraph's parent),
# then terminate the document.  One print per tag, exactly as before.
for t_closing in ('</div>', '</div>', '</body></html>'):
    print(t_closing)
|