You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

107 lines
4.2 KiB
Python

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

# from __future__ import division
from nltk import sent_tokenize, word_tokenize, pos_tag
from nltk.probability import FreqDist
from nltk.corpus import stopwords
import nltk
import codecs
import base64
nltk.download('stopwords')
with open('treaty_file/japan-korea.txt', 'r') as russia_file:
russia_text = russia_file.read()
russia_text_list = russia_text.split("\n\n")
t_default_stopwords = set(stopwords.words('english'))
t_custom_stopwords = set(codecs.open('t_stopwords.txt', 'r').read().splitlines())
t_all_stopwords = t_default_stopwords | t_custom_stopwords
print('''<!DOCTYPE>
<html>
<head>
<script src="https://code.jquery.com/jquery-3.5.0.min.js"></script>
<script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
<link rel="stylesheet" href="estonia.css">
<link rel="stylesheet" href="legend.css">
<link rel="stylesheet" href="highlight.css">
<script src="highlight.js"></script>
<meta charset="utf-8">
<title></title>
</head>
<body>''')
#t_wrapper (second wrapper)
print('<div class="t_wrapper"><div class="t_intro">')
img_url = base64.b64encode(open('img/japankorea.jpg', 'rb').read()).decode('utf-8')
t_image = '<div class="t_img" style="position: fixed; background-color: gainsboro;">Japan-Korea Agreement of 1905</div><br><img class="t_image" src="data:img/japankorea.jpg;base64,{}">'.format(img_url)
print(t_image)
#t_info box
print('<div class ="t_info">')
t_infotext = [('Name of Treaty', 'japankorea-agreement-of-1905'), ('Country of Origin', 'Japan'), ('Signed', ' August, 1905'), ('Location', 'Seoul, Korea'), ('Word Counts', '547'), ('Type', 'Unequal treaty'), ('Original Source', '<a href="https://en.wikipedia.org/wiki/JapanKorea_Agreement_of_April_1905" target="_blank">link</a>'), ('Description', 'The JapanKorea Treaty of 1905, also known as the Eulsa Unwilling Treaty, was made between the Empire of Japan and the Korean Empire. The treaty deprived Korea of its diplomatic sovereignty and made Korea a protectorate of Imperial Japan.')]
for t_title, t_info in t_infotext:
print('<div class="t_info-{0}"><div class="info_t_title" ><b>{0}</b></div><div class="t_info_content">{1}</div></div><br>'.format(t_title, t_info))
print('</div></div>')
print('''
<div class="legend">
<li class="legendhide eachlegend">stopwords</li>
<li class="legendadjective eachlegend">adjective</li>
<li class="legendverb eachlegend">verb</li>
<li class="legendnoun eachlegend">noun</li>
<li class="legendpropernoun eachlegend">proper noun</li>
<li class="legendadverb eachlegend">adverb</li>
<li class="legendpossesivepronoun eachlegend">possesive pronoun</li>
<li class="legendpresentparticiple eachlegend">present participle</li>
<li class="legendadjectivesuperlative eachlegend">adjective superlative</li>
<li class="legendadverb-comparative-superative eachlegend">adverb comparative + superative</li>
</div>
''')
#Treaty text
print('<div class="t_paragraph">')
t_tokenized_all = []
for t_paragraph in russia_text_list:
t_tokenized = word_tokenize(t_paragraph)
t_tokenized_all += t_tokenized # add to the tokenized_all
t_tagged = pos_tag(t_tokenized)
print('<p>')
for t_word, t_pos in t_tagged:
print('<span class="{0} {1} eachwords">{2}</span>'.format(t_pos.replace('PRP$', 'PRPS').replace('.', 'dot').replace(',', 'comma').replace('(', 'marks').replace(')', 'marks').replace(':', 'marks').replace(';', 'marks'), t_word.replace('', 'apostrophe').replace('.', 'dot').replace(',', 'comma').replace('(', 'marks').replace(')', 'marks').replace(':', 'marks').replace(';', 'marks').lower(), t_word))
print('</p>')
print('</div>')
#treaty colonial top words list
print('<div class="t_top_words"><div class="t_top_words_title"><b>Frequent words</b></div>')
t_tokens_without_stopwords = nltk.FreqDist(words.lower() for words in t_tokenized_all if words.lower() not in t_custom_stopwords)
t_frequency_word = FreqDist(t_tokens_without_stopwords)
t_top_words = t_tokens_without_stopwords.most_common(20)
for t_chosen_words, t_frequency in t_top_words:
print('<div class="t_chosen_words" >&nbsp;{}&nbsp;({}) </div>'.format(t_chosen_words, t_frequency))
print('</div></div></div>')
# at the end of wrapper
print('</div>')
print('</div>')
print('''</body></html>''')