You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
83 lines
3.0 KiB
Python
83 lines
3.0 KiB
Python
# from __future__ import division
|
|
from nltk import sent_tokenize, word_tokenize, pos_tag
|
|
from nltk.probability import FreqDist
|
|
from nltk.corpus import stopwords
|
|
import nltk
|
|
import codecs
|
|
import base64
|
|
|
|
|
|
# Download the NLTK 'stopwords' corpus that stopwords.words() reads below.
nltk.download('stopwords')

# Load the treaty and split it into paragraphs on blank lines.
with open('treaty_file/russia-estonia.txt', 'r') as russia_file:
    russia_text = russia_file.read()
russia_text_list = russia_text.split("\n\n")

# Stop words = NLTK's English list plus the project's own list (one per line).
t_default_stopwords = set(stopwords.words('english'))
# No encoding was ever passed to codecs.open(), so it behaved exactly like the
# built-in open(); using open() in a `with` block also closes the file.
with open('t_stopwords.txt', 'r') as stopwords_file:
    t_custom_stopwords = set(stopwords_file.read().splitlines())
t_all_stopwords = t_default_stopwords | t_custom_stopwords
|
|
|
|
|
|
|
|
# Document head: loads jQuery, the page stylesheet and the highlight script.
# The doctype must be "<!DOCTYPE html>"; a doctype without a name is a parse
# error and makes browsers render the page in quirks mode.
print('''<!DOCTYPE html>
<html>
<head>
<script src="https://code.jquery.com/jquery-3.5.0.min.js"></script>
<link rel="stylesheet" href="estonia.css">
<script src="highlight.js"></script>
<meta charset="utf-8">

<title></title>

</head>
<body>''')
|
|
|
|
|
|
# t_wrapper (second wrapper): opens the wrapper and the intro area, which
# holds the treaty image and the info box.
print('<div class="t_wrapper"><div class="t_intro">')

# Embed the image as a base64 data URL. The file is read inside a `with`
# block so the handle is closed as soon as the bytes are in memory.
with open('img/tartu.jpeg', 'rb') as t_image_file:
    img_url = base64.b64encode(t_image_file.read()).decode('utf-8')

# The media type of a data URL has to be a real MIME type ("image/jpeg"),
# not the path of the source file.
t_image = (
    '<div class="t_img">Peace Treaty of Tartu, Estonia<br>'
    '<img class="t_image" src="data:image/jpeg;base64,{}"></div>'
).format(img_url)
print(t_image)

# t_info box: one row per (title, content) pair.
print('<div class ="t_info">')
t_infotext = [
    ('Name of Treaty', 'Peace Treaty of Tartu'),
    ('Country of Origin', 'Russia'),
    ('Signed', 'February 2, 1920'),
    ('Location', 'Tartu, Estonia'),
    ('Word Counts', '2,104'),
    ('Type', 'bilateral peace treaty'),
    ('Original Source', '<a href="https://en.wikipedia.org/wiki/Treaty_of_Tartu_(Russian-Estonian)">link</a>'),
    ('Description', 'The Tartu Peace Treaty or Treaty of Tartu is a peace treaty between Estonia and Russian Soviet Federative Socialist Republic signed on 2 February 1920, ending the Estonian War of Independence.'),
]

for t_title, t_info in t_infotext:
    # The title is used both as a class-name suffix and as the visible label.
    print('<div class="t_info-{0}"><div class="info_t_title" ><b>{0}</b></div><div class="t_info_content">{1}</div></div><br>'.format(t_title, t_info))

# Closes t_info and t_intro; t_wrapper stays open for the sections that follow.
print('</div></div>')
|
|
|
|
|
|
# Treaty text: each paragraph becomes a <p>, each token a <span> whose classes
# are its POS tag and its lower-cased text, with punctuation spelled as words.
def _t_class_name(t_text):
    """Return t_text with '.', ',', '(', ')', ':' and ';' replaced by words."""
    t_substitutions = (
        ('.', 'dot'),
        (',', 'comma'),
        ('(', 'marks'),
        (')', 'marks'),
        (':', 'marks'),
        (';', 'marks'),
    )
    for t_symbol, t_label in t_substitutions:
        t_text = t_text.replace(t_symbol, t_label)
    return t_text


print('<div class="t_paragraph">')
for t_paragraph in russia_text_list:
    t_tokenized = word_tokenize(t_paragraph)
    t_tagged = pos_tag(t_tokenized)
    print('<p>')
    for t_word, t_pos in t_tagged:
        # NOTE(review): t_word is written into the markup without HTML escaping.
        t_span = '<span class="{0} {1}">{2}</span>'.format(
            _t_class_name(t_pos),
            _t_class_name(t_word).lower(),
            t_word,
        )
        print(t_span)
    print('</p>')

print('</div>')
|
|
|
|
|
|
# Frequent words: the 20 most common tokens of the treaty that are not stop
# words, compared case-insensitively.
print('<div class="t_top_words"><div class="t_top_words_title"><b>Frequent words</b></div>')

# Tokenize every paragraph here instead of reusing t_tokenized: once the
# paragraph loop has finished, that name only holds the tokens of the last
# paragraph, so the counts would not cover the whole treaty.
t_tokens_without_stopwords = FreqDist()
for t_text in russia_text_list:
    for t_token in word_tokenize(t_text):
        t_lowered = t_token.lower()
        if t_lowered not in t_all_stopwords:
            t_tokens_without_stopwords[t_lowered] += 1

t_top_words = t_tokens_without_stopwords.most_common(20)

for t_chosen_words, t_frequency in t_top_words:
    print('<div class="t_chosen_words" > {} ({}) </div>'.format(t_chosen_words, t_frequency))

# NOTE(review): only t_top_words and t_wrapper are still open at this point,
# so three of the five closing </div> tags below have no matching opener in
# this script. Left unchanged; confirm whether an outer page expects them.
print('</div></div></div>')

print('</div></div>')
print('''</body></html>''')
|
|
|