You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

106 lines
4.1 KiB
Python

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

# from __future__ import division
import base64
import codecs
import html

import nltk
from nltk import sent_tokenize, word_tokenize, pos_tag
from nltk.corpus import stopwords
from nltk.probability import FreqDist
# Fetch the NLTK stopword corpus that stopwords.words('english') reads below.
# NOTE(review): word_tokenize/pos_tag, used later in this script, need their
# own NLTK data packages; presumably installed separately -- confirm on a
# fresh machine.
nltk.download('stopwords')

# Read the treaty and split it into paragraphs on blank lines.
# UTF-8 is requested explicitly so decoding does not depend on the platform's
# default locale encoding (the generated page declares charset utf-8).
with open('treaty_file/russia-estonia.txt', 'r', encoding='utf-8') as russia_file:
    russia_text = russia_file.read()
russia_text_list = russia_text.split("\n\n")

# Stopword sets: NLTK's English list, the project's own list (one entry per
# line in t_stopwords.txt), and their union.
t_default_stopwords = set(stopwords.words('english'))
# A context manager closes the file; the previous bare codecs.open(...).read()
# left the handle open.
with open('t_stopwords.txt', 'r', encoding='utf-8') as t_stopwords_file:
    t_custom_stopwords = set(t_stopwords_file.read().splitlines())
t_all_stopwords = t_default_stopwords | t_custom_stopwords
# Opening of the generated HTML document: doctype, <head> assets, and the
# opening <body> tag.
# The doctype must be spelled "<!DOCTYPE html>"; a bare "<!DOCTYPE>" is not a
# valid doctype and makes browsers fall back to quirks mode.
# NOTE(review): jQuery is requested twice (3.5.0 and 3.5.1); left unchanged
# here -- confirm whether one of the two tags can be dropped.
t_html_head = '''<!DOCTYPE html>
<html>
<head>
<script src="https://code.jquery.com/jquery-3.5.0.min.js"></script>
<script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
<link rel="stylesheet" href="estonia.css">
<link rel="stylesheet" href="legend.css">
<script src="highlight.js"></script>
<meta charset="utf-8">
<title></title>
</head>
<body>'''
print(t_html_head)
# t_wrapper (second wrapper): open the wrapper and the intro column.
print('<div class="t_wrapper"><div class="t_intro">')

# Embed the treaty photo inline as a base64 data URI.
# The file is read inside a context manager so the handle is closed, and the
# URI carries the real media type ("image/jpeg"); the previous value
# "img/tartu.jpeg" was a file path, not a media type.
with open('img/tartu.jpeg', 'rb') as t_image_file:
    img_url = base64.b64encode(t_image_file.read()).decode('utf-8')
t_image = '<div class="t_img" style="position: fixed; background-color: gainsboro;">Peace Treaty of Tartu</div><br><img class="t_image" src="data:image/jpeg;base64,{}">'.format(img_url)
print(t_image)

# t_info box: one labelled row per (title, content) pair.
print('<div class ="t_info">')
t_infotext = [
    ('Name of Treaty', 'Peace Treaty of Tartu'),
    ('Country of Origin', 'Russia'),
    ('Signed', 'February 2, 1920'),
    ('Location', 'Tartu, Estonia'),
    ('Word Counts', '2,104'),
    ('Type', 'bilateral peace treaty'),
    # Content values are emitted verbatim, so they may carry markup.
    ('Original Source', '<a href="https://en.wikipedia.org/wiki/Treaty_of_Tartu_(Russian-Estonian)" target="_blank">link</a>'),
    ('Description', 'The Tartu Peace Treaty or Treaty of Tartu is a peace treaty between Estonia and Russian Soviet Federative Socialist Republic signed on 2 February 1920, ending the Estonian War of Independence.'),
]
for t_title, t_info in t_infotext:
    # The title is used both in the row's class attribute and as its label.
    print('<div class="t_info-{0}"><div class="info_t_title" ><b>{0}</b></div><div class="t_info_content">{1}</div></div><br>'.format(t_title, t_info))
# Close t_info and t_intro.
print('</div></div>')
# Part-of-speech colour legend: one <li> per (CSS class, visible label) pair,
# wrapped in the "legend" container. The emitted text starts and ends with a
# blank line.
t_legend_entries = (
    ('legendverb', 'verb'),
    ('legendnoun', 'noun'),
    ('legendadjective', 'adjective'),
    ('legendadverb', 'adverb'),
    ('legendpossesivepronoun', 'possesive pronoun'),
    ('legendpresentparticiple', 'present participle'),
    ('legendadjectivesuperlative', 'adjective superlative'),
    ('legendadverb-comparative-superative', 'adverb comparative + superative'),
    ('legendpropernoun', 'proper noun'),
    ('legendhide', 'stopwords'),
)
t_legend_rows = [
    '<li><span class="{}"></span> {}</li>'.format(t_legend_class, t_legend_label)
    for t_legend_class, t_legend_label in t_legend_entries
]
# Empty first and last items produce the leading and trailing newline.
print('\n'.join(['', '<div class="legend">'] + t_legend_rows + ['</div>', '']))
# Treaty text: one <p> per paragraph and one <span> per token. Each span's
# classes are the token's POS tag, a class-safe lower-cased form of the token,
# and "eachwords".

# Punctuation that is spelled out when it appears in a class name.
t_punctuation_names = {
    '.': 'dot',
    ',': 'comma',
    '(': 'marks',
    ')': 'marks',
    ':': 'marks',
    ';': 'marks',
}
t_pos_class_table = str.maketrans(t_punctuation_names)
# The token table also spells out apostrophes. The previous code called
# replace('', 'apostrophe'), whose empty search string inserts "apostrophe"
# between every character of the token.
# NOTE(review): the lost character was presumably a curly apostrophe
# (U+2019); the straight ASCII apostrophe is mapped as well -- confirm
# against the stylesheet.
t_word_class_table = str.maketrans(
    {"'": 'apostrophe', '\u2019': 'apostrophe', **t_punctuation_names}
)

print('<div class="t_paragraph">')
t_tokenized_all = []
for t_paragraph in russia_text_list:
    t_tokenized = word_tokenize(t_paragraph)
    t_tokenized_all += t_tokenized  # collected for the frequency count later
    t_tagged = pos_tag(t_tokenized)
    print('<p>')
    for t_word, t_pos in t_tagged:
        # Only the 'PRP$' tag is renamed; '$' in other tags is left as-is.
        t_pos_class = t_pos.replace('PRP$', 'PRPS').translate(t_pos_class_table)
        t_word_class = t_word.translate(t_word_class_table).lower()
        # Escape everything interpolated into markup so '&', '<' or quotes in
        # the treaty text cannot break the generated HTML.
        print('<span class="{0} {1} eachwords">{2}</span>'.format(
            html.escape(t_pos_class, quote=True),
            html.escape(t_word_class, quote=True),
            html.escape(t_word, quote=False),
        ))
    print('</p>')
print('</div>')
# Frequent words: the 20 most common lower-cased tokens of the treaty that
# are not in the custom stopword list, each shown with its count.
print('<div class="t_top_words"><div class="t_top_words_title"><b>Frequent words</b></div>')
# NOTE(review): only t_custom_stopwords is applied here; the combined
# t_all_stopwords set built earlier is never used. Kept as-is because
# switching would change the published list -- confirm which was intended.
t_tokens_without_stopwords = FreqDist(
    t_token.lower()
    for t_token in t_tokenized_all
    if t_token.lower() not in t_custom_stopwords
)
# Copy of the distribution above; not read again in this section.
t_frequency_word = FreqDist(t_tokens_without_stopwords)
t_top_words = t_tokens_without_stopwords.most_common(20)
for t_chosen_words, t_frequency in t_top_words:
    # Escape the token so '&' or '<' cannot break the generated HTML.
    print('<div class="t_chosen_words" >&nbsp;{}&nbsp;({}) </div>'.format(html.escape(t_chosen_words, quote=False), t_frequency))
# Closes t_top_words and two further <div> levels.
print('</div></div></div>')
# End of the wrapper: two more closing </div> tags, then the end of the
# document, each on its own output line.
for t_closing_markup in ('</div>', '</div>', '</body></html>'):
    print(t_closing_markup)