# bo-graduation/website/estonia.py

# from __future__ import division
import base64
import codecs
import html

import nltk
from nltk import sent_tokenize, word_tokenize, pos_tag
from nltk.corpus import stopwords
from nltk.probability import FreqDist


# Make sure the NLTK stopword corpus used below is available.
nltk.download('stopwords')

# Read the treaty and split it into paragraphs on blank lines.
# NOTE(review): no explicit encoding is passed, so the platform default is
# used; presumably the text files are UTF-8 like the generated page -- confirm.
with open('treaty_file/russia-estonia.txt', 'r') as russia_file:
	russia_text = russia_file.read()
russia_text_list = russia_text.split("\n\n")

t_default_stopwords = set(stopwords.words('english'))
# Custom stopwords, one entry per line. The context manager closes the handle
# deterministically; the former codecs.open(...) call left it open.
with open('t_stopwords.txt', 'r') as t_stopwords_file:
	t_custom_stopwords = set(t_stopwords_file.read().splitlines())
t_all_stopwords = t_default_stopwords | t_custom_stopwords


# Opening of the HTML document: doctype, <head> (jQuery, stylesheets,
# highlight.js, charset) and the opening <body> tag, emitted as one print.
# NOTE(review): '<!DOCTYPE>' carries no 'html' name; it is reproduced as-is
# here because changing it could alter how the existing CSS is rendered.
head_lines = (
	'<!DOCTYPE>',
	'<html>',
	'<head>',
	'\t<script\tsrc="https://code.jquery.com/jquery-3.5.0.min.js"></script>',
	'\t<link rel="stylesheet" href="estonia.css">',
	'\t<link rel="stylesheet" href="legend.css">',
	'\t<script src="highlight.js"></script>',
	'\t<meta charset="utf-8">',
	'',
	'\t<title></title>',
	'\t',
	'</head>',
	'<body>',
)
print('\n'.join(head_lines))


#t_wrapper (second wrapper)
# Opens t_wrapper and t_intro; both are closed by later print calls.
print('<div class="t_wrapper"><div class="t_intro">')

# Embed the header image directly in the page as a base64 data URI.
# The file is read inside a context manager so the handle is always closed.
with open('img/tartu.jpeg', 'rb') as t_image_file:
	img_url = base64.b64encode(t_image_file.read()).decode('utf-8')
# The media type of a JPEG data URI is "image/jpeg"; the previous value
# ("img/tartu.jpeg") was the file path, which is not a valid MIME type.
t_image = '<div class="t_img">Peace Treaty of Tartu, Estonia<br><img class="t_image" src="data:image/jpeg;base64,{}"></div>'.format(img_url)
print(t_image)


#t_info box
# Each (title, content) pair becomes one row; the title is used both as the
# suffix of the row's class attribute and as the bold label.
print('<div class ="t_info">')
t_infotext = [
	('Name of Treaty', 'Peace Treaty of Tartu'),
	('Country of Origin', 'Russia'),
	('Signed', 'February 2, 1920'),
	('Location', 'Tartu, Estonia'),
	('Word Counts', '2,104'),
	('Type', 'bilateral peace treaty'),
	('Original Source', '<a href="https://en.wikipedia.org/wiki/Treaty_of_Tartu_(Russian-Estonian)" target="_blank">link</a>'),
	('Description', 'The Tartu Peace Treaty or Treaty of Tartu is a peace treaty between Estonia and Russian Soviet Federative Socialist Republic signed on 2 February 1920, ending the Estonian War of Independence.'),
]

t_info_row = '<div class="t_info-{title}"><div class="info_t_title" ><b>{title}</b></div><div class="t_info_content">{content}</div></div><br>'
for label, value in t_infotext:
	print(t_info_row.format(title=label, content=value))

# Closes t_info and the enclosing t_intro.
print('</div></div>')

# Legend block: one <li> per word category, given as
# (suffix appended to the "legend" class name, visible label).
legend_entries = (
	('verb', 'verb'),
	('noun', 'noun'),
	('adjective', 'adjective'),
	('adverb', 'adverb'),
	('hide', 'stopwords'),
)
legend_rows = ''.join(
	'\t    <li><span class="legend{}"></span> {}</li>\n'.format(css_suffix, label)
	for css_suffix, label in legend_entries
)
print('\n\t<div class="legend">\n' + legend_rows + '\t</div>\n')


#Treaty text
# Punctuation that appears in tokens / POS tags, spelled out as words so the
# result can be used inside a class attribute.
_CLASS_NAME_TABLE = str.maketrans({
	'.': 'dot',
	',': 'comma',
	'(': 'marks',
	')': 'marks',
	':': 'marks',
	';': 'marks',
})


def _class_name(text):
	"""Return *text* with the punctuation in _CLASS_NAME_TABLE spelled out."""
	return text.translate(_CLASS_NAME_TABLE)


# Emit every paragraph as a <p> whose tokens are wrapped in
# <span class="POS-TAG lowercased-token">token</span>, one span per line.
print('<div class="t_paragraph">')
t_tokenized_all = []  # every token of the treaty, reused for the frequency list
for t_paragraph in russia_text_list:
	t_tokenized = word_tokenize(t_paragraph)
	t_tokenized_all += t_tokenized
	t_tagged = pos_tag(t_tokenized)
	print('<p>')
	for t_word, t_pos in t_tagged:
		# html.escape keeps characters such as & < > " ' in the treaty text
		# from breaking the markup; browsers decode them back when rendering.
		print('<span class="{0} {1}">{2}</span>'.format(
			html.escape(_class_name(t_pos)),
			html.escape(_class_name(t_word).lower()),
			html.escape(t_word),
		))
	print('</p>')

print('</div>')


#treaty top words list
print('<div class="t_top_words"><div class="t_top_words_title"><b>Frequent words</b></div>')

# Frequency distribution of the lower-cased tokens, minus custom stopwords.
# NOTE(review): only t_custom_stopwords is filtered out here; t_all_stopwords
# (which also contains the NLTK English list) is not applied -- confirm that
# this is intended.
t_tokens_without_stopwords = FreqDist(
	t_lowered
	for t_lowered in (t_token.lower() for t_token in t_tokenized_all)
	if t_lowered not in t_custom_stopwords
)
t_top_words = t_tokens_without_stopwords.most_common(20)

for t_chosen_words, t_frequency in t_top_words:
	# Escape the token so characters like & or < cannot break the markup.
	print('<div class="t_chosen_words" >&nbsp;{}&nbsp;({}) </div>'.format(html.escape(t_chosen_words), t_frequency))

# Only t_top_words and t_wrapper are still open at this point, so exactly two
# closing tags are needed; the three additional </div> emitted previously had
# no matching opening tag.
print('</div></div>')
print('''</body></html>''')
worked with Iframe 5 years ago			`# from __future__ import division`
			`from nltk import sent_tokenize, word_tokenize, pos_tag`
			`from nltk.probability import FreqDist`
			`from nltk.corpus import stopwords`
			`import nltk`
			`import codecs`
			`import base64`


			`nltk.download('stopwords')`

			`with open('treaty_file/russia-estonia.txt', 'r') as russia_file:`
			`russia_text = russia_file.read()`
			`russia_text_list = russia_text.split("\n\n")`

			`t_default_stopwords = set(stopwords.words('english'))`
			`t_custom_stopwords = set(codecs.open('t_stopwords.txt', 'r').read().splitlines())`
			`t_all_stopwords = t_default_stopwords \| t_custom_stopwords`



			`print('''<!DOCTYPE>`
			`<html>`
			`<head>`
			`<script src="https://code.jquery.com/jquery-3.5.0.min.js"></script>`
			`<link rel="stylesheet" href="estonia.css">`
updated 5 years ago			`<link rel="stylesheet" href="legend.css">`
worked with Iframe 5 years ago			`<script src="highlight.js"></script>`
			`<meta charset="utf-8">`

			`<title></title>`

			`</head>`
			`<body>''')`


			`#t_wrapper (second wrapper)`
			`print('<div class="t_wrapper"><div class="t_intro">')`

			`img_url = base64.b64encode(open('img/tartu.jpeg', 'rb').read()).decode('utf-8')`
			`t_image = '<div class="t_img">Peace Treaty of Tartu, Estonia<br><img class="t_image" src="data:img/tartu.jpeg;base64,{}"></div>'.format(img_url)`
			`print(t_image)`


			`#t_info box`
			`print('<div class ="t_info">')`
updated 5 years ago			t_infotext = [('Name of Treaty', 'Peace Treaty of Tartu'), ('Country of Origin', 'Russia'), ('Signed', 'February 2, 1920'), ('Location', 'Tartu, Estonia'), ('Word Counts', '2,104'), ('Type', 'bilateral peace treaty'), ('Original Source', '<a href="https://en.wikipedia.org/wiki/Treaty_of_Tartu_(Russian-Estonian)" target="_blank">link</a>'), ('Description', 'The Tartu Peace Treaty or Treaty of Tartu is a peace treaty between Estonia and Russian Soviet Federative Socialist Republic signed on 2 February 1920, ending the Estonian War of Independence.')]
worked with Iframe 5 years ago
			`for t_title, t_info in t_infotext:`
			`print('<div class="t_info-{0}"><div class="info_t_title" ><b>{0}</b></div><div class="t_info_content">{1}</div></div><br>'.format(t_title, t_info))`

			`print('</div></div>')`

updated 5 years ago			`print('''`
			`<div class="legend">`
			`<li><span class="legendverb"></span> verb</li>`
			`<li><span class="legendnoun"></span> noun</li>`
			`<li><span class="legendadjective"></span> adjective</li>`
			`<li><span class="legendadverb"></span> adverb</li>`
			`<li><span class="legendhide"></span> stopwords</li>`
			`</div>`
			`''')`


worked with Iframe 5 years ago
			`#Treaty text`
			`print('<div class="t_paragraph">')`
updated 5 years ago			`t_tokenized_all = []`
worked with Iframe 5 years ago			`for t_paragraph in russia_text_list:`
			`t_tokenized = word_tokenize(t_paragraph)`
updated 5 years ago			`t_tokenized_all += t_tokenized # add to the tokenized_all`
worked with Iframe 5 years ago			`t_tagged = pos_tag(t_tokenized)`
			`print('<p>')`
			`for t_word, t_pos in t_tagged:`
			`print('<span class="{0} {1}">{2}</span>'.format(t_pos.replace('.', 'dot').replace(',', 'comma').replace('(', 'marks').replace(')', 'marks').replace(':', 'marks').replace(';', 'marks'), t_word.replace('.', 'dot').replace(',', 'comma').replace('(', 'marks').replace(')', 'marks').replace(':', 'marks').replace(';', 'marks').lower(), t_word))`
			`print('</p>')`

			`print('</div>')`


			`#treaty colonial top words list`
			`print('<div class="t_top_words"><div class="t_top_words_title"><b>Frequent words</b></div>')`

updated 5 years ago			`t_tokens_without_stopwords = nltk.FreqDist(words.lower() for words in t_tokenized_all if words.lower() not in t_custom_stopwords)`
worked with Iframe 5 years ago			`t_frequency_word = FreqDist(t_tokens_without_stopwords)`
			`t_top_words = t_tokens_without_stopwords.most_common(20)`

			`for t_chosen_words, t_frequency in t_top_words:`
			`print('<div class="t_chosen_words" > {} ({}) </div>'.format(t_chosen_words, t_frequency))`

			`print('</div></div></div>')`


			`print('</div></div>')`
			`print('''</body></html>''')`