You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
521 lines
12 KiB
Python
521 lines
12 KiB
Python
# from __future__ import division
|
|
from nltk import sent_tokenize, word_tokenize, pos_tag
|
|
from nltk.probability import FreqDist
|
|
from nltk.corpus import stopwords
|
|
import nltk
|
|
import codecs
|
|
import base64
|
|
|
|
|
|
# Fetch the NLTK stop-word corpus so that stopwords.words('english') below can load it.
# NOTE(review): word_tokenize and pos_tag rely on further NLTK data that is not
# downloaded here - presumably it is already installed; confirm.
nltk.download('stopwords')
|
|
|
|
|
|
# Read both source documents completely: the FaceApp terms of service and the
# Russia-Estonia treaty.
with open('tos_file/faceapp.txt', 'r') as faceapp_file:
    faceapp_text = faceapp_file.read()

with open('treaty_file/russia-estonia.txt', 'r') as russia_file:
    russia_text = russia_file.read()

# A blank line (two consecutive newlines) separates paragraphs in both files.
faceapp_text_list = faceapp_text.split("\n\n")
russia_text_list = russia_text.split("\n\n")
|
|
|
|
|
|
# Stop-word sets used to filter the "Frequent words" lists: NLTK's English list
# combined with a hand-maintained list per document (one entry per line).


def _read_stopword_file(path):
    """Return the lines of the text file at *path* as a set.

    The file is opened with the built-in ``open`` inside a ``with`` block so
    the handle is closed as soon as it has been read.  The previous
    ``codecs.open(path, 'r')`` call passed no encoding, in which case it simply
    returns the result of the built-in ``open`` - the decoded text is the same,
    but the handle was never closed.
    """
    with open(path, 'r') as stopword_file:
        return set(stopword_file.read().splitlines())


#tos stopwords
tos_default_stopwords = set(stopwords.words('english'))
tos_custom_stopwords = _read_stopword_file('stopwords.txt')
tos_all_stopwords = tos_default_stopwords | tos_custom_stopwords

#treaty stopwords
# Copy the already-loaded English list instead of reading the corpus a second
# time; the copy keeps the two default sets independent of each other.
t_default_stopwords = set(tos_default_stopwords)
t_custom_stopwords = _read_stopword_file('t_stopwords.txt')
t_all_stopwords = t_default_stopwords | t_custom_stopwords
|
|
|
|
|
|
# Static opening of the generated page: <head>, the stylesheet, and the jQuery
# click handlers that toggle highlighting per part-of-speech class.  The
# document body is printed by the code that follows; the closing </body></html>
# is printed at the very end of the script.
#
# Changes compared with the earlier version of this string:
#   * Lines "commented out" with '#' inside <style> are now real CSS comments
#     (/* ... */).  '#' is not comment syntax in CSS; those lines only had no
#     effect because browsers discard declarations they cannot parse.
#   * 'font-weight: regular' is not a valid value; it is now 'normal'.
#   * '.VVD' is replaced by '.VBD' in the verb selectors - the <span> classes
#     are the tags produced by nltk.pos_tag, whose default tag set names the
#     past-tense verb tag VBD.
#   * The click handlers toggle 'adverb-comparative' and 'adverb-superlative';
#     these classes now share the '.adverb' rule so the toggle is visible.
# NOTE(review): 'font-weight: 16th' is not a valid CSS value either and is
# ignored by browsers; it is left untouched because the intended weight cannot
# be determined from this file.
PAGE_HEAD = '''<!DOCTYPE html>
<html>
<head>
    <script src="https://code.jquery.com/jquery-3.5.0.min.js"></script>

    <meta charset="utf-8">
    <title></title>
    <style>

        @font-face {
            font-family: "Belgika";
            src: url("http://bohyewoo.com/webfonts/belgika/belgika-40th-webfont.eot");
            src: url("http://bohyewoo.com/webfonts/belgika/belgika-40th-webfont.woff") format("woff"),
            url("http://bohyewoo.com/webfonts/belgika/belgika-40th-webfont.svg#filename") format("svg");
        }

        @font-face {
            font-family: "Belgika";
            src: url("http://bohyewoo.com/webfonts/belgika/belgika-16th-webfont.eot");
            src: url("http://bohyewoo.com/webfonts/belgika/belgika-16th-webfont.woff") format("woff"),
            url("http://bohyewoo.com/webfonts/belgika/belgika-16th-webfont.svg#filename") format("svg");
        }

        @font-face {
            font-family: "Belgika";
            src: url("http://bohyewoo.com/webfonts/belgika/belgika-8th-webfont.eot");
            src: url("http://bohyewoo.com/webfonts/belgika/belgika-8th-webfont.woff") format("woff"),
            url("http://bohyewoo.com/webfonts/belgika/belgika-8th-webfont.svg#filename") format("svg");
        }

        @font-face {
            font-family: "SourceCodePro";
            src: url("http://bohyewoo.com/webfonts/Source_Code_Pro/SourceCodePro-Regular.ttf");
        }


        * {
            margin: 0;
            padding: 0;
        }

        html, body {
            height: 100%;
        }

        p {
            margin-bottom: 30px;
        }

        .tos_wrapper {
            /* background-color: yellow; */
            width: 49%;
            float: left;
            margin-right: 20px;
            border-right: 2px solid black;
        }

        .t_wrapper {
            /* background-color: green; */
            width: 49%;
            float: right;
        }

        .intro {
            width: 75%;
            float: right;
        }

        .t_intro {
            width: 75%;
            float: left;
        }

        .img {
            /* background-color: Aquamarine; */
            font-family: Belgika;
            font-size: 20pt;
            padding-top: 30px;
            float: left;
        }

        .image {
            height: 30vh;
            margin-top: 30px;
        }

        .t_img {
            /* background-color: Aquamarine; */
            font-family: Belgika;
            font-size: 20pt;
            padding-top: 30px;
            float: left;
        }

        .t_image {
            height: 30vh;
            margin-top: 30px;
        }

        .info {
            /* background-color: LightSkyBlue; */
            font-family: helvetica;
            font-weight: normal;
            font-size: 10pt;
            width: 45vh;
            float: left;
            margin-top: 30px;

            /* border: 2px solid black; */
            /* text-align: center; */
            column-count: 2;
        }

        .t_info {
            /* background-color: LightSkyBlue; */
            font-family: helvetica;
            font-weight: normal;
            font-size: 10pt;
            width: 45vh;
            float: left;
            margin-top: 30px;

            /* border: 2px solid black; */
            /* text-align: center; */
            column-count: 2;
        }

        .paragraph {
            /* background-color: gold; */
            font-family: helvetica;
            font-weight: normal;
            font-size: 20px;
            width: 75%;
            padding: 10px;
            margin-top: 70px;
            float: right;
        }

        .t_paragraph {
            /* background-color: gold; */
            font-family: helvetica;
            font-weight: normal;
            font-size: 20px;
            width: 98%;
            margin-top: 70px;
            float: right;
        }


        .top_words {
            background-color: black;
            width: 10%;
            height: 100%;

            font-family: Belgika;
            font-weight: 16th;
            font-size: 7.5pt;
            color: white;
            letter-spacing: 0.5px;

            position: fixed;
            float: left;
        }

        .top_words_title {
            /* background-color: yellow; */
            margin: 40px 10px 10px 10px;
            width: 80%;
        }

        .chosen_words {
            /* background-color: pink; */
            padding: 10px 1px 1px 10px;
        }

        .chosen_words:hover {
            background-color: white;
            color: red;
        }


        .t_top_words {
            background-color: black;
            width: 10%;
            height: 100%;

            font-family: Belgika;
            font-weight: 16th;
            font-size: 7.5pt;
            color: white;
            letter-spacing: 0.5px;

            position: fixed;
            float: right;
            right: 0;
            top: 0;
        }

        .t_top_words_title {
            margin: 40px 10px 10px 10px;
            width: 80%;
        }

        .t_chosen_words {
            padding: 10px 1px 1px 10px;
        }

        .t_chosen_words:hover {
            background-color: white;
            color: red;
        }


        .hide {
            /* color: white; */
            background-color: black;
        }

        .TO:hover, .DT:hover, .IN:hover, .WDT:hover, .WP:hover, .MD:hover, .PRP:hover, .CC:hover, .marks:hover, .comma:hover, .dot:hover {
            background-color: black;
        }

        .adjective {
            color: white;
            background-color: blue;
        }

        .JJ:hover {
            color: white;
            background-color: blue;
        }

        .noun {
            background-color: springgreen;
        }

        .NN:hover, .NNS:hover {
            background-color: springgreen;
        }

        .verb {
            background-color: yellow;
            /* color: yellow; */
        }

        .VB:hover, .VBZ:hover, .VBN:hover, .VBD:hover {
            background-color: yellow;
        }

        .propernoun {
            background-color: pink;
            /* color: pink; */
        }

        .NNP:hover, .NNPS:hover {
            background-color: pink;
        }

        .adverb, .adverb-comparative, .adverb-superlative {
            background-color: lightgreen;
            /* color: lightgreen; */
        }

        .RB:hover, .RBR:hover, .JJR:hover, .RBS:hover {
            background-color: lightgreen;
        }

    </style>
</head>
<body>
    <script>
        $(document).ready(function() {

            $('.chosen_words').click( function(){
                var word = $(this).text();
                alert(word);
            });


            $('.TO, .DT, .IN, .WDT, .WP, .MD, .PRP, .CC, .marks, .comma, .dot').click(
                function() {
                    $('.TO, .DT, .IN, .WDT, .WP, .MD, .PRP, .CC, .marks, .comma, .dot').toggleClass('hide');
            });

            $('.JJ').click(
                function() {
                    $('.JJ').toggleClass('adjective');
            });

            $('.NN, .NNS').click(
                function() {
                    $('.NN, .NNS').toggleClass('noun');
            });

            $('.VB, .VBZ, .VBN, .VBD').click(
                function() {
                    $('.VB, .VBZ, .VBN, .VBD').toggleClass('verb');
            });

            $('.NNP, .NNPS').click(
                function() {
                    $('.NNP, .NNPS').toggleClass('propernoun');
            });

            $('.RB').click(
                function() {
                    $('.RB').toggleClass('adverb');
            });

            $('.RBR, .JJR').click(
                function() {
                    $('.RBR, .JJR').toggleClass('adverb-comparative');
            });

            $('.RBS').click(
                function() {
                    $('.RBS').toggleClass('adverb-superlative');
            });
        })

    </script>'''

print(PAGE_HEAD)
|
|
|
|
|
|
# $('div.t_chosen_words_{0}').mouseover(function(){
|
|
# $(this).find('.t_chosen_words_{0}').text('i + 'is here').css('color', 'red');
|
|
# })
|
|
# .mouseout(function() {
|
|
# $( this ).find( ".t_chosen_words" ).text( " " ).css('color', 'black');
|
|
# });
|
|
|
|
|
|
|
|
|
|
#wrapper
# Open the left-hand column (terms of service) and its "intro" area, which
# holds the logo and the info box.
print('<div class ="tos_wrapper"><div class="intro">')

#insert an image
# The logo is embedded in the page as a base64 data URI.  The file is read
# inside a `with` block so the handle is closed once the bytes are loaded.
with open('img/faceapp_logo.png', 'rb') as logo_file:
    FaceApp_img_url = base64.b64encode(logo_file.read()).decode('utf-8')

# The part of a data URI before ";base64," is a media type, not a file path,
# so it has to be "image/png" rather than "img/faceapp_logo.png".
FaceApp_image = '<div class="img">FaceApp<br><img class="image" src="data:image/png;base64,{}"></div>'.format(FaceApp_img_url)
print(FaceApp_image)

#info box
print('<div class ="info">')
# (title, content) pairs shown in the info box, in display order.
infotext = [
    ('Name of Service', 'FaceApp'),
    ('Country of Origin', 'Russia'),
    ('Initial release', 'December 31, 2016'),
    ('Type', 'Image editing'),
    ('Word Counts', '5,392'),
    ('Original Source', '<a href="https://www.faceapp.com/terms-en.html">link</a>'),
    ('Description', 'FaceApp is a mobile application for iOS and Android developed by Russian company Wireless Lab. The app generates highly realistic transformations of human faces in photographs by using neural networks based on artificial intelligence. The app can transform a face to make it smile, look younger, look older, or change gender.'),
]

# The title is interpolated verbatim into the class attribute, so a multi-word
# title yields several space-separated class tokens.
for title, info in infotext:
    print('<div class="info_{0}" ><div class="info_title" style="-webkit-text-decoration-line: underline; text-decoration-line: underline;" >{0}</div><div class="info_content">{1}</div></div><br>'.format(title, info))

# Close the info box and the intro area.
print('</div></div>')
|
|
|
|
|
|
|
|
#ToS text
# Punctuation that may occur in a token or in its part-of-speech tag, mapped to
# the readable class names the stylesheet and click handlers refer to.  None of
# the replacement words contains a mapped character, so one translate() pass
# gives the same result as replacing the characters one after another.
tos_class_names = str.maketrans({
    '.': 'dot',
    ',': 'comma',
    '(': 'marks',
    ')': 'marks',
    ':': 'marks',
    ';': 'marks',
})

# Tokens of the whole document, collected while rendering.  The frequency list
# below used to read `tokenized` after the loop, which only holds the tokens of
# the final paragraph.
tos_tokens = []

print('<div class ="paragraph">')
for paragraph in faceapp_text_list:
    tokenized = word_tokenize(paragraph)
    tos_tokens.extend(tokenized)
    tagged = pos_tag(tokenized)
    print('<p>')
    # One <span> per token; its classes are the POS tag and the lower-cased token.
    for word, pos in tagged:
        print('<span class="{0} {1}">{2}</span>'.format(
            pos.translate(tos_class_names),
            word.translate(tos_class_names).lower(),
            word,
        ))
    print('</p>')

print('</div>')

#tos top words list
print('<div class="top_words"><div class="top_words_title" style="-webkit-text-decoration-line: underline; text-decoration-line: underline;" >Frequent words:</div>')

# Count every lower-cased token of the document that is not a stop word.
tokens_without_stopwords = nltk.FreqDist(
    token.lower() for token in tos_tokens if token.lower() not in tos_all_stopwords
)
frequency_word = FreqDist(tokens_without_stopwords)
top_words = tokens_without_stopwords.most_common(30)

for chosen_words, frequency in top_words:
    print('<div class="chosen_words" > {} ({}) </div>'.format(chosen_words, frequency))

print('</div></div></div>')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#t_wrapper (second wrapper)
# Open the right-hand column (treaty) and its "t_intro" area, which holds the
# image and the info box.
print('</div><div class="t_wrapper"><div class="t_intro">')

#insert an image
# https://upload.wikimedia.org/wikipedia/commons/1/15/Joffe_signing_the_Treaty_of_Tartu.jpg
# The picture is embedded in the page as a base64 data URI.  The file is read
# inside a `with` block so the handle is closed once the bytes are loaded.
with open('img/tartu.jpeg', 'rb') as treaty_image_file:
    img_url = base64.b64encode(treaty_image_file.read()).decode('utf-8')

# The part of a data URI before ";base64," is a media type, not a file path,
# so it has to be "image/jpeg" rather than "img/tartu.jpeg".
t_image = '<div class="t_img">Peace Treaty of Tartu, Estonia<br><img class="t_image" src="data:image/jpeg;base64,{}"></div>'.format(img_url)
print(t_image)

#t_info box
print('<div class ="t_info">')
# (title, content) pairs shown in the info box, in display order.
t_infotext = [
    ('Name of Treaty', 'Peace Treaty of Tartu'),
    ('Country of Origin', 'Russia'),
    ('Signed', 'February 2, 1920'),
    ('Location', 'Tartu, Estonia'),
    ('Word Counts', '2,104'),
    ('Type', 'bilateral peace treaty'),
    ('Original Source', '<a href="https://en.wikipedia.org/wiki/Treaty_of_Tartu_(Russian-Estonian)">link</a>'),
    ('Description', 'The Tartu Peace Treaty or Treaty of Tartu is a peace treaty between Estonia and Russian Soviet Federative Socialist Republic signed on 2 February 1920, ending the Estonian War of Independence.'),
]

# The title is interpolated verbatim into the class attribute, so a multi-word
# title yields several space-separated class tokens.
for t_title, t_info in t_infotext:
    print('<div class="t_info-{0}"><div class="info_t_title" style="-webkit-text-decoration-line: underline; text-decoration-line: underline;" >{0}</div><div class="t_info_content">{1}</div></div><br>'.format(t_title, t_info))

# Close the info box.
print('</div>')
|
|
|
|
|
|
#Treaty text
# Punctuation that may occur in a token or in its part-of-speech tag, mapped to
# the readable class names the stylesheet and click handlers refer to.  None of
# the replacement words contains a mapped character, so one translate() pass
# gives the same result as replacing the characters one after another.
t_class_names = str.maketrans({
    '.': 'dot',
    ',': 'comma',
    '(': 'marks',
    ')': 'marks',
    ':': 'marks',
    ';': 'marks',
})

# Tokens of the whole treaty, collected while rendering.  The frequency list
# below used to read `t_tokenized` after the loop, which only holds the tokens
# of the final paragraph.
t_tokens = []

print('<div class="t_paragraph">')
for t_paragraph in russia_text_list:
    t_tokenized = word_tokenize(t_paragraph)
    t_tokens.extend(t_tokenized)
    t_tagged = pos_tag(t_tokenized)
    print('<p>')
    # One <span> per token; its classes are the POS tag and the lower-cased token.
    for t_word, t_pos in t_tagged:
        print('<span class="{0} {1}">{2}</span>'.format(
            t_pos.translate(t_class_names),
            t_word.translate(t_class_names).lower(),
            t_word,
        ))
    print('</p>')

print('</div>')

#treaty top words list
print('<div class="t_top_words"><div class="t_top_words_title" style="-webkit-text-decoration-line: underline; text-decoration-line: underline;" >Frequent words:</div>')

# Count every lower-cased token of the treaty that is not a stop word.
t_tokens_without_stopwords = nltk.FreqDist(
    token.lower() for token in t_tokens if token.lower() not in t_all_stopwords
)
t_frequency_word = FreqDist(t_tokens_without_stopwords)
t_top_words = t_tokens_without_stopwords.most_common(20)

for t_chosen_words, t_frequency in t_top_words:
    print('<div class="t_chosen_words" > {} ({}) </div>'.format(t_chosen_words, t_frequency))

print('</div></div></div>')
|
|
|
|
|
|
|
|
##### not working
|
|
# #treaty colonial top words list
|
|
# print('<div class="t_top_words"><div class="t_top_words_title" style="-webkit-text-decoration-line: underline; text-decoration-line: underline;" >colonial words:</div>')
|
|
|
|
# for words in faceapp_text_list:
|
|
# t_tokens_without_stopwords = nltk.FreqDist(words.lower() for words in t_tokenized if words.lower() not in t_all_stopwords)
|
|
# t_frequency_word = FreqDist(t_tokens_without_stopwords)
|
|
# t_top_words = t_tokens_without_stopwords.most_common(20)
|
|
|
|
# for t_chosen_words, t_frequency in t_top_words:
|
|
# print('<div class="t_chosen_words" > {} ({}) </div>'.format(t_chosen_words, t_frequency))
|
|
|
|
# print('</div></div></div>')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Emit the final closing tags of the page, one per output line.
print('</div>', '</body></html>', sep='\n')
|
|
|