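"""Print a two-column HTML page comparing the FaceApp terms of service with the Treaty of Tartu.

The page is written to standard output. Every token of each text is wrapped in
a <span> whose class is its NLTK part-of-speech tag, and each column ends with
the 20 most frequent words that are not in that column's stop-word list.
"""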

from __future__ import division

import base64
import codecs
import glob
import html
import re

import nltk
from nltk import sent_tokenize, word_tokenize, pos_tag
from nltk.corpus import stopwords
from nltk.probability import FreqDist
# Make sure the NLTK stop-word corpus is available before it is read below.
nltk.download('stopwords')

# Source texts. Each is kept whole and also split on blank lines into a list
# of paragraphs.
with open('tos_file/faceapp.txt', 'r') as faceapp_file:
    faceapp_text = faceapp_file.read()
faceapp_text_list = faceapp_text.split("\n\n")

with open('treaty_file/russia-estonia.txt', 'r') as russia_file:
    russia_text = russia_file.read()
russia_text_list = russia_text.split("\n\n")

# Terms-of-service stop words: NLTK's English list plus the custom words
# listed one per line in stopwords.txt.
tos_default_stopwords = set(stopwords.words('english'))
with open('stopwords.txt', 'r') as tos_stopwords_file:
    tos_custom_stopwords = set(tos_stopwords_file.read().splitlines())
tos_all_stopwords = tos_default_stopwords | tos_custom_stopwords

# Treaty stop words: the same English list plus the custom words listed one
# per line in t_stopwords.txt.
t_default_stopwords = set(stopwords.words('english'))
with open('t_stopwords.txt', 'r') as t_stopwords_file:
    t_custom_stopwords = set(t_stopwords_file.read().splitlines())
t_all_stopwords = t_default_stopwords | t_custom_stopwords
# Page head: jQuery, web fonts, the stylesheet, and the click handlers for the
# part-of-speech <span> elements that the rest of the script prints.
#
# Click behaviour:
#   * a span tagged TO, DT, IN, WDT, WP, MD, PRP, CC, VB, VBZ or CD toggles
#     the "hide" class (white text) on every span carrying one of those tags;
#   * any other span highlights all spans that share its tag.
# Both handlers are registered unconditionally. Spans are matched by comparing
# the class attribute directly, because tags such as "." or "," cannot be
# written as a CSS class selector.
print('''<!DOCTYPE html>
<html>
<head>
<script src="https://code.jquery.com/jquery-3.5.0.min.js"></script>
<meta charset="utf-8">
<title></title>
<style>
@font-face {
  font-family: "Belgika";
  src: url("http://bohyewoo.com/webfonts/belgika/belgika-40th-webfont.eot");
  src: url("http://bohyewoo.com/webfonts/belgika/belgika-40th-webfont.woff") format("woff"),
       url("http://bohyewoo.com/webfonts/belgika/belgika-40th-webfont.svg#filename") format("svg");
}
@font-face {
  font-family: "Belgika";
  src: url("http://bohyewoo.com/webfonts/belgika/belgika-16th-webfont.eot");
  src: url("http://bohyewoo.com/webfonts/belgika/belgika-16th-webfont.woff") format("woff"),
       url("http://bohyewoo.com/webfonts/belgika/belgika-16th-webfont.svg#filename") format("svg");
}
@font-face {
  font-family: "Belgika";
  src: url("http://bohyewoo.com/webfonts/belgika/belgika-8th-webfont.eot");
  src: url("http://bohyewoo.com/webfonts/belgika/belgika-8th-webfont.woff") format("woff"),
       url("http://bohyewoo.com/webfonts/belgika/belgika-8th-webfont.svg#filename") format("svg");
}
@font-face {
  font-family: "SourceCodePro";
  src: url("http://bohyewoo.com/webfonts/Source_Code_Pro/SourceCodePro-Regular.ttf");
}
body {
}
.tos_wrapper {
  /* background-color: yellow; */
  width: 49.3%;
  float: left;
  margin-right: 20px;
  border-right: 2px solid black;
}
.t_wrapper {
  /* background-color: green; */
  width: 49.3%;
  float: right;
}
.t_img {
  /* background-color: Aquamarine; */
  font-family: helvetica;
  font-weight: normal;
  font-size: 20pt;
  float: left;
}
.img {
  width: 30vh;
}
.info {
  /* background-color: LightSkyBlue; */
  font-family: SourceCodePro;
  font-size: 10pt;
  width: 30vh;
  float: right;
  /* border: 2px solid black; */
  margin: 10px;
  text-align: center;
}
.t_info {
  /* background-color: LightSkyBlue; */
  font-family: SourceCodePro;
  font-size: 10pt;
  width: 30vh;
  float: right;
  /* border: 2px solid black; */
  padding: 10px;
  text-align: center;
}
.paragraph {
  /* background-color: gold; */
  font-family: helvetica;
  font-weight: normal;
  font-size: 40px;
  width: 98%;
  padding: 10px;
  margin-top: 70px;
  float: right;
}
.t_paragraph {
  /* background-color: gold; */
  font-family: helvetica;
  font-weight: normal;
  font-size: 40px;
  width: 98%;
  margin-top: 70px;
  float: right;
}
.top_words {
  /* background-color: purple; */
  font-family: helvetica;
  font-size: 15pt;
  width: 100%;
  column-count: 4;
  float: left;
}
.top_words_title {
  /* background-color: yellow; */
  width: 100%;
}
.chosen_words {
  /* background-color: pink; */
  line-height: 0.1;
  float: none;
}
.t_top_words {
  /* background-color: purple; */
  font-family: helvetica;
  font-size: 15pt;
  width: 100%;
  column-count: 4;
  float: left;
}
.t_top_words_title {
  /* background-color: yellow; */
  width: 100%;
}
.t_chosen_words {
  /* background-color: pink; */
  line-height: 0.1;
  float: none;
}
.hide {
  color: white;
}
</style>
</head>
<body>
<script>
$(document).ready(function() {
  var functionWords = '.TO, .DT, .IN, .WDT, .WP, .MD, .PRP, .CC, .VB, .VBZ, .CD';
  var $functionSpans = $(functionWords);
  var $otherSpans = $('span').not(functionWords);

  // function words: show or hide all of them together
  $functionSpans.click(function() {
    $functionSpans.toggleClass('hide');
  });

  // every other word: highlight all words that share its tag
  $otherSpans.click(function() {
    var selectedclass = $(this).attr('class');
    $otherSpans.css('background-color', 'white').css('color', 'black'); // reset
    $otherSpans.filter(function() {
      return $(this).attr('class') === selectedclass;
    }).css('background-color', '#FF4500').css('color', 'white'); // highlight the selection
  });
});
</script>
''')
# Left column: the FaceApp terms of service. The wrapper <div> opened here is
# closed by the statement that opens the treaty column.
print('<div class ="tos_wrapper">')

# Logo, inlined as a base64 data URI.
with open('img/faceapp_logo.png', 'rb') as faceapp_logo_file:
    FaceApp_img_url = base64.b64encode(faceapp_logo_file.read()).decode('utf-8')
FaceApp_image = (
    '<div class="t_img"><h1>FaceApp</h1>'
    '<img style="width:90%" src="data:image/png;base64,{}"></div>'
).format(FaceApp_img_url)
print(FaceApp_image)

# Info box: one underlined title plus its content per entry. The values are
# literals written as HTML (the source entry is a link), so they are emitted
# unescaped.
print('<div class ="info">')
infotext = [
    ('Service', 'FaceApp'),
    ('Country', 'Russia'),
    ('Type', 'Image editing'),
    ('Initial release', 'December 31, 2016'),
    ('source', '<a href="https://www.faceapp.com/terms-en.html">link</a>'),
    ('Description', 'FaceApp is a mobile application for iOS and Android developed by Russian company Wireless Lab. The app generates highly realistic transformations of human faces in photographs by using neural networks based on artificial intelligence. The app can transform a face to make it smile, look younger, look older, or change gender.'),
]
info_row = (
    '<div class="info_{0}">'
    '<div class="info_title" style="-webkit-text-decoration-line: underline; text-decoration-line: underline;" >{0}</div>'
    '<div class="info_content">{1}</div>'
    '</div><br>'
)
for title, info in infotext:
    print(info_row.format(title, info))
print('</div>')

# Terms-of-service text: one <span> per token, with the part-of-speech tag as
# its class. Tokens and tags come from the input file, so both are
# HTML-escaped before they are written into the page.
print('<div class ="paragraph">')
tokenized = word_tokenize(faceapp_text)
tagged = pos_tag(tokenized)
print('<p>')
for word, pos in tagged:
    print('<span class="{}">{}</span>'.format(html.escape(pos), html.escape(word)))
print('</p>')
print('</div>')

# "Colonial words": the 20 most frequent lower-cased tokens that are not in
# the terms-of-service stop-word set, each shown with its count.
print('<div class="top_words"> <div style="-webkit-text-decoration-line: underline; text-decoration-line: underline;" >colonial words:</div>')
frequency_word = FreqDist(
    token.lower() for token in tokenized if token.lower() not in tos_all_stopwords
)
top_words = frequency_word.most_common(20)
for chosen_words, frequency in top_words:
    print('<br><div class="chosen_words" >{}({}) </div>'.format(html.escape(chosen_words), frequency))
print('</div>')
# Right column: the Treaty of Tartu. The leading </div> closes the
# terms-of-service wrapper that is still open at this point.
print('</div><div class="t_wrapper">')

# Photograph, inlined as a base64 data URI.
# https://upload.wikimedia.org/wikipedia/commons/1/15/Joffe_signing_the_Treaty_of_Tartu.jpg
with open('img/tartu.jpeg', 'rb') as tartu_image_file:
    img_url = base64.b64encode(tartu_image_file.read()).decode('utf-8')
t_image = (
    '<div class="t_img"><h1>Peace Treaty of Tartu</h1>'
    '<img style="width:90%" src="data:image/jpeg;base64,{}"></div>'
).format(img_url)
print(t_image)

# Info box: one underlined title plus its content per entry. The values are
# literals written as HTML (the source entry is a link), so they are emitted
# unescaped.
print('<div class ="t_info">')
t_infotext = [
    ('Name of Treaty', 'Peace Treaty of Tartu'),
    ('Country', 'Russia'),
    ('Date', 'February 2, 1920'),
    ('Location', 'Tartu, Estonia'),
    ('Signed', 'February 2, 1920'),
    ('Type', 'bilateral peace treaty'),
    ('source', '<a href="https://en.wikipedia.org/wiki/Treaty_of_Tartu_(Russian-Estonian)">link</a>'),
    ('Description', 'The Tartu Peace Treaty or Treaty of Tartu is a peace treaty between Estonia and Russian Soviet Federative Socialist Republic signed on 2 February 1920, ending the Estonian War of Independence.'),
]
t_info_row = (
    '<div class="t_info-{0}">'
    '<div class="info_t_title" style="-webkit-text-decoration-line: underline; text-decoration-line: underline;" >{0}</div>'
    '<div class="t_info_content">{1}</div>'
    '</div><br>'
)
for t_title, t_info in t_infotext:
    print(t_info_row.format(t_title, t_info))
print('</div>')

# Treaty text: one <span> per token, with the part-of-speech tag as its
# class, wrapped in a single paragraph. Tokens and tags come from the input
# file, so both are HTML-escaped before they are written into the page.
print('<div class="t_paragraph">')
t_tokenized = word_tokenize(russia_text)
t_tagged = pos_tag(t_tokenized)
print('<p>')
for t_word, t_pos in t_tagged:
    print('<span class="{}">{}</span>'.format(html.escape(t_pos), html.escape(t_word)))
print('</p>')
print('</div>')

# "Colonial words": the 20 most frequent lower-cased tokens that are not in
# the treaty stop-word set, each shown with its count.
print('<div class="t_top_words" > <div style="-webkit-text-decoration-line: underline; text-decoration-line: underline;" >colonial words:</div>')
t_frequency_word = FreqDist(
    token.lower() for token in t_tokenized if token.lower() not in t_all_stopwords
)
t_top_words = t_frequency_word.most_common(20)
for t_chosen_words, t_frequency in t_top_words:
    print('<br><div class="t_chosen_words" >{}({}) </div>'.format(html.escape(t_chosen_words), t_frequency))
print('</div>')

# Close the treaty wrapper, then the document.
print('</div>')
print('''</body></html>''')