You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
351 lines
8.8 KiB
Python
351 lines
8.8 KiB
Python
4 years ago
|
from __future__ import division
|
||
|
from nltk import sent_tokenize, word_tokenize, pos_tag
|
||
|
from nltk.probability import FreqDist
|
||
|
from nltk.corpus import stopwords
|
||
4 years ago
|
import nltk
|
||
|
import glob
|
||
|
import re
|
||
|
import codecs
|
||
4 years ago
|
import base64
|
||
|
|
||
|
|
||
4 years ago
|
# Fetch the NLTK stopword corpus used for the stopword sets further down.
nltk.download('stopwords')

# Terms-of-service source: the raw text plus its blank-line separated chunks.
with open('tos_file/faceapp.txt', 'r') as tos_handle:
    faceapp_text = tos_handle.read()
faceapp_text_list = faceapp_text.split("\n\n")

# Treaty source, loaded the same way.
with open('treaty_file/russia-estonia.txt', 'r') as treaty_handle:
    russia_text = treaty_handle.read()
russia_text_list = russia_text.split("\n\n")
|
||
4 years ago
|
|
||
|
|
||
4 years ago
|
# Each document gets its own stopword set: NLTK's English list merged with a
# custom list read from a text file (one entry per line).
# The custom files are read inside `with` blocks so their handles are closed
# immediately instead of being left open for the rest of the run.

#tos stopwords
tos_default_stopwords = set(stopwords.words('english'))
with codecs.open('stopwords.txt', 'r') as tos_stopwords_file:
    tos_custom_stopwords = set(tos_stopwords_file.read().splitlines())
tos_all_stopwords = tos_default_stopwords | tos_custom_stopwords

#treaty stopwords
t_default_stopwords = set(stopwords.words('english'))
with codecs.open('t_stopwords.txt', 'r') as t_stopwords_file:
    t_custom_stopwords = set(t_stopwords_file.read().splitlines())
t_all_stopwords = t_default_stopwords | t_custom_stopwords
|
||
4 years ago
|
|
||
|
|
||
|
# Static page head, emitted as one multi-line string: jQuery include, web-font
# declarations, the CSS for the two side-by-side columns, and the click
# behaviour for the part-of-speech tagged <span> tokens printed later on.
#
# Notes on the markup:
# - Disabled CSS declarations are wrapped in /* ... */; '#' is not a CSS
#   comment marker and was only ignored through parser error recovery.
# - `font-weight: normal` replaces the invalid value `regular`.
# - The script installs a single click handler on every span. The earlier
#   `if (...click(...)); else (...)` form never reached its else branch
#   (a jQuery object is always truthy), so highlighting was never wired up.
# - Resetting the highlight clears the inline styles rather than forcing
#   black-on-white, so words hidden through the `hide` class stay hidden.
print('''<!DOCTYPE html>
<html>
<head>
<script src="https://code.jquery.com/jquery-3.5.0.min.js"></script>

<meta charset="utf-8">
<title></title>
<style>

@font-face {
  font-family: "Belgika";
  src: url("http://bohyewoo.com/webfonts/belgika/belgika-40th-webfont.eot");
  src: url("http://bohyewoo.com/webfonts/belgika/belgika-40th-webfont.woff") format("woff"),
       url("http://bohyewoo.com/webfonts/belgika/belgika-40th-webfont.svg#filename") format("svg");
}

@font-face {
  font-family: "Belgika";
  src: url("http://bohyewoo.com/webfonts/belgika/belgika-16th-webfont.eot");
  src: url("http://bohyewoo.com/webfonts/belgika/belgika-16th-webfont.woff") format("woff"),
       url("http://bohyewoo.com/webfonts/belgika/belgika-16th-webfont.svg#filename") format("svg");
}

@font-face {
  font-family: "Belgika";
  src: url("http://bohyewoo.com/webfonts/belgika/belgika-8th-webfont.eot");
  src: url("http://bohyewoo.com/webfonts/belgika/belgika-8th-webfont.woff") format("woff"),
       url("http://bohyewoo.com/webfonts/belgika/belgika-8th-webfont.svg#filename") format("svg");
}

@font-face {
  font-family: "SourceCodePro";
  src: url("http://bohyewoo.com/webfonts/Source_Code_Pro/SourceCodePro-Regular.ttf");
}

body {
}

.tos_wrapper {
  /* background-color: yellow; */
  width: 49.3%;
  float: left;
  margin-right: 20px;
  border-right: 2px solid black;
}

.t_wrapper {
  /* background-color: green; */
  width: 49.3%;
  float: right;
}

.t_img {
  /* background-color: Aquamarine; */
  font-family: helvetica;
  font-weight: normal;
  font-size: 20pt;
  float: left;
}

.img {
  width: 30vh;
}

.info {
  /* background-color: LightSkyBlue; */
  font-family: SourceCodePro;
  font-size: 10pt;
  width: 30vh;
  float: right;

  /* border: 2px solid black; */
  margin: 10px;
  text-align: center;
}

.t_info {
  /* background-color: LightSkyBlue; */
  font-family: SourceCodePro;
  font-size: 10pt;
  width: 30vh;
  float: right;

  /* border: 2px solid black; */
  padding: 10px;
  text-align: center;
}

.paragraph {
  /* background-color: gold; */
  font-family: helvetica;
  font-weight: normal;
  font-size: 40px;
  width: 98%;
  padding: 10px;
  margin-top: 70px;
  float: right;
}

.t_paragraph {
  /* background-color: gold; */
  font-family: helvetica;
  font-weight: normal;
  font-size: 40px;
  width: 98%;
  margin-top: 70px;
  float: right;
}

.top_words {
  /* background-color: purple; */
  font-family: helvetica;
  font-size: 15pt;
  width: 100%;
  column-count: 4;
  float: left;
}

.top_words_title {
  /* background-color: yellow; */
  width: 100%;
}

.chosen_words {
  /* background-color: pink; */
  line-height: 0.1;
  float: none;
}

.t_top_words {
  /* background-color: purple; */
  font-family: helvetica;
  font-size: 15pt;
  width: 100%;
  column-count: 4;
  float: left;
}

.t_top_words_title {
  /* background-color: yellow; */
  width: 100%;
}

.t_chosen_words {
  /* background-color: pink; */
  line-height: 0.1;
  float: none;
}

.hide {
  color: white;
}

</style>
</head>
<body>
<script>

$(document).ready(function() {
  var functionWords = '.TO, .DT, .IN, .WDT, .WP, .MD, .PRP, .CC, .VB, .VBZ, .CD';

  $('span').click(function() {
    // function words: show / hide all of them together
    if ($(this).is(functionWords)) {
      $(functionWords).toggleClass('hide');
      return;
    }

    // any other token: highlight every span carrying the same tag
    var selectedclass = $(this).attr('class');
    $('span').css('background-color', "").css('color', ""); // reset
    $('span').filter(function() {
      // compare the attribute directly: tags such as "PRP$" or "," are not valid in a selector
      return $(this).attr('class') === selectedclass;
    }).css('background-color', '#FF4500').css('color', 'white'); //highlighting the select
  });
});
</script>

''')
|
||
|
|
||
|
|
||
|
|
||
|
|
||
4 years ago
|
|
||
4 years ago
|
#wrapper (left column; closed again when the treaty wrapper is opened)
print('<div class ="tos_wrapper">')

#insert an image
# The logo is inlined as a base64 data URI. The file is read inside a `with`
# block so the handle is closed, the heading is closed before the image, and
# the data URI carries a real media type (image/png) instead of a file path.
with open('img/faceapp_logo.png', 'rb') as logo_file:
    FaceApp_img_url = base64.b64encode(logo_file.read()).decode('utf-8')
FaceApp_image = '<div class="t_img"><h1>FaceApp</h1><img style="width:90%" src="data:image/png;base64,{}"></div>'.format(FaceApp_img_url)
print(FaceApp_image)

#info box
print('<div class ="info">')
# (label, content) pairs; content may contain HTML and is inserted verbatim.
infotext = [
    ('Service', 'FaceApp'),
    ('Country', 'Russia'),
    ('Type', 'Image editing'),
    ('Initial release', 'December 31, 2016'),
    ('source', '<a href="https://www.faceapp.com/terms-en.html">link</a>'),
    ('Description', 'FaceApp is a mobile application for iOS and Android developed by Russian company Wireless Lab. The app generates highly realistic transformations of human faces in photographs by using neural networks based on artificial intelligence. The app can transform a face to make it smile, look younger, look older, or change gender.'),
]

for title, info in infotext:
    print('<div class="info_{0}"><div class="info_title" style="-webkit-text-decoration-line: underline; text-decoration-line: underline;" >{0}</div><div class="info_content">{1}</div></div><br>'.format(title, info))

print('</div>')
|
||
|
|
||
|
|
||
|
|
||
|
#ToS text
# The whole document is tokenised and tagged in one pass (a per-paragraph
# variant would iterate faceapp_text_list instead). Every token is emitted as
# a <span> whose class is its part-of-speech tag, one span per output line.
print('<div class ="paragraph">')

tokenized = word_tokenize(faceapp_text)
tagged = pos_tag(tokenized)

tos_spans = ['<span class="{}">{}</span>'.format(pos, word) for word, pos in tagged]
print('\n'.join(['<p>'] + tos_spans + ['</p>', '</div>']))
|
||
|
|
||
|
|
||
4 years ago
|
|
||
|
#colonial words list
# Lists the 20 most frequent ToS tokens (lower-cased, stopwords removed)
# together with their counts.
print('<div class="top_words"> <div style="-webkit-text-decoration-line: underline; text-decoration-line: underline;" >colonial words:</div>')

# Lower-case each token once, then count the ones that are not stopwords.
lowered_tokens = (token.lower() for token in tokenized)
tokens_without_stopwords = nltk.FreqDist(token for token in lowered_tokens if token not in tos_all_stopwords)
top_words = tokens_without_stopwords.most_common(20)

for chosen_words, frequency in top_words:
    print('<br><div class="chosen_words" >{}({}) </div>'.format(chosen_words, frequency))

print('</div>')
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
4 years ago
|
#t_wrapper (second wrapper)
# Closes the ToS column and opens the treaty column.
print('</div><div class="t_wrapper">')

#insert an image
# image source (presumably): https://upload.wikimedia.org/wikipedia/commons/1/15/Joffe_signing_the_Treaty_of_Tartu.jpg
# The picture is inlined as a base64 data URI. The file is read inside a
# `with` block so the handle is closed, the heading is properly closed with
# </h1>, and the data URI carries a real media type (image/jpeg).
with open('img/tartu.jpeg', 'rb') as treaty_image_file:
    img_url = base64.b64encode(treaty_image_file.read()).decode('utf-8')
t_image = '<div class="t_img"><h1>Peace Treaty of Tartu</h1><img style="width:90%" src="data:image/jpeg;base64,{}"></div>'.format(img_url)
print(t_image)

#t_info box
print('<div class ="t_info">')
# (label, content) pairs; content may contain HTML and is inserted verbatim.
t_infotext = [
    ('Name of Treaty', 'Peace Treaty of Tartu'),
    ('Country', 'Russia'),
    ('Date', 'February 2, 1920'),
    ('Location', 'Tartu, Estonia'),
    ('Signed', 'February 2, 1920'),
    ('Type', 'bilateral peace treaty'),
    ('source', '<a href="https://en.wikipedia.org/wiki/Treaty_of_Tartu_(Russian-Estonian)">link</a>'),
    ('Description', 'The Tartu Peace Treaty or Treaty of Tartu is a peace treaty between Estonia and Russian Soviet Federative Socialist Republic signed on 2 February 1920, ending the Estonian War of Independence.'),
]

for t_title, t_info in t_infotext:
    print('<div class="t_info-{0}"><div class="info_t_title" style="-webkit-text-decoration-line: underline; text-decoration-line: underline;" >{0}</div><div class="t_info_content">{1}</div></div><br>'.format(t_title, t_info))

print('</div>')
|
||
|
|
||
|
|
||
|
#treaty text
# Tokenise and tag the whole treaty, then emit every token as a <span> whose
# class is its part-of-speech tag. The spans are wrapped in <p>...</p>, the
# same structure the ToS column uses; previously a lone <p> was opened after
# the spans and never closed.
print('<div class="t_paragraph">')
t_tokenized = word_tokenize(russia_text)
t_tagged = pos_tag(t_tokenized)

print('<p>')
for t_word, t_pos in t_tagged:
    print('<span class="{}">{}</span>'.format(t_pos, t_word))
print('</p>')
print('</div>')
|
||
|
|
||
|
|
||
|
#treaty colonial words list
# Lists the 20 most frequent treaty tokens (lower-cased, stopwords removed)
# together with their counts.
print('<div class="t_top_words" > <div style="-webkit-text-decoration-line: underline; text-decoration-line: underline;" >colonial words:</div>')

# Lower-case each token once, then count the ones that are not stopwords.
t_lowered_tokens = (token.lower() for token in t_tokenized)
t_tokens_without_stopwords = nltk.FreqDist(token for token in t_lowered_tokens if token not in t_all_stopwords)
t_top_words = t_tokens_without_stopwords.most_common(20)

for t_chosen_words, t_frequency in t_top_words:
    print('<br><div class="t_chosen_words" >{}({}) </div>'.format(t_chosen_words, t_frequency))

print('</div>')

# Close the treaty wrapper, then the document.
print('</div>')
print('''</body></html>''')
|
||
|
|
||
|
|