|
|
from __future__ import division

# Star import first, so every explicit import below rebinds its own name last
# and cannot be shadowed by whatever `nltk` happens to export.
from nltk import *

import base64
import codecs
import glob
import html
import re

import nltk
from nltk import sent_tokenize, word_tokenize, pos_tag
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from PIL import Image
|
|
|
# Make sure NLTK's stop-word corpus is available before it is read below.
# quiet=True silences the downloader's progress messages: this script writes
# its HTML page with print(), so nothing else should be emitted alongside it.
nltk.download('stopwords', quiet=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# infofile = open('faceapp_infos.txt','r')
|
|
|
# infotext = infofile.read()
|
|
|
|
|
|
# Read the two documents that the page renders: the FaceApp text and the
# Russia-Estonia treaty text.  Each file is opened in a `with` block so its
# handle is closed as soon as the contents are in memory; the names `file`
# and `t_file` remain bound afterwards (to the closed handles).
with open('faceapp.txt', 'r') as file:
    text = file.read()

#not sure if this works..
# NOTE(review): `x` is not read anywhere in the visible script; kept in case
# something outside this view relies on it.
x = 1

with open('russia-estonia.txt', 'r') as t_file:
    t_text = t_file.read()
|
|
|
|
|
|
# Stop words excluded from the word-frequency lists: NLTK's English list
# combined with the entries of stopwords.txt (one entry per line).
default_stopwords = set(stopwords.words('english'))

# Read the custom list inside a `with` block so the file handle is closed
# once its lines have been collected.
with codecs.open('stopwords.txt', 'r') as stopwords_file:
    custom_stopwords = set(stopwords_file.read().splitlines())

all_stopwords = default_stopwords | custom_stopwords
|
|
|
|
|
|
|
|
|
# Static opening of the generated page: doctype, <head> with the embedded
# stylesheet, the start of <body>, and the "Noun" checkbox with the small
# script that shows or hides its status paragraph.  The matching
# </body></html> is printed at the very end of the script.
#
# The stylesheet uses real CSS comments (/* ... */) for the disabled body
# declarations and the valid keyword `normal` for font-weight, so no
# declaration depends on the browser discarding invalid CSS.
PAGE_HEAD = '''<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title></title>
<style>

@font-face {
font-family: "Belgika";
src: url("http://bohyewoo.com/webfonts/belgika/belgika-40th-webfont.eot");
src: url("http://bohyewoo.com/webfonts/belgika/belgika-40th-webfont.woff") format("woff"),
url("http://bohyewoo.com/webfonts/belgika/belgika-40th-webfont.svg#filename") format("svg");
}

@font-face {
font-family: "Belgika";
src: url("http://bohyewoo.com/webfonts/belgika/belgika-16th-webfont.eot");
src: url("http://bohyewoo.com/webfonts/belgika/belgika-16th-webfont.woff") format("woff"),
url("http://bohyewoo.com/webfonts/belgika/belgika-16th-webfont.svg#filename") format("svg");
}

@font-face {
font-family: "Belgika";
src: url("http://bohyewoo.com/webfonts/belgika/belgika-8th-webfont.eot");
src: url("http://bohyewoo.com/webfonts/belgika/belgika-8th-webfont.woff") format("woff"),
url("http://bohyewoo.com/webfonts/belgika/belgika-8th-webfont.svg#filename") format("svg");
}

@font-face {
font-family: "SourceCodePro";
src: url("http://bohyewoo.com/webfonts/Source_Code_Pro/SourceCodePro-Regular.ttf");
}



body {
background-color: ghostwhite;
/* font-family: Belgika; */
/* font-weight: 8th; */
/* letter-spacing: -0.3px; */
font-size: 14px;
line-height: 1.2;

}


.tos_wrapper {
width: 50%;
float: left;
}

.t_wrapper {
width: 50%;
float: right;
}


.NNP {
background-color: pink;
}

.VBP {
}

.VBP:hover {
background-color: gold;
}

.NN {
background-color: LightSkyBlue;
}

.NNS {
background-color: Aquamarine;
}

.t_img {
font-family: SourceCodePro;
font-size: 10pt;
float: left;
}

.info {
font-family: SourceCodePro;
font-size: 10pt;
width: 60%;
float: left;
border: 1px solid black;
padding:10px;
margin-bottom: 50px;
}

.t_info {
font-family: SourceCodePro;
font-size: 10pt;
width: 90%;
float: left;
border: 1px solid black;
padding:10px;
margin-bottom: 50px;
}


.paragraph {
font-family: SourceCodePro;
font-weight: normal;
letter-spacing: -0.5px;
width: 70%;
float: right;
}

.t_paragraph {
font-family: SourceCodePro;
font-weight: normal;
letter-spacing: -0.5px;
width: 70%;
float: right;
}

.top_words {
font-family: Belgika;
font-weight: normal;
font-size: 9pt;
width: 15%;
float: left;
}

.t_top_words {
font-family: Belgika;
font-weight: normal;
font-size: 9pt;
width: 15%;
float: left;
}

</style>
</head>
<body>

<input type="checkbox" id="myCheck" onclick="myFunction()"> Noun

<p id="text" style="display:none">Checkbox is CHECKED!</p>

<script>
function myFunction() {
// Get the checkbox
var checkBox = document.getElementById("myCheck");
// Get the output text
var text = document.getElementById("text");

// If the checkbox is checked, display the output text
if (checkBox.checked == true){
text.style.display = "block";
} else {
text.style.display = "none";
}
}
</script>


'''

print(PAGE_HEAD)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Open the wrapper that holds the FaceApp half of the page (closed again
# when the second wrapper is opened further down).
print('<div class ="tos_wrapper">')

# Embed the FaceApp logo in the page as a base64 data URI.
# NOTE(review): the URL below points at the Treaty of Tartu photograph, not
# the FaceApp logo - it looks like a leftover from the other column.
# https://upload.wikimedia.org/wikipedia/commons/1/15/Joffe_signing_the_Treaty_of_Tartu.jpg
with open('img/faceapp_logo.png', 'rb') as logo_file:
    FaceApp_img_url = base64.b64encode(logo_file.read()).decode('utf-8')

# A data URI names the payload's MIME type (image/png), not a file path, and
# the heading is closed before the image so the <img> is not part of the <h1>.
FaceApp_image = '<div class="t_img"><h1>FaceApp</h1><img style="width:90%" src="data:image/png;base64,{}"></div>'.format(FaceApp_img_url)
print(FaceApp_image)
|
|
|
|
|
|
|
|
|
# FaceApp fact sheet: every (title, content) pair is printed as an underlined
# title followed by its content, all inside the "info" box.
print('<div class ="info">')

infotext = [
    ('Service', 'FaceApp'),
    ('Type', 'Image editing'),
    ('Initial release', 'December 31, 2016'),
    ('source', '<a href="https://www.faceapp.com/terms-en.html">link</a>'),
    ('Description', 'FaceApp is a mobile application for iOS and Android developed by Russian company Wireless Lab. The app generates highly realistic transformations of human faces in photographs by using neural networks based on artificial intelligence. The app can transform a face to make it smile, look younger, look older, or change gender.'),
]

# The content is inserted verbatim because some entries carry intentional
# markup (the source link).
for title, info in infotext:
    print('<span class="info_{0}"><div class="info_title" style="-webkit-text-decoration-line: underline; text-decoration-line: underline;" >{0}</div><div class="info_content">{1}</div></span><br>'.format(title, info))

print('</div>')
|
|
|
|
|
|
|
|
|
|
|
|
# FaceApp text, rendered token by token: each token is wrapped in a <span>
# whose class is its part-of-speech tag, which is what the stylesheet's
# .NN / .NNS / .NNP / .VBP rules attach to.
print('<div class ="paragraph">')

tokenized = word_tokenize(text)
tagged = pos_tag(tokenized)

# The tokens come from an external text file, so &, < and > are escaped to
# keep them from being parsed as markup.
for word, pos in tagged:
    print('<span class="{}">{}</span>'.format(pos, html.escape(word, quote=False)))

print('</div>')
|
|
|
|
|
|
|
|
|
|
|
|
# The 20 most frequent words of the FaceApp text, stop words excluded,
# printed as "word(count)" lines.
print('<div class="top_words" > <span style="-webkit-text-decoration-line: underline; text-decoration-line: underline;" >colonial words:</span>')

# Lower-case every token once, then count only those that are not stop words.
# Despite its name, tokens_without_stopwords is a FreqDist (word -> count).
lowered_tokens = (token.lower() for token in tokenized)
tokens_without_stopwords = nltk.FreqDist(token for token in lowered_tokens if token not in all_stopwords)
# Old name kept as an alias of the same distribution; building a second
# FreqDist from the first only produced an identical copy.
frequency_word = tokens_without_stopwords
top_words = tokens_without_stopwords.most_common(20)

# Words originate from the external text file, so escape them for HTML.
for chosen_words, frequency in top_words:
    print('<br><span class="chosen_words" >{}({}) </span>'.format(html.escape(chosen_words, quote=False), frequency))

print('</div>')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Close the FaceApp wrapper and open the second one, which holds the treaty
# half of the page.
print('</div><div class="t_wrapper">')

# Embed the treaty photograph in the page as a base64 data URI.
# https://upload.wikimedia.org/wikipedia/commons/1/15/Joffe_signing_the_Treaty_of_Tartu.jpg
with open('img/tartu.jpeg', 'rb') as treaty_image_file:
    img_url = base64.b64encode(treaty_image_file.read()).decode('utf-8')

# A data URI names the payload's MIME type (image/jpeg), not a file path, and
# the heading needs a closing </h1> rather than a second opening tag.
t_image = '<div class="t_img"><h1>Peace Treaty of Tartu</h1><img style="width:90%" src="data:image/jpeg;base64,{}"></div>'.format(img_url)
print(t_image)
|
|
|
|
|
|
|
|
|
# Treaty fact sheet: every (title, content) pair is printed as an underlined
# title followed by its content, all inside the "t_info" box.
print('<div class ="t_info">')

# The en dash in the Wikipedia article name is percent-encoded (%E2%80%93)
# so the printed markup is pure ASCII and does not depend on the encoding
# of the output stream.
t_infotext = [
    ('Name of Treaty', 'Peace Treaty of Tartu'),
    ('Date', 'February 2, 1920'),
    ('Location', 'Tartu, Estonia'),
    ('Signed', 'February 2, 1920'),
    ('Type', 'bilateral peace treaty'),
    ('source', '<a href="https://en.wikipedia.org/wiki/Treaty_of_Tartu_(Russian%E2%80%93Estonian)">link</a>'),
    ('Description', 'The Tartu Peace Treaty or Treaty of Tartu is a peace treaty between Estonia and Russian Soviet Federative Socialist Republic signed on 2 February 1920, ending the Estonian War of Independence.'),
]

# The content is inserted verbatim because some entries carry intentional
# markup (the source link).
for t_title, t_info in t_infotext:
    print('<span class="t_info-{0}"><div class="info_t_title" style="-webkit-text-decoration-line: underline; text-decoration-line: underline;" >{0}</div><div class="t_info_content">{1}</div></span><br>'.format(t_title, t_info))

print('</div>')
|
|
|
|
|
|
|
|
|
# Treaty text, rendered token by token: each token is wrapped in a <span>
# whose class is its part-of-speech tag, which is what the stylesheet's
# .NN / .NNS / .NNP / .VBP rules attach to.
print('<div class="t_paragraph">')

t_tokenized = word_tokenize(t_text)
t_tagged = pos_tag(t_tokenized)

# The tokens come from an external text file, so &, < and > are escaped to
# keep them from being parsed as markup.
for t_word, t_pos in t_tagged:
    print('<span class="{}">{}</span>'.format(t_pos, html.escape(t_word, quote=False)))

print('</div>')
|
|
|
|
|
|
|
|
|
# The 20 most frequent words of the treaty text, stop words excluded,
# printed as "word(count)" lines.
print('<div class="t_top_words" > <span style="-webkit-text-decoration-line: underline; text-decoration-line: underline;" >colonial words:</span>')

# Lower-case every token once, then count only those that are not stop words.
# Despite its name, t_tokens_without_stopwords is a FreqDist (word -> count).
t_lowered_tokens = (token.lower() for token in t_tokenized)
t_tokens_without_stopwords = nltk.FreqDist(token for token in t_lowered_tokens if token not in all_stopwords)
# Old name kept as an alias of the same distribution; building a second
# FreqDist from the first only produced an identical copy.
t_frequency_word = t_tokens_without_stopwords
t_top_words = t_tokens_without_stopwords.most_common(20)

# Words originate from the external text file, so escape them for HTML.
for t_chosen_words, t_frequency in t_top_words:
    print('<br><span class="t_chosen_words" >{}({}) </span>'.format(html.escape(t_chosen_words, quote=False), t_frequency))

print('</div>')
|
|
|
|
|
|
|
|
|
|
|
|
# Close the second (treaty) wrapper, then finish the HTML document.
for closing_markup in ('</div>', '</body></html>'):
    print(closing_markup)
|
|
|
|
|
|
|