|
|
# from __future__ import division
import base64
import codecs
import html

import nltk
from nltk import sent_tokenize, word_tokenize, pos_tag
from nltk.corpus import stopwords
from nltk.probability import FreqDist
|
|
|
|
|
|
|
|
|
# Make sure the NLTK stopword corpus is installed before it is used below.
# Only contact the downloader when the corpus is actually missing, so a
# normal run needs no network access. quiet=True suppresses the downloader's
# progress messages so they cannot get mixed into the HTML this script
# prints.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords', quiet=True)
|
|
|
|
|
|
|
|
|
# Load the two source documents and split each one into paragraphs
# (paragraphs are separated by a blank line, i.e. "\n\n").
# The encoding is stated explicitly: the text contains non-ASCII characters
# such as the typographic apostrophe handled further down, and the generated
# page declares charset utf-8, so decoding must not depend on the platform's
# default locale.
with open('tos_file/faceapp.txt', 'r', encoding='utf-8') as faceapp_file:
    faceapp_text = faceapp_file.read()
faceapp_text_list = faceapp_text.split("\n\n")

# NOTE(review): russia_text_list is not used anywhere in the visible part of
# this script; it is still loaded so any later consumer keeps working.
with open('treaty_file/russia-estonia.txt', 'r', encoding='utf-8') as russia_file:
    russia_text = russia_file.read()
russia_text_list = russia_text.split("\n\n")
|
|
|
|
|
|
|
|
|
def _read_word_list(path):
    """Return the lines of the UTF-8 text file *path* as a set of strings.

    Each line of the file is one entry; line terminators are dropped.
    """
    # The context manager closes the handle as soon as the file is read;
    # the previous codecs.open(...).read() chain never closed it.
    with open(path, 'r', encoding='utf-8') as word_file:
        return set(word_file.read().splitlines())


#tos stopwords
# NLTK's English list, the project's own list, and the union of both.
tos_default_stopwords = set(stopwords.words('english'))
tos_custom_stopwords = _read_word_list('stopwords.txt')
tos_all_stopwords = tos_default_stopwords | tos_custom_stopwords

#treaty stopwords
# Same construction, using the treaty-specific custom list.
t_default_stopwords = set(stopwords.words('english'))
t_custom_stopwords = _read_word_list('t_stopwords.txt')
t_all_stopwords = t_default_stopwords | t_custom_stopwords
|
|
|
|
|
|
|
|
|
# Document preamble: doctype, <head> with scripts/stylesheets, and the
# opening <body> tag. The lines are kept in a list and joined with newlines
# so each tag can be read (and diffed) on its own.
# NOTE(review): "<!DOCTYPE>" has no "html" name, which browsers treat as a
# quirks-mode doctype; the stylesheets may rely on that, so it is left
# untouched here. jQuery is also loaded twice (3.5.0 and 3.5.1) - confirm
# whether both are needed.
head_markup = [
    '<!DOCTYPE>',
    '<html>',
    '<head>',
    '<script src="https://code.jquery.com/jquery-3.5.0.min.js"></script>',
    '<script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>',
    '<link rel="stylesheet" href="faceapp.css">',
    '<link rel="stylesheet" href="legend.css">',
    '<link rel="stylesheet" href="highlight.css">',
    '<script src="highlight.js"></script>',
    '<meta charset="utf-8">',
    '',
    '<title></title>',
    '',
    '',
    '</head>',
    '<body>',
]
print('\n'.join(head_markup))
|
|
|
|
|
|
|
|
|
#wrapper
# Opens the page-level wrapper and the intro column; both are closed further
# down in the script.
print('<div class ="tos_wrapper"><div class="intro">')

#insert an image
# The logo is inlined as a base64 data URI so the page needs no separate
# image request. The file is read inside a `with` block so the handle is
# closed right away instead of being left to the garbage collector.
with open('img/faceapp_logo.png', 'rb') as logo_file:
    FaceApp_img_url = base64.b64encode(logo_file.read()).decode('utf-8')

# The part after "data:" must be a media type, not a file path; the previous
# value ("img/faceapp_logo.png") was not a valid media type, so "image/png"
# is used for this PNG file.
FaceApp_image = '<div class="img" style="position: fixed; background-color: gainsboro;">FaceApp</div><br><img class="image" src="data:image/png;base64,{}">'.format(FaceApp_img_url)
print(FaceApp_image)
|
|
|
|
|
|
|
|
|
#info box
# Fact sheet shown next to the logo: each (title, content) pair becomes one
# row. Content is inserted as raw HTML (the "Original Source" row carries a
# link), and the title is reused verbatim inside the row's class attribute.
print('<div class ="info">')

infotext = [
    ('Name of Service', 'FaceApp'),
    ('Country of Origin', 'Russia'),
    ('Initial release', 'December 31, 2016'),
    ('Type', 'Image editing'),
    ('Word Counts', '5,392'),
    ('Original Source', '<a href="https://www.faceapp.com/terms-en.html" target="_blank">link</a>'),
    ('Description', 'FaceApp is a mobile application for iOS and Android developed by Russian company Wireless Lab. The app generates highly realistic transformations of human faces in photographs by using neural networks based on artificial intelligence. The app can transform a face to make it smile, look younger, look older, or change gender.'),
]

# One row template, filled once per entry.
info_row_template = '<div class="info_{0}" ><div class="info_title" ><b>{0}</b></div><div class="info_content">{1}</div></div><br>'
for title, info in infotext:
    info_row = info_row_template.format(title, info)
    print(info_row)

# Closes the info box and the intro column.
print('</div></div>')
|
|
|
|
|
|
# Colour legend: one <li> per highlight category. Each entry is
# (class suffix, visible label); the class is "legend" + suffix.
# NOTE(review): "possesive" and "superative" are misspelled in both the class
# names and the labels. They are reproduced exactly because the stylesheets /
# highlight.js presumably reference these class names - verify before
# correcting the spelling.
legend_entries = [
    ('hide', 'stopwords'),
    ('adjective', 'adjective'),
    ('verb', 'verb'),
    ('noun', 'noun'),
    ('propernoun', 'proper noun'),
    ('adverb', 'adverb'),
    ('possesivepronoun', 'possesive pronoun'),
    ('presentparticiple', 'present participle'),
    ('adjectivesuperlative', 'adjective superlative'),
    ('adverb-comparative-superative', 'adverb comparative + superative'),
]
legend_items = '\n'.join(
    '<li class="legend{} eachlegend">{}</li>'.format(css_suffix, label)
    for css_suffix, label in legend_entries
)
print('\n<div class="legend">\n' + legend_items + '\n</div>\n')
|
|
|
|
|
|
#ToS text
# Render every paragraph of the terms of service as a <p> whose tokens are
# individual <span> elements. Each span carries two classes derived from the
# token - its part-of-speech tag and its lower-cased text - plus "eachwords".

# Punctuation characters are spelled out so they can serve as class names.
# None of the replacement words contains a character that is itself mapped,
# so one translate() pass gives the same result as replacing one by one.
_PUNCTUATION_NAMES = str.maketrans({
    '.': 'dot',
    ',': 'comma',
    '(': 'marks',
    ')': 'marks',
    ':': 'marks',
    ';': 'marks',
})


def _pos_css_class(pos):
    """Return the class name for part-of-speech tag *pos*.

    'PRP$' becomes 'PRPS'; punctuation is replaced by its spelled-out name.
    """
    return pos.replace('PRP$', 'PRPS').translate(_PUNCTUATION_NAMES)


def _word_css_class(word):
    """Return the class name for token *word*.

    The typographic apostrophe becomes 'apostrophe', punctuation is replaced
    by its spelled-out name, and the result is lower-cased.
    """
    return word.replace('’', 'apostrophe').translate(_PUNCTUATION_NAMES).lower()


def _escape_attribute(value):
    """Escape *value* for use inside a double-quoted HTML attribute."""
    # Single quotes are harmless inside a double-quoted attribute, so only
    # &, <, > and " are rewritten.
    return html.escape(value, quote=False).replace('"', '&quot;')


print('<div class ="paragraph">')

tokenized_all = []
for paragraph in faceapp_text_list:
    tokenized = word_tokenize(paragraph)
    tokenized_all += tokenized  # collected for the frequency list printed later
    tagged = pos_tag(tokenized)

    print('<p>')
    for word, pos in tagged:
        # Tokens come straight from the source text; escape them so that
        # characters such as & or < cannot corrupt the generated markup.
        print('<span class="{0} {1} eachwords">{2}</span>'.format(
            _escape_attribute(_pos_css_class(pos)),
            _escape_attribute(_word_css_class(word)),
            html.escape(word, quote=False),
        ))
    print('</p>')

print('</div>')
|
|
|
|
|
|
|
|
|
#tos top words list
# Box listing the 30 most frequent lower-cased tokens with their counts.
print('<div class="top_words"><div class="top_words_title" ><b>Frequent words</b></div>')

# NOTE(review): only the custom stopword list is filtered out here;
# tos_all_stopwords (custom + NLTK English) is built earlier but not used.
# Confirm whether the narrower filter is intentional.
tokens_without_stopwords = nltk.FreqDist(
    token
    for token in (raw_token.lower() for raw_token in tokenized_all)
    if token not in tos_custom_stopwords
)
# A copy of the distribution; not read again in this part of the script.
frequency_word = FreqDist(tokens_without_stopwords)
top_words = tokens_without_stopwords.most_common(30)

for chosen_words, frequency in top_words:
    # Escape the token so characters such as & or < stay valid HTML text.
    print('<div class="chosen_words" > {} ({}) </div>'.format(
        html.escape(chosen_words, quote=False), frequency))

# Closes top_words and tos_wrapper - the only two <div>s still open at this
# point. The third "</div>" that used to be printed here had no matching
# open element.
print('</div></div>')
|
|
|
|
|
|
|
|
|
# end of the document
# Every <div> opened by this script (tos_wrapper, intro, info, paragraph,
# top_words) has already been closed by the time execution reaches this
# point, so the two extra "</div>" tags that used to be printed here had no
# matching open element and are no longer emitted.
print('''</body></html>''')
|
|
|
|