You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

114 lines
3.9 KiB
Python

# from __future__ import division
import base64
import codecs
import html

import nltk
from nltk import sent_tokenize, word_tokenize, pos_tag
from nltk.corpus import stopwords
from nltk.probability import FreqDist
# Fetch the NLTK stop-word corpus that the stop-word sets below are built from.
nltk.download('stopwords')

# Terms-of-service text, split into paragraphs on blank lines.
# The encoding is spelled out so the result does not depend on the platform
# locale; the generated page declares UTF-8 as well.
# NOTE(review): assumes the text files are stored as UTF-8 -- confirm.
with open('tos_file/faceapp.txt', 'r', encoding='utf-8') as faceapp_file:
    faceapp_text = faceapp_file.read()
faceapp_text_list = faceapp_text.split("\n\n")

# Treaty text, read and split the same way.
with open('treaty_file/russia-estonia.txt', 'r', encoding='utf-8') as russia_file:
    russia_text = russia_file.read()
russia_text_list = russia_text.split("\n\n")
def _read_stopword_file(path):
    """Return the lines of the file at *path* as a set of strings.

    The file is opened in a ``with`` block so the handle is closed as soon
    as it has been read, instead of being left open for the rest of the run.
    """
    # NOTE(review): assumes the stop-word lists are stored as UTF-8 -- confirm.
    with open(path, 'r', encoding='utf-8') as stopword_file:
        return set(stopword_file.read().splitlines())


#tos stopwords
# NLTK's English list combined with the project's own additions.
tos_default_stopwords = set(stopwords.words('english'))
tos_custom_stopwords = _read_stopword_file('stopwords.txt')
tos_all_stopwords = tos_default_stopwords | tos_custom_stopwords

#treaty stopwords
# Same construction, with the treaty-specific additions file.
t_default_stopwords = set(stopwords.words('english'))
t_custom_stopwords = _read_stopword_file('t_stopwords.txt')
t_all_stopwords = t_default_stopwords | t_custom_stopwords
# Opening boilerplate of the generated page: doctype, the <head> section
# (jQuery, stylesheet, highlight script, charset, empty title) and the
# opening <body> tag. Each entry is emitted on its own output line.
_PAGE_HEAD_LINES = (
    '<!DOCTYPE>',
    '<html>',
    '<head>',
    '<script src="https://code.jquery.com/jquery-3.5.0.min.js"></script>',
    '<link rel="stylesheet" href="css/faceapp.css">',
    '<script src="js/highlight.js"></script>',
    '<meta charset="utf-8">',
    '<title></title>',
    '</head>',
    '<body>',
)
print('\n'.join(_PAGE_HEAD_LINES))
# $('div.t_chosen_words_{0}').mouseover(function(){
# $(this).find('.t_chosen_words_{0}').text('i + 'is here').css('color', 'red');
# })
# .mouseout(function() {
# $( this ).find( ".t_chosen_words" ).text( " " ).css('color', 'black');
# });
#wrapper
# Opens the outer wrapper and the intro section (logo + info box).
print('<div class ="tos_wrapper"><div class="intro">')

#insert an image
# https://upload.wikimedia.org/wikipedia/commons/1/15/Joffe_signing_the_Treaty_of_Tartu.jpg
# The logo is embedded into the page as a base64 data URI. The file is read
# inside a ``with`` block so the handle is closed right away.
with open('img/faceapp_logo.png', 'rb') as logo_file:
    FaceApp_img_url = base64.b64encode(logo_file.read()).decode('utf-8')
# The part of a data URI before ";base64" is a media type, not a file path,
# so it has to be "image/png" for this PNG file.
FaceApp_image = '<div class="img">FaceApp<br><img class="image" src="data:image/png;base64,{}"></div>'.format(FaceApp_img_url)
print(FaceApp_image)

#info box
print('<div class ="info">')
# (title, content) pairs shown in the info box, in display order.
infotext = [
    ('Name of Service', 'FaceApp'),
    ('Country of Origin', 'Russia'),
    ('Initial release', 'December 31, 2016'),
    ('Type', 'Image editing'),
    ('Word Counts', '5,392'),
    ('Original Source', '<a href="https://www.faceapp.com/terms-en.html">link</a>'),
    ('Description',
     'FaceApp is a mobile application for iOS and Android developed by '
     'Russian company Wireless Lab. The app generates highly realistic '
     'transformations of human faces in photographs by using neural '
     'networks based on artificial intelligence. The app can transform a '
     'face to make it smile, look younger, look older, or change gender.'),
]
for title, info in infotext:
    # NOTE(review): titles containing spaces produce several space-separated
    # class tokens (e.g. "info_Name", "of", "Service"). Left unchanged because
    # the stylesheet that may target these classes is not visible here.
    print('<div class="info_{0}" ><div class="info_title" ><b>{0}</b></div><div class="info_content">{1}</div></div><br>'.format(title, info))
# Closes the info box and the intro section.
print('</div></div>')
#ToS text
# Punctuation characters that are swapped for word substitutes when a POS tag
# or a token is used as a CSS class on its <span>.
_CLASS_SUBSTITUTES = str.maketrans({
    '.': 'dot',
    ',': 'comma',
    '(': 'marks',
    ')': 'marks',
    ':': 'marks',
    ';': 'marks',
})

print('<div class ="paragraph">')
for paragraph in faceapp_text_list:
    # One <p> per paragraph, one <span> per token. Each span carries two
    # classes: the token's POS tag and the lower-cased token itself.
    tokenized = word_tokenize(paragraph)
    tagged = pos_tag(tokenized)
    print('<p>')
    for word, pos in tagged:
        pos_class = pos.translate(_CLASS_SUBSTITUTES)
        word_class = word.translate(_CLASS_SUBSTITUTES).lower()
        # Escape everything that comes from the source text so that tokens
        # containing &, <, > or quotes cannot break the attribute or the
        # surrounding markup.
        print('<span class="{0} {1}">{2}</span>'.format(
            html.escape(pos_class),
            html.escape(word_class),
            html.escape(word, quote=False),
        ))
    print('</p>')
print('</div>')
#tos top words list
print('<div class="top_words"><div class="top_words_title" ><b>Frequent words</b></div>')
# Count over every paragraph of the terms of service. Relying on the
# `tokenized` variable left over from the paragraph loop would only cover
# the final paragraph, so the tokens are gathered from the full text here.
all_tokens = (
    token.lower()
    for paragraph in faceapp_text_list
    for token in word_tokenize(paragraph)
)
tokens_without_stopwords = nltk.FreqDist(
    token for token in all_tokens if token not in tos_all_stopwords
)
# NOTE(review): not read anywhere in the visible script; kept so the
# module-level name stays available.
frequency_word = FreqDist(tokens_without_stopwords)
# The 30 most frequent remaining tokens, shown as "word (count)".
top_words = tokens_without_stopwords.most_common(30)
for chosen_words, frequency in top_words:
    # Tokens come from the source text, so escape them before embedding.
    print('<div class="chosen_words" >&nbsp;{}&nbsp;({}) </div>'.format(html.escape(chosen_words, quote=False), frequency))
print('</div></div></div>')
# at the end of wrapper
# Emits the trailing markup, one piece per output line.
# NOTE(review): counting the tags printed earlier in this script, the opened
# <div>s already appear to be closed by this point; the extra end tags are
# kept so the generated markup stays unchanged.
for closing_markup in ('</div>', '</div>', '</body></html>'):
    print(closing_markup)