|
|
# from __future__ import division
|
|
|
import base64
import codecs
import html

import nltk
from nltk import sent_tokenize, word_tokenize, pos_tag
from nltk.corpus import stopwords
from nltk.probability import FreqDist
|
|
|
|
|
|
|
|
|
# Fetch the NLTK stop-word corpus used to build tos_default_stopwords.
# quiet=True suppresses the downloader's progress messages: this script
# writes the generated HTML page with print(), so nothing else should be
# able to end up in that stream.
nltk.download('stopwords', quiet=True)
|
|
|
|
|
|
|
|
|
# Read the complete terms-of-service document and split it into paragraphs
# on blank lines.
# The encoding is explicit so the result does not depend on the platform
# default; the text is expected to contain typographic apostrophes (the
# token renderer replaces them).
# NOTE(review): assumes the file is stored as UTF-8 -- confirm.
with open('tos_file/facebook.txt', 'r', encoding='utf-8') as faceapp_file:
    faceapp_text = faceapp_file.read()

faceapp_text_list = faceapp_text.split("\n\n")
|
|
|
|
|
|
# Stop words: NLTK's English list, the project's own list (one entry per
# line in stopwords.txt), and the union of both.
tos_default_stopwords = set(stopwords.words('english'))

# Read the custom list inside a context manager so the file handle is
# closed deterministically instead of being left to the garbage collector.
# NOTE(review): assumes stopwords.txt is stored as UTF-8 -- confirm.
with open('stopwords.txt', 'r', encoding='utf-8') as custom_stopwords_file:
    tos_custom_stopwords = set(custom_stopwords_file.read().splitlines())

tos_all_stopwords = tos_default_stopwords | tos_custom_stopwords
|
|
|
|
|
|
# multi-line string HTML
|
|
|
# Emit the start of the page: the doctype, the opening <html> tag, a <head>
# that pulls in jQuery, the two stylesheets (faceapp.css, legend.css) and
# highlight.js, declares the utf-8 charset and leaves <title> empty, and
# finally the opening <body> tag.
# NOTE(review): the doctype is written as a bare <!DOCTYPE> (no "html"), and
# jQuery is included twice from two CDNs in different versions (3.5.0 and
# 3.5.1) -- confirm whether both are intentional.
print('''<!DOCTYPE>
|
|
|
<html>
|
|
|
<head>
|
|
|
<script src="https://code.jquery.com/jquery-3.5.0.min.js"></script>
|
|
|
<script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
|
|
|
<link rel="stylesheet" href="faceapp.css">
|
|
|
<link rel="stylesheet" href="legend.css">
|
|
|
<script src="highlight.js"></script>
|
|
|
<meta charset="utf-8">
|
|
|
|
|
|
<title></title>
|
|
|
|
|
|
|
|
|
</head>
|
|
|
<body>''')
|
|
|
|
|
|
|
|
|
# Open the page wrapper and, inside it, the intro section that holds the
# logo and the info box.
print('<div class ="tos_wrapper"><div class="intro">')

# Embed the service logo directly in the page as a base64 data URI so the
# generated HTML does not need the image file next to it.
with open('img/facebook_logo.png', 'rb') as logo_file:
    FaceApp_img_url = base64.b64encode(logo_file.read()).decode('utf-8')

# A data URI has the form data:<media type>;base64,<payload>; the media type
# for a PNG file is image/png (it is not the path of the source file).
FaceApp_image = (
    '<div class="img" style="position: fixed; background-color: gainsboro;">Facebook</div>'
    '<br><img class="image" src="data:image/png;base64,{}">'
).format(FaceApp_img_url)

print(FaceApp_image)
|
|
|
|
|
|
|
|
|
# Info box: a list of (title, content) rows about the service, printed as
# one <div> per row. The content is inserted as raw HTML on purpose (the
# "Original Source" row carries a link), so it is not escaped here.
print('<div class ="info">')

infotext = [
    ('Name of Service', 'Facebook'),
    ('Country of Origin', 'United States'),
    ('Initial release', 'February, 2004'),
    ('Type', 'Social Media'),
    ('Word Counts', '4,041'),
    ('Original Source', '<a href="https://www.facebook.com/terms.php" target="_blank">link</a>'),
    # Double-quoted because the sentence contains an apostrophe ("today's"),
    # which would otherwise terminate a single-quoted literal.
    ('Description', "Facebook, Inc. is an American social media conglomerate corporation based in Menlo Park, California. It was founded at Harvard College, originally as TheFacebook.com—today's Facebook, a popular global social networking website. "),
]

for title, info in infotext:
    print('<div class="info_{0}" ><div class="info_title" ><b>{0}</b></div><div class="info_content">{1}</div></div><br>'.format(title, info))

# Close the info box and the intro section opened before the logo.
print('</div></div>')
|
|
|
|
|
|
# Emit the legend: a <div class="legend"> with one <li> per highlight
# category (stopwords, adjective, verb, noun, proper noun, adverb, possessive
# pronoun, present participle, adjective superlative, adverb comparative +
# superlative). Each item carries a "legend..." class plus "eachlegend".
# NOTE(review): "possesive" and "superative" are misspelled in both the class
# names and the visible labels; the class names may be referenced from
# legend.css / highlight.js, so check those before correcting them. The <li>
# items are also direct children of the <div> without a <ul>/<ol> parent.
print('''
|
|
|
<div class="legend">
|
|
|
<li class="legendhide eachlegend">stopwords</li>
|
|
|
<li class="legendadjective eachlegend">adjective</li>
|
|
|
<li class="legendverb eachlegend">verb</li>
|
|
|
<li class="legendnoun eachlegend">noun</li>
|
|
|
<li class="legendpropernoun eachlegend">proper noun</li>
|
|
|
<li class="legendadverb eachlegend">adverb</li>
|
|
|
<li class="legendpossesivepronoun eachlegend">possesive pronoun</li>
|
|
|
<li class="legendpresentparticiple eachlegend">present participle</li>
|
|
|
<li class="legendadjectivesuperlative eachlegend">adjective superlative</li>
|
|
|
<li class="legendadverb-comparative-superative eachlegend">adverb comparative + superative</li>
|
|
|
</div>
|
|
|
''')
|
|
|
|
|
|
# ToS text: one <p> per paragraph and one <span> per token. Every span gets
# three classes -- the token's part-of-speech tag, the lower-cased token
# itself, and "eachwords" -- and shows the token as its text.
print('<div class ="paragraph">')

# Punctuation characters are spelled out as words when they go into the class
# attribute (presumably so the stylesheet and highlight.js can target them).
# One translate() pass replaces the previous chain of .replace() calls; none
# of the replacement words contains a mapped character, so the result is the
# same.
punctuation_names = str.maketrans({
    '.': 'dot',
    ',': 'comma',
    '(': 'marks',
    ')': 'marks',
    ':': 'marks',
    ';': 'marks',
})

tokenized_all = []

for paragraph in faceapp_text_list:
    tokenized = word_tokenize(paragraph)
    tokenized_all += tokenized  # collected for the frequent-words list
    tagged = pos_tag(tokenized)

    print('<p>')
    for word, pos in tagged:
        # 'PRP$' is a multi-character tag, so it is handled before the
        # per-character table.
        pos_class = pos.replace('PRP$', 'PRPS').translate(punctuation_names)
        word_class = word.replace('’', 'apostrophe').translate(punctuation_names).lower()
        # Escape everything that comes from the document: a token such as
        # '&' or '<' would otherwise produce malformed markup. Browsers
        # decode the entities again, so class names and text are unchanged.
        print('<span class="{0} {1} eachwords">{2}</span>'.format(
            html.escape(pos_class),
            html.escape(word_class),
            html.escape(word),
        ))
    print('</p>')

print('</div>')
|
|
|
|
|
|
|
|
|
# Frequent-words panel: the 30 most common lower-cased tokens of the whole
# document, each printed with its count.
print('<div class="top_words"><div class="top_words_title" ><b>Frequent words</b></div>')

# NOTE(review): only the custom stop-word list is filtered out here;
# tos_all_stopwords (NLTK + custom) is built earlier in the script but never
# used -- confirm which of the two is intended.
tokens_without_stopwords = nltk.FreqDist(
    lowered
    for lowered in (token.lower() for token in tokenized_all)
    if lowered not in tos_custom_stopwords
)

# Copy of the distribution above; it is not read anywhere in the visible
# script and is kept only so the module-level name continues to exist.
frequency_word = FreqDist(tokens_without_stopwords)

top_words = tokens_without_stopwords.most_common(30)

for chosen_words, frequency in top_words:
    # The word comes from the document text, so escape it before it is
    # placed in the markup.
    print('<div class="chosen_words" > {} ({}) </div>'.format(html.escape(chosen_words), frequency))

# At this point exactly two <div> elements are still open: .top_words and
# the page-level .tos_wrapper. Close those two and nothing more; additional
# </div> tags would have no matching opening tag.
print('</div>')  # .top_words
print('</div>')  # .tos_wrapper

print('''</body></html>''')
|
|
|
|