You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

111 lines
4.2 KiB
Python

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

# from __future__ import division
import base64
import codecs
import html

import nltk
from nltk import sent_tokenize, word_tokenize, pos_tag
from nltk.corpus import stopwords
from nltk.probability import FreqDist
# Fetch the NLTK stopword corpus that stopwords.words('english') reads below.
# NOTE(review): word_tokenize / pos_tag are used later in this script and rely
# on their own NLTK resources; only 'stopwords' is downloaded here -- confirm
# the tokenizer and tagger data are installed separately.
nltk.download('stopwords')

# Read the Terms of Service text and split it into paragraphs on blank lines.
# NOTE(review): no encoding is passed, so the locale default is used -- confirm
# the input files match it (the generated page declares utf-8).
with open('tos_file/facebook.txt', 'r') as faceapp_file:
    faceapp_text = faceapp_file.read()
faceapp_text_list = faceapp_text.split("\n\n")

#tos stopwords
tos_default_stopwords = set(stopwords.words('english'))
# One custom stopword per line. The file is read inside a context manager so
# the handle is closed deterministically (the previous codecs.open() call was
# never closed, and without an encoding it behaved like the builtin open()).
with open('stopwords.txt', 'r') as custom_stopwords_file:
    tos_custom_stopwords = set(custom_stopwords_file.read().splitlines())
tos_all_stopwords = tos_default_stopwords | tos_custom_stopwords
# Document head: emitted as a single newline-joined chunk that ends with the
# opening <body> tag.
# NOTE(review): the doctype carries no `html` name and jQuery is included twice
# (3.5.0 and 3.5.1); both are left as-is here so the emitted page is unchanged.
_head_lines = (
    '<!DOCTYPE>',
    '<html>',
    '<head>',
    '<script src="https://code.jquery.com/jquery-3.5.0.min.js"></script>',
    '<script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>',
    '<link rel="stylesheet" href="faceapp.css">',
    '<link rel="stylesheet" href="legend.css">',
    '<script src="highlight.js"></script>',
    '<meta charset="utf-8">',
    '<title></title>',
    '</head>',
    '<body>',
)
print('\n'.join(_head_lines))
#wrapper
# Opens the page wrapper and the intro column; both stay open until the
# info box below has been printed.
print('<div class ="tos_wrapper"><div class="intro">')

#insert an image
# https://upload.wikimedia.org/wikipedia/commons/1/15/Joffe_signing_the_Treaty_of_Tartu.jpg
# The logo is inlined as a base64 data URI. The file is read inside a context
# manager so the handle is closed, and the URI declares the real media type
# (image/png) instead of repeating the file path where the type belongs.
with open('img/facebook_logo.png', 'rb') as logo_file:
    FaceApp_img_url = base64.b64encode(logo_file.read()).decode('utf-8')
FaceApp_image = '<div class="img" style="position: fixed; background-color: gainsboro;">Facebook</div><br><img class="image" src="data:image/png;base64,{}">'.format(FaceApp_img_url)
print(FaceApp_image)

#info box
print('<div class ="info">')
# (title, content) rows; content is inserted as raw HTML (see the link row).
infotext = [
    ('Name of Service', 'Facebook'),
    ('Country of Origin', 'United States'),
    ('Initial release', 'February, 2004'),
    ('Type', 'Social Media'),
    ('Word Counts', '4,041'),
    ('Original Source', '<a href="https://www.facebook.com/terms.php" target="_blank">link</a>'),
    ('Description', 'Facebook, Inc. is an American social media conglomerate corporation based in Menlo Park, California. It was founded at Harvard College, originally as TheFacebook.com—today&#39;s Facebook, a popular global social networking website. '),
]
for title, info in infotext:
    # The title is used both as the visible heading and as the suffix of the
    # row's class attribute.
    print('<div class="info_{0}" ><div class="info_title" ><b>{0}</b></div><div class="info_content">{1}</div></div><br>'.format(title, info))
# Close the info box and the intro column; the wrapper stays open.
print('</div></div>')
# Legend: one <li> per highlight category, generated from
# (class suffix, visible label) pairs. The output is surrounded by a leading
# and a trailing blank line, exactly as the literal block it replaces.
# NOTE(review): "possesive" / "superative" are misspelled, but they are part of
# the emitted class names and labels, so they are reproduced unchanged here.
_legend_entries = (
    ('hide', 'stopwords'),
    ('adjective', 'adjective'),
    ('verb', 'verb'),
    ('noun', 'noun'),
    ('propernoun', 'proper noun'),
    ('adverb', 'adverb'),
    ('possesivepronoun', 'possesive pronoun'),
    ('presentparticiple', 'present participle'),
    ('adjectivesuperlative', 'adjective superlative'),
    ('adverb-comparative-superative', 'adverb comparative + superative'),
)
_legend_items = ''.join(
    '<li class="legend{} eachlegend">{}</li>\n'.format(class_suffix, label)
    for class_suffix, label in _legend_entries
)
print('\n<div class="legend">\n' + _legend_items + '</div>\n')
#ToS text
# Every token is printed as a <span> whose classes are its POS tag and its
# lowercased text. Punctuation is spelled out so those values are usable as
# class names.
_POS_CLASS_TABLE = str.maketrans({
    '.': 'dot',
    ',': 'comma',
    '(': 'marks',
    ')': 'marks',
    ':': 'marks',
    ';': 'marks',
})
# Word classes additionally spell out the apostrophe. The previous code called
# word.replace('', 'apostrophe'); an empty search string makes str.replace
# insert the replacement between every character of the token.
# NOTE(review): the typographic apostrophe (U+2019) is assumed to be the
# intended character -- it is written as an escape so it cannot be lost or
# confused with the ASCII quote again. Confirm against the stylesheet.
_WORD_CLASS_TABLE = dict(_POS_CLASS_TABLE)
_WORD_CLASS_TABLE[ord('\u2019')] = 'apostrophe'


def _pos_css_class(pos):
    """Return the class name used for the POS tag *pos*."""
    return pos.replace('PRP$', 'PRPS').translate(_POS_CLASS_TABLE)


def _word_css_class(word):
    """Return the lowercased class name used for the token *word*."""
    return word.translate(_WORD_CLASS_TABLE).lower()


print('<div class ="paragraph">')
tokenized_all = []
for paragraph in faceapp_text_list:
    tokenized = word_tokenize(paragraph)
    tokenized_all.extend(tokenized)  # kept for the frequency list printed after the text
    tagged = pos_tag(tokenized)
    print('<p>')
    for word, pos in tagged:
        # Escape everything placed into the markup so tokens such as '&', '<'
        # or quotes cannot break the attribute or the surrounding HTML.
        print('<span class="{0} {1} eachwords">{2}</span>'.format(
            html.escape(_pos_css_class(pos)),
            html.escape(_word_css_class(word)),
            html.escape(word),
        ))
    print('</p>')
print('</div>')
#tos top words list
print('<div class="top_words"><div class="top_words_title" ><b>Frequent words</b></div>')
# Count lowercased tokens that are not custom stopwords; each token is
# lowercased once.
# NOTE(review): only tos_custom_stopwords is filtered here, while
# tos_all_stopwords (NLTK English + custom) is built earlier and never used --
# confirm which set is intended.
_lowered_tokens = (token.lower() for token in tokenized_all)
tokens_without_stopwords = nltk.FreqDist(
    token for token in _lowered_tokens if token not in tos_custom_stopwords
)
# The 30 most frequent remaining tokens, as (token, count) pairs.
top_words = tokens_without_stopwords.most_common(30)
for chosen_words, frequency in top_words:
    print('<div class="chosen_words" >&nbsp;{}&nbsp;({}) </div>'.format(chosen_words, frequency))
# at the end of wrapper
# Only two <div>s are still open at this point (top_words and tos_wrapper), so
# exactly two are closed; the previous five closing tags left three unmatched.
print('</div></div>')
print('''</body></html>''')