You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

211 lines
3.9 KiB
Python

from __future__ import division
import glob
from nltk import *
import re
import nltk
import codecs
from nltk import sent_tokenize, word_tokenize, pos_tag
from nltk.probability import FreqDist
from nltk.corpus import stopwords
# Download the NLTK stopword corpus so that stopwords.words() below can read it.
nltk.download('stopwords')

# Read the whole terms-of-service text; it is tokenized further down the script.
# The `with` block closes the handle even if read() raises.
with open('faceapp.txt', 'r') as source_file:
    text = source_file.read()

x = 1  # NOTE(review): not referenced anywhere in the visible script - confirm before removing

# Stopwords: NLTK's English list plus the project's own list, one entry per
# line of stopwords.txt.
default_stopwords = set(stopwords.words('english'))
with open('stopwords.txt', 'r') as stopword_file:
    custom_stopwords = set(stopword_file.read().splitlines())
all_stopwords = default_stopwords | custom_stopwords
# Opening part of the generated page: doctype, <head> with the inline
# stylesheet, and the opening <body> tag. The stylesheet declares the web
# fonts, lays out the three floated columns (.info, .paragraph, .top_words)
# and colours a few part-of-speech classes (.NNP, .NN, .NNS, .VBP:hover).
#
# The three disabled declarations in the `body` rule are wrapped in /* ... */:
# `#` is not a comment marker in CSS, so the previous `# ...` lines were
# invalid declarations rather than comments.
PAGE_HEAD = '''<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title></title>
<style>
@font-face {
font-family: "Belgika";
src: url("http://bohyewoo.com/webfonts/belgika/belgika-40th-webfont.eot");
src: url("http://bohyewoo.com/webfonts/belgika/belgika-40th-webfont.woff") format("woff"),
url("http://bohyewoo.com/webfonts/belgika/belgika-40th-webfont.svg#filename") format("svg");
}
@font-face {
font-family: "Belgika";
src: url("http://bohyewoo.com/webfonts/belgika/belgika-16th-webfont.eot");
src: url("http://bohyewoo.com/webfonts/belgika/belgika-16th-webfont.woff") format("woff"),
url("http://bohyewoo.com/webfonts/belgika/belgika-16th-webfont.svg#filename") format("svg");
}
@font-face {
font-family: "Belgika";
src: url("http://bohyewoo.com/webfonts/belgika/belgika-8th-webfont.eot");
src: url("http://bohyewoo.com/webfonts/belgika/belgika-8th-webfont.woff") format("woff"),
url("http://bohyewoo.com/webfonts/belgika/belgika-8th-webfont.svg#filename") format("svg");
}
@font-face {
font-family: "SourceCodePro";
src: url("http://bohyewoo.com/webfonts/Source_Code_Pro/SourceCodePro-Regular.ttf");
}
body {
background-color: whitesmoke;
/* font-family: Belgika; */
/* font-weight: 8th; */
/* letter-spacing: -0.3px; */
font-size: 20px;
line-height: 1.2;
}
.info {
font-family: Belgika;
font-weight: 8th;
font-size: 10pt;
width: 20%;
float: left;
border: 1px solid black;
}
.NNP {
background-color: pink;
}
.VBP {
}
.VBP:hover {
background-color: gold;
}
.NN {
background-color: LightSkyBlue;
}
.NNS {
background-color: Aquamarine;
}
.paragraph {
font-family: SourceCodePro;
font-weight: regular;
letter-spacing: -0.5px;
width: 50%;
float: left;
}
.top_words {
font-family: Belgika;
font-weight: 8th;
font-size: 9pt;
width: 25%;
float: left;
}
</style>
</head>
<body>'''

print(PAGE_HEAD)
# Info box: a side panel listing basic facts about the service.
def render_info_row(title, info):
    """Return one info-box row: a ``<span>`` followed by ``<br>``.

    title -- label shown before the colon; also used to build the CSS class.
             Spaces are replaced with dashes so that a multi-word title
             yields a single class name instead of several.
    info  -- value shown after the colon. It is inserted as-is, because the
             'source' row deliberately carries an HTML link.
    """
    css_class = 'info-{}'.format(title.replace(' ', '-'))
    return '<span class="{}">{}:{}</span><br>'.format(css_class, title, info)


# One (label, value) pair per row. The list previously contained the
# ('Type', 'Image editing') pair twice, which rendered the same row twice.
infotext = [
    ('service', 'FaceApp'),
    ('Type', 'Image editing'),
    ('Initial release', 'December 31 2016'),
    ('source', '<a href="https://www.faceapp.com/terms-en.html">link</a>'),
]

print('<div class ="info">')
for title, info in infotext:
    print(render_info_row(title, info))
print('</div>')
# ToS text: every token is wrapped in a <span> whose class is its
# part-of-speech tag, which is what the .NNP / .NN / .NNS / .VBP rules in the
# page stylesheet select on.
def render_token(word, pos):
    """Return the ``<span>`` markup for one (token, POS tag) pair.

    Both values are HTML-escaped: the tokens come straight from the input
    text, so characters such as ``&``, ``<`` or quotes would otherwise
    corrupt the generated markup.
    """
    from html import escape  # function-scope import keeps this block self-contained

    return '<span class="{}">{}</span>'.format(escape(pos), escape(word))


print('<div class ="paragraph">')
tokenized = word_tokenize(text)
tagged = pos_tag(tokenized)
for word, pos in tagged:
    print(render_token(word, pos))
print('</div>')
# Word list: the 100 most frequent lower-cased tokens that are not stopwords,
# one per line with the count in parentheses, followed by the closing tags of
# the page.
def render_top_word(word, count):
    """Return the markup for one entry of the frequency list.

    The word is HTML-escaped because it comes straight from the input text.
    """
    from html import escape  # function-scope import keeps this block self-contained

    return '<br><span class="chosen_words">{}({}) </span>'.format(escape(word), count)


print('<div class="top_words"> colonial words:')
# Lower-case each token once, then drop the stopwords.
lowered_tokens = (token.lower() for token in tokenized)
# Despite its name this is a FreqDist (token -> count), not a list of tokens.
tokens_without_stopwords = nltk.FreqDist(
    token for token in lowered_tokens if token not in all_stopwords
)
# Second FreqDist built from the same counts; in the visible script only the
# commented-out plotting code refers to it.
frequency_word = FreqDist(tokens_without_stopwords)
top_words = tokens_without_stopwords.most_common(100)
for chosen_words, frequency in top_words:
    print(render_top_word(chosen_words, frequency))
print('''</div></body></html>''')
# # for new_file in tokens_without_stopwords:
# appendFile = open('tokenized_words.txt', 'a')
# appendFile.write(" " + new_file)
# appendFile.close()
# #shows only stopwords
# processed_word_list = []
# for word in tokenized:
# # print(word)
# if word not in all_stopwords:
# processed_word_list.append('*')
# else:
# processed_word_list.append(word)
# print(processed_word_list)
# # # plot the result as a graph
# top_words_plot = frequency_word.plot(10)
# print(top_words_plot)