import codecs

import nltk
from nltk import sent_tokenize, word_tokenize, pos_tag
from nltk.probability import FreqDist
from nltk.corpus import stopwords

# Download the resources the script actually uses: the stopword list,
# the 'punkt' tokenizer models, and the POS tagger model.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
# Alternative: read the app info from a file instead of the list below.
# infofile = open('faceapp_infos.txt', 'r')
# infotext = infofile.read()

# Open the ToS text file and read it in; the context manager closes it for us.
with open('faceapp.txt', 'r') as tos_file:
    text = tos_file.read()
# Stopwords: NLTK's English list plus a custom list (one word per line,
# assumed UTF-8).
default_stopwords = set(stopwords.words('english'))
with codecs.open('stopwords.txt', 'r', encoding='utf-8') as stopword_file:
    custom_stopwords = set(stopword_file.read().splitlines())
all_stopwords = default_stopwords | custom_stopwords
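# Quick sanity check on the combined set: 'the' is in NLTK's English
# stopword list, so it must survive the union.
assert 'the' in all_stopwords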
# To build the page in a file instead of printing it:
# with open(output_html, 'w') as new_html:
#     new_html.write(...)  # the original header markup was lost here

# Print an HTML header for the page. The original markup did not survive,
# so this minimal skeleton is an assumption.
print('<html><body>')
# Info part: basic app metadata, printed as one "title: value" line each.
infotext = [('Platform', 'FaceApp'), ('Type', 'Image editing'),
            ('Initial release', 'December 31, 2016')]
for title, info in infotext:
    print('{0}: {1}'.format(title, info))
print()
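# Expected output of the loop above, for illustration:
#   Platform: FaceApp
#   Type: Image editing
#   Initial release: December 31, 2016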
# ToS text: tokenize the full text, then POS-tag every token.
# for sentence in sent_tokenize(text):   # per-sentence variant, left unused
tokenized = word_tokenize(text)
tagged = pos_tag(tokenized)
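# `tagged` holds (token, Penn Treebank tag) pairs, e.g. something like
# ('FaceApp', 'NNP') or ('license', 'NN'); the exact tags depend on context.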
# For HTML: wrap each token in a span classed by its POS tag. The original
# markup was stripped here, so this exact <span> layout is an assumption.
for word, pos in tagged:
    print('<span class="{}">{}</span>'.format(pos, word))
print()
# Filtering stopwords: lowercase each token and count the survivors.
tokens_without_stopwords = FreqDist(
    word.lower() for word in tokenized
    if word.lower() not in all_stopwords
)
print(tokens_without_stopwords)  # summary repr: <FreqDist with N samples and M outcomes>
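# FreqDist subclasses collections.Counter, so the usual Counter API applies;
# e.g. tokens_without_stopwords['privacy'] would give that word's count
# ('privacy' is just an illustrative key).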
print('colonial words:')
# tokens_without_stopwords is already a FreqDist, so no re-wrapping is needed.
top_words = tokens_without_stopwords.most_common(100)
for chosen_word, frequency in top_words:
    print('{} ({})'.format(chosen_word, frequency))
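# most_common(100) returns (word, count) pairs sorted by descending count,
# so the list above reads from the most to the least frequent term.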
# To write into a file instead of stdout:
# new_html = open('output.html', 'w')  # open the output file
# new_html.write(...)                  # the original markup was lost here
# new_html.close()                     # close the output file

# Close the HTML document; this pairs with the assumed header printed above.
print('</body></html>')