from __future__ import division
import glob
from nltk import *
import re
import nltk
import codecs
from nltk import sent_tokenize, word_tokenize, pos_tag
from nltk.probability import FreqDist
from nltk.corpus import stopwords
nltk.download('stopwords')
#open the txt file, read, and tokenize
file = open('faceapp.txt','r')
text = file.read()
#stopwords
default_stopwords = set(stopwords.words('english'))
custom_stopwords = set(codecs.open('stopwords.txt', 'r').read().splitlines())
all_stopwords = default_stopwords | custom_stopwords
print('''
''')
# my stopwords are common words I don't want to count, like "a", "an", "the".
print('')
for sentence in sent_tokenize(text):
print('')
tokenized = word_tokenize(sentence)
tagged = pos_tag(tokenized)
# for HTML
for word, pos in tagged:
print('{}'.format(pos, word))
print('')
print('
')
# filtering stopwords
tokens_without_stopwords = nltk.FreqDist(words.lower() for words in tokenized if words.lower() not in all_stopwords)
print(tokens_without_stopwords)
# for read_whole_text in tokens_without_stopwords:
# whole_text_tokenized =
# print(whole_text_tokenized)
# #filtered words in sentence
# filtered_sentence = (" ").join(tokens_without_stopwords)
# print(filtered_sentence)
print(' colonial words:')
frequency_word = FreqDist(tokens_without_stopwords)
top_words = tokens_without_stopwords.most_common(10)
for chosen_words, frequency in top_words:
print('
{}({}) '.format(chosen_words, frequency))
print('''
''')
# for new_file in tokens_without_stopwords:
# appendFile = open('tokenized_words.txt', 'a')
# appendFile.write(" " + new_file)
# appendFile.close()
# #shows only stopwords
# processed_word_list = []
# for word in tokenized:
# # print(word)
# if word not in all_stopwords:
# processed_word_list.append('*')
# else:
# processed_word_list.append(word)
# print(processed_word_list)
# # # result putting in a graph
# top_words_plot = frequency_word.plot(10)
# print(top_words_plot)