from __future__ import division
import codecs
import nltk
from nltk import sent_tokenize, word_tokenize, pos_tag
from nltk.probability import FreqDist
from nltk.corpus import stopwords

# Download the NLTK data this script needs: the stopword list, the punkt
# sentence/word tokenizer, and the perceptron tagger used by pos_tag().
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

# Open the txt file and read it; tokenization happens below.
with open('faceapp.txt', 'r') as infile:
    text = infile.read()

# Stopwords: NLTK's default English list plus a custom list (one word per line).
# My stopwords are common words I don't want to count, like "a", "an", "the".
default_stopwords = set(stopwords.words('english'))
custom_stopwords = set(codecs.open('stopwords.txt', 'r').read().splitlines())
all_stopwords = default_stopwords | custom_stopwords

# The script prints an HTML page to stdout. The markup that was inside the
# original print() strings did not survive extraction, so generic
# html/body/div/p/span tags are used as placeholders here.
print('''<html>
<body>''')

print('<div class="text">')
for sentence in sent_tokenize(text):
    print('<p>')
    tokenized = word_tokenize(sentence)
    tagged = pos_tag(tokenized)
    # For HTML: wrap each word in a tag whose class is its part-of-speech
    # label, so words can be styled by POS.
    for word, pos in tagged:
        print('<span class="{}">{}</span>'.format(pos, word))
    print('</p>')
print('</div>')
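# Illustrative note (not from the original script): pos_tag() returns
# (word, tag) pairs using Penn Treebank labels, roughly like
#   pos_tag(word_tokenize("The cat sat on the mat"))
#   -> [('The', 'DT'), ('cat', 'NN'), ('sat', 'VBD'),
#       ('on', 'IN'), ('the', 'DT'), ('mat', 'NN')]
# so the class attribute written above groups words by part of speech.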
# Filtering stopwords: count lowercased tokens that are not in the combined
# stopword list. word_tokenize(text) is used so the counts cover the whole
# text rather than only the last sentence left in `tokenized` by the loop above.
tokens_without_stopwords = nltk.FreqDist(
    word.lower() for word in word_tokenize(text)
    if word.lower() not in all_stopwords)
print(tokens_without_stopwords)

# for read_whole_text in tokens_without_stopwords:
#     whole_text_tokenized =
#     print(whole_text_tokenized)

# filtered words in sentence
# filtered_sentence = (" ").join(tokens_without_stopwords)
# print(filtered_sentence)
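# Illustrative note (not from the original script): FreqDist behaves like
# collections.Counter, e.g.
#   FreqDist(['face', 'app', 'app']).most_common(1)  ->  [('app', 2)]
# which is what the "colonial words" listing below relies on.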
# The ten most frequent non-stopword tokens. Any markup around this heading
# in the original print() string was stripped, so it is printed as plain text.
print('colonial words:')
frequency_word = FreqDist(tokens_without_stopwords)
top_words = tokens_without_stopwords.most_common(10)
for chosen_words, frequency in top_words:
    print('{}({}) '.format(chosen_words, frequency))

# End of the HTML page (placeholder closing tags).
print('''</body>
</html>''')
# Earlier experiments, kept commented out.

# Append the filtered tokens to a text file:
# for new_file in tokens_without_stopwords:
#     appendFile = open('tokenized_words.txt', 'a')
#     appendFile.write(" " + new_file)
#     appendFile.close()

# Shows only stopwords: every non-stopword is replaced with '*'.
# processed_word_list = []
# for word in tokenized:
#     # print(word)
#     if word not in all_stopwords:
#         processed_word_list.append('*')
#     else:
#         processed_word_list.append(word)
# print(processed_word_list)

# Result put in a graph (FreqDist.plot needs matplotlib):
# top_words_plot = frequency_word.plot(10)
# print(top_words_plot)
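# Usage sketch (the script's real filename is not given in the source;
# 'faceapp_tagger.py' below is a placeholder): since the script writes its
# HTML to stdout, redirect the output into a file and open it in a browser:
#   python faceapp_tagger.py > faceapp.html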