"""Print an annotated view of the FaceApp terms-of-service text.

The script reads ``faceapp.txt``, prints a small info box, prints the
part-of-speech tag of every token, and finally lists the 100 most frequent
lower-cased tokens that are not stopwords (NLTK's English stopwords plus
the lines of ``stopwords.txt``).
"""
from __future__ import division

# Import order is kept as in the original: names bound after the star
# import must continue to win over anything the star import exposes.
import glob
from nltk import *
import re
import nltk
import codecs
from nltk import sent_tokenize, word_tokenize, pos_tag
from nltk.probability import FreqDist
from nltk.corpus import stopwords

nltk.download('stopwords')

# Open the txt file and read it; the context manager closes the handle
# as soon as the text has been loaded.
with open('faceapp.txt', 'r') as file:
    text = file.read()

# Stopwords: NLTK's English list extended with one custom entry per line.
default_stopwords = set(stopwords.words('english'))
with codecs.open('stopwords.txt', 'r') as stopwords_file:
    custom_stopwords = set(stopwords_file.read().splitlines())
all_stopwords = default_stopwords | custom_stopwords

print(''' ''')

# Info box
print('\n')
# NOTE(review): ('Type', 'Image editing') appears twice; kept because it is
# part of the printed output -- confirm whether the repeat is intended.
infotext = [
    ('service', 'FaceApp'),
    ('Type', 'Image editing'),
    ('Initial release', 'December 31 2016'),
    ('Type', 'Image editing'),
    ('source', 'link'),
]
for title, info in infotext:
    print('{0}:{1}\n'.format(title, info))
print('\n')

# ToS text
print('\n')
tokenized = word_tokenize(text)
tagged = pos_tag(tokenized)
for word, pos in tagged:
    # NOTE(review): the format string has a single placeholder, so only the
    # tag is printed and `word` is ignored -- confirm whether a template
    # using both values was intended.
    print('{}'.format(pos, word))
print('\n')

# Colonial words list
print('\ncolonial words:')
# Lower-case each token once, then count only those that are not stopwords.
lowered_tokens = (token.lower() for token in tokenized)
tokens_without_stopwords = nltk.FreqDist(
    token for token in lowered_tokens if token not in all_stopwords
)
# Copy of the distribution; only referenced by the disabled plot code below.
frequency_word = FreqDist(tokens_without_stopwords)
top_words = tokens_without_stopwords.most_common(100)
for chosen_words, frequency in top_words:
    print('\n{}({}) '.format(chosen_words, frequency))
print('''
''')

# Disabled experiments, kept for reference.
#
# for new_file in tokens_without_stopwords:
#     appendFile = open('tokenized_words.txt', 'a')
#     appendFile.write(" " + new_file)
#     appendFile.close()
#
# shows only stopwords
# processed_word_list = []
# for word in tokenized:
#     # print(word)
#     if word not in all_stopwords:
#         processed_word_list.append('*')
#     else:
#         processed_word_list.append(word)
# print(processed_word_list)
#
# result putting in a graph
# top_words_plot = frequency_word.plot(10)
# print(top_words_plot)