from __future__ import division
import codecs
import nltk
from nltk import sent_tokenize, word_tokenize, pos_tag
from nltk.probability import FreqDist
from nltk.corpus import stopwords

# word_tokenize and pos_tag need 'punkt' and the tagger model,
# not only 'stopwords'
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

# infofile = open('faceapp_infos.txt', 'r')
# infotext = infofile.read()

# open the ToS text file and read it
with open('faceapp.txt', 'r') as file:
    text = file.read()

# stopwords: NLTK's default English list merged with a custom list
default_stopwords = set(stopwords.words('english'))
custom_stopwords = set(codecs.open('stopwords.txt', 'r').read().splitlines())
all_stopwords = default_stopwords | custom_stopwords

# with open(output_html, 'w') as new_html:
#     new_html.write('''...''')

# page header; the original markup was lost in this copy,
# so a minimal skeleton is printed here
print('''<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
</head>
<body>''')

# info part
print('<div>')
infotext = [('Platform', 'FaceApp'), ('Type', 'Image editing'), ('Initial release', 'December 31, 2016')]
for title, info in infotext:
    print('<div>{0}: {1}</div>'.format(title, info))
print('</div>')
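# Note: title/info values go into the page as-is. If a value could contain
# '<', '>' or '&', escaping keeps the markup valid. A sketch using the
# standard library (html.escape is not part of the original script):
#
#   import html
#   print('<div>{0}: {1}</div>'.format(html.escape(title), html.escape(info)))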
# ToS text
print('<div>')
# for sentence in sent_tokenize(text):
print('<p>')
tokenized = word_tokenize(text)
tagged = pos_tag(tokenized)

# for HTML: wrap each word in a span classed by its POS tag
for word, pos in tagged:
    print('<span class="{}">{}</span>'.format(pos, word))
print('</p>')
print('</div>')
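# The commented sent_tokenize() line above hints at a sentence-by-sentence
# variant: one <p> per sentence instead of one for the whole text. A sketch
# of that idea, reusing the same <span class="POS"> markup:
#
#   for sentence in sent_tokenize(text):
#       print('<p>')
#       for word, pos in pos_tag(word_tokenize(sentence)):
#           print('<span class="{}">{}</span>'.format(pos, word))
#       print('</p>')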
# filtering stopwords: count lowercased tokens that are not stopwords
tokens_without_stopwords = nltk.FreqDist(words.lower() for words in tokenized if words.lower() not in all_stopwords)
print(tokens_without_stopwords)
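# print() on a FreqDist only shows a summary like
# "<FreqDist with N samples and M outcomes>". To inspect actual counts
# while developing, FreqDist also offers tabulate() and most_common():
#
#   tokens_without_stopwords.tabulate(10)            # top 10 as a table
#   print(tokens_without_stopwords.most_common(10))  # top 10 (word, count) pairs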
print('<div>colonial words:')
frequency_word = FreqDist(tokens_without_stopwords)  # a copy of the distribution
top_words = frequency_word.most_common(100)
for chosen_words, frequency in top_words:
    print('<span>{} ({})</span>'.format(chosen_words, frequency))
print('</div>')

# new_html = open('output.html', 'wb')  # open the output file
# new_html.write('''...''')
# new_html.close()  # close the output file

print('''</body>
</html>''')
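# The commented-out new_html lines above suggest writing the page to
# 'output.html' instead of printing to stdout. A minimal sketch of that idea
# (only the file name comes from the original; the rest is assumed):
#
#   parts = ['<!DOCTYPE html>', '<html>', '<body>']   # collect the markup
#   for word, pos in tagged:
#       parts.append('<span class="{}">{}</span>'.format(pos, word))
#   parts.append('</body>')
#   parts.append('</html>')
#   with open('output.html', 'w') as new_html:
#       new_html.write('\n'.join(parts))
#
# Alternatively, keep the print() calls and redirect from the shell:
#   python script.py > output.html   # 'script.py' is a placeholder name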