"""Print an annotated rendering of the Facebook terms-of-service text.

The script reads ``tos_file/facebook.txt``, tokenizes and part-of-speech tags
each blank-line-separated paragraph with NLTK, and writes the page to stdout:
a title block, an info box, a legend, the tagged text and the 30 most
frequent words.

NOTE(review): the original comments describe the printed literals as HTML
("multi-line string HTML", "wrapper", "insert an image"), yet the literals in
this file hold no markup and the title template has no placeholder for the
base64 logo it is formatted with. The literal text is reproduced as found --
confirm against the intended template before relying on the output.
"""
import base64
import codecs

import nltk
from nltk import pos_tag, sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist

TOS_PATH = 'tos_file/facebook.txt'
CUSTOM_STOPWORDS_PATH = 'stopwords.txt'
LOGO_PATH = 'img/facebook_logo.png'
TOP_WORD_COUNT = 30

# multi-line string HTML
PAGE_HEAD = ' '

# Formatted with the base64-encoded logo; the template as it stands has no
# replacement field, so the argument does not appear in the output.
TITLE_TEMPLATE = '\nFacebook\n\n'

# Positional fields: 0 = row title, 1 = row value.
INFO_ROW_TEMPLATE = '\n{0}\n{1}\n\n'

# Positional fields: 0 = sanitized POS tag, 1 = sanitized lowercase word,
# 2 = the word as tokenized. Only field 2 is referenced at present.
TOKEN_TEMPLATE = '{2}'

FREQUENT_WORDS_HEADING = '\n    Frequent words\n    '
FREQUENT_WORD_TEMPLATE = '\n     {} ({})\n    '

INFO_ROWS = [
    ('Name of Service', 'Facebook'),
    ('Country of Origin', 'United States'),
    ('Initial release', 'February, 2004'),
    ('Type', 'Social Media'),
    ('Word Counts', '4,041'),
    ('Original Source', 'link'),
    # Double-quoted because the text contains an apostrophe ("today's"),
    # which terminated the original single-quoted literal early.
    ('Description',
     "Facebook, Inc. is an American social media conglomerate corporation "
     "based in Menlo Park, California. It was founded at Harvard College, "
     "originally as TheFacebook.com—today's Facebook, a popular global "
     "social networking website. "),
]

LEGEND = '''
  • stopwords
  • adjective
  • verb
  • noun
  • proper noun
  • adverb
  • possesive pronoun
  • present participle
  • adjective superlative
  • adverb comparative + superative
  • '''

# Punctuation characters are spelled out as words. The replacement words
# contain none of the mapped characters, so one translate() pass gives the
# same result as the chained .replace() calls it stands in for.
_PUNCTUATION_TABLE = str.maketrans({
    '.': 'dot',
    ',': 'comma',
    '(': 'marks',
    ')': 'marks',
    ':': 'marks',
    ';': 'marks',
})


def _tag_token(pos):
    """Return the POS tag with 'PRP$' and punctuation spelled out."""
    return pos.replace('PRP$', 'PRPS').translate(_PUNCTUATION_TABLE)


def _word_token(word):
    """Return the word lowercased, with '’' and punctuation spelled out."""
    spelled = word.replace('’', 'apostrophe').translate(_PUNCTUATION_TABLE)
    return spelled.lower()


def _load_custom_stopwords(path):
    """Return the lines of the file at *path* as a set."""
    # The context manager closes the handle that was previously left open.
    with codecs.open(path, 'r') as stopwords_file:
        return set(stopwords_file.read().splitlines())


def _print_title():
    """Read the logo, base64-encode it and print the title block."""
    # insert an image
    with open(LOGO_PATH, 'rb') as logo_file:
        logo_b64 = base64.b64encode(logo_file.read()).decode('utf-8')
    print(TITLE_TEMPLATE.format(logo_b64))


def _print_info_box():
    """Print one entry per INFO_ROWS pair between the box delimiters."""
    # info box
    print('\n')
    for title, info in INFO_ROWS:
        print(INFO_ROW_TEMPLATE.format(title, info))
    print('\n')


def _print_tagged_text(paragraphs):
    """Print every token of every paragraph and return all tokens in order.

    Each paragraph is tokenized with ``word_tokenize`` and tagged with
    ``pos_tag``; one line is printed per (word, tag) pair.
    """
    # ToS text
    print('\n    ')
    all_tokens = []
    for paragraph in paragraphs:
        tokens = word_tokenize(paragraph)
        all_tokens.extend(tokens)
        tagged = pos_tag(tokens)
        # NOTE(review): indentation was lost in the source; the opening and
        # closing delimiters are placed around each paragraph by symmetry.
        print('\n\n    ')
        for word, pos in tagged:
            print(TOKEN_TEMPLATE.format(_tag_token(pos), _word_token(word), word))
        print('\n\n    ')
    print('\n    ')
    return all_tokens


def _print_frequent_words(tokens, excluded):
    """Print the TOP_WORD_COUNT most common lowercased tokens not in *excluded*."""
    # tos top words list
    print(FREQUENT_WORDS_HEADING)
    lowered = (token.lower() for token in tokens)
    counts = FreqDist(token for token in lowered if token not in excluded)
    for word, count in counts.most_common(TOP_WORD_COUNT):
        print(FREQUENT_WORD_TEMPLATE.format(word, count))
    print('\n    ')


def main():
    """Generate the page and write it to stdout."""
    nltk.download('stopwords')

    with open(TOS_PATH, 'r') as tos_file:
        paragraphs = tos_file.read().split("\n\n")

    # tos stopwords
    default_stopwords = set(stopwords.words('english'))
    custom_stopwords = _load_custom_stopwords(CUSTOM_STOPWORDS_PATH)
    # NOTE(review): the combined set is built but never read; the frequency
    # list filters on the custom list only. Confirm whether it was meant to
    # exclude the NLTK English stopwords as well.
    all_stopwords = default_stopwords | custom_stopwords

    print(PAGE_HEAD)
    # wrapper
    print('\n')
    _print_title()
    _print_info_box()
    print(LEGEND)
    tokens = _print_tagged_text(paragraphs)
    _print_frequent_words(tokens, custom_stopwords)

    # at the end of wrapper
    print('')
    print('')
    print('')


if __name__ == "__main__":
    main()