"""Print an annotated rendering of the Netflix Terms of Service to stdout.

The script reads ``tos_file/netflix.txt``, tokenizes and part-of-speech tags
each blank-line-separated paragraph with NLTK, and prints every token along
with a logo/title, an info box, a tag legend and the 30 most frequent tokens
that are not in the custom stopword list.

NOTE(review): the inline comments describe the printed strings as HTML, but
the literals in this copy of the file contain no markup, and two ``format``
calls receive more arguments than their templates use. Presumably the tags
were stripped at some point. The literals are reproduced exactly as found;
restore the markup from the original template before relying on the output.
"""
# from __future__ import division
import base64
import codecs

import nltk
from nltk import pos_tag, sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist

nltk.download('stopwords')

# Punctuation is spelled out as words when building the per-token labels.
# A translate table does this in one pass instead of a chain of .replace()
# calls per token; none of the replacement words contains a mapped character,
# so the result is identical to the sequential replacements.
_PUNCTUATION_NAMES = {
    '.': 'dot',
    ',': 'comma',
    '(': 'marks',
    ')': 'marks',
    ':': 'marks',
    ';': 'marks',
}
_POS_TABLE = str.maketrans(_PUNCTUATION_NAMES)
_WORD_TABLE = str.maketrans({**_PUNCTUATION_NAMES, '’': 'apostrophe'})


def _pos_label(pos):
    """Return the POS tag with ``PRP$`` and punctuation tags spelled out."""
    return pos.replace('PRP$', 'PRPS').translate(_POS_TABLE)


def _word_label(word):
    """Return the lower-cased token with punctuation spelled out as words."""
    return word.translate(_WORD_TABLE).lower()


with open('tos_file/netflix.txt', 'r') as faceapp_file:
    faceapp_text = faceapp_file.read()
# Paragraphs are separated by blank lines in the source text.
faceapp_text_list = faceapp_text.split("\n\n")

# tos stopwords
tos_default_stopwords = set(stopwords.words('english'))
# Context manager so the stopword file handle is closed deterministically.
with codecs.open('stopwords.txt', 'r') as stopwords_file:
    tos_custom_stopwords = set(stopwords_file.read().splitlines())
# NOTE(review): the combined set is built but the frequency list below filters
# with tos_custom_stopwords only -- confirm which one is intended.
tos_all_stopwords = tos_default_stopwords | tos_custom_stopwords

# multi-line string HTML
print(''' ''')

# wrapper
print('\n')

# insert an image
# https://upload.wikimedia.org/wikipedia/commons/1/15/Joffe_signing_the_Treaty_of_Tartu.jpg
with open('img/netflix_logo.png', 'rb') as logo_file:
    FaceApp_img_url = base64.b64encode(logo_file.read()).decode('utf-8')
# The template has no placeholder in this copy, so the encoded image is
# currently not part of the output (see module NOTE).
FaceApp_image = '\nNetflix\n\n'.format(FaceApp_img_url)
print(FaceApp_image)

# info box
print('\n')
infotext = [
    ('Name of Service', 'Netflix'),
    ('Country of Origin', 'United States'),
    ('Initial release', 'August, 1997'),
    ('Type', 'Online Video Streaming'),
    ('Word Counts', '2,283'),
    ('Original Source', 'link'),
    # Double-quoted: the text contains an apostrophe ("company's") that would
    # terminate a single-quoted literal.
    ('Description', "Netflix is an American media-services provider and production company headquartered in Los Gatos, California. The company's primary business is its subscription-based streaming service which offers online streaming of a library of films and television programs, including those produced in-house."),
]
for title, info in infotext:
    print('\n{0}\n{1}\n\n'.format(title, info))
print('\n')

# legend of the tag categories
print('''
  • stopwords
  • adjective
  • verb
  • noun
  • proper noun
  • adverb
  • possesive pronoun
  • present participle
  • adjective superlative
  • adverb comparative + superative
  • ''')

# ToS text
print('\n    ')
tokenized_all = []
for paragraph in faceapp_text_list:
    tokenized = word_tokenize(paragraph)
    tokenized_all += tokenized  # add to the tokenized_all
    tagged = pos_tag(tokenized)
    print('\n\n    ')
    for word, pos in tagged:
        # Only index 2 (the raw token) is used by the template in this copy;
        # the two labels are still passed, as in the original call.
        print('{2}'.format(_pos_label(pos), _word_label(word), word))
    print('\n\n    ')
print('\n    ')

# tos top words list
print('\n    Frequent words\n    ')
lowered_tokens = (token.lower() for token in tokenized_all)
tokens_without_stopwords = nltk.FreqDist(
    token for token in lowered_tokens if token not in tos_custom_stopwords
)
top_words = tokens_without_stopwords.most_common(30)
for chosen_words, frequency in top_words:
    print('\n     {} ({})\n    '.format(chosen_words, frequency))
print('\n    ')

# at the end of wrapper
print('')
print('')
print('''''')