# from __future__ import division
from nltk import sent_tokenize, word_tokenize, pos_tag
from nltk.probability import FreqDist
from nltk.corpus import stopwords
import nltk
import codecs
import base64
nltk.download('stopwords')
nltk.download('punkt')                        # needed by word_tokenize
nltk.download('averaged_perceptron_tagger')   # needed by pos_tag
# faceapp_file = open('faceapp.txt','r')
with open('tos_file/pokemongo.txt', 'r') as faceapp_file:
    faceapp_text = faceapp_file.read()
faceapp_text_list = faceapp_text.split("\n\n")
# ToS stopwords: NLTK's default English list plus a custom list from stopwords.txt
tos_default_stopwords = set(stopwords.words('english'))
with codecs.open('stopwords.txt', 'r', encoding='utf-8') as stopwords_file:
    tos_custom_stopwords = set(stopwords_file.read().splitlines())
tos_all_stopwords = tos_default_stopwords | tos_custom_stopwords
# Multi-line string: open the HTML document (the original markup was stripped;
# the tags and the 'style.css' filename below are placeholder assumptions).
print('''<!DOCTYPE html>
<html><head><meta charset="utf-8"><link rel="stylesheet" href="style.css"></head>
<body><div class="wrapper">''')
# Insert an image (the Pokémon GO logo), embedded as a base64 data URI so the generated HTML is self-contained.
# https://upload.wikimedia.org/wikipedia/commons/1/15/Joffe_signing_the_Treaty_of_Tartu.jpg
with open('img/pokemongo_logo.png', 'rb') as img_file:
    FaceApp_img_url = base64.b64encode(img_file.read()).decode('utf-8')
# The original <img> markup was stripped; reconstructed from the format argument and the "Pokemongo" alt text.
FaceApp_image = '<img src="data:image/png;base64,{}" alt="Pokemongo">'.format(FaceApp_img_url)
print(FaceApp_image)
# Info box: a small table of service facts (the <div>/<table> tags below are placeholder markup).
print('<div class="infobox"><table>')
infotext = [('Name of Service', 'Pokemongo'),
            ('Country of Origin', 'United States'),
            ('Initial release', 'July, 2016'),
            ('Type', 'Augmented Reality'),
            ('Word Counts', '8,516'),
            ('Original Source', 'link'),  # the original hyperlink markup/URL was stripped
            ('Description', 'Pokémon Go is a 2016 augmented reality (AR) mobile game developed and published by Niantic (an American software company) in collaboration with The Pokémon Company for iOS and Android devices.')]
for title, info in infotext:
    print('<tr><th>{}</th><td>{}</td></tr>'.format(title, info))
print('</table></div>')
# Colour legend for the highlighted part-of-speech classes
print('''
stopwords
adjective
verb
noun
proper noun
adverb
possessive pronoun
present participle
adjective superlative
adverb comparative + superlative
''')
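# For reference, the legend above is assumed to correspond to these Penn Treebank
# tags emitted by pos_tag(); PRP$ is written "PRPS" after the class-name cleanup below.
POS_LEGEND = {
    'JJ': 'adjective',
    'VB': 'verb',
    'NN': 'noun',
    'NNP': 'proper noun',
    'RB': 'adverb',
    'PRPS': 'possessive pronoun',
    'VBG': 'present participle',
    'JJS': 'adjective superlative',
    'RBR': 'adverb comparative',
    'RBS': 'adverb superlative',
}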
# ToS text: each paragraph is re-printed with every token wrapped in a span whose
# classes are its POS tag and the lower-cased word, so a stylesheet can colour them.
# The original container and span markup was stripped; the tags below are placeholders.
print('<div class="tos-text">')
tokenized_all = []
for paragraph in faceapp_text_list:
    tokenized = word_tokenize(paragraph)
    tokenized_all += tokenized  # collect tokens for the frequency list below
    tagged = pos_tag(tokenized)
    print('<p>')
    for word, pos in tagged:
        # Characters that are awkward in CSS class names are replaced with word substitutes.
        print('<span class="{0} {1}">{2}</span>'.format(
            pos.replace('PRP$', 'PRPS').replace('.', 'dot').replace(',', 'comma').replace('(', 'marks').replace(')', 'marks').replace(':', 'marks').replace(';', 'marks'),
            word.replace('’', 'apostrophe').replace('.', 'dot').replace(',', 'comma').replace('(', 'marks').replace(')', 'marks').replace(':', 'marks').replace(';', 'marks').lower(),
            word))
    print('</p>')
print('</div>')
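# The chained .replace() calls above could be factored into a small helper. A minimal
# sketch (css_safe is a suggested name, not part of the original script):
def css_safe(text):
    """Replace characters that are awkward in CSS class names with word substitutes."""
    for old, new in [('PRP$', 'PRPS'), ('’', 'apostrophe'), ('.', 'dot'), (',', 'comma'),
                     ('(', 'marks'), (')', 'marks'), (':', 'marks'), (';', 'marks')]:
        text = text.replace(old, new)
    return text
# Usage: print('<span class="{0} {1}">{2}</span>'.format(css_safe(pos), css_safe(word.lower()), word))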
# ToS top-words list (the list markup below is a placeholder; the original tags were stripped)
print('<div class="frequent"><h2>Frequent words</h2>')
tokens_without_stopwords = nltk.FreqDist(word.lower() for word in tokenized_all
                                         if word.lower() not in tos_all_stopwords)  # filter against the combined stopword set
frequency_word = FreqDist(tokens_without_stopwords)
top_words = tokens_without_stopwords.most_common(30)
for chosen_word, frequency in top_words:
    print('<div class="frequent-word">{} ({})</div>'.format(chosen_word, frequency))
print('</div>')
# At the end of the wrapper: close the HTML document
# (the original closing tags were stripped; placeholders matching the opening tags above).
print('</div>')
print('</body>')
print('</html>')
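# Usage sketch (an assumption, not documented in the original script): the prints above
# build an HTML page on stdout, so the report would typically be generated with something like
#   python pokemongo_tos.py > pokemongo.html
# and viewed alongside the stylesheet that defines the POS colour classes.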