from __future__ import division import glob from nltk import * import re import nltk import codecs from nltk import sent_tokenize, word_tokenize, pos_tag from nltk.probability import FreqDist from nltk.corpus import stopwords from PIL import Image import base64 nltk.download('stopwords') #open the txt file, read, and tokenize file = open('faceapp.txt','r') text = file.read() #not sure if this works.. x = 1 #stopwords default_stopwords = set(stopwords.words('english')) custom_stopwords = set(codecs.open('stopwords.txt', 'r').read().splitlines()) all_stopwords = default_stopwords | custom_stopwords print(''' ''') print('
') #insert an image # https://upload.wikimedia.org/wikipedia/commons/1/15/Joffe_signing_the_Treaty_of_Tartu.jpg FaceApp_img_url = base64.b64encode(open('img/faceapp_logo.png', 'rb').read()).decode('utf-8') FaceApp_image = '
FaceApp
'.format(FaceApp_img_url) print(FaceApp_image) #info box print('
') infotext = [('Service', 'FaceApp'), ('Type', 'Image editing'), ('Initial release', 'December 31, 2016'), ('Type', 'Image editing'), ('source', 'link'), ('Description', 'FaceApp is a mobile application for iOS and Android developed by Russian company Wireless Lab. The app generates highly realistic transformations of human faces in photographs by using neural networks based on artificial intelligence. The app can transform a face to make it smile, look younger, look older, or change gender.')] for title, info in infotext: print('
{0}
{1}

'.format(title, info)) print('
') #ToS text print('
') tokenized = word_tokenize(text) tagged = pos_tag(tokenized) for word, pos in tagged: print('{}'.format(pos, word)) print('
') #colonial words list print('
colonial words:') tokens_without_stopwords = nltk.FreqDist(words.lower() for words in tokenized if words.lower() not in all_stopwords) frequency_word = FreqDist(tokens_without_stopwords) top_words = tokens_without_stopwords.most_common(100) for chosen_words, frequency in top_words: print('
{}({}) '.format(chosen_words, frequency)) print('
') # new_html = open('output.html', 'wb') # open the output file # new_html.write('''
''') # new_html.close() # close the output file print('''''')