from __future__ import division from nltk import sent_tokenize, word_tokenize, pos_tag from nltk.probability import FreqDist from nltk.corpus import stopwords import nltk import glob import re import codecs import base64 nltk.download('stopwords') # faceapp_file = open('faceapp.txt','r') with open('tos_file/faceapp.txt', 'r') as faceapp_file: faceapp_text = faceapp_file.read() faceapp_text_list = faceapp_text.split("\n\n") with open('treaty_file/russia-estonia.txt', 'r') as russia_file: russia_text = russia_file.read() russia_text_list = russia_text.split("\n\n") #tos stopwords tos_default_stopwords = set(stopwords.words('english')) tos_custom_stopwords = set(codecs.open('stopwords.txt', 'r').read().splitlines()) tos_all_stopwords = tos_default_stopwords | tos_custom_stopwords #treaty stopwords t_default_stopwords = set(stopwords.words('english')) t_custom_stopwords = set(codecs.open('t_stopwords.txt', 'r').read().splitlines()) t_all_stopwords = t_default_stopwords | t_custom_stopwords # multi-line string HTML print(''' ''') #wrapper print('
') #insert an image # https://upload.wikimedia.org/wikipedia/commons/1/15/Joffe_signing_the_Treaty_of_Tartu.jpg FaceApp_img_url = base64.b64encode(open('img/faceapp_logo.png', 'rb').read()).decode('utf-8') FaceApp_image = '

FaceApp

'.format(FaceApp_img_url) print(FaceApp_image) #info box print('
') infotext = [('Service', 'FaceApp'), ('Country', 'Russia'), ('Type', 'Image editing'), ('Initial release', 'December 31, 2016'), ('source', 'link'), ('Description', 'FaceApp is a mobile application for iOS and Android developed by Russian company Wireless Lab. The app generates highly realistic transformations of human faces in photographs by using neural networks based on artificial intelligence. The app can transform a face to make it smile, look younger, look older, or change gender.')] for title, info in infotext: print('
{0}
{1}

'.format(title, info)) print('
') #ToS text print('
') # for paragraph in faceapp_text_list: # tokenized = word_tokenize(paragraph) # tagged = pos_tag(tokenized) # print('

') # for word, pos in tagged: # print('{}'.format(pos, word)) # print('

') # print('
') # #faceapp_text tokenized = word_tokenize(faceapp_text) tagged = pos_tag(tokenized) print('

') for word, pos in tagged: print('{}'.format(pos, word)) print('

') print('
') #colonial words list print('
colonial words:
') tokens_without_stopwords = nltk.FreqDist(words.lower() for words in tokenized if words.lower() not in tos_all_stopwords) frequency_word = FreqDist(tokens_without_stopwords) top_words = tokens_without_stopwords.most_common(20) for chosen_words, frequency in top_words: print('
{}({})
'.format(chosen_words, frequency)) print('
') #t_wrapper (second wrapper) print('
') #insert an image # https://upload.wikimedia.org/wikipedia/commons/1/15/Joffe_signing_the_Treaty_of_Tartu.jpg img_url = base64.b64encode(open('img/tartu.jpeg', 'rb').read()).decode('utf-8') t_image = '

Peace Treaty of Tartu

'.format(img_url) print(t_image) #t_info box print('
') t_infotext = [('Name of Treaty', 'Peace Treaty of Tartu'), ('Country', 'Russia'), ('Date', 'February 2, 1920'), ('Location', 'Tartu, Estonia'), ('Signed', 'February 2, 1920'), ('Type', 'bilateral peace treaty'), ('source', 'link'), ('Description', 'The Tartu Peace Treaty or Treaty of Tartu is a peace treaty between Estonia and Russian Soviet Federative Socialist Republic signed on 2 February 1920, ending the Estonian War of Independence.')] for t_title, t_info in t_infotext: print('
{0}
{1}

'.format(t_title, t_info)) print('
') #ToS text print('
') t_tokenized = word_tokenize(russia_text) t_tagged = pos_tag(t_tokenized) for t_word, t_pos in t_tagged: print('{}'.format(t_pos, t_word)) print('

') print('

') #treaty colonial words list print('
colonial words:
') t_tokens_without_stopwords = nltk.FreqDist(words.lower() for words in t_tokenized if words.lower() not in t_all_stopwords) t_frequency_word = FreqDist(t_tokens_without_stopwords) t_top_words = t_tokens_without_stopwords.most_common(20) for t_chosen_words, t_frequency in t_top_words: print('
{}({})
'.format(t_chosen_words, t_frequency)) print('
') print('
') print('''''')