# NOTE(review): this file reached review with its HTML tag markup stripped
# (the <div>/<span>/<img> tags inside the print() strings are gone) and its
# multi-line string literals broken.  The literals below are rebuilt as
# triple-quoted strings preserving exactly the text that survived; restore
# the full HTML from version control before trusting the generated output.
#
# Purpose: POS-tag a treaty text with NLTK and emit an HTML report in which
# every token is classed by its part of speech, followed by an infobox, a
# legend, and a top-20 frequent-words list.

# from __future__ import division
from nltk import sent_tokenize, word_tokenize, pos_tag
from nltk.probability import FreqDist
from nltk.corpus import stopwords
import nltk
import codecs
import base64

nltk.download('stopwords')

# Read the treaty text; paragraphs are separated by blank lines.
# NOTE(review): the variables say "russia" but the file is america.txt —
# likely copy-paste residue from a sibling script; confirm before renaming.
with open('treaty_file/america.txt', 'r') as russia_file:
    russia_text = russia_file.read()
russia_text_list = russia_text.split("\n\n")

# Stopword sets: NLTK's English defaults plus a project-specific list.
t_default_stopwords = set(stopwords.words('english'))
with codecs.open('t_stopwords.txt', 'r') as t_stop_file:  # fixed: handle was never closed
    t_custom_stopwords = set(t_stop_file.read().splitlines())
# NOTE(review): t_all_stopwords is never consulted below — the frequency
# filter uses only t_custom_stopwords.  Possible bug; left as-is so the
# generated output is unchanged.
t_all_stopwords = t_default_stopwords | t_custom_stopwords

# Punctuation that cannot appear verbatim in a CSS class name is mapped to
# word tokens so each POS tag / word becomes a usable class.  Order matters
# and matches the original chained .replace() calls exactly.
_CSS_SUBSTITUTIONS = [
    ('.', 'dot'), (',', 'comma'),
    ('(', 'marks'), (')', 'marks'), (':', 'marks'), (';', 'marks'),
]


def _css_safe(text):
    """Apply the punctuation-to-word substitutions used for CSS classes."""
    for old, new in _CSS_SUBSTITUTIONS:
        text = text.replace(old, new)
    return text


print(''' ''')

# t_wrapper (second wrapper)
print('''
''')

# Inline the treaty image as a base64 data URI.
with open('img/america.jpg', 'rb') as img_file:  # fixed: handle was never closed
    img_url = base64.b64encode(img_file.read()).decode('utf-8')
# NOTE(review): the {} placeholder for img_url sat inside the stripped <img>
# tag, so .format() is currently a no-op on the surviving text.
t_image = '''
Treaty of Paris

'''.format(img_url)
print(t_image)

# t_info box
print('''
''')
t_infotext = [
    ('Name of Treaty', 'Kiram-Bates Treaty'),
    ('Country of Origin', 'United States'),
    ('Signed', 'August, 1899'),
    ('Location', 'Jolo, Sultanate of Sulu'),
    ('Word Counts', '719'),
    ('Type', 'unilateral treaty'),
    ('Original Source', 'link'),
    ('Description', 'The Kiram-Bates Treaty was a treaty signed by the U.S.A. and the Sultanate of Sulu during the Philippine–American War. The treaty functioned to prevent the entry of the Sulu Sultanate into the Philippine-American War.'),
]
for t_title, t_info in t_infotext:
    print('''
{0}
{1}

'''.format(t_title, t_info))
print('''
''')

# Legend of the POS classes used in the tagged text.
print('''
  • stopwords
  • adjective
  • verb
  • noun
  • proper noun
  • adverb
  • possesive pronoun
  • present participle
  • adjective superlative
  • adverb comparative + superative
  • ''')

# Treaty text
print('''
    ''')
t_tokenized_all = []
for t_paragraph in russia_text_list:
    t_tokenized = word_tokenize(t_paragraph)
    t_tokenized_all += t_tokenized  # accumulate tokens for the frequency list
    t_tagged = pos_tag(t_tokenized)
    print('''

    ''')
    # Each token originally got a span whose classes were the CSS-safe POS
    # tag and the CSS-safe lowercased word; only the {2} literal (the word
    # itself) survived the markup stripping.  PRP$ is renamed first because
    # '$' is not CSS-safe either.
    for t_word, t_pos in t_tagged:
        pos_class = _css_safe(t_pos.replace('PRP$', 'PRPS'))
        word_class = _css_safe(t_word.replace('’', 'apostrophe')).lower()
        print('{2}'.format(pos_class, word_class, t_word))
    print('''

    ''')
print('''
    ''')

# treaty colonial top words list
print('''
    Frequent words
    ''')
# Frequency over all tokens, lowercased, excluding the custom stopwords.
# NOTE(review): only t_custom_stopwords is consulted here, not
# t_all_stopwords (see above).
t_tokens_without_stopwords = nltk.FreqDist(
    words.lower() for words in t_tokenized_all
    if words.lower() not in t_custom_stopwords
)
# NOTE(review): t_frequency_word is never used afterwards; kept for fidelity.
t_frequency_word = FreqDist(t_tokens_without_stopwords)
t_top_words = t_tokens_without_stopwords.most_common(20)
for t_chosen_words, t_frequency in t_top_words:
    print('''
     {} ({})
    '''.format(t_chosen_words, t_frequency))
print('''
    ''')

# at the end of wrapper
print('')
print('')
print('''''')