# from __future__ import division
from nltk import sent_tokenize, word_tokenize, pos_tag
from nltk.probability import FreqDist
from nltk.corpus import stopwords
import nltk
import codecs
import base64

# word_tokenize and pos_tag need these models in addition to the stopword list
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

# Read the treaty text and split it into paragraphs on blank lines
with open('treaty_file/uk-korea.txt', 'r') as russia_file:
    russia_text = russia_file.read()
russia_text_list = russia_text.split("\n\n")

# NLTK's default English stopwords plus the project's custom stopword list
t_default_stopwords = set(stopwords.words('english'))
with codecs.open('t_stopwords.txt', 'r') as t_stopwords_file:
    t_custom_stopwords = set(t_stopwords_file.read().splitlines())
t_all_stopwords = t_default_stopwords | t_custom_stopwords

# The prints below emit the page markup; tag and class names follow the t_ comments
# in this script and are indicative rather than exact.
print('''<!DOCTYPE html>
<html>
<head><meta charset="utf-8"></head>
<body>''')

#t_wrapper (second wrapper)
print('<div class="t_wrapper">')
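# Not part of the source: a minimal check of the paragraph split above, run on a
# made-up two-paragraph string instead of the treaty file (names prefixed with _
# are illustrative only).
_sample = 'Article I. First paragraph.\n\nArticle II. Second paragraph.'
assert _sample.split("\n\n") == ['Article I. First paragraph.',
                                 'Article II. Second paragraph.']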
# Embed the treaty image in the page as a base64 data URI
with open('img/america.jpg', 'rb') as img_file:
    img_url = base64.b64encode(img_file.read()).decode('utf-8')
t_image = ('<div class="t_image">United Kingdom–Korea<br>Treaty of 1883<br>'
           '<img src="data:image/jpeg;base64,{0}"></div>').format(img_url)
print(t_image)

#t_info box
print('<div class="t_infobox">')
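# Not in the source: the data-URI pattern t_image relies on, shown on a few literal
# bytes instead of the image file (_sample_* names are illustrative only).
_sample_b64 = base64.b64encode(b'\x89PNG').decode('utf-8')
assert _sample_b64 == 'iVBORw=='
_sample_uri = 'data:image/png;base64,' + _sample_b64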
t_infotext = [('Name of Treaty', 'United Kingdom–Korea Treaty of 1883'),
              ('Country of Origin', 'United Kingdom'),
              ('Signed', 'November, 1883'),
              ('Location', 'Hanyang, Korea'),
              ('Word Count', '3,357'),
              ('Type', 'unilateral treaty'),
              ('Original Source', 'link'),
              ('Description', 'Under the treaty, Great Britain obtained extraterritorial '
               'rights in Korea, and from 1883 to 1910 British subjects in Korea were not '
               'subject to the jurisdiction of Korean courts.')]

# One title/value row per infobox entry
for t_title, t_info in t_infotext:
    print('<div class="t_infotitle">{0}</div><div class="t_info">{1}</div>'.format(t_title, t_info))
print('</div>')  # close the infobox
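# Not in the source: if an infobox value ever contains <, > or &, escaping it keeps
# the generated markup valid; the standard-library html.escape does this.
import html
assert html.escape('rights & privileges') == 'rights &amp; privileges'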
# Legend of the word categories highlighted in the treaty text
print('''<div class="t_legend">
  • stopwords
  • adjective
  • verb
  • noun
  • proper noun
  • adverb
  • possessive pronoun
  • present participle
  • adjective superlative
  • adverb comparative + superlative
</div>''')

#Treaty text
print('<div class="t_text">')
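# Not in the source: pos_tag returns (word, tag) pairs using the Penn Treebank tag
# set; for a short sample the output typically looks like
# [('Korea', 'NNP'), ('signed', 'VBD'), ('the', 'DT'), ('treaty', 'NN'), ('.', '.')].
_sample_tags = pos_tag(word_tokenize('Korea signed the treaty.'))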
# Tokenise and POS-tag each paragraph; every word becomes a span whose classes
# encode its POS tag and a CSS-safe, lower-cased form of the word itself
t_tokenized_all = []
for t_paragraph in russia_text_list:
    t_tokenized = word_tokenize(t_paragraph)
    t_tokenized_all += t_tokenized  # collect all tokens for the frequency list below
    t_tagged = pos_tag(t_tokenized)
    print('<p>')
    for t_word, t_pos in t_tagged:
        print('<span class="{0} {1}">{2}</span>'.format(
            t_pos.replace('PRP$', 'PRPS').replace('.', 'dot').replace(',', 'comma')
                 .replace('(', 'marks').replace(')', 'marks')
                 .replace(':', 'marks').replace(';', 'marks'),
            t_word.replace('’', 'apostrophe').replace('.', 'dot').replace(',', 'comma')
                  .replace('(', 'marks').replace(')', 'marks')
                  .replace(':', 'marks').replace(';', 'marks').lower(),
            t_word))
    print('</p>')
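# Not in the source: the two replace() chains above could be factored into a single
# helper; a sketch of that refactor (_css_safe is a hypothetical name, and the word
# form is additionally lower-cased in the loop above).
def _css_safe(token):
    for old, new in (('PRP$', 'PRPS'), ('’', 'apostrophe'), ('.', 'dot'), (',', 'comma'),
                     ('(', 'marks'), (')', 'marks'), (':', 'marks'), (';', 'marks')):
        token = token.replace(old, new)
    return token

assert _css_safe('U.K.') == 'UdotKdot'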
print('</div>')  # close the treaty text block

#treaty colonial top words list
print('<div class="t_topwords">Frequent words')

# Frequency count over all tokens, filtered against the custom stopword list only
# (t_all_stopwords is built above but not applied here)
t_tokens_without_stopwords = nltk.FreqDist(words.lower() for words in t_tokenized_all
                                           if words.lower() not in t_custom_stopwords)
t_frequency_word = FreqDist(t_tokens_without_stopwords)  # kept from the source; unused below
t_top_words = t_tokens_without_stopwords.most_common(20)

for t_chosen_words, t_frequency in t_top_words:
    print('<div class="t_word">{} ({})</div>'.format(t_chosen_words, t_frequency))
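# Not in the source: most_common returns (token, count) pairs in descending order of
# frequency, illustrated here on a toy token list rather than the treaty tokens.
_toy = nltk.FreqDist(['treaty', 'korea', 'treaty'])
assert _toy.most_common(2) == [('treaty', 2), ('korea', 1)]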
print('</div>')  # close t_topwords

# at the end of wrapper
print('</div>')  # close t_wrapper
print('</body>')
print('</html>')