# from __future__ import division
from nltk import sent_tokenize, word_tokenize, pos_tag
from nltk.probability import FreqDist
from nltk.corpus import stopwords
import nltk
import codecs
import base64
# Make sure the NLTK stop-word corpus is available before it is read below.
nltk.download('stopwords')

# Load the treaty text and split it into paragraphs on blank lines.
# NOTE(review): the `russia_*` names do not match the Hong Kong file being
# read; they are kept because later code iterates `russia_text_list`.
# The text is decoded explicitly as UTF-8 so the result does not depend on the
# platform's default encoding (assumes the files are UTF-8 -- TODO confirm).
with open('treaty_file/hongkong.txt', 'r', encoding='utf-8') as russia_file:
    russia_text = russia_file.read()
russia_text_list = russia_text.split("\n\n")

# Stop words: NLTK's English list plus the project's own list, one per line.
t_default_stopwords = set(stopwords.words('english'))
# A context manager closes the file; the original left the handle open.
with open('t_stopwords.txt', 'r', encoding='utf-8') as t_stopwords_file:
    t_custom_stopwords = set(t_stopwords_file.read().splitlines())
t_all_stopwords = t_default_stopwords | t_custom_stopwords
# Start of the generated page.
# NOTE(review): the literal currently holds only a newline; the opening markup
# presumably belongs here and needs restoring -- confirm against the template.
# The string is now closed with a matching triple quote (it was closed with a
# single quote, which left the literal unterminated).
print('''
''')
# Read the title image and encode its bytes as base64 text.
# The context manager closes the file; the original never closed the handle.
with open('img/hk.jpg', 'rb') as t_image_file:
    img_url = base64.b64encode(t_image_file.read()).decode('utf-8')

# Title block. A triple-quoted literal is used because the text spans lines
# (a single-quoted literal cannot contain raw newlines).
# NOTE(review): the template has no `{}` field, so `img_url` is passed to
# format() but not inserted; the markup that referenced it is presumably
# missing and needs restoring.
t_image = '''
Hong Kong Letters Patent 1843
'''.format(img_url)
print(t_image)
# t_info box: (title, value) facts about the treaty, printed one per entry.
# NOTE(review): the opening/closing/row literals hold only a newline and the
# row template has no `{}` fields, so titles and values are passed to format()
# but not shown; the row markup is presumably missing and needs restoring.
print('''
''')
t_infotext = [
    ('Name of Treaty', 'Hong Kong Letters Patent 1843'),
    ('Country of Origin', 'United Kingdom'),
    # NOTE(review): leading space kept as-is; the day may be missing -- confirm.
    ('Signed', ' April, 1843'),
    ('Location', 'Westminster, U.K.'),
    ('Word Counts', '1,543'),
    ('Type', 'unilateral treaty'),
    # The value starts on a new line, written with an escape so the literal
    # stays on one source line.
    ('Original Source', '\nlink'),
    (
        'Description',
        'It was issued by Queen Victoria of the United Kingdom of Great '
        'Britain and Ireland in 1843 to establish the British Colony of '
        'Hong Kong in and over Hong Kong Island.',
    ),
]
for t_title, t_info in t_infotext:
    print('''
'''.format(t_title, t_info))
print('''
''')
# Legend: one label per highlighted word category.
# Spelling of "possessive" and "superlative" corrected in the displayed labels.
print('''
stopwords
adjective
verb
noun
proper noun
adverb
possessive pronoun
present participle
adjective superlative
adverb comparative + superlative
''')
# Treaty text: tokenise and POS-tag each paragraph, then print it word by word.
# NOTE(review): indentation was missing, so the nesting below (one open/close
# print around all paragraphs, one open/close print per paragraph) is inferred
# from the symmetry of the print calls -- confirm. The open/close literals hold
# only a newline and the word template uses only field {2}; the markup that
# used fields {0} and {1} is presumably missing and needs restoring.

# Punctuation is spelled out as names. A single translate() pass gives the same
# result as the chained replace() calls because no replacement text contains a
# character that is itself replaced.
_T_PUNCT_NAMES = {
    '.': 'dot',
    ',': 'comma',
    '(': 'marks',
    ')': 'marks',
    ':': 'marks',
    ';': 'marks',
}
_T_POS_TABLE = str.maketrans(_T_PUNCT_NAMES)
# Words additionally get the typographic apostrophe spelled out.
_T_WORD_TABLE = str.maketrans({'’': 'apostrophe', **_T_PUNCT_NAMES})

print('''
''')
t_tokenized_all = []
for t_paragraph in russia_text_list:
    t_tokenized = word_tokenize(t_paragraph)
    t_tokenized_all += t_tokenized  # collect every token across paragraphs
    t_tagged = pos_tag(t_tokenized)
    print('''
''')
    for t_word, t_pos in t_tagged:
        # 'PRP$' is renamed first, then punctuation tags are spelled out.
        t_pos_name = t_pos.replace('PRP$', 'PRPS').translate(_T_POS_TABLE)
        t_word_name = t_word.translate(_T_WORD_TABLE).lower()
        print('{2}'.format(t_pos_name, t_word_name, t_word))
    print('''
''')
print('''
''')
# Treaty colonial top words list: the 20 most frequent lower-cased tokens.
# NOTE(review): the open/close literals hold little or no text; the
# surrounding markup is presumably missing and needs restoring.
print('''
Frequent words
''')
# NOTE(review): only the custom stop words are filtered out here, although a
# combined `t_all_stopwords` set is built earlier -- confirm this is intended.
t_tokens_without_stopwords = nltk.FreqDist(
    words.lower()
    for words in t_tokenized_all
    if words.lower() not in t_custom_stopwords
)
# Copy of the distribution; not used by the code visible in this section.
t_frequency_word = FreqDist(t_tokens_without_stopwords)
t_top_words = t_tokens_without_stopwords.most_common(20)
for t_chosen_words, t_frequency in t_top_words:
    # Triple-quoted because each entry is printed on its own lines.
    print('''
{} ({})
'''.format(t_chosen_words, t_frequency))
print('''
''')
# End of the wrapper: three trailing fragments, each printed on its own line.
# NOTE(review): all three fragments are empty strings here; presumably closing
# markup belongs in them -- confirm against the template.
for closing_fragment in ('', '', ''):
    print(closing_fragment)