You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

107 lines
4.1 KiB
Python

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

# from __future__ import division
from nltk import sent_tokenize, word_tokenize, pos_tag
from nltk.probability import FreqDist
from nltk.corpus import stopwords
import nltk
import codecs
import base64
# Fetch the NLTK stopword corpus that stopwords.words('english') reads below.
nltk.download('stopwords')

# Read the whole treaty text and split it into paragraphs on blank lines.
# The encoding is stated explicitly so decoding does not depend on the
# platform's locale default (the generated page declares charset utf-8).
# The "russia_" names are kept because later code refers to them.
with open('treaty_file/hongkong.txt', 'r', encoding='utf-8') as treaty_file:
    russia_text = treaty_file.read()
russia_text_list = russia_text.split("\n\n")

# Stopword sets: NLTK's English list, the project's own list (one entry per
# line of t_stopwords.txt), and the union of the two.
t_default_stopwords = set(stopwords.words('english'))
# A context manager closes the file; the previous codecs.open(...).read()
# left the handle open.
with open('t_stopwords.txt', 'r', encoding='utf-8') as stopwords_file:
    t_custom_stopwords = set(stopwords_file.read().splitlines())
t_all_stopwords = t_default_stopwords | t_custom_stopwords
# Opening of the HTML document: doctype, the <head> section (scripts,
# stylesheets, charset, empty title) and the opening <body> tag.
# NOTE(review): the doctype has no "html" keyword and jQuery is loaded twice
# (3.5.0 and 3.5.1); both are reproduced unchanged here.
t_head_lines = (
    '<!DOCTYPE>',
    '<html>',
    '<head>',
    '<script src="https://code.jquery.com/jquery-3.5.0.min.js"></script>',
    '<script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>',
    '<link rel="stylesheet" href="estonia.css">',
    '<link rel="stylesheet" href="legend.css">',
    '<link rel="stylesheet" href="highlight.css">',
    '<script src="highlight.js"></script>',
    '<meta charset="utf-8">',
    '<title></title>',
    '</head>',
    '<body>',
)
print('\n'.join(t_head_lines))
# t_wrapper (second wrapper): open the page wrapper and the intro column.
print('<div class="t_wrapper"><div class="t_intro">')

# Inline the image as a base64 data URI. The file is read inside a context
# manager so the handle is closed once the bytes are encoded.
with open('img/hk.jpg', 'rb') as image_file:
    img_url = base64.b64encode(image_file.read()).decode('utf-8')
# A data URI carries a MIME type (image/jpeg), not a file path, before ";base64".
t_image = '<div class="t_img" style="position: fixed; background-color: gainsboro;">Hong Kong Letters Patent 1843</div><br><img class="t_image" src="data:image/jpeg;base64,{}">'.format(img_url)
print(t_image)

# t_info box: one row per (title, content) pair. The title is used both as
# the visible label and as the suffix of the row's class attribute.
print('<div class ="t_info">')
t_infotext = [
    ('Name of Treaty', 'Hong Kong Letters Patent 1843'),
    ('Country of Origin', 'United Kingdom'),
    ('Signed', ' April, 1843'),
    ('Location', 'Westminster, U.K.'),
    ('Word Counts', '1,543'),
    ('Type', 'unilateral treaty'),
    ('Original Source', '<a href="https://en.wikisource.org/wiki/Hong_Kong_Letters_Patent_1843" target="_blank">link</a>'),
    ('Description', 'It was issued by Queen Victoria of the United Kingdom of Great Britain and Ireland in 1843 to establish the British Colony of Hong Kong in and over Hong Kong Island.'),
]
for t_title, t_info in t_infotext:
    print('<div class="t_info-{0}"><div class="info_t_title" ><b>{0}</b></div><div class="t_info_content">{1}</div></div><br>'.format(t_title, t_info))

# Close t_info and t_intro; t_wrapper stays open for the sections that follow.
print('</div></div>')
# Legend: one <li> per highlight category, wrapped in a "legend" div.
# The class names (including their spellings) are reproduced exactly.
t_legend_items = [
    '<li class="legendhide eachlegend">stopwords</li>',
    '<li class="legendadjective eachlegend">adjective</li>',
    '<li class="legendverb eachlegend">verb</li>',
    '<li class="legendnoun eachlegend">noun</li>',
    '<li class="legendpropernoun eachlegend">proper noun</li>',
    '<li class="legendadverb eachlegend">adverb</li>',
    '<li class="legendpossesivepronoun eachlegend">possesive pronoun</li>',
    '<li class="legendpresentparticiple eachlegend">present participle</li>',
    '<li class="legendadjectivesuperlative eachlegend">adjective superlative</li>',
    '<li class="legendadverb-comparative-superative eachlegend">adverb comparative + superative</li>',
]
# The empty first and last elements reproduce the blank lines that surround
# the markup in the output.
print('\n'.join([''] + ['<div class="legend">'] + t_legend_items + ['</div>'] + ['']))
# Treaty text: each paragraph becomes a <p>, and each token a <span> whose
# classes are its POS tag, its lower-cased form and "eachwords".
# Punctuation is spelled out as a word before it is placed in the class
# attribute.
t_punctuation_names = {
    '.': 'dot',
    ',': 'comma',
    '(': 'marks',
    ')': 'marks',
    ':': 'marks',
    ';': 'marks',
}
t_pos_class_table = str.maketrans(t_punctuation_names)
# Words additionally get apostrophes spelled out. The previous code called
# replace('', 'apostrophe'), which matches the empty string and therefore
# inserted "apostrophe" between every character of every word.
t_word_class_table = str.maketrans(
    dict(t_punctuation_names, **{"'": 'apostrophe', '\u2019': 'apostrophe'})
)

print('<div class="t_paragraph">')
t_tokenized_all = []
for t_paragraph in russia_text_list:
    t_tokenized = word_tokenize(t_paragraph)
    t_tokenized_all += t_tokenized  # collected for the frequency list printed later
    t_tagged = pos_tag(t_tokenized)
    print('<p>')
    for t_word, t_pos in t_tagged:
        # 'PRP$' is renamed first so the '$' does not end up in the class name.
        t_pos_class = t_pos.replace('PRP$', 'PRPS').translate(t_pos_class_table)
        t_word_class = t_word.translate(t_word_class_table).lower()
        print('<span class="{0} {1} eachwords">{2}</span>'.format(t_pos_class, t_word_class, t_word))
    print('</p>')
print('</div>')
# Treaty colonial top words list: the 20 most frequent lower-cased tokens
# that are not in the custom stopword list, each printed with its count.
print('<div class="t_top_words"><div class="t_top_words_title"><b>Frequent words</b></div>')

# NOTE(review): only t_custom_stopwords is excluded here, not the combined
# t_all_stopwords set — confirm this is intended.
# Despite its name, t_tokens_without_stopwords is a frequency distribution
# (token -> count), not a list of tokens. The extra FreqDist copy of it that
# used to be built here was never read and has been dropped.
t_tokens_without_stopwords = nltk.FreqDist(
    token
    for token in map(str.lower, t_tokenized_all)
    if token not in t_custom_stopwords
)
t_top_words = t_tokens_without_stopwords.most_common(20)
for t_chosen_words, t_frequency in t_top_words:
    print('<div class="t_chosen_words" >&nbsp;{}&nbsp;({}) </div>'.format(t_chosen_words, t_frequency))

# At the end of the wrapper: only t_top_words and t_wrapper are still open at
# this point, so exactly two closing tags are emitted (three further stray
# </div> tags were printed before).
print('</div></div>')
print('''</body></html>''')