worked on tag_comparison.py

master
bootje 5 years ago
parent cba64a7b99
commit 778221bf41

BIN
website/.DS_Store vendored

Binary file not shown.

@ -0,0 +1,14 @@
<!DOCTYPE html>
<html>
<head>
<title></title>
</head>
<body>
About:
Source:
https://afrolegends.com/2016/12/14/colonial-treaties-in-africa-british-protection-treaty-with-the-itsekiri-of-nigeria-1884/
</body>
</html>

@ -1,30 +0,0 @@
import nltk
# Load the FaceApp terms of service. The context manager closes the handle
# as soon as the text is in memory (the original never closed it).
with open('faceapp.txt', 'r') as file:
    raw = file.read()
tokens = nltk.word_tokenize(raw)
faceapp = nltk.Text(tokens)

# Stopwords are common words that should not be counted, like "a", "an", "the".
with open('stopwords.txt') as stopword_file:
    stopwords = {line.strip() for line in stopword_file}

# Characters removed from every word so that "book" and "book!" count as the
# same word. A single translate() pass replaces the chain of replace() calls;
# the character set is exactly the one the original stripped.
_PUNCTUATION = str.maketrans('', '', '.,:"!“‘*()')

# Maps each normalised, non-stopword word to its number of occurrences.
# The original normalised every word but discarded the result, leaving this
# dictionary empty.
wordcount = {}
for word in raw.lower().split():
    word = word.translate(_PUNCTUATION)
    # Skip tokens that consisted only of punctuation, and stopwords.
    if not word or word in stopwords:
        continue
    wordcount[word] = wordcount.get(word, 0) + 1

# Print every occurrence of "a" with its surrounding context.
faceapp.concordance('a')

@ -26,10 +26,20 @@
url("http://bohyewoo.com/webfonts/belgika/belgika-8th-webfont.svg#filename") format("svg");
}
@font-face {
font-family: "SourceCodePro";
src: url("http://bohyewoo.com/webfonts/Source_Code_Pro/SourceCodePro-Regular.ttf");
}
body {
background-color: whitesmoke;
# font-family: Belgika;
# font-weight: 8th;
letter-spacing: 0.5px;
# letter-spacing: -0.3px;
font-size: 20px;
line-height: 1.2;
@ -57,9 +67,10 @@
}
.paragraph {
font-family: helvetica;
font-family: SourceCodePro;
font-weight: regular;
width: 70%;
letter-spacing: -0.5px;
width: 50%;
float: right;
}
@ -5694,9 +5705,22 @@
<span class="CC">or</span>
<span class="NN">entity</span>
<span class=".">.</span>
<span class="NN">Platform</span>
<span class=":">:</span>
<span class="NNP">FaceApp</span>
<span class="NN">https</span>
<span class=":">:</span>
<span class="JJ">//www.faceapp.com</span>
<span class="NNP">Original</span>
<span class="NN">text</span>
<span class="IN">from</span>
<span class=":">:</span>
<span class="NN">https</span>
<span class=":">:</span>
<span class="JJ">//www.faceapp.com/privacy-en.html</span>
</span>
</div>
<FreqDist with 957 samples and 2523 outcomes>
<FreqDist with 960 samples and 2530 outcomes>
<div class="top_words"> colonial words:
<br><span class="chosen_words">services(69) </span>
<br><span class="chosen_words">agreement(60) </span>

@ -132,3 +132,4 @@ You may not use, export, import, or transfer all or any portion of the Services
21. Miscellaneous
In accordance with California Civil Code section 1789.3, you may report complaints to the Complaint Assistance Unit of the Division of Consumer Services of the California Department of Consumer Affairs by contacting them in writing at 400 R Street, Sacramento, CA 95814, or by telephone at (800) 952-5210. This Agreement constitutes the entire agreement between you and FaceApp relating to your access to and use of our Services. The failure of FaceApp to exercise or enforce any right or provision of this Agreement will not operate as a waiver of such right or provision. The section titles in this Agreement is for convenience only and have no legal or contractual effect. Except as otherwise provided herein, this Agreement is intended solely for the benefit of the parties and are not intended to confer third party beneficiary rights upon any other person or entity.

@ -0,0 +1,6 @@
Platform: FaceApp https://www.faceapp.com
Initial release: December 31, 2016
Type:Image editing
Description: FaceApp is a mobile application for iOS and Android developed by Russian company Wireless Lab. The app generates highly realistic transformations of human faces in photographs by using neural networks based on artificial intelligence.[1][2][3] The app can transform a face to make it smile, look younger, look older, or change gender.
Original Terms of Service: https://www.faceapp.com/privacy-en.html

@ -0,0 +1,11 @@
[('platform', 'FaceApp'), ('Type', 'Image editing'), ('Initial release', 'December 31, 2016'), ('Type', 'Image editing')]
Platform, FaceApp https://www.faceapp.com
Initial release: December 31, 2016
Type:Image editing
Description: FaceApp is a mobile application for iOS and Android developed by Russian company Wireless Lab. The app generates highly realistic transformations of human faces in photographs by using neural networks based on artificial intelligence.[1][2][3] The app can transform a face to make it smile, look younger, look older, or change gender.
Original Terms of Service: https://www.faceapp.com/privacy-en.html

Binary file not shown.

Binary file not shown.

After

Width:  |  Height:  |  Size: 15 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 75 KiB

File diff suppressed because it is too large Load Diff

@ -24,7 +24,8 @@ all_stopwords = default_stopwords | custom_stopwords
print('''<!DOCTYPE html>
print(
'''<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
@ -52,15 +53,35 @@ print('''<!DOCTYPE html>
url("http://bohyewoo.com/webfonts/belgika/belgika-8th-webfont.svg#filename") format("svg");
}
@font-face {
font-family: "SourceCodePro";
src: url("http://bohyewoo.com/webfonts/Source_Code_Pro/SourceCodePro-Regular.ttf");
}
body {
background-color: whitesmoke;
# font-family: Belgika;
# font-weight: 8th;
letter-spacing: 0.5px;
# letter-spacing: -0.3px;
font-size: 20px;
line-height: 1.2;
}
.info {
font-family: Belgika;
font-weight: 8th;
font-size: 10pt;
width: 20%;
float: left;
border: 1px solid black;
}
.NNP {
@ -68,6 +89,7 @@ print('''<!DOCTYPE html>
}
.VBP {
}
.VBP:hover {
@ -83,10 +105,11 @@ print('''<!DOCTYPE html>
}
.paragraph {
font-family: helvetica;
font-family: SourceCodePro;
font-weight: regular;
width: 70%;
float: right;
letter-spacing: -0.5px;
width: 50%;
float: left;
}
.top_words {
@ -99,53 +122,42 @@ print('''<!DOCTYPE html>
</style>
</head>
<body>''')
<body>'''
)
# my stopwords are common words I don't want to count, like "a", "an", "the".
#info box
print('<div class ="info">')
infotext = [('service', 'FaceApp'), ('Type', 'Image editing'), ('Initial release', 'December 31 2016'), ('Type', 'Image editing'), ('source', '<a href="https://www.faceapp.com/terms-en.html">link</a>')]
print('<div class ="paragraph">')
# for sentence in sent_tokenize(text):
print('<span>')
for title, info in infotext:
print('<span class="info-{0}">{0}:{1}</span><br>'.format(title, info))
print('</div>')
#ToS text
print('<div class ="paragraph">')
tokenized = word_tokenize(text)
tagged = pos_tag(tokenized)
# for HTML
for word, pos in tagged:
print('<span class="{}">{}</span>'.format(pos, word))
print('</span>')
print('</div>')
# filtering stopwords
tokens_without_stopwords = nltk.FreqDist(words.lower() for words in tokenized if words.lower() not in all_stopwords)
print(tokens_without_stopwords)
# for read_whole_text in tokens_without_stopwords:
# whole_text_tokenized =
# print(whole_text_tokenized)
# #filtered words in sentence
# filtered_sentence = (" ").join(tokens_without_stopwords)
# print(filtered_sentence)
#colonial words list
print('<div class="top_words"> colonial words:')
tokens_without_stopwords = nltk.FreqDist(words.lower() for words in tokenized if words.lower() not in all_stopwords)
frequency_word = FreqDist(tokens_without_stopwords)
top_words = tokens_without_stopwords.most_common(100)
for chosen_words, frequency in top_words:
print('<br><span class="chosen_words">{}({}) </span>'.format(chosen_words, frequency))
# new_html = open('output.html', 'wb') # open the output file
# new_html.write('''</div></body></html>''')
# new_html.close() # close the output file
print('''</div></body></html>''')

@ -0,0 +1,61 @@
Article I
The state of war between the Contracting Parties will end on the day on which the present Treaty of Peace comes into force.
Article II
In consequence of the right of all peoples to self-determination, to the point of seceding completely from the State of which they form part, a right proclaimed by the Socialist and Federal Russian Republic of the Soviets, Russia unreservedly recognises the independence and sovereignty of the State of Estonia, and renounces voluntarily and for ever all sovereign rights possessed by Russia over the Estonian people and territory whether these rights be based on the juridical position that formerly existed in public law, or in the international treaties which, in the sense here indicated, lose their validity in future.
From the fact that Estonia has belonged to Russia, no obligation whatsoever will fall on the Estonian people and land to Russia.
Article III
1. The frontier between Esthonia and Russia follows the following line:
Starting from the Bay of Narva, one verst south of the Fishers House, it turns toward Ropscha, then follows the course of the Rivers Mertvitskaja and Rosson, as far as the village of Ilkino, from Ilkino one verst west of the village Keikino, half a verst west of the village of Isvosi and turns towards the village of Kobõljaki; it then crosses the mouth of the river Schtschutschka, passes by Krivaja Luka, by the estate of Petschurki, to the confluence of three sources of the river Vtroja, follows the southern boundary of the village of Kuritschek with its dependences, then turns in a straight line as far as the centre line of the Lake Peipus, follows it in a southern direction, and thus passes one verst to the east of Piirisaar (Pork); follows the narrow strip of the lake dividing it along the middle as far as the island of Salu, thence passes through Lake Pihkva (Pskov) between the Islands of Talabski and the Island of Kamonka, then to the east of the village of Poddubje (on the southern bank of Lake Pihkva) and to the outlook post on the railway situated near to Grjardischtsche, then passes successively to the west of the Village of Schahintsõi, to the east of Novaja, across the Lake Poganova, between the villages of Babina and Vomorski, to one verst and a half to the south of the Forestry keepers house (which is situated to the north of Glybotschina) to Sprechtitschi and to the Farm Kudepi.
Note 1. The Frontier described in this Article is shown in red on the map, scale of three versts to the inch (0.0254 metre), which constitutes the first Annex to Article 3. In case of difference between the text and the map, it is to the text that one must adhere.
Note 2. The tracing of the boundary between the two contracting countries and the placing in position of the frontier signs will be carried out under the direction of a special mixed commission composed of an equal number of members from both parties. In marking the boundary line this mixed commission will decide the allocation of inhabited areas on the frontier to one or other of the parties according to ethnographical indications and bearing in mind economic agreements and agricultural consideration.
2. The portion of the territory of Esthonia to the east of the Narova, the River Narova itself, and the islands in the midst of the stream, as well as the zone to the south of Lake Pihkva, which is situated between the boundary above mentioned and the line of villages, Borok-Smolni-Belkova-Sprechtitschi, will be, from a military point of view, considered as neutral until 1 January 1922.
Esthonia undertakes to maintain no troops of any kind in the neutralized zones other than those which are necessary for the frontier service and the maintenance of order, and of which the strength is laid down in Annex 2 of the present Article; not to construct fortifications or observation posts, nor to constitute military depots, nor to deposit any kind of war material whatsoever with the exception of what is indispensable for the effectives allowed for; nor to establish there bases or depots for the use of any kind of vessels, or of any kind of aerial fleet.
3. Russia for her part undertakes not to maintain troops in the region of Pskov to the west of the line: western bank of the mouth of Velikaja, the villages of Sivtseva, Luhnova, Samulina, Schalki and Sprechtitschi until 1 January 1922, which are indispensable for the frontier service and for the maintenance of order and for the effectives provided for in Annex 2 of the present Article.
4. The contracting parties undertake to have no armed vessels whatsoever on Lakes Peipus and Pihkva.
Article IV
During one year from the day of ratification of the present Treaty, persons of non-Estonian origin living in Estonia, and over eighteen years of age, have the right to opt for Russian nationality; women, and children, less than eighteen years of age, take the nationality of the husband or the father, unless there exists between man and wife any contrary agreement. The people who have opted for Russian nationality must, within a year from the date of their choice, leave Estonian territory; but they maintain their rights over the property and can take with them their movable property. In the same way persons of Estonian origin living in Russia can opt for Estonian nationality within the same length of time and under the same conditions.
Each of the contracting Governments reserves the right to refuse acceptance to its citizenship of such persons.
Note. In case of doubt about the origin of persons, all those who could have been personally registered or whose parents would have been registered in a rural or urban community, or in a “class” on the territory now composing the State of Estonia, shall be considered as Estonians.
Article V
In case the perpetual neutrality of Estonia should be internationally recognised, Russia undertakes to respect this neutrality and to take part in guaranteeing it.
Article VI
Should the Gulf of Finland be neutralised, the two contracting Parties undertake to accede to this neutralisation, on conditions determined by common agreement by all the States concerned, and established by the international acts relating thereto; should the international convention referred to be concluded, they also undertake to put their naval forces, or part thereof, into such conditions as this international convention may require.
Article VII
The two Contracting Parties undertake:
1. To prohibit the presence in their territory of any troops with the exception of those of their own Government or of friendly States with whom one of the Contracting Parties may have concluded a military convention, but which are not de facto in a state of war with one of the Contracting Parties, and also to prohibit within the limits of their territory, the recruiting and mobilisation of particular corps by States, organisations and groups, intended for armed conflict against the other Contracting Parties.
2. To disarm those military and naval forces within their territory which did not belong to one of the Contracting Parties on the first of October, One thousand nine hundred and nineteen; to neutralise and immobilise, until the first of January, One thousand nine hundred and twenty-two, all property, artillery and commissariat material (exclusive of food and clothing), engineering and aviational material, i. e., guns, machine guns, rifles, side-arms, munitions, aeroplanes, armoured vehicles, tanks, armoured trains, etc., belonging to the military and naval forces referred to with the exception of such technical and war material as was handed over to these forces, but which belongs to the Contracting Parties or to other States, any portion of this material which belongs to other States shall be returned within six months from the date of the ratification of this Treaty. The disarmament of the above-mentioned irregular military and naval forces, and the immobilisation and neutralisation of their military stocks and of all their technical and war material must be completed: the first 30 per cent. of men and material within the first seven days after the ratification of the present Treaty of Peace, and the remainder within the two following weeks at the rate of thirty-five per cent per week.
3. To prohibit the soldiers and officers of the irregular troops who are subject to disarmament under the conditions laid down by the preceding Point (2) from joining in any capacity whatsoever, as volunteers, the ranks of the Government troops of the Contracting Parties, with the exception of:
(a) Persons of Estonian nationality who reside outside Estonia, but who have opted for that country;
(b) Persons not of Estonian nationality who resided in Estonia before the first of May, One thousand nine hundred and nineteen, and who have not opted to Russia;
(c) Persons not of Estonian nationality who have not opted for Russia and who served in the Estonian army before the twenty-seventh of November, One thousand nine hundred and nineteen.
Persons belonging to the classes mentioned in sub-divisions (a), (b), and (c), may serve with the armies of the Estonian Government.
4. (a) To prohibit any State, which is de facto in a state of war with one of the Contracting Parties, and any organisations or groups intended for an armed conflict with one of the Contracting Parties, for transporting through their ports or their territories anything which might be used for attacking the other Contracting Party, particularly armed forces belonging to these States, organisations or groups, and any article and war material used for artillery, commissariat, engineering, aviation, etc., which may belong to these military formations.
4. (b) To prohibit, exclusive of cases provided for by international law, the launching and navigation in their territorial waters of any warships, gun-boats, mine-layers, etc., belonging either to organisations or groups which are intended to fight against the other Contracting Party or to States which are in a state of war with this party, if those ships are intended to attack it, and if this intention is known to the Contracting Party to whose territory the ports and territorial waters so used belong.
5. Not to authorise the formation or presence in their territory of any organisation or groups whatsoever, which claim to govern the whole or part of the territory of the other Contracting Party, or the presence of representatives or officials of organisations and groups, whose object it is to overthrow the Government of the other party to the Treaty.
6. The Governments of the two Contracting Parties undertake, upon exchanging the ratification of this Treaty of Peace to furnish each other with precise information regarding the strength of the non-Governmental and also of military stocks (stationary or mobile), and of the military and technical material belonging to those irregular forces, which were on their territory at the time when the Armistice was concluded, that is to say, on the thirty-first of December, One thousand nine hundred and nineteen.
7. In order to superintend the carrying out of the military guarantees conceded to each other by the Contracting Parties, a mixed commission is to be established whose composition, rights and obligations shall be determined by “Instructions” which are annexed to the present Article.

@ -3,6 +3,7 @@
(
)
"
faceapp
:
;

@ -0,0 +1,24 @@
# # for new_file in tokens_without_stopwords:
# appendFile = open('tokenized_words.txt', 'a')
# appendFile.write(" " + new_file)
# appendFile.close()
# #shows only stopwords
# processed_word_list = []
# for word in tokenized:
# # print(word)
# if word not in all_stopwords:
# processed_word_list.append('*')
# else:
# processed_word_list.append(word)
# print(processed_word_list)
# # # result putting in a graph
# top_words_plot = frequency_word.plot(10)
# print(top_words_plot)

@ -0,0 +1,11 @@
from bs4 import BeautifulSoup
from urllib.parse import quote as urlquote, unquote as urlunquote
# Read the document to inspect. The original printed `read` before it was
# assigned, passed the already-open file object to open() as if it were a
# path (in write mode, then tried to read from it), and parsed an undefined
# name `text`. Opening once, read-only, fixes all three.
# NOTE(review): 'tag_comparison.py' is a Python script, not an HTML page —
# confirm whether this should point at the generated HTML output instead.
with open('tag_comparison.py', 'r') as file:
    read = file.read()
x = 1  # NOTE(review): not referenced in the visible code — confirm before removing
print(read)

# Parse the text and fetch the first <span> whose CSS class is "NN".
# find() takes the tag name first; the class is passed via `class_`.
html = BeautifulSoup(read, 'html.parser')
line = html.find('span', class_='NN')

File diff suppressed because one or more lines are too long

@ -0,0 +1,310 @@
from __future__ import division
import glob
from nltk import *
import re
import nltk
import codecs
from nltk import sent_tokenize, word_tokenize, pos_tag
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from PIL import Image
import base64
nltk.download('stopwords')
# infofile = open('faceapp_infos.txt','r')
# infotext = infofile.read()
# Read the FaceApp terms of service. The context manager closes the handle
# once the text is in memory, and the explicit encoding keeps the result
# independent of the platform's default encoding.
with open('faceapp.txt', 'r', encoding='utf-8') as file:
    text = file.read()

x = 1  # NOTE(review): not referenced in the visible code — confirm before removing

# Read the Russia–Estonia treaty text that is rendered next to the ToS.
with open('russia-estonia.txt', 'r', encoding='utf-8') as t_file:
    t_text = t_file.read()

# Stopwords: NLTK's English list plus the project's own list, which is read
# one entry per line from stopwords.txt.
default_stopwords = set(stopwords.words('english'))
with open('stopwords.txt', 'r', encoding='utf-8') as stopword_file:
    custom_stopwords = set(stopword_file.read().splitlines())
all_stopwords = default_stopwords | custom_stopwords
# Emit the static start of the page: document head, stylesheet, and the
# "Noun" checkbox with its toggle script. <body> is opened here and closed by
# the final print at the end of the script.
# Disabled declarations are now wrapped in /* ... */ because CSS has no `#`
# line comments, and `font-weight: regular` became the valid keyword `normal`.
# NOTE(review): `font-weight: 8th` is not a valid CSS font-weight value —
# confirm how the intended Belgika cut should be selected.
print('''<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title></title>
<style>
@font-face {
font-family: "Belgika";
src: url("http://bohyewoo.com/webfonts/belgika/belgika-40th-webfont.eot");
src: url("http://bohyewoo.com/webfonts/belgika/belgika-40th-webfont.woff") format("woff"),
url("http://bohyewoo.com/webfonts/belgika/belgika-40th-webfont.svg#filename") format("svg");
}
@font-face {
font-family: "Belgika";
src: url("http://bohyewoo.com/webfonts/belgika/belgika-16th-webfont.eot");
src: url("http://bohyewoo.com/webfonts/belgika/belgika-16th-webfont.woff") format("woff"),
url("http://bohyewoo.com/webfonts/belgika/belgika-16th-webfont.svg#filename") format("svg");
}
@font-face {
font-family: "Belgika";
src: url("http://bohyewoo.com/webfonts/belgika/belgika-8th-webfont.eot");
src: url("http://bohyewoo.com/webfonts/belgika/belgika-8th-webfont.woff") format("woff"),
url("http://bohyewoo.com/webfonts/belgika/belgika-8th-webfont.svg#filename") format("svg");
}
@font-face {
font-family: "SourceCodePro";
src: url("http://bohyewoo.com/webfonts/Source_Code_Pro/SourceCodePro-Regular.ttf");
}
body {
background-color: ghostwhite;
/* font-family: Belgika; */
/* font-weight: 8th; */
/* letter-spacing: -0.3px; */
font-size: 14px;
line-height: 1.2;
}
.tos_wrapper {
width: 50%;
float: left;
}
.t_wrapper {
width: 50%;
float: right;
}
.NNP {
background-color: pink;
}
.VBP {
}
.VBP:hover {
background-color: gold;
}
.NN {
background-color: LightSkyBlue;
}
.NNS {
background-color: Aquamarine;
}
.t_img {
font-family: SourceCodePro;
font-size: 10pt;
float: left;
}
.info {
font-family: SourceCodePro;
font-size: 10pt;
width: 60%;
float: left;
border: 1px solid black;
padding:10px;
margin-bottom: 50px;
}
.t_info {
font-family: SourceCodePro;
font-size: 10pt;
width: 90%;
float: left;
border: 1px solid black;
padding:10px;
margin-bottom: 50px;
}
.paragraph {
font-family: SourceCodePro;
font-weight: normal;
letter-spacing: -0.5px;
width: 70%;
float: right;
}
.t_paragraph {
font-family: SourceCodePro;
font-weight: normal;
letter-spacing: -0.5px;
width: 70%;
float: right;
}
.top_words {
font-family: Belgika;
font-weight: 8th;
font-size: 9pt;
width: 15%;
float: left;
}
.t_top_words {
font-family: Belgika;
font-weight: 8th;
font-size: 9pt;
width: 15%;
float: left;
}
</style>
</head>
<body>
<input type="checkbox" id="myCheck" onclick="myFunction()"> Noun
<p id="text" style="display:none">Checkbox is CHECKED!</p>
<script>
function myFunction() {
// Get the checkbox
var checkBox = document.getElementById("myCheck");
// Get the output text
var text = document.getElementById("text");
// If the checkbox is checked, display the output text
if (checkBox.checked == true){
text.style.display = "block";
} else {
text.style.display = "none";
}
}
</script>
''')
# Left-hand column: FaceApp logo, info box, POS-tagged terms of service and
# the most frequent non-stopword tokens. The wrapper <div> opened here is
# closed by the print that opens the treaty column.
from html import escape

print('<div class ="tos_wrapper">')

# Logo, inlined as a base64 data URI. The URI now carries a real MIME type
# (image/png) instead of the file path, the file handle is closed after
# reading, and the previously unclosed <h1> is terminated before the image.
with open('img/faceapp_logo.png', 'rb') as logo_file:
    FaceApp_img_url = base64.b64encode(logo_file.read()).decode('utf-8')
FaceApp_image = '<div class="t_img"><h1>FaceApp</h1><img style="width:90%" src="data:image/png;base64,{}"></div>'.format(FaceApp_img_url)
print(FaceApp_image)

# Info box. Values are written verbatim because some contain intentional
# markup (the source link). The duplicated ('Type', ...) row was dropped.
print('<div class ="info">')
infotext = [
    ('Service', 'FaceApp'),
    ('Type', 'Image editing'),
    ('Initial release', 'December 31, 2016'),
    ('source', '<a href="https://www.faceapp.com/terms-en.html">link</a>'),
    ('Description', 'FaceApp is a mobile application for iOS and Android developed by Russian company Wireless Lab. The app generates highly realistic transformations of human faces in photographs by using neural networks based on artificial intelligence. The app can transform a face to make it smile, look younger, look older, or change gender.'),
]
for title, info in infotext:
    print('<span class="info_{0}"><div class="info_title" style="-webkit-text-decoration-line: underline; text-decoration-line: underline;" >{0}</div><div class="info_content">{1}</div></span><br>'.format(title, info))
print('</div>')

# ToS text: one <span> per token, with the part-of-speech tag as CSS class.
# Both values come from the input text, so they are HTML-escaped to keep
# characters such as < > & " from breaking the markup.
print('<div class ="paragraph">')
tokenized = word_tokenize(text)
tagged = pos_tag(tokenized)
for word, pos in tagged:
    print('<span class="{}">{}</span>'.format(escape(pos), escape(word)))
print('</div>')

# The 20 most frequent lower-cased tokens that are not stopwords.
print('<div class="top_words" > <span style="-webkit-text-decoration-line: underline; text-decoration-line: underline;" >colonial words:</span>')
tokens_without_stopwords = nltk.FreqDist(words.lower() for words in tokenized if words.lower() not in all_stopwords)
# Copy of the distribution; not used below, but referenced by the plotting
# code that is commented out elsewhere in the project.
frequency_word = FreqDist(tokens_without_stopwords)
top_words = tokens_without_stopwords.most_common(20)
for chosen_words, frequency in top_words:
    print('<br><span class="chosen_words" >{}({}) </span>'.format(escape(chosen_words), frequency))
print('</div>')
# Right-hand column: treaty photo, info box, POS-tagged treaty text and the
# most frequent non-stopword tokens. The first print closes the preceding
# ToS wrapper before opening this one.
from html import escape

print('</div><div class="t_wrapper">')

# Photo, inlined as a base64 data URI.
# https://upload.wikimedia.org/wikipedia/commons/1/15/Joffe_signing_the_Treaty_of_Tartu.jpg
# The heading is now closed with </h1> (the original opened a second <h1>),
# the URI carries a real MIME type, and the file handle is closed.
with open('img/tartu.jpeg', 'rb') as photo_file:
    img_url = base64.b64encode(photo_file.read()).decode('utf-8')
t_image = '<div class="t_img"><h1>Peace Treaty of Tartu</h1><img style="width:90%" src="data:image/jpeg;base64,{}"></div>'.format(img_url)
print(t_image)

# Info box. Values are written verbatim because some contain intentional
# markup (the source link).
# NOTE(review): the link previously read "(RussianEstonian)"; the en dash
# between the two words has been restored percent-encoded — verify it resolves.
print('<div class ="t_info">')
t_infotext = [
    ('Name of Treaty', 'Peace Treaty of Tartu'),
    ('Date', 'February 2, 1920'),
    ('Location', 'Tartu, Estonia'),
    ('Signed', 'February 2, 1920'),
    ('Type', 'bilateral peace treaty'),
    ('source', '<a href="https://en.wikipedia.org/wiki/Treaty_of_Tartu_(Russian%E2%80%93Estonian)">link</a>'),
    ('Description', 'The Tartu Peace Treaty or Treaty of Tartu is a peace treaty between Estonia and Russian Soviet Federative Socialist Republic signed on 2 February 1920, ending the Estonian War of Independence.'),
]
for t_title, t_info in t_infotext:
    print('<span class="t_info-{0}"><div class="info_t_title" style="-webkit-text-decoration-line: underline; text-decoration-line: underline;" >{0}</div><div class="t_info_content">{1}</div></span><br>'.format(t_title, t_info))
print('</div>')

# Treaty text: one <span> per token, with the part-of-speech tag as CSS
# class. Both values come from the input text, so they are HTML-escaped.
print('<div class="t_paragraph">')
t_tokenized = word_tokenize(t_text)
t_tagged = pos_tag(t_tokenized)
for t_word, t_pos in t_tagged:
    print('<span class="{}">{}</span>'.format(escape(t_pos), escape(t_word)))
print('</div>')

# The 20 most frequent lower-cased treaty tokens that are not stopwords.
print('<div class="t_top_words" > <span style="-webkit-text-decoration-line: underline; text-decoration-line: underline;" >colonial words:</span>')
t_tokens_without_stopwords = nltk.FreqDist(words.lower() for words in t_tokenized if words.lower() not in all_stopwords)
# Copy of the distribution; not used in the visible code.
t_frequency_word = FreqDist(t_tokens_without_stopwords)
t_top_words = t_tokens_without_stopwords.most_common(20)
for t_chosen_words, t_frequency in t_top_words:
    print('<br><span class="t_chosen_words" >{}({}) </span>'.format(escape(t_chosen_words), t_frequency))
print('</div>')

# Close the treaty wrapper, then the document.
print('</div>')
print('''</body></html>''')

File diff suppressed because it is too large Load Diff

File diff suppressed because one or more lines are too long

@ -0,0 +1,193 @@
from __future__ import division
import glob
from nltk import *
import re
import nltk
import codecs
from nltk import sent_tokenize, word_tokenize, pos_tag
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from PIL import Image
import base64
nltk.download('stopwords')
# Read the FaceApp terms of service. The context manager closes the handle
# once the text is in memory, and the explicit encoding keeps the result
# independent of the platform's default encoding.
with open('faceapp.txt', 'r', encoding='utf-8') as file:
    text = file.read()

x = 1  # NOTE(review): not referenced in the visible code — confirm before removing

# Stopwords: NLTK's English list plus the project's own list, which is read
# one entry per line from stopwords.txt.
default_stopwords = set(stopwords.words('english'))
with open('stopwords.txt', 'r', encoding='utf-8') as stopword_file:
    custom_stopwords = set(stopword_file.read().splitlines())
all_stopwords = default_stopwords | custom_stopwords
# Emit the static start of the page: document head and stylesheet. <body> is
# opened here and closed by the final print at the end of the script.
# Disabled declarations are now wrapped in /* ... */ because CSS has no `#`
# line comments, and `font-weight: regular` became the valid keyword `normal`.
# NOTE(review): `font-weight: 8th` is not a valid CSS font-weight value —
# confirm how the intended Belgika cut should be selected.
print('''<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title></title>
<style>
@font-face {
font-family: "Belgika";
src: url("http://bohyewoo.com/webfonts/belgika/belgika-40th-webfont.eot");
src: url("http://bohyewoo.com/webfonts/belgika/belgika-40th-webfont.woff") format("woff"),
url("http://bohyewoo.com/webfonts/belgika/belgika-40th-webfont.svg#filename") format("svg");
}
@font-face {
font-family: "Belgika";
src: url("http://bohyewoo.com/webfonts/belgika/belgika-16th-webfont.eot");
src: url("http://bohyewoo.com/webfonts/belgika/belgika-16th-webfont.woff") format("woff"),
url("http://bohyewoo.com/webfonts/belgika/belgika-16th-webfont.svg#filename") format("svg");
}
@font-face {
font-family: "Belgika";
src: url("http://bohyewoo.com/webfonts/belgika/belgika-8th-webfont.eot");
src: url("http://bohyewoo.com/webfonts/belgika/belgika-8th-webfont.woff") format("woff"),
url("http://bohyewoo.com/webfonts/belgika/belgika-8th-webfont.svg#filename") format("svg");
}
@font-face {
font-family: "SourceCodePro";
src: url("http://bohyewoo.com/webfonts/Source_Code_Pro/SourceCodePro-Regular.ttf");
}
body {
background-color: ghostwhite;
/* font-family: Belgika; */
/* font-weight: 8th; */
/* letter-spacing: -0.3px; */
font-size: 14px;
line-height: 1.2;
padding: 20px;
}
.tos_wrapper {
width: 100%;
float: left;
}
.NNP {
background-color: pink;
}
.VBP {
}
.VBP:hover {
background-color: gold;
}
.NN {
background-color: LightSkyBlue;
}
.NNS {
background-color: Aquamarine;
}
.t_img {
font-family: SourceCodePro;
font-size: 30pt;
float: left;
width: 20%;
clear: both;
}
.info {
font-family: SourceCodePro;
font-weight: normal;
font-size: 10pt;
width: 25%;
float: left;
border: 1px solid black;
padding:10px;
margin-bottom: 50px;
}
.paragraph {
font-family: SourceCodePro;
font-weight: normal;
letter-spacing: -0.5px;
width: 70%;
float: right;
}
.top_words {
font-family: Belgika;
font-weight: 8th;
font-size: 9pt;
width: 25%;
float: left;
}
</style>
</head>
<body>''')
# Page body: FaceApp logo, info box, POS-tagged terms of service and the
# most frequent non-stopword tokens, all inside one wrapper <div>.
from html import escape

print('<div class ="tos_wrapper">')

# Logo, inlined as a base64 data URI. The URI now carries a real MIME type
# (image/png) instead of the file path, and the file handle is closed after
# reading.
with open('img/faceapp_logo.png', 'rb') as logo_file:
    FaceApp_img_url = base64.b64encode(logo_file.read()).decode('utf-8')
FaceApp_image = '<div class="t_img">FaceApp<img style="width:90%" src="data:image/png;base64,{}"></div>'.format(FaceApp_img_url)
print(FaceApp_image)

# Info box. Values are written verbatim because some contain intentional
# markup (the source link). The duplicated ('Type', ...) row was dropped.
print('<div class ="info">')
infotext = [
    ('Service', 'FaceApp'),
    ('Type', 'Image editing'),
    ('Initial release', 'December 31, 2016'),
    ('source', '<a href="https://www.faceapp.com/terms-en.html">link</a>'),
    ('Description', 'FaceApp is a mobile application for iOS and Android developed by Russian company Wireless Lab. The app generates highly realistic transformations of human faces in photographs by using neural networks based on artificial intelligence. The app can transform a face to make it smile, look younger, look older, or change gender.'),
]
for title, info in infotext:
    print('<span class="info-{0}"><div class="info-title" style="-webkit-text-decoration-line: underline; text-decoration-line: underline;" >{0}</div><div class="info-content">{1}</div></span><br>'.format(title, info))
print('</div>')

# ToS text: one <span> per token, with the part-of-speech tag as CSS class.
# Both values come from the input text, so they are HTML-escaped to keep
# characters such as < > & " from breaking the markup.
print('<div class ="paragraph">')
tokenized = word_tokenize(text)
tagged = pos_tag(tokenized)
for word, pos in tagged:
    print('<span class="{}">{}</span>'.format(escape(pos), escape(word)))
print('</div>')

# The 100 most frequent lower-cased tokens that are not stopwords.
print('<div class="top_words"><span style="-webkit-text-decoration-line: underline; text-decoration-line: underline;" >colonial words:</span>')
tokens_without_stopwords = nltk.FreqDist(words.lower() for words in tokenized if words.lower() not in all_stopwords)
# Copy of the distribution; not used below, but referenced by the plotting
# code that is commented out elsewhere in the project.
frequency_word = FreqDist(tokens_without_stopwords)
top_words = tokens_without_stopwords.most_common(100)
for chosen_words, frequency in top_words:
    print('<br><span class="chosen_words">{}({}) </span>'.format(escape(chosen_words), frequency))
print('</div>')

# Close the wrapper <div>, then the document.
print('''</div></body></html>''')

@ -26,15 +26,35 @@
url("http://bohyewoo.com/webfonts/belgika/belgika-8th-webfont.svg#filename") format("svg");
}
@font-face {
font-family: "SourceCodePro";
src: url("http://bohyewoo.com/webfonts/Source_Code_Pro/SourceCodePro-Regular.ttf");
}
body {
background-color: whitesmoke;
# font-family: Belgika;
# font-weight: 8th;
letter-spacing: 0.5px;
# letter-spacing: -0.3px;
font-size: 20px;
line-height: 1.2;
}
.info {
font-family: Belgika;
font-weight: 8th;
font-size: 10pt;
width: 20%;
float: right;
border: 1px solid black;
}
.NNP {
@ -57,9 +77,10 @@
}
.paragraph {
font-family: helvetica;
font-family: SourceCodePro;
font-weight: regular;
width: 70%;
letter-spacing: -0.5px;
width: 50%;
float: right;
}
@ -74,6 +95,12 @@
</style>
</head>
<body>
<div class ="info">
<span class="info-platform">platform:FaceApp</span><br>
<span class="info-Type">Type:Image editing</span><br>
<span class="info-Initial release">Initial release:December 31 2016</span><br>
<span class="info-Type">Type:Image editing</span><br>
</div>
<div class ="paragraph">
<span>
<span class="CD">1</span>

@ -12,9 +12,15 @@ from nltk.corpus import stopwords
nltk.download('stopwords')
# infofile = open('faceapp_infos.txt','r')
# infotext = infofile.read()
#open the txt file, read, and tokenize
file = open('faceapp.txt','r')
text = file.read()
#not sure if this works..
x = 1
#stopwords
@ -22,6 +28,98 @@ default_stopwords = set(stopwords.words('english'))
custom_stopwords = set(codecs.open('stopwords.txt', 'r').read().splitlines())
all_stopwords = default_stopwords | custom_stopwords
# with open(output_html, 'w') as new_html:
# new_html.write(
# '''<!DOCTYPE html>
# <html>
# <head>
# <meta charset="utf-8">
# <title></title>
# <style>
# @font-face {
# font-family: "Belgika";
# src: url("http://bohyewoo.com/webfonts/belgika/belgika-40th-webfont.eot");
# src: url("http://bohyewoo.com/webfonts/belgika/belgika-40th-webfont.woff") format("woff"),
# url("http://bohyewoo.com/webfonts/belgika/belgika-40th-webfont.svg#filename") format("svg");
# }
# @font-face {
# font-family: "Belgika";
# src: url("http://bohyewoo.com/webfonts/belgika/belgika-16th-webfont.eot");
# src: url("http://bohyewoo.com/webfonts/belgika/belgika-16th-webfont.woff") format("woff"),
# url("http://bohyewoo.com/webfonts/belgika/belgika-16th-webfont.svg#filename") format("svg");
# }
# @font-face {
# font-family: "Belgika";
# src: url("http://bohyewoo.com/webfonts/belgika/belgika-8th-webfont.eot");
# src: url("http://bohyewoo.com/webfonts/belgika/belgika-8th-webfont.woff") format("woff"),
# url("http://bohyewoo.com/webfonts/belgika/belgika-8th-webfont.svg#filename") format("svg");
# }
# @font-face {
# font-family: "SourceCodePro";
# src: url("http://bohyewoo.com/webfonts/Source_Code_Pro/SourceCodePro-Regular.ttf");
# }
# body {
# background-color: whitesmoke;
# # font-family: Belgika;
# # font-weight: 8th;
# # letter-spacing: -0.3px;
# font-size: 20px;
# line-height: 1.2;
# }
# .NNP {
# background-color: pink;
# }
# .VBP {
# }
# .VBP:hover {
# background-color: gold;
# }
# .NN {
# background-color: LightSkyBlue;
# }
# .NNS {
# background-color: Aquamarine;
# }
# .paragraph {
# font-family: SourceCodePro;
# font-weight: regular;
# letter-spacing: -0.5px;
# width: 50%;
# float: right;
# }
# .top_words {
# font-family: Belgika;
# font-weight: 8th;
# font-size: 9pt;
# width: 25%;
# float: left;
# }
# </style>
# </head>
# <body>'''
# )
print('''<!DOCTYPE html>
@ -52,15 +150,35 @@ print('''<!DOCTYPE html>
url("http://bohyewoo.com/webfonts/belgika/belgika-8th-webfont.svg#filename") format("svg");
}
@font-face {
font-family: "SourceCodePro";
src: url("http://bohyewoo.com/webfonts/Source_Code_Pro/SourceCodePro-Regular.ttf");
}
body {
background-color: whitesmoke;
# font-family: Belgika;
# font-weight: 8th;
letter-spacing: 0.5px;
# letter-spacing: -0.3px;
font-size: 20px;
line-height: 1.2;
}
.info {
font-family: Belgika;
font-weight: 8th;
font-size: 10pt;
width: 20%;
float: right;
border: 1px solid black;
}
.NNP {
@ -83,9 +201,10 @@ print('''<!DOCTYPE html>
}
.paragraph {
font-family: helvetica;
font-family: SourceCodePro;
font-weight: regular;
width: 70%;
letter-spacing: -0.5px;
width: 50%;
float: right;
}
@ -102,8 +221,25 @@ print('''<!DOCTYPE html>
<body>''')
# my stopwords are common words I don't want to count, like "a", "an", "the".
#info part
print('<div class ="info">')
infotext = [('platform', 'FaceApp'), ('Type', 'Image editing'), ('Initial release', 'December 31 2016'), ('Type', 'Image editing')]
for title, info in infotext:
print('<span class="info-{0}">{0}:{1}</span><br>'.format(title, info))
print('</div>')
#ToS text
print('<div class ="paragraph">')
# for sentence in sent_tokenize(text):
print('<span>')
@ -123,16 +259,6 @@ print('</div>')
tokens_without_stopwords = nltk.FreqDist(words.lower() for words in tokenized if words.lower() not in all_stopwords)
print(tokens_without_stopwords)
# for read_whole_text in tokens_without_stopwords:
# whole_text_tokenized =
# print(whole_text_tokenized)
# #filtered words in sentence
# filtered_sentence = (" ").join(tokens_without_stopwords)
# print(filtered_sentence)
print('<div class="top_words"> colonial words:')
@ -142,6 +268,9 @@ top_words = tokens_without_stopwords.most_common(100)
for chosen_words, frequency in top_words:
print('<br><span class="chosen_words">{}({}) </span>'.format(chosen_words, frequency))
# new_html = open('output.html', 'wb') # open the output file
# new_html.write('''</div></body></html>''')
# new_html.close() # close the output file
@ -150,49 +279,3 @@ for chosen_words, frequency in top_words:
print('''</div></body></html>''')
# # for new_file in tokens_without_stopwords:
# appendFile = open('tokenized_words.txt', 'a')
# appendFile.write(" " + new_file)
# appendFile.close()
# #shows only stopwords
# processed_word_list = []
# for word in tokenized:
# # print(word)
# if word not in all_stopwords:
# processed_word_list.append('*')
# else:
# processed_word_list.append(word)
# print(processed_word_list)
# # # result putting in a graph
# top_words_plot = frequency_word.plot(10)
# print(top_words_plot)

Loading…
Cancel
Save