You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
282 lines
5.6 KiB
Python
282 lines
5.6 KiB
Python
5 years ago
|
from __future__ import division
|
||
|
import glob
|
||
|
from nltk import *
|
||
|
import re
|
||
|
|
||
|
|
||
|
import nltk
|
||
|
import codecs
|
||
|
from nltk import sent_tokenize, word_tokenize, pos_tag
|
||
|
from nltk.probability import FreqDist
|
||
|
from nltk.corpus import stopwords
|
||
|
# Fetch the NLTK stopword corpus that stopwords.words('english') reads below.
# NOTE(review): word_tokenize and pos_tag are also used further down and
# presumably need their own NLTK data packages - confirm they are installed.
nltk.download('stopwords')
|
||
|
|
||
|
|
||
5 years ago
|
|
||
|
|
||
|
# infofile = open('faceapp_infos.txt','r')
|
||
|
# infotext = infofile.read()
|
||
|
|
||
5 years ago
|
# Read the whole ToS text into memory (tokenizing happens further down).
# The context manager closes the handle even if read() raises; the original
# left the file open for the lifetime of the script.
with open('faceapp.txt', 'r') as file:
    text = file.read()
|
||
5 years ago
|
#not sure if this works..
|
||
5 years ago
|
# NOTE(review): x is not read anywhere else in the visible script - confirm
# whether it is still needed.
x = 1
|
||
|
|
||
|
# Stopwords: NLTK's English list merged with the project's own list
# (stopwords.txt, one word per line).
default_stopwords = set(stopwords.words('english'))
# Use a context manager so the custom stopword file is closed right after it
# is read instead of leaking the handle.
with codecs.open('stopwords.txt', 'r') as stopword_file:
    custom_stopwords = set(stopword_file.read().splitlines())
all_stopwords = default_stopwords | custom_stopwords
|
||
|
|
||
5 years ago
|
# with open(output_html, 'w') as new_html:
|
||
|
# new_html.write(
|
||
|
# '''<!DOCTYPE html>
|
||
|
# <html>
|
||
|
# <head>
|
||
|
# <meta charset="utf-8">
|
||
|
# <title></title>
|
||
|
# <style>
|
||
|
|
||
|
# @font-face {
|
||
|
# font-family: "Belgika";
|
||
|
# src: url("http://bohyewoo.com/webfonts/belgika/belgika-40th-webfont.eot");
|
||
|
# src: url("http://bohyewoo.com/webfonts/belgika/belgika-40th-webfont.woff") format("woff"),
|
||
|
# url("http://bohyewoo.com/webfonts/belgika/belgika-40th-webfont.svg#filename") format("svg");
|
||
|
# }
|
||
|
|
||
|
# @font-face {
|
||
|
# font-family: "Belgika";
|
||
|
# src: url("http://bohyewoo.com/webfonts/belgika/belgika-16th-webfont.eot");
|
||
|
# src: url("http://bohyewoo.com/webfonts/belgika/belgika-16th-webfont.woff") format("woff"),
|
||
|
# url("http://bohyewoo.com/webfonts/belgika/belgika-16th-webfont.svg#filename") format("svg");
|
||
|
# }
|
||
|
|
||
|
# @font-face {
|
||
|
# font-family: "Belgika";
|
||
|
# src: url("http://bohyewoo.com/webfonts/belgika/belgika-8th-webfont.eot");
|
||
|
# src: url("http://bohyewoo.com/webfonts/belgika/belgika-8th-webfont.woff") format("woff"),
|
||
|
# url("http://bohyewoo.com/webfonts/belgika/belgika-8th-webfont.svg#filename") format("svg");
|
||
|
# }
|
||
|
|
||
|
# @font-face {
|
||
|
# font-family: "SourceCodePro";
|
||
|
# src: url("http://bohyewoo.com/webfonts/Source_Code_Pro/SourceCodePro-Regular.ttf");
|
||
|
# }
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
# body {
|
||
|
# background-color: whitesmoke;
|
||
|
# # font-family: Belgika;
|
||
|
# # font-weight: 8th;
|
||
|
# # letter-spacing: -0.3px;
|
||
|
# font-size: 20px;
|
||
|
# line-height: 1.2;
|
||
|
|
||
|
# }
|
||
|
|
||
|
|
||
|
|
||
|
# .NNP {
|
||
|
# background-color: pink;
|
||
|
# }
|
||
|
|
||
|
# .VBP {
|
||
|
# }
|
||
|
|
||
|
# .VBP:hover {
|
||
|
# background-color: gold;
|
||
|
# }
|
||
|
|
||
|
# .NN {
|
||
|
# background-color: LightSkyBlue;
|
||
|
# }
|
||
|
|
||
|
# .NNS {
|
||
|
# background-color: Aquamarine;
|
||
|
# }
|
||
|
|
||
|
# .paragraph {
|
||
|
# font-family: SourceCodePro;
|
||
|
# font-weight: regular;
|
||
|
# letter-spacing: -0.5px;
|
||
|
# width: 50%;
|
||
|
# float: right;
|
||
|
# }
|
||
|
|
||
|
# .top_words {
|
||
|
# font-family: Belgika;
|
||
|
# font-weight: 8th;
|
||
|
# font-size: 9pt;
|
||
|
# width: 25%;
|
||
|
# float: left;
|
||
|
# }
|
||
|
|
||
|
# </style>
|
||
|
# </head>
|
||
|
# <body>'''
|
||
|
# )
|
||
|
|
||
|
|
||
5 years ago
|
|
||
|
|
||
|
# Emit the document head: web-font declarations plus the stylesheet that
# colours tokens by their POS-tag class (.NNP, .VBP, .NN, .NNS) and lays out
# the three columns (.info, .paragraph, .top_words).  The <body> opened here
# is closed by the final print at the end of the script.
#
# Two fixes to the emitted CSS:
#   * The disabled body declarations were written with Python-style '#'
#     prefixes, which are not CSS comments; they are now real /* ... */
#     comments.
#   * 'font-weight: 8th' and 'font-weight: regular' are not valid CSS
#     font-weight values (browsers drop such declarations), so they are
#     spelled 'normal', the value the browser was already falling back to.
print('''<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title></title>
<style>

@font-face {
    font-family: "Belgika";
    src: url("http://bohyewoo.com/webfonts/belgika/belgika-40th-webfont.eot");
    src: url("http://bohyewoo.com/webfonts/belgika/belgika-40th-webfont.woff") format("woff"),
    url("http://bohyewoo.com/webfonts/belgika/belgika-40th-webfont.svg#filename") format("svg");
}

@font-face {
    font-family: "Belgika";
    src: url("http://bohyewoo.com/webfonts/belgika/belgika-16th-webfont.eot");
    src: url("http://bohyewoo.com/webfonts/belgika/belgika-16th-webfont.woff") format("woff"),
    url("http://bohyewoo.com/webfonts/belgika/belgika-16th-webfont.svg#filename") format("svg");
}

@font-face {
    font-family: "Belgika";
    src: url("http://bohyewoo.com/webfonts/belgika/belgika-8th-webfont.eot");
    src: url("http://bohyewoo.com/webfonts/belgika/belgika-8th-webfont.woff") format("woff"),
    url("http://bohyewoo.com/webfonts/belgika/belgika-8th-webfont.svg#filename") format("svg");
}

@font-face {
    font-family: "SourceCodePro";
    src: url("http://bohyewoo.com/webfonts/Source_Code_Pro/SourceCodePro-Regular.ttf");
}

body {
    background-color: whitesmoke;
    /* font-family: Belgika; */
    /* font-weight: 8th; */
    /* letter-spacing: -0.3px; */
    font-size: 20px;
    line-height: 1.2;
}

.info {
    font-family: Belgika;
    font-weight: normal;
    font-size: 10pt;
    width: 20%;
    float: right;
    border: 1px solid black;
}

.NNP {
    background-color: pink;
}

.VBP {
}

.VBP:hover {
    background-color: gold;
}

.NN {
    background-color: LightSkyBlue;
}

.NNS {
    background-color: Aquamarine;
}

.paragraph {
    font-family: SourceCodePro;
    font-weight: normal;
    letter-spacing: -0.5px;
    width: 50%;
    float: right;
}

.top_words {
    font-family: Belgika;
    font-weight: normal;
    font-size: 9pt;
    width: 25%;
    float: left;
}

</style>
</head>
<body>''')
|
||
|
|
||
|
|
||
|
|
||
5 years ago
|
|
||
|
# Info part: a small box listing facts about the platform, one
# "<label>:<value>" line per entry.
print('<div class ="info">')

# (label, value) pairs in display order.  The original list repeated
# ('Type', 'Image editing') a second time at the end, which printed the same
# line twice; the duplicate is dropped.
infotext = [
    ('platform', 'FaceApp'),
    ('Type', 'Image editing'),
    ('Initial release', 'December 31 2016'),
]

for title, info in infotext:
    # An HTML class attribute is split on whitespace, so a multi-word label
    # such as 'Initial release' would yield two unrelated classes.  Use a
    # dash-joined form for the class name and keep the label text as is.
    css_suffix = title.replace(' ', '-')
    print('<span class="info-{0}">{1}:{2}</span><br>'.format(css_suffix, title, info))

print('</div>')
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
# ToS text: print every token of the text as a <span> whose CSS class is its
# part-of-speech tag, so the stylesheet can colour tokens by tag.
# Imported here under an alias so the block is self-contained and cannot be
# shadowed by the wildcard nltk import at the top of the file.
from xml.sax.saxutils import escape as escape_html

print('<div class ="paragraph">')
# The single outer <span> is what remains of a per-sentence loop
# (sent_tokenize) that is currently disabled.
print('<span>')

tokenized = word_tokenize(text)
tagged = pos_tag(tokenized)

# for HTML
for word, pos in tagged:
    # Tokens come straight from the input text; escape '&', '<' and '>' so
    # they are shown literally instead of being parsed as markup.
    print('<span class="{}">{}</span>'.format(pos, escape_html(word)))

print('</span>')

print('</div>')
|
||
|
|
||
|
# Filtering stopwords and listing the most frequent remaining words.
# Imported here under an alias so the block is self-contained and cannot be
# shadowed by the wildcard nltk import at the top of the file.
from xml.sax.saxutils import escape as escape_html

# Frequency table of lower-cased tokens that are not in the stopword set.
tokens_without_stopwords = nltk.FreqDist(
    words.lower() for words in tokenized if words.lower() not in all_stopwords
)
# The summary used to be printed bare; its text starts with '<FreqDist', which
# an HTML parser treats as an unknown tag.  Keep the debug information but
# emit it as an HTML comment so the document stays well-formed.
print('<!-- {} -->'.format(tokens_without_stopwords))

# This <div> is closed by the final print of the script.
print('<div class="top_words"> colonial words:')

# The 100 most frequent (word, count) pairs, most frequent first.
top_words = tokens_without_stopwords.most_common(100)

for chosen_words, frequency in top_words:
    # Escape the token text so characters such as '&' or '<' are displayed
    # rather than interpreted as markup.
    print('<br><span class="chosen_words">{}({}) </span>'.format(escape_html(chosen_words), frequency))
|
||
|
|
||
5 years ago
|
|
||
|
|
||
|
|
||
5 years ago
|
# new_html = open('output.html', 'wb') # open the output file
|
||
|
# new_html.write('''</div></body></html>''')
|
||
|
# new_html.close() # close the output file
|
||
|
|
||
|
|
||
|
# Close the last open <div>, then the <body> and the document itself.
print('</div></body></html>')
|
||
|
|
||
|
|