You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

194 lines
4.8 KiB
Python

from __future__ import division

import base64
import codecs
import glob
import html
import re

# The star import stays ahead of the explicit nltk imports so that the
# explicit names below win if the two ever clash.
from nltk import *
import nltk
from nltk import sent_tokenize, word_tokenize, pos_tag
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from PIL import Image
# Make sure the NLTK stopword corpus is available before it is read below.
nltk.download('stopwords')

# Read the Terms-of-Service text that the rest of the script tokenizes.
# The context manager closes the handle once the text is in memory; the
# previous version left the file open for the lifetime of the process.
with open('faceapp.txt', 'r') as file:
    text = file.read()

# NOTE(review): `x` is not read anywhere in the visible script; it is kept
# only in case code outside this view still depends on it.
x = 1

# Stopwords: NLTK's English list plus the extra entries listed one per line
# in stopwords.txt. The file is closed as soon as its lines are loaded.
default_stopwords = set(stopwords.words('english'))
with codecs.open('stopwords.txt', 'r') as custom_stopwords_file:
    custom_stopwords = set(custom_stopwords_file.read().splitlines())
all_stopwords = default_stopwords | custom_stopwords
# Static page prologue: the doctype, the <head> with the embedded stylesheet
# (web fonts, layout boxes and the per-POS-tag highlight classes), and the
# opening <body> tag. The markup is emitted verbatim on stdout.
# NOTE(review): the lines starting with '#' inside the `body` rule are not
# CSS comments (CSS uses /* ... */); presumably they were meant to disable
# those declarations -- confirm before changing them.
PAGE_PROLOGUE = '''<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title></title>
<style>
@font-face {
font-family: "Belgika";
src: url("http://bohyewoo.com/webfonts/belgika/belgika-40th-webfont.eot");
src: url("http://bohyewoo.com/webfonts/belgika/belgika-40th-webfont.woff") format("woff"),
url("http://bohyewoo.com/webfonts/belgika/belgika-40th-webfont.svg#filename") format("svg");
}
@font-face {
font-family: "Belgika";
src: url("http://bohyewoo.com/webfonts/belgika/belgika-16th-webfont.eot");
src: url("http://bohyewoo.com/webfonts/belgika/belgika-16th-webfont.woff") format("woff"),
url("http://bohyewoo.com/webfonts/belgika/belgika-16th-webfont.svg#filename") format("svg");
}
@font-face {
font-family: "Belgika";
src: url("http://bohyewoo.com/webfonts/belgika/belgika-8th-webfont.eot");
src: url("http://bohyewoo.com/webfonts/belgika/belgika-8th-webfont.woff") format("woff"),
url("http://bohyewoo.com/webfonts/belgika/belgika-8th-webfont.svg#filename") format("svg");
}
@font-face {
font-family: "SourceCodePro";
src: url("http://bohyewoo.com/webfonts/Source_Code_Pro/SourceCodePro-Regular.ttf");
}
body {
background-color: ghostwhite;
# font-family: Belgika;
# font-weight: 8th;
# letter-spacing: -0.3px;
font-size: 14px;
line-height: 1.2;
padding: 20px;
}
.tos_wrapper {
width: 100%;
float: left;
}
.NNP {
background-color: pink;
}
.VBP {
}
.VBP:hover {
background-color: gold;
}
.NN {
background-color: LightSkyBlue;
}
.NNS {
background-color: Aquamarine;
}
.t_img {
font-family: SourceCodePro;
font-size: 30pt;
float: left;
width: 20%;
clear: both;
}
.info {
font-family: SourceCodePro;
font-weight: regular;
font-size: 10pt;
width: 25%;
float: left;
border: 1px solid black;
padding:10px;
margin-bottom: 50px;
}
.paragraph {
font-family: SourceCodePro;
font-weight: regular;
letter-spacing: -0.5px;
width: 70%;
float: right;
}
.top_words {
font-family: Belgika;
font-weight: 8th;
font-size: 9pt;
width: 25%;
float: left;
}
</style>
</head>
<body>'''
print(PAGE_PROLOGUE)
# Open the wrapper <div> that holds the logo, the info box, the ToS text and
# the word list.
print('<div class ="tos_wrapper">')

# Logo: embed the PNG inline as a base64 data URI so the generated page does
# not depend on an external image file.
# https://upload.wikimedia.org/wikipedia/commons/1/15/Joffe_signing_the_Treaty_of_Tartu.jpg
# NOTE(review): the URL above is carried over from the original comment; the
# visible code never uses it.
with open('img/faceapp_logo.png', 'rb') as logo_file:
    # Read inside a context manager so the image handle is closed promptly
    # instead of being leaked.
    FaceApp_img_url = base64.b64encode(logo_file.read()).decode('utf-8')

# A data URI has the form data:<mediatype>;base64,<payload>. The media type
# must be the MIME type of the payload (image/png), not the path of the file
# it was read from.
FaceApp_image = '<div class="t_img">FaceApp<img style="width:90%" src="data:image/png;base64,{}"></div>'.format(FaceApp_img_url)
print(FaceApp_image)
# Info box: a fact sheet about the service, printed as one title/content row
# per entry of `infotext`.
def info_row_html(title, info):
    """Return the HTML for one info-box row.

    Args:
        title: Row heading; shown underlined and also used to build the
            row's ``info-<title>`` CSS class.
        info: Row content. Inserted as-is, so it may contain markup
            (the ``source`` row carries an ``<a>`` element).

    Returns:
        The ``<span>`` markup for the row, followed by a ``<br>``.
    """
    # A class attribute is a space-separated list, so a title such as
    # "Initial release" would otherwise produce two classes ("info-Initial"
    # and "release"). Joining the words with dashes keeps it a single class.
    css_suffix = '-'.join(title.split())
    return (
        '<span class="info-{css}">'
        '<div class="info-title" style="-webkit-text-decoration-line: underline; '
        'text-decoration-line: underline;" >{title}</div>'
        '<div class="info-content">{info}</div></span><br>'
    ).format(css=css_suffix, title=title, info=info)


print('<div class ="info">')
# The ('Type', 'Image editing') row used to be listed twice, which rendered
# the same line twice in the box; it now appears once.
infotext = [
    ('Service', 'FaceApp'),
    ('Type', 'Image editing'),
    ('Initial release', 'December 31, 2016'),
    ('source', '<a href="https://www.faceapp.com/terms-en.html">link</a>'),
    ('Description', 'FaceApp is a mobile application for iOS and Android developed by Russian company Wireless Lab. The app generates highly realistic transformations of human faces in photographs by using neural networks based on artificial intelligence. The app can transform a face to make it smile, look younger, look older, or change gender.'),
]
for title, info in infotext:
    print(info_row_html(title, info))
print('</div>')
# ToS text: tokenize the document, tag every token with its part of speech,
# and wrap each token in a <span> whose class is that tag so the stylesheet
# can style tokens per tag.
print('<div class ="paragraph">')
tokenized = word_tokenize(text)
tagged = pos_tag(tokenized)
for word, pos in tagged:
    # Tokens come straight from the source document, so characters such as
    # '<', '>' and '&' must be escaped or they would be parsed as markup.
    # The tag goes into a double-quoted attribute, hence the default
    # quote-escaping there; the token is plain text content.
    print('<span class="{}">{}</span>'.format(
        html.escape(pos),
        html.escape(word, quote=False),
    ))
print('</div>')
# "colonial words" list: the 100 most frequent lower-cased tokens that are
# not stopwords, each printed with its count.
print('<div class="top_words"><span style="-webkit-text-decoration-line: underline; text-decoration-line: underline;" >colonial words:</span>')
# Lower-case each token once, then drop the stopwords while counting.
lowered_tokens = (token.lower() for token in tokenized)
tokens_without_stopwords = nltk.FreqDist(
    token for token in lowered_tokens if token not in all_stopwords
)
# NOTE(review): `frequency_word` is a copy of the distribution above and is
# not read in the visible script; kept so any later use keeps working.
frequency_word = FreqDist(tokens_without_stopwords)
top_words = tokens_without_stopwords.most_common(100)
for chosen_words, frequency in top_words:
    # Tokens originate from the document text; escape them so '<', '>' and
    # '&' cannot break the generated markup.
    print('<br><span class="chosen_words">{}({}) </span>'.format(
        html.escape(chosen_words, quote=False),
        frequency,
    ))
print('</div>')
# Page epilogue: close the outer <div>, the <body> and the document.
# An earlier variant opened output.html and wrote these closing tags to it
# directly; the tags are now printed to stdout like the rest of the markup.
print('</div></body></html>')