Merge branch 'master' of git.xpub.nl:/var/www/git.xpub.nl/repos/OuNoPo-make
commit
3bfed0e3d0
@ -1,82 +0,0 @@
|
||||
import nltk
|
||||
from sys import stdin, stdout
|
||||
|
||||
# Define input
|
||||
input = stdin.read()
|
||||
|
||||
# FILTER FUNCTIONS
|
||||
# This function cuts a string into words. Then runs a POS tagger for each word. Returns a list with tags
|
||||
def postagger(string):
    """Tokenize ``string`` and return the POS tag of every token.

    Args:
        string: The text to analyse.

    Returns:
        A list with one tag per token, in token order, as produced by
        ``nltk.pos_tag`` on the output of ``nltk.word_tokenize``.
    """
    words = nltk.word_tokenize(string)
    # Tag the token list exactly once. The previous version re-ran the tagger
    # inside a loop over its own output, repeating identical work per token.
    taggedwordlist = nltk.pos_tag(words)
    return [pos for _word, pos in taggedwordlist]
|
||||
|
||||
# This function changes the tags to readable equivalents (NNP to noun for example)
|
||||
def postagger_readable(list):
    """Translate POS tags into readable names (for example NNP becomes noun).

    Args:
        list: Iterable of tag strings.

    Returns:
        A new list of the same length; tags without a readable equivalent
        are passed through unchanged.
    """
    readable_names = {
        'NNP': 'noun', 'NNS': 'noun', 'NN': 'noun', 'NNPS': 'noun',
        'VB': 'verb', 'VBD': 'verb', 'VBG': 'verb',
        'VBN': 'verb', 'VBP': 'verb', 'VBZ': 'verb',
        'RB': 'adverb', 'RBR': 'adverb', 'RBS': 'adverb', 'WRB': 'adverb',
        'PRP': 'pronoun', 'PRP$': 'pronoun',
        'JJ': 'adjective', 'JJR': 'adjective', 'JJS': 'adjective',
        'IN': 'preposition',
        'WDT': 'determiner',
        'WP': 'pronoun', 'WP$': 'pronoun',
        'UH': 'interjection',
        'POS': 'possesive ending',
        'SYM': 'symbol',
        'EX': 'existential there',
        'DT': 'determiner',
        'MD': 'modal',
        'LS': 'list item marker',
        'FW': 'foreign word',
        'CC': 'coordinating conjunction ',
        'CD': 'cardinal number',
        'TO': 'to',
        '.': 'line ending',
        ',': 'comma',
    }
    # Unknown tags fall back to the tag itself
    return [readable_names.get(tag, tag) for tag in list]
|
||||
|
||||
|
||||
# This function creates the output
|
||||
def main():
    """Tag the text that was read from stdin and write the readable tags to stdout.

    The tags are written space-separated on a single line, followed by a newline.
    """
    readable = postagger_readable(postagger(input))
    stdout.write(' '.join(readable))
    stdout.write('\n')


main()
|
File diff suppressed because one or more lines are too long
@ -0,0 +1,64 @@
|
||||
$(document).ready(function () {
    // Index of the view the next click on a word switches to (see labelKeys).
    var state = 0;

    // data-* attribute whose value is shown as the word label in each view.
    var labelKeys = ['stopword', 'sentiment', 'pos'];

    // Colour scheme of the part-of-speech view. Shared by the initial page
    // state and by the click cycle, so returning to this view restores the
    // same classes as on load (the cycle used to skip the cyan determiners).
    function highlightPartsOfSpeech() {
        $('.noun').addClass('fade-out');
        $('.preposition').addClass('red');
        $('.verb').addClass('blue');
        $('.determiner').addClass('cyan');
    }

    highlightPartsOfSpeech();

    // Suppress the browser context menu so a right-click can underline words.
    $(document).bind('contextmenu', function (e) { return false; });

    // Right-click toggles the underline on a word.
    $('.word').contextmenu(function () {
        $(this).toggleClass('underline');
    });

    // Left-click on any word cycles all words through the three views:
    // stopwords -> sentiment -> part of speech -> stopwords ...
    $('.word').click(function () {
        var shown = state;

        if (shown == 0) {
            $('.word').removeClass('fade-out red blue cyan');
            $('.stopword').addClass('fade-out');
        } else if (shown == 1) {
            $('.stopword').removeClass('fade-out');
            $('.neutral').addClass('fade-out');
        } else {
            $('.neutral').removeClass('fade-out');
            highlightPartsOfSpeech();
        }

        var key = labelKeys[shown];
        $('.word').each(function () {
            var el = $(this);
            // text() replaces the current content and never interprets the
            // label as markup, unlike html().
            el.text(el.data(key) + ' ');
        });

        state = (shown + 1) % labelKeys.length;
    });
});
|
@ -0,0 +1,86 @@
|
||||
* {
|
||||
min-height: 0;
|
||||
min-width: 0;
|
||||
}
|
||||
|
||||
body {
|
||||
background: #639ab2;
|
||||
font-size: 15px;
|
||||
font-family: 'Ubuntu Mono', monospace;
|
||||
}
|
||||
|
||||
.prelative {
|
||||
flex-shrink: 0;
|
||||
}
|
||||
|
||||
div.container {
|
||||
width: 100%;
|
||||
display: -webkit-box; /* OLD - iOS 6-, Safari 3.1-6 */
|
||||
display: -moz-box; /* OLD - Firefox 19- (buggy but mostly works) */
|
||||
display: -ms-flexbox; /* TWEENER - IE 10 */
|
||||
display: -webkit-flex; /* NEW - Chrome */
|
||||
display: flex;
|
||||
flex-wrap: wrap;
|
||||
}
|
||||
|
||||
.word {
|
||||
font-size: 3rem;
|
||||
float: left;
|
||||
position: relative;
|
||||
text-align: center;
|
||||
display: -webkit-box; /* OLD - iOS 6-, Safari 3.1-6 */
|
||||
display: -moz-box; /* OLD - Firefox 19- (buggy but mostly works) */
|
||||
display: -ms-flexbox; /* TWEENER - IE 10 */
|
||||
display: -webkit-flex; /* NEW - Chrome */
|
||||
display:flex;
|
||||
justify-content: center;
|
||||
}
|
||||
|
||||
.word:before,
|
||||
.word:after {
|
||||
content: '';
|
||||
color: #fff;
|
||||
position: absolute;
|
||||
font-family: 'PT Serif', serif;
|
||||
font-weight: bold;
|
||||
font-size: 1.5rem;
|
||||
font-style: italic;
|
||||
opacity: 0;
|
||||
width: 100%;
|
||||
}
|
||||
|
||||
.word:before {
|
||||
content: attr(data-txt);
|
||||
flex-shrink: 1;
|
||||
}
|
||||
|
||||
.word:hover:before,
|
||||
.word:active:after {
|
||||
opacity: 1;
|
||||
}
|
||||
|
||||
|
||||
|
||||
.fade-out {
|
||||
color: #275152;
|
||||
}
|
||||
|
||||
p {
|
||||
margin: 1rem;
|
||||
}
|
||||
|
||||
.red {
|
||||
color: red;
|
||||
}
|
||||
|
||||
.blue {
|
||||
color: blue;
|
||||
}
|
||||
|
||||
.cyan {
|
||||
color: cyan;
|
||||
}
|
||||
|
||||
.underline {
|
||||
text-decoration: underline;
|
||||
}
|
@ -0,0 +1,20 @@
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<title>Wordtagger</title>
|
||||
<meta charset="utf-8" />
|
||||
<link rel="stylesheet" href="style.css" type="text/css" media="screen" />
|
||||
<script type="text/javascript" src="jquery.min.js"></script>
|
||||
<script type="text/javascript" src="script.js"></script>
|
||||
<!--meta name="viewport" content="width=device-width"-->
|
||||
</head>
|
||||
|
||||
<body>
|
||||
<div class="container"><p>
|
||||
{# One span per token. Every interpolated value is escaped and every attribute is double-quoted: tokens such as 's or " would otherwise end the attribute early and break the markup. #}
{% for item, value in words_and_tags.items() %}
<span id="{{ item|e }}" class="word {{ value['sentiment']|e }} {{ value['wordtype']|e }} {{ value['POS']|e }}" data-txt="{{ value['word']|e }}" data-pos="{{ value['POS']|e }}" data-sentiment="{% if value['word'] in [',','.','(',')'] %}{{ value['word']|e }}{% else %}{{ value['sentiment']|e }}{% endif %}" data-stopword="{% if value['wordtype'] == 'stopword' %}stopword{% else %}{{ value['word']|e }}{% endif %}">{{ value['POS']|e }} </span>
{% endfor %}
|
||||
</p>
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
@ -0,0 +1,156 @@
|
||||
# LIBS
|
||||
import nltk
|
||||
import json
|
||||
import os
|
||||
from sys import stdin, stdout
|
||||
from nltk import ne_chunk, pos_tag, word_tokenize
|
||||
from nltk.sentiment.vader import SentimentIntensityAnalyzer
|
||||
from nltk.corpus import stopwords
|
||||
from jinja2 import Template
|
||||
|
||||
# == INPUT AND TOKENIZE ==
|
||||
# Define input, tokenize, and save the tokens to a dictionary. Use the index as ID for each word.
|
||||
# Read the complete text from stdin and split it into tokens
input = stdin.read()
words = nltk.word_tokenize(input)

# Every token gets its own entry, keyed by 'item <position in the text>'
words_and_tags = {}
for index, word in enumerate(words):
    words_and_tags['item ' + str(index)] = {'word': word}
print(words_and_tags)
|
||||
|
||||
# == FILTER FUNCTIONS ==
|
||||
|
||||
# === 1. POS_tagger & Named Entity Recognizer ===
|
||||
# This function runs the POS tagger on a list of words. Returns a list with one readable tag per word
def POS_tagger(list):
    """Tag a list of tokens and return readable part-of-speech names.

    Args:
        list: Sequence of word tokens (already tokenized).

    Returns:
        A list with one entry per token, in order. Tags with a readable
        equivalent are translated (NNP to noun for example); any other tag
        is returned unchanged.
    """
    readable_names = {
        'NNP': 'noun', 'NNS': 'noun', 'NN': 'noun', 'NNPS': 'noun',
        'VB': 'verb', 'VBD': 'verb', 'VBG': 'verb',
        'VBN': 'verb', 'VBP': 'verb', 'VBZ': 'verb',
        'RB': 'adverb', 'RBR': 'adverb', 'RBS': 'adverb', 'WRB': 'adverb',
        'PRP': 'pronoun', 'PRP$': 'pronoun',
        'JJ': 'adjective', 'JJR': 'adjective', 'JJS': 'adjective',
        'IN': 'preposition',
        'WDT': 'determiner',
        'WP': 'pronoun', 'WP$': 'pronoun',
        'UH': 'interjection',
        'POS': 'possesive ending',
        'SYM': 'symbol',
        'EX': 'existential there',
        'DT': 'determiner',
        'MD': 'modal',
        'LS': 'list item marker',
        'FW': 'foreign word',
        'CC': 'coordinating conjunction ',
        'CD': 'cardinal number',
        'TO': 'to',
        '.': 'line ending',
        ',': 'comma',
    }

    # Tag the token list exactly once. The previous version called the tagger
    # again for every token inside a loop over its own output.
    taggedwordlist = nltk.pos_tag(list)
    # Kept from the original: the raw (word, tag) pairs are echoed to stdout
    print(taggedwordlist)

    return [readable_names.get(pos, pos) for _word, pos in taggedwordlist]
|
||||
|
||||
# === 2. Sentiment tagger ===
|
||||
# Sentiment analyzer based on the NLTK VADER tagger.
|
||||
# This function uses words as an input. It tags each word based on its sentiment: negative, neutral or positive
|
||||
def _sentiment_analyzer():
    """Return one shared SentimentIntensityAnalyzer, created on first use."""
    analyzer = getattr(_sentiment_analyzer, "_instance", None)
    if analyzer is None:
        analyzer = SentimentIntensityAnalyzer()
        _sentiment_analyzer._instance = analyzer
    return analyzer


def sentiment_tagger(word):
    """Label ``word`` by the sign of its VADER compound score.

    Args:
        word: The text to score.

    Returns:
        'negative' when the compound score is below zero, 'positive' when it
        is above zero, otherwise 'neutral'.
    """
    # Reuse one analyzer: this function is called once per word, and building
    # a new analyzer on every call repeats the same setup work.
    score = _sentiment_analyzer().polarity_scores(word).get("compound")

    if score < 0:
        return 'negative'
    if score > 0:
        return 'positive'
    return 'neutral'
|
||||
|
||||
# === 3. Stopword tagger ===
|
||||
# Labels words on being a keyword or a stopword, based on the list in the NLTK corpus
|
||||
def _english_stopwords():
    """Return the NLTK English stopword set, loaded on first use and then reused."""
    cached = getattr(_english_stopwords, "_words", None)
    if cached is None:
        cached = frozenset(stopwords.words('english'))
        _english_stopwords._words = cached
    return cached


def stopword_tagger(word):
    """Label ``word`` as 'stopword' or 'keyword'.

    Args:
        word: The token to look up.

    Returns:
        'stopword' when the token is in the NLTK English stopword list,
        otherwise 'keyword'.
    """
    # The stopword set is built once instead of once per word.
    # NOTE(review): the lookup is case-sensitive, exactly as before; a
    # capitalised token such as "The" may therefore come out as 'keyword' -
    # confirm whether that is intended.
    if word in _english_stopwords():
        return 'stopword'
    return 'keyword'
|
||||
|
||||
|
||||
# Run POS tagger
|
||||
# This tagger outputs a list for all items in the dict at once
|
||||
# To avoid double work, it is better to keep this outside the for loop
|
||||
POS_tags = POS_tagger(words)

# Attach all tags to each word entry of the dictionary, which is exported as a json file later
# {'item 0' : {'word' : word, 'tagger 1': value 1}}
# The entries were created in word order, so they line up one-to-one with POS_tags
for entry, pos_tag in zip(words_and_tags.values(), POS_tags):
    word = entry['word']

    # POS
    entry['POS'] = pos_tag

    # Sentiment
    entry['sentiment'] = sentiment_tagger(word)

    # Stopword or keyword
    entry['wordtype'] = stopword_tagger(word)
|
||||
|
||||
# Add entity tag
|
||||
# Not functional yet
|
||||
|
||||
# Save data into a json file
print(words_and_tags)

# The project root is three directory levels above this file. os.path.join is
# used because plain string concatenation dropped the path separator whenever
# __file__ was absolute (".../project" + "output/..." gave ".../projectoutput/...").
# With a relative __file__ the root is '' and the joined path is unchanged.
project_root = os.path.dirname(os.path.dirname(os.path.dirname(__file__)))
data_path = os.path.join(project_root, "output/wordtagger/data.json")
with open(data_path, 'w', encoding="utf-8") as f:
    json.dump(words_and_tags, f, ensure_ascii=False)

# Bind the data to the jinja2 template.
# The template and index paths below are relative to the current working
# directory, so no directory climbing is needed for them.
with open("src/wordtagger/template.html", "r", encoding="utf-8") as template_open:
    # autoescape: the words come straight from stdin and end up inside HTML
    # attributes and element content, so they must be escaped when rendered.
    template = Template(template_open.read(), autoescape=True)
index_render = template.render(words_and_tags=words_and_tags)

# And render an html file!
print(index_render)
# The template declares charset utf-8, so the file is written as utf-8
with open("output/wordtagger/index.html", "w", encoding="utf-8") as index_open:
    index_open.write(index_render)
|
Loading…
Reference in New Issue