Merge branch 'master' of git.xpub.nl:/var/www/git.xpub.nl/repos/OuNoPo-make

master
Alice 6 years ago
commit 3bfed0e3d0

@ -1,4 +1,5 @@
images=$(sort $(wildcard images/*.jpg))
images=$(sort $(wildcard images/*.jpg))
# @andre make wildcard so that it takes any image file but doesn't take the listimg.txt file
output_ocr:=$(dir_ocr)/output.txt
tmpfile:= $(shell mktemp)
space:= $(empty) $(empty)
@ -22,7 +23,7 @@ help:
# CLEAN
clean: ## removes output (target) files
rm ocr/output.txt
rm ocr/output.txt
rm $(wildcard output/*)
rm $(tmpfile)
@ -31,12 +32,14 @@ clean: ## removes output (target) files
dirs: ## create the dirs in working dir
@-mkdir -p images/
@-mkdir -p images-tiff/
@-mkdir -p output/
@-mkdir -p output/wordtagger
@-mkdir -p ocr/
@-mkdir -p hocr/
@echo $(color_r)'Directories made': images/ output/
testif:
ifeq ($(OS),Darwin)
@echo $(OS)
@ -50,12 +53,28 @@ ocr/output.txt: ## ocr with tesseract
@echo $(basename $@ .txt)
tesseract $(@D)/list.txt $(basename $@ .txt)
tiffs: ## convert images/ to images-tiff/ Depends on IM
echo $(images)
for i in $(images); \
do tiff=`basename $$i .jpg`.tiff; \
convert -density 300 $$i images-tiff/$$tiff; \
echo $$tiff; \
done;
hocrs: ## hocr with tesseract and then change extension to .html
for i in images-tiff/*.tiff; \
do echo $$i; hocrfile=`basename $$i .tiff`; \
tesseract $$i hocr/$$hocrfile hocr; \
mv hocr/$$hocrfile.hocr hocr/$$hocrfile.html; \
done;
#OUTPUT GENERATION RECIPES
output/tagged-words.txt: ocr/output.txt ## Analyzes OCR'ed text using a Part of Speech (POS) tagger. Outputs a string of tags (e.g. nouns, verbs, adjectives, and adverbs). Dependencies: python3's nltk, nltk's averaged_perceptron_tagger
cat $< | python3 src/wordtagger.py > $(@)
output/wordtagger/index.html: ocr/output.txt ## Analyzes OCR'ed text using a Part of Speech (POS) tagger. Outputs a string of tags (e.g. nouns, verbs, adjectives, and adverbs). Dependencies: python3's nltk, jinja2
cp src/wordtagger/jquery.min.js output/wordtagger
cp src/wordtagger/script.js output/wordtagger
cp src/wordtagger/style.css output/wordtagger
cat $< | python3 src/wordtagger/wordtagger.py
# install nltk's 'averaged_perceptron_tagger':
# $ python 3
# >>> import nltk

@ -1,82 +0,0 @@
import nltk
from sys import stdin, stdout
# Define input
input = stdin.read()
# FILTER FUNCTIONS
# This function cuts a string into words. Then runs a POS tagger for each word. Returns a list with tags
def postagger(string):
words = nltk.word_tokenize(string)
taggedwordlist = nltk.pos_tag(words)
for word, pos in nltk.pos_tag(words):
taggedwordlist = nltk.pos_tag(words)
#print('{0} is a {1}'.format(word,pos)) # Comment out to print the analysis step
taglist = [ pos for word,pos in taggedwordlist ]
#print(taglist)
return taglist;
# This function changes the tags to readable equivalents (NNP to noun for example)
def postagger_readable(list):
readabletaglist = []
for tag in list:
if tag in {"NNP","NNS","NN","NNPS"}:
readabletag = 'noun'
elif tag in {'VB','VBD','VBG','VBN','VBP','VBZ'}:
readabletag = 'verb'
elif tag in {'RB','RBR','RBS','WRB'}:
readabletag = 'adverb'
elif tag in {'PRP','PRP$'}:
readabletag = 'pronoun'
elif tag in {'JJ','JJR','JJS'}:
readabletag = 'adjective'
elif tag == 'IN':
readabletag = 'preposition'
elif tag == 'WDT':
readabletag = 'determiner'
elif tag in {'WP','WP$'}:
readabletag = 'pronoun'
elif tag == 'UH':
readabletag = 'interjection'
elif tag == 'POS':
readabletag = 'possesive ending'
elif tag == 'SYM':
readabletag = 'symbol'
elif tag == 'EX':
readabletag = 'existential there'
elif tag == 'DT':
readabletag = 'determiner'
elif tag == 'MD':
readabletag = 'modal'
elif tag == 'LS':
readabletag = 'list item marker'
elif tag == 'FW':
readabletag = 'foreign word'
elif tag == 'CC':
readabletag = 'coordinating conjunction '
elif tag == 'CD':
readabletag = 'cardinal number'
elif tag == 'TO':
readabletag = 'to'
elif tag == '.':
readabletag = 'line ending'
elif tag == ',':
readabletag = 'comma'
else:
readabletag = tag
readabletaglist.append(readabletag)
return readabletaglist;
# This function creates the output
def main():
taglist = postagger(input)
readabletaglist = postagger_readable(taglist)
stdout.write(' '.join(readabletaglist))
stdout.write('\n')
main()

File diff suppressed because one or more lines are too long

@ -0,0 +1,64 @@
$(document).ready(function(){
var state = 0;
$('.noun').addClass('fade-out');
$('.preposition').addClass('red');
$('.verb').addClass('blue');
$('.determiner').addClass('cyan');
$(document).bind('contextmenu', function(e) { return false; });
$( ".word" ).contextmenu(function() {
console.log($(this).hasClass('underline'));
$(this).hasClass('underline') == false
? $(this).addClass('underline')
: $(this).removeClass('underline');
});
$('.word').click( function() {
var el = $('.word');
console.log(state);
if (state == 0) {
$('.word').removeClass('fade-out red blue cyan');
$('.stopword').addClass('fade-out');
}
else if (state == 1) {
$('.stopword').removeClass('fade-out');
$('.neutral').addClass('fade-out');
}
else if (state == 2) {
$('.neutral').removeClass('fade-out');
$('.noun').addClass('fade-out');
$('.preposition').addClass('red');
$('.verb').addClass('blue');
state = -1;
}
$('.word').each(function() {
var el = $(this);
if (state == 0) {
el.empty();
el.html(el.data("stopword") + "&nbsp;");
}
else if (state == 1) {
el.empty();
el.html(el.data("sentiment") + "&nbsp;");
}
else {
el.empty();
el.html(el.data("pos") + "&nbsp;");
}
});
state = state+1;
});
});

@ -0,0 +1,86 @@
* {
min-height: 0;
min-width: 0;
}
body {
background: #639ab2;
font-size: 15px;
font-family: 'Ubuntu Mono', monospace;
}
.prelative {
flex-shrink: 0;
}
div.container {
width: 100%;
display: -webkit-box; /* OLD - iOS 6-, Safari 3.1-6 */
display: -moz-box; /* OLD - Firefox 19- (buggy but mostly works) */
display: -ms-flexbox; /* TWEENER - IE 10 */
display: -webkit-flex; /* NEW - Chrome */
display: flex;
flex-wrap: wrap;
}
.word {
font-size: 3rem;
float: left;
position: relative;
text-align: center;
display: -webkit-box; /* OLD - iOS 6-, Safari 3.1-6 */
display: -moz-box; /* OLD - Firefox 19- (buggy but mostly works) */
display: -ms-flexbox; /* TWEENER - IE 10 */
display: -webkit-flex; /* NEW - Chrome */
display:flex;
justify-content: center;
}
.word:before,
.word:after {
content: '';
color: #fff;
position: absolute;
font-family: 'PT Serif', serif;
font-weight: bold;
font-size: 1.5rem;
font-style: italic;
opacity: 0;
width: 100%;
}
.word:before {
content: attr(data-txt);
flex-shrink: 1;
}
.word:hover:before,
.word:active:after {
opacity: 1;
}
.fade-out {
color: #275152;
}
p {
margin: 1rem;
}
.red {
color: red;
}
.blue {
color: blue;
}
.cyan {
color: cyan;
}
.underline {
text-decoration: underline;
}

@ -0,0 +1,20 @@
<!DOCTYPE html>
<html>
<head>
<title>Wordtagger</title>
<meta charset="utf-8" />
<link rel="stylesheet" href="style.css" type="text/css" media="screen" />
<script type="text/javascript" src="jquery.min.js"></script>
<script type="text/javascript" src="script.js"></script>
<!--meta name="viewport" content="width=device-width"-->
</head>
<body>
<div class="container"><p>
{% for item, value in words_and_tags.items() %}
<span id="{{item}}" class="word {{words_and_tags[item]['sentiment']}} {{words_and_tags[item]['wordtype']}} {{words_and_tags[item]['POS']}}" data-txt="{{ words_and_tags[item]['word'] }}" data-pos="{{words_and_tags[item]['POS']}}" {% if words_and_tags[item]['word'] in [',','.','(',')'] %} data-sentiment= "{{ words_and_tags[item]['word'] }}" {% else %} data-sentiment= '{{ words_and_tags[item]['sentiment'] }}' {% endif %} {% if words_and_tags[item]['wordtype'] == 'stopword' %} data-stopword= "stopword" {% else %} data-stopword= '{{ words_and_tags[item]['word'] }}' {% endif %} >{{words_and_tags[item]['POS']}}&nbsp;</span>
{% endfor %}
</p>
</div>
</body>
</html>

@ -0,0 +1,156 @@
# LIBS
import nltk
import json
import os
from sys import stdin, stdout
from nltk import ne_chunk, pos_tag, word_tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from jinja2 import Template
# == INPUT AND TOKENIZE ==
# Define input, tokenize and safe tokens to dictionary. Use index as ID for each word.
input = stdin.read()
words = nltk.word_tokenize(input)
words_and_tags = {'item ' + str(index) : {'word':word} for index , word in enumerate(words)}
print(words_and_tags)
# == FILTER FUNCTIONS ==
# === 1. POS_tagger & Named Entity Recognizer ===
# This function cuts a string into words. Then runs a POS tagger for each word. Returns a list with tags
def POS_tagger(list):
taggedwordlist = nltk.pos_tag(list)
for word, pos in nltk.pos_tag(list):
taggedwordlist = nltk.pos_tag(list)
#print('{0} is a {1}'.format(word,pos)) # Comment out to print the analysis step
print(taggedwordlist)
taglist = [ pos for word,pos in taggedwordlist ]
POS_tags = []
for tag in taglist:
if tag in {"NNP","NNS","NN","NNPS"}:
POS_tag = 'noun'
elif tag in {'VB','VBD','VBG','VBN','VBP','VBZ'}:
POS_tag = 'verb'
elif tag in {'RB','RBR','RBS','WRB'}:
POS_tag = 'adverb'
elif tag in {'PRP','PRP$'}:
POS_tag = 'pronoun'
elif tag in {'JJ','JJR','JJS'}:
POS_tag = 'adjective'
elif tag == 'IN':
POS_tag = 'preposition'
elif tag == 'WDT':
POS_tag = 'determiner'
elif tag in {'WP','WP$'}:
POS_tag = 'pronoun'
elif tag == 'UH':
POS_tag = 'interjection'
elif tag == 'POS':
POS_tag = 'possesive ending'
elif tag == 'SYM':
POS_tag = 'symbol'
elif tag == 'EX':
POS_tag = 'existential there'
elif tag == 'DT':
POS_tag = 'determiner'
elif tag == 'MD':
POS_tag = 'modal'
elif tag == 'LS':
POS_tag = 'list item marker'
elif tag == 'FW':
POS_tag = 'foreign word'
elif tag == 'CC':
POS_tag = 'coordinating conjunction '
elif tag == 'CD':
POS_tag = 'cardinal number'
elif tag == 'TO':
POS_tag = 'to'
elif tag == '.':
POS_tag = 'line ending'
elif tag == ',':
POS_tag = 'comma'
else:
POS_tag = tag
POS_tags.append(POS_tag)
#print(POS_tag)
return POS_tags;
# === 2. Sentiment tagger ===
# Sentiment analyzer based on the NLTK VADER tagger.
# This function uses words as an input. It tags each word based on its sentiment: negative, neutral or positive
def sentiment_tagger(word):
analyzer = SentimentIntensityAnalyzer()
score = analyzer.polarity_scores(word).get("compound")
if score < 0:
sentiment_tag = 'negative'
elif score > 0:
sentiment_tag = 'positive'
else:
sentiment_tag = 'neutral'
return sentiment_tag
# === 3. Stopword tagger ===
# Labels words on being a keyword or a stopword, based on the list in the NLTK corpus
def stopword_tagger(word):
stopWords = set(stopwords.words('english'))
if word in stopWords:
stopword_tag = 'stopword'
else:
stopword_tag = 'keyword'
return stopword_tag
# Run POS tagger
# This tagger outputs a list for all items in the dict at once
# To avoid double work, it is better to keep this outside the for loop
POS_tags = POS_tagger(words)
i = 0
# Adding tags to words in dictionary, which will be exported as a json file
# {'item 0' : {'word' : word, 'tagger 1': value 1}}
for item, value in words_and_tags.items():
word = words_and_tags[item]['word']
# POS
pos_tag = POS_tags[i]
words_and_tags[item]['POS'] = pos_tag
i = i+1
# Add sentiment tag
sentiment_tag = sentiment_tagger(word)
words_and_tags[item]['sentiment'] = sentiment_tag
# Add stopword tag
stopword_tag = stopword_tagger(word)
words_and_tags[item]['wordtype'] = stopword_tag
# Add entity tag
# Not functional yet
# Save data into a json file
print(words_and_tags)
#with open("data.json", 'w') as f:
with open(os.path.dirname(os.path.dirname(os.path.dirname( __file__ ))) + "output/wordtagger/data.json", 'w') as f:
json.dump(words_and_tags, f, ensure_ascii=False)
#let's bind it to a jinja2 template
# Jinja moves up one level by default, so I do not need to do it myself as in line 141
template_open = open("src/wordtagger/template.html", "r")
template = Template(template_open.read())
index_render = template.render(words_and_tags=words_and_tags)
#print(text_render)
# And render an html file!
print(index_render)
index_open = open("output/wordtagger/index.html", "w")
index_open.write(index_render)
index_open.close()
Loading…
Cancel
Save