Changed name of Wordtagger to reading_structure. Updated script to newest version. Updated makefile. WARNING: Weasyprint is an extra dependency.

master
jvdhorst 7 years ago
parent 087459e01b
commit 3deaaeac0b

@ -70,16 +70,14 @@ hocrs: ## hocr with tesseract and then change extension to .html
#OUTPUT GENERATION RECIPES #OUTPUT GENERATION RECIPES
output/wordtagger/index.html: ocr/output.txt ## Analyzes OCR'ed text using a Part of Speech (POS) tagger. Outputs a string of tags (e.g. nouns, verbs, adjectives, and adverbs). Dependencies: python3's nltk, jinja2 output/reading_structure/index.html: ocr/output.txt ## Analyzes OCR'ed text using a Part of Speech (POS) tagger. Outputs a string of tags (e.g. nouns, verbs, adjectives, and adverbs). Dependencies: python3's nltk, jinja2, weasyprint
mkdir -p output/wordtagger mkdir -p output/reading_structure
cp src/wordtagger/jquery.min.js output/wordtagger cp src/reading_structure/jquery.min.js output/reading_structure
cp src/wordtagger/script.js output/wordtagger cp src/reading_structure/script.js output/reading_structure
cp src/wordtagger/style.css output/wordtagger cp src/reading_structure/style.css output/reading_structure
cat $< | python3 src/wordtagger/wordtagger.py cp src/reading_structure/print.css output/reading_structure
# install nltk's 'averaged_perceptron_tagger': cat $< | python3 src/reading_structure/reading_structure.py
# $ python 3 weasyprint -s output/reading_structure/print.css output/reading_structure/index.html output/reading_structure/poster.pdf
# >>> import nltk
# >>> nltk.download('averaged_perceptron_tagger')
output/chatbot.txt: ocr/output.txt ## Comments a text with a simple chatbot. Dependencies: python3's chatterbot output/chatbot.txt: ocr/output.txt ## Comments a text with a simple chatbot. Dependencies: python3's chatterbot
cat $< | python3 src/textbotconversation.py $(@) cat $< | python3 src/textbotconversation.py $(@)

@ -0,0 +1,99 @@
/*
 * Print stylesheet for the reading_structure page.
 * Lays the text out as white type on a blue A3 portrait sheet, keeps the
 * tag labels in the flow above each word and drops the on-screen download links.
 * NOTE(review): intended to be handed to WeasyPrint as an extra stylesheet
 * when index.html is rendered to poster.pdf - confirm against the Makefile recipe.
 */
@page {
  /* dimensions for the whole page */
  size: A3 portrait;
  margin: 2rem 2rem 3em 2rem;
  background-color: #003cb3;

  /* Footer line printed in the bottom margin box of every page */
  @bottom-center {
    content: 'make output/reading_structure';
    font-family: 'Ubuntu Mono', monospace;
    white-space: pre;
    color: #fff;
    padding-bottom: 2rem;
  }
}

/* ---
BASIC ELEMENTS
--- */
body {
  /* `none` is not a valid background-color value (the declaration was being
     ignored); `transparent` lets the blue page background show through. */
  background-color: transparent !important;
  font-size: 20px;
}

.container {
  margin: 0 auto;
  padding: 2rem 2.5rem;
  max-width: 100% !important;
  min-height: 90% !important;
  background-color: #003cb3 !important;
  border: none !important;
}

span.wrapper {
  display: inline-block;
  font-family: 'PT Serif', serif;
  font-size: 1.5rem;
  text-align: center;
  position: relative;
  margin-bottom: 1rem;
  color: #fff;
}

/* The download links make no sense on paper */
.action-container {
  display: none;
}

/* ---
ELEMENTS IN .WRAPPER
--- */
.word,
.tag {
  display: block;
  position: relative;
  margin: 0 auto;
  clear: both;
}

.word {
  margin-top: 0.2rem;
}

/* Intentionally empty: this stylesheet does not hide .invisible tags.
   NOTE(review): presumably kept as a hook for print-specific overrides - confirm. */
span.invisible {
}

span.tag {
  color: #fff;
  opacity: 0.5;
  font-family: 'Ubuntu Mono', monospace;
  font-size: 0.8rem;
}

/* Words that are replaced by their label stay in the layout but are fully transparent */
span.word_label,
.stopword .word {
  display: block;
  opacity: 0;
  width: 100%;
  font-size: 1rem;
}

/* A trailing comma after :hover made this selector list invalid, so the
   whole rule was dropped by the parser. */
.stopword > .wordtype:hover {
  opacity: 1;
}

/* The "stopword" label is printed at full strength instead of the 0.5 used for other tags */
.stopword > .wordtype {
  opacity: 1;
}

/* Spacing fix for punctuation */
.punctuation {
  margin-left: -4px;
}

@ -12,7 +12,7 @@ from jinja2 import Template
# Define input, tokenize and safe tokens to dictionary. Use index as ID for each word. # Define input, tokenize and safe tokens to dictionary. Use index as ID for each word.
input = stdin.read() input = stdin.read()
words = nltk.word_tokenize(input) words = nltk.word_tokenize(input)
words_and_tags = {'item ' + str(index) : {'word':word} for index , word in enumerate(words)} words_and_tags = {index : {'word':word} for index , word in enumerate(words)}
print(words_and_tags) print(words_and_tags)
# == FILTER FUNCTIONS == # == FILTER FUNCTIONS ==
@ -68,7 +68,7 @@ def POS_tagger(list):
elif tag == 'CD': elif tag == 'CD':
POS_tag = 'cardinal number' POS_tag = 'cardinal number'
elif tag == 'TO': elif tag == 'TO':
POS_tag = 'to' POS_tag = 'infinitival to'
elif tag == '.': elif tag == '.':
POS_tag = 'line ending' POS_tag = 'line ending'
elif tag == ',': elif tag == ',':
@ -82,8 +82,11 @@ def POS_tagger(list):
# === 2. Sentiment tagger === # === 2. Sentiment tagger ===
# Sentiment analyzer based on the NLTK VADER tagger. # Sentiment analyzer based on the NLTK VADER tagger.
# This function uses words as an input. It tags each word based on its sentiment: negative, neutral or positive # This function uses words as an input. It tags each word based on its sentiment: negative, neutral or positive
def sentiment_tagger(word): def sentiment_tagger(list):
analyzer = SentimentIntensityAnalyzer() analyzer = SentimentIntensityAnalyzer()
sentiment_tags = []
for word in list:
score = analyzer.polarity_scores(word).get("compound") score = analyzer.polarity_scores(word).get("compound")
if score < 0: if score < 0:
@ -93,7 +96,9 @@ def sentiment_tagger(word):
else: else:
sentiment_tag = 'neutral' sentiment_tag = 'neutral'
return sentiment_tag sentiment_tags.append(sentiment_tag)
return sentiment_tags
# === 3. Stopword tagger === # === 3. Stopword tagger ===
# Labels words on being a keyword or a stopword, based on the list in the NLTK corpus # Labels words on being a keyword or a stopword, based on the list in the NLTK corpus
@ -113,6 +118,7 @@ def stopword_tagger(word):
# This tagger outputs a list for all items in the dict at once # This tagger outputs a list for all items in the dict at once
# To avoid double work, it is better to keep this outside the for loop # To avoid double work, it is better to keep this outside the for loop
POS_tags = POS_tagger(words) POS_tags = POS_tagger(words)
sentiment_tags = sentiment_tagger(words)
i = 0 i = 0
# Adding tags to words in dictionary, which will be exported as a json file # Adding tags to words in dictionary, which will be exported as a json file
@ -123,11 +129,14 @@ for item, value in words_and_tags.items():
# POS # POS
pos_tag = POS_tags[i] pos_tag = POS_tags[i]
words_and_tags[item]['POS'] = pos_tag words_and_tags[item]['POS'] = pos_tag
i = i+1 #i = i+1
# Add sentiment tag # Add sentiment tag
sentiment_tag = sentiment_tagger(word) #sentiment_tag = sentiment_tagger(word)
#words_and_tags[item]['sentiment'] = sentiment_tag
sentiment_tag = sentiment_tags[i]
words_and_tags[item]['sentiment'] = sentiment_tag words_and_tags[item]['sentiment'] = sentiment_tag
i = i+1
# Add stopword tag # Add stopword tag
stopword_tag = stopword_tagger(word) stopword_tag = stopword_tagger(word)
@ -139,18 +148,18 @@ for item, value in words_and_tags.items():
# Save data into a json file # Save data into a json file
print(words_and_tags) print(words_and_tags)
#with open("data.json", 'w') as f: #with open("data.json", 'w') as f:
with open(os.path.dirname(os.path.dirname(os.path.dirname( __file__ ))) + "output/wordtagger/data.json", 'w') as f: with open(os.path.dirname(os.path.dirname(os.path.dirname( __file__ ))) + "output/reading_structure/data.json", 'w') as f:
json.dump(words_and_tags, f, ensure_ascii=False) json.dump(words_and_tags, f, ensure_ascii=False)
#let's bind it to a jinja2 template #let's bind it to a jinja2 template
# Jinja moves up one level by default, so I do not need to do it myself as in line 141 # Jinja moves up one level by default, so I do not need to do it myself as in line 141
template_open = open("src/wordtagger/template.html", "r") template_open = open("src/reading_structure/template.html", "r")
template = Template(template_open.read()) template = Template(template_open.read())
index_render = template.render(words_and_tags=words_and_tags) index_render = template.render(words_and_tags=words_and_tags)
#print(text_render) #print(text_render)
# And render an html file! # And render an html file!
print(index_render) print(index_render)
index_open = open("output/wordtagger/index.html", "w") index_open = open("output/reading_structure/index.html", "w")
index_open.write(index_render) index_open.write(index_render)
index_open.close() index_open.close()

@ -0,0 +1,112 @@
/*
The index.html features all words and their labels. The function of this script is to show and hide
specific words based on their label. In this case the word is hidden, but its respective label is shown in the text.
Each collection of words and labels is inside a .wrapper with an id.
The classes of these wrapper feature the word, and the values of the labels (e.g. class="wrapper software noun keyword neutral")
by default all words inside the wrapper have the class .word. This class is visible.
by default all labels (noun, neutral etc.) have the class .tag, another class with the type of label (wordtype, sentiment, pos) and the class .invisible. .invisible is display:none, so the labels start out hidden.
If the user clicks anywhere inside the .container element, the page changes the hidden words. The state changes.
What happens inside the wrapper if the state changes to hide all nouns?
- Previous filter is disabled. All tags invisible. All words visible.
- the words in the wrapper with class noun are selected. They get the class word_label. Which means: only visible on :hovering the wrapper.
- the span with the text 'noun' and class 'pos' will lose the class invisible. The tag is now visible in the text.
Aside: What about Weasyprint?
Weasyprint can't see the changes made by this script. It will only show the normal text, without any labels.
To print the text with the desired styling, there is a specific stylesheet called print.css.
*/
$(document).ready(function () {
    /*
     * One entry per view, in the order the clicks cycle through them.
     * `words` are the .word spans that get replaced, `labels` the .tag spans
     * that are shown in their place.
     */
    var views = [
        // View 1: stopwords, labelled with their word type
        {
            words: $('.stopword > .word'),
            labels: $('.stopword > .wordtype')
        },
        // View 2: negative and positive words, labelled with their sentiment
        {
            words: $('.negative > .word, .positive > .word'),
            labels: $('.negative > .sentiment, .positive > .sentiment')
        },
        // View 3: nouns, labelled with their part of speech
        {
            words: $('.noun > .word'),
            labels: $('.noun > .pos')
        },
        // View 4: adjectives and adverbs
        {
            words: $('.adjective > .word, .adverb > .word'),
            labels: $('.adjective > .pos, .adverb > .pos')
        },
        // View 5: determiners, pronouns, infinitival "to" and prepositions
        {
            words: $('.determiner > .word, .pronoun > .word, .to > .word, .preposition > .word'),
            labels: $('.determiner > .pos, .pronoun > .pos, .to > .pos, .preposition > .pos')
        },
        // View 6: neutral words, labelled with their sentiment
        {
            words: $('.neutral > .word'),
            labels: $('.neutral > .sentiment')
        }
    ];

    // Swap the words of a view for their labels.
    function show(view) {
        view.words.addClass('word_label');
        view.labels.removeClass('invisible');
    }

    // Undo show(): words back to normal, labels hidden again.
    function hide(view) {
        view.words.removeClass('word_label');
        view.labels.addClass('invisible');
    }

    // `state` is the 1-based number of the view currently on screen.
    var state = 1;
    show(views[0]);

    // Every click on the text moves on to the next view and wraps around after the last one.
    $('.container').click(function () {
        console.log(state);

        var upcoming = state % views.length + 1;

        // Hide first, then show: a word can belong to both views, and it
        // must end up carrying the word_label class of the new one.
        hide(views[state - 1]);
        show(views[upcoming - 1]);

        state = upcoming;
    });
});

@ -0,0 +1,140 @@
/* ===
   Screen stylesheet: a white "sheet" of tagged words on a grey page.
   === */

/* ---
BASIC ELEMENTS
--- */
body {
  background-color: #dfdfdf;
}

/* The sheet that holds the text; the pointer cursor marks it as clickable */
.container {
  min-width: 40rem;
  max-width: 45%;
  min-height: 90vh;
  margin: 2rem auto;
  padding: 4rem 5rem;
  border: 0 solid #aeaeae;
  background-color: #ffffff;
  box-shadow: 0 1px 15px rgba(0, 0, 0, 0.12),
              0 1px 3px rgba(0, 0, 0, 0.24);
  cursor: pointer;
}

/* Download links, pinned to the bottom right of the viewport */
.action-container {
  float: right;
  position: fixed;
  right: 0;
  bottom: 2rem;
  max-width: 45%;
  margin: 2rem;
  padding: 2rem 2.5rem;
}

/* One wrapper per token: its tags and the word itself, stacked and centred */
span.wrapper {
  position: relative;
  display: inline-block;
  margin-bottom: 0.75rem;
  font-family: 'PT Serif', serif;
  font-size: 1.2rem;
  text-align: center;
}

/* ---
ELEMENTS IN ACTION CONTAINER
--- */
.action-container a {
  padding: 0.4rem;
  background: black;
  color: white;
  font-family: 'PT Serif', serif;
  text-decoration: none;
  box-shadow: none;
  transition: all 0.3s cubic-bezier(0.25, 0.8, 0.25, 1);
}

.action-container a:hover {
  box-shadow: 0 1px 10px rgba(0, 0, 0, 0.12),
              0 1px 10px rgba(0, 0, 0, 0.24);
}

.action-container p {
  margin: 1.25rem 2rem 0 0;
}

/* ---
ELEMENTS IN .WRAPPER
--- */
.word,
.tag {
  position: relative;
  display: block;
  clear: both;
  margin: auto;
}

/* Tags carrying this class are taken out of the layout */
span.invisible {
  display: none;
}

span.word {
  margin-top: 1.25rem;
}

/* The negative bottom margin pulls the following word up underneath the tag */
span.tag {
  margin-bottom: -1.25rem;
  color: #ffffff;
  font-family: 'Ubuntu Mono', monospace;
}

/* Show original word when hovering label */
span.word_label {
  display: block;
  width: 100%;
  font-family: 'Ubuntu Mono', monospace;
  opacity: 0;
}

.wrapper:hover > span.word_label {
  opacity: 0.5;
}

/* Spacing fix for punctuation */
.punctuation {
  margin-left: -4px;
}

/* Colors for tags */
.noun .pos {
  color: #003cb3;
}

.stopword .wordtype {
  color: #b83e54;
}

.neutral .sentiment {
  color: #8491a5;
}

.negative .sentiment {
  color: #c9805b;
}

.positive .sentiment {
  color: #44a889;
}

.adjective .pos,
.adverb .pos {
  color: #8e445e;
}

.pronoun .pos,
.determiner .pos,
.to .pos,
.preposition .pos {
  color: #2f7f40;
}

@ -0,0 +1,33 @@
<!DOCTYPE html>
<html>
<head>
<title>Reading the Structure</title>
<meta charset="utf-8" />
<link rel="stylesheet" href="style.css" type="text/css" media="screen" />
<script type="text/javascript" src="jquery.min.js"></script>
<script type="text/javascript" src="script.js"></script>
</head>
<body>
<div class="container"><p>
{#
  One .wrapper per token. `words_and_tags` maps an id to a dict with the keys
  'word', 'wordtype', 'sentiment' and 'POS'. The tag values become classes on
  the wrapper (used by script.js and style.css as selectors) and are repeated
  as hidden .tag spans above the word.
  Every value is passed through the `e` (escape) filter: the words come from
  OCR'ed text and would otherwise be inserted into the markup verbatim.
#}
{% for item, value in words_and_tags.items() %}
<span id="{{ item }}" class="wrapper {{ value['wordtype'] | e }} {{ value['sentiment'] | e }} {{ value['POS'] | e }}">
<span class="tag wordtype invisible"> {{ value['wordtype'] | e }} </span>
<span class="tag sentiment invisible"> {{ value['sentiment'] | e }}</span>
<span class="tag pos invisible"> {{ value['POS'] | e }}</span>
<span class="word {% if value['word'] in [',','.','(',')',';',':'] %} punctuation {% else %} {{ value['word'] | e }} {% endif %}"> {{ value['word'] | e }}</span>
</span>
{% endfor %}
</p></div>
<div class="action-container">
<p><a href="data.json" download>Export data as json</a> </p>
<p><a href="poster.pdf" download>Print this as a poster</a> </p>
</div>
</body>
</html>

@ -1,64 +0,0 @@
$(document).ready(function () {
    // Index of the view the NEXT click switches to:
    // 0 = stopwords, 1 = sentiment, 2 = part of speech.
    var state = 0;

    // Colour scheme of the part-of-speech view. Used on page load and again
    // when the cycle returns to that view, so both look the same (the
    // determiner colour used to be applied on load only).
    function applyPosColours() {
        $('.noun').addClass('fade-out');
        $('.preposition').addClass('red');
        $('.verb').addClass('blue');
        $('.determiner').addClass('cyan');
    }

    applyPosColours();

    // Suppress the browser context menu; a right click underlines a word instead.
    $(document).bind('contextmenu', function (e) { return false; });

    $('.word').contextmenu(function () {
        console.log($(this).hasClass('underline'));
        $(this).toggleClass('underline');
    });

    // A left click on any word moves every word on to the next view.
    $('.word').click(function () {
        console.log(state);

        // Name of the data-* attribute whose value is displayed in this view.
        var dataKey = 'pos';

        if (state == 0) {
            $('.word').removeClass('fade-out red blue cyan');
            $('.stopword').addClass('fade-out');
            dataKey = 'stopword';
        }
        else if (state == 1) {
            $('.stopword').removeClass('fade-out');
            $('.neutral').addClass('fade-out');
            dataKey = 'sentiment';
        }
        else if (state == 2) {
            $('.neutral').removeClass('fade-out');
            applyPosColours();
            // Becomes 0 again after the increment below.
            state = -1;
        }

        $('.word').each(function () {
            var el = $(this);
            // text() instead of html(): the value is plain text taken from the
            // input and must not be parsed as markup. \u00a0 is the character
            // that the former "&nbsp;" entity produced.
            el.text(el.data(dataKey) + '\u00a0');
        });

        state = state + 1;
    });
});

@ -1,86 +0,0 @@
/* Reset the minimum sizes of every element */
* {
  min-width: 0;
  min-height: 0;
}

body {
  background: #639ab2;
  font-family: 'Ubuntu Mono', monospace;
  font-size: 15px;
}

.prelative {
  flex-shrink: 0;
}

/* Wrapping flex row; the prefixed values are fallbacks, so their order matters */
div.container {
  display: -webkit-box;  /* OLD - iOS 6-, Safari 3.1-6 */
  display: -moz-box;     /* OLD - Firefox 19- (buggy but mostly works) */
  display: -ms-flexbox;  /* TWEENER - IE 10 */
  display: -webkit-flex; /* NEW - Chrome */
  display: flex;
  flex-wrap: wrap;
  width: 100%;
}

/* A single token; it is the positioning context for its :before/:after overlay */
.word {
  position: relative;
  float: left;
  display: -webkit-box;  /* OLD - iOS 6-, Safari 3.1-6 */
  display: -moz-box;     /* OLD - Firefox 19- (buggy but mostly works) */
  display: -ms-flexbox;  /* TWEENER - IE 10 */
  display: -webkit-flex; /* NEW - Chrome */
  display: flex;
  justify-content: center;
  font-size: 3rem;
  text-align: center;
}

/* Overlay text laid over the token, transparent until revealed below */
.word:before,
.word:after {
  content: '';
  position: absolute;
  width: 100%;
  color: #ffffff;
  font-family: 'PT Serif', serif;
  font-size: 1.5rem;
  font-style: italic;
  font-weight: bold;
  opacity: 0;
}

/* The :before overlay carries the value of the data-txt attribute */
.word:before {
  content: attr(data-txt);
  flex-shrink: 1;
}

.word:hover:before,
.word:active:after {
  opacity: 1;
}

/* Classes toggled from script.js */
.fade-out {
  color: #275152;
}

p {
  margin: 1rem;
}

.red {
  color: red;
}

.blue {
  color: blue;
}

.cyan {
  color: cyan;
}

.underline {
  text-decoration: underline;
}

@ -1,20 +0,0 @@
<!DOCTYPE html>
<html>
<head>
<title>Wordtagger</title>
<meta charset="utf-8" />
<link rel="stylesheet" href="style.css" type="text/css" media="screen" />
<script type="text/javascript" src="jquery.min.js"></script>
<script type="text/javascript" src="script.js"></script>
<!--meta name="viewport" content="width=device-width"-->
</head>
<body>
<div class="container"><p>
{#
  One .word span per token. The tags are classes (for colouring) and data-*
  attributes (script.js swaps the displayed text between them):
    data-txt        the original word
    data-pos        the part-of-speech tag
    data-sentiment  the sentiment tag, or the token itself for , . ( )
    data-stopword   "stopword" for stopwords, otherwise the word itself
  All attribute values are double-quoted and passed through the `e` (escape)
  filter, so a token containing an apostrophe or other markup characters
  cannot terminate the attribute or inject markup.
#}
{% for item, value in words_and_tags.items() %}
<span id="{{ item }}" class="word {{ value['sentiment'] | e }} {{ value['wordtype'] | e }} {{ value['POS'] | e }}" data-txt="{{ value['word'] | e }}" data-pos="{{ value['POS'] | e }}" data-sentiment="{% if value['word'] in [',','.','(',')'] %}{{ value['word'] | e }}{% else %}{{ value['sentiment'] | e }}{% endif %}" data-stopword="{% if value['wordtype'] == 'stopword' %}stopword{% else %}{{ value['word'] | e }}{% endif %}">{{ value['POS'] | e }}&nbsp;</span>
{% endfor %}
</p>
</div>
</body>
</html>
Loading…
Cancel
Save