Renamed Wordtagger to reading_structure. Updated the script to the newest version and updated the makefile. WARNING: WeasyPrint is now an extra dependency.

7 years ago · 3deaaeac0b
parent 087459e01b
commit 3deaaeac0b
10 changed files with 417 additions and 196 deletions
--- a/18
+++ b/18
@ -70,16 +70,14 @@ hocrs: ## hocr with tesseract and then change extension to .html

 #OUTPUT GENERATION RECIPES

-output/wordtagger/index.html: ocr/output.txt ## Analyzes OCR'ed text using a Part of Speech (POS) tagger. Outputs a string of tags (e.g. nouns, verbs, adjectives, and adverbs). Dependencies: python3's nltk, jinja2
-	mkdir -p output/wordtagger
-	cp src/wordtagger/jquery.min.js output/wordtagger
-	cp src/wordtagger/script.js output/wordtagger
-	cp src/wordtagger/style.css output/wordtagger
-	cat $< | python3 src/wordtagger/wordtagger.py
-#  install nltk's 'averaged_perceptron_tagger':
-#  $ python 3
-#  >>> import nltk
-#  >>> nltk.download('averaged_perceptron_tagger')
+output/reading_structure/index.html: ocr/output.txt ## Analyzes OCR'ed text using a Part of Speech (POS) tagger. Outputs a string of tags (e.g. nouns, verbs, adjectives, and adverbs) as index.html and renders it to poster.pdf. Dependencies: python3's nltk, jinja2, weasyprint
+	mkdir -p $(@D)
+	cp src/reading_structure/jquery.min.js $(@D)
+	cp src/reading_structure/script.js $(@D)
+	cp src/reading_structure/style.css $(@D)
+	cp src/reading_structure/print.css $(@D)
+	python3 src/reading_structure/reading_structure.py < $<
+	weasyprint -s $(@D)/print.css $@ $(@D)/poster.pdf

 output/chatbot.txt: ocr/output.txt ## Comments a text with a simple chatbot. Dependencies: python3's chatterbot
 	cat $< | python3 src/textbotconversation.py $(@)
--- a/src/reading_structure/jquery.min.js
+++ b/src/reading_structure/jquery.min.js
--- a/src/reading_structure/print.css
+++ b/src/reading_structure/print.css
@ -0,0 +1,99 @@
+@page {
+  /* dimensions for the whole page */
+  size: A3 portrait;
+  margin: 2rem 2rem 3em 2rem;
+  background-color: #003cb3;
+
+  @bottom-center {
+    content: 'make output/reading_structure';
+    font-family: 'Ubuntu Mono', monospace;
+    white-space: pre;
+    color: #fff;
+    padding-bottom: 2rem;
+  }
+
+}
+
+/* ---
+
+ BASIC ELEMENTS
+
+ --- */
+
+body {
+  background-color: transparent !important; /* 'none' is not a valid colour value; transparent lets the @page background show through */
+  font-size: 20px;
+}
+
+.container { /* print layout: full-width blue sheet without the screen border */
+  margin: 0 auto;
+  padding: 2rem 2.5rem;
+  max-width: 100% !important;
+  min-height: 90% !important;
+  background-color: #003cb3 !important;
+  border: none !important;
+}
+
+span.wrapper {
+  display: inline-block;
+  font-family: 'PT Serif', serif;
+  font-size: 1.5rem;
+  text-align:center;
+  position: relative;
+  margin-bottom: 1rem;
+  color: #fff;
+}
+
+.action-container {
+  display: none;
+}
+
+/* ---
+
+ ELEMENTS IN .WRAPPER
+
+ --- */
+
+.word, .tag {
+  display: block;
+  position: relative;
+  margin: 0 auto;
+  clear:both;
+}
+
+.word {
+  margin-top: 0.2rem;
+}
+
+span.invisible {
+  /* NOTE(review): no declarations here — presumably deliberate so that tags marked .invisible still print; confirm */
+}
+
+span.tag {
+  color: #fff;
+  opacity: 0.5;
+  font-family: 'Ubuntu Mono', monospace;
+  font-size: 0.8rem;
+}
+
+/* Show original word when hovering label */
+span.word_label, .stopword .word {
+  display: block;
+  opacity: 0;
+  width: 100%;
+  font-size: 1rem;
+}
+
+.stopword > .wordtype:hover {
+  opacity: 1;
+  /* keep the stopword label fully opaque while it is hovered */
+}
+
+.stopword > .wordtype {
+  opacity: 1;
+}
+
+/* Spacing fix for punctuation */
+.punctuation {
+  margin-left: -4px;
+}
--- a/src/reading_structure/reading_structure.py
+++ b/src/reading_structure/reading_structure.py
@ -12,7 +12,7 @@ from jinja2 import Template
 # Define input, tokenize and safe tokens to dictionary. Use index as ID for each word.
 input = stdin.read()
 words = nltk.word_tokenize(input)
-words_and_tags = {'item ' + str(index) : {'word':word} for index , word in enumerate(words)}
+words_and_tags = {index : {'word':word} for index , word in enumerate(words)}
 print(words_and_tags)

 # == FILTER FUNCTIONS ==
@ -68,7 +68,7 @@ def POS_tagger(list):
        elif tag == 'CD':
            POS_tag = 'cardinal number'
        elif tag == 'TO':
-            POS_tag = 'to'
+            POS_tag = 'infinitival to'
        elif tag == '.':
            POS_tag = 'line ending'
        elif tag == ',':
@ -82,18 +82,23 @@ def POS_tagger(list):
 # === 2. Sentiment tagger ===
 # Sentiment analyzer based on the NLTK VADER tagger.
 # This function uses words as an input. It tags each word based on its sentiment: negative, neutral or positive
-def sentiment_tagger(word):
+def sentiment_tagger(list):
    analyzer = SentimentIntensityAnalyzer()
-    score = analyzer.polarity_scores(word).get("compound")
+    sentiment_tags = []

-    if score < 0:
-        sentiment_tag = 'negative'
-    elif score > 0:
-        sentiment_tag = 'positive'
-    else:
-        sentiment_tag = 'neutral'
+    for word in list:
+        score = analyzer.polarity_scores(word).get("compound")
+
+        if score < 0:
+            sentiment_tag = 'negative'
+        elif score > 0:
+            sentiment_tag = 'positive'
+        else:
+            sentiment_tag = 'neutral'

-    return sentiment_tag
+        sentiment_tags.append(sentiment_tag)
+
+    return sentiment_tags

 # === 3. Stopword tagger ===
 # Labels words on being a keyword or a stopword, based on the list in the NLTK corpus
@ -113,6 +118,7 @@ def stopword_tagger(word):
 # This tagger outputs a list for all items in the dict at once
 # To avoid double work, it is better to keep this outside the for loop
 POS_tags = POS_tagger(words)
+sentiment_tags = sentiment_tagger(words)
 i = 0

 # Adding tags to words in dictionary, which will be exported as a json file
@ -123,11 +129,14 @@ for item, value in words_and_tags.items():
    # POS
    pos_tag = POS_tags[i]
    words_and_tags[item]['POS'] = pos_tag
-    i = i+1
+    #i = i+1

    # Add sentiment tag
-    sentiment_tag = sentiment_tagger(word)
+    # sentiment_tags was computed for every word before this loop, so look the
+    # tag up by position instead of calling sentiment_tagger() once per word.
+    sentiment_tag = sentiment_tags[i]
    words_and_tags[item]['sentiment'] = sentiment_tag
+    i = i+1

    # Add stopword tag
    stopword_tag = stopword_tagger(word)
@ -139,18 +148,18 @@ for item, value in words_and_tags.items():
 # Save data into a json file
 print(words_and_tags)
 #with open("data.json", 'w') as f:
-with open(os.path.dirname(os.path.dirname(os.path.dirname( __file__ ))) + "output/wordtagger/data.json", 'w') as f:
+with open(os.path.dirname(os.path.dirname(os.path.dirname( __file__ ))) + "output/reading_structure/data.json", 'w') as f:
  json.dump(words_and_tags, f, ensure_ascii=False)

 #let's bind it to a jinja2 template
 # Jinja moves up one level by default, so I do not need to do it myself as in line 141
-template_open = open("src/wordtagger/template.html", "r")
+template_open = open("src/reading_structure/template.html", "r")
 template = Template(template_open.read())
 index_render = template.render(words_and_tags=words_and_tags)
 #print(text_render)

 # And render an html file!
 print(index_render)
-index_open = open("output/wordtagger/index.html", "w")
+index_open = open("output/reading_structure/index.html", "w")
 index_open.write(index_render)
 index_open.close()
--- a/src/reading_structure/script.js
+++ b/src/reading_structure/script.js
@ -0,0 +1,112 @@
+/*
+The index.html contains all words and their labels. This script shows and hides
+specific words based on their label: the word is hidden and its respective label is shown in the text instead.
+
+Each collection of a word and its labels is inside a .wrapper with an id.
+The wrapper's classes carry the values of the labels (e.g. class="wrapper keyword neutral noun"); the word itself is added as a class on the inner .word span.
+By default all words inside the wrapper have the class .word. This class is visible.
+By default all labels (noun, neutral etc.) have the class .tag and another class with the type of label (wordtype, sentiment, pos). They also start with the class .invisible, which style.css sets to display:none.
+
+If the user clicks anywhere in the .container, the page changes which words are hidden. The state changes.
+
+What happens inside the wrapper if the state changes to hide all nouns?
+
+- The previous filter is disabled. All tags are invisible. All words are visible.
+- The words in the wrappers with class noun are selected. They get the class word_label, which means: only visible when hovering the wrapper.
+- The span with the text 'noun' and class 'pos' loses the class invisible. The tag is now visible in the text.
+
+Aside: What about Weasyprint?
+Weasyprint can't see the changes made by this script. It will only show the normal text, without any labels.
+To print the text with the desired styling, there is a specific stylesheet called print.css.
+*/
+
+
+$(document).ready(function(){
+
+  // State 1 selectors for the stopword tagger: the stopwords and their 'stopword' labels
+  var stopword_word = $('.stopword > .word');
+  var stopword_label = $('.stopword > .wordtype');
+
+  // State 2 selectors for the sentiment tagger: positive and negative words and their sentiment labels
+  var posneg_word = $('.negative > .word, .positive > .word');
+  var posneg_label = $('.negative > .sentiment, .positive > .sentiment');
+
+  // State 3 selectors for the noun tagger
+  var noun_word = $('.noun > .word');
+  var noun_label = $('.noun > .pos');
+
+  // State 4 selectors for the adjective and adverb tagger
+  var ad_word = $('.adjective > .word, .adverb > .word');
+  var ad_label = $('.adjective > .pos, .adverb > .pos');
+
+  // State 5 selectors for the determiner, pronoun, preposition and infinitival to tagger
+  var dppt_word = $('.determiner > .word, .pronoun > .word, .to > .word, .preposition > .word');
+  var dppt_label = $('.determiner > .pos, .pronoun > .pos, .to > .pos, .preposition > .pos');
+
+  // State 6 selectors for the sentiment tagger: neutral words and their sentiment labels
+  var neutral_word = $('.neutral > .word');
+  var neutral_label = $('.neutral > .sentiment');
+
+  // On page load, prepare the view for state 1: hide all stopwords and show the stopword label instead
+  var state = 1;
+  stopword_word.addClass('word_label');
+  stopword_label.removeClass('invisible');
+
+  // Each click on the container undoes the filter of the current state and applies the filter of the next one
+  $('.container').click( function() {
+    console.log(state);
+
+    if (state == 1) {
+      stopword_word.removeClass('word_label');
+      stopword_label.addClass('invisible');
+
+      posneg_word.addClass('word_label');
+      posneg_label.removeClass('invisible');
+    }
+
+    if (state == 2) {
+      posneg_word.removeClass('word_label');
+      posneg_label.addClass('invisible');
+
+      noun_word.addClass('word_label');
+      noun_label.removeClass('invisible');
+    }
+
+    if (state == 3) {
+      noun_word.removeClass('word_label');
+      noun_label.addClass('invisible');
+
+      ad_word.addClass('word_label');
+      ad_label.removeClass('invisible');
+    }
+
+    if (state == 4) {
+      ad_word.removeClass('word_label');
+      ad_label.addClass('invisible');
+
+      dppt_word.addClass('word_label');
+      dppt_label.removeClass('invisible');
+    }
+
+    if (state == 5) {
+      dppt_word.removeClass('word_label');
+      dppt_label.addClass('invisible');
+
+      neutral_word.addClass('word_label');
+      neutral_label.removeClass('invisible');
+    }
+
+    if (state == 6) {
+      neutral_word.removeClass('word_label');
+      neutral_label.addClass('invisible');
+
+      stopword_word.addClass('word_label');
+      stopword_label.removeClass('invisible');
+
+      state = 0; // becomes 1 again after the increment below
+    }
+
+  state = state+1;
+});
+
+});
--- a/src/reading_structure/style.css
+++ b/src/reading_structure/style.css
@ -0,0 +1,140 @@
+/* ---
+
+ BASIC ELEMENTS
+
+ --- */
+
+body {
+  background-color: #dfdfdf;
+}
+
+.container {
+  margin: 2rem auto;
+  padding: 4rem 5rem;
+  min-width: 40rem;
+  max-width: 45%;
+  min-height: 90vh;
+  background-color: #fff;
+  cursor: pointer;
+  border: 0px solid #aeaeae;
+  box-shadow: 0 1px 15px rgba(0,0,0,0.12), 0 1px 3px rgba(0,0,0,0.24);
+}
+
+.action-container {
+  /* pinned to the bottom-right corner of the viewport; no float needed on a fixed box */
+  position: fixed;
+  bottom: 2rem;
+  right: 0;
+  margin: 2rem;
+  padding: 2rem 2.5rem;
+  max-width: 45%;
+}
+
+span.wrapper {
+  display: inline-block;
+  font-family: 'PT Serif', serif;
+  font-size: 1.2rem;
+  text-align:center;
+  position: relative;
+  margin-bottom: 0.75rem;
+}
+
+/* ---
+
+ ELEMENTS IN ACTION CONTAINER
+
+ --- */
+
+ .action-container a {
+   font-family: 'PT Serif', serif;
+   padding: 0.4rem;
+   background: black;
+   color: white;
+   text-decoration: none;
+   box-shadow: none;
+transition: all 0.3s cubic-bezier(.25,.8,.25,1);
+ }
+
+ .action-container a:hover {
+   box-shadow: 0 1px 10px rgba(0,0,0,0.12), 0 1px 10px rgba(0,0,0,0.24);
+ }
+
+ .action-container p {
+   margin: 1.25rem 2rem 0 0;
+ }
+
+
+/* ---
+
+ ELEMENTS IN .WRAPPER
+
+ --- */
+
+.word, .tag {
+  display: block;
+  position: relative;
+  margin: auto;
+  clear:both;
+}
+
+span.invisible {
+  display: none;
+}
+
+span.word {
+  margin-top: 1.25rem;
+}
+
+span.tag {
+  color: #fff;
+  font-family: 'Ubuntu Mono', monospace;
+  margin-bottom: -1.25rem;
+}
+
+
+
+/* Show original word when hovering label */
+span.word_label {
+  display: block;
+  opacity: 0;
+  width: 100%;
+  font-family: 'Ubuntu Mono', monospace;
+}
+
+.wrapper:hover > span.word_label {
+  opacity: 0.5;
+}
+
+/* Spacing fix for punctuation */
+.punctuation {
+  margin-left: -4px;
+}
+
+/* Colors for tags */
+.noun .pos {
+  color: #003cb3;
+}
+
+.stopword .wordtype {
+  color: #b83e54;
+}
+
+.neutral .sentiment {
+  color: #8491a5;
+}
+
+.negative .sentiment {
+  color: #c9805b;
+}
+
+.positive .sentiment {
+  color: #44a889;
+}
+
+.adjective .pos, .adverb .pos {
+  color: #8e445e;
+}
+
+.pronoun .pos, .determiner .pos, .to .pos, .preposition .pos {
+  color: #2f7f40;
+}
--- a/src/reading_structure/template.html
+++ b/src/reading_structure/template.html
@ -0,0 +1,33 @@
+<!DOCTYPE html>
+<html>
+  <head>
+    <title>Reading the Structure</title>
+    <meta charset="utf-8" />
+    <link rel="stylesheet" href="style.css" type="text/css" media="screen" />
+    <script type="text/javascript" src="jquery.min.js"></script>
+    <script type="text/javascript" src="script.js"></script>
+
+  </head>
+
+  <body>
+    <div class="container"><p>
+      {% for item, value in words_and_tags.items() %}
+      {# One .wrapper per token: three hidden tag spans plus the word itself; the word is escaped because it comes from OCR'ed text #}
+      <span id="{{ item }}" class="wrapper {{ value['wordtype'] }} {{ value['sentiment'] }} {{ value['POS'] }}">
+          <span class="tag wordtype invisible"> {{ value['wordtype'] }} </span>
+          <span class="tag sentiment invisible"> {{ value['sentiment'] }}</span>
+          <span class="tag pos invisible"> {{ value['POS'] }}</span>
+          <span class="word {% if value['word'] in [',','.','(',')',';',':'] %} punctuation {% else %} {{ value['word'] | e }} {% endif %}"> {{ value['word'] | e }}</span>
+      </span>
+
+     {% endfor %}
+
+    </p></div>
+
+    <div class="action-container">
+      <p><a href="data.json" download>Export data as json</a> </p>
+      <p><a href="poster.pdf" download>Print this as a poster</a> </p>
+    </div>
+
+  </body>
+</html>
--- a/src/wordtagger/script.js
+++ b/src/wordtagger/script.js
@ -1,64 +0,0 @@
-$(document).ready(function(){
-  var state = 0;
-
-  $('.noun').addClass('fade-out');
-  $('.preposition').addClass('red');
-  $('.verb').addClass('blue');
-  $('.determiner').addClass('cyan');
-
-  $(document).bind('contextmenu', function(e) { return false; });
-
-  $( ".word" ).contextmenu(function() {
-    console.log($(this).hasClass('underline'));
-    $(this).hasClass('underline') == false
-    ? $(this).addClass('underline')
-    : $(this).removeClass('underline');
-  });
-
-  $('.word').click( function() {
-    var el = $('.word');
-    console.log(state);
-
-    if (state == 0) {
-      $('.word').removeClass('fade-out red blue cyan');
-
-      $('.stopword').addClass('fade-out');
-    }
-
-    else if (state == 1) {
-      $('.stopword').removeClass('fade-out');
-      $('.neutral').addClass('fade-out');
-    }
-
-    else if (state == 2) {
-      $('.neutral').removeClass('fade-out');
-      $('.noun').addClass('fade-out');
-      $('.preposition').addClass('red');
-      $('.verb').addClass('blue');
-      state = -1;
-    }
-
-    $('.word').each(function() {
-      var el = $(this);
-
-      if (state == 0) {
-        el.empty();
-        el.html(el.data("stopword") + "&nbsp;");
-      }
-
-      else if (state == 1) {
-        el.empty();
-        el.html(el.data("sentiment") + "&nbsp;");
-      }
-
-      else {
-        el.empty();
-        el.html(el.data("pos") + "&nbsp;");
-      }
-
-    });
-
-    state = state+1;
-  });
-
-});
--- a/src/wordtagger/style.css
+++ b/src/wordtagger/style.css
@ -1,86 +0,0 @@
-* {
-min-height: 0;
-min-width: 0;
-}
-
-body {
-  background: #639ab2;
-  font-size: 15px;
-  font-family: 'Ubuntu Mono', monospace;
-}
-
-.prelative {
-    flex-shrink: 0;
-}
-
-div.container {
-  width: 100%;
-  display: -webkit-box;      /* OLD - iOS 6-, Safari 3.1-6 */
-  display: -moz-box;         /* OLD - Firefox 19- (buggy but mostly works) */
-  display: -ms-flexbox;      /* TWEENER - IE 10 */
-  display: -webkit-flex;     /* NEW - Chrome */
-  display:  flex;
-  flex-wrap: wrap;
-}
-
-.word {
-  font-size: 3rem;
-  float: left;
-  position: relative;
-  text-align: center;
-  display: -webkit-box;      /* OLD - iOS 6-, Safari 3.1-6 */
-  display: -moz-box;         /* OLD - Firefox 19- (buggy but mostly works) */
-  display: -ms-flexbox;      /* TWEENER - IE 10 */
-  display: -webkit-flex;     /* NEW - Chrome */
-  display:flex;
-  justify-content: center;
-}
-
-.word:before,
-.word:after {
-    content: '';
-    color: #fff;
-    position: absolute;
-    font-family: 'PT Serif', serif;
-    font-weight: bold;
-    font-size: 1.5rem;
-    font-style: italic;
-    opacity: 0;
-    width: 100%;
-  }
-
-.word:before {
-    content: attr(data-txt);
-    flex-shrink: 1;
-}
-
-.word:hover:before,
-.word:active:after {
-    opacity: 1;
-}
-
-
-
-.fade-out {
-  color: #275152;
-}
-
-p {
-  margin: 1rem;
-}
-
-.red {
-  color: red;
-}
-
-.blue {
-  color: blue;
-}
-
-.cyan {
-  color: cyan;
-}
-
-.underline {
-  text-decoration: underline;
-}
--- a/src/wordtagger/template.html
+++ b/src/wordtagger/template.html
@ -1,20 +0,0 @@
-<!DOCTYPE html>
-<html>
-  <head>
-    <title>Wordtagger</title>
-    <meta charset="utf-8" />
-    <link rel="stylesheet" href="style.css" type="text/css" media="screen" />
-    <script type="text/javascript" src="jquery.min.js"></script>
-    <script type="text/javascript" src="script.js"></script>
-    <!--meta name="viewport" content="width=device-width"-->
-  </head>
-
-  <body>
-    <div class="container"><p>
-      {% for item, value in words_and_tags.items() %}
-      <span id="{{item}}" class="word {{words_and_tags[item]['sentiment']}} {{words_and_tags[item]['wordtype']}} {{words_and_tags[item]['POS']}}" data-txt="{{ words_and_tags[item]['word'] }}" data-pos="{{words_and_tags[item]['POS']}}" {% if words_and_tags[item]['word'] in [',','.','(',')'] %} data-sentiment= "{{ words_and_tags[item]['word'] }}" {% else %} data-sentiment= '{{ words_and_tags[item]['sentiment'] }}' {% endif %} {% if words_and_tags[item]['wordtype'] == 'stopword' %} data-stopword= "stopword" {% else %} data-stopword= '{{ words_and_tags[item]['word'] }}' {% endif %} >{{words_and_tags[item]['POS']}}&nbsp;</span>
-     {% endfor %}
-    </p>
-      </div>
-  </body>
-</html>