Changed name of Wordtagger to reading_structure. Updated script to newest version. Updated makefile. WARNING: Weasyprint is an extra dependency.

7 years ago · 3deaaeac0b
parent 087459e01b
commit 3deaaeac0b
10 changed files with 417 additions and 196 deletions
--- a/18
+++ b/18
@ -70,16 +70,14 @@ hocrs: ## hocr with tesseract and then change extension to .html
 #OUTPUT GENERATION RECIPES
-output/wordtagger/index.html: ocr/output.txt ## Analyzes OCR'ed text using a Part of Speech (POS) tagger. Outputs a string of tags (e.g. nouns, verbs, adjectives, and adverbs). Dependencies: python3's nltk, jinja2
+output/reading_structure/index.html: ocr/output.txt ## Analyzes OCR'ed text using a Part of Speech (POS) tagger. Outputs a string of tags (e.g. nouns, verbs, adjectives, and adverbs). Dependencies: python3's nltk, jinja2, weasyprint
-	mkdir -p output/wordtagger
+	mkdir -p output/reading_structure
-	cp src/wordtagger/jquery.min.js output/wordtagger
+	cp src/reading_structure/jquery.min.js output/reading_structure
-	cp src/wordtagger/script.js output/wordtagger
+	cp src/reading_structure/script.js output/reading_structure
-	cp src/wordtagger/style.css output/wordtagger
+	cp src/reading_structure/style.css output/reading_structure
-	cat $< | python3 src/wordtagger/wordtagger.py
+	cp src/reading_structure/print.css output/reading_structure
-#  install nltk's 'averaged_perceptron_tagger':
+	cat $< | python3 src/reading_structure/reading_structure.py
-#  $ python 3
+	weasyprint -s output/reading_structure/print.css output/reading_structure/index.html output/reading_structure/poster.pdf
 #  >>> import nltk
 #  >>> nltk.download('averaged_perceptron_tagger')
 output/chatbot.txt: ocr/output.txt ## Comments a text with a simple chatbot. Dependencies: python3's chatterbot
 	cat $< | python3 src/textbotconversation.py $(@)
--- a/src/reading_structure/jquery.min.js
+++ b/src/reading_structure/jquery.min.js
--- a/src/reading_structure/print.css
+++ b/src/reading_structure/print.css
@ -0,0 +1,99 @@
/*
 Print stylesheet for the poster. The Makefile recipe passes this file to
 WeasyPrint with -s when it renders index.html to poster.pdf.
*/
@page {
  /* dimensions for the whole page */
  size: A3 portrait;
  margin: 2rem 2rem 3em 2rem;
  background-color: #003cb3;
  /* Footer text placed in the bottom-center margin box of every page */
  @bottom-center {
    content: 'make output/reading_structure';
    font-family: 'Ubuntu Mono', monospace;
    white-space: pre;
    color: #fff;
    padding-bottom: 2rem;
  }
 }
 /* ---
 BASIC ELEMENTS
 --- */
 body {
  /* `none` is not a valid colour value, so the old declaration was ignored;
     `transparent` is the valid way to say "no background". */
  background-color: transparent !important;
  font-size: 20px;
 }
 /* The text block fills the page and takes the page's blue background */
 .container {
  margin: 0 auto;
  padding: 2rem 2.5rem;
  max-width: 100% !important;
  min-height: 90% !important;
  background-color: #003cb3 !important;
  border: none !important;
 }
 /* One wrapper per token: white serif text, centred over/under its tags */
 span.wrapper {
  display: inline-block;
  font-family: 'PT Serif', serif;
  font-size: 1.5rem;
  text-align: center;
  position: relative;
  margin-bottom: 1rem;
  color: #fff;
 }
 /* The download links make no sense on paper */
 .action-container {
  display: none;
 }
 /* ---
 ELEMENTS IN .WRAPPER
 --- */
 .word, .tag {
  display: block;
  position: relative;
  margin: 0 auto;
  clear: both;
 }
 .word {
  margin-top: 0.2rem;
 }
 span.invisible {
  /* Deliberately left without declarations here, so tags carrying .invisible
     are not hidden by this stylesheet.
     NOTE(review): confirm that showing every tag on the poster is intended. */
 }
 span.tag {
  color: #fff;
  opacity: 0.5;
  font-family: 'Ubuntu Mono', monospace;
  font-size: 0.8rem;
 }
 /* Hide the original word wherever a label stands in for it */
 span.word_label, .stopword .word {
  display: block;
  opacity: 0;
  width: 100%;
  font-size: 1rem;
 }
 /* A trailing comma after the selector made this whole rule invalid; removed. */
 .stopword > .wordtype:hover {
  opacity: 1;
  /*color: red;*/
 }
 /* The stopword label is printed at full opacity instead of the 0.5 of span.tag */
 .stopword > .wordtype {
  opacity: 1;
 }
 /* Spacing fix for punctuation */
 .punctuation {
  margin-left: -4px;
 }
--- a/src/reading_structure/reading_structure.py
+++ b/src/reading_structure/reading_structure.py
@ -12,7 +12,7 @@ from jinja2 import Template
 # Define input, tokenize it, and save the tokens to a dictionary. Use the index as the ID for each word.
 input = stdin.read()
 words = nltk.word_tokenize(input)
-words_and_tags = {'item ' + str(index) : {'word':word} for index , word in enumerate(words)}
+words_and_tags = {index : {'word':word} for index , word in enumerate(words)}
 print(words_and_tags)
 # == FILTER FUNCTIONS ==
@ -68,7 +68,7 @@ def POS_tagger(list):
        elif tag == 'CD':
            POS_tag = 'cardinal number'
        elif tag == 'TO':
-            POS_tag = 'to'
+            POS_tag = 'infinitival to'
        elif tag == '.':
            POS_tag = 'line ending'
        elif tag == ',':
@ -82,8 +82,11 @@ def POS_tagger(list):
 # === 2. Sentiment tagger ===
 # Sentiment analyzer based on the NLTK VADER tagger.
 # This function uses words as an input. It tags each word based on its sentiment: negative, neutral or positive
-def sentiment_tagger(word):
+def sentiment_tagger(list):
    analyzer = SentimentIntensityAnalyzer()
    sentiment_tags = []
    for word in list:
        score = analyzer.polarity_scores(word).get("compound")
        if score < 0:
@ -93,7 +96,9 @@ def sentiment_tagger(word):
        else:
            sentiment_tag = 'neutral'
-    return sentiment_tag
+        sentiment_tags.append(sentiment_tag)
    return sentiment_tags
 # === 3. Stopword tagger ===
 # Labels words on being a keyword or a stopword, based on the list in the NLTK corpus
@ -113,6 +118,7 @@ def stopword_tagger(word):
 # This tagger outputs a list for all items in the dict at once
 # To avoid double work, it is better to keep this outside the for loop
 POS_tags = POS_tagger(words)
 sentiment_tags = sentiment_tagger(words)
 i = 0
 # Adding tags to words in dictionary, which will be exported as a json file
@ -123,11 +129,14 @@ for item, value in words_and_tags.items():
    # POS
    pos_tag = POS_tags[i]
    words_and_tags[item]['POS'] = pos_tag
-    i = i+1
+    #i = i+1
    # Add sentiment tag
-    sentiment_tag = sentiment_tagger(word)
+    #sentiment_tag = sentiment_tagger(word)
    #words_and_tags[item]['sentiment'] = sentiment_tag
    sentiment_tag = sentiment_tags[i]
    words_and_tags[item]['sentiment'] = sentiment_tag
    i = i+1
    # Add stopword tag
    stopword_tag = stopword_tagger(word)
@ -139,18 +148,18 @@ for item, value in words_and_tags.items():
 # Save data into a json file
 print(words_and_tags)
 #with open("data.json", 'w') as f:
-with open(os.path.dirname(os.path.dirname(os.path.dirname( __file__ ))) + "output/wordtagger/data.json", 'w') as f:
+with open(os.path.dirname(os.path.dirname(os.path.dirname( __file__ ))) + "output/reading_structure/data.json", 'w') as f:
  json.dump(words_and_tags, f, ensure_ascii=False)
 #let's bind it to a jinja2 template
 # Jinja moves up one level by default, so I do not need to do it myself as in line 141
-template_open = open("src/wordtagger/template.html", "r")
+template_open = open("src/reading_structure/template.html", "r")
 template = Template(template_open.read())
 index_render = template.render(words_and_tags=words_and_tags)
 #print(text_render)
 # And render an html file!
 print(index_render)
-index_open = open("output/wordtagger/index.html", "w")
+index_open = open("output/reading_structure/index.html", "w")
 index_open.write(index_render)
 index_open.close()
--- a/src/reading_structure/script.js
+++ b/src/reading_structure/script.js
@ -0,0 +1,112 @@
 /*
 The index.html features all words and their labels. The function of this script is to show and hide
 specific words based on their label. In this case the word is hidden, but its respective label is shown in the text.
 Each collection of words and labels is inside a .wrapper with an id.
 The classes of these wrappers feature the word and the values of the labels (e.g. class="wrapper software noun keyword neutral").
 By default, all words inside the wrapper have the class .word. This class is visible.
 By default, all labels (noun, neutral, etc.) have the class .tag and another class with the type of label (POS, sentiment, etc.). .tag is display:none by default.
 If the user clicks on one of the .wrapper elements, the page changes the hidden words. The state changes.
 What happens inside the wrapper if the state changes to hide all nouns?
 - Previous filter is disabled. All tags invisible. All words visible.
 - the words in the wrapper with class noun are selected. They get the class word_label. Which means: only visible on :hovering the wrapper.
 - the span with the text 'noun' and class 'pos' will lose the class invisible. The tag is now visible in the text.
 Aside: What about Weasyprint?
 Weasyprint can't see the changes made by this script. It will only show the normal text, without any labels.
 To print the text with the desired styling, there is a specific stylesheet called print.css.
 */
 $(document).ready(function(){
  // Every view hides one group of words and shows that group's label instead.
  // `words` are the spans that get hidden, `labels` the tag spans that get shown.
  // The array order is the click order; the last view wraps back to the first.
  var views = [
    // 1: stopwords
    {
      words: $('.stopword > .word'),
      labels: $('.stopword > .wordtype')
    },
    // 2: words with a negative or positive sentiment
    {
      words: $('.negative > .word, .positive > .word'),
      labels: $('.negative > .sentiment, .positive > .sentiment')
    },
    // 3: nouns
    {
      words: $('.noun > .word'),
      labels: $('.noun > .pos')
    },
    // 4: adjectives and adverbs
    {
      words: $('.adjective > .word, .adverb > .word'),
      labels: $('.adjective > .pos, .adverb > .pos')
    },
    // 5: determiners, pronouns, prepositions and infinitival to
    {
      words: $('.determiner > .word, .pronoun > .word, .to > .word, .preposition > .word'),
      labels: $('.determiner > .pos, .pronoun > .pos, .to > .pos, .preposition > .pos')
    },
    // 6: words with a neutral sentiment
    {
      words: $('.neutral > .word'),
      labels: $('.neutral > .sentiment')
    }
  ];

  // Swap the words of a view for their labels
  function enable(view) {
    view.words.addClass('word_label');
    view.labels.removeClass('invisible');
  }

  // Undo enable(): words visible again, labels hidden
  function disable(view) {
    view.words.removeClass('word_label');
    view.labels.addClass('invisible');
  }

  // `current` is the 1-based number of the active view; the page starts on view 1
  var current = 1;
  enable(views[0]);

  // A click anywhere on the text advances to the next view
  $('.container').click( function() {
    console.log(current);
    var upcoming = (current % views.length) + 1;
    // Disable first, then enable: a word can belong to both the old and the new group
    disable(views[current - 1]);
    enable(views[upcoming - 1]);
    current = upcoming;
  });
 });
--- a/src/reading_structure/style.css
+++ b/src/reading_structure/style.css
@ -0,0 +1,140 @@
 /*
 Screen stylesheet for the interactive page; template.html links it with media="screen".
 */
 /* ---
 BASIC ELEMENTS
 --- */
 body {
  background-color: #dfdfdf;
 }
 /* White sheet holding the text. script.js binds its click handler to this
    element, hence the pointer cursor. */
 .container {
  margin: 2rem auto;
  padding: 4rem 5rem;
  min-width: 40rem;
  max-width: 45%;
  min-height: 90vh;
  background-color: #fff;
  cursor: pointer;
  border: 0px solid #aeaeae;
  box-shadow: 0 1px 15px rgba(0,0,0,0.12), 0 1px 3px rgba(0,0,0,0.24);
 }
 /* Box with the export/print links, fixed to the bottom-right of the viewport */
 .action-container {
    margin: 2rem;
    padding: 2rem 2.5rem;
    max-width: 45%;
    float: right;
    position: fixed;
    bottom: 2rem;
    right: 0;
 }
 /* One wrapper per token; it contains the tag spans and the word span */
 span.wrapper {
  display: inline-block;
  font-family: 'PT Serif', serif;
  font-size: 1.2rem;
  text-align:center;
  position: relative;
  margin-bottom: 0.75rem;
 }
 /* ---
 ELEMENTS IN ACTION CONTAINER
 --- */
 /* Links are styled as black buttons that gain a shadow on hover */
 .action-container a {
   font-family: 'PT Serif', serif;
   padding: 0.4rem;
   background: black;
   color: white;
   text-decoration: none;
   box-shadow: none;
 transition: all 0.3s cubic-bezier(.25,.8,.25,1);
 }
 .action-container a:hover {
   box-shadow: 0 1px 10px rgba(0,0,0,0.12), 0 1px 10px rgba(0,0,0,0.24);
 }
 .action-container p {
   margin: 1.25rem 2rem 0 0;
 }
 /* ---
 ELEMENTS IN .WRAPPER
 --- */
 /* Words and tags are stacked as centred blocks inside their wrapper */
 .word, .tag {
  display: block;
  position: relative;
  margin: auto;
  clear:both;
 }
 /* Tags carry this class in the template; script.js removes it to reveal a tag */
 span.invisible {
  display: none;
 }
 span.word {
  margin-top: 1.25rem;
 }
 /* Tags are white unless one of the per-category colour rules below applies.
    NOTE(review): the negative bottom margin mirrors span.word's top margin,
    presumably so a visible tag does not push the line apart; confirm. */
 span.tag {
  color: #fff;
  font-family: 'Ubuntu Mono', monospace;
  margin-bottom: -1.25rem;
 }
 /* Show original word when hovering label */
 /* script.js adds .word_label to the words whose label is currently shown */
 span.word_label {
  display: block;
  opacity: 0;
  width: 100%;
  font-family: 'Ubuntu Mono', monospace;
 }
 .wrapper:hover > span.word_label {
  opacity: 0.5;
 }
 /* Spacing fix for punctuation */
 .punctuation {
  margin-left: -4px;
 }
 /* Colors for tags */
 /* Selected by the tag value on the wrapper plus the tag type on the inner span */
 .noun .pos {
  color: #003cb3;
 }
 .stopword .wordtype {
  color: #b83e54;
 }
 .neutral .sentiment {
  color: #8491a5;
 }
 .negative .sentiment {
  color: #c9805b;
 }
 .positive .sentiment {
  color: #44a889;
 }
 .adjective .pos, .adverb .pos {
  color: #8e445e;
 }
 .pronoun .pos, .determiner .pos, .to .pos, .preposition .pos {
  color: #2f7f40;
 }
--- a/src/reading_structure/template.html
+++ b/src/reading_structure/template.html
@ -0,0 +1,33 @@
 <!DOCTYPE html>
 <html>
  <head>
    <title>Reading the Structure</title>
    <meta charset="utf-8" />
    <link rel="stylesheet" href="style.css" type="text/css" media="screen" />
    <script type="text/javascript" src="jquery.min.js"></script>
    <script type="text/javascript" src="script.js"></script>
  </head>
  <body>
    {#
      Renders one span.wrapper per entry of words_and_tags (key = token index).
      The wrapper's classes are the token's tag values. Inside it sit three
      hidden tag spans (wordtype, sentiment, pos) and the word itself;
      script.js toggles which of them is visible.
      The word comes from OCR'ed text, so it goes through the `e` filter wherever
      it is written into markup. Otherwise a token containing a double quote,
      < or & could break the class attribute or inject HTML.
    #}
    <div class="container"><p>
      {% for item, value in words_and_tags.items() %}
      <span id="{{item}}" class="wrapper {{ value['wordtype'] }} {{ value['sentiment'] }} {{ value['POS'] }}">
          <span class="tag wordtype invisible"> {{ value['wordtype'] }} </span>
          <span class="tag sentiment invisible"> {{ value['sentiment'] }}</span>
          <span class="tag pos invisible"> {{ value['POS'] }}</span>
          {# Punctuation gets a spacing class; every other token gets itself as an extra class #}
          <span class="word {% if value['word'] in [',','.','(',')',';',':'] %} punctuation {% else %} {{ value['word'] | e }} {% endif %}"> {{ value['word'] | e }}</span>
      </span>
     {% endfor %}
    </p></div>
    {# data.json is written next to index.html by reading_structure.py; poster.pdf by the Makefile's weasyprint step #}
    <div class="action-container">
      <p><a href="data.json" download>Export data as json</a> </p>
      <p><a href="poster.pdf" download>Print this as a poster</a> </p>
    </div>
  </body>
 </html>
--- a/src/wordtagger/script.js
+++ b/src/wordtagger/script.js
@ -1,64 +0,0 @@
 $(document).ready(function(){
  // Index of the view currently on screen: 0 = POS tags (what the template
  // renders), 1 = stopwords, 2 = sentiment. Each click moves to the next view.
  var state = 0;

  // Colouring for the initial POS view
  $('.noun').addClass('fade-out');
  $('.preposition').addClass('red');
  $('.verb').addClass('blue');
  $('.determiner').addClass('cyan');

  // Suppress the browser's context menu so a right-click can be used to
  // underline (or un-underline) a single word.
  $(document).bind('contextmenu', function(e) { return false; });
  $( ".word" ).contextmenu(function() {
    console.log($(this).hasClass('underline'));
    $(this).toggleClass('underline');
  });

  $('.word').click( function() {
    var words = $('.word');
    // Name of the data-* attribute that supplies the text of the next view
    var source = 'pos';
    console.log(state);

    if (state == 0) {
      // POS -> stopwords
      words.removeClass('fade-out red blue cyan');
      $('.stopword').addClass('fade-out');
      source = 'stopword';
    }
    else if (state == 1) {
      // stopwords -> sentiment
      $('.stopword').removeClass('fade-out');
      $('.neutral').addClass('fade-out');
      source = 'sentiment';
    }
    else if (state == 2) {
      // sentiment -> POS; -1 becomes 0 again after the increment below
      $('.neutral').removeClass('fade-out');
      $('.noun').addClass('fade-out');
      $('.preposition').addClass('red');
      $('.verb').addClass('blue');
      state = -1;
    }

    // Replace each word's content with the chosen data value followed by a
    // non-breaking space. .text() is used instead of .html() because the values
    // derive from OCR'ed input and must not be parsed as markup.
    words.each(function() {
      var el = $(this);
      el.text(el.data(source) + '\u00a0');
    });

    state = state+1;
  });
 });
--- a/src/wordtagger/style.css
+++ b/src/wordtagger/style.css
@ -1,86 +0,0 @@
 /* Stylesheet for the wordtagger page: words are laid out as large flex items
    and recoloured through helper classes that script.js adds and removes. */
 * {
 min-height: 0;
 min-width: 0;
 }
 body {
  background: #639ab2;
  font-size: 15px;
  font-family: 'Ubuntu Mono', monospace;
 }
 .prelative {
    flex-shrink: 0;
 }
 /* Wrapping flex row; the prefixed display values are fallbacks for older browsers */
 div.container {
  width: 100%;
  display: -webkit-box;      /* OLD - iOS 6-, Safari 3.1-6 */
  display: -moz-box;         /* OLD - Firefox 19- (buggy but mostly works) */
  display: -ms-flexbox;      /* TWEENER - IE 10 */
  display: -webkit-flex;     /* NEW - Chrome */
  display:  flex;
  flex-wrap: wrap;
 }
 /* One element per token; position: relative anchors the pseudo-elements below */
 .word {
  font-size: 3rem;
  float: left;
  position: relative;
  text-align: center;
  display: -webkit-box;      /* OLD - iOS 6-, Safari 3.1-6 */
  display: -moz-box;         /* OLD - Firefox 19- (buggy but mostly works) */
  display: -ms-flexbox;      /* TWEENER - IE 10 */
  display: -webkit-flex;     /* NEW - Chrome */
  display:flex;
  justify-content: center;
 }
 /* Overlay pseudo-elements, invisible (opacity 0) until hover/active */
 .word:before,
 .word:after {
    content: '';
    color: #fff;
    position: absolute;
    font-family: 'PT Serif', serif;
    font-weight: bold;
    font-size: 1.5rem;
    font-style: italic;
    opacity: 0;
    width: 100%;
  }
 /* The :before overlay shows the data-txt attribute, which the template fills with the original word */
 .word:before {
    content: attr(data-txt);
    flex-shrink: 1;
 }
 .word:hover:before,
 .word:active:after {
    opacity: 1;
 }
 /* Helper classes toggled by script.js */
 .fade-out {
  color: #275152;
 }
 p {
  margin: 1rem;
 }
 .red {
  color: red;
 }
 .blue {
  color: blue;
 }
 .cyan {
  color: cyan;
 }
 /* Added/removed on right-click by script.js */
 .underline {
  text-decoration: underline;
 }
--- a/src/wordtagger/template.html
+++ b/src/wordtagger/template.html
@ -1,20 +0,0 @@
 <!DOCTYPE html>
 <html>
  <head>
    <title>Wordtagger</title>
    <meta charset="utf-8" />
    <link rel="stylesheet" href="style.css" type="text/css" media="screen" />
    <script type="text/javascript" src="jquery.min.js"></script>
    <script type="text/javascript" src="script.js"></script>
    <!--meta name="viewport" content="width=device-width"-->
  </head>
  <body>
    <!-- One span.word per token. Its classes are the sentiment, wordtype and POS
         tags. The data attributes carry the text for each view: data-pos,
         data-sentiment (the token itself for , . ( ) punctuation) and
         data-stopword (the literal "stopword" for stopwords, otherwise the token).
         script.js swaps the span's content between these values on click, and
         style.css shows data-txt (the original word) on hover. The span starts
         out showing the POS tag. -->
    <div class="container"><p>
      {% for item, value in words_and_tags.items() %}
      <span id="{{item}}" class="word {{words_and_tags[item]['sentiment']}} {{words_and_tags[item]['wordtype']}} {{words_and_tags[item]['POS']}}" data-txt="{{ words_and_tags[item]['word'] }}" data-pos="{{words_and_tags[item]['POS']}}" {% if words_and_tags[item]['word'] in [',','.','(',')'] %} data-sentiment= "{{ words_and_tags[item]['word'] }}" {% else %} data-sentiment= '{{ words_and_tags[item]['sentiment'] }}' {% endif %} {% if words_and_tags[item]['wordtype'] == 'stopword' %} data-stopword= "stopword" {% else %} data-stopword= '{{ words_and_tags[item]['word'] }}' {% endif %} >{{words_and_tags[item]['POS']}}&nbsp;</span>
     {% endfor %}
    </p>
      </div>
  </body>
 </html>