Merge branch 'master' of git.xpub.nl:/var/www/git.xpub.nl/repos/OuNoPo-make

7 years ago · 3bfed0e3d0
parent 6b2983ee93 d4232c80ef
commit 3bfed0e3d0
7 changed files with 352 additions and 87 deletions
--- a/29
+++ b/29
@ -1,4 +1,5 @@
-images=$(sort $(wildcard images/*.jpg))
+images=$(sort $(wildcard images/*.jpg)) 
+# TODO(@andre): widen the wildcard so it matches any image file type, while making sure it does not pick up the listimg.txt file
 output_ocr:=$(dir_ocr)/output.txt
 tmpfile:= $(shell mktemp)
 space:= $(empty) $(empty)
@ -22,7 +23,7 @@ help:

 # CLEAN
 clean: ## removes output (target) files
-	rm ocr/output.txt 
+	rm ocr/output.txt
 	rm $(wildcard output/*)
 	rm $(tmpfile)

@ -31,12 +32,14 @@ clean: ## removes output (target) files

 dirs: ## create the dirs in working dir
 	@-mkdir -p images/
+	@-mkdir -p images-tiff/
 	@-mkdir -p output/
+	@-mkdir -p output/wordtagger
 	@-mkdir -p ocr/
+	@-mkdir -p hocr/
 	@echo $(color_r)'Directories made': images/ output/


-
 testif:
 ifeq ($(OS),Darwin)
 	@echo $(OS)
@ -50,12 +53,28 @@ ocr/output.txt:  ## ocr with tesseract
 	@echo $(basename $@ .txt)
 	tesseract $(@D)/list.txt $(basename $@ .txt)

+# Converts every file listed in $(images) to a 300 dpi TIFF in images-tiff/.
+# The output directory is created first so the target also works before
+# `make dirs` has been run, and the loop aborts on the first failed conversion
+# instead of silently continuing. Requires ImageMagick's `convert`.
+tiffs: ## convert images/ to images-tiff/ Depends on IM
+	@mkdir -p images-tiff
+	echo $(images)
+	for i in $(images); do \
+		tiff=`basename "$$i" .jpg`.tiff; \
+		convert -density 300 "$$i" "images-tiff/$$tiff" || exit 1; \
+		echo "$$tiff"; \
+	done

+# Runs tesseract in hOCR mode on every TIFF in images-tiff/ and renames the
+# resulting hocr/<name>.hocr to hocr/<name>.html.
+# - An unmatched glob leaves the literal pattern in $$i, so non-existent
+#   entries are skipped instead of being handed to tesseract.
+# - The loop stops at the first tesseract/mv failure so errors are not hidden.
+hocrs: ## hocr with tesseract and then change extension to .html
+	@mkdir -p hocr
+	for i in images-tiff/*.tiff; do \
+		[ -e "$$i" ] || continue; \
+		echo "$$i"; \
+		hocrfile=`basename "$$i" .tiff`; \
+		tesseract "$$i" "hocr/$$hocrfile" hocr || exit 1; \
+		mv "hocr/$$hocrfile.hocr" "hocr/$$hocrfile.html" || exit 1; \
+	done

 #OUTPUT GENERATION RECIPES

-output/tagged-words.txt: ocr/output.txt ## Analyzes OCR'ed text using a Part of Speech (POS) tagger. Outputs a string of tags (e.g. nouns, verbs, adjectives, and adverbs). Dependencies: python3's nltk, nltk's averaged_perceptron_tagger
-	cat $< | python3 src/wordtagger.py > $(@)
+output/wordtagger/index.html: ocr/output.txt ## Analyzes OCR'ed text using a Part of Speech (POS) tagger. Outputs a string of tags (e.g. nouns, verbs, adjectives, and adverbs). Dependencies: python3's nltk, jinja2
+	# Make sure the target directory exists even if `make dirs` was not run
+	mkdir -p $(@D)
+	# Static assets referenced by the generated page
+	cp src/wordtagger/jquery.min.js src/wordtagger/script.js src/wordtagger/style.css $(@D)
+	# wordtagger.py reads the OCR text from stdin and writes index.html and data.json itself
+	python3 src/wordtagger/wordtagger.py < $<
 #  install nltk's 'averaged_perceptron_tagger':
 #  $ python 3
 #  >>> import nltk
--- a/src/wordtagger.py
+++ b/src/wordtagger.py
@ -1,82 +0,0 @@
-import nltk
-from sys import stdin, stdout
-
-# Define input
-input = stdin.read()
-
-# FILTER FUNCTIONS
-# This function cuts a string into words. Then runs a POS tagger for each word. Returns a list with tags
-def postagger(string):
-    words = nltk.word_tokenize(string)
-    taggedwordlist = nltk.pos_tag(words)
-
-    for word, pos in nltk.pos_tag(words):
-        taggedwordlist = nltk.pos_tag(words)
-        #print('{0} is a {1}'.format(word,pos)) # Comment out to print the analysis step
-
-    taglist = [ pos for word,pos in taggedwordlist ]
-    #print(taglist)
-    return taglist;
-
-# This function changes the tags to readable equivalents (NNP to noun for example)
-def postagger_readable(list):
-    readabletaglist = []
-
-    for tag in list:
-        if tag in {"NNP","NNS","NN","NNPS"}:
-            readabletag = 'noun'
-        elif tag in {'VB','VBD','VBG','VBN','VBP','VBZ'}:
-            readabletag = 'verb'
-        elif tag in {'RB','RBR','RBS','WRB'}:
-            readabletag = 'adverb'
-        elif tag in {'PRP','PRP$'}:
-            readabletag = 'pronoun'
-        elif tag in {'JJ','JJR','JJS'}:
-            readabletag = 'adjective'
-        elif tag == 'IN':
-            readabletag = 'preposition'
-        elif tag == 'WDT':
-            readabletag = 'determiner'
-        elif tag in {'WP','WP$'}:
-            readabletag = 'pronoun'
-        elif tag == 'UH':
-            readabletag = 'interjection'
-        elif tag == 'POS':
-            readabletag = 'possesive ending'
-        elif tag == 'SYM':
-            readabletag = 'symbol'
-        elif tag == 'EX':
-            readabletag = 'existential there'
-        elif tag == 'DT':
-            readabletag = 'determiner'
-        elif tag == 'MD':
-            readabletag = 'modal'
-        elif tag == 'LS':
-            readabletag = 'list item marker'
-        elif tag == 'FW':
-            readabletag = 'foreign word'
-        elif tag == 'CC':
-            readabletag = 'coordinating conjunction '
-        elif tag == 'CD':
-            readabletag = 'cardinal number'
-        elif tag == 'TO':
-            readabletag = 'to'
-        elif tag == '.':
-            readabletag = 'line ending'
-        elif tag == ',':
-            readabletag = 'comma'
-        else:
-            readabletag = tag
-
-        readabletaglist.append(readabletag)
-    return readabletaglist;
-
-
-# This function creates the output
-def main():
-    taglist = postagger(input)
-    readabletaglist = postagger_readable(taglist)
-    stdout.write(' '.join(readabletaglist))
-    stdout.write('\n')
-
-main()
--- a/src/wordtagger/jquery.min.js
+++ b/src/wordtagger/jquery.min.js
--- a/src/wordtagger/script.js
+++ b/src/wordtagger/script.js
@ -0,0 +1,64 @@
+$(document).ready(function(){
+  var state = 0;
+
+  $('.noun').addClass('fade-out');
+  $('.preposition').addClass('red');
+  $('.verb').addClass('blue');
+  $('.determiner').addClass('cyan');
+
+  $(document).bind('contextmenu', function(e) { return false; });
+
+  $( ".word" ).contextmenu(function() {
+    console.log($(this).hasClass('underline'));
+    $(this).hasClass('underline') == false
+    ? $(this).addClass('underline')
+    : $(this).removeClass('underline');
+  });
+
+  $('.word').click( function() {
+    var el = $('.word');
+    console.log(state);
+
+    if (state == 0) {
+      $('.word').removeClass('fade-out red blue cyan');
+
+      $('.stopword').addClass('fade-out');
+    }
+
+    else if (state == 1) {
+      $('.stopword').removeClass('fade-out');
+      $('.neutral').addClass('fade-out');
+    }
+
+    else if (state == 2) {
+      $('.neutral').removeClass('fade-out');
+      $('.noun').addClass('fade-out');
+      $('.preposition').addClass('red');
+      $('.verb').addClass('blue');
+      state = -1;
+    }
+
+    $('.word').each(function() {
+      var el = $(this);
+
+      if (state == 0) {
+        el.empty();
+        el.html(el.data("stopword") + "&nbsp;");
+      }
+
+      else if (state == 1) {
+        el.empty();
+        el.html(el.data("sentiment") + "&nbsp;");
+      }
+
+      else {
+        el.empty();
+        el.html(el.data("pos") + "&nbsp;");
+      }
+
+    });
+
+    state = state+1;
+  });
+
+});
--- a/src/wordtagger/style.css
+++ b/src/wordtagger/style.css
@ -0,0 +1,86 @@
+* {
+min-height: 0;
+min-width: 0;
+}
+
+body {
+  background: #639ab2;
+  font-size: 15px;
+  font-family: 'Ubuntu Mono', monospace;
+}
+
+.prelative {
+    flex-shrink: 0;
+}
+
+div.container {
+  width: 100%;
+  display: -webkit-box;      /* OLD - iOS 6-, Safari 3.1-6 */
+  display: -moz-box;         /* OLD - Firefox 19- (buggy but mostly works) */
+  display: -ms-flexbox;      /* TWEENER - IE 10 */
+  display: -webkit-flex;     /* NEW - Chrome */
+  display:  flex;
+  flex-wrap: wrap;
+}
+
+.word {
+  font-size: 3rem;
+  float: left;
+  position: relative;
+  text-align: center;
+  display: -webkit-box;      /* OLD - iOS 6-, Safari 3.1-6 */
+  display: -moz-box;         /* OLD - Firefox 19- (buggy but mostly works) */
+  display: -ms-flexbox;      /* TWEENER - IE 10 */
+  display: -webkit-flex;     /* NEW - Chrome */
+  display:flex;
+  justify-content: center;
+}
+
+/* Hover label: both pseudo-elements are absolutely positioned over the word
+   and fully transparent (opacity: 0) by default. */
+.word:before,
+.word:after {
+    content: '';
+    color: #fff;
+    position: absolute;
+    font-family: 'PT Serif', serif;
+    font-weight: bold;
+    font-size: 1.5rem;
+    font-style: italic;
+    opacity: 0;
+    width: 100%;
+  }
+
+/* The :before label shows the value of the element's data-txt attribute. */
+.word:before {
+    content: attr(data-txt);
+    flex-shrink: 1;
+}
+
+/* Reveal the data-txt label while hovering. The :after element is revealed
+   while the word is pressed, but its content is the empty string. */
+.word:hover:before,
+.word:active:after {
+    opacity: 1;
+}
+
+
+
+.fade-out {
+  color: #275152;
+}
+
+p {
+  margin: 1rem;
+}
+
+.red {
+  color: red;
+}
+
+.blue {
+  color: blue;
+}
+
+.cyan {
+  color: cyan;
+}
+
+.underline {
+  text-decoration: underline;
+}
--- a/src/wordtagger/template.html
+++ b/src/wordtagger/template.html
@ -0,0 +1,20 @@
+<!DOCTYPE html>
+<html>
+  <head>
+    <title>Wordtagger</title>
+    <meta charset="utf-8" />
+    <link rel="stylesheet" href="style.css" type="text/css" media="screen" />
+    <script type="text/javascript" src="jquery.min.js"></script>
+    <script type="text/javascript" src="script.js"></script>
+    <!--meta name="viewport" content="width=device-width"-->
+  </head>
+
+  <body>
+    <div class="container"><p>
+      {# One <span> per token. All values are HTML-escaped and every attribute is
+         double-quoted, so tokens containing quotes or markup characters (for
+         example 's or <) cannot break out of the attribute.
+         data-sentiment: the punctuation mark itself for , . ( ) and the sentiment tag otherwise.
+         data-stopword:  the literal "stopword" for stopwords and the word itself otherwise. #}
+      {% for item, value in words_and_tags.items() %}
+      <span id="{{ item|e }}"
+            class="word {{ value['sentiment']|e }} {{ value['wordtype']|e }} {{ value['POS']|e }}"
+            data-txt="{{ value['word']|e }}"
+            data-pos="{{ value['POS']|e }}"
+            {% if value['word'] in [',','.','(',')'] %} data-sentiment="{{ value['word']|e }}" {% else %} data-sentiment="{{ value['sentiment']|e }}" {% endif %}
+            {% if value['wordtype'] == 'stopword' %} data-stopword="stopword" {% else %} data-stopword="{{ value['word']|e }}" {% endif %}
+            >{{ value['POS']|e }}&nbsp;</span>
+     {% endfor %}
+    </p>
+      </div>
+  </body>
+</html>
--- a/src/wordtagger/wordtagger.py
+++ b/src/wordtagger/wordtagger.py
@ -0,0 +1,156 @@
+# LIBS
+import nltk
+import json
+import os
+from sys import stdin, stdout
+from nltk import ne_chunk, pos_tag, word_tokenize
+from nltk.sentiment.vader import SentimentIntensityAnalyzer
+from nltk.corpus import stopwords
+from jinja2 import Template
+
+# == INPUT AND TOKENIZE ==
+# Read the whole text from stdin, split it into tokens with NLTK and save the
+# tokens to a dictionary. Each entry is keyed by 'item <index>', where <index>
+# is the token's position in the text, so the key works as an ID for each word.
+input = stdin.read()  # NOTE(review): this name shadows the built-in input()
+words = nltk.word_tokenize(input)
+words_and_tags = {'item ' + str(index) : {'word':word} for index , word in enumerate(words)}
+print(words_and_tags)  # prints the still untagged dictionary to stdout
+
+# == FILTER FUNCTIONS ==
+
+# === 1. POS_tagger & Named Entity Recognizer ===
+# This function cuts a string into words. Then runs a POS tagger for each word. Returns a list with tags
+def POS_tagger(list):
+    taggedwordlist = nltk.pos_tag(list)
+
+
+    for word, pos in nltk.pos_tag(list):
+        taggedwordlist = nltk.pos_tag(list)
+        #print('{0} is a {1}'.format(word,pos)) # Comment out to print the analysis step
+        print(taggedwordlist)
+    taglist = [ pos for word,pos in taggedwordlist ]
+    POS_tags = []
+
+    for tag in taglist:
+        if tag in {"NNP","NNS","NN","NNPS"}:
+            POS_tag = 'noun'
+        elif tag in {'VB','VBD','VBG','VBN','VBP','VBZ'}:
+            POS_tag = 'verb'
+        elif tag in {'RB','RBR','RBS','WRB'}:
+            POS_tag = 'adverb'
+        elif tag in {'PRP','PRP$'}:
+            POS_tag = 'pronoun'
+        elif tag in {'JJ','JJR','JJS'}:
+            POS_tag = 'adjective'
+        elif tag == 'IN':
+            POS_tag = 'preposition'
+        elif tag == 'WDT':
+            POS_tag = 'determiner'
+        elif tag in {'WP','WP$'}:
+            POS_tag = 'pronoun'
+        elif tag == 'UH':
+            POS_tag = 'interjection'
+        elif tag == 'POS':
+            POS_tag = 'possesive ending'
+        elif tag == 'SYM':
+            POS_tag = 'symbol'
+        elif tag == 'EX':
+            POS_tag = 'existential there'
+        elif tag == 'DT':
+            POS_tag = 'determiner'
+        elif tag == 'MD':
+            POS_tag = 'modal'
+        elif tag == 'LS':
+            POS_tag = 'list item marker'
+        elif tag == 'FW':
+            POS_tag = 'foreign word'
+        elif tag == 'CC':
+            POS_tag = 'coordinating conjunction '
+        elif tag == 'CD':
+            POS_tag = 'cardinal number'
+        elif tag == 'TO':
+            POS_tag = 'to'
+        elif tag == '.':
+            POS_tag = 'line ending'
+        elif tag == ',':
+            POS_tag = 'comma'
+        else:
+            POS_tag = tag
+        POS_tags.append(POS_tag)
+    #print(POS_tag)
+    return POS_tags;
+
+# === 2. Sentiment tagger ===
+# Sentiment analyzer based on the NLTK VADER tagger.
+# This function uses words as an input. It tags each word based on its sentiment: negative, neutral or positive
+def sentiment_tagger(word):
+    analyzer = SentimentIntensityAnalyzer()
+    score = analyzer.polarity_scores(word).get("compound")
+
+    if score < 0:
+        sentiment_tag = 'negative'
+    elif score > 0:
+        sentiment_tag = 'positive'
+    else:
+        sentiment_tag = 'neutral'
+
+    return sentiment_tag
+
+# === 3. Stopword tagger ===
+# Labels words on being a keyword or a stopword, based on the list in the NLTK corpus
+def stopword_tagger(word):
+
+    stopWords = set(stopwords.words('english'))
+
+    if word in stopWords:
+        stopword_tag = 'stopword'
+    else:
+        stopword_tag = 'keyword'
+
+    return stopword_tag
+
+
+# Run POS tagger
+# This tagger outputs a list for all items in the dict at once
+# To avoid double work, it is better to keep this outside the for loop
+POS_tags = POS_tagger(words)
+i = 0
+
+# Adding tags to words in dictionary, which will be exported as a json file
+# {'item 0' : {'word' : word, 'tagger 1': value 1}}
+for item, value in words_and_tags.items():
+    word = words_and_tags[item]['word']
+
+    # POS
+    pos_tag = POS_tags[i]
+    words_and_tags[item]['POS'] = pos_tag
+    i = i+1
+
+    # Add sentiment tag
+    sentiment_tag = sentiment_tagger(word)
+    words_and_tags[item]['sentiment'] = sentiment_tag
+
+    # Add stopword tag
+    stopword_tag = stopword_tagger(word)
+    words_and_tags[item]['wordtype'] = stopword_tag
+
+    # Add entity tag
+    # Not functional yet
+
+# Save data into a json file
+print(words_and_tags)
+#with open("data.json", 'w') as f:
+with open(os.path.dirname(os.path.dirname(os.path.dirname( __file__ ))) + "output/wordtagger/data.json", 'w') as f:
+  json.dump(words_and_tags, f, ensure_ascii=False)
+
+#let's bind it to a jinja2 template
+# Jinja moves up one level by default, so I do not need to do it myself as in line 141
+template_open = open("src/wordtagger/template.html", "r")
+template = Template(template_open.read())
+index_render = template.render(words_and_tags=words_and_tags)
+#print(text_render)
+
+# And render an html file!
+print(index_render)
+index_open = open("output/wordtagger/index.html", "w")
+index_open.write(index_render)
+index_open.close()