Merge branch 'master' of git.xpub.nl:/var/www/git.xpub.nl/repos/OuNoPo-make

7 years ago · 3bfed0e3d0
parent 6b2983ee93 d4232c80ef
commit 3bfed0e3d0
7 changed files with 352 additions and 87 deletions
--- a/29
+++ b/29
@ -1,4 +1,5 @@
-images=$(sort $(wildcard images/*.jpg))
+images=$(sort $(wildcard images/*.jpg)) 
 # @andre make wildcard so that it takes any image file but doesn't take the listimg.txt file
 output_ocr:=$(dir_ocr)/output.txt
 tmpfile:= $(shell mktemp)
 space:= $(empty) $(empty)
@ -22,7 +23,7 @@ help:
 # CLEAN
 clean: ## removes output (target) files
-	rm ocr/output.txt 
+	rm ocr/output.txt
 	rm $(wildcard output/*)
 	rm $(tmpfile)
@ -31,12 +32,14 @@ clean: ## removes output (target) files
 # Create every working directory the other targets read from or write to.
 # `mkdir -p` already succeeds when a directory exists, so errors are not
 # ignored any more: a real failure (e.g. no write permission) stops make.
 dirs: ## create the dirs in working dir
 	@mkdir -p images/ images-tiff/ output/ output/wordtagger ocr/ hocr/
 	@echo $(color_r)'Directories made': images/ images-tiff/ output/ output/wordtagger ocr/ hocr/
 testif:
 ifeq ($(OS),Darwin)
 	@echo $(OS)
@ -50,12 +53,28 @@ ocr/output.txt:  ## ocr with tesseract
 	@echo $(basename $@ .txt)
 	tesseract $(@D)/list.txt $(basename $@ .txt)
 # Convert every JPEG listed in $(images) to a 300 dpi TIFF in images-tiff/.
 # The whole loop is one shell command, so a failing `convert` must leave the
 # loop explicitly (`|| exit 1`); otherwise only the status of the final `echo`
 # would reach make and a broken conversion would go unnoticed.
 tiffs: ## convert images/ to images-tiff/ Depends on IM
 	echo $(images)
 	for i in $(images); do \
 		tiff=`basename "$$i" .jpg`.tiff; \
 		convert -density 300 "$$i" "images-tiff/$$tiff" || exit 1; \
 		echo "$$tiff"; \
 	done
 # Run tesseract in hOCR mode on every TIFF in images-tiff/ and rename the
 # resulting hocr/<name>.hocr to hocr/<name>.html.
 # `mv` only runs when tesseract succeeded, and any failure aborts the loop so
 # that make reports the error instead of carrying on with the next image.
 hocrs: ## hocr with tesseract and then change extension to .html
 	for i in images-tiff/*.tiff; do \
 		echo "$$i"; \
 		hocrfile=`basename "$$i" .tiff`; \
 		tesseract "$$i" "hocr/$$hocrfile" hocr && \
 		mv "hocr/$$hocrfile.hocr" "hocr/$$hocrfile.html" || exit 1; \
 	done
 #OUTPUT GENERATION RECIPES
-output/tagged-words.txt: ocr/output.txt ## Analyzes OCR'ed text using a Part of Speech (POS) tagger. Outputs a string of tags (e.g. nouns, verbs, adjectives, and adverbs). Dependencies: python3's nltk, nltk's averaged_perceptron_tagger
+output/wordtagger/index.html: ocr/output.txt ## Analyzes OCR'ed text using a Part of Speech (POS) tagger. Outputs a string of tags (e.g. nouns, verbs, adjectives, and adverbs). Dependencies: python3's nltk, jinja2
-	cat $< | python3 src/wordtagger.py > $(@)
+	cp src/wordtagger/jquery.min.js output/wordtagger
 	cp src/wordtagger/script.js output/wordtagger
 	cp src/wordtagger/style.css output/wordtagger
 	cat $< | python3 src/wordtagger/wordtagger.py
 #  install nltk's 'averaged_perceptron_tagger':
 #  $ python 3
 #  >>> import nltk
--- a/src/wordtagger.py
+++ b/src/wordtagger.py
@ -1,82 +0,0 @@
 import nltk
 from sys import stdin, stdout
 # Define input
 input = stdin.read()
 # FILTER FUNCTIONS
 # This function cuts a string into words. Then runs a POS tagger for each word. Returns a list with tags
 def postagger(string):
    """Tokenize ``string`` and return the part-of-speech tag of every token.

    :param string: the text to analyse.
    :return: list with one tag per token, in token order.
    """
    words = nltk.word_tokenize(string)
    # Tag the token list exactly once. The previous version re-ran the tagger
    # on the full list for every single token and threw the result away.
    taggedwordlist = nltk.pos_tag(words)
    #for word, pos in taggedwordlist: print('{0} is a {1}'.format(word,pos)) # Uncomment to print the analysis step
    return [pos for _word, pos in taggedwordlist]
 # This function changes the tags to readable equivalents (NNP to noun for example)
 def postagger_readable(list):
    """Translate POS tags into readable names (e.g. ``NNP`` -> ``noun``).

    :param list: iterable of tag strings.
    :return: a new list of the same length; tags without a readable
        equivalent are passed through unchanged.
    """
    # Each entry pairs a group of tags with the readable name they share.
    groups = (
        (("NNP", "NNS", "NN", "NNPS"), 'noun'),
        (('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'), 'verb'),
        (('RB', 'RBR', 'RBS', 'WRB'), 'adverb'),
        (('PRP', 'PRP$'), 'pronoun'),
        (('JJ', 'JJR', 'JJS'), 'adjective'),
        (('IN',), 'preposition'),
        (('WDT',), 'determiner'),
        (('WP', 'WP$'), 'pronoun'),
        (('UH',), 'interjection'),
        (('POS',), 'possesive ending'),
        (('SYM',), 'symbol'),
        (('EX',), 'existential there'),
        (('DT',), 'determiner'),
        (('MD',), 'modal'),
        (('LS',), 'list item marker'),
        (('FW',), 'foreign word'),
        (('CC',), 'coordinating conjunction '),
        (('CD',), 'cardinal number'),
        (('TO',), 'to'),
        (('.',), 'line ending'),
        ((',',), 'comma'),
    )
    # Flatten the groups into a direct tag -> name lookup.
    readable_names = {}
    for codes, name in groups:
        for code in codes:
            readable_names[code] = name
    return [readable_names.get(tag, tag) for tag in list]
 # This function creates the output
 def main():
    """Tag the text held in ``input`` and write the readable tags to stdout.

    The tags are written on a single line, separated by spaces and followed
    by a newline.
    """
    readable = postagger_readable(postagger(input))
    line = ' '.join(readable)
    stdout.write(line)
    stdout.write('\n')
 main()
--- a/src/wordtagger/jquery.min.js
+++ b/src/wordtagger/jquery.min.js
--- a/src/wordtagger/script.js
+++ b/src/wordtagger/script.js
@ -0,0 +1,64 @@
 // Interactive behaviour of the wordtagger page.
 // A left click on any word switches ALL words to the next view
 // (POS tag -> stopword -> sentiment -> POS tag ...); a right click toggles
 // an underline on the clicked word.
 $(document).ready(function(){
   // View currently on screen: 0 = POS tags, 1 = stopwords, 2 = sentiment.
   var state = 0;
   // Colour scheme of the POS view. Used for the initial render AND when the
   // cycle returns to the POS view, so determiners get their cyan back
   // (previously 'cyan' was only applied once, on page load).
   function colourPosView() {
     $('.noun').addClass('fade-out');
     $('.preposition').addClass('red');
     $('.verb').addClass('blue');
     $('.determiner').addClass('cyan');
   }
   // Replace a word's label with the value of its data-<key> attribute.
   // .text() inserts the OCR'ed token as plain text instead of parsing it as
   // HTML; \u00a0 is the non-breaking space that separates the words.
   function showLabel(el, key) {
     el.text(el.data(key) + '\u00a0');
   }
   colourPosView();
   // Suppress the browser's own context menu so that a right click can be
   // used for underlining.
   $(document).bind('contextmenu', function(e) { return false; });
   $( ".word" ).contextmenu(function() {
     $(this).toggleClass('underline');
   });
   $('.word').click( function() {
     var key;
     if (state == 0) {
       // POS -> stopword view: drop the POS colours, dim the stopwords.
       $('.word').removeClass('fade-out red blue cyan');
       $('.stopword').addClass('fade-out');
       key = 'stopword';
       state = 1;
     }
     else if (state == 1) {
       // stopword -> sentiment view: dim the neutral words instead.
       $('.stopword').removeClass('fade-out');
       $('.neutral').addClass('fade-out');
       key = 'sentiment';
       state = 2;
     }
     else {
       // sentiment -> POS view: restore the initial colour scheme.
       $('.neutral').removeClass('fade-out');
       colourPosView();
       key = 'pos';
       state = 0;
     }
     $('.word').each(function() {
       showLabel($(this), key);
     });
   });
 });
--- a/src/wordtagger/style.css
+++ b/src/wordtagger/style.css
@ -0,0 +1,86 @@
 /* Reset the minimum size of every element to 0. */
 * {
 min-height: 0;
 min-width: 0;
 }
 /* Page-wide background colour and default monospace font. */
 body {
  background: #639ab2;
  font-size: 15px;
  font-family: 'Ubuntu Mono', monospace;
 }
 /* NOTE(review): this class is not used by the template in this commit - confirm it is still needed. */
 .prelative {
    flex-shrink: 0;
 }
 /* Full-width wrapping flex container around the words. The prefixed
    `display` values are fallbacks for older browsers; the unprefixed
    `flex` comes last so it wins wherever it is supported. */
 div.container {
  width: 100%;
  display: -webkit-box;      /* OLD - iOS 6-, Safari 3.1-6 */
  display: -moz-box;         /* OLD - Firefox 19- (buggy but mostly works) */
  display: -ms-flexbox;      /* TWEENER - IE 10 */
  display: -webkit-flex;     /* NEW - Chrome */
  display:  flex;
  flex-wrap: wrap;
 }
 /* One tagged word. It is positioned relatively so that the absolutely
    positioned :before/:after overlays below are placed relative to it. */
 .word {
  font-size: 3rem;
  float: left;
  position: relative;
  text-align: center;
  display: -webkit-box;      /* OLD - iOS 6-, Safari 3.1-6 */
  display: -moz-box;         /* OLD - Firefox 19- (buggy but mostly works) */
  display: -ms-flexbox;      /* TWEENER - IE 10 */
  display: -webkit-flex;     /* NEW - Chrome */
  display:flex;
  justify-content: center;
 }
 /* Overlays shared by both pseudo-elements: white, bold italic serif text
    that is invisible (opacity 0) until one of the rules further down
    raises the opacity. */
 .word:before,
 .word:after {
    content: '';
    color: #fff;
    position: absolute;
    font-family: 'PT Serif', serif;
    font-weight: bold;
    font-size: 1.5rem;
    font-style: italic;
    opacity: 0;
    width: 100%;
  }
 /* The :before overlay carries the text of the element's data-txt attribute. */
 .word:before {
    content: attr(data-txt);
    flex-shrink: 1;
 }
 /* Reveal the data-txt overlay while hovering. The :after overlay only ever
    has empty content, so making it opaque on :active shows nothing. */
 .word:hover:before,
 .word:active:after {
    opacity: 1;
 }
 /* Alternative text colour; script.js adds/removes this class per view. */
 .fade-out {
  color: #275152;
 }
 p {
  margin: 1rem;
 }
 /* Highlight colours that script.js assigns to prepositions (red),
    verbs (blue) and determiners (cyan). */
 .red {
  color: red;
 }
 .blue {
  color: blue;
 }
 .cyan {
  color: cyan;
 }
 /* Toggled by script.js on right click. */
 .underline {
  text-decoration: underline;
 }
--- a/src/wordtagger/template.html
+++ b/src/wordtagger/template.html
@ -0,0 +1,20 @@
 <!DOCTYPE html>
 <html>
  <head>
    <title>Wordtagger</title>
    <meta charset="utf-8" />
    <link rel="stylesheet" href="style.css" type="text/css" media="screen" />
    <script type="text/javascript" src="jquery.min.js"></script>
    <script type="text/javascript" src="script.js"></script>
    <!--meta name="viewport" content="width=device-width"-->
  </head>
  <body>
    <div class="container"><p>
      {# One <span> per token. Its visible text starts as the POS tag; the data-* attributes hold the labels script.js switches between (data-pos, data-stopword, data-sentiment) and the original token (data-txt). Every value is HTML-escaped and every attribute is double-quoted, so tokens containing quotes or angle brackets (e.g. 's, n't, '') can no longer break the markup. Punctuation tokens show themselves instead of a sentiment label; stopwords show the literal text "stopword". #}
      {% for item, value in words_and_tags.items() %}
      <span id="{{ item|e }}" class="word {{ value['sentiment']|e }} {{ value['wordtype']|e }} {{ value['POS']|e }}" data-txt="{{ value['word']|e }}" data-pos="{{ value['POS']|e }}" data-sentiment="{% if value['word'] in [',','.','(',')'] %}{{ value['word']|e }}{% else %}{{ value['sentiment']|e }}{% endif %}" data-stopword="{% if value['wordtype'] == 'stopword' %}stopword{% else %}{{ value['word']|e }}{% endif %}">{{ value['POS']|e }}&nbsp;</span>
     {% endfor %}
    </p>
      </div>
  </body>
 </html>
--- a/src/wordtagger/wordtagger.py
+++ b/src/wordtagger/wordtagger.py
@ -0,0 +1,156 @@
 # LIBS
 import nltk
 import json
 import os
 from sys import stdin, stdout
 from nltk import ne_chunk, pos_tag, word_tokenize
 from nltk.sentiment.vader import SentimentIntensityAnalyzer
 from nltk.corpus import stopwords
 from jinja2 import Template
 # == INPUT AND TOKENIZE ==
 # Read the whole text from stdin, tokenize it and save the tokens in a dictionary.
 # Each token is stored under the key 'item <index>' (its position in the token
 # list) and starts out as {'word': <token>}; the taggers below add more keys.
 input = stdin.read()
 words = nltk.word_tokenize(input)
 words_and_tags = {'item ' + str(index) : {'word':word} for index , word in enumerate(words)}
 # Debug output: dump the untagged dictionary to stdout.
 print(words_and_tags)
 # == FILTER FUNCTIONS ==
 # === 1. POS_tagger & Named Entity Recognizer ===
 # This function cuts a string into words. Then runs a POS tagger for each word. Returns a list with tags
 def POS_tagger(list):
    """Return a readable part-of-speech name for every token in ``list``.

    :param list: sequence of tokens (already tokenized words).
    :return: list with one entry per token, in token order. Tags that have
        no readable equivalent in the table below are returned unchanged.
    """
    # Readable equivalents of the tagger's output (e.g. NNP -> noun).
    readable_names = {
        "NNP": 'noun', "NNS": 'noun', "NN": 'noun', "NNPS": 'noun',
        'VB': 'verb', 'VBD': 'verb', 'VBG': 'verb',
        'VBN': 'verb', 'VBP': 'verb', 'VBZ': 'verb',
        'RB': 'adverb', 'RBR': 'adverb', 'RBS': 'adverb', 'WRB': 'adverb',
        'PRP': 'pronoun', 'PRP$': 'pronoun', 'WP': 'pronoun', 'WP$': 'pronoun',
        'JJ': 'adjective', 'JJR': 'adjective', 'JJS': 'adjective',
        'IN': 'preposition',
        'WDT': 'determiner', 'DT': 'determiner',
        'UH': 'interjection',
        'POS': 'possesive ending',
        'SYM': 'symbol',
        'EX': 'existential there',
        'MD': 'modal',
        'LS': 'list item marker',
        'FW': 'foreign word',
        'CC': 'coordinating conjunction ',
        'CD': 'cardinal number',
        'TO': 'to',
        '.': 'line ending',
        ',': 'comma',
    }
    # Tag the token list exactly once. The previous version re-ran the tagger
    # on the full list for every token and printed the full result each time.
    taggedwordlist = nltk.pos_tag(list)
    #print(taggedwordlist) # Uncomment to print the analysis step
    return [readable_names.get(tag, tag) for _word, tag in taggedwordlist]
 # === 2. Sentiment tagger ===
 # Sentiment analyzer based on the NLTK VADER tagger.
 # This function uses words as an input. It tags each word based on its sentiment: negative, neutral or positive
 # Shared VADER analyzer, created on the first call and reused afterwards so
 # that it is not rebuilt for every single word.
 _sentiment_analyzer = None
 def sentiment_tagger(word):
    """Classify ``word`` as 'negative', 'positive' or 'neutral'.

    :param word: the text to score (the caller passes one token at a time).
    :return: 'negative' when the VADER compound score is below 0,
        'positive' when it is above 0, otherwise 'neutral'.
    """
    global _sentiment_analyzer
    if _sentiment_analyzer is None:
        _sentiment_analyzer = SentimentIntensityAnalyzer()
    score = _sentiment_analyzer.polarity_scores(word).get("compound")
    if score < 0:
        return 'negative'
    if score > 0:
        return 'positive'
    return 'neutral'
 # === 3. Stopword tagger ===
 # Labels words on being a keyword or a stopword, based on the list in the NLTK corpus
 # English stopword set from the NLTK corpus, loaded on the first call and
 # reused afterwards instead of being rebuilt for every word.
 _english_stopwords = None
 def stopword_tagger(word):
    """Label ``word`` as 'stopword' or 'keyword'.

    The comparison is case-insensitive, so a capitalised stopword at the
    start of a sentence ("The") is recognised as well.

    :param word: a single token.
    :return: 'stopword' if the token is in NLTK's English stopword list,
        otherwise 'keyword'.
    """
    global _english_stopwords
    if _english_stopwords is None:
        _english_stopwords = frozenset(stopwords.words('english'))
    if word.lower() in _english_stopwords:
        return 'stopword'
    return 'keyword'
 # Run POS tagger
 # This tagger outputs a list for all items in the dict at once
 # To avoid double work, it is better to keep this outside the for loop
 POS_tags = POS_tagger(words)
 # Adding tags to words in dictionary, which will be exported as a json file
 # {'item 0' : {'word' : word, 'tagger 1': value 1}}
 # The dictionary was built from `words` in order, so entry n belongs to POS_tags[n].
 for (item, entry), pos_tag in zip(words_and_tags.items(), POS_tags):
    word = entry['word']
    entry['POS'] = pos_tag
    entry['sentiment'] = sentiment_tagger(word)
    entry['wordtype'] = stopword_tagger(word)
    # Entity tag: not functional yet
 print(words_and_tags)
 # Resolve every path from the location of this script (src/wordtagger/), so
 # the result does not depend on how the script path was spelled on the
 # command line. Joining with os.path.join also supplies the path separator
 # that the previous string concatenation left out.
 script_dir = os.path.dirname(os.path.abspath(__file__))
 project_root = os.path.dirname(os.path.dirname(script_dir))
 output_dir = os.path.join(project_root, "output", "wordtagger")
 os.makedirs(output_dir, exist_ok=True)
 # Save data into a json file (UTF-8, because ensure_ascii=False keeps non-ASCII characters as they are)
 with open(os.path.join(output_dir, "data.json"), 'w', encoding='utf-8') as f:
    json.dump(words_and_tags, f, ensure_ascii=False)
 # Bind the data to the jinja2 template
 with open(os.path.join(script_dir, "template.html"), "r", encoding='utf-8') as template_file:
    template = Template(template_file.read())
 index_render = template.render(words_and_tags=words_and_tags)
 # And render an html file! (the template declares charset utf-8)
 print(index_render)
 with open(os.path.join(output_dir, "index.html"), "w", encoding='utf-8') as index_file:
    index_file.write(index_render)