Merge branch 'master' of git.xpub.nl:/var/www/git.xpub.nl/repos/OuNoPo-make

7 years ago · b66c5f7d54
parent 13a3edd10d a1051862f5
commit b66c5f7d54
16 changed files with 743 additions and 90 deletions
--- a/.DS_Store
+++ b/.DS_Store
--- a/.gitignore
+++ b/.gitignore
@ -1,3 +1,3 @@
 images/**
 output/**
-
+src/index.json
--- a/40
+++ b/40
@ -1,4 +1,5 @@
 images=$(sort $(wildcard images/*.jpg))
 # @andre make wildcard so that it takes any image file but doesn't take the listimg.txt file
 output_ocr:=$(dir_ocr)/output.txt
 tmpfile:= $(shell mktemp)
 space:= $(empty) $(empty)
@ -31,12 +32,13 @@ clean: ## removes output (target) files
 dirs: ## create the dirs in working dir
 	@# mkdir -p already succeeds when a directory exists, so no '-' error-ignore prefix is needed
 	@mkdir -p images/ images-tiff/ output/ ocr/ hocr/
 	@echo $(color_r)'Directories made': images/ images-tiff/ output/ ocr/ hocr/
 testif:
 ifeq ($(OS),Darwin)
 	@echo $(OS)
@ -49,13 +51,31 @@ ocr/output.txt:  ## ocr with tesseract
 	echo $(listimgs) > $(@D)/list.txt
 	@echo $(basename $@ .txt)
 	tesseract $(@D)/list.txt $(basename $@ .txt)
 	python3 src/build_database.py $(@)
 tiffs: ## convert images/ to images-tiff/ Depends on IM
 	echo $(images)
 	@# make sure the destination exists even when `make dirs` was not run first
 	mkdir -p images-tiff
 	@# `|| exit 1` stops the loop (and the recipe) on the first failed conversion;
 	@# with plain `;` chaining only the exit status of the last command counted.
 	for i in $(images); do \
 		tiff=`basename "$$i" .jpg`.tiff; \
 		convert -density 300 "$$i" -alpha on "images-tiff/$$tiff" || exit 1; \
 		echo "$$tiff"; \
 	done
-
+hocrs: ## hocr with tesseract and then change extension to .html
 	for i in images-tiff/*.tiff; \
 	do echo $$i; hocrfile=`basename $$i .tiff`; \
 	tesseract $$i hocr/$$hocrfile hocr; \
 	mv hocr/$$hocrfile.hocr hocr/$$hocrfile.html; \
 	done;
 #OUTPUT GENERATION RECIPES
-output/tagged-words.txt: ocr/output.txt ## Analyzes OCR'ed text using a Part of Speech (POS) tagger. Outputs a string of tags (e.g. nouns, verbs, adjectives, and adverbs). Dependencies: python3's nltk, nltk's averaged_perceptron_tagger
+output/wordtagger/index.html: ocr/output.txt ## Analyzes OCR'ed text using a Part of Speech (POS) tagger. Outputs a string of tags (e.g. nouns, verbs, adjectives, and adverbs). Dependencies: python3's nltk, jinja2
-	cat $< | python3 src/wordtagger.py > $(@)
+	mkdir -p output/wordtagger
 	cp src/wordtagger/jquery.min.js output/wordtagger
 	cp src/wordtagger/script.js output/wordtagger
 	cp src/wordtagger/style.css output/wordtagger
 	cat $< | python3 src/wordtagger/wordtagger.py
 #  install nltk's 'averaged_perceptron_tagger':
 #  $ python 3
 #  >>> import nltk
@ -65,9 +85,16 @@ output/chatbot.txt: ocr/output.txt ## Comments a text with a simple chatbot. Dep
 	cat $< | python3 src/textbotconversation.py $(@)
-output/n7.txt: ocr/output.txt ## DESCRIBE WHAT IT DOES. Dependencies: python3's chatterbot
+output/n7.txt: ocr/output.txt ## Replaces nouns with the 7th noun that follows. Dependencies: 91k_nouns
 	cat $< | python3 src/n_7.py > $(@)
 output/carlandre.txt: ocr/output.txt ## Alice: Creates visual poetry out of a text. Dependencies: pytest
 	cat $< | python3 src/carlandre.py > $(@)
 # cat $(@) > /dev/usb/lp0
 output/overunder: ocr/output.txt ## Alice: An interpreted language that translate simple weaving instructions and creates a weaving pattern on text.
 	python3 src/overunder.py
 visualization: $(images) $(tmpfile) ##Creates data visualization from images/*.jpg. Dependencies: mplayer
 	@echo $(tmpfile)
@ -83,3 +110,6 @@ endif
 ttssr-human-only: ocr/output.txt ## Loop: text to speech-speech recognition. Dependencies: espeak, pocketsphinx
 	@# $< is the first prerequisite (ocr/output.txt), so the path is stated only once
 	bash src/ttssr-loop-human-only.sh $<
 chatbook: ocr/output.txt ## Chatbot based on the knowledge of the scans. Dependencies: rake_nltk, irc, nltk
 	python3 src/chatbook.py
--- a/ocr/list.txt
+++ b/ocr/list.txt
@ -0,0 +1,3 @@
 images/0029.jpg
--- a/ocr/output.txt
+++ b/ocr/output.txt
@ -0,0 +1,37 @@
 ZEROS + ONES DIGITAL WOMEN 4|» THE NEWTECHNOCULTURE
 moments of unknown, disconnected lives, ”invisible voices
 conducted through the tips of her ﬁngers."
 Poised as an interface between man and the world, she is
 also wired to a network of digital machines: typists connected to
 QWERTY alphabets, bodies shaped by the motion of the keys,
 one hundred words a minute, viral speed, Thousands oi opera
 tors, relays, calls, exchanges humming in Virtual conjunction,
 learning the same phrases, ﬂipping the same switches,
 repeating the same responses, pushing plugs into the
 answering iacks, maybe two hundred, three hundred times an
 hours She has "a ﬁngertip mastery of the ringing. listening, dial,
 and other keys on her key shelf; of the row or rows of cords for
 making connections; of the location and meaning of all parts of
 the honey combed formation of jacks and trunks for recording,
 for switching, for toll circuits, for tandem, for information-" It
 becomes second nature it grows on her, "Having done this stufl
 a few hundred thousand times, you become quite good at it. In
 fact you're plugging, and connecting, and disconnecting ten,
 twenty, forty cords at a time." After a while these processes
 become "quite satisfying in a way, rather like weaving on an
 upright loom,"
 102
--- a/src/build_database.py
+++ b/src/build_database.py
@ -0,0 +1,38 @@
 import json
 import argparse
 import sys
 from nltk.tokenize import sent_tokenize, word_tokenize
 from rake_nltk import Rake
 r = Rake()
 ap = argparse.ArgumentParser("JSON Dumper")
 ap.add_argument("text", nargs="+", help="text sources")
 args = ap.parse_args()
 # Load the index written by a previous run. A missing, empty or corrupt
 # src/index.json simply means we start from an empty index.
 try:
     with open('src/index.json') as f:
         index = json.load(f)
 except (FileNotFoundError, ValueError):
     index = {}
 # build the index of sentences organized by keywords
 for n in args.text:
     # read each source and flatten it to one line so sentences that were
     # wrapped over several lines are tokenized as a whole
     with open(n) as source:
         text = source.read().replace("\n", " ")
     for sentence in sent_tokenize(text):
         r.extract_keywords_from_text(sentence)
         for key in r.get_ranked_phrases():
             # every keyword maps to the list of sentences it was found in
             index.setdefault(key, []).append(
                 {'filename': n, 'sentence': sentence, 'key': key})
 #print(index)
 with open('src/index.json', 'w') as outfile:
     json.dump(index, outfile)
--- a/src/carlandre.py
+++ b/src/carlandre.py
@ -0,0 +1,129 @@
 import pytest
 from math import ceil
 import sys
 from sys import stdout
 import time
 import os.path
 def pop_items(words, num_items):
     """Take the first num_items entries off the front of words.

     The list passed in is modified in place. Returns a tuple
     (taken, words): the removed entries in their original order and
     the same, now shorter, list. An empty (or falsy) words yields
     ([], []). Raises ValueError when more entries are requested than
     words holds.
     """
     if not words:
         return [], []
     if len(words) < num_items:
         raise ValueError('Not enough items!')
     taken = [words.pop(0) for _ in range(num_items)]
     return taken, words
 def all_words_less_than(words, maxlength):
     """Return True when no word in words is longer than maxlength."""
     return all(len(candidate) <= maxlength for candidate in words)
 def filterwords(words, maxlength):
     """Return a new list with the words that are 2 to maxlength characters long."""
     return [candidate for candidate in words
             if 2 <= len(candidate) <= maxlength]
 def pattern(words, maxlength):
     """Arrange words into rows that grow and then shrink in word length.

     Words are first filtered with filterwords(). The remaining words are
     cut into consecutive rows of maxlength + (maxlength - 4) words;
     leftover words that do not fill a whole row are dropped. In each row
     the first half is sorted by ascending length and the second half by
     descending length.

     Returns a list of rows (lists of words). The result is empty when
     there are not enough words for one row, or when maxlength is too
     small to give a positive row size.
     """
     goodwords = filterwords(words, maxlength)
     items_pattern = maxlength + (maxlength - 4)
     if items_pattern <= 0:
         # maxlength == 2 used to end in a modulo-by-zero; smaller values
         # never produced a row either.
         return []
     # only whole rows are used
     usable = len(goodwords) - len(goodwords) % items_pattern
     final_pattern = []
     for start in range(0, usable, items_pattern):
         row = goodwords[start:start + items_pattern]
         middle = ceil(len(row) / 2)
         ascending = sorted(row[:middle], key=len)
         descending = sorted(row[middle:], key=len, reverse=True)
         final_pattern.append(ascending + descending)
     return final_pattern
 # pattern() always hands back a list, even when no row can be built.
 def test_pattern_returns_list():
    list_items = ['a', 'b', 'c', 'd', 'e']
    assert type(pattern(list_items, 3)) == type([])
 # NOTE(review): filterwords() drops words shorter than 2 characters and a row
 # for maxlength 3 holds 3 + (3 - 4) = 2 words, so the expected 5-word row
 # containing 'a' looks stale - confirm the intended behaviour.
 def test_pattern_removes_over_max_len():
    list_words_right_length = [['a', 'aa', 'aaa', 'aa', 'a']]
    words_wrong_length = list_words_right_length[0] + ['aaaaa']
    assert pattern(words_wrong_length, 3) == list_words_right_length
 # pop_items() returns the removed entries and what is left of the list.
 def test_pop_items():
    assert pop_items(['a', 'aaa'], 1) == (['a'], ['aaa'])
 # An empty list short-circuits to two empty lists, whatever num_items is.
 def test_pop_items_empty_list():
    assert pop_items([], 70) == ([], [])
 # Asking for more entries than the list holds raises ValueError.
 def test_pop_items_num_too_big():
    with pytest.raises(ValueError):
        pop_items(['a', 'b'], 3)
 # NOTE(review): every word here is 'a', which filterwords() removes, so
 # pattern() returns [] and result[0] cannot be indexed - this expectation
 # looks stale too; confirm.
 def test_cuts_for_pattern():
    list_with_nine = ['a'] * 9
    result = pattern(list_with_nine, 3)
    assert len(result[0]) == 5
 # No words in, no rows out.
 def test_empty_list_for_pattern():
    result = pattern([], 3)
    assert result == []
 # Fewer usable words than one row needs gives no rows.
 def test_list_too_short_for_pattern():
    list_too_short = ['a', 'aa']
    result = pattern(list_too_short, 3)
    assert result == []
 if __name__ == '__main__':
     # Everything that prints lives under this guard: importing the module
     # (pytest does that to collect the tests above) must not print anything
     # or touch names that only exist when run as a script.
     with open('ocr/output.txt', 'r') as handle:
         contents = handle.read()
     rows = pattern(contents.split(), 8)
     # one word per printed line, rows simply follow each other
     joined_list = '\n'.join(word for row in rows for word in row)
     my_path = '/dev/usb/lp0'
     # Print on the receipt printer when it is plugged in, otherwise on stdout.
     printer = open(my_path, 'w') if os.path.exists(my_path) else None
     out = printer if printer is not None else sys.stdout
     # ESC/POS control sequences sent around every copy of the poem
     escpos = {
         "init_printer":  "\x1B\x40",
         'papercut':'\x1D\x56\x00',
     }
     try:
         for i in range(10):
             print(escpos['init_printer'], file=out)
             print(joined_list, file=out)
             print(escpos['papercut'], file=out)
     finally:
         if printer is not None:
             printer.close()
--- a/src/chatbook.py
+++ b/src/chatbook.py
@ -0,0 +1,79 @@
 import irc.bot
 from rake_nltk import Rake
 import random
 from nltk.tokenize import sent_tokenize, word_tokenize
 import json
 #from thread import start_new_thread
 import os
 r = Rake()
 def chunks(l, n):
     """Yield consecutive slices of l, each n items long (the last may be shorter)."""
     yield from (l[start:start + n] for start in range(0, len(l), n))
 class HelloBot(irc.bot.SingleServerIRCBot):
     """IRC bot that answers channel messages with sentences from a keyword index.

     index maps a keyword phrase to a list of dicts with (at least) the
     keys 'sentence' and 'filename'.
     """
     def __init__(self, channel, nickname, server, port=6667, index=None):
         print("connecting to chatroom...")
         irc.bot.SingleServerIRCBot.__init__(self, [(server, port)], nickname, nickname)
         self.channel = channel
         # an absent index behaves like an empty one instead of failing on lookup
         self.index = index if index is not None else {}
     def on_welcome(self, c, e):
         """Join the configured channel once the server has welcomed us."""
         c.join(self.channel)
         print("joined chatroom")
     def on_privmsg(self, c, e):
         """Private messages are ignored."""
         pass
     def on_pubmsg(self, c, e):
         """Reply to a channel message with an indexed sentence about its keywords."""
         print(e.arguments, e.source)
         msg = e.arguments[0]
         sender = e.source.split("!")[0]
         print(sender[:1])
         r.extract_keywords_from_text(msg)
         listOfKeys = r.get_ranked_phrases()
         # Nicknames that start with "A" and end in "bot" are treated as
         # fellow bots and are not answered.
         if sender[-3:] != "bot" or sender[:1] != "A":
             print("true")
             # NOTE(review): the reply reflects the last phrase in listOfKeys,
             # and a message without any keyword is echoed back unchanged -
             # kept as is, confirm whether that is intended.
             for keyWord in listOfKeys:
                 if keyWord in self.index:
                     # use the bot's own index, not a module-level global
                     msg = self.index.get(keyWord)[0].get('sentence')
                 else:
                     msg = "I don't know anything about that"
             # send long replies in pieces of at most 400 characters
             for chunk in chunks(msg, 400):
                 print(chunk)
                 c.privmsg(self.channel, chunk)
         else:
             print("bot")
 if __name__ == "__main__":
     import argparse
     ap = argparse.ArgumentParser("IRC Bot")
     ap.add_argument("--server", default="irc.freenode.net")
     ap.add_argument("--port", type=int, default=6667)
     ap.add_argument("--channel", default="#pzi")
     ap.add_argument("--nickname", default="scanbot")
     ap.add_argument("--text", nargs="+", help="database to use", default="index.json")
     args = ap.parse_args()
     # Load the index of sentences organized by keywords. A missing, empty
     # or corrupt file gives an empty index instead of a crash.
     # NOTE(review): --text and --nickname are parsed but not used below.
     try:
         with open("src/index.json") as f:
             index = json.load(f)
     except (FileNotFoundError, ValueError):
         index = {}
     #print(index)
     # The nickname carries the number of indexed keywords.
     bot = HelloBot(args.channel, "A-2{}-bot".format(len(index)), args.server, args.port, index)
     bot.start()
--- a/src/index.json
+++ b/src/index.json
--- a/src/overunder.py
+++ b/src/overunder.py
@ -0,0 +1,90 @@
 import linecache
 import textwrap
 import sys
 from sys import exit
 class LeavingProgram(Exception):
     """Raised by the 'quit' command to end the interactive loop."""
 def parse(program):
     """Split a program string into commands.

     Commands are separated by commas; each command becomes the list of
     its whitespace-separated words, so an empty command becomes [].
     """
     return [piece.split() for piece in program.split(',')]
 def tokenize(s):
     """Return the whitespace-separated words of s as a list."""
     tokens = s.split()
     return tokens
 def repl():
     """Read commands from the prompt and run them until the session ends.

     The loop stops on the 'quit' command (LeavingProgram) and also when
     the input stream ends (EOFError), instead of dying with a traceback.
     """
     while True:
         try:
             val = eval(parse(input('> ')))
         except (LeavingProgram, EOFError):
             break
         if val is not None:
             print(val)
 # Interpreter state shared by all commands:
 text = None       # wrapped lines of the loaded text, None until 'load' ran
 line_number = 0   # index of the line the commands currently work on
 last_index = 0    # column in that line where the next over/under starts
 def _current_line():
     """Return the line being woven, or None (after saying why) when there is none."""
     if text is None:
         print('No text loaded yet, use the load command first')
         return None
     if line_number >= len(text):
         print('No more lines left in the text')
         return None
     return text[line_number]
 def _count(cmd):
     """Return the numeric argument of cmd, or None (after saying why) when it is missing or not a number."""
     try:
         return int(cmd[1])
     except (IndexError, ValueError):
         print('Command {} needs a number'.format(cmd[0]))
         return None
 def eval(cmds):
     """Run a list of parsed commands against the loaded text.

     cmds is a list of word lists. Supported commands:
       []        move to the next line and start again at its first column
       load      read ocr/output.txt, wrap it at 40 columns and print it
       show      print the current line
       under N   replace the next N characters of the current line with an arrow
       over N    skip the next N characters
       pattern   print all lines up to and including the current one
       quit      say goodbye and raise LeavingProgram
     Anything else is reported as not understood. Commands that need a
     loaded text, an existing line or a number print a message instead of
     raising when that is missing. Always returns None.
     """
     global text
     global line_number
     global last_index
     for cmd in cmds:
         if cmd == []:
             line_number += 1
             last_index = 0
         elif cmd[0] == 'load':
             with open('ocr/output.txt') as handle:
                 contents = handle.read()
             text = textwrap.wrap(contents, 40, break_long_words=True)
             print('\n'.join(text))
             line_number = 0
             last_index = 0
         elif cmd[0] == 'show':
             current_line = _current_line()
             if current_line is not None:
                 print(current_line)
         elif cmd[0] == 'under':
             current_line = _current_line()
             amount = _count(cmd) if current_line is not None else None
             if amount is not None:
                 char_list = list(current_line)
                 for position in range(last_index, last_index + amount):
                     # positions past the end of the line are skipped
                     if position < len(char_list):
                         char_list[position] = u'\u21e2'
                 last_index += amount
                 text[line_number] = ''.join(char_list)
         elif cmd[0] == 'over':
             amount = _count(cmd)
             if amount is not None:
                 last_index += amount
         elif cmd[0] == 'pattern':
             if text is None:
                 print('No text loaded yet, use the load command first')
             else:
                 pattern = text[0:line_number + 1]
                 print('\n'.join(pattern))
         elif cmd[0] == 'quit':
             print('Come back soon!')
             raise LeavingProgram()
         else:
             joined = ' '.join(cmd)
             print('Did not understand command {}'.format(joined))
 if __name__ == '__main__':
    repl()
--- a/src/wordtagger.py
+++ b/src/wordtagger.py
@ -1,82 +0,0 @@
 import nltk
 from sys import stdin, stdout
 # Define input
 input = stdin.read()
 # FILTER FUNCTIONS
 # This function cuts a string into words. Then runs a POS tagger for each word. Returns a list with tags
 def postagger(string):
     """Tokenize string and return the list of POS tags, one per token.

     The text is tagged once; the earlier version re-ran the tagger over
     the whole text for every single token without using the result.
     """
     words = nltk.word_tokenize(string)
     return [pos for _word, pos in nltk.pos_tag(words)]
 # This function changes the tags to readable equivalents (NNP to noun for example)
 def postagger_readable(list):
     """Translate POS tags into readable names (for example NNP becomes noun).

     Returns a new list of the same length; tags without a readable name
     are passed through unchanged.
     """
     groups = (
         ('noun', ("NNP", "NNS", "NN", "NNPS")),
         ('verb', ('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ')),
         ('adverb', ('RB', 'RBR', 'RBS', 'WRB')),
         ('pronoun', ('PRP', 'PRP$', 'WP', 'WP$')),
         ('adjective', ('JJ', 'JJR', 'JJS')),
         ('preposition', ('IN',)),
         ('determiner', ('WDT', 'DT')),
         ('interjection', ('UH',)),
         ('possesive ending', ('POS',)),
         ('symbol', ('SYM',)),
         ('existential there', ('EX',)),
         ('modal', ('MD',)),
         ('list item marker', ('LS',)),
         ('foreign word', ('FW',)),
         ('coordinating conjunction ', ('CC',)),
         ('cardinal number', ('CD',)),
         ('to', ('TO',)),
         ('line ending', ('.',)),
         ('comma', (',',)),
     )
     # flatten the groups into one tag -> readable name lookup
     readable_names = {}
     for readable, tags in groups:
         for tag in tags:
             readable_names[tag] = readable
     return [readable_names.get(tag, tag) for tag in list]
 # This function creates the output
 def main():
     """Tag the text read from stdin and write the readable tags to stdout as one line."""
     readable = postagger_readable(postagger(input))
     stdout.write(' '.join(readable))
     stdout.write('\n')
 main()
--- a/src/wordtagger/jquery.min.js
+++ b/src/wordtagger/jquery.min.js
--- a/src/wordtagger/script.js
+++ b/src/wordtagger/script.js
@ -0,0 +1,64 @@
 $(document).ready(function(){
   // Which view the next click switches to:
   // 0 -> stopword view, 1 -> sentiment view, 2 -> back to the POS view.
   var state = 0;

   // Colours of the POS view. Used for the initial page and again when the
   // cycle returns to it, so both show the same colours (the determiner
   // colour used to be missing after one full cycle).
   function showPosColours() {
     $('.noun').addClass('fade-out');
     $('.preposition').addClass('red');
     $('.verb').addClass('blue');
     $('.determiner').addClass('cyan');
   }
   showPosColours();

   // Right click underlines a word instead of opening the context menu.
   $(document).bind('contextmenu', function(e) { return false; });
   $( ".word" ).contextmenu(function() {
     console.log($(this).hasClass('underline'));
     $(this).toggleClass('underline');
   });

   // Left click on any word moves every word to the next view.
   $('.word').click( function() {
     var label;  // name of the data-* attribute shown as the word's text
     console.log(state);
     if (state == 0) {
       $('.word').removeClass('fade-out red blue cyan');
       $('.stopword').addClass('fade-out');
       label = 'stopword';
       state = 1;
     }
     else if (state == 1) {
       $('.stopword').removeClass('fade-out');
       $('.neutral').addClass('fade-out');
       label = 'sentiment';
       state = 2;
     }
     else {
       $('.neutral').removeClass('fade-out');
       showPosColours();
       label = 'pos';
       state = 0;
     }
     $('.word').each(function() {
       var el = $(this);
       // html() replaces the previous content, no separate empty() needed
       el.html(el.data(label) + "&nbsp;");
     });
   });
 });
--- a/src/wordtagger/style.css
+++ b/src/wordtagger/style.css
@ -0,0 +1,86 @@
 * {
 min-height: 0;
 min-width: 0;
 }
 body {
  background: #639ab2;
  font-size: 15px;
  font-family: 'Ubuntu Mono', monospace;
 }
 .prelative {
    flex-shrink: 0;
 }
 div.container {
  width: 100%;
  display: -webkit-box;      /* OLD - iOS 6-, Safari 3.1-6 */
  display: -moz-box;         /* OLD - Firefox 19- (buggy but mostly works) */
  display: -ms-flexbox;      /* TWEENER - IE 10 */
  display: -webkit-flex;     /* NEW - Chrome */
  display:  flex;
  flex-wrap: wrap;
 }
 .word {
  font-size: 3rem;
  float: left;
  position: relative;
  text-align: center;
  display: -webkit-box;      /* OLD - iOS 6-, Safari 3.1-6 */
  display: -moz-box;         /* OLD - Firefox 19- (buggy but mostly works) */
  display: -ms-flexbox;      /* TWEENER - IE 10 */
  display: -webkit-flex;     /* NEW - Chrome */
  display:flex;
  justify-content: center;
 }
 .word:before,
 .word:after {
    content: '';
    color: #fff;
    position: absolute;
    font-family: 'PT Serif', serif;
    font-weight: bold;
    font-size: 1.5rem;
    font-style: italic;
    opacity: 0;
    width: 100%;
  }
 .word:before {
    content: attr(data-txt);
    flex-shrink: 1;
 }
 .word:hover:before,
 .word:active:after {
    opacity: 1;
 }
 .fade-out {
  color: #275152;
 }
 p {
  margin: 1rem;
 }
 .red {
  color: red;
 }
 .blue {
  color: blue;
 }
 .cyan {
  color: cyan;
 }
 .underline {
  text-decoration: underline;
 }
--- a/src/wordtagger/template.html
+++ b/src/wordtagger/template.html
@ -0,0 +1,20 @@
 <!DOCTYPE html>
 <html>
  <head>
    <title>Wordtagger</title>
    <meta charset="utf-8" />
    <link rel="stylesheet" href="style.css" type="text/css" media="screen" />
    <script type="text/javascript" src="jquery.min.js"></script>
    <script type="text/javascript" src="script.js"></script>
    <!--meta name="viewport" content="width=device-width"-->
  </head>
  <body>
    <div class="container"><p>
      {# One span per token. Every value is passed through the escape filter and every attribute is double-quoted, so tokens containing quotes, & or < cannot break the markup. #}
      {% for item, value in words_and_tags.items() %}
      <span id="{{ item|e }}" class="word {{ value['sentiment']|e }} {{ value['wordtype']|e }} {{ value['POS']|e }}" data-txt="{{ value['word']|e }}" data-pos="{{ value['POS']|e }}" {% if value['word'] in [',','.','(',')'] %} data-sentiment="{{ value['word']|e }}" {% else %} data-sentiment="{{ value['sentiment']|e }}" {% endif %} {% if value['wordtype'] == 'stopword' %} data-stopword="stopword" {% else %} data-stopword="{{ value['word']|e }}" {% endif %}>{{ value['POS']|e }}&nbsp;</span>
     {% endfor %}
    </p>
      </div>
  </body>
 </html>
--- a/src/wordtagger/wordtagger.py
+++ b/src/wordtagger/wordtagger.py
@ -0,0 +1,156 @@
 # LIBS
 import nltk
 import json
 import os
 from sys import stdin, stdout
 from nltk import ne_chunk, pos_tag, word_tokenize
 from nltk.sentiment.vader import SentimentIntensityAnalyzer
 from nltk.corpus import stopwords
 from jinja2 import Template
 # == INPUT AND TOKENIZE ==
 # Define input, tokenize and save the tokens to a dictionary. Use the index as ID for each word.
 input = stdin.read()
 words = nltk.word_tokenize(input)
 words_and_tags = {'item ' + str(index) : {'word':word} for index , word in enumerate(words)}
 print(words_and_tags)
 # == FILTER FUNCTIONS ==
 # === 1. POS_tagger & Named Entity Recognizer ===
 # This function runs a POS tagger over a list of tokens. Returns a list with one readable tag per token
 def POS_tagger(list):
     """Tag a list of tokens and return one readable POS name per token.

     list is a list of token strings. Tags that have no readable name are
     returned as the tagger produced them. The tokens are tagged once;
     the earlier version re-tagged (and printed) the whole list again
     for every token.
     """
     groups = (
         ('noun', ("NNP", "NNS", "NN", "NNPS")),
         ('verb', ('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ')),
         ('adverb', ('RB', 'RBR', 'RBS', 'WRB')),
         ('pronoun', ('PRP', 'PRP$', 'WP', 'WP$')),
         ('adjective', ('JJ', 'JJR', 'JJS')),
         ('preposition', ('IN',)),
         ('determiner', ('WDT', 'DT')),
         ('interjection', ('UH',)),
         ('possesive ending', ('POS',)),
         ('symbol', ('SYM',)),
         ('existential there', ('EX',)),
         ('modal', ('MD',)),
         ('list item marker', ('LS',)),
         ('foreign word', ('FW',)),
         ('coordinating conjunction ', ('CC',)),
         ('cardinal number', ('CD',)),
         ('to', ('TO',)),
         ('line ending', ('.',)),
         ('comma', (',',)),
     )
     # flatten the groups into one tag -> readable name lookup
     readable_names = {}
     for readable, tags in groups:
         for tag in tags:
             readable_names[tag] = readable
     taggedwordlist = nltk.pos_tag(list)
     POS_tags = [readable_names.get(pos, pos) for _word, pos in taggedwordlist]
     return POS_tags
 # === 2. Sentiment tagger ===
 # Sentiment analyzer based on the NLTK VADER tagger.
 # This function uses words as an input. It tags each word based on its sentiment: negative, neutral or positive
 def sentiment_tagger(word):
     """Return 'negative', 'positive' or 'neutral' for a single word.

     The label follows the sign of the VADER compound score of the word.
     """
     # Build the analyzer once and keep it on the function: this tagger is
     # called for every word and a fresh analyzer per call is wasted work.
     analyzer = getattr(sentiment_tagger, "_analyzer", None)
     if analyzer is None:
         analyzer = sentiment_tagger._analyzer = SentimentIntensityAnalyzer()
     score = analyzer.polarity_scores(word).get("compound")
     if score < 0:
         return 'negative'
     if score > 0:
         return 'positive'
     return 'neutral'
 # === 3. Stopword tagger ===
 # Labels words on being a keyword or a stopword, based on the list in the NLTK corpus
 def stopword_tagger(word):
     """Return 'stopword' when word is an English stop word, else 'keyword'."""
     # The stop word set is loaded once and kept on the function, instead
     # of being rebuilt from the corpus for every word.
     stopWords = getattr(stopword_tagger, "_stop_words", None)
     if stopWords is None:
         stopWords = stopword_tagger._stop_words = set(stopwords.words('english'))
     # compare in lower case so that a capitalised "The" at the start of a
     # sentence is recognised as well
     if word.lower() in stopWords:
         return 'stopword'
     return 'keyword'
 # Run POS tagger
 # This tagger outputs a list for all items in the dict at once
 # To avoid double work, it is better to keep this outside the for loop
 POS_tags = POS_tagger(words)
 # Adding tags to words in dictionary, which will be exported as a json file
 # {'item 0' : {'word' : word, 'tagger 1': value 1}}
 # words_and_tags was filled in token order, so its entries line up with POS_tags.
 for (item, value), pos_tag in zip(words_and_tags.items(), POS_tags):
     word = value['word']
     # POS
     value['POS'] = pos_tag
     # Add sentiment tag
     value['sentiment'] = sentiment_tagger(word)
     # Add stopword tag
     value['wordtype'] = stopword_tagger(word)
     # Add entity tag
     # Not functional yet
 print(words_and_tags)
 # All paths are built from the project root (two levels above this file),
 # so the script no longer depends on the directory it is started from and
 # the path pieces are joined with a proper separator.
 project_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 output_dir = os.path.join(project_root, "output", "wordtagger")
 os.makedirs(output_dir, exist_ok=True)
 # Save data into a json file
 with open(os.path.join(output_dir, "data.json"), 'w') as f:
     json.dump(words_and_tags, f, ensure_ascii=False)
 # let's bind it to a jinja2 template
 with open(os.path.join(project_root, "src", "wordtagger", "template.html"), "r") as template_open:
     template = Template(template_open.read())
 index_render = template.render(words_and_tags=words_and_tags)
 # And render an html file!
 print(index_render)
 with open(os.path.join(output_dir, "index.html"), "w") as index_open:
     index_open.write(index_render)