Merge branch 'master' of git.xpub.nl:/var/www/git.xpub.nl/repos/OuNoPo-make

Branch: master · ange committed 7 years ago · commit b66c5f7d54

.DS_Store (BIN, vendored) · Binary file not shown.

.gitignore (vendored) · 2 lines changed

@@ -1,3 +1,3 @@
images/**
output/**
src/index.json

@@ -1,4 +1,5 @@
images=$(sort $(wildcard images/*.jpg))
# TODO @andre: make the wildcard match any image file while excluding the listimg.txt file
output_ocr:=$(dir_ocr)/output.txt
tmpfile:= $(shell mktemp)
space:= $(empty) $(empty)
@@ -31,12 +32,13 @@ clean: ## removes output (target) files
dirs: ## create the dirs in working dir
	@-mkdir -p images/
	@-mkdir -p images-tiff/
	@-mkdir -p output/
	@-mkdir -p ocr/
	@-mkdir -p hocr/
	@echo $(color_r)'Directories made': images/ output/
testif:
ifeq ($(OS),Darwin)
	@echo $(OS)
@@ -49,13 +51,31 @@ ocr/output.txt: ## ocr with tesseract
	echo $(listimgs) > $(@D)/list.txt
	@echo $(basename $@)
	tesseract $(@D)/list.txt $(basename $@)
	python3 src/build_database.py $(@)
tiffs: ## convert images/ to images-tiff/ Depends on ImageMagick
	echo $(images)
	for i in $(images); \
	do tiff=`basename $$i .jpg`.tiff; \
	convert -density 300 $$i -alpha on images-tiff/$$tiff; \
	echo $$tiff; \
	done;
hocrs: ## hocr with tesseract and then change extension to .html
	for i in images-tiff/*.tiff; \
	do echo $$i; hocrfile=`basename $$i .tiff`; \
	tesseract $$i hocr/$$hocrfile hocr; \
	mv hocr/$$hocrfile.hocr hocr/$$hocrfile.html; \
	done;
# OUTPUT GENERATION RECIPES
output/tagged-words.txt: ocr/output.txt ## Analyzes OCR'ed text using a Part of Speech (POS) tagger. Outputs a string of tags (e.g. nouns, verbs, adjectives, and adverbs). Dependencies: python3's nltk, nltk's averaged_perceptron_tagger
	cat $< | python3 src/wordtagger.py > $(@)
output/wordtagger/index.html: ocr/output.txt ## Analyzes OCR'ed text using a Part of Speech (POS) tagger. Outputs a string of tags (e.g. nouns, verbs, adjectives, and adverbs). Dependencies: python3's nltk, jinja2
	mkdir -p output/wordtagger
	cp src/wordtagger/jquery.min.js output/wordtagger
	cp src/wordtagger/script.js output/wordtagger
	cp src/wordtagger/style.css output/wordtagger
	cat $< | python3 src/wordtagger/wordtagger.py
# install nltk's 'averaged_perceptron_tagger':
# $ python3
# >>> import nltk
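# (The rest of this comment falls outside the diff hunk. A hedged sketch of the
# usual next step with NLTK's downloader, which may differ from the author's
# original lines:)
# >>> nltk.download('averaged_perceptron_tagger')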
@@ -65,9 +85,16 @@ output/chatbot.txt: ocr/output.txt ## Comments a text with a simple chatbot. Dep
	cat $< | python3 src/textbotconversation.py $(@)
output/n7.txt: ocr/output.txt ## DESCRIBE WHAT IT DOES. Dependencies: python3's chatterbot
output/n7.txt: ocr/output.txt ## Replaces nouns with the 7th noun that follows. Dependencies: 91k_nouns
	cat $< | python3 src/n_7.py > $(@)
output/carlandre.txt: ocr/output.txt ## Alice: Creates visual poetry out of a text. Dependencies: pytest
	cat $< | python3 src/carlandre.py > $(@)
# cat $(@) > /dev/usb/lp0
output/overunder: ocr/output.txt ## Alice: An interpreted language that translates simple weaving instructions and creates a weaving pattern on the text.
	python3 src/overunder.py
visualization: $(images) $(tmpfile) ## Creates a data visualization from images/*.jpg. Dependencies: mplayer
	@echo $(tmpfile)
@@ -83,3 +110,6 @@ endif
ttssr-human-only: ocr/output.txt ## Loop: text-to-speech / speech recognition. Dependencies: espeak, pocketsphinx
	bash src/ttssr-loop-human-only.sh ocr/output.txt
chatbook: ocr/output.txt ## Chatbot based on the knowledge of the scans. Dependencies: rake_nltk, irc, nltk
	python3 src/chatbook.py
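# A hedged end-to-end sketch using the targets above (illustrative only; it
# assumes scans have already been copied into images/ and that prerequisites
# defined outside this hunk are in place):
# $ make dirs
# $ make tiffs
# $ make ocr/output.txt
# $ make output/carlandre.txt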

@@ -0,0 +1,3 @@
images/0029.jpg

@@ -0,0 +1,37 @@
ZEROS + ONES: DIGITAL WOMEN + THE NEW TECHNOCULTURE
moments of unknown, disconnected lives, "invisible voices
conducted through the tips of her fingers."
Poised as an interface between man and the world, she is
also wired to a network of digital machines: typists connected to
QWERTY alphabets, bodies shaped by the motion of the keys,
one hundred words a minute, viral speed. Thousands of opera-
tors, relays, calls, exchanges humming in virtual conjunction,
learning the same phrases, flipping the same switches,
repeating the same responses, pushing plugs into the
answering jacks, maybe two hundred, three hundred times an
hour. She has "a fingertip mastery of the ringing, listening, dial,
and other keys on her key shelf; of the row or rows of cords for
making connections; of the location and meaning of all parts of
the honeycombed formation of jacks and trunks for recording,
for switching, for toll circuits, for tandem, for information." It
becomes second nature; it grows on her. "Having done this stuff
a few hundred thousand times, you become quite good at it. In
fact you're plugging, and connecting, and disconnecting ten,
twenty, forty cords at a time." After a while these processes
become "quite satisfying in a way, rather like weaving on an
upright loom."
102

@@ -0,0 +1,38 @@
import json
import argparse
import sys

from nltk.tokenize import sent_tokenize, word_tokenize
from rake_nltk import Rake

r = Rake()

ap = argparse.ArgumentParser("JSON Dumper")
ap.add_argument("text", nargs="+", help="text sources")
args = ap.parse_args()

with open('src/index.json') as f:
    try:
        index = json.load(f)
    except json.JSONDecodeError:
        index = {}

# build the index of sentences organized by keywords
alltext = ""
for n in args.text:
    text = open(n).read()
    text = text.replace("\n", " ")
    sentences = sent_tokenize(text)
    for sentence in sentences:
        r.extract_keywords_from_text(sentence)
        keys = r.get_ranked_phrases()
        for key in keys:
            if key not in index:
                index[key] = []
            index[key].append({'filename': n, 'sentence': sentence, 'key': key})
    alltext += text

#print(index)
with open('src/index.json', 'w') as outfile:
    json.dump(index, outfile)
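A hedged sketch of the structure this script writes back to src/index.json: each
RAKE keyphrase maps to a list of the sentences it was extracted from (the
keyphrase and sentence below are invented for illustration).

# Illustrative only:
# {
#   "upright loom": [
#     {"filename": "ocr/output.txt",
#      "sentence": "... rather like weaving on an upright loom.",
#      "key": "upright loom"}
#   ]
# }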

@@ -0,0 +1,129 @@
import pytest
from math import ceil
import sys
from sys import stdout
import time
import os.path


def pop_items(words, num_items):
    ''' Removes num_items from words.'''
    if not words:
        return [], []
    if num_items > len(words):
        raise ValueError('Not enough items!')
    popped = []
    for number in range(num_items):
        removed = words.pop(0)
        popped.append(removed)
    return popped, words


def all_words_less_than(words, maxlength):
    ''' Checks if the words have the correct length given in maxlength'''
    for word in words:
        if len(word) > maxlength:
            return False
    return True


def filterwords(words, maxlength):
    ''' Puts the words which have the correct length in a new list '''
    goodwords = []
    for word in words:
        if len(word) <= maxlength and len(word) >= 2:
            goodwords.append(word)
    return goodwords


def pattern(words, maxlength):
    goodwords = filterwords(words, maxlength)
    items_pattern = maxlength + (maxlength - 4)
    if len(goodwords) % items_pattern != 0:
        rest = len(goodwords) % items_pattern
        difference = len(goodwords) - rest
        goodwords = goodwords[:difference]
    times = int(len(words) / items_pattern)
    final_pattern = []
    for each_time in range(times):
        popped, whatisleft = pop_items(goodwords, items_pattern)
        if not popped:
            continue
        goodwords = whatisleft
        middle = ceil(len(popped)/2)
        ascending = sorted(popped[:middle], key=len)
        descending = sorted(popped[middle:], key=len, reverse=True)
        sorted_pattern = ascending + descending
        final_pattern.append(sorted_pattern)
    return final_pattern


def test_pattern_returns_list():
    list_items = ['a', 'b', 'c', 'd', 'e']
    assert type(pattern(list_items, 3)) == type([])


def test_pattern_removes_over_max_len():
    list_words_right_length = [['a', 'aa', 'aaa', 'aa', 'a']]
    words_wrong_length = list_words_right_length[0] + ['aaaaa']
    assert pattern(words_wrong_length, 3) == list_words_right_length


def test_pop_items():
    assert pop_items(['a', 'aaa'], 1) == (['a'], ['aaa'])


def test_pop_items_empty_list():
    assert pop_items([], 70) == ([], [])


def test_pop_items_num_too_big():
    with pytest.raises(ValueError):
        pop_items(['a', 'b'], 3)


def test_cuts_for_pattern():
    list_with_nine = ['a'] * 9
    result = pattern(list_with_nine, 3)
    assert len(result[0]) == 5


def test_empty_list_for_pattern():
    result = pattern([], 3)
    assert result == []


def test_list_too_short_for_pattern():
    list_too_short = ['a', 'aa']
    result = pattern(list_too_short, 3)
    assert result == []


if __name__ == '__main__':
    with open('ocr/output.txt', 'r') as handle:
        contents = handle.read()
    splitted = contents.split()
    ll = pattern(splitted, 8)
    my_list = []
    for l in ll:
        for x in l:
            my_list.append(x)
    joined_list = '\n'.join(my_list)

    my_path = '/dev/usb/lp0'
    if os.path.exists(my_path):
        sys.stdout = open(my_path, 'w')

    escpos = {
        "init_printer": "\x1B\x40",
        'papercut': '\x1D\x56\x00',
    }

    for i in range(10):
        print(escpos['init_printer'])
        print(joined_list)
        print(escpos['papercut'])
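A hedged usage sketch of pattern() (the word list below is invented for
illustration): with maxlength=8 a group holds 8 + (8-4) = 12 length-filtered
words, the first six sorted short-to-long and the last six long-to-short, so
each printed stanza swells and then tapers.

# Illustrative only, not part of the committed file:
words = ['in', 'at', 'the', 'and', 'cords', 'fact',
         'plugging', 'humming', 'virtual', 'same', 'you', 'are']
stanza = pattern(words, 8)[0]
print('\n'.join(stanza))  # grows from 'in' to 'plugging', then tapers to 'are'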

@@ -0,0 +1,79 @@
import irc.bot
from rake_nltk import Rake
import random
from nltk.tokenize import sent_tokenize, word_tokenize
import json
#from thread import start_new_thread
import os

r = Rake()


def chunks(l, n):
    for i in range(0, len(l), n):
        yield l[i:i+n]


class HelloBot(irc.bot.SingleServerIRCBot):
    def __init__(self, channel, nickname, server, port=6667, index=None):
        print("connecting to chatroom...")
        irc.bot.SingleServerIRCBot.__init__(self, [(server, port)], nickname, nickname)
        self.channel = channel
        self.index = index

    def on_welcome(self, c, e):
        c.join(self.channel)
        print("joined chatroom")

    def on_privmsg(self, c, e):
        pass

    def on_pubmsg(self, c, e):
        print(e.arguments, e.source)
        msg = e.arguments[0]
        print(e.source.split("!")[0][:1])
        incoming_msg = e.arguments[0]
        r.extract_keywords_from_text(msg)
        listOfKeys = r.get_ranked_phrases()
        msg_where = ""
        # respond unless the sender's nickname both starts with "A" and ends in "bot" (its own naming scheme)
        if e.source.split("!")[0][-3:] != "bot" or e.source.split("!")[0][:1] != "A":
            print("true")
            for keyWord in listOfKeys:
                if keyWord in self.index:
                    msg = self.index.get(keyWord)[0].get('sentence')
                    msg_where = "I found this in {}".format(self.index.get(keyWord)[0].get('filename'))
                else:
                    msg = "I don't know anything about that"
                    msg_where = ""
            for chunk in chunks(msg, 400):
                print(chunk)
                c.privmsg(self.channel, chunk)
        else:
            print("bot")


if __name__ == "__main__":
    import argparse
    import sys

    ap = argparse.ArgumentParser("IRC Bot")
    ap.add_argument("--server", default="irc.freenode.net")
    ap.add_argument("--port", type=int, default=6667)
    ap.add_argument("--channel", default="#pzi")
    ap.add_argument("--nickname", default="scanbot")
    ap.add_argument("--text", nargs="+", help="database to use", default="index.json")
    args = ap.parse_args()

    # build the index of sentences organized by keywords
    with open("src/index.json") as f:
        try:
            index = json.load(f)
        except json.JSONDecodeError:
            index = {}
    #print(index)

    myhost = os.uname()[1]
    bot = HelloBot(args.channel, "A-2{}-bot".format(len(index)), args.server, args.port, index)
    bot.start()
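A hedged invocation sketch (the flags correspond to the argparse options above;
run it from the repository root so that src/index.json resolves; the bot names
itself "A-2<N>-bot", where N is the number of keyphrases in the index):

# Illustrative only:
# $ python3 src/chatbook.py --server irc.freenode.net --channel "#pzi"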

File diff suppressed because one or more lines are too long

@@ -0,0 +1,90 @@
import linecache
import textwrap
import sys
from sys import exit


class LeavingProgram(Exception):
    pass


def parse(program):
    cmds = program.split(',')
    splitted_cmds = []
    for cmd in cmds:
        splitted = cmd.split()
        splitted_cmds.append(splitted)
    return splitted_cmds
    #return tokenize(program)


def tokenize(s):
    return s.split()


def repl():
    while True:
        try:
            val = eval(parse(input('> ')))
            if val is not None:
                print(val)
        except LeavingProgram:
            break


text = None
line_number = 0
last_index = 0


def eval(cmds):
    global text
    global line_number
    global last_index
    for cmd in cmds:
        if cmd == []:
            line_number += 1
            last_index = 0
        elif cmd[0] == 'load':
            contents = open('ocr/output.txt').read()
            text = textwrap.wrap(contents, 40, break_long_words=True)
            print('\n'.join(text))
            line_number = 0
            last_index = 0
        elif cmd[0] == 'show':
            print(text[line_number])
        elif cmd[0] == 'under':
            current_line = text[line_number]
            char_number = int(cmd[1]) - 1
            char_list = list(current_line)
            x = range(last_index, char_number + last_index + 1)
            for time in x:
                if time < len(char_list):
                    char_list[time] = u'\u21e2'
            last_index += char_number + 1
            joined = ''.join(char_list)
            text[line_number] = joined
        elif cmd[0] == 'over':
            last_index += int(cmd[1])
        elif cmd[0] == 'pattern':
            pattern = text[0:line_number + 1]
            print('\n'.join(pattern))
        elif cmd[0] == 'quit':
            print('Come back soon!')
            raise LeavingProgram()
        else:
            joined = ' '.join(cmd)
            print('Did not understand command {}'.format(joined))


if __name__ == '__main__':
    repl()
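A hedged session sketch for this repl (presumably src/overunder.py, invoked by
the Makefile's output/overunder target). Commands are separated by commas, and
an empty command moves to the next line; exact output depends on ocr/output.txt.

# Illustrative only:
# > load                wrap ocr/output.txt to 40 columns and print it
# > under 3, over 2     weave: cover the next 3 characters with U+21E2 arrows, then skip 2
# > , show              step to the next line and print it
# > pattern             print every line woven so far
# > quit                leave the repl ("Come back soon!")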

@@ -1,82 +0,0 @@
import nltk
from sys import stdin, stdout

# Define input
input = stdin.read()

# FILTER FUNCTIONS
# This function cuts a string into words. Then runs a POS tagger for each word. Returns a list with tags
def postagger(string):
    words = nltk.word_tokenize(string)
    taggedwordlist = nltk.pos_tag(words)
    for word, pos in nltk.pos_tag(words):
        taggedwordlist = nltk.pos_tag(words)
        #print('{0} is a {1}'.format(word,pos)) # Comment out to print the analysis step
    taglist = [ pos for word, pos in taggedwordlist ]
    #print(taglist)
    return taglist

# This function changes the tags to readable equivalents (NNP to noun for example)
def postagger_readable(list):
    readabletaglist = []
    for tag in list:
        if tag in {"NNP","NNS","NN","NNPS"}:
            readabletag = 'noun'
        elif tag in {'VB','VBD','VBG','VBN','VBP','VBZ'}:
            readabletag = 'verb'
        elif tag in {'RB','RBR','RBS','WRB'}:
            readabletag = 'adverb'
        elif tag in {'PRP','PRP$'}:
            readabletag = 'pronoun'
        elif tag in {'JJ','JJR','JJS'}:
            readabletag = 'adjective'
        elif tag == 'IN':
            readabletag = 'preposition'
        elif tag == 'WDT':
            readabletag = 'determiner'
        elif tag in {'WP','WP$'}:
            readabletag = 'pronoun'
        elif tag == 'UH':
            readabletag = 'interjection'
        elif tag == 'POS':
            readabletag = 'possesive ending'
        elif tag == 'SYM':
            readabletag = 'symbol'
        elif tag == 'EX':
            readabletag = 'existential there'
        elif tag == 'DT':
            readabletag = 'determiner'
        elif tag == 'MD':
            readabletag = 'modal'
        elif tag == 'LS':
            readabletag = 'list item marker'
        elif tag == 'FW':
            readabletag = 'foreign word'
        elif tag == 'CC':
            readabletag = 'coordinating conjunction'
        elif tag == 'CD':
            readabletag = 'cardinal number'
        elif tag == 'TO':
            readabletag = 'to'
        elif tag == '.':
            readabletag = 'line ending'
        elif tag == ',':
            readabletag = 'comma'
        else:
            readabletag = tag
        readabletaglist.append(readabletag)
    return readabletaglist

# This function creates the output
def main():
    taglist = postagger(input)
    readabletaglist = postagger_readable(taglist)
    stdout.write(' '.join(readabletaglist))
    stdout.write('\n')

main()

File diff suppressed because one or more lines are too long

@@ -0,0 +1,64 @@
$(document).ready(function(){
    var state = 0;
    $('.noun').addClass('fade-out');
    $('.preposition').addClass('red');
    $('.verb').addClass('blue');
    $('.determiner').addClass('cyan');

    $(document).bind('contextmenu', function(e) { return false; });

    $( ".word" ).contextmenu(function() {
        console.log($(this).hasClass('underline'));
        $(this).hasClass('underline') == false
            ? $(this).addClass('underline')
            : $(this).removeClass('underline');
    });

    $('.word').click( function() {
        var el = $('.word');
        console.log(state);
        if (state == 0) {
            $('.word').removeClass('fade-out red blue cyan');
            $('.stopword').addClass('fade-out');
        }
        else if (state == 1) {
            $('.stopword').removeClass('fade-out');
            $('.neutral').addClass('fade-out');
        }
        else if (state == 2) {
            $('.neutral').removeClass('fade-out');
            $('.noun').addClass('fade-out');
            $('.preposition').addClass('red');
            $('.verb').addClass('blue');
            state = -1;
        }
        $('.word').each(function() {
            var el = $(this);
            if (state == 0) {
                el.empty();
                el.html(el.data("stopword") + "&nbsp;");
            }
            else if (state == 1) {
                el.empty();
                el.html(el.data("sentiment") + "&nbsp;");
            }
            else {
                el.empty();
                el.html(el.data("pos") + "&nbsp;");
            }
        });
        state = state + 1;
    });
});

@@ -0,0 +1,86 @@
* {
    min-height: 0;
    min-width: 0;
}

body {
    background: #639ab2;
    font-size: 15px;
    font-family: 'Ubuntu Mono', monospace;
}

.prelative {
    flex-shrink: 0;
}

div.container {
    width: 100%;
    display: -webkit-box;  /* OLD - iOS 6-, Safari 3.1-6 */
    display: -moz-box;     /* OLD - Firefox 19- (buggy but mostly works) */
    display: -ms-flexbox;  /* TWEENER - IE 10 */
    display: -webkit-flex; /* NEW - Chrome */
    display: flex;
    flex-wrap: wrap;
}

.word {
    font-size: 3rem;
    float: left;
    position: relative;
    text-align: center;
    display: -webkit-box;  /* OLD - iOS 6-, Safari 3.1-6 */
    display: -moz-box;     /* OLD - Firefox 19- (buggy but mostly works) */
    display: -ms-flexbox;  /* TWEENER - IE 10 */
    display: -webkit-flex; /* NEW - Chrome */
    display: flex;
    justify-content: center;
}

.word:before,
.word:after {
    content: '';
    color: #fff;
    position: absolute;
    font-family: 'PT Serif', serif;
    font-weight: bold;
    font-size: 1.5rem;
    font-style: italic;
    opacity: 0;
    width: 100%;
}

.word:before {
    content: attr(data-txt);
    flex-shrink: 1;
}

.word:hover:before,
.word:active:after {
    opacity: 1;
}

.fade-out {
    color: #275152;
}

p {
    margin: 1rem;
}

.red {
    color: red;
}

.blue {
    color: blue;
}

.cyan {
    color: cyan;
}

.underline {
    text-decoration: underline;
}

@@ -0,0 +1,20 @@
<!DOCTYPE html>
<html>
<head>
  <title>Wordtagger</title>
  <meta charset="utf-8" />
  <link rel="stylesheet" href="style.css" type="text/css" media="screen" />
  <script type="text/javascript" src="jquery.min.js"></script>
  <script type="text/javascript" src="script.js"></script>
  <!--meta name="viewport" content="width=device-width"-->
</head>
<body>
  <div class="container"><p>
    {% for item, value in words_and_tags.items() %}
    <span id="{{item}}"
          class="word {{words_and_tags[item]['sentiment']}} {{words_and_tags[item]['wordtype']}} {{words_and_tags[item]['POS']}}"
          data-txt="{{ words_and_tags[item]['word'] }}"
          data-pos="{{words_and_tags[item]['POS']}}"
          {% if words_and_tags[item]['word'] in [',','.','(',')'] %} data-sentiment="{{ words_and_tags[item]['word'] }}" {% else %} data-sentiment='{{ words_and_tags[item]['sentiment'] }}' {% endif %}
          {% if words_and_tags[item]['wordtype'] == 'stopword' %} data-stopword="stopword" {% else %} data-stopword='{{ words_and_tags[item]['word'] }}' {% endif %}
          >{{words_and_tags[item]['POS']}}&nbsp;</span>
    {% endfor %}
  </p>
  </div>
</body>
</html>

@@ -0,0 +1,156 @@
# LIBS
import nltk
import json
import os
from sys import stdin, stdout
from nltk import ne_chunk, pos_tag, word_tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from jinja2 import Template

# == INPUT AND TOKENIZE ==
# Define input, tokenize and save tokens to dictionary. Use index as ID for each word.
input = stdin.read()
words = nltk.word_tokenize(input)
words_and_tags = {'item ' + str(index): {'word': word} for index, word in enumerate(words)}
print(words_and_tags)

# == FILTER FUNCTIONS ==

# === 1. POS_tagger & Named Entity Recognizer ===
# This function cuts a string into words. Then runs a POS tagger for each word. Returns a list with tags
def POS_tagger(list):
    taggedwordlist = nltk.pos_tag(list)
    for word, pos in nltk.pos_tag(list):
        taggedwordlist = nltk.pos_tag(list)
        #print('{0} is a {1}'.format(word,pos)) # Comment out to print the analysis step
    print(taggedwordlist)
    taglist = [ pos for word, pos in taggedwordlist ]
    POS_tags = []
    for tag in taglist:
        if tag in {"NNP","NNS","NN","NNPS"}:
            POS_tag = 'noun'
        elif tag in {'VB','VBD','VBG','VBN','VBP','VBZ'}:
            POS_tag = 'verb'
        elif tag in {'RB','RBR','RBS','WRB'}:
            POS_tag = 'adverb'
        elif tag in {'PRP','PRP$'}:
            POS_tag = 'pronoun'
        elif tag in {'JJ','JJR','JJS'}:
            POS_tag = 'adjective'
        elif tag == 'IN':
            POS_tag = 'preposition'
        elif tag == 'WDT':
            POS_tag = 'determiner'
        elif tag in {'WP','WP$'}:
            POS_tag = 'pronoun'
        elif tag == 'UH':
            POS_tag = 'interjection'
        elif tag == 'POS':
            POS_tag = 'possessive ending'
        elif tag == 'SYM':
            POS_tag = 'symbol'
        elif tag == 'EX':
            POS_tag = 'existential there'
        elif tag == 'DT':
            POS_tag = 'determiner'
        elif tag == 'MD':
            POS_tag = 'modal'
        elif tag == 'LS':
            POS_tag = 'list item marker'
        elif tag == 'FW':
            POS_tag = 'foreign word'
        elif tag == 'CC':
            POS_tag = 'coordinating conjunction'
        elif tag == 'CD':
            POS_tag = 'cardinal number'
        elif tag == 'TO':
            POS_tag = 'to'
        elif tag == '.':
            POS_tag = 'line ending'
        elif tag == ',':
            POS_tag = 'comma'
        else:
            POS_tag = tag
        POS_tags.append(POS_tag)
        #print(POS_tag)
    return POS_tags

# === 2. Sentiment tagger ===
# Sentiment analyzer based on the NLTK VADER tagger.
# This function uses words as an input. It tags each word based on its sentiment: negative, neutral or positive
def sentiment_tagger(word):
    analyzer = SentimentIntensityAnalyzer()
    score = analyzer.polarity_scores(word).get("compound")
    if score < 0:
        sentiment_tag = 'negative'
    elif score > 0:
        sentiment_tag = 'positive'
    else:
        sentiment_tag = 'neutral'
    return sentiment_tag

# === 3. Stopword tagger ===
# Labels words on being a keyword or a stopword, based on the list in the NLTK corpus
def stopword_tagger(word):
    stopWords = set(stopwords.words('english'))
    if word in stopWords:
        stopword_tag = 'stopword'
    else:
        stopword_tag = 'keyword'
    return stopword_tag


# Run POS tagger
# This tagger outputs a list for all items in the dict at once
# To avoid double work, it is better to keep this outside the for loop
POS_tags = POS_tagger(words)
i = 0

# Adding tags to words in dictionary, which will be exported as a json file
# {'item 0' : {'word' : word, 'tagger 1': value 1}}
for item, value in words_and_tags.items():
    word = words_and_tags[item]['word']

    # POS
    pos_tag = POS_tags[i]
    words_and_tags[item]['POS'] = pos_tag
    i = i + 1

    # Add sentiment tag
    sentiment_tag = sentiment_tagger(word)
    words_and_tags[item]['sentiment'] = sentiment_tag

    # Add stopword tag
    stopword_tag = stopword_tagger(word)
    words_and_tags[item]['wordtype'] = stopword_tag

    # Add entity tag
    # Not functional yet

# Save data into a json file
print(words_and_tags)
#with open("data.json", 'w') as f:
# os.path.join keeps the path correct whether __file__ is relative or absolute
with open(os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), "output/wordtagger/data.json"), 'w') as f:
    json.dump(words_and_tags, f, ensure_ascii=False)

# let's bind it to a jinja2 template
# Jinja moves up one level by default, so I do not need to do it myself as in line 141
template_open = open("src/wordtagger/template.html", "r")
template = Template(template_open.read())
index_render = template.render(words_and_tags=words_and_tags)
#print(text_render)

# And render an html file!
print(index_render)
index_open = open("output/wordtagger/index.html", "w")
index_open.write(index_render)
index_open.close()
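A hedged sketch of the record this script builds per token and of how the rest
of the pipeline consumes it: the values come straight from the three taggers
above, and template.html / script.js read the same keys back as data-*
attributes and CSS classes.

# Illustrative shape only:
# {'item 0': {'word': <token>,
#             'POS': <readable tag, e.g. 'noun' or 'determiner'>,
#             'sentiment': 'negative' | 'neutral' | 'positive',
#             'wordtype': 'stopword' | 'keyword'},
#  'item 1': {...}, ...}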