# OuNuPo/src/reading_structure/reading_structure.py

# LIBS
import nltk
import json
import os
from sys import stdin, stdout
from nltk import ne_chunk, pos_tag, word_tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from jinja2 import Template

# == INPUT AND TOKENIZE ==
# Read the complete text from standard input, split it into tokens and save
# the tokens in a dictionary. The position of a token in the text is its ID.
input = stdin.read()
words = word_tokenize(input)
words_and_tags = {
    position: {'word': token}
    for position, token in enumerate(words)
}
print(words_and_tags)

# == FILTER FUNCTIONS ==

# === 1. POS_tagger & Named Entity Recognizer ===
# This function takes a list of words (already tokenized) and runs the NLTK POS tagger on it. Returns a list with one readable tag name per word.
# Readable label for each Penn Treebank tag returned by nltk.pos_tag.
# The label strings are consumed downstream (JSON / template), so they are
# kept exactly as they were, including spelling and trailing whitespace.
_POS_LABELS = {
    'NNP': 'noun',
    'NNS': 'noun',
    'NN': 'noun',
    'NNPS': 'noun',
    'VB': 'verb',
    'VBD': 'verb',
    'VBG': 'verb',
    'VBN': 'verb',
    'VBP': 'verb',
    'VBZ': 'verb',
    'RB': 'adverb',
    'RBR': 'adverb',
    'RBS': 'adverb',
    'WRB': 'adverb',
    'PRP': 'pronoun',
    'PRP$': 'pronoun',
    'JJ': 'adjective',
    'JJR': 'adjective',
    'JJS': 'adjective',
    'IN': 'preposition',
    'WDT': 'determiner',
    'WP': 'pronoun',
    'WP$': 'pronoun',
    'UH': 'interjection',
    'POS': 'possesive ending',
    'SYM': 'symbol',
    'EX': 'existential there',
    'DT': 'determiner',
    'MD': 'modal',
    'LS': 'list item marker',
    'FW': 'foreign word',
    'CC': 'coordinating conjunction ',
    'CD': 'cardinal number',
    'TO': 'infinitival to',
    '.': 'line ending',
    ',': 'comma',
}


def POS_tagger(list):
    """Return a readable part-of-speech label for every token in *list*.

    Args:
        list: sequence of word tokens (strings) to tag. The name shadows the
            builtin but is kept so existing callers keep working.

    Returns:
        A new list with one label per token, in the same order. Tags that
        have no entry in ``_POS_LABELS`` are returned unchanged.
    """
    # Tag the whole token list exactly once. The tagger already handles the
    # full sequence, so re-running it (and printing the result) per token
    # only produced quadratic work and debug noise on stdout.
    tagged_words = nltk.pos_tag(list)
    return [_POS_LABELS.get(tag, tag) for _word, tag in tagged_words]

# === 2. Sentiment tagger ===
# Sentiment analyzer based on the NLTK VADER tagger.
# This function uses words as an input. It tags each word based on its sentiment: negative, neutral or positive
def sentiment_tagger(list):
    """Label every word in *list* as 'negative', 'positive' or 'neutral'.

    Each word is scored on its own with the NLTK VADER analyzer; the sign of
    the "compound" score decides the label (zero means neutral).

    Args:
        list: sequence of words (strings) to score.

    Returns:
        A list of labels, one per word, in input order.
    """
    vader = SentimentIntensityAnalyzer()

    def label_for(compound):
        # Map the sign of the compound score to its label.
        if compound < 0:
            return 'negative'
        if compound > 0:
            return 'positive'
        return 'neutral'

    return [
        label_for(vader.polarity_scores(token).get("compound"))
        for token in list
    ]

# === 3. Stopword tagger ===
# Labels words on being a keyword or a stopword, based on the list in the NLTK corpus
# Cache for the NLTK English stopword list. It is filled on first use so the
# corpus is read once instead of once per tagged word.
_ENGLISH_STOPWORDS = None


def stopword_tagger(word):
    """Label *word* as a 'stopword' or a 'keyword'.

    Args:
        word: a single token (string).

    Returns:
        'stopword' if the word is in the NLTK English stopword list
        (compared case-insensitively), otherwise 'keyword'.
    """
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))

    # The corpus entries are lowercase, so a capitalised token such as "The"
    # or "I" must be lowercased before the lookup to be recognised.
    if word.lower() in _ENGLISH_STOPWORDS:
        return 'stopword'
    return 'keyword'


# Run the list-based taggers
# Both taggers take the complete word list and return one label per word,
# so they are called a single time here rather than inside the loop below.
POS_tags = POS_tagger(words)
sentiment_tags = sentiment_tagger(words)

# Add the labels to every entry of the dictionary that is exported as json:
# {0: {'word': word, 'POS': ..., 'sentiment': ..., 'wordtype': ...}, ...}
for position, entry in enumerate(words_and_tags.values()):
    # POS and sentiment labels are looked up by the word's position
    entry['POS'] = POS_tags[position]
    entry['sentiment'] = sentiment_tags[position]

    # The stopword tagger works on one word at a time
    entry['wordtype'] = stopword_tagger(entry['word'])

    # Entity tag: not functional yet

# Save data into a json file
print(words_and_tags)

# Every path is resolved against the project root, i.e. three directory
# levels above this script (<root>/src/reading_structure/<script>). Using
# abspath + os.path.join keeps the paths valid regardless of how the script
# was invoked; plain string concatenation dropped the path separator whenever
# the root was not the empty string.
project_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
data_path = os.path.join(project_root, "output", "reading_structure", "data.json")
template_path = os.path.join(project_root, "src", "reading_structure", "template.html")
index_path = os.path.join(project_root, "output", "reading_structure", "index.html")

# ensure_ascii=False writes non-ASCII characters verbatim, so the file is
# opened as UTF-8 explicitly instead of relying on the locale default.
with open(data_path, 'w', encoding="utf-8") as f:
    json.dump(words_and_tags, f, ensure_ascii=False)

# Bind the data to the jinja2 template. The template file is closed as soon
# as its text has been read.
with open(template_path, "r", encoding="utf-8") as template_open:
    template = Template(template_open.read())
index_render = template.render(words_and_tags=words_and_tags)

# And render an html file!
print(index_render)
with open(index_path, "w", encoding="utf-8") as index_open:
    index_open.write(index_render)