# LIBS
import nltk
import json
import os
from sys import stdin, stdout
from nltk import ne_chunk, pos_tag, word_tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from jinja2 import Template

# The taggers below rely on a few NLTK data packages; on a first run,
# download them once by uncommenting these lines:
#nltk.download('punkt')
#nltk.download('averaged_perceptron_tagger')
#nltk.download('vader_lexicon')
#nltk.download('stopwords')

# == INPUT AND TOKENIZE ==
# Read the input from stdin, tokenize it, and save the tokens to a dictionary,
# using each word's index as its ID.
text = stdin.read()
words = nltk.word_tokenize(text)
words_and_tags = {index: {'word': word} for index, word in enumerate(words)}
print(words_and_tags)

# == FILTER FUNCTIONS ==

# === 1. POS_tagger & Named Entity Recognizer ===
# Runs the NLTK POS tagger over a list of tokens and maps the Penn Treebank
# tags to readable labels. Returns a list with one tag per token.
def POS_tagger(tokens):
    # Tag the whole list once; re-tagging it for every word (as an earlier
    # version of this loop did) is redundant.
    taggedwordlist = nltk.pos_tag(tokens)
    #for word, pos in taggedwordlist:
    #    print('{0} is a {1}'.format(word, pos)) # Uncomment to print the analysis step
    print(taggedwordlist)
    taglist = [pos for word, pos in taggedwordlist]
    POS_tags = []
    for tag in taglist:
        if tag in {'NNP', 'NNS', 'NN', 'NNPS'}:
            POS_tag = 'noun'
        elif tag in {'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'}:
            POS_tag = 'verb'
        elif tag in {'RB', 'RBR', 'RBS', 'WRB'}:
            POS_tag = 'adverb'
        elif tag in {'PRP', 'PRP$', 'WP', 'WP$'}:
            POS_tag = 'pronoun'
        elif tag in {'JJ', 'JJR', 'JJS'}:
            POS_tag = 'adjective'
        elif tag == 'IN':
            POS_tag = 'preposition'
        elif tag in {'DT', 'WDT'}:
            POS_tag = 'determiner'
        elif tag == 'UH':
            POS_tag = 'interjection'
        elif tag == 'POS':
            POS_tag = 'possessive ending'
        elif tag == 'SYM':
            POS_tag = 'symbol'
        elif tag == 'EX':
            POS_tag = 'existential there'
        elif tag == 'MD':
            POS_tag = 'modal'
        elif tag == 'LS':
            POS_tag = 'list item marker'
        elif tag == 'FW':
            POS_tag = 'foreign word'
        elif tag == 'CC':
            POS_tag = 'coordinating conjunction'
        elif tag == 'CD':
            POS_tag = 'cardinal number'
        elif tag == 'TO':
            POS_tag = 'infinitival to'
        elif tag == '.':
            POS_tag = 'line ending'
        elif tag == ',':
            POS_tag = 'comma'
        else:
            POS_tag = tag
        POS_tags.append(POS_tag)
        #print(POS_tag)
    return POS_tags

# === 2. Sentiment tagger ===
# Sentiment analyzer based on the NLTK VADER tagger.
# Takes a list of words as input and tags each word based on its sentiment:
# negative, neutral or positive.
def sentiment_tagger(tokens):
    analyzer = SentimentIntensityAnalyzer()
    sentiment_tags = []
    for word in tokens:
        score = analyzer.polarity_scores(word).get('compound')
        if score < 0:
            sentiment_tag = 'negative'
        elif score > 0:
            sentiment_tag = 'positive'
        else:
            sentiment_tag = 'neutral'
        sentiment_tags.append(sentiment_tag)
    return sentiment_tags
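# A quick illustration of the two taggers above (indicative output only; the
# exact tags depend on the NLTK models installed):
#
#   >>> POS_tagger(['a', 'great', 'disaster'])
#   ['determiner', 'adjective', 'noun']
#   >>> sentiment_tagger(['a', 'great', 'disaster'])
#   ['neutral', 'positive', 'negative']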
# === 3. Stopword tagger ===
# Labels words as a keyword or a stopword, based on the list in the NLTK corpus.
# The stopword set is built once here rather than on every call.
stopWords = set(stopwords.words('english'))

def stopword_tagger(word):
    # Compare case-insensitively, so that e.g. 'The' still counts as a stopword.
    if word.lower() in stopWords:
        stopword_tag = 'stopword'
    else:
        stopword_tag = 'keyword'
    return stopword_tag

# Run the POS and sentiment taggers.
# These taggers output a list for all items in the dict at once;
# to avoid double work, it is better to keep them outside the for loop.
POS_tags = POS_tagger(words)
sentiment_tags = sentiment_tagger(words)

# Add the tags to each word in the dictionary, which will be exported as a json file:
# {0: {'word': word, 'POS': ..., 'sentiment': ..., 'wordtype': ...}}
# The dict keys are the enumerate indices, so they index the tag lists directly.
for index, entry in words_and_tags.items():
    word = entry['word']

    # Add POS tag
    entry['POS'] = POS_tags[index]

    # Add sentiment tag
    entry['sentiment'] = sentiment_tags[index]

    # Add stopword tag
    entry['wordtype'] = stopword_tagger(word)

    # Add entity tag
    # Not functional yet; see the sketch at the end of this file

# Save the data into a json file
print(words_and_tags)
# Climb three levels up from this file to the project root; os.path.join
# supplies the separator the original string concatenation was missing.
#with open("data.json", 'w') as f:
project_root = os.path.dirname(os.path.dirname(os.path.dirname(__file__)))
with open(os.path.join(project_root, "output/reading_structure/data.json"), 'w', encoding='utf-8') as f:
    json.dump(words_and_tags, f, ensure_ascii=False)

# Let's bind it to a jinja2 template.
# These paths are resolved relative to the working directory, so unlike the
# data.json path above there is no need to climb up to the project root here.
with open("src/reading_structure/template.html", "r", encoding='utf-8') as template_open:
    template = Template(template_open.read())
index_render = template.render(words_and_tags=words_and_tags)
#print(index_render)

# And render an html file!
print(index_render)
with open("output/reading_structure/index.html", "w", encoding='utf-8') as index_open:
    index_open.write(index_render)
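# == NOTES ==

# A possible wiring for the entity tagger marked "not functional yet" above,
# using the ne_chunk that is already imported. This is an untested sketch, not
# part of the pipeline, and assumes the 'maxent_ne_chunker' and 'words' NLTK
# packages are also downloaded:
#
#   tree = nltk.ne_chunk(nltk.pos_tag(words))
#   entity_tags = []
#   for node in tree:
#       if isinstance(node, nltk.Tree):   # a recognized entity spanning its leaves
#           entity_tags.extend([node.label()] * len(node))
#       else:                             # a plain (word, tag) pair
#           entity_tags.append('no entity')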
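# The template receives the full words_and_tags dict; a minimal template along
# these lines would work (illustrative markup only, the real template.html in
# the repo may differ):
#
#   {% for id, item in words_and_tags.items() %}
#     <span class="{{ item.POS }} {{ item.sentiment }} {{ item.wordtype }}">{{ item.word }}</span>
#   {% endfor %}
#
# Usage sketch (hypothetical script name; run from the project root so the
# relative paths resolve):
#
#   cat input.txt | python3 src/reading_structure/script.py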