You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
228 lines
7.0 KiB
Python
228 lines
7.0 KiB
Python
# LIBS
|
|
import nltk
|
|
import json
|
|
import os
|
|
from sys import stdin, stdout
|
|
from nltk import pos_tag, word_tokenize
|
|
from nltk.sentiment.vader import SentimentIntensityAnalyzer
|
|
from nltk.corpus import stopwords
|
|
import spacy
|
|
from jinja2 import Template
|
|
|
|
# == INPUT AND TOKENIZE ==

# Read the full text from stdin, tokenize it with NLTK, and store the tokens
# in a dict keyed by token index: {0: {'word': ...}, 1: {'word': ...}, ...}.
# The taggers below add their tags to each entry.

# 'raw_text' instead of 'input' — the original shadowed the built-in input().
raw_text = stdin.read()

words = nltk.word_tokenize(raw_text)

words_and_tags = {index: {'word': word} for index, word in enumerate(words)}

#print(words_and_tags)
|
|
|
# == TAGGING FUNCTIONS ==
|
|
|
|
# === 1. POS_tagger & Named Entity Recognizer ===
|
|
# This function cuts a string into words. Then runs a POS tagger for each word. Returns a list with tags
|
|
# Map Penn Treebank tags to the human-readable part-of-speech names used in
# the output JSON/HTML. Tags not listed here pass through unchanged.
_POS_NAMES = {
    # nouns
    'NNP': 'noun', 'NNS': 'noun', 'NN': 'noun', 'NNPS': 'noun',
    # verbs
    'VB': 'verb', 'VBD': 'verb', 'VBG': 'verb', 'VBN': 'verb',
    'VBP': 'verb', 'VBZ': 'verb',
    # adverbs (RP/particles grouped here, as in the original mapping)
    'RB': 'adverb', 'RBR': 'adverb', 'RBS': 'adverb', 'WRB': 'adverb',
    'RP': 'adverb',
    # pronouns
    'PRP': 'pronoun', 'PRP$': 'pronoun', 'WP': 'pronoun', 'WP$': 'pronoun',
    # adjectives
    'JJ': 'adjective', 'JJR': 'adjective', 'JJS': 'adjective',
    'IN': 'preposition',
    'WDT': 'determiner', 'DT': 'determiner',
    'UH': 'interjection',
    # spelling kept as-is: this string is data consumed downstream
    'POS': 'possesive ending',
    'SYM': 'symbol',
    'EX': 'existential there',
    'MD': 'modal',
    'LS': 'list item marker',
    'FW': 'foreign word',
    # trailing space kept as-is for downstream compatibility
    'CC': 'coordinating conjunction ',
    'CD': 'cardinal number',
    'TO': 'to',
    '.': 'line ending',
    ',': 'comma',
}


def POS_tagger(list):
    """Return a human-readable POS label for each token in *list*.

    Runs the NLTK POS tagger ONCE over the whole token list — the original
    version re-tagged the entire list inside a per-token loop, repeating the
    same work len(list)+1 times — then translates each Penn Treebank tag via
    _POS_NAMES. Unknown tags are returned unchanged (raw Treebank tag).

    Note: the parameter name 'list' shadows the built-in; it is kept for
    backward compatibility with any keyword-argument callers.
    """
    taggedwordlist = nltk.pos_tag(list)
    return [_POS_NAMES.get(tag, tag) for _word, tag in taggedwordlist]
|
|
|
|
# === 2. Sentiment tagger ===
|
|
# Sentiment analyzer based on the NLTK VADER tagger.
|
|
# This function uses words as an input. It tags each word based on its sentiment: negative, neutral or positive
|
|
def sentiment_tagger(list):
    """Label every token in *list* with its VADER sentiment.

    Each token is scored independently with the NLTK VADER analyzer; a
    positive compound score becomes 'positive', a negative one 'negative',
    and exactly zero 'neutral'. Returns one label per input token, in order.
    """
    analyzer = SentimentIntensityAnalyzer()
    labels = []

    for token in list:
        compound = analyzer.polarity_scores(token)["compound"]

        if compound > 0:
            label = 'positive'
        elif compound < 0:
            label = 'negative'
        else:
            label = 'neutral'

        labels.append(label)

    return labels
|
|
|
|
# === 3. Stopword tagger ===
|
|
# Labels words on being a keyword or a stopword, based on the list in the NLTK corpus
|
|
# Build the English stopword set once at module load. The original rebuilt it
# (a corpus read) on every call, and this tagger runs once per token.
_STOP_WORDS = frozenset(stopwords.words('english'))


def stopword_tagger(word):
    """Label a single token as 'stopword' or 'keyword'.

    Membership is tested against the NLTK English stopword corpus
    (case-sensitive, same as the original behavior).
    """
    return 'stopword' if word in _STOP_WORDS else 'keyword'
|
|
|
|
# === 4. Named Entity tagger ===
|
|
# Map spaCy entity types to the human-readable labels used in the output.
# Tokens with an unmapped or empty ent_type_ get 'no entity'.
_NER_LABELS = {
    'PERSON': 'person',           # fixed: was mislabelled 'geographical entity'
    'NORP': 'group',
    'FACILITY': 'facility',       # NOTE(review): newer spaCy models emit 'FAC' — confirm model version
    'ORG': 'organisation',
    'GPE': 'location',
    'LOC': 'location',
    'PRODUCT': 'product',
    'EVENT': 'event',
    'WORK_OF_ART': 'artwork',
    'LAW': 'law',
    'LANGUAGE': 'language',
    'DATE': 'date',
    'TIME': 'time',
    'PERCENT': 'percentage',
    'MONEY': 'monetary value',
    'QUANTITY': 'measurement',
    'ORDINAL': 'ordinal',
    'CARDINAL': 'numeral',
}


def ner_tagger(list):
    """Tag each token with a human-readable named-entity label.

    Joins the NLTK tokens back into one string (spaCy wants plain text),
    runs spaCy's English NER model over it, and maps every resulting token's
    entity type through _NER_LABELS. Tokens outside any entity ('' type) and
    unknown types yield 'no entity'.

    NOTE(review): spaCy re-tokenizes the joined text, so the number of tags
    returned can differ from len(list) if its tokenization disagrees with
    NLTK's — verify alignment against the caller's index-based merge.
    """
    ner_en = spacy.load('en_core_web_sm')  # English model for SpaCy
    #ner_en = spacy.load('xx_ent_wiki_sm') # multilingual model (performance was less than the English model in my use case)

    # Convert NLTK tokens back to a string, so SpaCy can change it to its own format.
    text = " ".join(list)
    ner_spacy_doc = ner_en(text)

    # Debug print removed: this script's stdout should stay clean since it
    # is fed from stdin in a pipeline.
    return [_NER_LABELS.get(word.ent_type_, 'no entity') for word in ner_spacy_doc]
|
|
|
|
|
|
# == RUN TAGGERS AND MERGE RESULTS ==

# These taggers output a list for all items in the dict at once.
# To avoid double work, they are run once here, outside the merge loop.
POS_tags = POS_tagger(words)

sentiment_tags = sentiment_tagger(words)

ner_tags = ner_tagger(words)


# Attach each tag to its word entry, producing
# {item: {'word': w, 'POS': ..., 'sentiment': ..., 'named entity': ..., 'wordtype': ...}}
# enumerate replaces the original hand-maintained counter; indexing into the
# tag lists is kept so any tagger/token count mismatch still fails loudly
# (IndexError) instead of being silently truncated.
for i, (item, entry) in enumerate(words_and_tags.items()):
    # POS
    entry['POS'] = POS_tags[i]

    # Sentiment
    entry['sentiment'] = sentiment_tags[i]

    # Named Entity Recognition
    entry['named entity'] = ner_tags[i]

    # Stopword tagging works per word, so it runs inside the loop.
    entry['wordtype'] = stopword_tagger(entry['word'])
|
|
|
|
|
|
# == SAVE DATA AS JSON ==

# Resolve the output path three directory levels above this file, using
# os.path.join so a separator is always inserted — the original string
# concatenation (dirname(...) + "output/...") dropped the '/' and produced
# a broken path whenever __file__ was absolute. (When run from the repo
# root with a relative __file__, dirname x3 yields '' and both forms agree.)
_project_root = os.path.dirname(os.path.dirname(os.path.dirname(__file__)))
_json_path = os.path.join(_project_root, "output", "reading_structure", "data.json")

with open(_json_path, 'w') as f:
    # ensure_ascii=False keeps non-ASCII words readable in the JSON file.
    json.dump(words_and_tags, f, ensure_ascii=False)
|
|
|
|
# == RENDER HTML ==

# Bind the tagged words to the Jinja2 template and write the result.
# Context managers close both file handles — the original never closed the
# template handle and closed the output handle manually.
with open("src/reading_structure/template.html", "r") as template_file:
    template = Template(template_file.read())

index_render = template.render(words_and_tags=words_and_tags)
#print(index_render)

with open("output/reading_structure/index.html", "w") as index_file:
    index_file.write(index_render)
|