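"""Build JSON indexes from plain-text sources.

Reads the text files given on the command line and writes two files:
`src/index.json` (RAKE keyword phrases mapped to the sentences they occur in)
and `src/database.json` (part-of-speech word lists, keywords, keywords per
sentence, the concatenated raw text, and the non-alphabetic characters found
in it).
"""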
import argparse
import json
import sys

import nltk
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from rake_nltk import Rake
from textblob import TextBlob
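# The tokenizers, stopword list, and POS tagger below rely on NLTK corpora
# that are not shipped with the library itself. This optional guard fetches
# them on first run; drop it if the data is already installed locally.
for resource in ("punkt", "stopwords", "averaged_perceptron_tagger"):
    nltk.download(resource, quiet=True)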
stop_words = set(stopwords.words('english'))
r = Rake()

ap = argparse.ArgumentParser("JSON Dumper")
ap.add_argument("text", nargs="+", help="text sources")
args = ap.parse_args()
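# Typical invocation (script and source filenames are placeholders):
#   python dumper.py chapter1.txt chapter2.txt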
def cleanedText(text):
    """Lowercase the text, keep only alphabetic tokens, and drop stopwords."""
    word_tokens = word_tokenize(text)
    word_tokens = [word.lower() for word in word_tokens]
    word_tokens = [word for word in word_tokens if word.isalpha()]
    filtered_sentence = [w for w in word_tokens if w not in stop_words]
    text = " ".join(filtered_sentence)
    print(text)
    return text

#### INDEX (DEPR)
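# index.json maps each RAKE keyword phrase to the sentences (and source
# filenames) in which that phrase was found. Marked (DEPR) above but still
# written out.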
# Load the existing index if there is one; otherwise start from an empty dict.
try:
    with open('src/index.json') as f:
        index = json.load(f)
except (FileNotFoundError, json.JSONDecodeError):
    index = {}

# build the index of sentences organized by keywords
alltext = ""
for n in args.text:
    with open(n) as source:
        text = source.read()
    text = text.replace("\n", " ")
    sentences = sent_tokenize(text)
    for sentence in sentences:
        # every RAKE phrase in the sentence becomes (or extends) an index entry
        r.extract_keywords_from_text(sentence)
        keys = r.get_ranked_phrases()
        for key in keys:
            if key not in index:
                index[key] = []
            index[key].append({'filename': n, 'sentence': sentence, 'key': key})
    alltext += text

#print(index)
with open('src/index.json', 'w') as outfile:
    json.dump(index, outfile)

######## DATABASE
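# database.json aggregates everything extracted below: part-of-speech word
# lists, pronouns, RAKE keywords, keywords per sentence, the concatenated raw
# text, and every non-alphabetic character encountered.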
# Load the existing database if there is one; otherwise start from an empty dict.
try:
    with open('src/database.json') as f:
        index = json.load(f)
except (FileNotFoundError, json.JSONDecodeError):
    index = {}

nouns = []
verbs = []
adverbs = []
pronouns = ["I", "you", "we", "they"]
adjectives = []
keywords = []
keywordspersentence = {}
alltext = ""
allcharacters = []

for n in args.text:
    with open(n) as source:
        text = source.read()
    rawtext = text.replace("\n", " ")

    # record every non-alphabetic character that appears in the raw text
    for letter in list(rawtext):
        if letter not in allcharacters and not letter.isalpha():
            allcharacters.append(letter)

    alltext += rawtext

    # get pos: clean the text, then tag it with TextBlob/NLTK
    text = cleanedText(text)
    thistext = TextBlob(text)
    words = word_tokenize(text)
    listWithPos = nltk.pos_tag(words)
    print(thistext.tags)
    # bucket words by part-of-speech tag
    for key, tag in thistext.tags:
        if tag == "VBP":
            verbs.append(key)
        if tag == "NN" or tag == "NNS":
            nouns.append(key)
        if tag == "RB":
            adverbs.append(key)
        if tag == "JJ":
            adjectives.append(key)

print("Verbs: {}".format(verbs))
|
|
|
|
print("Nouns: {}".format(nouns))
|
|
|
|
print("Adverbs: {}".format(adverbs))
|
|
|
|
print("Adjectives: {}".format(adjectives))
|
|
|
|
    # keywords: keep RAKE phrases that score above 2
    r.extract_keywords_from_text(rawtext)
    phrases = r.get_ranked_phrases_with_scores()
    for score, phrase in phrases:
        if score > 2:
            keywords.append(phrase)

    print("Keywords: {}".format(keywords))

    # keywords per sentence (mirrors the index built above, kept in the database)
    sentences = sent_tokenize(rawtext)
    for sentence in sentences:
        r.extract_keywords_from_text(sentence)
        keys = r.get_ranked_phrases()
        for key in keys:
            if key not in keywordspersentence:
                keywordspersentence[key] = []
            keywordspersentence[key].append({'filename': n, 'sentence': sentence, 'key': key})

print(allcharacters)

index["nouns"] = nouns
index["verbs"] = verbs
index["pronouns"] = pronouns
index["adverbs"] = adverbs
index["adjectives"] = adjectives
index["keywords"] = keywords
index["keywordspersentence"] = keywordspersentence
index["rawtext"] = alltext
index["chars"] = allcharacters

with open('src/database.json', 'w') as outfile:
    json.dump(index, outfile)