import json
import argparse
import sys
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
import nltk
from rake_nltk import Rake
from textblob import TextBlob
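# NOTE: the tokenizers, stopword list, and POS tagger below rely on NLTK
# data packages being present (typically 'punkt', 'stopwords', and
# 'averaged_perceptron_tagger'), e.g. fetched once via nltk.download().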
stop_words = set(stopwords.words('english'))
r = Rake()

ap = argparse.ArgumentParser(description="JSON Dumper")
ap.add_argument("text", nargs="+", help="text sources")
args = ap.parse_args()
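# Example invocation (script name hypothetical):
#   python dumper.py chapter1.txt chapter2.txt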
def cleanedText(text):
    """Lowercase the text, drop non-alphabetic tokens, and remove stopwords."""
    word_tokens = word_tokenize(text)
    word_tokens = [word.lower() for word in word_tokens]
    word_tokens = [word for word in word_tokens if word.isalpha()]
    filtered_sentence = [w for w in word_tokens if w not in stop_words]
    text = " ".join(filtered_sentence)
    print(text)
    return text
#### INDEX (DEPR)
# Fall back to an empty index if the file is missing or holds invalid JSON.
try:
    with open('src/index.json', 'r') as f:
        index = json.load(f)
except (FileNotFoundError, json.JSONDecodeError):
    index = {}
# build the index of sentences organized by keywords
alltext = ""
for n in args.text:
    with open(n) as f:
        text = f.read()
    text = text.replace("\n", " ")
    sentences = sent_tokenize(text)
    for sentence in sentences:
        # every RAKE phrase found in the sentence becomes an index key
        r.extract_keywords_from_text(sentence)
        keys = r.get_ranked_phrases()
        for key in keys:
            if key not in index:
                index[key] = []
            index[key].append({'filename': n, 'sentence': sentence, 'key': key})
    alltext += text
#print(index)
with open('src/index.json', 'w') as outfile:
    json.dump(index, outfile)
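# index.json now maps each keyword phrase to a list of
# {'filename', 'sentence', 'key'} records for the sentences it appeared in.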
######## DATABASE
# Same fallback as above: start from an empty database if none exists yet.
try:
    with open('src/database.json') as f:
        index = json.load(f)
except (FileNotFoundError, json.JSONDecodeError):
    index = {}
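# Per-run accumulators for the database fields (pronouns is a fixed seed list).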
nouns = []
verbs = []
adverbs = []
pronouns = ["I", "you", "we", "they"]
adjectives = []
keywords = []
keywordspersentence = {}
alltext = ""
allcharacters = []
for n in args.text:
    with open(n) as f:
        text = f.read()
    rawtext = text.replace("\n", " ")
    # collect every distinct non-alphabetic character (punctuation, digits, spaces)
    for letter in rawtext:
        if letter not in allcharacters and not letter.isalpha():
            allcharacters.append(letter)
    alltext += rawtext
    # part-of-speech buckets, using TextBlob's Penn Treebank tags
    text = cleanedText(text)
    thistext = TextBlob(text)
    print(thistext.tags)
    for key, tag in thistext.tags:
        if tag == "VBP":
            verbs.append(key)
        if tag in ("NN", "NNS"):
            nouns.append(key)
        if tag == "RB":
            adverbs.append(key)
        if tag == "JJ":
            adjectives.append(key)
    print("Verbs: {}".format(verbs))
    print("Nouns: {}".format(nouns))
    print("Adverbs: {}".format(adverbs))
    print("Adjectives: {}".format(adjectives))
    # keywords: keep RAKE phrases whose score is above 2
    r.extract_keywords_from_text(rawtext)
    phrases = r.get_ranked_phrases_with_scores()
    for score, phrase in phrases:
        if score > 2:
            keywords.append(phrase)
    print("Keywords: {}".format(keywords))
    # keywords per sentence, as in the deprecated index above; split the raw
    # text, since cleaning strips the punctuation sentence splitting needs
    sentences = sent_tokenize(rawtext)
    for sentence in sentences:
        r.extract_keywords_from_text(sentence)
        keys = r.get_ranked_phrases()
        for key in keys:
            if key not in keywordspersentence:
                keywordspersentence[key] = []
            keywordspersentence[key].append({'filename': n, 'sentence': sentence, 'key': key})
print(allcharacters)
if not index:
    # first run: create the database wholesale
    index["nouns"] = nouns
    index["verbs"] = verbs
    index["pronouns"] = pronouns
    index["adverbs"] = adverbs
    index["adjectives"] = adjectives
    index["keywords"] = keywords
    index["keywordspersentence"] = keywordspersentence
    index["rawtext"] = alltext
    index["chars"] = allcharacters
else:
    # later runs: merge the new results into the existing database
    for field, new in (
        ("nouns", nouns),
        ("verbs", verbs),
        ("pronouns", pronouns),
        ("adverbs", adverbs),
        ("adjectives", adjectives),
        ("keywords", keywords),
        ("chars", allcharacters),
    ):
        index.setdefault(field, []).extend(new)
    index.setdefault("keywordspersentence", {}).update(keywordspersentence)
    index["rawtext"] = index.get("rawtext", "") + alltext
with open('src/database.json', 'w') as outfile:
    json.dump(index, outfile)
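# Resulting database.json shape (illustrative):
#   {"nouns": [...], "verbs": [...], "pronouns": [...], "adverbs": [...],
#    "adjectives": [...], "keywords": [...],
#    "keywordspersentence": {phrase: [{"filename", "sentence", "key"}, ...]},
#    "rawtext": "...", "chars": [...]}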