# NOTE: the lines below are scrape artifacts from the repository web UI
# (topic-limit notice and file metadata), commented out so the file parses:
#   "You cannot select more than 25 topics. Topics must start with a letter or
#    number, can include dashes ('-') and can be up to 35 characters long."
#   136 lines / 3.5 KiB / Python
import json
import argparse
import sys
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
import nltk
import numpy as np
from rake_nltk import Rake
from textblob import TextBlob
# Module-level setup shared by the index and database passes below.
stop_words = set(stopwords.words('english'))  # English stopword set for cleanedText()
r = Rake()  # RAKE keyword extractor, reused for every text/sentence

# CLI: one or more text files to process.
# NOTE: the original passed "JSON Dumper" positionally, which argparse treats
# as `prog` (the program name in usage output); `description` is what was meant.
ap = argparse.ArgumentParser(description="JSON Dumper")
ap.add_argument("text", nargs="+", help="text sources")
args = ap.parse_args()
def cleanedText(text):
    """Lowercase, strip non-alphabetic tokens and stopwords from *text*.

    Tokenizes with nltk's word_tokenize, keeps only purely alphabetic
    tokens, drops English stopwords (module-level ``stop_words``), and
    rejoins the survivors with single spaces.

    Args:
        text: raw input string.

    Returns:
        The cleaned, space-joined string (also echoed to stdout —
        debug output kept from the original).
    """
    word_tokens = word_tokenize(text)
    word_tokens = [word.lower() for word in word_tokens]
    word_tokens = [word for word in word_tokens if word.isalpha()]
    # `w not in` is the idiomatic form of the original `not w in`.
    filtered_sentence = [w for w in word_tokens if w not in stop_words]
    text = " ".join(filtered_sentence)
    print(text)
    return text
#### INDEX (DEPR)
# Load any existing index so new entries accumulate across runs.
# BUG FIX: the original opened the file with mode 'w', which truncates it
# *before* json.load() runs — the load could never succeed and the previous
# index was silently destroyed. Open for reading and handle a missing or
# corrupt file explicitly instead of with a bare `except:`.
try:
    with open('src/index.json') as f:
        index = json.load(f)
except (FileNotFoundError, json.JSONDecodeError):
    index = {}

# Build the index of sentences organized by RAKE keyword phrase.
alltext = ""
for n in args.text:
    with open(n) as src:
        text = src.read()
    text = text.replace("\n", " ")
    sentences = sent_tokenize(text)
    for sentence in sentences:
        r.extract_keywords_from_text(sentence)
        keys = r.get_ranked_phrases()
        for key in keys:
            # One index bucket per phrase; each entry records where it came from.
            index.setdefault(key, []).append(
                {'filename': n, 'sentence': sentence, 'key': key})
    alltext += text
#print(index)
with open('src/index.json', 'w') as outfile:
    json.dump(index, outfile)
######## DATABASE
# Load the existing database so this run extends it.
# BUG FIX: as with the index above, the original opened with mode 'w'
# (truncating the file) before attempting json.load(); open read-only and
# catch the two specific failure modes instead of a bare `except:`.
try:
    with open('src/database.json') as f:
        index = json.load(f)
except (FileNotFoundError, json.JSONDecodeError):
    index = {}
# Accumulators filled by the per-file loop below and written into the
# database JSON at the end of the script.
nouns = []        # NN / NNS tagged words
verbs = []        # VBP tagged words
adverbs = []      # RB tagged words
pronouns = ["I", "you", "we", "they"]  # fixed list — never extended by the loop
adjectives = []   # JJ tagged words
keywords = []     # RAKE phrases whose score exceeds 2
keywordspersentence = {}  # phrase -> list of {filename, sentence, key} records
alltext = ""      # concatenation of every input file (newlines -> spaces)
allcharacters = []  # distinct non-alphabetic characters seen across inputs
# Per-file pass: collect POS-tagged word lists, high-scoring RAKE keywords,
# per-sentence keyword records, distinct punctuation characters, and the
# concatenated raw text. (Indentation reconstructed from the flattened paste.)
for n in args.text:
    with open(n) as src:
        text = src.read()
    rawtext = text.replace("\n", " ")
    # Collect every distinct non-alphabetic character (punctuation, digits, space).
    # No list() wrapper needed — strings iterate per character.
    for letter in rawtext:
        if letter not in allcharacters and not letter.isalpha():
            allcharacters.append(letter)
    alltext += rawtext
    # POS tagging runs on the cleaned (lowercased, stopword-free) text.
    text = cleanedText(text)
    thistext = TextBlob(text)
    # Removed: `words = word_tokenize(text)` / `listWithPos = nltk.pos_tag(words)`
    # — both were computed and never used.
    print(thistext.tags)
    for word, tag in thistext.tags:
        if tag == "VBP":
            verbs.append(word)
        if tag in ("NN", "NNS"):
            nouns.append(word)
        if tag == "RB":
            adverbs.append(word)
        if tag == "JJ":
            adjectives.append(word)
    print("Verbs: {}".format(verbs))
    print("Nouns: {}".format(nouns))
    print("Adverbs: {}".format(adverbs))
    print("Adjectives: {}".format(adjectives))
    # Keywords: RAKE on the raw text; keep phrases scoring above 2.
    # (get_ranked_phrases_with_scores yields (score, phrase) pairs — the
    # original misleadingly named the score `key`.)
    r.extract_keywords_from_text(rawtext)
    phrases = r.get_ranked_phrases_with_scores()
    for score, phrase in phrases:
        if score > 2:
            keywords.append(phrase)
    print("Keywords: {}".format(keywords))
    # Keywords of sentences: bucket each cleaned sentence under its RAKE phrases.
    sentences = sent_tokenize(text)
    for sentence in sentences:
        r.extract_keywords_from_text(sentence)
        keys = r.get_ranked_phrases()
        for key in keys:
            keywordspersentence.setdefault(key, []).append(
                {'filename': n, 'sentence': sentence, 'key': key})
# Assemble the database record and persist it as JSON.
print(allcharacters)
index.update({
    "nouns": nouns,
    "verbs": verbs,
    "pronouns": pronouns,
    "adverbs": adverbs,
    "adjectives": adjectives,
    "keywords": keywords,
    "keywordspersentence": keywordspersentence,
    "rawtext": alltext,
    "chars": allcharacters,
})
with open('src/database.json', 'w') as outfile:
    json.dump(index, outfile)