You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
39 lines
939 B
Python
39 lines
939 B
Python
import json
|
|
import argparse
|
|
import sys
|
|
from nltk.tokenize import sent_tokenize, word_tokenize
|
|
from rake_nltk import Rake
|
|
|
|
# Location of the persistent keyword index; read at start-up and rewritten
# at the end of every run.
INDEX_PATH = 'src/index.json'


def _load_index(path):
    """Return the keyword index stored at *path*, or ``{}`` if unusable.

    A missing file (e.g. the very first run) and a file whose content is
    not valid JSON both yield an empty index. Other I/O errors, such as a
    permission problem, still propagate to the caller.
    """
    try:
        with open(path) as f:
            return json.load(f)
    except (FileNotFoundError, ValueError):
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        return {}


r = Rake()

ap = argparse.ArgumentParser("JSON Dumper")
ap.add_argument("text", nargs="+", help="text sources")
args = ap.parse_args()

index = _load_index(INDEX_PATH)

# build the index of sentences organized by keywords
# NOTE: entries are appended to whatever was loaded from INDEX_PATH, so
# running the script again on the same file adds the same entries again.
texts = []
for n in args.text:
    with open(n) as source:
        text = source.read()
    # Flatten line breaks so that sentences wrapped across lines are
    # handed to the tokenizer as a single line of text.
    text = text.replace("\n", " ")

    for sentence in sent_tokenize(text):
        r.extract_keywords_from_text(sentence)
        for key in r.get_ranked_phrases():
            index.setdefault(key, []).append(
                {'filename': n, 'sentence': sentence, 'key': key}
            )

    texts.append(text)

# Concatenation of every processed input text (newlines already replaced).
alltext = "".join(texts)

with open(INDEX_PATH, 'w') as outfile:
    json.dump(index, outfile)
|