You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

39 lines
935 B
Python

import json
import argparse
import sys
from nltk.tokenize import sent_tokenize, word_tokenize
from rake_nltk import Rake
r= Rake()
ap = argparse.ArgumentParser("JSON Dumper")
ap.add_argument("text", nargs="+", help="text sources")
args=ap.parse_args()
with open('src/index.json') as f:
try:
index = json.load(f)
except:
index={}
# build the index of sentences organized by keywords
alltext = ""
for n in args.text:
text = open(n).read()
text = text.replace("\n", " ")
sentences = sent_tokenize(text)
for sentence in sentences:
r.extract_keywords_from_text(sentence)
keys = r.get_ranked_phrases()
for key in keys:
if key not in index:
index[key] = []
index[key].append({'filename': n, 'sentence': sentence, 'key': key})
alltext += text
#print(index)
with open('index.json', 'w') as outfile:
json.dump(index, outfile)