"""Build keyword and part-of-speech indexes from text files, dumped as JSON.

Usage: python dumper.py FILE [FILE ...]

For each input file this script:
  * (legacy) indexes every sentence under its RAKE key phrases into
    src/index.json, and
  * builds/merges a database (src/database.json) of nouns, verbs, adverbs,
    adjectives, high-scoring RAKE keywords, per-sentence keywords, the raw
    concatenated text, and every distinct non-alphabetic character seen.
"""
import json
import argparse
import sys
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
import nltk
import numpy as np
from rake_nltk import Rake
from textblob import TextBlob

stop_words = set(stopwords.words('english'))
r = Rake()

ap = argparse.ArgumentParser("JSON Dumper")
ap.add_argument("text", nargs="+", help="text sources")
args = ap.parse_args()


def cleanedText(text):
    """Lowercase *text*, keep alphabetic tokens, drop English stopwords.

    Prints and returns the cleaned, space-joined text.
    """
    word_tokens = [w.lower() for w in word_tokenize(text)]
    filtered_sentence = [
        w for w in word_tokens if w.isalpha() and w not in stop_words
    ]
    text = " ".join(filtered_sentence)
    print(text)
    return text


def _load_json(path):
    """Return parsed JSON from *path*, or {} if the file is missing/corrupt.

    The original code crashed with FileNotFoundError on first run because
    open() sat outside its try block; opening inside the try fixes that.
    """
    try:
        with open(path, 'r') as f:
            return json.load(f)
    except (OSError, json.JSONDecodeError):
        return {}


def _index_sentences(text, filename, index):
    """Add each sentence of *text* to *index* under its RAKE key phrases.

    Mutates *index* in place: index[key] is a list of
    {'filename', 'sentence', 'key'} dicts.
    """
    for sentence in sent_tokenize(text):
        r.extract_keywords_from_text(sentence)
        for key in r.get_ranked_phrases():
            index.setdefault(key, []).append(
                {'filename': filename, 'sentence': sentence, 'key': key})


#### INDEX (DEPR)
index = _load_json('src/index.json')

# build the index of sentences organized by keywords
alltext = ""
for n in args.text:
    with open(n) as src:  # context manager: no leaked file handles
        text = src.read().replace("\n", " ")
    _index_sentences(text, n, index)
    alltext += text

with open('src/index.json', 'w') as outfile:
    json.dump(index, outfile)

######## DATABASE
index = _load_json('src/database.json')

nouns = []
verbs = []
adverbs = []
pronouns = ["I", "you", "we", "they"]
adjectives = []
keywords = []
keywordspersentence = {}
alltext = ""
allcharacters = []

for n in args.text:
    with open(n) as src:
        text = src.read()
    rawtext = text.replace("\n", " ")

    # collect every distinct non-alphabetic character seen in the inputs
    for letter in rawtext:
        if letter not in allcharacters and not letter.isalpha():
            allcharacters.append(letter)
    alltext += rawtext

    # POS-tag the stopword-free text and bucket words by tag
    text = cleanedText(text)
    thistext = TextBlob(text)
    print(thistext.tags)
    for key, tag in thistext.tags:
        if tag == "VBP":
            verbs.append(key)
        if tag in ("NN", "NNS"):
            nouns.append(key)
        if tag == "RB":
            adverbs.append(key)
        if tag == "JJ":
            adjectives.append(key)
    print("Verbs: {}".format(verbs))
    print("Nouns: {}".format(nouns))
    print("Adverbs: {}".format(adverbs))
    print("Adjectives: {}".format(adjectives))

    # keywords: keep only RAKE phrases scoring above 2
    r.extract_keywords_from_text(rawtext)
    for score, phrase in r.get_ranked_phrases_with_scores():
        if score > 2:
            keywords.append(phrase)
    print("Keywords: {}".format(keywords))

    # keywordsofsentences (note: indexes the CLEANED text, as before)
    _index_sentences(text, n, keywordspersentence)

print(allcharacters)

if not index:
    # fresh database: store everything, including rawtext
    index["nouns"] = nouns
    index["verbs"] = verbs
    index["pronouns"] = pronouns
    index["adverbs"] = adverbs
    index["adjectives"] = adjectives
    index["keywords"] = keywords
    index["keywordspersentence"] = keywordspersentence
    index["rawtext"] = alltext
    index["chars"] = allcharacters
else:
    # merge into an existing database; .get() guards against databases
    # written by older versions that lack some keys (was a KeyError).
    # rawtext is intentionally not updated here, matching prior behavior.
    _list_keys = {
        "nouns": nouns,
        "verbs": verbs,
        "pronouns": pronouns,
        "adverbs": adverbs,
        "adjectives": adjectives,
        "keywords": keywords,
        "chars": allcharacters,
    }
    for _k, _v in _list_keys.items():
        if not index.get(_k):
            index[_k] = _v
        else:
            index[_k].extend(_v)
    if not index.get("keywordspersentence"):
        index["keywordspersentence"] = keywordspersentence
    else:
        index["keywordspersentence"].update(keywordspersentence)

with open('src/database.json', 'w') as outfile:
    json.dump(index, outfile)