OuNuPo/src/build_database.py

import json
import argparse
import sys
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
import nltk
import numpy as np
from rake_nltk import Rake
from textblob import TextBlob
stop_words = set(stopwords.words('english'))
r= Rake()

ap = argparse.ArgumentParser("JSON Dumper")
ap.add_argument("text", nargs="+", help="text sources")
args=ap.parse_args()

def cleanedText(text):
    word_tokens = word_tokenize(text)
    word_tokens = [word.lower() for word in word_tokens]
    word_tokens = [word for word in word_tokens if word.isalpha()]
    filtered_sentence = [w for w in word_tokens if not w in stop_words]
    text = " ".join(filtered_sentence)
    print(text)
    return text

#### INDEX (DEPR)

with open('src/index.json', 'w') as f:
    try:
        index = json.load(f)
    except:
        index={}
# build the index of sentences organized by keywords
alltext = ""

for n in args.text:
    text = open(n).read()
    text = text.replace("\n", " ")
    sentences = sent_tokenize(text)
    for sentence in sentences:
        r.extract_keywords_from_text(sentence)
        keys = r.get_ranked_phrases()
        for key in keys:
            if key not in index:
                index[key] = []
            index[key].append({'filename': n, 'sentence': sentence, 'key': key})
    alltext += text

#print(index)

with open('src/index.json', 'w') as outfile:
    json.dump(index, outfile)


######## DATABASE

with open('src/database.json', 'w') as f:
    try:
        index = json.load(f)
    except:
        index={}


nouns = []
verbs = []
adverbs = []
pronouns = ["I", "you", "we", "they"]
adjectives = []
keywords = []
keywordspersentence = {}
alltext = ""
allcharacters = []

for n in args.text:
    text = open(n).read()
    rawtext = text.replace("\n", " ")
    #get pos
    for letter in list(rawtext):
        if letter not in allcharacters and not letter.isalpha():
            allcharacters.append(letter)

    alltext += rawtext
    text = cleanedText(text)
    thistext = TextBlob(text)
    words = word_tokenize(text)
    listWithPos = nltk.pos_tag(words)
    print(thistext.tags)
    for key, tag in thistext.tags:
        if(tag == "VBP"):
            verbs.append(key)
        if(tag == "NN" or tag == "NNS"):
            nouns.append(key)
        if(tag == "RB"):
            adverbs.append(key)
        if(tag == "JJ"):
            adjectives.append(key)

    print("Verbs: {}".format(verbs))
    print("Nouns: {}".format(nouns))
    print("Adverbs: {}".format(adverbs))
    print("Adjectives: {}".format(adjectives))
    #keywords
    r.extract_keywords_from_text(rawtext)
    phrases = r.get_ranked_phrases_with_scores()
    for key, phrase in phrases:
        if key > 2:
            keywords.append(phrase)

    print("Keywords: {}".format(keywords))

    # keywordsofsentences
    sentences = sent_tokenize(text)
    for sentence in sentences:

        r.extract_keywords_from_text(sentence)
        keys = r.get_ranked_phrases()
        for key in keys:
            if key not in keywordspersentence:
                keywordspersentence[key] = []
            keywordspersentence[key].append({'filename': n, 'sentence': sentence, 'key': key})

print(allcharacters)
index["nouns"]=nouns
index["verbs"]=verbs
index["pronouns"]=pronouns
index["adverbs"]=adverbs
index["adjectives"]=adjectives
index["keywords"]=keywords
index["keywordspersentence"]=keywordspersentence
index["rawtext"]=alltext
index["chars"] = allcharacters

with open('src/database.json', 'w') as outfile:
    json.dump(index, outfile)
added chatbook and resources 7 years ago			`import json`
			`import argparse`
			`import sys`
			`from nltk.tokenize import sent_tokenize, word_tokenize`
added database and userlearning 7 years ago			`from nltk.corpus import stopwords`
			`import nltk`
			`import numpy as np`
added chatbook and resources 7 years ago			`from rake_nltk import Rake`
added database and userlearning 7 years ago			`from textblob import TextBlob`
			`stop_words = set(stopwords.words('english'))`
added chatbook and resources 7 years ago			`r= Rake()`

			`ap = argparse.ArgumentParser("JSON Dumper")`
			`ap.add_argument("text", nargs="+", help="text sources")`
			`args=ap.parse_args()`

added database and userlearning 7 years ago			`def cleanedText(text):`
			`word_tokens = word_tokenize(text)`
			`word_tokens = [word.lower() for word in word_tokens]`
			`word_tokens = [word for word in word_tokens if word.isalpha()]`
			`filtered_sentence = [w for w in word_tokens if not w in stop_words]`
			`text = " ".join(filtered_sentence)`
			`print(text)`
			`return text`

			`#### INDEX (DEPR)`
added chatbook and resources 7 years ago
added database and userlearning 7 years ago			`with open('src/index.json', 'w') as f:`
added chatbook and resources 7 years ago			`try:`
			`index = json.load(f)`
			`except:`
			`index={}`
			`# build the index of sentences organized by keywords`
			`alltext = ""`

			`for n in args.text:`
			`text = open(n).read()`
			`text = text.replace("\n", " ")`
			`sentences = sent_tokenize(text)`
			`for sentence in sentences:`
			`r.extract_keywords_from_text(sentence)`
			`keys = r.get_ranked_phrases()`
			`for key in keys:`
			`if key not in index:`
			`index[key] = []`
			`index[key].append({'filename': n, 'sentence': sentence, 'key': key})`
			`alltext += text`

			`#print(index)`

fixed index 7 years ago			`with open('src/index.json', 'w') as outfile:`
added chatbook and resources 7 years ago			`json.dump(index, outfile)`
added database and userlearning 7 years ago


			`######## DATABASE`

			`with open('src/database.json', 'w') as f:`
			`try:`
			`index = json.load(f)`
			`except:`
			`index={}`


			`nouns = []`
			`verbs = []`
			`adverbs = []`
			`pronouns = ["I", "you", "we", "they"]`
			`adjectives = []`
			`keywords = []`
			`keywordspersentence = {}`
			`alltext = ""`
			`allcharacters = []`

			`for n in args.text:`
			`text = open(n).read()`
			`rawtext = text.replace("\n", " ")`
			`#get pos`
			`for letter in list(rawtext):`
			`if letter not in allcharacters and not letter.isalpha():`
			`allcharacters.append(letter)`

			`alltext += rawtext`
			`text = cleanedText(text)`
			`thistext = TextBlob(text)`
			`words = word_tokenize(text)`
			`listWithPos = nltk.pos_tag(words)`
			`print(thistext.tags)`
			`for key, tag in thistext.tags:`
			`if(tag == "VBP"):`
			`verbs.append(key)`
			`if(tag == "NN" or tag == "NNS"):`
			`nouns.append(key)`
			`if(tag == "RB"):`
			`adverbs.append(key)`
			`if(tag == "JJ"):`
			`adjectives.append(key)`

			`print("Verbs: {}".format(verbs))`
			`print("Nouns: {}".format(nouns))`
			`print("Adverbs: {}".format(adverbs))`
			`print("Adjectives: {}".format(adjectives))`
			`#keywords`
			`r.extract_keywords_from_text(rawtext)`
			`phrases = r.get_ranked_phrases_with_scores()`
			`for key, phrase in phrases:`
			`if key > 2:`
			`keywords.append(phrase)`

			`print("Keywords: {}".format(keywords))`

			`# keywordsofsentences`
			`sentences = sent_tokenize(text)`
			`for sentence in sentences:`

			`r.extract_keywords_from_text(sentence)`
			`keys = r.get_ranked_phrases()`
			`for key in keys:`
			`if key not in keywordspersentence:`
			`keywordspersentence[key] = []`
			`keywordspersentence[key].append({'filename': n, 'sentence': sentence, 'key': key})`

			`print(allcharacters)`
			`index["nouns"]=nouns`
			`index["verbs"]=verbs`
			`index["pronouns"]=pronouns`
			`index["adverbs"]=adverbs`
			`index["adjectives"]=adjectives`
			`index["keywords"]=keywords`
			`index["keywordspersentence"]=keywordspersentence`
			`index["rawtext"]=alltext`
			`index["chars"] = allcharacters`

			`with open('src/database.json', 'w') as outfile:`
			`json.dump(index, outfile)`