@@ -27,6 +27,7 @@ from textblob import TextBlob
 from textblob_aptagger import PerceptronTagger
 import string
 import re
+import json
 import time
 import sys; from PIL import Image; import numpy as np
 chars = np.asarray(list(' .,:;irsXA253hMHGS#9B&@'))
@@ -158,7 +159,7 @@ class HelloBot(irc.bot.SingleServerIRCBot):
         self.poem = []
         self.keys = list(map(lambda x:re.compile(x[0], re.IGNORECASE),basicDialog))
         self.values = list(map(lambda x:x[1],basicDialog))
-        self.POSSIBLE_STRUCTURES=[["adjective", "noun", "adverb", "verb", "adjective", "noun"], ["pronoun", "verb", "noun"]]
+        self.POSSIBLE_STRUCTURES=[["adjective", "noun", "adverb", "verb", "adjective", "noun"], ["pronoun", "verb", "adverb"]]
         self.structure = random.choice(self.POSSIBLE_STRUCTURES)
 
     def translate(self,str,dict):
@@ -455,10 +456,15 @@ class HelloBot(irc.bot.SingleServerIRCBot):
         else:
             self.sentence.append(incoming_msg)
             self.lastPOS +=1
+            print(self.waitingfor + " " + incoming_msg)
+            self.index[self.waitingfor + "s"].append(incoming_msg)
+            with open('src/database.json', 'w') as outfile:
+                json.dump(self.index, outfile)
             self.generateSentence(c,e)
 
 
+
 
 def cleanedText(text):
     word_tokens = word_tokenize(text)
     word_tokens = [word.lower() for word in word_tokens]
@ -482,77 +488,13 @@ if __name__ == "__main__":
|
|
|
|
ap.add_argument("--text", nargs="+", help="text sources")
|
|
|
|
ap.add_argument("--text", nargs="+", help="text sources")
|
|
|
|
args=ap.parse_args()
|
|
|
|
args=ap.parse_args()
|
|
|
|
|
|
|
|
|
|
|
|
# build the index
|
|
|
|
with open("src/database.json") as f:
|
|
|
|
index = {}
|
|
|
|
try:
|
|
|
|
|
|
|
|
index = json.load(f)
|
|
|
|
nouns = []
|
|
|
|
except:
|
|
|
|
verbs = []
|
|
|
|
print("I can't work with no knowledge")
|
|
|
|
adverbs = []
|
|
|
|
sys.exit()
|
|
|
|
pronouns = ["I", "you", "we", "they"]
|
|
|
|
|
|
|
|
adjectives = []
|
|
|
|
|
|
|
|
keywords = []
|
|
|
|
|
|
|
|
keywordspersentence = {}
|
|
|
|
|
|
|
|
alltext = ""
|
|
|
|
|
|
|
|
allcharacters = []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
n = "ocr/output.txt"
|
|
|
|
|
|
|
|
text = open(n).read()
|
|
|
|
|
|
|
|
rawtext = text.replace("\n", " ")
|
|
|
|
|
|
|
|
#get pos
|
|
|
|
|
|
|
|
for letter in list(rawtext):
|
|
|
|
|
|
|
|
if letter not in allcharacters and not letter.isalpha():
|
|
|
|
|
|
|
|
allcharacters.append(letter)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
alltext += rawtext
|
|
|
|
|
|
|
|
text = cleanedText(text)
|
|
|
|
|
|
|
|
thistext = TextBlob(text)
|
|
|
|
|
|
|
|
words = word_tokenize(text)
|
|
|
|
|
|
|
|
listWithPos = nltk.pos_tag(words)
|
|
|
|
|
|
|
|
print(thistext.tags)
|
|
|
|
|
|
|
|
for key, tag in thistext.tags:
|
|
|
|
|
|
|
|
if(tag == "VBP"):
|
|
|
|
|
|
|
|
verbs.append(key)
|
|
|
|
|
|
|
|
if(tag == "NN" or tag == "NNS"):
|
|
|
|
|
|
|
|
nouns.append(key)
|
|
|
|
|
|
|
|
if(tag == "RB"):
|
|
|
|
|
|
|
|
adverbs.append(key)
|
|
|
|
|
|
|
|
if(tag == "JJ"):
|
|
|
|
|
|
|
|
adjectives.append(key)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
print("Verbs: {}".format(verbs))
|
|
|
|
|
|
|
|
print("Nouns: {}".format(nouns))
|
|
|
|
|
|
|
|
print("Adverbs: {}".format(adverbs))
|
|
|
|
|
|
|
|
print("Adjectives: {}".format(adjectives))
|
|
|
|
|
|
|
|
#keywords
|
|
|
|
|
|
|
|
r.extract_keywords_from_text(rawtext)
|
|
|
|
|
|
|
|
phrases = r.get_ranked_phrases_with_scores()
|
|
|
|
|
|
|
|
for key, phrase in phrases:
|
|
|
|
|
|
|
|
if key > 2:
|
|
|
|
|
|
|
|
keywords.append(phrase)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
print("Keywords: {}".format(keywords))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# keywordsofsentences
|
|
|
|
|
|
|
|
sentences = sent_tokenize(text)
|
|
|
|
|
|
|
|
for sentence in sentences:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
r.extract_keywords_from_text(sentence)
|
|
|
|
|
|
|
|
keys = r.get_ranked_phrases()
|
|
|
|
|
|
|
|
for key in keys:
|
|
|
|
|
|
|
|
if key not in keywordspersentence:
|
|
|
|
|
|
|
|
keywordspersentence[key] = []
|
|
|
|
|
|
|
|
keywordspersentence[key].append({'filename': n, 'sentence': sentence, 'key': key})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
print(allcharacters)
|
|
|
|
|
|
|
|
index["nouns"]=nouns
|
|
|
|
|
|
|
|
index["verbs"]=verbs
|
|
|
|
|
|
|
|
index["pronouns"]=pronouns
|
|
|
|
|
|
|
|
index["adverbs"]=adverbs
|
|
|
|
|
|
|
|
index["adjectives"]=adjectives
|
|
|
|
|
|
|
|
index["keywords"]=keywords
|
|
|
|
|
|
|
|
index["keywordspersentence"]=keywordspersentence
|
|
|
|
|
|
|
|
index["rawtext"]=alltext
|
|
|
|
|
|
|
|
chars = np.asarray(allcharacters)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
chars = np.asarray(index["chars"])
|
|
|
|
bot = HelloBot(args.channel, args.nickname, args.server, args.port, index)
|
|
|
|
bot = HelloBot(args.channel, args.nickname, args.server, args.port, index)
|
|
|
|
bot.start()
|
|
|
|
bot.start()
|
|
|
|
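
For reference, a minimal sketch of the index structure that src/database.json is assumed to hold, assembled only from the keys this diff reads and writes (index["nouns"] through index["rawtext"], plus index["chars"], which the new startup code reads but no line shown here writes). All sample values are hypothetical placeholders:

    # Sketch only: the shape src/database.json is assumed to have.
    # Sample values are illustrative, not taken from any real corpus.
    index = {
        "nouns": ["poem"],                       # filled from NN/NNS tags
        "verbs": ["write"],                      # filled from VBP tags
        "pronouns": ["I", "you", "we", "they"],
        "adverbs": ["slowly"],                   # filled from RB tags
        "adjectives": ["strange"],               # filled from JJ tags
        "keywords": ["ranked phrase"],           # phrases with score > 2
        "keywordspersentence": {
            "ranked phrase": [{"filename": "ocr/output.txt",
                               "sentence": "A sample sentence.",
                               "key": "ranked phrase"}]
        },
        "rawtext": "A sample sentence.",
        "chars": [" ", ".", ",", ":"],           # consumed by np.asarray(index["chars"])
    }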