added chatbook and resources
parent bf80ff5c16
commit 08ff1142f9
File diff suppressed because one or more lines are too long
@@ -0,0 +1,38 @@
import json
import argparse
import sys

from nltk.tokenize import sent_tokenize, word_tokenize
from rake_nltk import Rake

r = Rake()

ap = argparse.ArgumentParser("JSON Dumper")
ap.add_argument("text", nargs="+", help="text sources")
args = ap.parse_args()

# load an existing index if one is present, otherwise start from scratch
try:
    with open('src/index.json') as f:
        index = json.load(f)
except (FileNotFoundError, json.JSONDecodeError):
    index = {}

# build the index of sentences organized by keywords
alltext = ""

for n in args.text:
    text = open(n).read()
    text = text.replace("\n", " ")
    sentences = sent_tokenize(text)
    for sentence in sentences:
        # extract RAKE keyphrases from the sentence and file the sentence
        # under every phrase it yields
        r.extract_keywords_from_text(sentence)
        keys = r.get_ranked_phrases()
        for key in keys:
            if key not in index:
                index[key] = []
            index[key].append({'filename': n, 'sentence': sentence, 'key': key})
    alltext += text

#print(index)

with open('index.json', 'w') as outfile:
    json.dump(index, outfile)
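For reference, the indexer above stores one list of records per RAKE keyphrase, so the resulting index.json can be read back and queried directly. A minimal sketch (only the record structure is taken from the script; the sample lookup is illustrative):

    import json

    with open('index.json') as f:
        index = json.load(f)

    # every keyphrase maps to a list of {'filename', 'sentence', 'key'} records
    if index:
        key = next(iter(index))
        print(key, '->', index[key][0]['sentence'])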
@@ -0,0 +1,73 @@
import irc.bot
from rake_nltk import Rake
import random
from nltk.tokenize import sent_tokenize, word_tokenize
import json
#from thread import start_new_thread

r = Rake()


def chunks(l, n):
    # yield successive slices of l that are at most n items long
    for i in range(0, len(l), n):
        yield l[i:i+n]


class HelloBot(irc.bot.SingleServerIRCBot):
    def __init__(self, channel, nickname, server, port=6667, index=None):
        irc.bot.SingleServerIRCBot.__init__(self, [(server, port)], nickname, nickname)
        self.channel = channel
        self.index = index

    def on_welcome(self, c, e):
        c.join(self.channel)

    def on_privmsg(self, c, e):
        pass

    def on_pubmsg(self, c, e):
        print(e.arguments, e.source)
        msg = e.arguments[0]
        # extract RAKE keyphrases from the incoming channel message
        r.extract_keywords_from_text(msg)
        #r.get_ranked_phrases_with_scores()
        listOfKeys = r.get_ranked_phrases()

        # answer once per extracted keyphrase
        for keyWord in listOfKeys:
            if keyWord in self.index:
                msg = self.index.get(keyWord)[0].get('sentence')
                msg_where = "I found this in {}".format(self.index.get(keyWord)[0].get('filename'))
            else:
                msg = "I don't know anything about that"
                msg_where = ""

            # IRC lines are length-limited, so send the reply in chunks
            for chunk in chunks(msg, 400):
                c.privmsg(self.channel, chunk)

            if msg_where:
                c.privmsg(self.channel, msg_where)


if __name__ == "__main__":
    import argparse
    import sys

    ap = argparse.ArgumentParser("IRC Bot")
    ap.add_argument("--server", default="irc.freenode.net")
    ap.add_argument("--port", type=int, default=6667)
    ap.add_argument("--channel", default="#pzi")
    ap.add_argument("--nickname", default="scanbot")
    ap.add_argument("--text", default="index.json", help="database to use")
    args = ap.parse_args()

    # load the index of sentences organized by keywords
    try:
        with open(args.text) as f:
            index = json.load(f)
    except (FileNotFoundError, json.JSONDecodeError):
        index = {}

    #print(index)

    bot = HelloBot(args.channel, args.nickname, args.server, args.port, index)
    bot.start()
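With index.json in place, the bot can be started against it. A sketch of a typical invocation, assuming the script above is saved as chatbot.py (the actual file name is not shown in this view):

    python3 chatbot.py --server irc.freenode.net --port 6667 --channel "#pzi" --nickname scanbot --text index.json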
File diff suppressed because one or more lines are too long