added database and userlearning

master^2
Alex 7 years ago
parent 15297f6c9d
commit 74ba3d8c0a

.gitignore vendored (1 line added)

@@ -1,5 +1,6 @@
 images/**
 output/**
 src/index.json
+src/database.json
 .DS_Store
 src/**.wav

@@ -2,16 +2,30 @@ import json
 import argparse
 import sys
 from nltk.tokenize import sent_tokenize, word_tokenize
+from nltk.corpus import stopwords
+import nltk
+import numpy as np
 from rake_nltk import Rake
+from textblob import TextBlob
+
+stop_words = set(stopwords.words('english'))
 r = Rake()
 ap = argparse.ArgumentParser("JSON Dumper")
 ap.add_argument("text", nargs="+", help="text sources")
 args = ap.parse_args()
+
+def cleanedText(text):
+    word_tokens = word_tokenize(text)
+    word_tokens = [word.lower() for word in word_tokens]
+    word_tokens = [word for word in word_tokens if word.isalpha()]
+    filtered_sentence = [w for w in word_tokens if w not in stop_words]
+    text = " ".join(filtered_sentence)
+    print(text)
+    return text
+
+#### INDEX (DEPR)
-with open('src/index.json') as f:
+with open('src/index.json', 'w') as f:
     try:
         index = json.load(f)
     except:
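One quirk worth flagging in the hunk above: the new code opens src/index.json with mode 'w' before calling json.load. Opening with 'w' truncates the file, so the load can never succeed and the except branch always runs. A minimal sketch of the read-with-fallback pattern presumably intended (load_json is an illustrative helper, not in the repo):

import json
import os

def load_json(path, default=None):
    # Open read-only; fall back to a default when the file is absent or invalid.
    if not os.path.exists(path):
        return {} if default is None else default
    with open(path) as f:
        try:
            return json.load(f)
        except json.JSONDecodeError:
            return {} if default is None else default

index = load_json('src/index.json')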
@@ -36,3 +50,86 @@ for n in args.text:
 with open('src/index.json', 'w') as outfile:
     json.dump(index, outfile)
+
+######## DATABASE
+with open('src/database.json', 'w') as f:
+    try:
+        index = json.load(f)
+    except:
+        index = {}
+
+nouns = []
+verbs = []
+adverbs = []
+pronouns = ["I", "you", "we", "they"]
+adjectives = []
+keywords = []
+keywordspersentence = {}
+alltext = ""
+allcharacters = []
+
+for n in args.text:
+    text = open(n).read()
+    rawtext = text.replace("\n", " ")
+    # get pos
+    for letter in list(rawtext):
+        if letter not in allcharacters and not letter.isalpha():
+            allcharacters.append(letter)
+    alltext += rawtext
+    text = cleanedText(text)
+    thistext = TextBlob(text)
+    words = word_tokenize(text)
+    listWithPos = nltk.pos_tag(words)
+    print(thistext.tags)
+    for key, tag in thistext.tags:
+        if tag == "VBP":
+            verbs.append(key)
+        if tag == "NN" or tag == "NNS":
+            nouns.append(key)
+        if tag == "RB":
+            adverbs.append(key)
+        if tag == "JJ":
+            adjectives.append(key)
+    print("Verbs: {}".format(verbs))
+    print("Nouns: {}".format(nouns))
+    print("Adverbs: {}".format(adverbs))
+    print("Adjectives: {}".format(adjectives))
+    # keywords
+    r.extract_keywords_from_text(rawtext)
+    phrases = r.get_ranked_phrases_with_scores()
+    for key, phrase in phrases:
+        if key > 2:
+            keywords.append(phrase)
+    print("Keywords: {}".format(keywords))
+    # keywords per sentence
+    sentences = sent_tokenize(text)
+    for sentence in sentences:
+        r.extract_keywords_from_text(sentence)
+        keys = r.get_ranked_phrases()
+        for key in keys:
+            if key not in keywordspersentence:
+                keywordspersentence[key] = []
+            keywordspersentence[key].append({'filename': n, 'sentence': sentence, 'key': key})
+
+print(allcharacters)
+index["nouns"] = nouns
+index["verbs"] = verbs
+index["pronouns"] = pronouns
+index["adverbs"] = adverbs
+index["adjectives"] = adjectives
+index["keywords"] = keywords
+index["keywordspersentence"] = keywordspersentence
+index["rawtext"] = alltext
+index["chars"] = allcharacters
+
+with open('src/database.json', 'w') as outfile:
+    json.dump(index, outfile)
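The tagging loop above matches Penn Treebank tags exactly, so only VBP (non-third-person present) verbs are kept, and VB/VBD/VBZ/VBG/VBN forms are dropped; comparative and superlative adjectives and adverbs (JJR/JJS, RBR/RBS) are missed the same way. A hedged sketch of prefix-based bucketing that catches the full families; PREFIXES and bucket_by_pos are illustrative names, and the NLTK tokenizer/tagger data is assumed to be downloaded:

from collections import defaultdict
import nltk
from nltk.tokenize import word_tokenize

PREFIXES = {"NN": "nouns", "VB": "verbs", "RB": "adverbs", "JJ": "adjectives"}

def bucket_by_pos(text):
    # Prefix matching sweeps in the whole family (NN/NNS/NNP/NNPS,
    # VB/VBD/VBZ/VBG/VBN/VBP, RB/RBR/RBS, JJ/JJR/JJS).
    buckets = defaultdict(list)
    for word, tag in nltk.pos_tag(word_tokenize(text)):
        for prefix, name in PREFIXES.items():
            if tag.startswith(prefix):
                buckets[name].append(word.lower())
    return dict(buckets)

print(bucket_by_pos("The quick brown fox quietly jumps over the lazy dogs"))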

@@ -69,9 +69,10 @@ if __name__ == "__main__":
     try:
         index = json.load(f)
     except:
-        index = {}
+        print("I can't work with no knowledge")
+        sys.exit()
+    #print(index)
     myhost = os.uname()[1]

File diff suppressed because one or more lines are too long

@@ -27,6 +27,7 @@ from textblob import TextBlob
 from textblob_aptagger import PerceptronTagger
 import string
 import re
+import json
 import time
 import sys; from PIL import Image; import numpy as np
 chars = np.asarray(list(' .,:;irsXA253hMHGS#9B&@'))
@@ -158,7 +159,7 @@ class HelloBot(irc.bot.SingleServerIRCBot):
         self.poem = []
         self.keys = list(map(lambda x: re.compile(x[0], re.IGNORECASE), basicDialog))
         self.values = list(map(lambda x: x[1], basicDialog))
-        self.POSSIBLE_STRUCTURES = [["adjective", "noun", "adverb", "verb", "adjective", "noun"], ["pronoun", "verb", "noun"]]
+        self.POSSIBLE_STRUCTURES = [["adjective", "noun", "adverb", "verb", "adjective", "noun"], ["pronoun", "verb", "adverb"]]
         self.structure = random.choice(self.POSSIBLE_STRUCTURES)

     def translate(self, str, dict):
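The change above swaps the final slot of the short template from a noun to an adverb. The diff does not show the generator itself, but a plausible reading is that each slot name indexes the matching word list by appending "s", as the user-learning hunk below does with self.waitingfor + "s". A purely hypothetical sketch (fill_structure and the sample index are illustrative, not the bot's actual code):

import random

index = {
    "pronouns": ["I", "you", "we", "they"],
    "verbs": ["wander", "dream"],
    "adverbs": ["quietly", "slowly"],
}

def fill_structure(structure, index):
    # Each slot name maps to an index key by appending "s".
    return " ".join(random.choice(index[pos + "s"]) for pos in structure)

print(fill_structure(["pronoun", "verb", "adverb"], index))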
@@ -455,10 +456,15 @@ class HelloBot(irc.bot.SingleServerIRCBot):
         else:
             self.sentence.append(incoming_msg)
             self.lastPOS += 1
+            print(self.waitingfor + " " + incoming_msg)
+            self.index[self.waitingfor + "s"].append(incoming_msg)
+            with open('src/database.json', 'w') as outfile:
+                json.dump(self.index, outfile)
         self.generateSentence(c, e)

 def cleanedText(text):
     word_tokens = word_tokenize(text)
     word_tokens = [word.lower() for word in word_tokens]
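This is the user-learning half of the commit: every word a user contributes is appended to the in-memory index and the whole database is rewritten in place. Rewriting with open(..., 'w') means a crash mid-dump leaves src/database.json truncated. A sketch of an atomic write under that assumption; save_index is an illustrative helper, not part of the bot:

import json
import os
import tempfile

def save_index(index, path='src/database.json'):
    # Write to a temp file in the same directory, then atomically rename,
    # so an interrupted dump can never truncate the real database.
    d = os.path.dirname(path) or '.'
    fd, tmp = tempfile.mkstemp(dir=d, suffix='.tmp')
    with os.fdopen(fd, 'w') as f:
        json.dump(index, f)
    os.replace(tmp, path)

save_index({"nouns": ["fox"]}, path="database.json")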
@@ -482,77 +488,13 @@ if __name__ == "__main__":
     ap.add_argument("--text", nargs="+", help="text sources")
     args = ap.parse_args()
-    # build the index
-    index = {}
-
-    nouns = []
-    verbs = []
-    adverbs = []
-    pronouns = ["I", "you", "we", "they"]
-    adjectives = []
-    keywords = []
-    keywordspersentence = {}
-    alltext = ""
-    allcharacters = []
-    n = "ocr/output.txt"
-    text = open(n).read()
-    rawtext = text.replace("\n", " ")
-    # get pos
-    for letter in list(rawtext):
-        if letter not in allcharacters and not letter.isalpha():
-            allcharacters.append(letter)
-    alltext += rawtext
-    text = cleanedText(text)
-    thistext = TextBlob(text)
-    words = word_tokenize(text)
-    listWithPos = nltk.pos_tag(words)
-    print(thistext.tags)
-    for key, tag in thistext.tags:
-        if tag == "VBP":
-            verbs.append(key)
-        if tag == "NN" or tag == "NNS":
-            nouns.append(key)
-        if tag == "RB":
-            adverbs.append(key)
-        if tag == "JJ":
-            adjectives.append(key)
-    print("Verbs: {}".format(verbs))
-    print("Nouns: {}".format(nouns))
-    print("Adverbs: {}".format(adverbs))
-    print("Adjectives: {}".format(adjectives))
-    # keywords
-    r.extract_keywords_from_text(rawtext)
-    phrases = r.get_ranked_phrases_with_scores()
-    for key, phrase in phrases:
-        if key > 2:
-            keywords.append(phrase)
-    print("Keywords: {}".format(keywords))
-    # keywords per sentence
-    sentences = sent_tokenize(text)
-    for sentence in sentences:
-        r.extract_keywords_from_text(sentence)
-        keys = r.get_ranked_phrases()
-        for key in keys:
-            if key not in keywordspersentence:
-                keywordspersentence[key] = []
-            keywordspersentence[key].append({'filename': n, 'sentence': sentence, 'key': key})
-    print(allcharacters)
-    index["nouns"] = nouns
-    index["verbs"] = verbs
-    index["pronouns"] = pronouns
-    index["adverbs"] = adverbs
-    index["adjectives"] = adjectives
-    index["keywords"] = keywords
-    index["keywordspersentence"] = keywordspersentence
-    index["rawtext"] = alltext
-    chars = np.asarray(allcharacters)
+    with open("src/database.json") as f:
+        try:
+            index = json.load(f)
+        except:
+            print("I can't work with no knowledge")
+            sys.exit()
+    chars = np.asarray(index["chars"])
     bot = HelloBot(args.channel, args.nickname, args.server, args.port, index)
     bot.start()
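Net effect of this last hunk: the bot no longer re-indexes ocr/output.txt at startup and instead requires the database built by the dumper script, including the "chars" list that now drives the ASCII-art palette. A sketch of that startup contract with an added key check; REQUIRED is illustrative and not in the commit:

import json
import sys
import numpy as np

REQUIRED = ("nouns", "verbs", "adverbs", "adjectives", "pronouns", "chars")

try:
    with open("src/database.json") as f:
        index = json.load(f)
except (OSError, json.JSONDecodeError):
    # Mirrors the commit's behavior: refuse to start without knowledge.
    print("I can't work with no knowledge")
    sys.exit(1)

missing = [k for k in REQUIRED if k not in index]
if missing:
    sys.exit("database.json is missing keys: {}".format(missing))

chars = np.asarray(index["chars"])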
