diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000..7855ac4 Binary files /dev/null and b/.DS_Store differ diff --git a/Makefile b/Makefile index e245408..763be96 100644 --- a/Makefile +++ b/Makefile @@ -61,7 +61,7 @@ output/tagged-words.txt: ocr/output.txt ## Analyzes OCR'ed text using a Part of # >>> import nltk # >>> nltk.download('averaged_perceptron_tagger') -output/chatbot.txt: ocr/output.txt ## DESCRIBE WHAT IT DOES. Dependencies: python3's chatterbot +output/chatbot.txt: ocr/output.txt ## Comments a text with a simple chatbot. Dependencies: python3's chatterbot cat $< | python3 src/textbotconversation.py $(@) diff --git a/src/wordtagger.py b/src/wordtagger.py index e0ccec4..fedc080 100644 --- a/src/wordtagger.py +++ b/src/wordtagger.py @@ -1,71 +1,82 @@ import nltk from sys import stdin, stdout -# Step 1: define input and set up a list +# Define input input = stdin.read() -taggedwordlist = [] -string = input -words = nltk.word_tokenize(string) -taggedwordlist = nltk.pos_tag(words) - -for word, pos in nltk.pos_tag(words): +# FILTER FUNCTIONS +# This function cuts a string into words. Then runs a POS tagger for each word. Returns a list with tags +def postagger(string): + words = nltk.word_tokenize(string) taggedwordlist = nltk.pos_tag(words) - # print('{0} is a {1}'.format(word,pos)) # Command out to print the analysis step + for word, pos in nltk.pos_tag(words): + taggedwordlist = nltk.pos_tag(words) + #print('{0} is a {1}'.format(word,pos)) # Comment out to print the analysis step + + taglist = [ pos for word,pos in taggedwordlist ] + #print(taglist) + return taglist; -taglist = [ pos for word,pos in taggedwordlist ] +# This function changes the tags to readable equivalents (NNP to noun for example) +def postagger_readable(list): + readabletaglist = [] -#print(taglist) + for tag in list: + if tag in {"NNP","NNS","NN","NNPS"}: + readabletag = 'noun' + elif tag in {'VB','VBD','VBG','VBN','VBP','VBZ'}: + readabletag = 'verb' + elif tag in {'RB','RBR','RBS','WRB'}: + readabletag = 'adverb' + elif tag in {'PRP','PRP$'}: + readabletag = 'pronoun' + elif tag in {'JJ','JJR','JJS'}: + readabletag = 'adjective' + elif tag == 'IN': + readabletag = 'preposition' + elif tag == 'WDT': + readabletag = 'determiner' + elif tag in {'WP','WP$'}: + readabletag = 'pronoun' + elif tag == 'UH': + readabletag = 'interjection' + elif tag == 'POS': + readabletag = 'possesive ending' + elif tag == 'SYM': + readabletag = 'symbol' + elif tag == 'EX': + readabletag = 'existential there' + elif tag == 'DT': + readabletag = 'determiner' + elif tag == 'MD': + readabletag = 'modal' + elif tag == 'LS': + readabletag = 'list item marker' + elif tag == 'FW': + readabletag = 'foreign word' + elif tag == 'CC': + readabletag = 'coordinating conjunction ' + elif tag == 'CD': + readabletag = 'cardinal number' + elif tag == 'TO': + readabletag = 'to' + elif tag == '.': + readabletag = 'line ending' + elif tag == ',': + readabletag = 'comma' + else: + readabletag = tag -readabletaglist = [] + readabletaglist.append(readabletag) + return readabletaglist; -for tag in taglist: - if tag in {"NNP","NNS","NN","NNPS"}: - readabletag = 'noun' - elif tag in {'VB','VBD','VBG','VBN','VBP','VBZ'}: - readabletag = 'verb' - elif tag in {'RB','RBR','RBS','WRB'}: - readabletag = 'adverb' - elif tag in {'PRP','PRP$'}: - readabletag = 'pronoun' - elif tag in {'JJ','JJR','JJS'}: - readabletag = 'adjective' - elif tag == 'IN': - readabletag = 'preposition' - elif tag == 'WDT': - readabletag = 'determiner' - elif tag in {'WP','WP$'}: - readabletag = 'pronoun' - elif tag == 'UH': - readabletag = 'interjection' - elif tag == 'POS': - readabletag = 'possesive ending' - elif tag == 'SYM': - readabletag = 'symbol' - elif tag == 'EX': - readabletag = 'existential there' - elif tag == 'DT': - readabletag = 'determiner' - elif tag == 'MD': - readabletag = 'modal' - elif tag == 'LS': - readabletag = 'list item marker' - elif tag == 'FW': - readabletag = 'foreign word' - elif tag == 'CC': - readabletag = 'coordinating conjunction ' - elif tag == 'CD': - readabletag = 'cardinal number' - elif tag == 'TO': - readabletag = 'to' - elif tag == '.': - readabletag = 'line ending' - elif tag == ',': - readabletag = 'comma' - else: - readabletag = tag - readabletaglist.append(readabletag) +# This function creates the output +def main(): + taglist = postagger(input) + readabletaglist = postagger_readable(taglist) + stdout.write(' '.join(readabletaglist)) + stdout.write('\n') -stdout.write(' '.join(readabletaglist)) +main()