Rewrote script to make use of functions.

7 years ago · 89e8436cd0
parent 3cd30d75a5
commit 89e8436cd0
2 changed files with 69 additions and 58 deletions
--- a/.DS_Store
+++ b/.DS_Store
--- a/src/wordtagger.py
+++ b/src/wordtagger.py
@@ -1,71 +1,82 @@
 import nltk
 from sys import stdin, stdout

-# Step 1: define input and set up a list
+# Read the entire text to tag from standard input.
+# NOTE: this module-level name shadows the built-in input() for the rest of the file.
 input = stdin.read()
-taggedwordlist = []

-string = input
-words = nltk.word_tokenize(string)
-taggedwordlist = nltk.pos_tag(words)
-
-for word, pos in nltk.pos_tag(words):
+# FILTER FUNCTIONS
+# Splits a string into words, runs the POS tagger over them, and returns the list of tags.
+def postagger(string):
+    """Return the part-of-speech tag of each token in ``string``.
+
+    The text is split into tokens with ``nltk.word_tokenize`` and tagged by
+    one call to ``nltk.pos_tag``. Only the tags are returned, as a list in
+    token order; the words themselves are dropped.
+    """
+    words = nltk.word_tokenize(string)
-    # print('{0} is a {1}'.format(word,pos)) # Command out to print the analysis step
+    # pos_tag handles the whole token list in one call, so tag exactly once;
+    # calling it again per token would only repeat identical work.
+    return [pos for _word, pos in nltk.pos_tag(words)]

-taglist = [ pos for word,pos in taggedwordlist ]
+# This function changes the tags to readable equivalents (NNP to noun for example)
+# Maps POS tags to their readable names. Tags that are not listed here are
+# passed through unchanged by postagger_readable().
+_READABLE_TAGS = {
+    'NNP': 'noun',
+    'NNS': 'noun',
+    'NN': 'noun',
+    'NNPS': 'noun',
+    'VB': 'verb',
+    'VBD': 'verb',
+    'VBG': 'verb',
+    'VBN': 'verb',
+    'VBP': 'verb',
+    'VBZ': 'verb',
+    'RB': 'adverb',
+    'RBR': 'adverb',
+    'RBS': 'adverb',
+    'WRB': 'adverb',
+    'PRP': 'pronoun',
+    'PRP$': 'pronoun',
+    'WP': 'pronoun',
+    'WP$': 'pronoun',
+    'JJ': 'adjective',
+    'JJR': 'adjective',
+    'JJS': 'adjective',
+    'IN': 'preposition',
+    'WDT': 'determiner',
+    'DT': 'determiner',
+    'UH': 'interjection',
+    'POS': 'possesive ending',
+    'SYM': 'symbol',
+    'EX': 'existential there',
+    'MD': 'modal',
+    'LS': 'list item marker',
+    'FW': 'foreign word',
+    'CC': 'coordinating conjunction ',
+    'CD': 'cardinal number',
+    'TO': 'to',
+    '.': 'line ending',
+    ',': 'comma',
+}
+

-#print(taglist)
+
+def postagger_readable(list):
+    """Translate POS tags into readable equivalents (e.g. NNP becomes noun).
+
+    ``list`` is the sequence of tags to translate. A new list is returned
+    with one entry per input tag, in the same order; a tag without a
+    readable equivalent is kept as it is.
+    """

-readabletaglist = []
+    return [_READABLE_TAGS.get(tag, tag) for tag in list]

-for tag in taglist:
-    if tag in {"NNP","NNS","NN","NNPS"}:
-        readabletag = 'noun'
-    elif tag in {'VB','VBD','VBG','VBN','VBP','VBZ'}:
-        readabletag = 'verb'
-    elif tag in {'RB','RBR','RBS','WRB'}:
-        readabletag = 'adverb'
-    elif tag in {'PRP','PRP$'}:
-        readabletag = 'pronoun'
-    elif tag in {'JJ','JJR','JJS'}:
-        readabletag = 'adjective'
-    elif tag == 'IN':
-        readabletag = 'preposition'
-    elif tag == 'WDT':
-        readabletag = 'determiner'
-    elif tag in {'WP','WP$'}:
-        readabletag = 'pronoun'
-    elif tag == 'UH':
-        readabletag = 'interjection'
-    elif tag == 'POS':
-        readabletag = 'possesive ending'
-    elif tag == 'SYM':
-        readabletag = 'symbol'
-    elif tag == 'EX':
-        readabletag = 'existential there'
-    elif tag == 'DT':
-        readabletag = 'determiner'
-    elif tag == 'MD':
-        readabletag = 'modal'
-    elif tag == 'LS':
-        readabletag = 'list item marker'
-    elif tag == 'FW':
-        readabletag = 'foreign word'
-    elif tag == 'CC':
-        readabletag = 'coordinating conjunction '
-    elif tag == 'CD':
-        readabletag = 'cardinal number'
-    elif tag == 'TO':
-        readabletag = 'to'
-    elif tag == '.':
-        readabletag = 'line ending'
-    elif tag == ',':
-        readabletag = 'comma'
-    else:
-        readabletag = tag

-    readabletaglist.append(readabletag)
+# This function creates the output
+def main():
+    """Tag the text held in the module-level ``input`` and print the result.
+
+    The readable tags are written to stdout on one line, separated by
+    single spaces and terminated by a newline.
+    """
+    readable = postagger_readable(postagger(input))
+    stdout.write(' '.join(readable) + '\n')

-stdout.write(' '.join(readabletaglist))
+# Run the tagger whenever this file is executed.
+main()