57 KiB
NLTK - Part of Speech¶
import nltk import random
# Read the manifesto line by line and pick one line at random.
# The context manager closes the file handle once the lines are read.
# NOTE(review): assumes manifesto.txt is UTF-8 (it contains characters such as
# "Ê" and "–"); the explicit encoding avoids depending on the platform default.
with open('manifesto.txt', encoding='utf-8') as manifesto:
    lines = manifesto.readlines()
sentence = random.choice(lines)
print(sentence)
LIQUID BODIES ARE PLURI-PONTENT. THEY ARE CAPABLE OF MANY ACTS OF TRANSFORMATION. THEY DE-SIMPLIFY THE MATTER OF BEING A BODY THROUGH THEIR VISCERAL ENTANGLEMENTS. WHILE THE BÊTE MACHINE DEPENDS ON AN ABSTRACTED UNDERSTANDING OF ANATOMY FOUNDED UPON GENERALIZATIONS AND IDEALS, LIQUID BODIES RESIST THESE TROPES. LIQUID BODIES DISCUSS A MODE OF EXISTENCE THAT IS CONSTANTLY CHANGING – NOT AS THE CUMULATIVE OUTCOMES OF ‘ERROR’ – BUT AS A HIGHLY CHOREOGRAPHED AND CONTINUOUS SPECTRUM STREAM OF EVENTS THAT ARISE FROM THE PHYSICAL INTERACTIONS OF MATTER. THEY INTERNALIZE OTHER BODIES AS MANIFOLDS WITHIN THEIR SUBSTANCE AND ASSERT THEIR IDENTITY THROUGH THEIR ENVIRONMENTAL CONTEXTS. SUCH ENTANGLEMENTS INVOKE MARGINAL RELATIONS BETWEEN MULTIPLE AGENCIES AND EXCEED THE CLASSICAL LOGIC OF OBJECTS. THEY ARE INSEPARABLE FROM THEIR CONTEXT AND OFFER WAYS OF THINKING AND EXPERIMENTING WITH THE CONVENTIONS OF MAKING AND BEING EMBODIED.
Tokens¶
# Split the randomly chosen line into a list of tokens and show the result.
tokens = nltk.word_tokenize(sentence)
print(tokens)
['LIQUID', 'BODIES', 'ARE', 'PLURI-PONTENT', '.', 'THEY', 'ARE', 'CAPABLE', 'OF', 'MANY', 'ACTS', 'OF', 'TRANSFORMATION', '.', 'THEY', 'DE-SIMPLIFY', 'THE', 'MATTER', 'OF', 'BEING', 'A', 'BODY', 'THROUGH', 'THEIR', 'VISCERAL', 'ENTANGLEMENTS', '.', 'WHILE', 'THE', 'BÊTE', 'MACHINE', 'DEPENDS', 'ON', 'AN', 'ABSTRACTED', 'UNDERSTANDING', 'OF', 'ANATOMY', 'FOUNDED', 'UPON', 'GENERALIZATIONS', 'AND', 'IDEALS', ',', 'LIQUID', 'BODIES', 'RESIST', 'THESE', 'TROPES', '.', 'LIQUID', 'BODIES', 'DISCUSS', 'A', 'MODE', 'OF', 'EXISTENCE', 'THAT', 'IS', 'CONSTANTLY', 'CHANGING', '–', 'NOT', 'AS', 'THE', 'CUMULATIVE', 'OUTCOMES', 'OF', '‘', 'ERROR', '’', '–', 'BUT', 'AS', 'A', 'HIGHLY', 'CHOREOGRAPHED', 'AND', 'CONTINUOUS', 'SPECTRUM', 'STREAM', 'OF', 'EVENTS', 'THAT', 'ARISE', 'FROM', 'THE', 'PHYSICAL', 'INTERACTIONS', 'OF', 'MATTER', '.', 'THEY', 'INTERNALIZE', 'OTHER', 'BODIES', 'AS', 'MANIFOLDS', 'WITHIN', 'THEIR', 'SUBSTANCE', 'AND', 'ASSERT', 'THEIR', 'IDENTITY', 'THROUGH', 'THEIR', 'ENVIRONMENTAL', 'CONTEXTS', '.', 'SUCH', 'ENTANGLEMENTS', 'INVOKE', 'MARGINAL', 'RELATIONS', 'BETWEEN', 'MULTIPLE', 'AGENCIES', 'AND', 'EXCEED', 'THE', 'CLASSICAL', 'LOGIC', 'OF', 'OBJECTS', '.', 'THEY', 'ARE', 'INSEPARABLE', 'FROM', 'THEIR', 'CONTEXT', 'AND', 'OFFER', 'WAYS', 'OF', 'THINKING', 'AND', 'EXPERIMENTING', 'WITH', 'THE', 'CONVENTIONS', 'OF', 'MAKING', 'AND', 'BEING', 'EMBODIED', '.']
Part of Speech "tags"¶
# Attach a part-of-speech tag to every token; the result is a list of
# (token, tag) pairs.
# NOTE(review): in the recorded output nearly every token of the all-caps
# manifesto is tagged 'NNP' -- presumably the tagger is case-sensitive and
# reads capitalised words as proper nouns. Confirm, and consider tagging a
# lower-cased copy if more varied tags are wanted.
tagged = nltk.pos_tag(tokens)
print(tagged)
[('LIQUID', 'JJ'), ('BODIES', 'NNP'), ('ARE', 'NNP'), ('PLURI-PONTENT', 'NNP'), ('.', '.'), ('THEY', 'NNP'), ('ARE', 'NNP'), ('CAPABLE', 'NNP'), ('OF', 'NNP'), ('MANY', 'NNP'), ('ACTS', 'NNP'), ('OF', 'NNP'), ('TRANSFORMATION', 'NNP'), ('.', '.'), ('THEY', 'NNP'), ('DE-SIMPLIFY', 'VBP'), ('THE', 'NNP'), ('MATTER', 'NNP'), ('OF', 'NNP'), ('BEING', 'NNP'), ('A', 'NNP'), ('BODY', 'NNP'), ('THROUGH', 'NNP'), ('THEIR', 'NNP'), ('VISCERAL', 'NNP'), ('ENTANGLEMENTS', 'NNP'), ('.', '.'), ('WHILE', 'IN'), ('THE', 'DT'), ('BÊTE', 'NNP'), ('MACHINE', 'NNP'), ('DEPENDS', 'NNP'), ('ON', 'NNP'), ('AN', 'NNP'), ('ABSTRACTED', 'NNP'), ('UNDERSTANDING', 'NN'), ('OF', 'NNP'), ('ANATOMY', 'NNP'), ('FOUNDED', 'NNP'), ('UPON', 'NNP'), ('GENERALIZATIONS', 'NNP'), ('AND', 'NNP'), ('IDEALS', 'NNP'), (',', ','), ('LIQUID', 'NNP'), ('BODIES', 'NNP'), ('RESIST', 'NNP'), ('THESE', 'NNP'), ('TROPES', 'NNP'), ('.', '.'), ('LIQUID', 'NNP'), ('BODIES', 'NNP'), ('DISCUSS', 'NNP'), ('A', 'NNP'), ('MODE', 'NNP'), ('OF', 'NNP'), ('EXISTENCE', 'NNP'), ('THAT', 'NNP'), ('IS', 'VBZ'), ('CONSTANTLY', 'NNP'), ('CHANGING', 'NNP'), ('–', 'NNP'), ('NOT', 'NNP'), ('AS', 'IN'), ('THE', 'NNP'), ('CUMULATIVE', 'NNP'), ('OUTCOMES', 'NNP'), ('OF', 'NNP'), ('‘', 'NNP'), ('ERROR', 'NNP'), ('’', 'NNP'), ('–', 'NNP'), ('BUT', 'NNP'), ('AS', 'IN'), ('A', 'NNP'), ('HIGHLY', 'NNP'), ('CHOREOGRAPHED', 'NNP'), ('AND', 'NNP'), ('CONTINUOUS', 'NNP'), ('SPECTRUM', 'NNP'), ('STREAM', 'NNP'), ('OF', 'NNP'), ('EVENTS', 'NNP'), ('THAT', 'NNP'), ('ARISE', 'NNP'), ('FROM', 'NNP'), ('THE', 'NNP'), ('PHYSICAL', 'NNP'), ('INTERACTIONS', 'NNP'), ('OF', 'NNP'), ('MATTER', 'NNP'), ('.', '.'), ('THEY', 'NNP'), ('INTERNALIZE', 'NNP'), ('OTHER', 'NNP'), ('BODIES', 'NNP'), ('AS', 'NNP'), ('MANIFOLDS', 'NNP'), ('WITHIN', 'NNP'), ('THEIR', 'NNP'), ('SUBSTANCE', 'NNP'), ('AND', 'NNP'), ('ASSERT', 'NNP'), ('THEIR', 'NNP'), ('IDENTITY', 'NNP'), ('THROUGH', 'NNP'), ('THEIR', 'NNP'), ('ENVIRONMENTAL', 'NNP'), ('CONTEXTS', 'NNP'), ('.', '.'), 
('SUCH', 'JJ'), ('ENTANGLEMENTS', 'NNP'), ('INVOKE', 'NNP'), ('MARGINAL', 'NNP'), ('RELATIONS', 'NNP'), ('BETWEEN', 'NNP'), ('MULTIPLE', 'NNP'), ('AGENCIES', 'NNP'), ('AND', 'NNP'), ('EXCEED', 'NNP'), ('THE', 'NNP'), ('CLASSICAL', 'NNP'), ('LOGIC', 'NNP'), ('OF', 'NNP'), ('OBJECTS', 'NNP'), ('.', '.'), ('THEY', 'NNP'), ('ARE', 'VBP'), ('INSEPARABLE', 'NNP'), ('FROM', 'NNP'), ('THEIR', 'NNP'), ('CONTEXT', 'NNP'), ('AND', 'NNP'), ('OFFER', 'NNP'), ('WAYS', 'NNP'), ('OF', 'NNP'), ('THINKING', 'NNP'), ('AND', 'NNP'), ('EXPERIMENTING', 'NNP'), ('WITH', 'NNP'), ('THE', 'NNP'), ('CONVENTIONS', 'NNP'), ('OF', 'NNP'), ('MAKING', 'NNP'), ('AND', 'NNP'), ('BEING', 'NNP'), ('EMBODIED', 'NNP'), ('.', '.')]
Now you could, for example, select all the nouns (every word whose tag contains 'NN'):
# Keep every word whose tag contains 'NN', i.e. the noun tags of the Penn
# Treebank tagset (NN, NNS, NNP, NNPS).
selection = [word for word, tag in tagged if 'NN' in tag]
print(selection)
['LIQUID', 'BODIES', 'INVITE', 'US', 'TO', 'ARTICULATE', 'THE', 'FUZZINESS', 'PARADOXES', 'AND', 'UNCERTAINTIES', 'THE', 'LIVING', 'REALM', 'THEY', 'ARE', 'STILL', 'INSTANTLY', 'RECOGNIZABLE', 'CAN', 'BE', 'NAMED', 'AS', 'TORNADO', 'CIRRUS', 'SOIL', 'EMBRYO', 'OR', 'BIOFILM', 'THESE', 'CONTRADICTIONS', '–', 'OF', 'FORM', 'AND', 'CONSTANCY', '–', 'ENCOURAGE', 'ALTERNATIVE', 'READINGS', 'OF', 'HOW', 'WE', 'ORDER', 'AND', 'SORT', 'THE', 'WORLD', 'WHOSE', 'MAIN', 'METHODOLOGY', 'IS', 'THROUGH', 'RELATING', 'ONE', 'BODY', 'TO', 'ANOTHER', 'INDEED', 'PROTEAN', 'LIQUID', 'BODIES', 'HELP', 'US', 'UNDERSTAND', 'THAT', 'WHILE', 'UNIVERSALISMS', 'AVERAGES', 'AND', 'GENERALIZATIONS', 'ARE', 'USEFUL', 'IN', 'PRODUCING', 'MAPS', 'OUR', 'BEING', 'IN', 'THE', 'WORLD', 'THEY', 'NEGLECT', 'SPECIFIC', 'DETAILS', 'WHICH', '‘', 'BRING', 'FORTH', 'THE', 'MATERIALITY', 'THE', 'ENVIRONMENT']
Where do these tags come from?¶
An off-the-shelf tagger is available for English. It uses the Penn Treebank tagset.
NLTK provides documentation for each tag, which can be queried using the tag, e.g. nltk.help.upenn_tagset('RB').
# The tag documentation lives in a separate NLTK data package. Without it,
# upenn_tagset() raises LookupError ("Resource tagsets not found"), so fetch
# the resource first, as the error message itself recommends.
nltk.download('tagsets')
nltk.help.upenn_tagset('PRP')
--------------------------------------------------------------------------- LookupError Traceback (most recent call last) <ipython-input-10-b694f07a3ba6> in <module> ----> 1 nltk.help.upenn_tagset('PRP') /usr/local/lib/python3.7/dist-packages/nltk/help.py in upenn_tagset(tagpattern) 25 26 def upenn_tagset(tagpattern=None): ---> 27 _format_tagset("upenn_tagset", tagpattern) 28 29 /usr/local/lib/python3.7/dist-packages/nltk/help.py in _format_tagset(tagset, tagpattern) 44 45 def _format_tagset(tagset, tagpattern=None): ---> 46 tagdict = load("help/tagsets/" + tagset + ".pickle") 47 if not tagpattern: 48 _print_entries(sorted(tagdict), tagdict) /usr/local/lib/python3.7/dist-packages/nltk/data.py in load(resource_url, format, cache, verbose, logic_parser, fstruct_reader, encoding) 750 751 # Load the resource. --> 752 opened_resource = _open(resource_url) 753 754 if format == "raw": /usr/local/lib/python3.7/dist-packages/nltk/data.py in _open(resource_url) 875 876 if protocol is None or protocol.lower() == "nltk": --> 877 return find(path_, path + [""]).open() 878 elif protocol.lower() == "file": 879 # urllib might not use mode='rb', so handle this one ourselves: /usr/local/lib/python3.7/dist-packages/nltk/data.py in find(resource_name, paths) 583 sep = "*" * 70 584 resource_not_found = "\n%s\n%s\n%s\n" % (sep, msg, sep) --> 585 raise LookupError(resource_not_found) 586 587 LookupError: ********************************************************************** Resource tagsets not found. 
Please use the NLTK Downloader to obtain the resource: >>> import nltk >>> nltk.download('tagsets') For more information see: https://www.nltk.org/data.html Attempted to load help/tagsets/PY3/upenn_tagset.pickle Searched in: - '/home/kendalb/nltk_data' - '/usr/nltk_data' - '/usr/share/nltk_data' - '/usr/lib/nltk_data' - '/usr/share/nltk_data' - '/usr/local/share/nltk_data' - '/usr/lib/nltk_data' - '/usr/local/lib/nltk_data' - '' **********************************************************************
An alphabetical list of part-of-speech tags used in the Penn Treebank Project (link):
Number | Tag | Description |
1. | CC | Coordinating conjunction |
2. | CD | Cardinal number |
3. | DT | Determiner |
4. | EX | Existential there |
5. | FW | Foreign word |
6. | IN | Preposition or subordinating conjunction |
7. | JJ | Adjective |
8. | JJR | Adjective, comparative |
9. | JJS | Adjective, superlative |
10. | LS | List item marker |
11. | MD | Modal |
12. | NN | Noun, singular or mass |
13. | NNS | Noun, plural |
14. | NNP | Proper noun, singular |
15. | NNPS | Proper noun, plural |
16. | PDT | Predeterminer |
17. | POS | Possessive ending |
18. | PRP | Personal pronoun |
19. | PRP$ | Possessive pronoun |
20. | RB | Adverb |
21. | RBR | Adverb, comparative |
22. | RBS | Adverb, superlative |
23. | RP | Particle |
24. | SYM | Symbol |
25. | TO | to |
26. | UH | Interjection |
27. | VB | Verb, base form |
28. | VBD | Verb, past tense |
29. | VBG | Verb, gerund or present participle |
30. | VBN | Verb, past participle |
31. | VBP | Verb, non-3rd person singular present |
32. | VBZ | Verb, 3rd person singular present |
33. | WDT | Wh-determiner |
34. | WP | Wh-pronoun |
35. | WP$ | Possessive wh-pronoun |
36. | WRB | Wh-adverb |
A telling/tricky case¶
It's important to realize that a POS tag is not a fixed property of a word -- it depends on the context in which the word is used. The NLTK book gives an example of homographs (more precisely, heteronyms) -- words that are written the same, but are pronounced differently and have different meanings depending on their use.
# Tokenize a sentence in which "refuse" and "permit" each occur twice, then
# tag it; the tagged list is the cell's displayed result.
text = nltk.word_tokenize(
    "They refuse to permit us to obtain the refuse permit"
)
nltk.pos_tag(text)
From the book:
Notice that refuse and permit both appear as a present tense verb (VBP) and a noun (NN). E.g. refUSE is a verb meaning "deny," while REFuse is a noun meaning "trash" (i.e. they are not homophones). Thus, we need to know which word is being used in order to pronounce the text correctly. (For this reason, text-to-speech systems usually perform POS-tagging.)
Applying to an entire text¶
# Load the whole essay, tokenize it, and tag every token.
# The context manager closes the file handle once the text is read.
# NOTE(review): assumes language.txt is UTF-8 (the recorded output contains
# curly quotes); the explicit encoding avoids depending on the platform default.
with open('../txt/language.txt', encoding='utf-8') as handle:
    language = handle.read()
tokens = nltk.word_tokenize(language)
tagged = nltk.pos_tag(tokens)
tagged
[('Language', 'NN'), ('Florian', 'JJ'), ('Cramer', 'NNP'), ('Software', 'NNP'), ('and', 'CC'), ('language', 'NN'), ('are', 'VBP'), ('intrinsically', 'RB'), ('related', 'VBN'), (',', ','), ('since', 'IN'), ('software', 'NN'), ('may', 'MD'), ('process', 'VB'), ('language', 'NN'), (',', ','), ('and', 'CC'), ('is', 'VBZ'), ('constructed', 'VBN'), ('in', 'IN'), ('language', 'NN'), ('.', '.'), ('Yet', 'CC'), ('language', 'NN'), ('means', 'VBZ'), ('different', 'JJ'), ('things', 'NNS'), ('in', 'IN'), ('the', 'DT'), ('context', 'NN'), ('of', 'IN'), ('computing', 'VBG'), (':', ':'), ('formal', 'JJ'), ('languages', 'NNS'), ('in', 'IN'), ('which', 'WDT'), ('algorithms', 'EX'), ('are', 'VBP'), ('expressed', 'VBN'), ('and', 'CC'), ('software', 'NN'), ('is', 'VBZ'), ('implemented', 'VBN'), (',', ','), ('and', 'CC'), ('in', 'IN'), ('so-called', 'JJ'), ('“', 'NNP'), ('natural', 'JJ'), ('”', 'NNP'), ('spoken', 'NN'), ('languages', 'NNS'), ('.', '.'), ('There', 'EX'), ('are', 'VBP'), ('at', 'IN'), ('least', 'JJS'), ('two', 'CD'), ('layers', 'NNS'), ('of', 'IN'), ('formal', 'JJ'), ('language', 'NN'), ('in', 'IN'), ('software', 'NN'), (':', ':'), ('programming', 'NN'), ('language', 'NN'), ('in', 'IN'), ('which', 'WDT'), ('the', 'DT'), ('software', 'NN'), ('is', 'VBZ'), ('written', 'VBN'), (',', ','), ('and', 'CC'), ('the', 'DT'), ('language', 'NN'), ('implemented', 'VBD'), ('within', 'IN'), ('the', 'DT'), ('software', 'NN'), ('as', 'IN'), ('its', 'PRP$'), ('symbolic', 'JJ'), ('controls', 'NNS'), ('.', '.'), ('In', 'IN'), ('the', 'DT'), ('case', 'NN'), ('of', 'IN'), ('compilers', 'NNS'), (',', ','), ('shells', 'NNS'), (',', ','), ('and', 'CC'), ('macro', 'NN'), ('languages', 'NNS'), (',', ','), ('for', 'IN'), ('example', 'NN'), (',', ','), ('these', 'DT'), ('layers', 'NNS'), ('can', 'MD'), ('overlap', 'VB'), ('.', '.'), ('“', 'VB'), ('Natural', 'NNP'), ('”', 'NNP'), ('language', 'NN'), ('is', 'VBZ'), ('what', 'WP'), ('can', 'MD'), ('be', 'VB'), ('processed', 'VBN'), ('as', 'IN'), 
('data', 'NNS'), ('by', 'IN'), ('software', 'NN'), (';', ':'), ('since', 'IN'), ('this', 'DT'), ('processing', 'NN'), ('is', 'VBZ'), ('formal', 'JJ'), (',', ','), ('however', 'RB'), (',', ','), ('it', 'PRP'), ('is', 'VBZ'), ('restricted', 'VBN'), ('to', 'TO'), ('syntactical', 'JJ'), ('operations', 'NNS'), ('.', '.'), ('While', 'IN'), ('differentiation', 'NN'), ('of', 'IN'), ('computer', 'NN'), ('programming', 'VBG'), ('languages', 'NNS'), ('as', 'IN'), ('“', 'JJ'), ('artificial', 'JJ'), ('languages', 'NNS'), ('”', 'VBP'), ('from', 'IN'), ('languages', 'NNS'), ('like', 'VBP'), ('English', 'NNP'), ('as', 'IN'), ('“', 'NNP'), ('natural', 'JJ'), ('languages', 'NNS'), ('”', 'VBP'), ('is', 'VBZ'), ('conceptually', 'RB'), ('important', 'JJ'), ('and', 'CC'), ('undisputed', 'JJ'), (',', ','), ('it', 'PRP'), ('remains', 'VBZ'), ('problematic', 'JJ'), ('in', 'IN'), ('its', 'PRP$'), ('pure', 'NN'), ('terminology', 'NN'), (':', ':'), ('There', 'EX'), ('is', 'VBZ'), ('nothing', 'NN'), ('“', 'JJ'), ('natural', 'JJ'), ('”', 'NN'), ('about', 'IN'), ('spoken', 'JJ'), ('language', 'NN'), (';', ':'), ('it', 'PRP'), ('is', 'VBZ'), ('a', 'DT'), ('cultural', 'JJ'), ('construct', 'NN'), ('and', 'CC'), ('thus', 'RB'), ('just', 'RB'), ('as', 'IN'), ('“', 'JJ'), ('artificial', 'JJ'), ('”', 'NN'), ('as', 'IN'), ('any', 'DT'), ('formal', 'JJ'), ('machine', 'NN'), ('control', 'NN'), ('language', 'NN'), ('.', '.'), ('To', 'TO'), ('call', 'VB'), ('programming', 'NN'), ('languages', 'NNS'), ('“', 'VBP'), ('machine', 'NN'), ('languages', 'NNS'), ('”', 'VBP'), ('doesn', 'JJ'), ('’', 'NNP'), ('t', 'NN'), ('solve', 'VBP'), ('the', 'DT'), ('problem', 'NN'), ('either', 'RB'), (',', ','), ('as', 'IN'), ('it', 'PRP'), ('obscures', 'VBZ'), ('that', 'IN'), ('“', 'FW'), ('machine', 'NN'), ('languages', 'NNS'), ('”', 'VBP'), ('are', 'VBP'), ('human', 'JJ'), ('creations', 'NNS'), ('.', '.'), ('High-level', 'JJ'), ('machine-independent', 'JJ'), ('programming', 'NN'), ('languages', 'NNS'), ('such', 'JJ'), ('as', 
'IN'), ('Fortran', 'NNP'), (',', ','), ('C', 'NNP'), (',', ','), ('Java', 'NNP'), (',', ','), ('and', 'CC'), ('Basic', 'NNP'), ('are', 'VBP'), ('not', 'RB'), ('even', 'RB'), ('direct', 'JJ'), ('mappings', 'NNS'), ('of', 'IN'), ('machine', 'NN'), ('logic', 'NN'), ('.', '.'), ('If', 'IN'), ('programming', 'JJ'), ('languages', 'NNS'), ('are', 'VBP'), ('human', 'JJ'), ('languages', 'NNS'), ('for', 'IN'), ('machine', 'NN'), ('control', 'NN'), (',', ','), ('they', 'PRP'), ('could', 'MD'), ('be', 'VB'), ('called', 'VBN'), ('cybernetic', 'JJ'), ('languages', 'NNS'), ('.', '.'), ('But', 'CC'), ('these', 'DT'), ('languages', 'NNS'), ('can', 'MD'), ('also', 'RB'), ('be', 'VB'), ('used', 'VBN'), ('outside', 'JJ'), ('machines—in', 'NN'), ('programming', 'VBG'), ('handbooks', 'NNS'), (',', ','), ('for', 'IN'), ('example', 'NN'), (',', ','), ('in', 'IN'), ('programmer', 'NN'), ('’', 'NNP'), ('s', 'NN'), ('dinner', 'NN'), ('table', 'JJ'), ('jokes', 'NNS'), (',', ','), ('or', 'CC'), ('as', 'IN'), ('abstract', 'JJ'), ('formal', 'JJ'), ('languages', 'NNS'), ('for', 'IN'), ('expressing', 'VBG'), ('logical', 'JJ'), ('constructs', 'NNS'), (',', ','), ('such', 'JJ'), ('as', 'IN'), ('in', 'IN'), ('Hugh', 'NNP'), ('Kenner', 'NNP'), ('’', 'NNP'), ('s', 'NN'), ('use', 'NN'), ('of', 'IN'), ('the', 'DT'), ('Pascal', 'NNP'), ('programming', 'NN'), ('language', 'NN'), ('to', 'TO'), ('explain', 'VB'), ('aspects', 'NNS'), ('of', 'IN'), ('the', 'DT'), ('structure', 'NN'), ('of', 'IN'), ('Samuel', 'NNP'), ('Beckett', 'NNP'), ('’', 'NNP'), ('s', 'VBD'), ('writing.1', 'NN'), ('In', 'IN'), ('this', 'DT'), ('sense', 'NN'), (',', ','), ('computer', 'NN'), ('control', 'NN'), ('languages', 'NNS'), ('could', 'MD'), ('be', 'VB'), ('more', 'RBR'), ('broadly', 'RB'), ('defined', 'VBN'), ('as', 'IN'), ('syntactical', 'JJ'), ('languages', 'NNS'), ('as', 'IN'), ('opposed', 'VBN'), ('to', 'TO'), ('semantic', 'JJ'), ('languages', 'NNS'), ('.', '.'), ('But', 'CC'), ('this', 'DT'), ('terminology', 'NN'), ('is', 
'VBZ'), ('not', 'RB'), ('without', 'IN'), ('its', 'PRP$'), ('problems', 'NNS'), ('either', 'DT'), ('.', '.'), ('Common', 'JJ'), ('languages', 'NNS'), ('like', 'IN'), ('English', 'NNP'), ('are', 'VBP'), ('both', 'DT'), ('formal', 'JJ'), ('and', 'CC'), ('semantic', 'JJ'), (';', ':'), ('although', 'IN'), ('their', 'PRP$'), ('scope', 'NN'), ('extends', 'VBZ'), ('beyond', 'IN'), ('the', 'DT'), ('formal', 'JJ'), (',', ','), ('anything', 'NN'), ('that', 'WDT'), ('can', 'MD'), ('be', 'VB'), ('expressed', 'VBN'), ('in', 'IN'), ('a', 'DT'), ('computer', 'NN'), ('control', 'NN'), ('language', 'NN'), ('can', 'MD'), ('also', 'RB'), ('be', 'VB'), ('expressed', 'VBN'), ('in', 'IN'), ('common', 'JJ'), ('language', 'NN'), ('.', '.'), ('It', 'PRP'), ('follows', 'VBZ'), ('that', 'IN'), ('computer', 'NN'), ('control', 'NN'), ('languages', 'NNS'), ('are', 'VBP'), ('a', 'DT'), ('formal', 'JJ'), ('(', '('), ('and', 'CC'), ('as', 'IN'), ('such', 'JJ'), ('rather', 'RB'), ('primitive', 'JJ'), (')', ')'), ('subset', 'NN'), ('of', 'IN'), ('common', 'JJ'), ('human', 'JJ'), ('languages', 'NNS'), ('.', '.'), ('To', 'TO'), ('complicate', 'VB'), ('things', 'NNS'), ('even', 'RB'), ('further', 'RB'), (',', ','), ('computer', 'NN'), ('science', 'NN'), ('has', 'VBZ'), ('its', 'PRP$'), ('own', 'JJ'), ('understanding', 'NN'), ('of', 'IN'), ('“', 'NNP'), ('operational', 'JJ'), ('semantics', 'NNS'), ('”', 'VBP'), ('in', 'IN'), ('programming', 'NN'), ('languages', 'NNS'), (',', ','), ('for', 'IN'), ('example', 'NN'), ('in', 'IN'), ('the', 'DT'), ('construction', 'NN'), ('of', 'IN'), ('a', 'DT'), ('programming', 'JJ'), ('language', 'NN'), ('interpreter', 'NN'), ('or', 'CC'), ('compiler', 'NN'), ('.', '.'), ('Just', 'RB'), ('as', 'IN'), ('this', 'DT'), ('interpreter', 'NN'), ('doesn', 'NN'), ('’', 'NNP'), ('t', 'NN'), ('perform', 'NN'), ('“', 'NNP'), ('interpretations', 'NNS'), ('”', 'VBP'), ('in', 'IN'), ('a', 'DT'), ('hermeneutic', 'JJ'), ('sense', 'NN'), ('of', 'IN'), ('semantic', 'JJ'), ('text', 'NN'), 
('explication', 'NN'), (',', ','), ('the', 'DT'), ('computer', 'NN'), ('science', 'NN'), ('notion', 'NN'), ('of', 'IN'), ('“', 'JJ'), ('semantics', 'NNS'), ('”', 'JJ'), ('defies', 'NNS'), ('linguistic', 'JJ'), ('and', 'CC'), ('common', 'JJ'), ('sense', 'NN'), ('understanding', 'NN'), ('of', 'IN'), ('the', 'DT'), ('word', 'NN'), (',', ','), ('since', 'IN'), ('compiler', 'NN'), ('construction', 'NN'), ('is', 'VBZ'), ('purely', 'RB'), ('syntactical', 'JJ'), (',', ','), ('and', 'CC'), ('programming', 'VBG'), ('languages', 'NNS'), ('denote', 'VBP'), ('nothing', 'NN'), ('but', 'CC'), ('syntactical', 'JJ'), ('manipulations', 'NNS'), ('of', 'IN'), ('symbols', 'NNS'), ('.', '.'), ('What', 'WP'), ('might', 'MD'), ('more', 'JJR'), ('suitably', 'RB'), ('be', 'VB'), ('called', 'VBN'), ('the', 'DT'), ('semantics', 'NNS'), ('of', 'IN'), ('computer', 'NN'), ('control', 'NN'), ('languages', 'VBZ'), ('resides', 'NNS'), ('in', 'IN'), ('the', 'DT'), ('symbols', 'NNS'), ('with', 'IN'), ('which', 'WDT'), ('those', 'DT'), ('operations', 'NNS'), ('are', 'VBP'), ('denoted', 'VBN'), ('in', 'IN'), ('most', 'JJS'), ('programming', 'JJ'), ('languages', 'NNS'), (':', ':'), ('English', 'JJ'), ('words', 'NNS'), ('like', 'IN'), ('“', 'NN'), ('if', 'IN'), (',', ','), ('”', 'FW'), ('“', 'FW'), ('then', 'RB'), (',', ','), ('”', 'NNP'), ('“', 'NNP'), ('else', 'RB'), (',', ','), ('”', 'NNP'), ('“', 'NNP'), ('for', 'IN'), (',', ','), ('”', 'NNP'), ('“', 'NNP'), ('while', 'IN'), (',', ','), ('”', 'FW'), ('“', 'NNP'), ('goto', 'NN'), (',', ','), ('”', 'NNP'), ('and', 'CC'), ('“', 'NNP'), ('print', 'NN'), (',', ','), ('”', 'NN'), ('in', 'IN'), ('conjunction', 'NN'), ('with', 'IN'), ('arithmetical', 'JJ'), ('and', 'CC'), ('punctuation', 'NN'), ('symbols', 'NNS'), (';', ':'), ('in', 'IN'), ('alphabetic', 'JJ'), ('software', 'NN'), ('controls', 'NNS'), (',', ','), ('words', 'NNS'), ('like', 'IN'), ('“', 'NNP'), ('list', 'NN'), (',', ','), ('”', 'NNP'), ('“', 'NNP'), ('move', 'NN'), (',', ','), ('”', 'NNP'), 
('“', 'NNP'), ('copy', 'NN'), (',', ','), ('”', 'NN'), ('and', 'CC'), ('“', 'NNP'), ('paste', 'NN'), ('”', 'NN'), (';', ':'), ('in', 'IN'), ('graphical', 'JJ'), ('software', 'NN'), ('controls', 'NNS'), (',', ','), ('such', 'JJ'), ('as', 'IN'), ('symbols', 'NNS'), ('like', 'IN'), ('the', 'DT'), ('trash', 'NN'), ('can', 'MD'), ('.', '.'), ('Ferdinand', 'NNP'), ('de', 'IN'), ('Saussure', 'NNP'), ('states', 'VBZ'), ('that', 'IN'), ('the', 'DT'), ('signs', 'NNS'), ('of', 'IN'), ('common', 'JJ'), ('human', 'JJ'), ('language', 'NN'), ('are', 'VBP'), ('arbitrary2', 'RB'), ('because', 'IN'), ('it', 'PRP'), ('’', 'VBZ'), ('s', 'JJ'), ('purely', 'RB'), ('a', 'DT'), ('cultural-social', 'JJ'), ('convention', 'NN'), ('that', 'IN'), ('assigns', 'VBZ'), ('phonemes', 'NNS'), ('to', 'TO'), ('concepts', 'NNS'), ('.', '.'), ('Likewise', 'NNP'), (',', ','), ('it', 'PRP'), ('’', 'VBZ'), ('s', 'JJ'), ('purely', 'RB'), ('a', 'DT'), ('cultural', 'JJ'), ('convention', 'NN'), ('to', 'TO'), ('assign', 'VB'), ('symbols', 'NNS'), ('to', 'TO'), ('machine', 'NN'), ('operations', 'NNS'), ('.', '.'), ('But', 'CC'), ('just', 'RB'), ('as', 'IN'), ('the', 'DT'), ('cultural', 'JJ'), ('choice', 'NN'), ('of', 'IN'), ('phonemes', 'NNS'), ('in', 'IN'), ('spoken', 'JJ'), ('language', 'NN'), ('is', 'VBZ'), ('restrained', 'VBN'), ('by', 'IN'), ('what', 'WP'), ('the', 'DT'), ('human', 'JJ'), ('voice', 'NN'), ('can', 'MD'), ('pronounce', 'VB'), (',', ','), ('the', 'DT'), ('assignment', 'NN'), ('of', 'IN'), ('symbols', 'NNS'), ('to', 'TO'), ('machine', 'NN'), ('operations', 'NNS'), ('is', 'VBZ'), ('limited', 'VBN'), ('to', 'TO'), ('what', 'WP'), ('can', 'MD'), ('be', 'VB'), ('efficiently', 'RB'), ('processed', 'VBN'), ('by', 'IN'), ('the', 'DT'), ('machine', 'NN'), ('and', 'CC'), ('of', 'IN'), ('good', 'JJ'), ('use', 'NN'), ('to', 'TO'), ('humans.3', 'VB'), ('This', 'DT'), ('compromise', 'NN'), ('between', 'IN'), ('operability', 'NN'), ('and', 'CC'), ('usability', 'NN'), ('is', 'VBZ'), ('obvious', 'JJ'), ('in', 
'IN'), (',', ','), ('for', 'IN'), ('example', 'NN'), (',', ','), ('Unix', 'NNP'), ('commands', 'VBZ'), ('.', '.'), ('Originally', 'RB'), ('used', 'VBN'), ('on', 'IN'), ('teletype', 'NN'), ('terminals', 'NNS'), (',', ','), ('the', 'DT'), ('operation', 'NN'), ('“', 'NNP'), ('copy', 'NN'), ('”', 'NN'), ('was', 'VBD'), ('abbreviated', 'VBN'), ('to', 'TO'), ('the', 'DT'), ('command', 'NN'), ('“', 'NNP'), ('cp', 'NN'), (',', ','), ('”', 'NNP'), ('“', 'NNP'), ('move', 'NN'), ('”', 'NN'), ('to', 'TO'), ('“', 'VB'), ('mv', 'NN'), (',', ','), ('”', 'NNP'), ('“', 'NNP'), ('list', 'NN'), ('”', 'NN'), ('to', 'TO'), ('“', 'VB'), ('ls', 'NN'), (',', ','), ('”', 'NNP'), ('etc.', 'NN'), (',', ','), ('in', 'IN'), ('order', 'NN'), ('to', 'TO'), ('cut', 'VB'), ('down', 'RP'), ('machine', 'NN'), ('memory', 'NN'), ('use', 'NN'), (',', ','), ('teletype', 'JJ'), ('paper', 'NN'), ('consumption', 'NN'), (',', ','), ('and', 'CC'), ('human', 'JJ'), ('typing', 'VBG'), ('effort', 'NN'), ('at', 'IN'), ('the', 'DT'), ('same', 'JJ'), ('time', 'NN'), ('.', '.'), ('Any', 'DT'), ('computer', 'NN'), ('control', 'NN'), ('language', 'NN'), ('is', 'VBZ'), ('thus', 'RB'), ('a', 'DT'), ('cultural', 'JJ'), ('compromise', 'NN'), ('between', 'IN'), ('the', 'DT'), ('constraints', 'NNS'), ('of', 'IN'), ('machine', 'NN'), ('design—which', 'NN'), ('is', 'VBZ'), ('far', 'RB'), ('from', 'IN'), ('objective', 'JJ'), (',', ','), ('but', 'CC'), ('based', 'VBN'), ('on', 'IN'), ('human', 'JJ'), ('choices', 'NNS'), (',', ','), ('culture', 'NN'), (',', ','), ('and', 'CC'), ('thinking', 'VBG'), ('style', 'NN'), ('itself', 'PRP'), ('4—and', 'CD'), ('the', 'DT'), ('equally', 'RB'), ('subjective', 'JJ'), ('user', 'NN'), ('preferences', 'NNS'), (',', ','), ('involving', 'VBG'), ('fuzzy', 'JJ'), ('factors', 'NNS'), ('like', 'IN'), ('readability', 'NN'), (',', ','), ('elegance', 'NN'), (',', ','), ('and', 'CC'), ('usage', 'JJ'), ('efficiency', 'NN'), ('.', '.'), ('The', 'DT'), ('symbols', 'NNS'), ('of', 'IN'), ('computer', 'NN'), 
('control', 'NN'), ('languages', 'VBZ'), ('inevitably', 'RB'), ('do', 'VBP'), ('have', 'VB'), ('semantic', 'JJ'), ('connotations', 'NNS'), ('simply', 'RB'), ('because', 'IN'), ('there', 'EX'), ('exist', 'VBP'), ('no', 'DT'), ('symbols', 'NNS'), ('with', 'IN'), ('which', 'WDT'), ('humans', 'NNS'), ('would', 'MD'), ('not', 'RB'), ('associate', 'VB'), ('some', 'DT'), ('meaning', 'NN'), ('.', '.'), ('But', 'CC'), ('symbols', 'NNS'), ('can', 'MD'), ('’', 'VB'), ('t', 'JJ'), ('denote', 'NN'), ('any', 'DT'), ('semantic', 'JJ'), ('statements', 'NNS'), (',', ','), ('that', 'DT'), ('is', 'VBZ'), (',', ','), ('they', 'PRP'), ('do', 'VBP'), ('not', 'RB'), ('express', 'VB'), ('meaning', 'VBG'), ('in', 'IN'), ('their', 'PRP$'), ('own', 'JJ'), ('terms', 'NNS'), (';', ':'), ('humans', 'NNS'), ('metaphorically', 'RB'), ('read', 'VB'), ('meaning', 'VBG'), ('into', 'IN'), ('them', 'PRP'), ('through', 'IN'), ('associations', 'NNS'), ('they', 'PRP'), ('make', 'VBP'), ('.', '.'), ('Languages', 'NNS'), ('without', 'IN'), ('semantic', 'JJ'), ('denotation', 'NN'), ('are', 'VBP'), ('not', 'RB'), ('historically', 'RB'), ('new', 'JJ'), ('phenomena', 'NNS'), (';', ':'), ('mathematical', 'JJ'), ('formulas', 'NNS'), ('are', 'VBP'), ('their', 'PRP$'), ('oldest', 'JJS'), ('example', 'NN'), ('.', '.'), ('In', 'IN'), ('comparison', 'NN'), ('to', 'TO'), ('common', 'JJ'), ('human', 'JJ'), ('languages', 'NNS'), (',', ','), ('the', 'DT'), ('multitude', 'NN'), ('of', 'IN'), ('programming', 'VBG'), ('languages', 'NNS'), ('is', 'VBZ'), ('of', 'IN'), ('lesser', 'JJR'), ('significance', 'NN'), ('.', '.'), ('The', 'DT'), ('criterion', 'NN'), ('of', 'IN'), ('Turing', 'NNP'), ('completeness', 'NN'), ('of', 'IN'), ('a', 'DT'), ('programming', 'NN'), ('language', 'NN'), (',', ','), ('that', 'WDT'), ('is', 'VBZ'), (',', ','), ('that', 'IN'), ('any', 'DT'), ('computation', 'NN'), ('can', 'MD'), ('be', 'VB'), ('expressed', 'VBN'), ('in', 'IN'), ('it', 'PRP'), (',', ','), ('means', 'VBZ'), ('that', 'IN'), ('every', 
'DT'), ('programming', 'NN'), ('language', 'NN'), ('is', 'VBZ'), (',', ','), ('formally', 'RB'), ('speaking', 'VBG'), (',', ','), ('just', 'RB'), ('a', 'DT'), ('riff', 'NN'), ('on', 'IN'), ('every', 'DT'), ('other', 'JJ'), ('programming', 'NN'), ('language', 'NN'), ('.', '.'), ('Nothing', 'NN'), ('can', 'MD'), ('be', 'VB'), ('expressed', 'VBN'), ('in', 'IN'), ('a', 'DT'), ('Turingcomplete', 'JJ'), ('language', 'NN'), ('such', 'JJ'), ('as', 'IN'), ('C', 'NNP'), ('that', 'IN'), ('couldn', 'NN'), ('’', 'NNP'), ('t', 'NN'), ('also', 'RB'), ('be', 'VB'), ('expressed', 'VBN'), ('in', 'IN'), ('another', 'DT'), ('Turingcomplete', 'NNP'), ('language', 'NN'), ('such', 'JJ'), ('as', 'IN'), ('Lisp', 'NNP'), ('(', '('), ('or', 'CC'), ('Fortran', 'NNP'), (',', ','), ('Smalltalk', 'NNP'), (',', ','), ('Java', 'NNP'), ('...', ':'), (')', ')'), ('and', 'CC'), ('vice', 'NN'), ('versa', 'NN'), ('.', '.'), ('This', 'DT'), ('ultimately', 'JJ'), ('proves', 'VBZ'), ('the', 'DT'), ...]
# Break the example phrase into a list of words, splitting on whitespace.
words = str.split(
    "in the beginning was heaven and earth and the time of the whatever"
)
words
['in', 'the', 'beginning', 'was', 'heaven', 'and', 'earth', 'and', 'the', 'time', 'of', 'the', 'whatever']
words.index("the")
1
# Walk the list with positions: every word other than "the" is printed in
# upper case; each "the" is printed unchanged together with its index.
for i, word in enumerate(words):
    if word != "the":
        print (word.upper())
        continue
    print (i, word)
IN 1 the BEGINNING WAS HEAVEN AND EARTH AND 8 the TIME OF 11 the WHATEVER
import random

# Build a dictionary with a single "VB" entry, fill that entry with every
# token of the example phrase, then draw one token from it at random.
words = {"VB": []}
for word in nltk.word_tokenize("in the beginning was heaven and earth and the time of the whatever"):
    words["VB"].append(word)
random.choice(words["VB"])
'in'