You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
58 lines
2.2 KiB
Python
58 lines
2.2 KiB
Python
from __future__ import print_function
|
|
from __future__ import unicode_literals
|
|
|
|
from builtins import str, bytes, dict, int
|
|
|
|
import os
|
|
import sys
|
|
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))
|
|
|
|
from pattern.en import parse, pprint, tag
|
|
|
|
# The en module contains a fast regular expressions-based parser.
|
|
# A parser identifies words in a sentence, word part-of-speech tags (e.g. noun, verb)
|
|
# and groups of words that belong together (e.g. noun phrases).
|
|
# Common part-of-speech tags: NN (noun), VB (verb), JJ (adjective), PP (preposition).
|
|
# A tag can have a suffix, for example NNS (plural noun) or VBG (gerund verb).
|
|
# Overview of tags: http://www.clips.ua.ac.be/pages/mbsp-tags
|
|
s = "I eat pizza with a fork."
|
|
s = parse(s,
|
|
tokenize = True, # Tokenize the input, i.e. split punctuation from words.
|
|
tags = True, # Find part-of-speech tags.
|
|
chunks = True, # Find chunk tags, e.g. "the black cat" = NP = noun phrase.
|
|
relations = True, # Find relations between chunks.
|
|
lemmata = True, # Find word lemmata.
|
|
light = False)
|
|
|
|
# The light parameter determines how unknown words are handled.
|
|
# By default, unknown words are tagged NN and then improved with a set of rules.
|
|
# light=False uses Brill's lexical and contextual rules,
|
|
# light=True uses a set of custom rules that is less accurate but faster (5x-10x).
|
|
|
|
# The output is a string with each sentence on a new line.
|
|
# Words in a sentence have been annotated with tags,
|
|
# for example: fork/NN/I-NP/I-PNP
|
|
# NN = noun, NP = part of a noun phrase, PNP = part of a prepositional phrase.
|
|
print(s)
|
|
print("")
|
|
|
|
# Prettier output can be obtained with the pprint() command:
|
|
pprint(s)
|
|
print("")
|
|
|
|
# The string's split() method will (unless a split character is given),
|
|
# split into a list of sentences, where each sentence is a list of words
|
|
# and each word is a list with the word + its tags.
|
|
print(s.split())
|
|
print("")
|
|
|
|
# The tag() command returns a list of (word, POS-tag)-tuples.
|
|
# With light=True, this is the fastest and simplest way to get an idea
|
|
# of a sentence's constituents:
|
|
s = "I eat pizza with a fork."
|
|
s = tag(s)
|
|
print(s)
|
|
for word, tag in s:
|
|
if tag == "NN": # Find all nouns in the input string.
|
|
print(word)
|