You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

47 lines
1.7 KiB
Python

5 years ago
from __future__ import print_function
from __future__ import unicode_literals
from builtins import str, bytes, dict, int
import os
import sys
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))
from pattern.search import search
from pattern.en import parsetree
# Constraints ending in "?" are optional, matching one or no word.
# Pattern.search() uses a "greedy" approach:
# it will attempt to include as many optional constraints as possible.
# The following pattern scans for words whose part-of-speech tag is NN (i.e. nouns).
# A preceding adjective, adverb or determiner is picked up as well.
# Run the same optional-constraint pattern over a handful of sample sentences.
# The trailing comment on each sentence gives the tags the example expects.
for s in (
        "the cat",                                 # DT NN
        "the very black cat",                      # DT RB JJ NN
        "tasty cat food",                          # JJ NN NN
        "the funny black cat",                     # JJ NN
        "very funny",                              # RB JJ => no match, since there is no noun.
        "my cat is black and your cat is white"):  # NN + NN
    # Parse the sentence, then search the parse tree:
    # optional DT, RB and JJ (in that order) followed by one or more NN.
    t = parsetree(s)
    m = search("DT? RB? JJ? NN+", t)
    print("")
    print(t)
    print(m)
    # An empty result is falsy, so sentences without a match are skipped here.
    if m:
        # Only the first match is detailed:
        # show which constraint of the pattern each of its words satisfied.
        for w in m[0].words:
            print("%s matches %s" % (w, m[0].constraint(w)))
# Before Pattern 2.4, "( )" was used instead of "?".
# For example: "(JJ)" instead of "JJ?".
# The syntax was changed to resemble regular expressions, which use "?".
# The old syntax "(JJ)" still works in Pattern 2.4, but it may change later.
# Note: the above pattern could also be written as "DT|RB|JJ?+ NN+"
# to include multiple adverbs/adjectives.
# By combining "*", "?" and "+", patterns can become quite complex.
# Optional constraints are useful for very specific patterns, but slow.
# Also, depending on which parser you use (e.g. MBSP), words can be tagged differently
# and may not match in the way you expect.
# Consider using a simple, robust "NP" search pattern.