You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
81 lines
2.6 KiB
Python
81 lines
2.6 KiB
Python
from __future__ import print_function
|
|
from __future__ import unicode_literals
|
|
|
|
from builtins import str, bytes, dict, int
|
|
|
|
import os
|
|
import sys
|
|
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))
|
|
|
|
from pattern.search import search, Pattern, Constraint
|
|
from pattern.en import parsetree, parse, Sentence
|
|
|
|
# Each space-separated item in a search pattern is a "constraint" (example
# 01-search.py loosely called it a "search word"). A single constraint may
# list several options separated by "|"; consecutive constraints are
# separated by a space.
s = parsetree("big white rabbit")

queries = (
    "NN|JJ",    # words that are a noun OR an adjective
    "JJ NN",    # phrases with an adjective followed by a noun
    "JJ?+ NN",  # a noun preceded by any number of adjectives
)
for query in queries:
    print(search(query, s))
    print("")

# Note: NN marks singular nouns, NNS marks plural nouns.
# To include both, use the wildcard form "NN*" as the constraint.
# The wildcard works for NN*, VB*, JJ*, RB*.

# The same four-word pattern is searched twice: first with the default
# (non-strict) matching, then with strict matching switched on.
phrase = "rabbit stare at feet"

s = parsetree("When I sleep the big white rabbit will stare at my feet.")
m = search(phrase, s)
print(s)
print(m)
print("")
# Why does the first search succeed?
# The result includes "will" although the pattern never mentions it, and one
# would expect the pattern to break when "stare" does not directly follow
# "rabbit". It works because "will stare" is a single verb chunk: the "stare"
# constraint matches the head word of that chunk, so "will stare" counts as an
# overspecified version of "stare". The same applies to "my feet", and to the
# "rabbit" constraint, which matches the overspecified chunk
# "the big white rabbit".

p = Pattern.fromstring(phrase, s)
# With strict matching only what the pattern explicitly defines is accepted,
# so this search yields no match.
p.strict = True
m = p.search(s)
print(m)
print("")

# Constraints may also name a chunk tag (e.g. NP, VP, ADJP) instead of a word.
# The pattern below therefore covers anything from
# "the rabbit gnaws at your fingers" to
# "the white rabbit looks at the carrots":
p = Pattern.fromstring("rabbit VP at NP", s)
m = p.search(s)
print(m)
print("")

# For the first match (if there is one), list every matched word next to the
# constraint of the pattern that it satisfied.
if m:
    first = m[0]
    for word in first.words:
        print("%s\t=> %s" % (word, first.constraint(word)))

print("")
print("-------------------------------------------------------------")

# Finally, constraints can also include regular expressions.
# The search() shorthand cannot express them, so the pattern is assembled
# from Constraint objects using the full syntax:
import re

# One or more digits and decimal points, e.g. "9.5".
# Inside [...] a "|" is a literal pipe character, not alternation, so it must
# not be part of the class; "." needs no escaping there either.
r = re.compile(r"[0-9.]+")

p = Pattern()
p.sequence.append(Constraint(words=[r]))      # first: a word matching the regex
p.sequence.append(Constraint(tags=["NN*"]))   # then: any noun, singular or plural

s = Sentence(parse("I have 9.5 rabbits."))
print(s)
print(p.search(s))
print("")