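# NOTE: the lines below are residue from the repository web viewer, not part of the script.
# You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
#
# 83 lines
# 2.7 KiB
# Python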

from __future__ import print_function
from __future__ import unicode_literals
from builtins import str, bytes, dict, int
import os
import sys
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))
from pattern.search import search, taxonomy, Classifier
from pattern.en import parsetree
# The search module includes a Taxonomy class
# that can be used to define semantic word types.
# For example, consider that you want to extract flower names from a text.
# This would make search patterns somewhat unwieldy:
# search("rose|lily|daisy|daffodil|begonia", txt).
# A better approach is to use the taxonomy:
# register every flower name under the semantic type "flower".
for flower in ("rose", "lily", "daisy", "daffodil", "begonia"):
    taxonomy.append(flower, type="flower")

print(taxonomy.children("flower"))
print(taxonomy.parents("rose"))
print(taxonomy.classify("rose"))  # Yields the most recently added parent.
print("")
# A taxonomy type written in uppercase can be used as a term in a search
# pattern: "FLOWER" below refers to the "flower" type defined in the taxonomy.
# NOTE(review): lemmata=True presumably lets the plural "daffodils" match the
# singular taxonomy entry through its lemma - confirm against pattern.en.
t = parsetree("A field of white daffodils.", lemmata=True)
m = search("FLOWER", t)
# Show the parsed sentence, then the matches, then a blank separator line.
for output in (t, m, ""):
    print(output)
# Another example: a term may have several types ("chicken" is both food
# and a bird), and a type can itself be given a type ("bird" is an "animal").
for term, parent in (
    ("chicken", "food"),
    ("chicken", "bird"),
    ("penguin", "bird"),
    ("bird", "animal"),
):
    taxonomy.append(term, type=parent)

chicken_parents = taxonomy.parents("chicken")
print(chicken_parents)
animal_descendants = taxonomy.children("animal", recursive=True)
print(animal_descendants)
food_matches = search("FOOD", "I'm eating chicken.")
print(food_matches)
print("")
# The advantage is that the taxonomy can hold an entire hierarchy.
# For example, "flower" could be classified as "organism".
# Other organisms could be defined as well (insects, trees, mammals, ...)
# The ORGANISM constraint then matches everything that is an organism.
# A taxonomy entry can also be a proper name containing spaces
# (e.g. "windows vista", case insensitive).
# It will be detected as long as it is contained in a single chunk:
for os_name in ("windows vista", "ubuntu"):
    taxonomy.append(os_name, type="operating system")

t = parsetree("Which do you like more, Windows Vista, or Ubuntu?")
m = search("OPERATING_SYSTEM", t)
print(t)
print(m)
# Inspect the constituents of the first match.
first_match = m[0]
print(first_match.constituents())
print("")
# Taxonomy entries cannot have wildcards (*),
# but you can use a classifier to simulate this.
# Classifiers are quite slow but useful in many ways.
# For example, a classifier could be written to dynamically
# retrieve word categories from WordNet.
def find_parents(word):
    """Return the taxonomy parents for operating system names.

    A word counts as an operating system when it starts with "mac os",
    "windows" or "ubuntu" (compared case-insensitively), so versioned
    names such as "windows xp" are covered without needing wildcards.

    Args:
        word: the term to classify, as a string.

    Returns:
        ["operating system"] when the word has a recognized prefix,
        otherwise an empty list.
    """
    if word.lower().startswith(("mac os", "windows", "ubuntu")):
        return ["operating system"]
    # Always return a list so callers never have to special-case None.
    return []
# Wrap find_parents in a Classifier and register it with the taxonomy,
# then search a sentence that mentions several operating systems.
c = Classifier(parents=find_parents)
taxonomy.classifiers.append(c)

t = parsetree("I like Mac OS X 10.5 better than Windows XP or Ubuntu.")
m = search("OPERATING_SYSTEM", t)
print(t)
print(m)
# Show the constituents of the first two matches, in order.
for index in (0, 1):
    print(m[index].constituents())
print("")