# Example: defining semantic word types with the pattern.search taxonomy.
from __future__ import print_function
from __future__ import unicode_literals

from builtins import str, bytes, dict, int

import os
import sys
# Put the directory two levels above this script first on the import path,
# ahead of the pattern imports below.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))

from pattern.search import search, taxonomy, Classifier
from pattern.en import parsetree

# The search module includes a Taxonomy class
# that can be used to define semantic word types.
# For example, consider that you want to extract flower names from a text.
# This would make search patterns somewhat unwieldy:
# search("rose|lily|daisy|daffodil|begonia", txt).

# A better approach is to use the taxonomy:
for flower in ("rose", "lily", "daisy", "daffodil", "begonia"):
    taxonomy.append(flower, type="flower")

print(taxonomy.children("flower"))
print(taxonomy.parents("rose"))
print(taxonomy.classify("rose"))  # Yields the most recently added parent.
print("")

# Taxonomy terms can be included in a pattern by using uppercase:
t = parsetree("A field of white daffodils.", lemmata=True)
m = search("FLOWER", t)
print(t)
print(m)
print("")

# Another example:
# "chicken" is given two parents (food and bird), and "bird" is itself
# typed as "animal".
taxonomy.append("chicken", type="food")
taxonomy.append("chicken", type="bird")
taxonomy.append("penguin", type="bird")
taxonomy.append("bird", type="animal")
print(taxonomy.parents("chicken"))
print(taxonomy.children("animal", recursive=True))
print(search("FOOD", "I'm eating chicken."))
print("")

# The advantage is that the taxonomy can hold an entire hierarchy.
# For example, "flower" could be classified as "organism".
# Other organisms could be defined as well (insects, trees, mammals, ...).
# The ORGANISM constraint then matches everything that is an organism.

# A taxonomy entry can also be a proper name containing spaces
# (e.g. "windows vista", case insensitive).
# It will be detected as long as it is contained in a single chunk:
taxonomy.append("windows vista", type="operating system")
taxonomy.append("ubuntu", type="operating system")

t = parsetree("Which do you like more, Windows Vista, or Ubuntu?")
m = search("OPERATING_SYSTEM", t)
print(t)
print(m)
print(m[0].constituents())
print("")

# Taxonomy entries cannot have wildcards (*),
# but you can use a classifier to simulate this.
# Classifiers are quite slow but useful in many ways.
# For example, a classifier could be written to dynamically
# retrieve word categories from WordNet.


def find_parents(word):
    """Return the taxonomy parents of a word that names an operating system.

    Args:
        word: The term to classify. It is compared as-is (case-sensitively)
            against the lowercase prefixes "mac os", "windows" and "ubuntu".

    Returns:
        ["operating system"] if the word starts with one of those prefixes,
        otherwise None.
    """
    # startswith() accepts a tuple, so one call tests all prefixes.
    if word.startswith(("mac os", "windows", "ubuntu")):
        return ["operating system"]
    return None


# Wrap find_parents in a Classifier and register it with the taxonomy.
c = Classifier(parents=find_parents)
taxonomy.classifiers.append(c)

t = parsetree("I like Mac OS X 10.5 better than Windows XP or Ubuntu.")
m = search("OPERATING_SYSTEM", t)
print(t)
print(m)
print(m[0].constituents())
print(m[1].constituents())
print("")