You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
51 lines
1.8 KiB
Python
51 lines
1.8 KiB
Python
5 years ago
|
from __future__ import print_function
|
||
|
from __future__ import unicode_literals
|
||
|
|
||
|
from builtins import str, bytes, dict, int
|
||
|
|
||
|
import os
|
||
|
import sys
|
||
|
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))
|
||
|
|
||
|
from pattern.search import search
|
||
|
from pattern.en import parsetree
|
||
|
|
||
|
# The pattern.search module contains a number of pattern matching tools
|
||
|
# to search a string syntactically (word function) or semantically (word meaning).
|
||
|
# If you only need to match string characters, regular expressions are faster.
|
||
|
# However, if you are scanning a sentence for concept types (e.g. all flowers)
|
||
|
# or parts-of-speech (e.g. all adjectives), this module provides the functionality.
|
||
|
|
||
|
# In the simplest case, the search() function
|
||
|
# takes a word (or a sequence of words) that you want to retrieve:
|
||
|
print(search("rabbit", "big white rabbit"))
|
||
|
print("")
|
||
|
|
||
|
# Search words can contain wildcard characters:
|
||
|
print(search("rabbit*", "big white rabbit"))
|
||
|
print(search("rabbit*", "big white rabbits"))
|
||
|
print("")
|
||
|
|
||
|
# Search words can contain different options:
|
||
|
print(search("rabbit|cony|bunny", "big black bunny"))
|
||
|
print("")
|
||
|
|
||
|
# Things become more interesting if we involve the pattern.en.parser module.
|
||
|
# The parser takes a string, identifies words, and assigns a part-of-speech tag
|
||
|
# to each word, for example NN (noun) or JJ (adjective).
|
||
|
# A parsed sentence can be scanned for part-of-speech tags:
|
||
|
s = parsetree("big white rabbit")
|
||
|
print(search("JJ", s)) # all adjectives
|
||
|
print(search("NN", s)) # all nouns
|
||
|
print(search("NP", s)) # all noun phrases
|
||
|
print("")
|
||
|
|
||
|
# Since the search() is case-insensitive, uppercase search words
|
||
|
# are always considered to be tags (or taxonomy terms - see further examples).
|
||
|
|
||
|
# The return value is a Match object,
|
||
|
# where Match.words is a list of Word objects that matched:
|
||
|
m = search("NP", s)
|
||
|
for word in m[0].words:
|
||
|
print(word.string, word.tag)
|