You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
60 lines
1.6 KiB
Python
60 lines
1.6 KiB
Python
# Natural Language Toolkit: Stemmers
|
|
#
|
|
# Copyright (C) 2001-2019 NLTK Project
|
|
# Author: Trevor Cohn <tacohn@cs.mu.oz.au>
|
|
# Edward Loper <edloper@gmail.com>
|
|
# Steven Bird <stevenbird1@gmail.com>
|
|
# URL: <http://nltk.org/>
|
|
# For license information, see LICENSE.TXT
|
|
from __future__ import unicode_literals
|
|
import re
|
|
|
|
from nltk.stem.api import StemmerI
|
|
from nltk.compat import python_2_unicode_compatible
|
|
|
|
|
|
@python_2_unicode_compatible
class RegexpStemmer(StemmerI):
    """
    A stemmer driven by a single regular expression that describes
    morphological affixes.  Every substring of a word that matches the
    expression is deleted; words shorter than ``min`` characters are
    returned untouched.

        >>> from nltk.stem import RegexpStemmer
        >>> st = RegexpStemmer('ing$|s$|e$|able$', min=4)
        >>> st.stem('cars')
        'car'
        >>> st.stem('mass')
        'mas'
        >>> st.stem('was')
        'was'
        >>> st.stem('bee')
        'bee'
        >>> st.stem('compute')
        'comput'
        >>> st.stem('advisable')
        'advis'

    :type regexp: str or regexp
    :param regexp: The regular expression used to recognise the
        morphological affixes that should be stripped.
    :type min: int
    :param min: Words whose length is below this value are not stemmed.
    """

    def __init__(self, regexp, min=0):
        # Anything exposing a ``pattern`` attribute is treated as an
        # already-compiled expression; everything else is compiled here.
        already_compiled = hasattr(regexp, 'pattern')
        self._regexp = regexp if already_compiled else re.compile(regexp)
        self._min = min

    def stem(self, word):
        """
        Return ``word`` with all matches of the affix expression removed,
        or ``word`` itself when it is shorter than the configured minimum.
        """
        if len(word) < self._min:
            return word
        return self._regexp.sub('', word)

    def __repr__(self):
        pattern = self._regexp.pattern
        return '<RegexpStemmer: {!r}>'.format(pattern)