|
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
|
#
|
|
|
|
|
# Natural Language Toolkit: Snowball Stemmer
|
|
|
|
|
#
|
|
|
|
|
# Copyright (C) 2001-2020 NLTK Project
|
|
|
|
|
# Author: Peter Michael Stahl <pemistahl@gmail.com>
|
|
|
|
|
# Peter Ljunglof <peter.ljunglof@heatherleaf.se> (revisions)
|
|
|
|
|
# Lakhdar Benzahia <lakhdar.benzahia@gmail.com> (co-writer)
|
|
|
|
|
# Assem Chelli <assem.ch@gmail.com> (reviewer arabicstemmer)
|
|
|
|
|
# Abdelkrim Aries <ab_aries@esi.dz> (reviewer arabicstemmer)
|
|
|
|
|
# Algorithms: Dr Martin Porter <martin@tartarus.org>
|
|
|
|
|
# Assem Chelli <assem.ch@gmail.com> arabic stemming algorithm
|
|
|
|
|
# Benzahia Lakhdar <lakhdar.benzahia@gmail.com>
|
|
|
|
|
# URL: <http://nltk.org/>
|
|
|
|
|
# For license information, see LICENSE.TXT
|
|
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
Snowball stemmers
|
|
|
|
|
|
|
|
|
|
This module provides a port of the Snowball stemmers
|
|
|
|
|
developed by Martin Porter.
|
|
|
|
|
|
|
|
|
|
There is also a demo function: `snowball.demo()`.
|
|
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
import re
|
|
|
|
|
|
|
|
|
|
from nltk.corpus import stopwords
|
|
|
|
|
from nltk.stem import porter
|
|
|
|
|
from nltk.stem.util import suffix_replace, prefix_replace
|
|
|
|
|
|
|
|
|
|
from nltk.stem.api import StemmerI
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class SnowballStemmer(StemmerI):
|
|
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
Snowball Stemmer
|
|
|
|
|
|
|
|
|
|
The following languages are supported:
|
|
|
|
|
Arabic, Danish, Dutch, English, Finnish, French, German,
|
|
|
|
|
Hungarian, Italian, Norwegian, Portuguese, Romanian, Russian,
|
|
|
|
|
Spanish and Swedish.
|
|
|
|
|
|
|
|
|
|
The algorithm for English is documented here:
|
|
|
|
|
|
|
|
|
|
Porter, M. \"An algorithm for suffix stripping.\"
|
|
|
|
|
Program 14.3 (1980): 130-137.
|
|
|
|
|
|
|
|
|
|
The algorithms have been developed by Martin Porter.
|
|
|
|
|
These stemmers are called Snowball, because Porter created
|
|
|
|
|
a programming language with this name for creating
|
|
|
|
|
new stemming algorithms. There is more information available
|
|
|
|
|
at http://snowball.tartarus.org/
|
|
|
|
|
|
|
|
|
|
The stemmer is invoked as shown below:
|
|
|
|
|
|
|
|
|
|
>>> from nltk.stem import SnowballStemmer
|
|
|
|
|
>>> print(" ".join(SnowballStemmer.languages)) # See which languages are supported
|
|
|
|
|
arabic danish dutch english finnish french german hungarian
|
|
|
|
|
italian norwegian porter portuguese romanian russian
|
|
|
|
|
spanish swedish
|
|
|
|
|
>>> stemmer = SnowballStemmer("german") # Choose a language
|
|
|
|
|
>>> stemmer.stem("Autobahnen") # Stem a word
|
|
|
|
|
'autobahn'
|
|
|
|
|
|
|
|
|
|
Invoking the stemmers that way is useful if you do not know the
|
|
|
|
|
language to be stemmed at runtime. Alternatively, if you already know
|
|
|
|
|
the language, then you can invoke the language specific stemmer directly:
|
|
|
|
|
|
|
|
|
|
>>> from nltk.stem.snowball import GermanStemmer
|
|
|
|
|
>>> stemmer = GermanStemmer()
|
|
|
|
|
>>> stemmer.stem("Autobahnen")
|
|
|
|
|
'autobahn'
|
|
|
|
|
|
|
|
|
|
:param language: The language whose subclass is instantiated.
|
|
|
|
|
:type language: str or unicode
|
|
|
|
|
:param ignore_stopwords: If set to True, stopwords are
|
|
|
|
|
not stemmed and returned unchanged.
|
|
|
|
|
Set to False by default.
|
|
|
|
|
:type ignore_stopwords: bool
|
|
|
|
|
:raise ValueError: If there is no stemmer for the specified
|
|
|
|
|
language, a ValueError is raised.
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
languages = (
|
|
|
|
|
"arabic",
|
|
|
|
|
"danish",
|
|
|
|
|
"dutch",
|
|
|
|
|
"english",
|
|
|
|
|
"finnish",
|
|
|
|
|
"french",
|
|
|
|
|
"german",
|
|
|
|
|
"hungarian",
|
|
|
|
|
"italian",
|
|
|
|
|
"norwegian",
|
|
|
|
|
"porter",
|
|
|
|
|
"portuguese",
|
|
|
|
|
"romanian",
|
|
|
|
|
"russian",
|
|
|
|
|
"spanish",
|
|
|
|
|
"swedish",
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
def __init__(self, language, ignore_stopwords=False):
|
|
|
|
|
if language not in self.languages:
|
|
|
|
|
raise ValueError("The language '{0}' is not supported.".format(language))
|
|
|
|
|
stemmerclass = globals()[language.capitalize() + "Stemmer"]
|
|
|
|
|
self.stemmer = stemmerclass(ignore_stopwords)
|
|
|
|
|
self.stem = self.stemmer.stem
|
|
|
|
|
self.stopwords = self.stemmer.stopwords
|
|
|
|
|
|
|
|
|
|
def stem(self, token):
|
|
|
|
|
return self.stemmer.stem(self, token)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class _LanguageSpecificStemmer(StemmerI):
|
|
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
This helper subclass offers the possibility
|
|
|
|
|
to invoke a specific stemmer directly.
|
|
|
|
|
This is useful if you already know the language to be stemmed at runtime.
|
|
|
|
|
|
|
|
|
|
Create an instance of the Snowball stemmer.
|
|
|
|
|
|
|
|
|
|
:param ignore_stopwords: If set to True, stopwords are
|
|
|
|
|
not stemmed and returned unchanged.
|
|
|
|
|
Set to False by default.
|
|
|
|
|
:type ignore_stopwords: bool
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
def __init__(self, ignore_stopwords=False):
|
|
|
|
|
# The language is the name of the class, minus the final "Stemmer".
|
|
|
|
|
language = type(self).__name__.lower()
|
|
|
|
|
if language.endswith("stemmer"):
|
|
|
|
|
language = language[:-7]
|
|
|
|
|
|
|
|
|
|
self.stopwords = set()
|
|
|
|
|
if ignore_stopwords:
|
|
|
|
|
try:
|
|
|
|
|
for word in stopwords.words(language):
|
|
|
|
|
self.stopwords.add(word)
|
|
|
|
|
except IOError:
|
|
|
|
|
raise ValueError(
|
|
|
|
|
"{!r} has no list of stopwords. Please set"
|
|
|
|
|
" 'ignore_stopwords' to 'False'.".format(self)
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
def __repr__(self):
|
|
|
|
|
"""
|
|
|
|
|
Print out the string representation of the respective class.
|
|
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
return "<{0}>".format(type(self).__name__)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class PorterStemmer(_LanguageSpecificStemmer, porter.PorterStemmer):
|
|
|
|
|
"""
|
|
|
|
|
A word stemmer based on the original Porter stemming algorithm.
|
|
|
|
|
|
|
|
|
|
Porter, M. \"An algorithm for suffix stripping.\"
|
|
|
|
|
Program 14.3 (1980): 130-137.
|
|
|
|
|
|
|
|
|
|
A few minor modifications have been made to Porter's basic
|
|
|
|
|
algorithm. See the source code of the module
|
|
|
|
|
nltk.stem.porter for more information.
|
|
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
def __init__(self, ignore_stopwords=False):
|
|
|
|
|
_LanguageSpecificStemmer.__init__(self, ignore_stopwords)
|
|
|
|
|
porter.PorterStemmer.__init__(self)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class _ScandinavianStemmer(_LanguageSpecificStemmer):
|
|
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
This subclass encapsulates a method for defining the string region R1.
|
|
|
|
|
It is used by the Danish, Norwegian, and Swedish stemmer.
|
|
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
def _r1_scandinavian(self, word, vowels):
|
|
|
|
|
"""
|
|
|
|
|
Return the region R1 that is used by the Scandinavian stemmers.
|
|
|
|
|
|
|
|
|
|
R1 is the region after the first non-vowel following a vowel,
|
|
|
|
|
or is the null region at the end of the word if there is no
|
|
|
|
|
such non-vowel. But then R1 is adjusted so that the region
|
|
|
|
|
before it contains at least three letters.
|
|
|
|
|
|
|
|
|
|
:param word: The word whose region R1 is determined.
|
|
|
|
|
:type word: str or unicode
|
|
|
|
|
:param vowels: The vowels of the respective language that are
|
|
|
|
|
used to determine the region R1.
|
|
|
|
|
:type vowels: unicode
|
|
|
|
|
:return: the region R1 for the respective word.
|
|
|
|
|
:rtype: unicode
|
|
|
|
|
:note: This helper method is invoked by the respective stem method of
|
|
|
|
|
the subclasses DanishStemmer, NorwegianStemmer, and
|
|
|
|
|
SwedishStemmer. It is not to be invoked directly!
|
|
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
r1 = ""
|
|
|
|
|
for i in range(1, len(word)):
|
|
|
|
|
if word[i] not in vowels and word[i - 1] in vowels:
|
|
|
|
|
if 3 > len(word[: i + 1]) > 0:
|
|
|
|
|
r1 = word[3:]
|
|
|
|
|
elif len(word[: i + 1]) >= 3:
|
|
|
|
|
r1 = word[i + 1 :]
|
|
|
|
|
else:
|
|
|
|
|
return word
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
return r1
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class _StandardStemmer(_LanguageSpecificStemmer):
|
|
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
This subclass encapsulates two methods for defining the standard versions
|
|
|
|
|
of the string regions R1, R2, and RV.
|
|
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
def _r1r2_standard(self, word, vowels):
|
|
|
|
|
"""
|
|
|
|
|
Return the standard interpretations of the string regions R1 and R2.
|
|
|
|
|
|
|
|
|
|
R1 is the region after the first non-vowel following a vowel,
|
|
|
|
|
or is the null region at the end of the word if there is no
|
|
|
|
|
such non-vowel.
|
|
|
|
|
|
|
|
|
|
R2 is the region after the first non-vowel following a vowel
|
|
|
|
|
in R1, or is the null region at the end of the word if there
|
|
|
|
|
is no such non-vowel.
|
|
|
|
|
|
|
|
|
|
:param word: The word whose regions R1 and R2 are determined.
|
|
|
|
|
:type word: str or unicode
|
|
|
|
|
:param vowels: The vowels of the respective language that are
|
|
|
|
|
used to determine the regions R1 and R2.
|
|
|
|
|
:type vowels: unicode
|
|
|
|
|
:return: (r1,r2), the regions R1 and R2 for the respective word.
|
|
|
|
|
:rtype: tuple
|
|
|
|
|
:note: This helper method is invoked by the respective stem method of
|
|
|
|
|
the subclasses DutchStemmer, FinnishStemmer,
|
|
|
|
|
FrenchStemmer, GermanStemmer, ItalianStemmer,
|
|
|
|
|
PortugueseStemmer, RomanianStemmer, and SpanishStemmer.
|
|
|
|
|
It is not to be invoked directly!
|
|
|
|
|
:note: A detailed description of how to define R1 and R2
|
|
|
|
|
can be found at http://snowball.tartarus.org/texts/r1r2.html
|
|
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
r1 = ""
|
|
|
|
|
r2 = ""
|
|
|
|
|
for i in range(1, len(word)):
|
|
|
|
|
if word[i] not in vowels and word[i - 1] in vowels:
|
|
|
|
|
r1 = word[i + 1 :]
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
for i in range(1, len(r1)):
|
|
|
|
|
if r1[i] not in vowels and r1[i - 1] in vowels:
|
|
|
|
|
r2 = r1[i + 1 :]
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
return (r1, r2)
|
|
|
|
|
|
|
|
|
|
def _rv_standard(self, word, vowels):
|
|
|
|
|
"""
|
|
|
|
|
Return the standard interpretation of the string region RV.
|
|
|
|
|
|
|
|
|
|
If the second letter is a consonant, RV is the region after the
|
|
|
|
|
next following vowel. If the first two letters are vowels, RV is
|
|
|
|
|
the region after the next following consonant. Otherwise, RV is
|
|
|
|
|
the region after the third letter.
|
|
|
|
|
|
|
|
|
|
:param word: The word whose region RV is determined.
|
|
|
|
|
:type word: str or unicode
|
|
|
|
|
:param vowels: The vowels of the respective language that are
|
|
|
|
|
used to determine the region RV.
|
|
|
|
|
:type vowels: unicode
|
|
|
|
|
:return: the region RV for the respective word.
|
|
|
|
|
:rtype: unicode
|
|
|
|
|
:note: This helper method is invoked by the respective stem method of
|
|
|
|
|
the subclasses ItalianStemmer, PortugueseStemmer,
|
|
|
|
|
RomanianStemmer, and SpanishStemmer. It is not to be
|
|
|
|
|
invoked directly!
|
|
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
rv = ""
|
|
|
|
|
if len(word) >= 2:
|
|
|
|
|
if word[1] not in vowels:
|
|
|
|
|
for i in range(2, len(word)):
|
|
|
|
|
if word[i] in vowels:
|
|
|
|
|
rv = word[i + 1 :]
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
elif word[0] in vowels and word[1] in vowels:
|
|
|
|
|
for i in range(2, len(word)):
|
|
|
|
|
if word[i] not in vowels:
|
|
|
|
|
rv = word[i + 1 :]
|
|
|
|
|
break
|
|
|
|
|
else:
|
|
|
|
|
rv = word[3:]
|
|
|
|
|
|
|
|
|
|
return rv
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class ArabicStemmer(_StandardStemmer):
|
|
|
|
|
"""
|
|
|
|
|
https://github.com/snowballstem/snowball/blob/master/algorithms/arabic/stem_Unicode.sbl (Original Algorithm)
|
|
|
|
|
The Snowball Arabic light Stemmer
|
|
|
|
|
Algorithm : Assem Chelli
|
|
|
|
|
Abdelkrim Aries
|
|
|
|
|
Lakhdar Benzahia
|
|
|
|
|
Nltk Version Author : Lakhdar Benzahia
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
# Normalize_pre stes
|
|
|
|
|
__vocalization = re.compile(
|
|
|
|
|
r"[\u064b-\u064c-\u064d-\u064e-\u064f-\u0650-\u0651-\u0652]"
|
|
|
|
|
) # ً، ٌ، ٍ، َ، ُ، ِ، ّ، ْ
|
|
|
|
|
|
|
|
|
|
__kasheeda = re.compile(r"[\u0640]") # ـ tatweel/kasheeda
|
|
|
|
|
|
|
|
|
|
__arabic_punctuation_marks = re.compile(r"[\u060C-\u061B-\u061F]") # ؛ ، ؟
|
|
|
|
|
|
|
|
|
|
# Normalize_post
|
|
|
|
|
__last_hamzat = ("\u0623", "\u0625", "\u0622", "\u0624", "\u0626") # أ، إ، آ، ؤ، ئ
|
|
|
|
|
|
|
|
|
|
# normalize other hamza's
|
|
|
|
|
__initial_hamzat = re.compile(r"^[\u0622\u0623\u0625]") # أ، إ، آ
|
|
|
|
|
|
|
|
|
|
__waw_hamza = re.compile(r"[\u0624]") # ؤ
|
|
|
|
|
|
|
|
|
|
__yeh_hamza = re.compile(r"[\u0626]") # ئ
|
|
|
|
|
|
|
|
|
|
__alefat = re.compile(r"[\u0623\u0622\u0625]") # أ، إ، آ
|
|
|
|
|
|
|
|
|
|
# Checks
|
|
|
|
|
__checks1 = (
|
|
|
|
|
"\u0643\u0627\u0644",
|
|
|
|
|
"\u0628\u0627\u0644", # بال، كال
|
|
|
|
|
"\u0627\u0644",
|
|
|
|
|
"\u0644\u0644", # لل، ال
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
__checks2 = ("\u0629", "\u0627\u062a") # ة # female plural ات
|
|
|
|
|
|
|
|
|
|
# Suffixes
|
|
|
|
|
__suffix_noun_step1a = (
|
|
|
|
|
"\u064a",
|
|
|
|
|
"\u0643",
|
|
|
|
|
"\u0647", # ي، ك، ه
|
|
|
|
|
"\u0646\u0627",
|
|
|
|
|
"\u0643\u0645",
|
|
|
|
|
"\u0647\u0627",
|
|
|
|
|
"\u0647\u0646",
|
|
|
|
|
"\u0647\u0645", # نا، كم، ها، هن، هم
|
|
|
|
|
"\u0643\u0645\u0627",
|
|
|
|
|
"\u0647\u0645\u0627", # كما، هما
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
__suffix_noun_step1b = "\u0646" # ن
|
|
|
|
|
|
|
|
|
|
__suffix_noun_step2a = ("\u0627", "\u064a", "\u0648") # ا، ي، و
|
|
|
|
|
|
|
|
|
|
__suffix_noun_step2b = "\u0627\u062a" # ات
|
|
|
|
|
|
|
|
|
|
__suffix_noun_step2c1 = "\u062a" # ت
|
|
|
|
|
|
|
|
|
|
__suffix_noun_step2c2 = "\u0629" # ة
|
|
|
|
|
|
|
|
|
|
__suffix_noun_step3 = "\u064a" # ي
|
|
|
|
|
|
|
|
|
|
__suffix_verb_step1 = (
|
|
|
|
|
"\u0647",
|
|
|
|
|
"\u0643", # ه، ك
|
|
|
|
|
"\u0646\u064a",
|
|
|
|
|
"\u0646\u0627",
|
|
|
|
|
"\u0647\u0627",
|
|
|
|
|
"\u0647\u0645", # ني، نا، ها، هم
|
|
|
|
|
"\u0647\u0646",
|
|
|
|
|
"\u0643\u0645",
|
|
|
|
|
"\u0643\u0646", # هن، كم، كن
|
|
|
|
|
"\u0647\u0645\u0627",
|
|
|
|
|
"\u0643\u0645\u0627",
|
|
|
|
|
"\u0643\u0645\u0648", # هما، كما، كمو
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
__suffix_verb_step2a = (
|
|
|
|
|
"\u062a",
|
|
|
|
|
"\u0627",
|
|
|
|
|
"\u0646",
|
|
|
|
|
"\u064a", # ت، ا، ن، ي
|
|
|
|
|
"\u0646\u0627",
|
|
|
|
|
"\u062a\u0627",
|
|
|
|
|
"\u062a\u0646", # نا، تا، تن Past
|
|
|
|
|
"\u0627\u0646",
|
|
|
|
|
"\u0648\u0646",
|
|
|
|
|
"\u064a\u0646", # ان، هن، ين Present
|
|
|
|
|
"\u062a\u0645\u0627", # تما
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
__suffix_verb_step2b = ("\u0648\u0627", "\u062a\u0645") # وا، تم
|
|
|
|
|
|
|
|
|
|
__suffix_verb_step2c = ("\u0648", "\u062a\u0645\u0648") # و # تمو
|
|
|
|
|
|
|
|
|
|
__suffix_all_alef_maqsura = "\u0649" # ى
|
|
|
|
|
|
|
|
|
|
# Prefixes
|
|
|
|
|
__prefix_step1 = (
|
|
|
|
|
"\u0623", # أ
|
|
|
|
|
"\u0623\u0623",
|
|
|
|
|
"\u0623\u0622",
|
|
|
|
|
"\u0623\u0624",
|
|
|
|
|
"\u0623\u0627",
|
|
|
|
|
"\u0623\u0625", # أأ، أآ، أؤ، أا، أإ
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
__prefix_step2a = ("\u0641\u0627\u0644", "\u0648\u0627\u0644") # فال، وال
|
|
|
|
|
|
|
|
|
|
__prefix_step2b = ("\u0641", "\u0648") # ف، و
|
|
|
|
|
|
|
|
|
|
__prefix_step3a_noun = (
|
|
|
|
|
"\u0627\u0644",
|
|
|
|
|
"\u0644\u0644", # لل، ال
|
|
|
|
|
"\u0643\u0627\u0644",
|
|
|
|
|
"\u0628\u0627\u0644", # بال، كال
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
__prefix_step3b_noun = (
|
|
|
|
|
"\u0628",
|
|
|
|
|
"\u0643",
|
|
|
|
|
"\u0644", # ب، ك، ل
|
|
|
|
|
"\u0628\u0628",
|
|
|
|
|
"\u0643\u0643", # بب، كك
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
__prefix_step3_verb = (
|
|
|
|
|
"\u0633\u064a",
|
|
|
|
|
"\u0633\u062a",
|
|
|
|
|
"\u0633\u0646",
|
|
|
|
|
"\u0633\u0623",
|
|
|
|
|
) # سي، ست، سن، سأ
|
|
|
|
|
|
|
|
|
|
__prefix_step4_verb = (
|
|
|
|
|
"\u064a\u0633\u062a",
|
|
|
|
|
"\u0646\u0633\u062a",
|
|
|
|
|
"\u062a\u0633\u062a",
|
|
|
|
|
) # يست، نست، تست
|
|
|
|
|
|
|
|
|
|
# Suffixes added due to Conjugation Verbs
|
|
|
|
|
__conjugation_suffix_verb_1 = ("\u0647", "\u0643") # ه، ك
|
|
|
|
|
|
|
|
|
|
__conjugation_suffix_verb_2 = (
|
|
|
|
|
"\u0646\u064a",
|
|
|
|
|
"\u0646\u0627",
|
|
|
|
|
"\u0647\u0627", # ني، نا، ها
|
|
|
|
|
"\u0647\u0645",
|
|
|
|
|
"\u0647\u0646",
|
|
|
|
|
"\u0643\u0645", # هم، هن، كم
|
|
|
|
|
"\u0643\u0646", # كن
|
|
|
|
|
)
|
|
|
|
|
__conjugation_suffix_verb_3 = (
|
|
|
|
|
"\u0647\u0645\u0627",
|
|
|
|
|
"\u0643\u0645\u0627",
|
|
|
|
|
"\u0643\u0645\u0648",
|
|
|
|
|
) # هما، كما، كمو
|
|
|
|
|
|
|
|
|
|
__conjugation_suffix_verb_4 = ("\u0627", "\u0646", "\u064a") # ا، ن، ي
|
|
|
|
|
|
|
|
|
|
__conjugation_suffix_verb_past = (
|
|
|
|
|
"\u0646\u0627",
|
|
|
|
|
"\u062a\u0627",
|
|
|
|
|
"\u062a\u0646",
|
|
|
|
|
) # نا، تا، تن
|
|
|
|
|
|
|
|
|
|
__conjugation_suffix_verb_present = (
|
|
|
|
|
"\u0627\u0646",
|
|
|
|
|
"\u0648\u0646",
|
|
|
|
|
"\u064a\u0646",
|
|
|
|
|
) # ان، ون، ين
|
|
|
|
|
|
|
|
|
|
# Suffixes added due to derivation Names
|
|
|
|
|
__conjugation_suffix_noun_1 = ("\u064a", "\u0643", "\u0647") # ي، ك، ه
|
|
|
|
|
|
|
|
|
|
__conjugation_suffix_noun_2 = (
|
|
|
|
|
"\u0646\u0627",
|
|
|
|
|
"\u0643\u0645", # نا، كم
|
|
|
|
|
"\u0647\u0627",
|
|
|
|
|
"\u0647\u0646",
|
|
|
|
|
"\u0647\u0645", # ها، هن، هم
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
__conjugation_suffix_noun_3 = (
|
|
|
|
|
"\u0643\u0645\u0627",
|
|
|
|
|
"\u0647\u0645\u0627",
|
|
|
|
|
) # كما، هما
|
|
|
|
|
|
|
|
|
|
# Prefixes added due to derivation Names
|
|
|
|
|
__prefixes1 = ("\u0648\u0627", "\u0641\u0627") # فا، وا
|
|
|
|
|
|
|
|
|
|
__articles_3len = ("\u0643\u0627\u0644", "\u0628\u0627\u0644") # بال كال
|
|
|
|
|
|
|
|
|
|
__articles_2len = ("\u0627\u0644", "\u0644\u0644") # ال لل
|
|
|
|
|
|
|
|
|
|
# Prepositions letters
|
|
|
|
|
__prepositions1 = ("\u0643", "\u0644") # ك، ل
|
|
|
|
|
__prepositions2 = ("\u0628\u0628", "\u0643\u0643") # بب، كك
|
|
|
|
|
|
|
|
|
|
is_verb = True
|
|
|
|
|
is_noun = True
|
|
|
|
|
is_defined = False
|
|
|
|
|
|
|
|
|
|
suffixes_verb_step1_success = False
|
|
|
|
|
suffix_verb_step2a_success = False
|
|
|
|
|
suffix_verb_step2b_success = False
|
|
|
|
|
suffix_noun_step2c2_success = False
|
|
|
|
|
suffix_noun_step1a_success = False
|
|
|
|
|
suffix_noun_step2a_success = False
|
|
|
|
|
suffix_noun_step2b_success = False
|
|
|
|
|
suffixe_noun_step1b_success = False
|
|
|
|
|
prefix_step2a_success = False
|
|
|
|
|
prefix_step3a_noun_success = False
|
|
|
|
|
prefix_step3b_noun_success = False
|
|
|
|
|
|
|
|
|
|
def __normalize_pre(self, token):
|
|
|
|
|
"""
|
|
|
|
|
:param token: string
|
|
|
|
|
:return: normalized token type string
|
|
|
|
|
"""
|
|
|
|
|
# strip diacritics
|
|
|
|
|
token = self.__vocalization.sub("", token)
|
|
|
|
|
# strip kasheeda
|
|
|
|
|
token = self.__kasheeda.sub("", token)
|
|
|
|
|
# strip punctuation marks
|
|
|
|
|
token = self.__arabic_punctuation_marks.sub("", token)
|
|
|
|
|
return token
|
|
|
|
|
|
|
|
|
|
def __normalize_post(self, token):
|
|
|
|
|
# normalize last hamza
|
|
|
|
|
for hamza in self.__last_hamzat:
|
|
|
|
|
if token.endswith(hamza):
|
|
|
|
|
token = suffix_replace(token, hamza, "\u0621")
|
|
|
|
|
break
|
|
|
|
|
# normalize other hamzat
|
|
|
|
|
token = self.__initial_hamzat.sub("\u0627", token)
|
|
|
|
|
token = self.__waw_hamza.sub("\u0648", token)
|
|
|
|
|
token = self.__yeh_hamza.sub("\u064a", token)
|
|
|
|
|
token = self.__alefat.sub("\u0627", token)
|
|
|
|
|
return token
|
|
|
|
|
|
|
|
|
|
def __checks_1(self, token):
|
|
|
|
|
for prefix in self.__checks1:
|
|
|
|
|
if token.startswith(prefix):
|
|
|
|
|
if prefix in self.__articles_3len and len(token) > 4:
|
|
|
|
|
self.is_noun = True
|
|
|
|
|
self.is_verb = False
|
|
|
|
|
self.is_defined = True
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
if prefix in self.__articles_2len and len(token) > 3:
|
|
|
|
|
self.is_noun = True
|
|
|
|
|
self.is_verb = False
|
|
|
|
|
self.is_defined = True
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
def __checks_2(self, token):
|
|
|
|
|
for suffix in self.__checks2:
|
|
|
|
|
if token.endswith(suffix):
|
|
|
|
|
if suffix == "\u0629" and len(token) > 2:
|
|
|
|
|
self.is_noun = True
|
|
|
|
|
self.is_verb = False
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
if suffix == "\u0627\u062a" and len(token) > 3:
|
|
|
|
|
self.is_noun = True
|
|
|
|
|
self.is_verb = False
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
def __Suffix_Verb_Step1(self, token):
|
|
|
|
|
for suffix in self.__suffix_verb_step1:
|
|
|
|
|
if token.endswith(suffix):
|
|
|
|
|
if suffix in self.__conjugation_suffix_verb_1 and len(token) >= 4:
|
|
|
|
|
token = token[:-1]
|
|
|
|
|
self.suffixes_verb_step1_success = True
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
if suffix in self.__conjugation_suffix_verb_2 and len(token) >= 5:
|
|
|
|
|
token = token[:-2]
|
|
|
|
|
self.suffixes_verb_step1_success = True
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
if suffix in self.__conjugation_suffix_verb_3 and len(token) >= 6:
|
|
|
|
|
token = token[:-3]
|
|
|
|
|
self.suffixes_verb_step1_success = True
|
|
|
|
|
break
|
|
|
|
|
return token
|
|
|
|
|
|
|
|
|
|
def __Suffix_Verb_Step2a(self, token):
|
|
|
|
|
for suffix in self.__suffix_verb_step2a:
|
|
|
|
|
if token.endswith(suffix) and len(token) > 3:
|
|
|
|
|
if suffix == "\u062a" and len(token) >= 4:
|
|
|
|
|
token = token[:-1]
|
|
|
|
|
self.suffix_verb_step2a_success = True
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
if suffix in self.__conjugation_suffix_verb_4 and len(token) >= 4:
|
|
|
|
|
token = token[:-1]
|
|
|
|
|
self.suffix_verb_step2a_success = True
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
if suffix in self.__conjugation_suffix_verb_past and len(token) >= 5:
|
|
|
|
|
token = token[:-2] # past
|
|
|
|
|
self.suffix_verb_step2a_success = True
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
if suffix in self.__conjugation_suffix_verb_present and len(token) > 5:
|
|
|
|
|
token = token[:-2] # present
|
|
|
|
|
self.suffix_verb_step2a_success = True
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
if suffix == "\u062a\u0645\u0627" and len(token) >= 6:
|
|
|
|
|
token = token[:-3]
|
|
|
|
|
self.suffix_verb_step2a_success = True
|
|
|
|
|
break
|
|
|
|
|
return token
|
|
|
|
|
|
|
|
|
|
def __Suffix_Verb_Step2c(self, token):
|
|
|
|
|
for suffix in self.__suffix_verb_step2c:
|
|
|
|
|
if token.endswith(suffix):
|
|
|
|
|
if suffix == "\u062a\u0645\u0648" and len(token) >= 6:
|
|
|
|
|
token = token[:-3]
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
if suffix == "\u0648" and len(token) >= 4:
|
|
|
|
|
token = token[:-1]
|
|
|
|
|
break
|
|
|
|
|
return token
|
|
|
|
|
|
|
|
|
|
def __Suffix_Verb_Step2b(self, token):
|
|
|
|
|
for suffix in self.__suffix_verb_step2b:
|
|
|
|
|
if token.endswith(suffix) and len(token) >= 5:
|
|
|
|
|
token = token[:-2]
|
|
|
|
|
self.suffix_verb_step2b_success = True
|
|
|
|
|
break
|
|
|
|
|
return token
|
|
|
|
|
|
|
|
|
|
def __Suffix_Noun_Step2c2(self, token):
|
|
|
|
|
for suffix in self.__suffix_noun_step2c2:
|
|
|
|
|
if token.endswith(suffix) and len(token) >= 3:
|
|
|
|
|
token = token[:-1]
|
|
|
|
|
self.suffix_noun_step2c2_success = True
|
|
|
|
|
break
|
|
|
|
|
return token
|
|
|
|
|
|
|
|
|
|
def __Suffix_Noun_Step1a(self, token):
|
|
|
|
|
for suffix in self.__suffix_noun_step1a:
|
|
|
|
|
if token.endswith(suffix):
|
|
|
|
|
if suffix in self.__conjugation_suffix_noun_1 and len(token) >= 4:
|
|
|
|
|
token = token[:-1]
|
|
|
|
|
self.suffix_noun_step1a_success = True
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
if suffix in self.__conjugation_suffix_noun_2 and len(token) >= 5:
|
|
|
|
|
token = token[:-2]
|
|
|
|
|
self.suffix_noun_step1a_success = True
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
if suffix in self.__conjugation_suffix_noun_3 and len(token) >= 6:
|
|
|
|
|
token = token[:-3]
|
|
|
|
|
self.suffix_noun_step1a_success = True
|
|
|
|
|
break
|
|
|
|
|
return token
|
|
|
|
|
|
|
|
|
|
def __Suffix_Noun_Step2a(self, token):
|
|
|
|
|
for suffix in self.__suffix_noun_step2a:
|
|
|
|
|
if token.endswith(suffix) and len(token) > 4:
|
|
|
|
|
token = token[:-1]
|
|
|
|
|
self.suffix_noun_step2a_success = True
|
|
|
|
|
break
|
|
|
|
|
return token
|
|
|
|
|
|
|
|
|
|
def __Suffix_Noun_Step2b(self, token):
|
|
|
|
|
for suffix in self.__suffix_noun_step2b:
|
|
|
|
|
if token.endswith(suffix) and len(token) >= 5:
|
|
|
|
|
token = token[:-2]
|
|
|
|
|
self.suffix_noun_step2b_success = True
|
|
|
|
|
break
|
|
|
|
|
return token
|
|
|
|
|
|
|
|
|
|
def __Suffix_Noun_Step2c1(self, token):
|
|
|
|
|
for suffix in self.__suffix_noun_step2c1:
|
|
|
|
|
if token.endswith(suffix) and len(token) >= 4:
|
|
|
|
|
token = token[:-1]
|
|
|
|
|
break
|
|
|
|
|
return token
|
|
|
|
|
|
|
|
|
|
def __Suffix_Noun_Step1b(self, token):
|
|
|
|
|
for suffix in self.__suffix_noun_step1b:
|
|
|
|
|
if token.endswith(suffix) and len(token) > 5:
|
|
|
|
|
token = token[:-1]
|
|
|
|
|
self.suffixe_noun_step1b_success = True
|
|
|
|
|
break
|
|
|
|
|
return token
|
|
|
|
|
|
|
|
|
|
def __Suffix_Noun_Step3(self, token):
|
|
|
|
|
for suffix in self.__suffix_noun_step3:
|
|
|
|
|
if token.endswith(suffix) and len(token) >= 3:
|
|
|
|
|
token = token[:-1] # ya' nisbiya
|
|
|
|
|
break
|
|
|
|
|
return token
|
|
|
|
|
|
|
|
|
|
def __Suffix_All_alef_maqsura(self, token):
|
|
|
|
|
for suffix in self.__suffix_all_alef_maqsura:
|
|
|
|
|
if token.endswith(suffix):
|
|
|
|
|
token = suffix_replace(token, suffix, "\u064a")
|
|
|
|
|
return token
|
|
|
|
|
|
|
|
|
|
def __Prefix_Step1(self, token):
|
|
|
|
|
for prefix in self.__prefix_step1:
|
|
|
|
|
if token.startswith(prefix) and len(token) > 3:
|
|
|
|
|
if prefix == "\u0623\u0623":
|
|
|
|
|
token = prefix_replace(token, prefix, "\u0623")
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
elif prefix == "\u0623\u0622":
|
|
|
|
|
token = prefix_replace(token, prefix, "\u0622")
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
elif prefix == "\u0623\u0624":
|
|
|
|
|
token = prefix_replace(token, prefix, "\u0624")
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
elif prefix == "\u0623\u0627":
|
|
|
|
|
token = prefix_replace(token, prefix, "\u0627")
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
elif prefix == "\u0623\u0625":
|
|
|
|
|
token = prefix_replace(token, prefix, "\u0625")
|
|
|
|
|
break
|
|
|
|
|
return token
|
|
|
|
|
|
|
|
|
|
def __Prefix_Step2a(self, token):
|
|
|
|
|
for prefix in self.__prefix_step2a:
|
|
|
|
|
if token.startswith(prefix) and len(token) > 5:
|
|
|
|
|
token = token[len(prefix) :]
|
|
|
|
|
self.prefix_step2a_success = True
|
|
|
|
|
break
|
|
|
|
|
return token
|
|
|
|
|
|
|
|
|
|
def __Prefix_Step2b(self, token):
|
|
|
|
|
for prefix in self.__prefix_step2b:
|
|
|
|
|
if token.startswith(prefix) and len(token) > 3:
|
|
|
|
|
if token[:2] not in self.__prefixes1:
|
|
|
|
|
token = token[len(prefix) :]
|
|
|
|
|
break
|
|
|
|
|
return token
|
|
|
|
|
|
|
|
|
|
def __Prefix_Step3a_Noun(self, token):
|
|
|
|
|
for prefix in self.__prefix_step3a_noun:
|
|
|
|
|
if token.startswith(prefix):
|
|
|
|
|
if prefix in self.__articles_2len and len(token) > 4:
|
|
|
|
|
token = token[len(prefix) :]
|
|
|
|
|
self.prefix_step3a_noun_success = True
|
|
|
|
|
break
|
|
|
|
|
if prefix in self.__articles_3len and len(token) > 5:
|
|
|
|
|
token = token[len(prefix) :]
|
|
|
|
|
break
|
|
|
|
|
return token
|
|
|
|
|
|
|
|
|
|
def __Prefix_Step3b_Noun(self, token):
|
|
|
|
|
for prefix in self.__prefix_step3b_noun:
|
|
|
|
|
if token.startswith(prefix):
|
|
|
|
|
if len(token) > 3:
|
|
|
|
|
if prefix == "\u0628":
|
|
|
|
|
token = token[len(prefix) :]
|
|
|
|
|
self.prefix_step3b_noun_success = True
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
if prefix in self.__prepositions2:
|
|
|
|
|
token = prefix_replace(token, prefix, prefix[1])
|
|
|
|
|
self.prefix_step3b_noun_success = True
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
if prefix in self.__prepositions1 and len(token) > 4:
|
|
|
|
|
token = token[len(prefix) :] # BUG: cause confusion
|
|
|
|
|
self.prefix_step3b_noun_success = True
|
|
|
|
|
break
|
|
|
|
|
return token
|
|
|
|
|
|
|
|
|
|
def __Prefix_Step3_Verb(self, token):
|
|
|
|
|
for prefix in self.__prefix_step3_verb:
|
|
|
|
|
if token.startswith(prefix) and len(token) > 4:
|
|
|
|
|
token = prefix_replace(token, prefix, prefix[1])
|
|
|
|
|
break
|
|
|
|
|
return token
|
|
|
|
|
|
|
|
|
|
def __Prefix_Step4_Verb(self, token):
|
|
|
|
|
for prefix in self.__prefix_step4_verb:
|
|
|
|
|
if token.startswith(prefix) and len(token) > 4:
|
|
|
|
|
token = prefix_replace(token, prefix, "\u0627\u0633\u062a")
|
|
|
|
|
self.is_verb = True
|
|
|
|
|
self.is_noun = False
|
|
|
|
|
break
|
|
|
|
|
return token
|
|
|
|
|
|
|
|
|
|
def stem(self, word):
|
|
|
|
|
"""
|
|
|
|
|
Stem an Arabic word and return the stemmed form.
|
|
|
|
|
:param word: string
|
|
|
|
|
:return: string
|
|
|
|
|
"""
|
|
|
|
|
# set initial values
|
|
|
|
|
self.is_verb = True
|
|
|
|
|
self.is_noun = True
|
|
|
|
|
self.is_defined = False
|
|
|
|
|
|
|
|
|
|
self.suffix_verb_step2a_success = False
|
|
|
|
|
self.suffix_verb_step2b_success = False
|
|
|
|
|
self.suffix_noun_step2c2_success = False
|
|
|
|
|
self.suffix_noun_step1a_success = False
|
|
|
|
|
self.suffix_noun_step2a_success = False
|
|
|
|
|
self.suffix_noun_step2b_success = False
|
|
|
|
|
self.suffixe_noun_step1b_success = False
|
|
|
|
|
self.prefix_step2a_success = False
|
|
|
|
|
self.prefix_step3a_noun_success = False
|
|
|
|
|
self.prefix_step3b_noun_success = False
|
|
|
|
|
|
|
|
|
|
modified_word = word
|
|
|
|
|
# guess type and properties
|
|
|
|
|
# checks1
|
|
|
|
|
self.__checks_1(modified_word)
|
|
|
|
|
# checks2
|
|
|
|
|
self.__checks_2(modified_word)
|
|
|
|
|
# Pre_Normalization
|
|
|
|
|
modified_word = self.__normalize_pre(modified_word)
|
|
|
|
|
# Avoid stopwords
|
|
|
|
|
if modified_word in self.stopwords or len(modified_word) <= 2:
|
|
|
|
|
return modified_word
|
|
|
|
|
# Start stemming
|
|
|
|
|
if self.is_verb:
|
|
|
|
|
modified_word = self.__Suffix_Verb_Step1(modified_word)
|
|
|
|
|
if self.suffixes_verb_step1_success:
|
|
|
|
|
modified_word = self.__Suffix_Verb_Step2a(modified_word)
|
|
|
|
|
if not self.suffix_verb_step2a_success:
|
|
|
|
|
modified_word = self.__Suffix_Verb_Step2c(modified_word)
|
|
|
|
|
# or next TODO: How to deal with or next instruction
|
|
|
|
|
else:
|
|
|
|
|
modified_word = self.__Suffix_Verb_Step2b(modified_word)
|
|
|
|
|
if not self.suffix_verb_step2b_success:
|
|
|
|
|
modified_word = self.__Suffix_Verb_Step2a(modified_word)
|
|
|
|
|
if self.is_noun:
|
|
|
|
|
modified_word = self.__Suffix_Noun_Step2c2(modified_word)
|
|
|
|
|
if not self.suffix_noun_step2c2_success:
|
|
|
|
|
if not self.is_defined:
|
|
|
|
|
modified_word = self.__Suffix_Noun_Step1a(modified_word)
|
|
|
|
|
# if self.suffix_noun_step1a_success:
|
|
|
|
|
modified_word = self.__Suffix_Noun_Step2a(modified_word)
|
|
|
|
|
if not self.suffix_noun_step2a_success:
|
|
|
|
|
modified_word = self.__Suffix_Noun_Step2b(modified_word)
|
|
|
|
|
if (
|
|
|
|
|
not self.suffix_noun_step2b_success
|
|
|
|
|
and not self.suffix_noun_step2a_success
|
|
|
|
|
):
|
|
|
|
|
modified_word = self.__Suffix_Noun_Step2c1(modified_word)
|
|
|
|
|
# or next ? todo : how to deal with or next
|
|
|
|
|
else:
|
|
|
|
|
modified_word = self.__Suffix_Noun_Step1b(modified_word)
|
|
|
|
|
if self.suffixe_noun_step1b_success:
|
|
|
|
|
modified_word = self.__Suffix_Noun_Step2a(modified_word)
|
|
|
|
|
if not self.suffix_noun_step2a_success:
|
|
|
|
|
modified_word = self.__Suffix_Noun_Step2b(modified_word)
|
|
|
|
|
if (
|
|
|
|
|
not self.suffix_noun_step2b_success
|
|
|
|
|
and not self.suffix_noun_step2a_success
|
|
|
|
|
):
|
|
|
|
|
modified_word = self.__Suffix_Noun_Step2c1(modified_word)
|
|
|
|
|
else:
|
|
|
|
|
if not self.is_defined:
|
|
|
|
|
modified_word = self.__Suffix_Noun_Step2a(modified_word)
|
|
|
|
|
modified_word = self.__Suffix_Noun_Step2b(modified_word)
|
|
|
|
|
modified_word = self.__Suffix_Noun_Step3(modified_word)
|
|
|
|
|
if not self.is_noun and self.is_verb:
|
|
|
|
|
modified_word = self.__Suffix_All_alef_maqsura(modified_word)
|
|
|
|
|
|
|
|
|
|
# prefixes
|
|
|
|
|
modified_word = self.__Prefix_Step1(modified_word)
|
|
|
|
|
modified_word = self.__Prefix_Step2a(modified_word)
|
|
|
|
|
if not self.prefix_step2a_success:
|
|
|
|
|
modified_word = self.__Prefix_Step2b(modified_word)
|
|
|
|
|
modified_word = self.__Prefix_Step3a_Noun(modified_word)
|
|
|
|
|
if not self.prefix_step3a_noun_success and self.is_noun:
|
|
|
|
|
modified_word = self.__Prefix_Step3b_Noun(modified_word)
|
|
|
|
|
else:
|
|
|
|
|
if not self.prefix_step3b_noun_success and self.is_verb:
|
|
|
|
|
modified_word = self.__Prefix_Step3_Verb(modified_word)
|
|
|
|
|
modified_word = self.__Prefix_Step4_Verb(modified_word)
|
|
|
|
|
|
|
|
|
|
# post normalization stemming
|
|
|
|
|
modified_word = self.__normalize_post(modified_word)
|
|
|
|
|
stemmed_word = modified_word
|
|
|
|
|
return stemmed_word
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class DanishStemmer(_ScandinavianStemmer):
|
|
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
The Danish Snowball stemmer.
|
|
|
|
|
|
|
|
|
|
:cvar __vowels: The Danish vowels.
|
|
|
|
|
:type __vowels: unicode
|
|
|
|
|
:cvar __consonants: The Danish consonants.
|
|
|
|
|
:type __consonants: unicode
|
|
|
|
|
:cvar __double_consonants: The Danish double consonants.
|
|
|
|
|
:type __double_consonants: tuple
|
|
|
|
|
:cvar __s_ending: Letters that may directly appear before a word final 's'.
|
|
|
|
|
:type __s_ending: unicode
|
|
|
|
|
:cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm.
|
|
|
|
|
:type __step1_suffixes: tuple
|
|
|
|
|
:cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm.
|
|
|
|
|
:type __step2_suffixes: tuple
|
|
|
|
|
:cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm.
|
|
|
|
|
:type __step3_suffixes: tuple
|
|
|
|
|
:note: A detailed description of the Danish
|
|
|
|
|
stemming algorithm can be found under
|
|
|
|
|
http://snowball.tartarus.org/algorithms/danish/stemmer.html
|
|
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
# The language's vowels and other important characters are defined.
|
|
|
|
|
__vowels = "aeiouy\xE6\xE5\xF8"
|
|
|
|
|
__consonants = "bcdfghjklmnpqrstvwxz"
|
|
|
|
|
__double_consonants = (
|
|
|
|
|
"bb",
|
|
|
|
|
"cc",
|
|
|
|
|
"dd",
|
|
|
|
|
"ff",
|
|
|
|
|
"gg",
|
|
|
|
|
"hh",
|
|
|
|
|
"jj",
|
|
|
|
|
"kk",
|
|
|
|
|
"ll",
|
|
|
|
|
"mm",
|
|
|
|
|
"nn",
|
|
|
|
|
"pp",
|
|
|
|
|
"qq",
|
|
|
|
|
"rr",
|
|
|
|
|
"ss",
|
|
|
|
|
"tt",
|
|
|
|
|
"vv",
|
|
|
|
|
"ww",
|
|
|
|
|
"xx",
|
|
|
|
|
"zz",
|
|
|
|
|
)
|
|
|
|
|
__s_ending = "abcdfghjklmnoprtvyz\xE5"
|
|
|
|
|
|
|
|
|
|
# The different suffixes, divided into the algorithm's steps
|
|
|
|
|
# and organized by length, are listed in tuples.
|
|
|
|
|
__step1_suffixes = (
|
|
|
|
|
"erendes",
|
|
|
|
|
"erende",
|
|
|
|
|
"hedens",
|
|
|
|
|
"ethed",
|
|
|
|
|
"erede",
|
|
|
|
|
"heden",
|
|
|
|
|
"heder",
|
|
|
|
|
"endes",
|
|
|
|
|
"ernes",
|
|
|
|
|
"erens",
|
|
|
|
|
"erets",
|
|
|
|
|
"ered",
|
|
|
|
|
"ende",
|
|
|
|
|
"erne",
|
|
|
|
|
"eren",
|
|
|
|
|
"erer",
|
|
|
|
|
"heds",
|
|
|
|
|
"enes",
|
|
|
|
|
"eres",
|
|
|
|
|
"eret",
|
|
|
|
|
"hed",
|
|
|
|
|
"ene",
|
|
|
|
|
"ere",
|
|
|
|
|
"ens",
|
|
|
|
|
"ers",
|
|
|
|
|
"ets",
|
|
|
|
|
"en",
|
|
|
|
|
"er",
|
|
|
|
|
"es",
|
|
|
|
|
"et",
|
|
|
|
|
"e",
|
|
|
|
|
"s",
|
|
|
|
|
)
|
|
|
|
|
__step2_suffixes = ("gd", "dt", "gt", "kt")
|
|
|
|
|
__step3_suffixes = ("elig", "l\xF8st", "lig", "els", "ig")
|
|
|
|
|
|
|
|
|
|
def stem(self, word):
|
|
|
|
|
"""
|
|
|
|
|
Stem a Danish word and return the stemmed form.
|
|
|
|
|
|
|
|
|
|
:param word: The word that is stemmed.
|
|
|
|
|
:type word: str or unicode
|
|
|
|
|
:return: The stemmed form.
|
|
|
|
|
:rtype: unicode
|
|
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
# Every word is put into lower case for normalization.
|
|
|
|
|
word = word.lower()
|
|
|
|
|
|
|
|
|
|
if word in self.stopwords:
|
|
|
|
|
return word
|
|
|
|
|
|
|
|
|
|
# After this, the required regions are generated
|
|
|
|
|
# by the respective helper method.
|
|
|
|
|
r1 = self._r1_scandinavian(word, self.__vowels)
|
|
|
|
|
|
|
|
|
|
# Then the actual stemming process starts.
|
|
|
|
|
# Every new step is explicitly indicated
|
|
|
|
|
# according to the descriptions on the Snowball website.
|
|
|
|
|
|
|
|
|
|
# STEP 1
|
|
|
|
|
for suffix in self.__step1_suffixes:
|
|
|
|
|
if r1.endswith(suffix):
|
|
|
|
|
if suffix == "s":
|
|
|
|
|
if word[-2] in self.__s_ending:
|
|
|
|
|
word = word[:-1]
|
|
|
|
|
r1 = r1[:-1]
|
|
|
|
|
else:
|
|
|
|
|
word = word[: -len(suffix)]
|
|
|
|
|
r1 = r1[: -len(suffix)]
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
# STEP 2
|
|
|
|
|
for suffix in self.__step2_suffixes:
|
|
|
|
|
if r1.endswith(suffix):
|
|
|
|
|
word = word[:-1]
|
|
|
|
|
r1 = r1[:-1]
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
# STEP 3
|
|
|
|
|
if r1.endswith("igst"):
|
|
|
|
|
word = word[:-2]
|
|
|
|
|
r1 = r1[:-2]
|
|
|
|
|
|
|
|
|
|
for suffix in self.__step3_suffixes:
|
|
|
|
|
if r1.endswith(suffix):
|
|
|
|
|
if suffix == "l\xF8st":
|
|
|
|
|
word = word[:-1]
|
|
|
|
|
r1 = r1[:-1]
|
|
|
|
|
else:
|
|
|
|
|
word = word[: -len(suffix)]
|
|
|
|
|
r1 = r1[: -len(suffix)]
|
|
|
|
|
|
|
|
|
|
if r1.endswith(self.__step2_suffixes):
|
|
|
|
|
word = word[:-1]
|
|
|
|
|
r1 = r1[:-1]
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
# STEP 4: Undouble
|
|
|
|
|
for double_cons in self.__double_consonants:
|
|
|
|
|
if word.endswith(double_cons) and len(word) > 3:
|
|
|
|
|
word = word[:-1]
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
return word
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class DutchStemmer(_StandardStemmer):
|
|
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
The Dutch Snowball stemmer.
|
|
|
|
|
|
|
|
|
|
:cvar __vowels: The Dutch vowels.
|
|
|
|
|
:type __vowels: unicode
|
|
|
|
|
:cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm.
|
|
|
|
|
:type __step1_suffixes: tuple
|
|
|
|
|
:cvar __step3b_suffixes: Suffixes to be deleted in step 3b of the algorithm.
|
|
|
|
|
:type __step3b_suffixes: tuple
|
|
|
|
|
:note: A detailed description of the Dutch
|
|
|
|
|
stemming algorithm can be found under
|
|
|
|
|
http://snowball.tartarus.org/algorithms/dutch/stemmer.html
|
|
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
__vowels = "aeiouy\xE8"
|
|
|
|
|
__step1_suffixes = ("heden", "ene", "en", "se", "s")
|
|
|
|
|
__step3b_suffixes = ("baar", "lijk", "bar", "end", "ing", "ig")
|
|
|
|
|
|
|
|
|
|
def stem(self, word):
|
|
|
|
|
"""
|
|
|
|
|
Stem a Dutch word and return the stemmed form.
|
|
|
|
|
|
|
|
|
|
:param word: The word that is stemmed.
|
|
|
|
|
:type word: str or unicode
|
|
|
|
|
:return: The stemmed form.
|
|
|
|
|
:rtype: unicode
|
|
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
word = word.lower()
|
|
|
|
|
|
|
|
|
|
if word in self.stopwords:
|
|
|
|
|
return word
|
|
|
|
|
|
|
|
|
|
step2_success = False
|
|
|
|
|
|
|
|
|
|
# Vowel accents are removed.
|
|
|
|
|
word = (
|
|
|
|
|
word.replace("\xE4", "a")
|
|
|
|
|
.replace("\xE1", "a")
|
|
|
|
|
.replace("\xEB", "e")
|
|
|
|
|
.replace("\xE9", "e")
|
|
|
|
|
.replace("\xED", "i")
|
|
|
|
|
.replace("\xEF", "i")
|
|
|
|
|
.replace("\xF6", "o")
|
|
|
|
|
.replace("\xF3", "o")
|
|
|
|
|
.replace("\xFC", "u")
|
|
|
|
|
.replace("\xFA", "u")
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
# An initial 'y', a 'y' after a vowel,
|
|
|
|
|
# and an 'i' between self.__vowels is put into upper case.
|
|
|
|
|
# As from now these are treated as consonants.
|
|
|
|
|
if word.startswith("y"):
|
|
|
|
|
word = "".join(("Y", word[1:]))
|
|
|
|
|
|
|
|
|
|
for i in range(1, len(word)):
|
|
|
|
|
if word[i - 1] in self.__vowels and word[i] == "y":
|
|
|
|
|
word = "".join((word[:i], "Y", word[i + 1 :]))
|
|
|
|
|
|
|
|
|
|
for i in range(1, len(word) - 1):
|
|
|
|
|
if (
|
|
|
|
|
word[i - 1] in self.__vowels
|
|
|
|
|
and word[i] == "i"
|
|
|
|
|
and word[i + 1] in self.__vowels
|
|
|
|
|
):
|
|
|
|
|
word = "".join((word[:i], "I", word[i + 1 :]))
|
|
|
|
|
|
|
|
|
|
r1, r2 = self._r1r2_standard(word, self.__vowels)
|
|
|
|
|
|
|
|
|
|
# R1 is adjusted so that the region before it
|
|
|
|
|
# contains at least 3 letters.
|
|
|
|
|
for i in range(1, len(word)):
|
|
|
|
|
if word[i] not in self.__vowels and word[i - 1] in self.__vowels:
|
|
|
|
|
if 3 > len(word[: i + 1]) > 0:
|
|
|
|
|
r1 = word[3:]
|
|
|
|
|
elif len(word[: i + 1]) == 0:
|
|
|
|
|
return word
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
# STEP 1
|
|
|
|
|
for suffix in self.__step1_suffixes:
|
|
|
|
|
if r1.endswith(suffix):
|
|
|
|
|
if suffix == "heden":
|
|
|
|
|
word = suffix_replace(word, suffix, "heid")
|
|
|
|
|
r1 = suffix_replace(r1, suffix, "heid")
|
|
|
|
|
if r2.endswith("heden"):
|
|
|
|
|
r2 = suffix_replace(r2, suffix, "heid")
|
|
|
|
|
|
|
|
|
|
elif (
|
|
|
|
|
suffix in ("ene", "en")
|
|
|
|
|
and not word.endswith("heden")
|
|
|
|
|
and word[-len(suffix) - 1] not in self.__vowels
|
|
|
|
|
and word[-len(suffix) - 3 : -len(suffix)] != "gem"
|
|
|
|
|
):
|
|
|
|
|
word = word[: -len(suffix)]
|
|
|
|
|
r1 = r1[: -len(suffix)]
|
|
|
|
|
r2 = r2[: -len(suffix)]
|
|
|
|
|
if word.endswith(("kk", "dd", "tt")):
|
|
|
|
|
word = word[:-1]
|
|
|
|
|
r1 = r1[:-1]
|
|
|
|
|
r2 = r2[:-1]
|
|
|
|
|
|
|
|
|
|
elif (
|
|
|
|
|
suffix in ("se", "s")
|
|
|
|
|
and word[-len(suffix) - 1] not in self.__vowels
|
|
|
|
|
and word[-len(suffix) - 1] != "j"
|
|
|
|
|
):
|
|
|
|
|
word = word[: -len(suffix)]
|
|
|
|
|
r1 = r1[: -len(suffix)]
|
|
|
|
|
r2 = r2[: -len(suffix)]
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
# STEP 2
|
|
|
|
|
if r1.endswith("e") and word[-2] not in self.__vowels:
|
|
|
|
|
step2_success = True
|
|
|
|
|
word = word[:-1]
|
|
|
|
|
r1 = r1[:-1]
|
|
|
|
|
r2 = r2[:-1]
|
|
|
|
|
|
|
|
|
|
if word.endswith(("kk", "dd", "tt")):
|
|
|
|
|
word = word[:-1]
|
|
|
|
|
r1 = r1[:-1]
|
|
|
|
|
r2 = r2[:-1]
|
|
|
|
|
|
|
|
|
|
# STEP 3a
|
|
|
|
|
if r2.endswith("heid") and word[-5] != "c":
|
|
|
|
|
word = word[:-4]
|
|
|
|
|
r1 = r1[:-4]
|
|
|
|
|
r2 = r2[:-4]
|
|
|
|
|
|
|
|
|
|
if (
|
|
|
|
|
r1.endswith("en")
|
|
|
|
|
and word[-3] not in self.__vowels
|
|
|
|
|
and word[-5:-2] != "gem"
|
|
|
|
|
):
|
|
|
|
|
word = word[:-2]
|
|
|
|
|
r1 = r1[:-2]
|
|
|
|
|
r2 = r2[:-2]
|
|
|
|
|
|
|
|
|
|
if word.endswith(("kk", "dd", "tt")):
|
|
|
|
|
word = word[:-1]
|
|
|
|
|
r1 = r1[:-1]
|
|
|
|
|
r2 = r2[:-1]
|
|
|
|
|
|
|
|
|
|
# STEP 3b: Derivational suffixes
|
|
|
|
|
for suffix in self.__step3b_suffixes:
|
|
|
|
|
if r2.endswith(suffix):
|
|
|
|
|
if suffix in ("end", "ing"):
|
|
|
|
|
word = word[:-3]
|
|
|
|
|
r2 = r2[:-3]
|
|
|
|
|
|
|
|
|
|
if r2.endswith("ig") and word[-3] != "e":
|
|
|
|
|
word = word[:-2]
|
|
|
|
|
else:
|
|
|
|
|
if word.endswith(("kk", "dd", "tt")):
|
|
|
|
|
word = word[:-1]
|
|
|
|
|
|
|
|
|
|
elif suffix == "ig" and word[-3] != "e":
|
|
|
|
|
word = word[:-2]
|
|
|
|
|
|
|
|
|
|
elif suffix == "lijk":
|
|
|
|
|
word = word[:-4]
|
|
|
|
|
r1 = r1[:-4]
|
|
|
|
|
|
|
|
|
|
if r1.endswith("e") and word[-2] not in self.__vowels:
|
|
|
|
|
word = word[:-1]
|
|
|
|
|
if word.endswith(("kk", "dd", "tt")):
|
|
|
|
|
word = word[:-1]
|
|
|
|
|
|
|
|
|
|
elif suffix == "baar":
|
|
|
|
|
word = word[:-4]
|
|
|
|
|
|
|
|
|
|
elif suffix == "bar" and step2_success:
|
|
|
|
|
word = word[:-3]
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
# STEP 4: Undouble vowel
|
|
|
|
|
if len(word) >= 4:
|
|
|
|
|
if word[-1] not in self.__vowels and word[-1] != "I":
|
|
|
|
|
if word[-3:-1] in ("aa", "ee", "oo", "uu"):
|
|
|
|
|
if word[-4] not in self.__vowels:
|
|
|
|
|
word = "".join((word[:-3], word[-3], word[-1]))
|
|
|
|
|
|
|
|
|
|
# All occurrences of 'I' and 'Y' are put back into lower case.
|
|
|
|
|
word = word.replace("I", "i").replace("Y", "y")
|
|
|
|
|
|
|
|
|
|
return word
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class EnglishStemmer(_StandardStemmer):
|
|
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
The English Snowball stemmer.
|
|
|
|
|
|
|
|
|
|
:cvar __vowels: The English vowels.
|
|
|
|
|
:type __vowels: unicode
|
|
|
|
|
:cvar __double_consonants: The English double consonants.
|
|
|
|
|
:type __double_consonants: tuple
|
|
|
|
|
:cvar __li_ending: Letters that may directly appear before a word final 'li'.
|
|
|
|
|
:type __li_ending: unicode
|
|
|
|
|
:cvar __step0_suffixes: Suffixes to be deleted in step 0 of the algorithm.
|
|
|
|
|
:type __step0_suffixes: tuple
|
|
|
|
|
:cvar __step1a_suffixes: Suffixes to be deleted in step 1a of the algorithm.
|
|
|
|
|
:type __step1a_suffixes: tuple
|
|
|
|
|
:cvar __step1b_suffixes: Suffixes to be deleted in step 1b of the algorithm.
|
|
|
|
|
:type __step1b_suffixes: tuple
|
|
|
|
|
:cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm.
|
|
|
|
|
:type __step2_suffixes: tuple
|
|
|
|
|
:cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm.
|
|
|
|
|
:type __step3_suffixes: tuple
|
|
|
|
|
:cvar __step4_suffixes: Suffixes to be deleted in step 4 of the algorithm.
|
|
|
|
|
:type __step4_suffixes: tuple
|
|
|
|
|
:cvar __step5_suffixes: Suffixes to be deleted in step 5 of the algorithm.
|
|
|
|
|
:type __step5_suffixes: tuple
|
|
|
|
|
:cvar __special_words: A dictionary containing words
|
|
|
|
|
which have to be stemmed specially.
|
|
|
|
|
:type __special_words: dict
|
|
|
|
|
:note: A detailed description of the English
|
|
|
|
|
stemming algorithm can be found under
|
|
|
|
|
http://snowball.tartarus.org/algorithms/english/stemmer.html
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
__vowels = "aeiouy"
|
|
|
|
|
__double_consonants = ("bb", "dd", "ff", "gg", "mm", "nn", "pp", "rr", "tt")
|
|
|
|
|
__li_ending = "cdeghkmnrt"
|
|
|
|
|
__step0_suffixes = ("'s'", "'s", "'")
|
|
|
|
|
__step1a_suffixes = ("sses", "ied", "ies", "us", "ss", "s")
|
|
|
|
|
__step1b_suffixes = ("eedly", "ingly", "edly", "eed", "ing", "ed")
|
|
|
|
|
__step2_suffixes = (
|
|
|
|
|
"ization",
|
|
|
|
|
"ational",
|
|
|
|
|
"fulness",
|
|
|
|
|
"ousness",
|
|
|
|
|
"iveness",
|
|
|
|
|
"tional",
|
|
|
|
|
"biliti",
|
|
|
|
|
"lessli",
|
|
|
|
|
"entli",
|
|
|
|
|
"ation",
|
|
|
|
|
"alism",
|
|
|
|
|
"aliti",
|
|
|
|
|
"ousli",
|
|
|
|
|
"iviti",
|
|
|
|
|
"fulli",
|
|
|
|
|
"enci",
|
|
|
|
|
"anci",
|
|
|
|
|
"abli",
|
|
|
|
|
"izer",
|
|
|
|
|
"ator",
|
|
|
|
|
"alli",
|
|
|
|
|
"bli",
|
|
|
|
|
"ogi",
|
|
|
|
|
"li",
|
|
|
|
|
)
|
|
|
|
|
__step3_suffixes = (
|
|
|
|
|
"ational",
|
|
|
|
|
"tional",
|
|
|
|
|
"alize",
|
|
|
|
|
"icate",
|
|
|
|
|
"iciti",
|
|
|
|
|
"ative",
|
|
|
|
|
"ical",
|
|
|
|
|
"ness",
|
|
|
|
|
"ful",
|
|
|
|
|
)
|
|
|
|
|
__step4_suffixes = (
|
|
|
|
|
"ement",
|
|
|
|
|
"ance",
|
|
|
|
|
"ence",
|
|
|
|
|
"able",
|
|
|
|
|
"ible",
|
|
|
|
|
"ment",
|
|
|
|
|
"ant",
|
|
|
|
|
"ent",
|
|
|
|
|
"ism",
|
|
|
|
|
"ate",
|
|
|
|
|
"iti",
|
|
|
|
|
"ous",
|
|
|
|
|
"ive",
|
|
|
|
|
"ize",
|
|
|
|
|
"ion",
|
|
|
|
|
"al",
|
|
|
|
|
"er",
|
|
|
|
|
"ic",
|
|
|
|
|
)
|
|
|
|
|
__step5_suffixes = ("e", "l")
|
|
|
|
|
__special_words = {
|
|
|
|
|
"skis": "ski",
|
|
|
|
|
"skies": "sky",
|
|
|
|
|
"dying": "die",
|
|
|
|
|
"lying": "lie",
|
|
|
|
|
"tying": "tie",
|
|
|
|
|
"idly": "idl",
|
|
|
|
|
"gently": "gentl",
|
|
|
|
|
"ugly": "ugli",
|
|
|
|
|
"early": "earli",
|
|
|
|
|
"only": "onli",
|
|
|
|
|
"singly": "singl",
|
|
|
|
|
"sky": "sky",
|
|
|
|
|
"news": "news",
|
|
|
|
|
"howe": "howe",
|
|
|
|
|
"atlas": "atlas",
|
|
|
|
|
"cosmos": "cosmos",
|
|
|
|
|
"bias": "bias",
|
|
|
|
|
"andes": "andes",
|
|
|
|
|
"inning": "inning",
|
|
|
|
|
"innings": "inning",
|
|
|
|
|
"outing": "outing",
|
|
|
|
|
"outings": "outing",
|
|
|
|
|
"canning": "canning",
|
|
|
|
|
"cannings": "canning",
|
|
|
|
|
"herring": "herring",
|
|
|
|
|
"herrings": "herring",
|
|
|
|
|
"earring": "earring",
|
|
|
|
|
"earrings": "earring",
|
|
|
|
|
"proceed": "proceed",
|
|
|
|
|
"proceeds": "proceed",
|
|
|
|
|
"proceeded": "proceed",
|
|
|
|
|
"proceeding": "proceed",
|
|
|
|
|
"exceed": "exceed",
|
|
|
|
|
"exceeds": "exceed",
|
|
|
|
|
"exceeded": "exceed",
|
|
|
|
|
"exceeding": "exceed",
|
|
|
|
|
"succeed": "succeed",
|
|
|
|
|
"succeeds": "succeed",
|
|
|
|
|
"succeeded": "succeed",
|
|
|
|
|
"succeeding": "succeed",
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
def stem(self, word):
|
|
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
Stem an English word and return the stemmed form.
|
|
|
|
|
|
|
|
|
|
:param word: The word that is stemmed.
|
|
|
|
|
:type word: str or unicode
|
|
|
|
|
:return: The stemmed form.
|
|
|
|
|
:rtype: unicode
|
|
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
word = word.lower()
|
|
|
|
|
|
|
|
|
|
if word in self.stopwords or len(word) <= 2:
|
|
|
|
|
return word
|
|
|
|
|
|
|
|
|
|
elif word in self.__special_words:
|
|
|
|
|
return self.__special_words[word]
|
|
|
|
|
|
|
|
|
|
# Map the different apostrophe characters to a single consistent one
|
|
|
|
|
word = (
|
|
|
|
|
word.replace("\u2019", "\x27")
|
|
|
|
|
.replace("\u2018", "\x27")
|
|
|
|
|
.replace("\u201B", "\x27")
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
if word.startswith("\x27"):
|
|
|
|
|
word = word[1:]
|
|
|
|
|
|
|
|
|
|
if word.startswith("y"):
|
|
|
|
|
word = "".join(("Y", word[1:]))
|
|
|
|
|
|
|
|
|
|
for i in range(1, len(word)):
|
|
|
|
|
if word[i - 1] in self.__vowels and word[i] == "y":
|
|
|
|
|
word = "".join((word[:i], "Y", word[i + 1 :]))
|
|
|
|
|
|
|
|
|
|
step1a_vowel_found = False
|
|
|
|
|
step1b_vowel_found = False
|
|
|
|
|
|
|
|
|
|
r1 = ""
|
|
|
|
|
r2 = ""
|
|
|
|
|
|
|
|
|
|
if word.startswith(("gener", "commun", "arsen")):
|
|
|
|
|
if word.startswith(("gener", "arsen")):
|
|
|
|
|
r1 = word[5:]
|
|
|
|
|
else:
|
|
|
|
|
r1 = word[6:]
|
|
|
|
|
|
|
|
|
|
for i in range(1, len(r1)):
|
|
|
|
|
if r1[i] not in self.__vowels and r1[i - 1] in self.__vowels:
|
|
|
|
|
r2 = r1[i + 1 :]
|
|
|
|
|
break
|
|
|
|
|
else:
|
|
|
|
|
r1, r2 = self._r1r2_standard(word, self.__vowels)
|
|
|
|
|
|
|
|
|
|
# STEP 0
|
|
|
|
|
for suffix in self.__step0_suffixes:
|
|
|
|
|
if word.endswith(suffix):
|
|
|
|
|
word = word[: -len(suffix)]
|
|
|
|
|
r1 = r1[: -len(suffix)]
|
|
|
|
|
r2 = r2[: -len(suffix)]
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
# STEP 1a
|
|
|
|
|
for suffix in self.__step1a_suffixes:
|
|
|
|
|
if word.endswith(suffix):
|
|
|
|
|
|
|
|
|
|
if suffix == "sses":
|
|
|
|
|
word = word[:-2]
|
|
|
|
|
r1 = r1[:-2]
|
|
|
|
|
r2 = r2[:-2]
|
|
|
|
|
|
|
|
|
|
elif suffix in ("ied", "ies"):
|
|
|
|
|
if len(word[: -len(suffix)]) > 1:
|
|
|
|
|
word = word[:-2]
|
|
|
|
|
r1 = r1[:-2]
|
|
|
|
|
r2 = r2[:-2]
|
|
|
|
|
else:
|
|
|
|
|
word = word[:-1]
|
|
|
|
|
r1 = r1[:-1]
|
|
|
|
|
r2 = r2[:-1]
|
|
|
|
|
|
|
|
|
|
elif suffix == "s":
|
|
|
|
|
for letter in word[:-2]:
|
|
|
|
|
if letter in self.__vowels:
|
|
|
|
|
step1a_vowel_found = True
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
if step1a_vowel_found:
|
|
|
|
|
word = word[:-1]
|
|
|
|
|
r1 = r1[:-1]
|
|
|
|
|
r2 = r2[:-1]
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
# STEP 1b
|
|
|
|
|
for suffix in self.__step1b_suffixes:
|
|
|
|
|
if word.endswith(suffix):
|
|
|
|
|
if suffix in ("eed", "eedly"):
|
|
|
|
|
|
|
|
|
|
if r1.endswith(suffix):
|
|
|
|
|
word = suffix_replace(word, suffix, "ee")
|
|
|
|
|
|
|
|
|
|
if len(r1) >= len(suffix):
|
|
|
|
|
r1 = suffix_replace(r1, suffix, "ee")
|
|
|
|
|
else:
|
|
|
|
|
r1 = ""
|
|
|
|
|
|
|
|
|
|
if len(r2) >= len(suffix):
|
|
|
|
|
r2 = suffix_replace(r2, suffix, "ee")
|
|
|
|
|
else:
|
|
|
|
|
r2 = ""
|
|
|
|
|
else:
|
|
|
|
|
for letter in word[: -len(suffix)]:
|
|
|
|
|
if letter in self.__vowels:
|
|
|
|
|
step1b_vowel_found = True
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
if step1b_vowel_found:
|
|
|
|
|
word = word[: -len(suffix)]
|
|
|
|
|
r1 = r1[: -len(suffix)]
|
|
|
|
|
r2 = r2[: -len(suffix)]
|
|
|
|
|
|
|
|
|
|
if word.endswith(("at", "bl", "iz")):
|
|
|
|
|
word = "".join((word, "e"))
|
|
|
|
|
r1 = "".join((r1, "e"))
|
|
|
|
|
|
|
|
|
|
if len(word) > 5 or len(r1) >= 3:
|
|
|
|
|
r2 = "".join((r2, "e"))
|
|
|
|
|
|
|
|
|
|
elif word.endswith(self.__double_consonants):
|
|
|
|
|
word = word[:-1]
|
|
|
|
|
r1 = r1[:-1]
|
|
|
|
|
r2 = r2[:-1]
|
|
|
|
|
|
|
|
|
|
elif (
|
|
|
|
|
r1 == ""
|
|
|
|
|
and len(word) >= 3
|
|
|
|
|
and word[-1] not in self.__vowels
|
|
|
|
|
and word[-1] not in "wxY"
|
|
|
|
|
and word[-2] in self.__vowels
|
|
|
|
|
and word[-3] not in self.__vowels
|
|
|
|
|
) or (
|
|
|
|
|
r1 == ""
|
|
|
|
|
and len(word) == 2
|
|
|
|
|
and word[0] in self.__vowels
|
|
|
|
|
and word[1] not in self.__vowels
|
|
|
|
|
):
|
|
|
|
|
|
|
|
|
|
word = "".join((word, "e"))
|
|
|
|
|
|
|
|
|
|
if len(r1) > 0:
|
|
|
|
|
r1 = "".join((r1, "e"))
|
|
|
|
|
|
|
|
|
|
if len(r2) > 0:
|
|
|
|
|
r2 = "".join((r2, "e"))
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
# STEP 1c
|
|
|
|
|
if len(word) > 2 and word[-1] in "yY" and word[-2] not in self.__vowels:
|
|
|
|
|
word = "".join((word[:-1], "i"))
|
|
|
|
|
if len(r1) >= 1:
|
|
|
|
|
r1 = "".join((r1[:-1], "i"))
|
|
|
|
|
else:
|
|
|
|
|
r1 = ""
|
|
|
|
|
|
|
|
|
|
if len(r2) >= 1:
|
|
|
|
|
r2 = "".join((r2[:-1], "i"))
|
|
|
|
|
else:
|
|
|
|
|
r2 = ""
|
|
|
|
|
|
|
|
|
|
# STEP 2
|
|
|
|
|
for suffix in self.__step2_suffixes:
|
|
|
|
|
if word.endswith(suffix):
|
|
|
|
|
if r1.endswith(suffix):
|
|
|
|
|
if suffix == "tional":
|
|
|
|
|
word = word[:-2]
|
|
|
|
|
r1 = r1[:-2]
|
|
|
|
|
r2 = r2[:-2]
|
|
|
|
|
|
|
|
|
|
elif suffix in ("enci", "anci", "abli"):
|
|
|
|
|
word = "".join((word[:-1], "e"))
|
|
|
|
|
|
|
|
|
|
if len(r1) >= 1:
|
|
|
|
|
r1 = "".join((r1[:-1], "e"))
|
|
|
|
|
else:
|
|
|
|
|
r1 = ""
|
|
|
|
|
|
|
|
|
|
if len(r2) >= 1:
|
|
|
|
|
r2 = "".join((r2[:-1], "e"))
|
|
|
|
|
else:
|
|
|
|
|
r2 = ""
|
|
|
|
|
|
|
|
|
|
elif suffix == "entli":
|
|
|
|
|
word = word[:-2]
|
|
|
|
|
r1 = r1[:-2]
|
|
|
|
|
r2 = r2[:-2]
|
|
|
|
|
|
|
|
|
|
elif suffix in ("izer", "ization"):
|
|
|
|
|
word = suffix_replace(word, suffix, "ize")
|
|
|
|
|
|
|
|
|
|
if len(r1) >= len(suffix):
|
|
|
|
|
r1 = suffix_replace(r1, suffix, "ize")
|
|
|
|
|
else:
|
|
|
|
|
r1 = ""
|
|
|
|
|
|
|
|
|
|
if len(r2) >= len(suffix):
|
|
|
|
|
r2 = suffix_replace(r2, suffix, "ize")
|
|
|
|
|
else:
|
|
|
|
|
r2 = ""
|
|
|
|
|
|
|
|
|
|
elif suffix in ("ational", "ation", "ator"):
|
|
|
|
|
word = suffix_replace(word, suffix, "ate")
|
|
|
|
|
|
|
|
|
|
if len(r1) >= len(suffix):
|
|
|
|
|
r1 = suffix_replace(r1, suffix, "ate")
|
|
|
|
|
else:
|
|
|
|
|
r1 = ""
|
|
|
|
|
|
|
|
|
|
if len(r2) >= len(suffix):
|
|
|
|
|
r2 = suffix_replace(r2, suffix, "ate")
|
|
|
|
|
else:
|
|
|
|
|
r2 = "e"
|
|
|
|
|
|
|
|
|
|
elif suffix in ("alism", "aliti", "alli"):
|
|
|
|
|
word = suffix_replace(word, suffix, "al")
|
|
|
|
|
|
|
|
|
|
if len(r1) >= len(suffix):
|
|
|
|
|
r1 = suffix_replace(r1, suffix, "al")
|
|
|
|
|
else:
|
|
|
|
|
r1 = ""
|
|
|
|
|
|
|
|
|
|
if len(r2) >= len(suffix):
|
|
|
|
|
r2 = suffix_replace(r2, suffix, "al")
|
|
|
|
|
else:
|
|
|
|
|
r2 = ""
|
|
|
|
|
|
|
|
|
|
elif suffix == "fulness":
|
|
|
|
|
word = word[:-4]
|
|
|
|
|
r1 = r1[:-4]
|
|
|
|
|
r2 = r2[:-4]
|
|
|
|
|
|
|
|
|
|
elif suffix in ("ousli", "ousness"):
|
|
|
|
|
word = suffix_replace(word, suffix, "ous")
|
|
|
|
|
|
|
|
|
|
if len(r1) >= len(suffix):
|
|
|
|
|
r1 = suffix_replace(r1, suffix, "ous")
|
|
|
|
|
else:
|
|
|
|
|
r1 = ""
|
|
|
|
|
|
|
|
|
|
if len(r2) >= len(suffix):
|
|
|
|
|
r2 = suffix_replace(r2, suffix, "ous")
|
|
|
|
|
else:
|
|
|
|
|
r2 = ""
|
|
|
|
|
|
|
|
|
|
elif suffix in ("iveness", "iviti"):
|
|
|
|
|
word = suffix_replace(word, suffix, "ive")
|
|
|
|
|
|
|
|
|
|
if len(r1) >= len(suffix):
|
|
|
|
|
r1 = suffix_replace(r1, suffix, "ive")
|
|
|
|
|
else:
|
|
|
|
|
r1 = ""
|
|
|
|
|
|
|
|
|
|
if len(r2) >= len(suffix):
|
|
|
|
|
r2 = suffix_replace(r2, suffix, "ive")
|
|
|
|
|
else:
|
|
|
|
|
r2 = "e"
|
|
|
|
|
|
|
|
|
|
elif suffix in ("biliti", "bli"):
|
|
|
|
|
word = suffix_replace(word, suffix, "ble")
|
|
|
|
|
|
|
|
|
|
if len(r1) >= len(suffix):
|
|
|
|
|
r1 = suffix_replace(r1, suffix, "ble")
|
|
|
|
|
else:
|
|
|
|
|
r1 = ""
|
|
|
|
|
|
|
|
|
|
if len(r2) >= len(suffix):
|
|
|
|
|
r2 = suffix_replace(r2, suffix, "ble")
|
|
|
|
|
else:
|
|
|
|
|
r2 = ""
|
|
|
|
|
|
|
|
|
|
elif suffix == "ogi" and word[-4] == "l":
|
|
|
|
|
word = word[:-1]
|
|
|
|
|
r1 = r1[:-1]
|
|
|
|
|
r2 = r2[:-1]
|
|
|
|
|
|
|
|
|
|
elif suffix in ("fulli", "lessli"):
|
|
|
|
|
word = word[:-2]
|
|
|
|
|
r1 = r1[:-2]
|
|
|
|
|
r2 = r2[:-2]
|
|
|
|
|
|
|
|
|
|
elif suffix == "li" and word[-3] in self.__li_ending:
|
|
|
|
|
word = word[:-2]
|
|
|
|
|
r1 = r1[:-2]
|
|
|
|
|
r2 = r2[:-2]
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
# STEP 3
|
|
|
|
|
for suffix in self.__step3_suffixes:
|
|
|
|
|
if word.endswith(suffix):
|
|
|
|
|
if r1.endswith(suffix):
|
|
|
|
|
if suffix == "tional":
|
|
|
|
|
word = word[:-2]
|
|
|
|
|
r1 = r1[:-2]
|
|
|
|
|
r2 = r2[:-2]
|
|
|
|
|
|
|
|
|
|
elif suffix == "ational":
|
|
|
|
|
word = suffix_replace(word, suffix, "ate")
|
|
|
|
|
|
|
|
|
|
if len(r1) >= len(suffix):
|
|
|
|
|
r1 = suffix_replace(r1, suffix, "ate")
|
|
|
|
|
else:
|
|
|
|
|
r1 = ""
|
|
|
|
|
|
|
|
|
|
if len(r2) >= len(suffix):
|
|
|
|
|
r2 = suffix_replace(r2, suffix, "ate")
|
|
|
|
|
else:
|
|
|
|
|
r2 = ""
|
|
|
|
|
|
|
|
|
|
elif suffix == "alize":
|
|
|
|
|
word = word[:-3]
|
|
|
|
|
r1 = r1[:-3]
|
|
|
|
|
r2 = r2[:-3]
|
|
|
|
|
|
|
|
|
|
elif suffix in ("icate", "iciti", "ical"):
|
|
|
|
|
word = suffix_replace(word, suffix, "ic")
|
|
|
|
|
|
|
|
|
|
if len(r1) >= len(suffix):
|
|
|
|
|
r1 = suffix_replace(r1, suffix, "ic")
|
|
|
|
|
else:
|
|
|
|
|
r1 = ""
|
|
|
|
|
|
|
|
|
|
if len(r2) >= len(suffix):
|
|
|
|
|
r2 = suffix_replace(r2, suffix, "ic")
|
|
|
|
|
else:
|
|
|
|
|
r2 = ""
|
|
|
|
|
|
|
|
|
|
elif suffix in ("ful", "ness"):
|
|
|
|
|
word = word[: -len(suffix)]
|
|
|
|
|
r1 = r1[: -len(suffix)]
|
|
|
|
|
r2 = r2[: -len(suffix)]
|
|
|
|
|
|
|
|
|
|
elif suffix == "ative" and r2.endswith(suffix):
|
|
|
|
|
word = word[:-5]
|
|
|
|
|
r1 = r1[:-5]
|
|
|
|
|
r2 = r2[:-5]
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
# STEP 4
|
|
|
|
|
for suffix in self.__step4_suffixes:
|
|
|
|
|
if word.endswith(suffix):
|
|
|
|
|
if r2.endswith(suffix):
|
|
|
|
|
if suffix == "ion":
|
|
|
|
|
if word[-4] in "st":
|
|
|
|
|
word = word[:-3]
|
|
|
|
|
r1 = r1[:-3]
|
|
|
|
|
r2 = r2[:-3]
|
|
|
|
|
else:
|
|
|
|
|
word = word[: -len(suffix)]
|
|
|
|
|
r1 = r1[: -len(suffix)]
|
|
|
|
|
r2 = r2[: -len(suffix)]
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
# STEP 5
|
|
|
|
|
if r2.endswith("l") and word[-2] == "l":
|
|
|
|
|
word = word[:-1]
|
|
|
|
|
elif r2.endswith("e"):
|
|
|
|
|
word = word[:-1]
|
|
|
|
|
elif r1.endswith("e"):
|
|
|
|
|
if len(word) >= 4 and (
|
|
|
|
|
word[-2] in self.__vowels
|
|
|
|
|
or word[-2] in "wxY"
|
|
|
|
|
or word[-3] not in self.__vowels
|
|
|
|
|
or word[-4] in self.__vowels
|
|
|
|
|
):
|
|
|
|
|
word = word[:-1]
|
|
|
|
|
|
|
|
|
|
word = word.replace("Y", "y")
|
|
|
|
|
|
|
|
|
|
return word
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class FinnishStemmer(_StandardStemmer):
|
|
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
The Finnish Snowball stemmer.
|
|
|
|
|
|
|
|
|
|
:cvar __vowels: The Finnish vowels.
|
|
|
|
|
:type __vowels: unicode
|
|
|
|
|
:cvar __restricted_vowels: A subset of the Finnish vowels.
|
|
|
|
|
:type __restricted_vowels: unicode
|
|
|
|
|
:cvar __long_vowels: The Finnish vowels in their long forms.
|
|
|
|
|
:type __long_vowels: tuple
|
|
|
|
|
:cvar __consonants: The Finnish consonants.
|
|
|
|
|
:type __consonants: unicode
|
|
|
|
|
:cvar __double_consonants: The Finnish double consonants.
|
|
|
|
|
:type __double_consonants: tuple
|
|
|
|
|
:cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm.
|
|
|
|
|
:type __step1_suffixes: tuple
|
|
|
|
|
:cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm.
|
|
|
|
|
:type __step2_suffixes: tuple
|
|
|
|
|
:cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm.
|
|
|
|
|
:type __step3_suffixes: tuple
|
|
|
|
|
:cvar __step4_suffixes: Suffixes to be deleted in step 4 of the algorithm.
|
|
|
|
|
:type __step4_suffixes: tuple
|
|
|
|
|
:note: A detailed description of the Finnish
|
|
|
|
|
stemming algorithm can be found under
|
|
|
|
|
http://snowball.tartarus.org/algorithms/finnish/stemmer.html
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
__vowels = "aeiouy\xE4\xF6"
|
|
|
|
|
__restricted_vowels = "aeiou\xE4\xF6"
|
|
|
|
|
__long_vowels = ("aa", "ee", "ii", "oo", "uu", "\xE4\xE4", "\xF6\xF6")
|
|
|
|
|
__consonants = "bcdfghjklmnpqrstvwxz"
|
|
|
|
|
__double_consonants = (
|
|
|
|
|
"bb",
|
|
|
|
|
"cc",
|
|
|
|
|
"dd",
|
|
|
|
|
"ff",
|
|
|
|
|
"gg",
|
|
|
|
|
"hh",
|
|
|
|
|
"jj",
|
|
|
|
|
"kk",
|
|
|
|
|
"ll",
|
|
|
|
|
"mm",
|
|
|
|
|
"nn",
|
|
|
|
|
"pp",
|
|
|
|
|
"qq",
|
|
|
|
|
"rr",
|
|
|
|
|
"ss",
|
|
|
|
|
"tt",
|
|
|
|
|
"vv",
|
|
|
|
|
"ww",
|
|
|
|
|
"xx",
|
|
|
|
|
"zz",
|
|
|
|
|
)
|
|
|
|
|
__step1_suffixes = (
|
|
|
|
|
"kaan",
|
|
|
|
|
"k\xE4\xE4n",
|
|
|
|
|
"sti",
|
|
|
|
|
"kin",
|
|
|
|
|
"han",
|
|
|
|
|
"h\xE4n",
|
|
|
|
|
"ko",
|
|
|
|
|
"k\xF6",
|
|
|
|
|
"pa",
|
|
|
|
|
"p\xE4",
|
|
|
|
|
)
|
|
|
|
|
__step2_suffixes = ("nsa", "ns\xE4", "mme", "nne", "si", "ni", "an", "\xE4n", "en")
|
|
|
|
|
__step3_suffixes = (
|
|
|
|
|
"siin",
|
|
|
|
|
"tten",
|
|
|
|
|
"seen",
|
|
|
|
|
"han",
|
|
|
|
|
"hen",
|
|
|
|
|
"hin",
|
|
|
|
|
"hon",
|
|
|
|
|
"h\xE4n",
|
|
|
|
|
"h\xF6n",
|
|
|
|
|
"den",
|
|
|
|
|
"tta",
|
|
|
|
|
"tt\xE4",
|
|
|
|
|
"ssa",
|
|
|
|
|
"ss\xE4",
|
|
|
|
|
"sta",
|
|
|
|
|
"st\xE4",
|
|
|
|
|
"lla",
|
|
|
|
|
"ll\xE4",
|
|
|
|
|
"lta",
|
|
|
|
|
"lt\xE4",
|
|
|
|
|
"lle",
|
|
|
|
|
"ksi",
|
|
|
|
|
"ine",
|
|
|
|
|
"ta",
|
|
|
|
|
"t\xE4",
|
|
|
|
|
"na",
|
|
|
|
|
"n\xE4",
|
|
|
|
|
"a",
|
|
|
|
|
"\xE4",
|
|
|
|
|
"n",
|
|
|
|
|
)
|
|
|
|
|
__step4_suffixes = (
|
|
|
|
|
"impi",
|
|
|
|
|
"impa",
|
|
|
|
|
"imp\xE4",
|
|
|
|
|
"immi",
|
|
|
|
|
"imma",
|
|
|
|
|
"imm\xE4",
|
|
|
|
|
"mpi",
|
|
|
|
|
"mpa",
|
|
|
|
|
"mp\xE4",
|
|
|
|
|
"mmi",
|
|
|
|
|
"mma",
|
|
|
|
|
"mm\xE4",
|
|
|
|
|
"eja",
|
|
|
|
|
"ej\xE4",
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
def stem(self, word):
|
|
|
|
|
"""
|
|
|
|
|
Stem a Finnish word and return the stemmed form.
|
|
|
|
|
|
|
|
|
|
:param word: The word that is stemmed.
|
|
|
|
|
:type word: str or unicode
|
|
|
|
|
:return: The stemmed form.
|
|
|
|
|
:rtype: unicode
|
|
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
word = word.lower()
|
|
|
|
|
|
|
|
|
|
if word in self.stopwords:
|
|
|
|
|
return word
|
|
|
|
|
|
|
|
|
|
step3_success = False
|
|
|
|
|
|
|
|
|
|
r1, r2 = self._r1r2_standard(word, self.__vowels)
|
|
|
|
|
|
|
|
|
|
# STEP 1: Particles etc.
|
|
|
|
|
for suffix in self.__step1_suffixes:
|
|
|
|
|
if r1.endswith(suffix):
|
|
|
|
|
if suffix == "sti":
|
|
|
|
|
if suffix in r2:
|
|
|
|
|
word = word[:-3]
|
|
|
|
|
r1 = r1[:-3]
|
|
|
|
|
r2 = r2[:-3]
|
|
|
|
|
else:
|
|
|
|
|
if word[-len(suffix) - 1] in "ntaeiouy\xE4\xF6":
|
|
|
|
|
word = word[: -len(suffix)]
|
|
|
|
|
r1 = r1[: -len(suffix)]
|
|
|
|
|
r2 = r2[: -len(suffix)]
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
# STEP 2: Possessives
|
|
|
|
|
for suffix in self.__step2_suffixes:
|
|
|
|
|
if r1.endswith(suffix):
|
|
|
|
|
if suffix == "si":
|
|
|
|
|
if word[-3] != "k":
|
|
|
|
|
word = word[:-2]
|
|
|
|
|
r1 = r1[:-2]
|
|
|
|
|
r2 = r2[:-2]
|
|
|
|
|
|
|
|
|
|
elif suffix == "ni":
|
|
|
|
|
word = word[:-2]
|
|
|
|
|
r1 = r1[:-2]
|
|
|
|
|
r2 = r2[:-2]
|
|
|
|
|
if word.endswith("kse"):
|
|
|
|
|
word = suffix_replace(word, "kse", "ksi")
|
|
|
|
|
|
|
|
|
|
if r1.endswith("kse"):
|
|
|
|
|
r1 = suffix_replace(r1, "kse", "ksi")
|
|
|
|
|
|
|
|
|
|
if r2.endswith("kse"):
|
|
|
|
|
r2 = suffix_replace(r2, "kse", "ksi")
|
|
|
|
|
|
|
|
|
|
elif suffix == "an":
|
|
|
|
|
if word[-4:-2] in ("ta", "na") or word[-5:-2] in (
|
|
|
|
|
"ssa",
|
|
|
|
|
"sta",
|
|
|
|
|
"lla",
|
|
|
|
|
"lta",
|
|
|
|
|
):
|
|
|
|
|
word = word[:-2]
|
|
|
|
|
r1 = r1[:-2]
|
|
|
|
|
r2 = r2[:-2]
|
|
|
|
|
|
|
|
|
|
elif suffix == "\xE4n":
|
|
|
|
|
if word[-4:-2] in ("t\xE4", "n\xE4") or word[-5:-2] in (
|
|
|
|
|
"ss\xE4",
|
|
|
|
|
"st\xE4",
|
|
|
|
|
"ll\xE4",
|
|
|
|
|
"lt\xE4",
|
|
|
|
|
):
|
|
|
|
|
word = word[:-2]
|
|
|
|
|
r1 = r1[:-2]
|
|
|
|
|
r2 = r2[:-2]
|
|
|
|
|
|
|
|
|
|
elif suffix == "en":
|
|
|
|
|
if word[-5:-2] in ("lle", "ine"):
|
|
|
|
|
word = word[:-2]
|
|
|
|
|
r1 = r1[:-2]
|
|
|
|
|
r2 = r2[:-2]
|
|
|
|
|
else:
|
|
|
|
|
word = word[:-3]
|
|
|
|
|
r1 = r1[:-3]
|
|
|
|
|
r2 = r2[:-3]
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
# STEP 3: Cases
|
|
|
|
|
for suffix in self.__step3_suffixes:
|
|
|
|
|
if r1.endswith(suffix):
|
|
|
|
|
if suffix in ("han", "hen", "hin", "hon", "h\xE4n", "h\xF6n"):
|
|
|
|
|
if (
|
|
|
|
|
(suffix == "han" and word[-4] == "a")
|
|
|
|
|
or (suffix == "hen" and word[-4] == "e")
|
|
|
|
|
or (suffix == "hin" and word[-4] == "i")
|
|
|
|
|
or (suffix == "hon" and word[-4] == "o")
|
|
|
|
|
or (suffix == "h\xE4n" and word[-4] == "\xE4")
|
|
|
|
|
or (suffix == "h\xF6n" and word[-4] == "\xF6")
|
|
|
|
|
):
|
|
|
|
|
word = word[:-3]
|
|
|
|
|
r1 = r1[:-3]
|
|
|
|
|
r2 = r2[:-3]
|
|
|
|
|
step3_success = True
|
|
|
|
|
|
|
|
|
|
elif suffix in ("siin", "den", "tten"):
|
|
|
|
|
if (
|
|
|
|
|
word[-len(suffix) - 1] == "i"
|
|
|
|
|
and word[-len(suffix) - 2] in self.__restricted_vowels
|
|
|
|
|
):
|
|
|
|
|
word = word[: -len(suffix)]
|
|
|
|
|
r1 = r1[: -len(suffix)]
|
|
|
|
|
r2 = r2[: -len(suffix)]
|
|
|
|
|
step3_success = True
|
|
|
|
|
else:
|
|
|
|
|
continue
|
|
|
|
|
|
|
|
|
|
elif suffix == "seen":
|
|
|
|
|
if word[-6:-4] in self.__long_vowels:
|
|
|
|
|
word = word[:-4]
|
|
|
|
|
r1 = r1[:-4]
|
|
|
|
|
r2 = r2[:-4]
|
|
|
|
|
step3_success = True
|
|
|
|
|
else:
|
|
|
|
|
continue
|
|
|
|
|
|
|
|
|
|
elif suffix in ("a", "\xE4"):
|
|
|
|
|
if word[-2] in self.__vowels and word[-3] in self.__consonants:
|
|
|
|
|
word = word[:-1]
|
|
|
|
|
r1 = r1[:-1]
|
|
|
|
|
r2 = r2[:-1]
|
|
|
|
|
step3_success = True
|
|
|
|
|
|
|
|
|
|
elif suffix in ("tta", "tt\xE4"):
|
|
|
|
|
if word[-4] == "e":
|
|
|
|
|
word = word[:-3]
|
|
|
|
|
r1 = r1[:-3]
|
|
|
|
|
r2 = r2[:-3]
|
|
|
|
|
step3_success = True
|
|
|
|
|
|
|
|
|
|
elif suffix == "n":
|
|
|
|
|
word = word[:-1]
|
|
|
|
|
r1 = r1[:-1]
|
|
|
|
|
r2 = r2[:-1]
|
|
|
|
|
step3_success = True
|
|
|
|
|
|
|
|
|
|
if word[-2:] == "ie" or word[-2:] in self.__long_vowels:
|
|
|
|
|
word = word[:-1]
|
|
|
|
|
r1 = r1[:-1]
|
|
|
|
|
r2 = r2[:-1]
|
|
|
|
|
else:
|
|
|
|
|
word = word[: -len(suffix)]
|
|
|
|
|
r1 = r1[: -len(suffix)]
|
|
|
|
|
r2 = r2[: -len(suffix)]
|
|
|
|
|
step3_success = True
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
# STEP 4: Other endings
|
|
|
|
|
for suffix in self.__step4_suffixes:
|
|
|
|
|
if r2.endswith(suffix):
|
|
|
|
|
if suffix in ("mpi", "mpa", "mp\xE4", "mmi", "mma", "mm\xE4"):
|
|
|
|
|
if word[-5:-3] != "po":
|
|
|
|
|
word = word[:-3]
|
|
|
|
|
r1 = r1[:-3]
|
|
|
|
|
r2 = r2[:-3]
|
|
|
|
|
else:
|
|
|
|
|
word = word[: -len(suffix)]
|
|
|
|
|
r1 = r1[: -len(suffix)]
|
|
|
|
|
r2 = r2[: -len(suffix)]
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
# STEP 5: Plurals
|
|
|
|
|
if step3_success and len(r1) >= 1 and r1[-1] in "ij":
|
|
|
|
|
word = word[:-1]
|
|
|
|
|
r1 = r1[:-1]
|
|
|
|
|
|
|
|
|
|
elif (
|
|
|
|
|
not step3_success
|
|
|
|
|
and len(r1) >= 2
|
|
|
|
|
and r1[-1] == "t"
|
|
|
|
|
and r1[-2] in self.__vowels
|
|
|
|
|
):
|
|
|
|
|
word = word[:-1]
|
|
|
|
|
r1 = r1[:-1]
|
|
|
|
|
r2 = r2[:-1]
|
|
|
|
|
if r2.endswith("imma"):
|
|
|
|
|
word = word[:-4]
|
|
|
|
|
r1 = r1[:-4]
|
|
|
|
|
elif r2.endswith("mma") and r2[-5:-3] != "po":
|
|
|
|
|
word = word[:-3]
|
|
|
|
|
r1 = r1[:-3]
|
|
|
|
|
|
|
|
|
|
# STEP 6: Tidying up
|
|
|
|
|
if r1[-2:] in self.__long_vowels:
|
|
|
|
|
word = word[:-1]
|
|
|
|
|
r1 = r1[:-1]
|
|
|
|
|
|
|
|
|
|
if len(r1) >= 2 and r1[-2] in self.__consonants and r1[-1] in "a\xE4ei":
|
|
|
|
|
word = word[:-1]
|
|
|
|
|
r1 = r1[:-1]
|
|
|
|
|
|
|
|
|
|
if r1.endswith(("oj", "uj")):
|
|
|
|
|
word = word[:-1]
|
|
|
|
|
r1 = r1[:-1]
|
|
|
|
|
|
|
|
|
|
if r1.endswith("jo"):
|
|
|
|
|
word = word[:-1]
|
|
|
|
|
r1 = r1[:-1]
|
|
|
|
|
|
|
|
|
|
# If the word ends with a double consonant
|
|
|
|
|
# followed by zero or more vowels, the last consonant is removed.
|
|
|
|
|
for i in range(1, len(word)):
|
|
|
|
|
if word[-i] in self.__vowels:
|
|
|
|
|
continue
|
|
|
|
|
else:
|
|
|
|
|
if i == 1:
|
|
|
|
|
if word[-i - 1 :] in self.__double_consonants:
|
|
|
|
|
word = word[:-1]
|
|
|
|
|
else:
|
|
|
|
|
if word[-i - 1 : -i + 1] in self.__double_consonants:
|
|
|
|
|
word = "".join((word[:-i], word[-i + 1 :]))
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
return word
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class FrenchStemmer(_StandardStemmer):
|
|
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
The French Snowball stemmer.
|
|
|
|
|
|
|
|
|
|
:cvar __vowels: The French vowels.
|
|
|
|
|
:type __vowels: unicode
|
|
|
|
|
:cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm.
|
|
|
|
|
:type __step1_suffixes: tuple
|
|
|
|
|
:cvar __step2a_suffixes: Suffixes to be deleted in step 2a of the algorithm.
|
|
|
|
|
:type __step2a_suffixes: tuple
|
|
|
|
|
:cvar __step2b_suffixes: Suffixes to be deleted in step 2b of the algorithm.
|
|
|
|
|
:type __step2b_suffixes: tuple
|
|
|
|
|
:cvar __step4_suffixes: Suffixes to be deleted in step 4 of the algorithm.
|
|
|
|
|
:type __step4_suffixes: tuple
|
|
|
|
|
:note: A detailed description of the French
|
|
|
|
|
stemming algorithm can be found under
|
|
|
|
|
http://snowball.tartarus.org/algorithms/french/stemmer.html
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
__vowels = "aeiouy\xE2\xE0\xEB\xE9\xEA\xE8\xEF\xEE\xF4\xFB\xF9"
|
|
|
|
|
__step1_suffixes = (
|
|
|
|
|
"issements",
|
|
|
|
|
"issement",
|
|
|
|
|
"atrices",
|
|
|
|
|
"atrice",
|
|
|
|
|
"ateurs",
|
|
|
|
|
"ations",
|
|
|
|
|
"logies",
|
|
|
|
|
"usions",
|
|
|
|
|
"utions",
|
|
|
|
|
"ements",
|
|
|
|
|
"amment",
|
|
|
|
|
"emment",
|
|
|
|
|
"ances",
|
|
|
|
|
"iqUes",
|
|
|
|
|
"ismes",
|
|
|
|
|
"ables",
|
|
|
|
|
"istes",
|
|
|
|
|
"ateur",
|
|
|
|
|
"ation",
|
|
|
|
|
"logie",
|
|
|
|
|
"usion",
|
|
|
|
|
"ution",
|
|
|
|
|
"ences",
|
|
|
|
|
"ement",
|
|
|
|
|
"euses",
|
|
|
|
|
"ments",
|
|
|
|
|
"ance",
|
|
|
|
|
"iqUe",
|
|
|
|
|
"isme",
|
|
|
|
|
"able",
|
|
|
|
|
"iste",
|
|
|
|
|
"ence",
|
|
|
|
|
"it\xE9s",
|
|
|
|
|
"ives",
|
|
|
|
|
"eaux",
|
|
|
|
|
"euse",
|
|
|
|
|
"ment",
|
|
|
|
|
"eux",
|
|
|
|
|
"it\xE9",
|
|
|
|
|
"ive",
|
|
|
|
|
"ifs",
|
|
|
|
|
"aux",
|
|
|
|
|
"if",
|
|
|
|
|
)
|
|
|
|
|
__step2a_suffixes = (
|
|
|
|
|
"issaIent",
|
|
|
|
|
"issantes",
|
|
|
|
|
"iraIent",
|
|
|
|
|
"issante",
|
|
|
|
|
"issants",
|
|
|
|
|
"issions",
|
|
|
|
|
"irions",
|
|
|
|
|
"issais",
|
|
|
|
|
"issait",
|
|
|
|
|
"issant",
|
|
|
|
|
"issent",
|
|
|
|
|
"issiez",
|
|
|
|
|
"issons",
|
|
|
|
|
"irais",
|
|
|
|
|
"irait",
|
|
|
|
|
"irent",
|
|
|
|
|
"iriez",
|
|
|
|
|
"irons",
|
|
|
|
|
"iront",
|
|
|
|
|
"isses",
|
|
|
|
|
"issez",
|
|
|
|
|
"\xEEmes",
|
|
|
|
|
"\xEEtes",
|
|
|
|
|
"irai",
|
|
|
|
|
"iras",
|
|
|
|
|
"irez",
|
|
|
|
|
"isse",
|
|
|
|
|
"ies",
|
|
|
|
|
"ira",
|
|
|
|
|
"\xEEt",
|
|
|
|
|
"ie",
|
|
|
|
|
"ir",
|
|
|
|
|
"is",
|
|
|
|
|
"it",
|
|
|
|
|
"i",
|
|
|
|
|
)
|
|
|
|
|
__step2b_suffixes = (
|
|
|
|
|
"eraIent",
|
|
|
|
|
"assions",
|
|
|
|
|
"erions",
|
|
|
|
|
"assent",
|
|
|
|
|
"assiez",
|
|
|
|
|
"\xE8rent",
|
|
|
|
|
"erais",
|
|
|
|
|
"erait",
|
|
|
|
|
"eriez",
|
|
|
|
|
"erons",
|
|
|
|
|
"eront",
|
|
|
|
|
"aIent",
|
|
|
|
|
"antes",
|
|
|
|
|
"asses",
|
|
|
|
|
"ions",
|
|
|
|
|
"erai",
|
|
|
|
|
"eras",
|
|
|
|
|
"erez",
|
|
|
|
|
"\xE2mes",
|
|
|
|
|
"\xE2tes",
|
|
|
|
|
"ante",
|
|
|
|
|
"ants",
|
|
|
|
|
"asse",
|
|
|
|
|
"\xE9es",
|
|
|
|
|
"era",
|
|
|
|
|
"iez",
|
|
|
|
|
"ais",
|
|
|
|
|
"ait",
|
|
|
|
|
"ant",
|
|
|
|
|
"\xE9e",
|
|
|
|
|
"\xE9s",
|
|
|
|
|
"er",
|
|
|
|
|
"ez",
|
|
|
|
|
"\xE2t",
|
|
|
|
|
"ai",
|
|
|
|
|
"as",
|
|
|
|
|
"\xE9",
|
|
|
|
|
"a",
|
|
|
|
|
)
|
|
|
|
|
__step4_suffixes = ("i\xE8re", "I\xE8re", "ion", "ier", "Ier", "e", "\xEB")
|
|
|
|
|
|
|
|
|
|
def stem(self, word):
|
|
|
|
|
"""
|
|
|
|
|
Stem a French word and return the stemmed form.
|
|
|
|
|
|
|
|
|
|
:param word: The word that is stemmed.
|
|
|
|
|
:type word: str or unicode
|
|
|
|
|
:return: The stemmed form.
|
|
|
|
|
:rtype: unicode
|
|
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
word = word.lower()
|
|
|
|
|
|
|
|
|
|
if word in self.stopwords:
|
|
|
|
|
return word
|
|
|
|
|
|
|
|
|
|
step1_success = False
|
|
|
|
|
rv_ending_found = False
|
|
|
|
|
step2a_success = False
|
|
|
|
|
step2b_success = False
|
|
|
|
|
|
|
|
|
|
# Every occurrence of 'u' after 'q' is put into upper case.
|
|
|
|
|
for i in range(1, len(word)):
|
|
|
|
|
if word[i - 1] == "q" and word[i] == "u":
|
|
|
|
|
word = "".join((word[:i], "U", word[i + 1 :]))
|
|
|
|
|
|
|
|
|
|
# Every occurrence of 'u' and 'i'
|
|
|
|
|
# between vowels is put into upper case.
|
|
|
|
|
# Every occurrence of 'y' preceded or
|
|
|
|
|
# followed by a vowel is also put into upper case.
|
|
|
|
|
for i in range(1, len(word) - 1):
|
|
|
|
|
if word[i - 1] in self.__vowels and word[i + 1] in self.__vowels:
|
|
|
|
|
if word[i] == "u":
|
|
|
|
|
word = "".join((word[:i], "U", word[i + 1 :]))
|
|
|
|
|
|
|
|
|
|
elif word[i] == "i":
|
|
|
|
|
word = "".join((word[:i], "I", word[i + 1 :]))
|
|
|
|
|
|
|
|
|
|
if word[i - 1] in self.__vowels or word[i + 1] in self.__vowels:
|
|
|
|
|
if word[i] == "y":
|
|
|
|
|
word = "".join((word[:i], "Y", word[i + 1 :]))
|
|
|
|
|
|
|
|
|
|
r1, r2 = self._r1r2_standard(word, self.__vowels)
|
|
|
|
|
rv = self.__rv_french(word, self.__vowels)
|
|
|
|
|
|
|
|
|
|
# STEP 1: Standard suffix removal
|
|
|
|
|
for suffix in self.__step1_suffixes:
|
|
|
|
|
if word.endswith(suffix):
|
|
|
|
|
if suffix == "eaux":
|
|
|
|
|
word = word[:-1]
|
|
|
|
|
step1_success = True
|
|
|
|
|
|
|
|
|
|
elif suffix in ("euse", "euses"):
|
|
|
|
|
if suffix in r2:
|
|
|
|
|
word = word[: -len(suffix)]
|
|
|
|
|
step1_success = True
|
|
|
|
|
|
|
|
|
|
elif suffix in r1:
|
|
|
|
|
word = suffix_replace(word, suffix, "eux")
|
|
|
|
|
step1_success = True
|
|
|
|
|
|
|
|
|
|
elif suffix in ("ement", "ements") and suffix in rv:
|
|
|
|
|
word = word[: -len(suffix)]
|
|
|
|
|
step1_success = True
|
|
|
|
|
|
|
|
|
|
if word[-2:] == "iv" and "iv" in r2:
|
|
|
|
|
word = word[:-2]
|
|
|
|
|
|
|
|
|
|
if word[-2:] == "at" and "at" in r2:
|
|
|
|
|
word = word[:-2]
|
|
|
|
|
|
|
|
|
|
elif word[-3:] == "eus":
|
|
|
|
|
if "eus" in r2:
|
|
|
|
|
word = word[:-3]
|
|
|
|
|
elif "eus" in r1:
|
|
|
|
|
word = "".join((word[:-1], "x"))
|
|
|
|
|
|
|
|
|
|
elif word[-3:] in ("abl", "iqU"):
|
|
|
|
|
if "abl" in r2 or "iqU" in r2:
|
|
|
|
|
word = word[:-3]
|
|
|
|
|
|
|
|
|
|
elif word[-3:] in ("i\xE8r", "I\xE8r"):
|
|
|
|
|
if "i\xE8r" in rv or "I\xE8r" in rv:
|
|
|
|
|
word = "".join((word[:-3], "i"))
|
|
|
|
|
|
|
|
|
|
elif suffix == "amment" and suffix in rv:
|
|
|
|
|
word = suffix_replace(word, "amment", "ant")
|
|
|
|
|
rv = suffix_replace(rv, "amment", "ant")
|
|
|
|
|
rv_ending_found = True
|
|
|
|
|
|
|
|
|
|
elif suffix == "emment" and suffix in rv:
|
|
|
|
|
word = suffix_replace(word, "emment", "ent")
|
|
|
|
|
rv_ending_found = True
|
|
|
|
|
|
|
|
|
|
elif (
|
|
|
|
|
suffix in ("ment", "ments")
|
|
|
|
|
and suffix in rv
|
|
|
|
|
and not rv.startswith(suffix)
|
|
|
|
|
and rv[rv.rindex(suffix) - 1] in self.__vowels
|
|
|
|
|
):
|
|
|
|
|
word = word[: -len(suffix)]
|
|
|
|
|
rv = rv[: -len(suffix)]
|
|
|
|
|
rv_ending_found = True
|
|
|
|
|
|
|
|
|
|
elif suffix == "aux" and suffix in r1:
|
|
|
|
|
word = "".join((word[:-2], "l"))
|
|
|
|
|
step1_success = True
|
|
|
|
|
|
|
|
|
|
elif (
|
|
|
|
|
suffix in ("issement", "issements")
|
|
|
|
|
and suffix in r1
|
|
|
|
|
and word[-len(suffix) - 1] not in self.__vowels
|
|
|
|
|
):
|
|
|
|
|
word = word[: -len(suffix)]
|
|
|
|
|
step1_success = True
|
|
|
|
|
|
|
|
|
|
elif (
|
|
|
|
|
suffix
|
|
|
|
|
in (
|
|
|
|
|
"ance",
|
|
|
|
|
"iqUe",
|
|
|
|
|
"isme",
|
|
|
|
|
"able",
|
|
|
|
|
"iste",
|
|
|
|
|
"eux",
|
|
|
|
|
"ances",
|
|
|
|
|
"iqUes",
|
|
|
|
|
"ismes",
|
|
|
|
|
"ables",
|
|
|
|
|
"istes",
|
|
|
|
|
)
|
|
|
|
|
and suffix in r2
|
|
|
|
|
):
|
|
|
|
|
word = word[: -len(suffix)]
|
|
|
|
|
step1_success = True
|
|
|
|
|
|
|
|
|
|
elif (
|
|
|
|
|
suffix
|
|
|
|
|
in ("atrice", "ateur", "ation", "atrices", "ateurs", "ations")
|
|
|
|
|
and suffix in r2
|
|
|
|
|
):
|
|
|
|
|
word = word[: -len(suffix)]
|
|
|
|
|
step1_success = True
|
|
|
|
|
|
|
|
|
|
if word[-2:] == "ic":
|
|
|
|
|
if "ic" in r2:
|
|
|
|
|
word = word[:-2]
|
|
|
|
|
else:
|
|
|
|
|
word = "".join((word[:-2], "iqU"))
|
|
|
|
|
|
|
|
|
|
elif suffix in ("logie", "logies") and suffix in r2:
|
|
|
|
|
word = suffix_replace(word, suffix, "log")
|
|
|
|
|
step1_success = True
|
|
|
|
|
|
|
|
|
|
elif suffix in ("usion", "ution", "usions", "utions") and suffix in r2:
|
|
|
|
|
word = suffix_replace(word, suffix, "u")
|
|
|
|
|
step1_success = True
|
|
|
|
|
|
|
|
|
|
elif suffix in ("ence", "ences") and suffix in r2:
|
|
|
|
|
word = suffix_replace(word, suffix, "ent")
|
|
|
|
|
step1_success = True
|
|
|
|
|
|
|
|
|
|
elif suffix in ("it\xE9", "it\xE9s") and suffix in r2:
|
|
|
|
|
word = word[: -len(suffix)]
|
|
|
|
|
step1_success = True
|
|
|
|
|
|
|
|
|
|
if word[-4:] == "abil":
|
|
|
|
|
if "abil" in r2:
|
|
|
|
|
word = word[:-4]
|
|
|
|
|
else:
|
|
|
|
|
word = "".join((word[:-2], "l"))
|
|
|
|
|
|
|
|
|
|
elif word[-2:] == "ic":
|
|
|
|
|
if "ic" in r2:
|
|
|
|
|
word = word[:-2]
|
|
|
|
|
else:
|
|
|
|
|
word = "".join((word[:-2], "iqU"))
|
|
|
|
|
|
|
|
|
|
elif word[-2:] == "iv":
|
|
|
|
|
if "iv" in r2:
|
|
|
|
|
word = word[:-2]
|
|
|
|
|
|
|
|
|
|
elif suffix in ("if", "ive", "ifs", "ives") and suffix in r2:
|
|
|
|
|
word = word[: -len(suffix)]
|
|
|
|
|
step1_success = True
|
|
|
|
|
|
|
|
|
|
if word[-2:] == "at" and "at" in r2:
|
|
|
|
|
word = word[:-2]
|
|
|
|
|
|
|
|
|
|
if word[-2:] == "ic":
|
|
|
|
|
if "ic" in r2:
|
|
|
|
|
word = word[:-2]
|
|
|
|
|
else:
|
|
|
|
|
word = "".join((word[:-2], "iqU"))
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
# STEP 2a: Verb suffixes beginning 'i'
|
|
|
|
|
if not step1_success or rv_ending_found:
|
|
|
|
|
for suffix in self.__step2a_suffixes:
|
|
|
|
|
if word.endswith(suffix):
|
|
|
|
|
if (
|
|
|
|
|
suffix in rv
|
|
|
|
|
and len(rv) > len(suffix)
|
|
|
|
|
and rv[rv.rindex(suffix) - 1] not in self.__vowels
|
|
|
|
|
):
|
|
|
|
|
word = word[: -len(suffix)]
|
|
|
|
|
step2a_success = True
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
# STEP 2b: Other verb suffixes
|
|
|
|
|
if not step2a_success:
|
|
|
|
|
for suffix in self.__step2b_suffixes:
|
|
|
|
|
if rv.endswith(suffix):
|
|
|
|
|
if suffix == "ions" and "ions" in r2:
|
|
|
|
|
word = word[:-4]
|
|
|
|
|
step2b_success = True
|
|
|
|
|
|
|
|
|
|
elif suffix in (
|
|
|
|
|
"eraIent",
|
|
|
|
|
"erions",
|
|
|
|
|
"\xE8rent",
|
|
|
|
|
"erais",
|
|
|
|
|
"erait",
|
|
|
|
|
"eriez",
|
|
|
|
|
"erons",
|
|
|
|
|
"eront",
|
|
|
|
|
"erai",
|
|
|
|
|
"eras",
|
|
|
|
|
"erez",
|
|
|
|
|
"\xE9es",
|
|
|
|
|
"era",
|
|
|
|
|
"iez",
|
|
|
|
|
"\xE9e",
|
|
|
|
|
"\xE9s",
|
|
|
|
|
"er",
|
|
|
|
|
"ez",
|
|
|
|
|
"\xE9",
|
|
|
|
|
):
|
|
|
|
|
word = word[: -len(suffix)]
|
|
|
|
|
step2b_success = True
|
|
|
|
|
|
|
|
|
|
elif suffix in (
|
|
|
|
|
"assions",
|
|
|
|
|
"assent",
|
|
|
|
|
"assiez",
|
|
|
|
|
"aIent",
|
|
|
|
|
"antes",
|
|
|
|
|
"asses",
|
|
|
|
|
"\xE2mes",
|
|
|
|
|
"\xE2tes",
|
|
|
|
|
"ante",
|
|
|
|
|
"ants",
|
|
|
|
|
"asse",
|
|
|
|
|
"ais",
|
|
|
|
|
"ait",
|
|
|
|
|
"ant",
|
|
|
|
|
"\xE2t",
|
|
|
|
|
"ai",
|
|
|
|
|
"as",
|
|
|
|
|
"a",
|
|
|
|
|
):
|
|
|
|
|
word = word[: -len(suffix)]
|
|
|
|
|
rv = rv[: -len(suffix)]
|
|
|
|
|
step2b_success = True
|
|
|
|
|
if rv.endswith("e"):
|
|
|
|
|
word = word[:-1]
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
# STEP 3
|
|
|
|
|
if step1_success or step2a_success or step2b_success:
|
|
|
|
|
if word[-1] == "Y":
|
|
|
|
|
word = "".join((word[:-1], "i"))
|
|
|
|
|
elif word[-1] == "\xE7":
|
|
|
|
|
word = "".join((word[:-1], "c"))
|
|
|
|
|
|
|
|
|
|
# STEP 4: Residual suffixes
|
|
|
|
|
else:
|
|
|
|
|
if len(word) >= 2 and word[-1] == "s" and word[-2] not in "aiou\xE8s":
|
|
|
|
|
word = word[:-1]
|
|
|
|
|
|
|
|
|
|
for suffix in self.__step4_suffixes:
|
|
|
|
|
if word.endswith(suffix):
|
|
|
|
|
if suffix in rv:
|
|
|
|
|
if suffix == "ion" and suffix in r2 and rv[-4] in "st":
|
|
|
|
|
word = word[:-3]
|
|
|
|
|
|
|
|
|
|
elif suffix in ("ier", "i\xE8re", "Ier", "I\xE8re"):
|
|
|
|
|
word = suffix_replace(word, suffix, "i")
|
|
|
|
|
|
|
|
|
|
elif suffix == "e":
|
|
|
|
|
word = word[:-1]
|
|
|
|
|
|
|
|
|
|
elif suffix == "\xEB" and word[-3:-1] == "gu":
|
|
|
|
|
word = word[:-1]
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
# STEP 5: Undouble
|
|
|
|
|
if word.endswith(("enn", "onn", "ett", "ell", "eill")):
|
|
|
|
|
word = word[:-1]
|
|
|
|
|
|
|
|
|
|
# STEP 6: Un-accent
|
|
|
|
|
for i in range(1, len(word)):
|
|
|
|
|
if word[-i] not in self.__vowels:
|
|
|
|
|
i += 1
|
|
|
|
|
else:
|
|
|
|
|
if i != 1 and word[-i] in ("\xE9", "\xE8"):
|
|
|
|
|
word = "".join((word[:-i], "e", word[-i + 1 :]))
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
word = word.replace("I", "i").replace("U", "u").replace("Y", "y")
|
|
|
|
|
|
|
|
|
|
return word
|
|
|
|
|
|
|
|
|
|
def __rv_french(self, word, vowels):
|
|
|
|
|
"""
|
|
|
|
|
Return the region RV that is used by the French stemmer.
|
|
|
|
|
|
|
|
|
|
If the word begins with two vowels, RV is the region after
|
|
|
|
|
the third letter. Otherwise, it is the region after the first
|
|
|
|
|
vowel not at the beginning of the word, or the end of the word
|
|
|
|
|
if these positions cannot be found. (Exceptionally, u'par',
|
|
|
|
|
u'col' or u'tap' at the beginning of a word is also taken to
|
|
|
|
|
define RV as the region to their right.)
|
|
|
|
|
|
|
|
|
|
:param word: The French word whose region RV is determined.
|
|
|
|
|
:type word: str or unicode
|
|
|
|
|
:param vowels: The French vowels that are used to determine
|
|
|
|
|
the region RV.
|
|
|
|
|
:type vowels: unicode
|
|
|
|
|
:return: the region RV for the respective French word.
|
|
|
|
|
:rtype: unicode
|
|
|
|
|
:note: This helper method is invoked by the stem method of
|
|
|
|
|
the subclass FrenchStemmer. It is not to be invoked directly!
|
|
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
rv = ""
|
|
|
|
|
if len(word) >= 2:
|
|
|
|
|
if word.startswith(("par", "col", "tap")) or (
|
|
|
|
|
word[0] in vowels and word[1] in vowels
|
|
|
|
|
):
|
|
|
|
|
rv = word[3:]
|
|
|
|
|
else:
|
|
|
|
|
for i in range(1, len(word)):
|
|
|
|
|
if word[i] in vowels:
|
|
|
|
|
rv = word[i + 1 :]
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
return rv
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class GermanStemmer(_StandardStemmer):
|
|
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
The German Snowball stemmer.
|
|
|
|
|
|
|
|
|
|
:cvar __vowels: The German vowels.
|
|
|
|
|
:type __vowels: unicode
|
|
|
|
|
:cvar __s_ending: Letters that may directly appear before a word final 's'.
|
|
|
|
|
:type __s_ending: unicode
|
|
|
|
|
:cvar __st_ending: Letter that may directly appear before a word final 'st'.
|
|
|
|
|
:type __st_ending: unicode
|
|
|
|
|
:cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm.
|
|
|
|
|
:type __step1_suffixes: tuple
|
|
|
|
|
:cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm.
|
|
|
|
|
:type __step2_suffixes: tuple
|
|
|
|
|
:cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm.
|
|
|
|
|
:type __step3_suffixes: tuple
|
|
|
|
|
:note: A detailed description of the German
|
|
|
|
|
stemming algorithm can be found under
|
|
|
|
|
http://snowball.tartarus.org/algorithms/german/stemmer.html
|
|
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
__vowels = "aeiouy\xE4\xF6\xFC"
|
|
|
|
|
__s_ending = "bdfghklmnrt"
|
|
|
|
|
__st_ending = "bdfghklmnt"
|
|
|
|
|
|
|
|
|
|
__step1_suffixes = ("ern", "em", "er", "en", "es", "e", "s")
|
|
|
|
|
__step2_suffixes = ("est", "en", "er", "st")
|
|
|
|
|
__step3_suffixes = ("isch", "lich", "heit", "keit", "end", "ung", "ig", "ik")
|
|
|
|
|
|
|
|
|
|
def stem(self, word):
|
|
|
|
|
"""
|
|
|
|
|
Stem a German word and return the stemmed form.
|
|
|
|
|
|
|
|
|
|
:param word: The word that is stemmed.
|
|
|
|
|
:type word: str or unicode
|
|
|
|
|
:return: The stemmed form.
|
|
|
|
|
:rtype: unicode
|
|
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
word = word.lower()
|
|
|
|
|
|
|
|
|
|
if word in self.stopwords:
|
|
|
|
|
return word
|
|
|
|
|
|
|
|
|
|
word = word.replace("\xDF", "ss")
|
|
|
|
|
|
|
|
|
|
# Every occurrence of 'u' and 'y'
|
|
|
|
|
# between vowels is put into upper case.
|
|
|
|
|
for i in range(1, len(word) - 1):
|
|
|
|
|
if word[i - 1] in self.__vowels and word[i + 1] in self.__vowels:
|
|
|
|
|
if word[i] == "u":
|
|
|
|
|
word = "".join((word[:i], "U", word[i + 1 :]))
|
|
|
|
|
|
|
|
|
|
elif word[i] == "y":
|
|
|
|
|
word = "".join((word[:i], "Y", word[i + 1 :]))
|
|
|
|
|
|
|
|
|
|
r1, r2 = self._r1r2_standard(word, self.__vowels)
|
|
|
|
|
|
|
|
|
|
# R1 is adjusted so that the region before it
|
|
|
|
|
# contains at least 3 letters.
|
|
|
|
|
for i in range(1, len(word)):
|
|
|
|
|
if word[i] not in self.__vowels and word[i - 1] in self.__vowels:
|
|
|
|
|
if 3 > len(word[: i + 1]) > 0:
|
|
|
|
|
r1 = word[3:]
|
|
|
|
|
elif len(word[: i + 1]) == 0:
|
|
|
|
|
return word
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
# STEP 1
|
|
|
|
|
for suffix in self.__step1_suffixes:
|
|
|
|
|
if r1.endswith(suffix):
|
|
|
|
|
if (
|
|
|
|
|
suffix in ("en", "es", "e")
|
|
|
|
|
and word[-len(suffix) - 4 : -len(suffix)] == "niss"
|
|
|
|
|
):
|
|
|
|
|
word = word[: -len(suffix) - 1]
|
|
|
|
|
r1 = r1[: -len(suffix) - 1]
|
|
|
|
|
r2 = r2[: -len(suffix) - 1]
|
|
|
|
|
|
|
|
|
|
elif suffix == "s":
|
|
|
|
|
if word[-2] in self.__s_ending:
|
|
|
|
|
word = word[:-1]
|
|
|
|
|
r1 = r1[:-1]
|
|
|
|
|
r2 = r2[:-1]
|
|
|
|
|
else:
|
|
|
|
|
word = word[: -len(suffix)]
|
|
|
|
|
r1 = r1[: -len(suffix)]
|
|
|
|
|
r2 = r2[: -len(suffix)]
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
# STEP 2
|
|
|
|
|
for suffix in self.__step2_suffixes:
|
|
|
|
|
if r1.endswith(suffix):
|
|
|
|
|
if suffix == "st":
|
|
|
|
|
if word[-3] in self.__st_ending and len(word[:-3]) >= 3:
|
|
|
|
|
word = word[:-2]
|
|
|
|
|
r1 = r1[:-2]
|
|
|
|
|
r2 = r2[:-2]
|
|
|
|
|
else:
|
|
|
|
|
word = word[: -len(suffix)]
|
|
|
|
|
r1 = r1[: -len(suffix)]
|
|
|
|
|
r2 = r2[: -len(suffix)]
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
# STEP 3: Derivational suffixes
|
|
|
|
|
for suffix in self.__step3_suffixes:
|
|
|
|
|
if r2.endswith(suffix):
|
|
|
|
|
if suffix in ("end", "ung"):
|
|
|
|
|
if (
|
|
|
|
|
"ig" in r2[-len(suffix) - 2 : -len(suffix)]
|
|
|
|
|
and "e" not in r2[-len(suffix) - 3 : -len(suffix) - 2]
|
|
|
|
|
):
|
|
|
|
|
word = word[: -len(suffix) - 2]
|
|
|
|
|
else:
|
|
|
|
|
word = word[: -len(suffix)]
|
|
|
|
|
|
|
|
|
|
elif (
|
|
|
|
|
suffix in ("ig", "ik", "isch")
|
|
|
|
|
and "e" not in r2[-len(suffix) - 1 : -len(suffix)]
|
|
|
|
|
):
|
|
|
|
|
word = word[: -len(suffix)]
|
|
|
|
|
|
|
|
|
|
elif suffix in ("lich", "heit"):
|
|
|
|
|
if (
|
|
|
|
|
"er" in r1[-len(suffix) - 2 : -len(suffix)]
|
|
|
|
|
or "en" in r1[-len(suffix) - 2 : -len(suffix)]
|
|
|
|
|
):
|
|
|
|
|
word = word[: -len(suffix) - 2]
|
|
|
|
|
else:
|
|
|
|
|
word = word[: -len(suffix)]
|
|
|
|
|
|
|
|
|
|
elif suffix == "keit":
|
|
|
|
|
if "lich" in r2[-len(suffix) - 4 : -len(suffix)]:
|
|
|
|
|
word = word[: -len(suffix) - 4]
|
|
|
|
|
|
|
|
|
|
elif "ig" in r2[-len(suffix) - 2 : -len(suffix)]:
|
|
|
|
|
word = word[: -len(suffix) - 2]
|
|
|
|
|
else:
|
|
|
|
|
word = word[: -len(suffix)]
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
# Umlaut accents are removed and
|
|
|
|
|
# 'u' and 'y' are put back into lower case.
|
|
|
|
|
word = (
|
|
|
|
|
word.replace("\xE4", "a")
|
|
|
|
|
.replace("\xF6", "o")
|
|
|
|
|
.replace("\xFC", "u")
|
|
|
|
|
.replace("U", "u")
|
|
|
|
|
.replace("Y", "y")
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
return word
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class HungarianStemmer(_LanguageSpecificStemmer):
|
|
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
The Hungarian Snowball stemmer.
|
|
|
|
|
|
|
|
|
|
:cvar __vowels: The Hungarian vowels.
|
|
|
|
|
:type __vowels: unicode
|
|
|
|
|
:cvar __digraphs: The Hungarian digraphs.
|
|
|
|
|
:type __digraphs: tuple
|
|
|
|
|
:cvar __double_consonants: The Hungarian double consonants.
|
|
|
|
|
:type __double_consonants: tuple
|
|
|
|
|
:cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm.
|
|
|
|
|
:type __step1_suffixes: tuple
|
|
|
|
|
:cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm.
|
|
|
|
|
:type __step2_suffixes: tuple
|
|
|
|
|
:cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm.
|
|
|
|
|
:type __step3_suffixes: tuple
|
|
|
|
|
:cvar __step4_suffixes: Suffixes to be deleted in step 4 of the algorithm.
|
|
|
|
|
:type __step4_suffixes: tuple
|
|
|
|
|
:cvar __step5_suffixes: Suffixes to be deleted in step 5 of the algorithm.
|
|
|
|
|
:type __step5_suffixes: tuple
|
|
|
|
|
:cvar __step6_suffixes: Suffixes to be deleted in step 6 of the algorithm.
|
|
|
|
|
:type __step6_suffixes: tuple
|
|
|
|
|
:cvar __step7_suffixes: Suffixes to be deleted in step 7 of the algorithm.
|
|
|
|
|
:type __step7_suffixes: tuple
|
|
|
|
|
:cvar __step8_suffixes: Suffixes to be deleted in step 8 of the algorithm.
|
|
|
|
|
:type __step8_suffixes: tuple
|
|
|
|
|
:cvar __step9_suffixes: Suffixes to be deleted in step 9 of the algorithm.
|
|
|
|
|
:type __step9_suffixes: tuple
|
|
|
|
|
:note: A detailed description of the Hungarian
|
|
|
|
|
stemming algorithm can be found under
|
|
|
|
|
http://snowball.tartarus.org/algorithms/hungarian/stemmer.html
|
|
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
__vowels = "aeiou\xF6\xFC\xE1\xE9\xED\xF3\xF5\xFA\xFB"
|
|
|
|
|
__digraphs = ("cs", "dz", "dzs", "gy", "ly", "ny", "ty", "zs")
|
|
|
|
|
__double_consonants = (
|
|
|
|
|
"bb",
|
|
|
|
|
"cc",
|
|
|
|
|
"ccs",
|
|
|
|
|
"dd",
|
|
|
|
|
"ff",
|
|
|
|
|
"gg",
|
|
|
|
|
"ggy",
|
|
|
|
|
"jj",
|
|
|
|
|
"kk",
|
|
|
|
|
"ll",
|
|
|
|
|
"lly",
|
|
|
|
|
"mm",
|
|
|
|
|
"nn",
|
|
|
|
|
"nny",
|
|
|
|
|
"pp",
|
|
|
|
|
"rr",
|
|
|
|
|
"ss",
|
|
|
|
|
"ssz",
|
|
|
|
|
"tt",
|
|
|
|
|
"tty",
|
|
|
|
|
"vv",
|
|
|
|
|
"zz",
|
|
|
|
|
"zzs",
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
__step1_suffixes = ("al", "el")
|
|
|
|
|
__step2_suffixes = (
|
|
|
|
|
"k\xE9ppen",
|
|
|
|
|
"onk\xE9nt",
|
|
|
|
|
"enk\xE9nt",
|
|
|
|
|
"ank\xE9nt",
|
|
|
|
|
"k\xE9pp",
|
|
|
|
|
"k\xE9nt",
|
|
|
|
|
"ban",
|
|
|
|
|
"ben",
|
|
|
|
|
"nak",
|
|
|
|
|
"nek",
|
|
|
|
|
"val",
|
|
|
|
|
"vel",
|
|
|
|
|
"t\xF3l",
|
|
|
|
|
"t\xF5l",
|
|
|
|
|
"r\xF3l",
|
|
|
|
|
"r\xF5l",
|
|
|
|
|
"b\xF3l",
|
|
|
|
|
"b\xF5l",
|
|
|
|
|
"hoz",
|
|
|
|
|
"hez",
|
|
|
|
|
"h\xF6z",
|
|
|
|
|
"n\xE1l",
|
|
|
|
|
"n\xE9l",
|
|
|
|
|
"\xE9rt",
|
|
|
|
|
"kor",
|
|
|
|
|
"ba",
|
|
|
|
|
"be",
|
|
|
|
|
"ra",
|
|
|
|
|
"re",
|
|
|
|
|
"ig",
|
|
|
|
|
"at",
|
|
|
|
|
"et",
|
|
|
|
|
"ot",
|
|
|
|
|
"\xF6t",
|
|
|
|
|
"ul",
|
|
|
|
|
"\xFCl",
|
|
|
|
|
"v\xE1",
|
|
|
|
|
"v\xE9",
|
|
|
|
|
"en",
|
|
|
|
|
"on",
|
|
|
|
|
"an",
|
|
|
|
|
"\xF6n",
|
|
|
|
|
"n",
|
|
|
|
|
"t",
|
|
|
|
|
)
|
|
|
|
|
__step3_suffixes = ("\xE1nk\xE9nt", "\xE1n", "\xE9n")
|
|
|
|
|
__step4_suffixes = (
|
|
|
|
|
"astul",
|
|
|
|
|
"est\xFCl",
|
|
|
|
|
"\xE1stul",
|
|
|
|
|
"\xE9st\xFCl",
|
|
|
|
|
"stul",
|
|
|
|
|
"st\xFCl",
|
|
|
|
|
)
|
|
|
|
|
__step5_suffixes = ("\xE1", "\xE9")
|
|
|
|
|
__step6_suffixes = (
|
|
|
|
|
"ok\xE9",
|
|
|
|
|
"\xF6k\xE9",
|
|
|
|
|
"ak\xE9",
|
|
|
|
|
"ek\xE9",
|
|
|
|
|
"\xE1k\xE9",
|
|
|
|
|
"\xE1\xE9i",
|
|
|
|
|
"\xE9k\xE9",
|
|
|
|
|
"\xE9\xE9i",
|
|
|
|
|
"k\xE9",
|
|
|
|
|
"\xE9i",
|
|
|
|
|
"\xE9\xE9",
|
|
|
|
|
"\xE9",
|
|
|
|
|
)
|
|
|
|
|
__step7_suffixes = (
|
|
|
|
|
"\xE1juk",
|
|
|
|
|
"\xE9j\xFCk",
|
|
|
|
|
"\xFCnk",
|
|
|
|
|
"unk",
|
|
|
|
|
"juk",
|
|
|
|
|
"j\xFCk",
|
|
|
|
|
"\xE1nk",
|
|
|
|
|
"\xE9nk",
|
|
|
|
|
"nk",
|
|
|
|
|
"uk",
|
|
|
|
|
"\xFCk",
|
|
|
|
|
"em",
|
|
|
|
|
"om",
|
|
|
|
|
"am",
|
|
|
|
|
"od",
|
|
|
|
|
"ed",
|
|
|
|
|
"ad",
|
|
|
|
|
"\xF6d",
|
|
|
|
|
"ja",
|
|
|
|
|
"je",
|
|
|
|
|
"\xE1m",
|
|
|
|
|
"\xE1d",
|
|
|
|
|
"\xE9m",
|
|
|
|
|
"\xE9d",
|
|
|
|
|
"m",
|
|
|
|
|
"d",
|
|
|
|
|
"a",
|
|
|
|
|
"e",
|
|
|
|
|
"o",
|
|
|
|
|
"\xE1",
|
|
|
|
|
"\xE9",
|
|
|
|
|
)
|
|
|
|
|
__step8_suffixes = (
|
|
|
|
|
"jaitok",
|
|
|
|
|
"jeitek",
|
|
|
|
|
"jaink",
|
|
|
|
|
"jeink",
|
|
|
|
|
"aitok",
|
|
|
|
|
"eitek",
|
|
|
|
|
"\xE1itok",
|
|
|
|
|
"\xE9itek",
|
|
|
|
|
"jaim",
|
|
|
|
|
"jeim",
|
|
|
|
|
"jaid",
|
|
|
|
|
"jeid",
|
|
|
|
|
"eink",
|
|
|
|
|
"aink",
|
|
|
|
|
"itek",
|
|
|
|
|
"jeik",
|
|
|
|
|
"jaik",
|
|
|
|
|
"\xE1ink",
|
|
|
|
|
"\xE9ink",
|
|
|
|
|
"aim",
|
|
|
|
|
"eim",
|
|
|
|
|
"aid",
|
|
|
|
|
"eid",
|
|
|
|
|
"jai",
|
|
|
|
|
"jei",
|
|
|
|
|
"ink",
|
|
|
|
|
"aik",
|
|
|
|
|
"eik",
|
|
|
|
|
"\xE1im",
|
|
|
|
|
"\xE1id",
|
|
|
|
|
"\xE1ik",
|
|
|
|
|
"\xE9im",
|
|
|
|
|
"\xE9id",
|
|
|
|
|
"\xE9ik",
|
|
|
|
|
"im",
|
|
|
|
|
"id",
|
|
|
|
|
"ai",
|
|
|
|
|
"ei",
|
|
|
|
|
"ik",
|
|
|
|
|
"\xE1i",
|
|
|
|
|
"\xE9i",
|
|
|
|
|
"i",
|
|
|
|
|
)
|
|
|
|
|
__step9_suffixes = ("\xE1k", "\xE9k", "\xF6k", "ok", "ek", "ak", "k")
|
|
|
|
|
|
|
|
|
|
def stem(self, word):
|
|
|
|
|
"""
|
|
|
|
|
Stem an Hungarian word and return the stemmed form.
|
|
|
|
|
|
|
|
|
|
:param word: The word that is stemmed.
|
|
|
|
|
:type word: str or unicode
|
|
|
|
|
:return: The stemmed form.
|
|
|
|
|
:rtype: unicode
|
|
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
word = word.lower()
|
|
|
|
|
|
|
|
|
|
if word in self.stopwords:
|
|
|
|
|
return word
|
|
|
|
|
|
|
|
|
|
r1 = self.__r1_hungarian(word, self.__vowels, self.__digraphs)
|
|
|
|
|
|
|
|
|
|
# STEP 1: Remove instrumental case
|
|
|
|
|
if r1.endswith(self.__step1_suffixes):
|
|
|
|
|
for double_cons in self.__double_consonants:
|
|
|
|
|
if word[-2 - len(double_cons) : -2] == double_cons:
|
|
|
|
|
word = "".join((word[:-4], word[-3]))
|
|
|
|
|
|
|
|
|
|
if r1[-2 - len(double_cons) : -2] == double_cons:
|
|
|
|
|
r1 = "".join((r1[:-4], r1[-3]))
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
# STEP 2: Remove frequent cases
|
|
|
|
|
for suffix in self.__step2_suffixes:
|
|
|
|
|
if word.endswith(suffix):
|
|
|
|
|
if r1.endswith(suffix):
|
|
|
|
|
word = word[: -len(suffix)]
|
|
|
|
|
r1 = r1[: -len(suffix)]
|
|
|
|
|
|
|
|
|
|
if r1.endswith("\xE1"):
|
|
|
|
|
word = "".join((word[:-1], "a"))
|
|
|
|
|
r1 = suffix_replace(r1, "\xE1", "a")
|
|
|
|
|
|
|
|
|
|
elif r1.endswith("\xE9"):
|
|
|
|
|
word = "".join((word[:-1], "e"))
|
|
|
|
|
r1 = suffix_replace(r1, "\xE9", "e")
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
# STEP 3: Remove special cases
|
|
|
|
|
for suffix in self.__step3_suffixes:
|
|
|
|
|
if r1.endswith(suffix):
|
|
|
|
|
if suffix == "\xE9n":
|
|
|
|
|
word = suffix_replace(word, suffix, "e")
|
|
|
|
|
r1 = suffix_replace(r1, suffix, "e")
|
|
|
|
|
else:
|
|
|
|
|
word = suffix_replace(word, suffix, "a")
|
|
|
|
|
r1 = suffix_replace(r1, suffix, "a")
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
# STEP 4: Remove other cases
|
|
|
|
|
for suffix in self.__step4_suffixes:
|
|
|
|
|
if r1.endswith(suffix):
|
|
|
|
|
if suffix == "\xE1stul":
|
|
|
|
|
word = suffix_replace(word, suffix, "a")
|
|
|
|
|
r1 = suffix_replace(r1, suffix, "a")
|
|
|
|
|
|
|
|
|
|
elif suffix == "\xE9st\xFCl":
|
|
|
|
|
word = suffix_replace(word, suffix, "e")
|
|
|
|
|
r1 = suffix_replace(r1, suffix, "e")
|
|
|
|
|
else:
|
|
|
|
|
word = word[: -len(suffix)]
|
|
|
|
|
r1 = r1[: -len(suffix)]
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
# STEP 5: Remove factive case
|
|
|
|
|
for suffix in self.__step5_suffixes:
|
|
|
|
|
if r1.endswith(suffix):
|
|
|
|
|
for double_cons in self.__double_consonants:
|
|
|
|
|
if word[-1 - len(double_cons) : -1] == double_cons:
|
|
|
|
|
word = "".join((word[:-3], word[-2]))
|
|
|
|
|
|
|
|
|
|
if r1[-1 - len(double_cons) : -1] == double_cons:
|
|
|
|
|
r1 = "".join((r1[:-3], r1[-2]))
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
# STEP 6: Remove owned
|
|
|
|
|
for suffix in self.__step6_suffixes:
|
|
|
|
|
if r1.endswith(suffix):
|
|
|
|
|
if suffix in ("\xE1k\xE9", "\xE1\xE9i"):
|
|
|
|
|
word = suffix_replace(word, suffix, "a")
|
|
|
|
|
r1 = suffix_replace(r1, suffix, "a")
|
|
|
|
|
|
|
|
|
|
elif suffix in ("\xE9k\xE9", "\xE9\xE9i", "\xE9\xE9"):
|
|
|
|
|
word = suffix_replace(word, suffix, "e")
|
|
|
|
|
r1 = suffix_replace(r1, suffix, "e")
|
|
|
|
|
else:
|
|
|
|
|
word = word[: -len(suffix)]
|
|
|
|
|
r1 = r1[: -len(suffix)]
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
# STEP 7: Remove singular owner suffixes
|
|
|
|
|
for suffix in self.__step7_suffixes:
|
|
|
|
|
if word.endswith(suffix):
|
|
|
|
|
if r1.endswith(suffix):
|
|
|
|
|
if suffix in ("\xE1nk", "\xE1juk", "\xE1m", "\xE1d", "\xE1"):
|
|
|
|
|
word = suffix_replace(word, suffix, "a")
|
|
|
|
|
r1 = suffix_replace(r1, suffix, "a")
|
|
|
|
|
|
|
|
|
|
elif suffix in ("\xE9nk", "\xE9j\xFCk", "\xE9m", "\xE9d", "\xE9"):
|
|
|
|
|
word = suffix_replace(word, suffix, "e")
|
|
|
|
|
r1 = suffix_replace(r1, suffix, "e")
|
|
|
|
|
else:
|
|
|
|
|
word = word[: -len(suffix)]
|
|
|
|
|
r1 = r1[: -len(suffix)]
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
# STEP 8: Remove plural owner suffixes
|
|
|
|
|
for suffix in self.__step8_suffixes:
|
|
|
|
|
if word.endswith(suffix):
|
|
|
|
|
if r1.endswith(suffix):
|
|
|
|
|
if suffix in (
|
|
|
|
|
"\xE1im",
|
|
|
|
|
"\xE1id",
|
|
|
|
|
"\xE1i",
|
|
|
|
|
"\xE1ink",
|
|
|
|
|
"\xE1itok",
|
|
|
|
|
"\xE1ik",
|
|
|
|
|
):
|
|
|
|
|
word = suffix_replace(word, suffix, "a")
|
|
|
|
|
r1 = suffix_replace(r1, suffix, "a")
|
|
|
|
|
|
|
|
|
|
elif suffix in (
|
|
|
|
|
"\xE9im",
|
|
|
|
|
"\xE9id",
|
|
|
|
|
"\xE9i",
|
|
|
|
|
"\xE9ink",
|
|
|
|
|
"\xE9itek",
|
|
|
|
|
"\xE9ik",
|
|
|
|
|
):
|
|
|
|
|
word = suffix_replace(word, suffix, "e")
|
|
|
|
|
r1 = suffix_replace(r1, suffix, "e")
|
|
|
|
|
else:
|
|
|
|
|
word = word[: -len(suffix)]
|
|
|
|
|
r1 = r1[: -len(suffix)]
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
# STEP 9: Remove plural suffixes
|
|
|
|
|
for suffix in self.__step9_suffixes:
|
|
|
|
|
if word.endswith(suffix):
|
|
|
|
|
if r1.endswith(suffix):
|
|
|
|
|
if suffix == "\xE1k":
|
|
|
|
|
word = suffix_replace(word, suffix, "a")
|
|
|
|
|
elif suffix == "\xE9k":
|
|
|
|
|
word = suffix_replace(word, suffix, "e")
|
|
|
|
|
else:
|
|
|
|
|
word = word[: -len(suffix)]
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
return word
|
|
|
|
|
|
|
|
|
|
def __r1_hungarian(self, word, vowels, digraphs):
|
|
|
|
|
"""
|
|
|
|
|
Return the region R1 that is used by the Hungarian stemmer.
|
|
|
|
|
|
|
|
|
|
If the word begins with a vowel, R1 is defined as the region
|
|
|
|
|
after the first consonant or digraph (= two letters stand for
|
|
|
|
|
one phoneme) in the word. If the word begins with a consonant,
|
|
|
|
|
it is defined as the region after the first vowel in the word.
|
|
|
|
|
If the word does not contain both a vowel and consonant, R1
|
|
|
|
|
is the null region at the end of the word.
|
|
|
|
|
|
|
|
|
|
:param word: The Hungarian word whose region R1 is determined.
|
|
|
|
|
:type word: str or unicode
|
|
|
|
|
:param vowels: The Hungarian vowels that are used to determine
|
|
|
|
|
the region R1.
|
|
|
|
|
:type vowels: unicode
|
|
|
|
|
:param digraphs: The digraphs that are used to determine the
|
|
|
|
|
region R1.
|
|
|
|
|
:type digraphs: tuple
|
|
|
|
|
:return: the region R1 for the respective word.
|
|
|
|
|
:rtype: unicode
|
|
|
|
|
:note: This helper method is invoked by the stem method of the subclass
|
|
|
|
|
HungarianStemmer. It is not to be invoked directly!
|
|
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
r1 = ""
|
|
|
|
|
if word[0] in vowels:
|
|
|
|
|
for digraph in digraphs:
|
|
|
|
|
if digraph in word[1:]:
|
|
|
|
|
r1 = word[word.index(digraph[-1]) + 1 :]
|
|
|
|
|
return r1
|
|
|
|
|
|
|
|
|
|
for i in range(1, len(word)):
|
|
|
|
|
if word[i] not in vowels:
|
|
|
|
|
r1 = word[i + 1 :]
|
|
|
|
|
break
|
|
|
|
|
else:
|
|
|
|
|
for i in range(1, len(word)):
|
|
|
|
|
if word[i] in vowels:
|
|
|
|
|
r1 = word[i + 1 :]
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
return r1
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class ItalianStemmer(_StandardStemmer):
|
|
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
The Italian Snowball stemmer.
|
|
|
|
|
|
|
|
|
|
:cvar __vowels: The Italian vowels.
|
|
|
|
|
:type __vowels: unicode
|
|
|
|
|
:cvar __step0_suffixes: Suffixes to be deleted in step 0 of the algorithm.
|
|
|
|
|
:type __step0_suffixes: tuple
|
|
|
|
|
:cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm.
|
|
|
|
|
:type __step1_suffixes: tuple
|
|
|
|
|
:cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm.
|
|
|
|
|
:type __step2_suffixes: tuple
|
|
|
|
|
:note: A detailed description of the Italian
|
|
|
|
|
stemming algorithm can be found under
|
|
|
|
|
http://snowball.tartarus.org/algorithms/italian/stemmer.html
|
|
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
__vowels = "aeiou\xE0\xE8\xEC\xF2\xF9"
|
|
|
|
|
__step0_suffixes = (
|
|
|
|
|
"gliela",
|
|
|
|
|
"gliele",
|
|
|
|
|
"glieli",
|
|
|
|
|
"glielo",
|
|
|
|
|
"gliene",
|
|
|
|
|
"sene",
|
|
|
|
|
"mela",
|
|
|
|
|
"mele",
|
|
|
|
|
"meli",
|
|
|
|
|
"melo",
|
|
|
|
|
"mene",
|
|
|
|
|
"tela",
|
|
|
|
|
"tele",
|
|
|
|
|
"teli",
|
|
|
|
|
"telo",
|
|
|
|
|
"tene",
|
|
|
|
|
"cela",
|
|
|
|
|
"cele",
|
|
|
|
|
"celi",
|
|
|
|
|
"celo",
|
|
|
|
|
"cene",
|
|
|
|
|
"vela",
|
|
|
|
|
"vele",
|
|
|
|
|
"veli",
|
|
|
|
|
"velo",
|
|
|
|
|
"vene",
|
|
|
|
|
"gli",
|
|
|
|
|
"ci",
|
|
|
|
|
"la",
|
|
|
|
|
"le",
|
|
|
|
|
"li",
|
|
|
|
|
"lo",
|
|
|
|
|
"mi",
|
|
|
|
|
"ne",
|
|
|
|
|
"si",
|
|
|
|
|
"ti",
|
|
|
|
|
"vi",
|
|
|
|
|
)
|
|
|
|
|
__step1_suffixes = (
|
|
|
|
|
"atrice",
|
|
|
|
|
"atrici",
|
|
|
|
|
"azione",
|
|
|
|
|
"azioni",
|
|
|
|
|
"uzione",
|
|
|
|
|
"uzioni",
|
|
|
|
|
"usione",
|
|
|
|
|
"usioni",
|
|
|
|
|
"amento",
|
|
|
|
|
"amenti",
|
|
|
|
|
"imento",
|
|
|
|
|
"imenti",
|
|
|
|
|
"amente",
|
|
|
|
|
"abile",
|
|
|
|
|
"abili",
|
|
|
|
|
"ibile",
|
|
|
|
|
"ibili",
|
|
|
|
|
"mente",
|
|
|
|
|
"atore",
|
|
|
|
|
"atori",
|
|
|
|
|
"logia",
|
|
|
|
|
"logie",
|
|
|
|
|
"anza",
|
|
|
|
|
"anze",
|
|
|
|
|
"iche",
|
|
|
|
|
"ichi",
|
|
|
|
|
"ismo",
|
|
|
|
|
"ismi",
|
|
|
|
|
"ista",
|
|
|
|
|
"iste",
|
|
|
|
|
"isti",
|
|
|
|
|
"ist\xE0",
|
|
|
|
|
"ist\xE8",
|
|
|
|
|
"ist\xEC",
|
|
|
|
|
"ante",
|
|
|
|
|
"anti",
|
|
|
|
|
"enza",
|
|
|
|
|
"enze",
|
|
|
|
|
"ico",
|
|
|
|
|
"ici",
|
|
|
|
|
"ica",
|
|
|
|
|
"ice",
|
|
|
|
|
"oso",
|
|
|
|
|
"osi",
|
|
|
|
|
"osa",
|
|
|
|
|
"ose",
|
|
|
|
|
"it\xE0",
|
|
|
|
|
"ivo",
|
|
|
|
|
"ivi",
|
|
|
|
|
"iva",
|
|
|
|
|
"ive",
|
|
|
|
|
)
|
|
|
|
|
__step2_suffixes = (
|
|
|
|
|
"erebbero",
|
|
|
|
|
"irebbero",
|
|
|
|
|
"assero",
|
|
|
|
|
"assimo",
|
|
|
|
|
"eranno",
|
|
|
|
|
"erebbe",
|
|
|
|
|
"eremmo",
|
|
|
|
|
"ereste",
|
|
|
|
|
"eresti",
|
|
|
|
|
"essero",
|
|
|
|
|
"iranno",
|
|
|
|
|
"irebbe",
|
|
|
|
|
"iremmo",
|
|
|
|
|
"ireste",
|
|
|
|
|
"iresti",
|
|
|
|
|
"iscano",
|
|
|
|
|
"iscono",
|
|
|
|
|
"issero",
|
|
|
|
|
"arono",
|
|
|
|
|
"avamo",
|
|
|
|
|
"avano",
|
|
|
|
|
"avate",
|
|
|
|
|
"eremo",
|
|
|
|
|
"erete",
|
|
|
|
|
"erono",
|
|
|
|
|
"evamo",
|
|
|
|
|
"evano",
|
|
|
|
|
"evate",
|
|
|
|
|
"iremo",
|
|
|
|
|
"irete",
|
|
|
|
|
"irono",
|
|
|
|
|
"ivamo",
|
|
|
|
|
"ivano",
|
|
|
|
|
"ivate",
|
|
|
|
|
"ammo",
|
|
|
|
|
"ando",
|
|
|
|
|
"asse",
|
|
|
|
|
"assi",
|
|
|
|
|
"emmo",
|
|
|
|
|
"enda",
|
|
|
|
|
"ende",
|
|
|
|
|
"endi",
|
|
|
|
|
"endo",
|
|
|
|
|
"erai",
|
|
|
|
|
"erei",
|
|
|
|
|
"Yamo",
|
|
|
|
|
"iamo",
|
|
|
|
|
"immo",
|
|
|
|
|
"irai",
|
|
|
|
|
"irei",
|
|
|
|
|
"isca",
|
|
|
|
|
"isce",
|
|
|
|
|
"isci",
|
|
|
|
|
"isco",
|
|
|
|
|
"ano",
|
|
|
|
|
"are",
|
|
|
|
|
"ata",
|
|
|
|
|
"ate",
|
|
|
|
|
"ati",
|
|
|
|
|
"ato",
|
|
|
|
|
"ava",
|
|
|
|
|
"avi",
|
|
|
|
|
"avo",
|
|
|
|
|
"er\xE0",
|
|
|
|
|
"ere",
|
|
|
|
|
"er\xF2",
|
|
|
|
|
"ete",
|
|
|
|
|
"eva",
|
|
|
|
|
"evi",
|
|
|
|
|
"evo",
|
|
|
|
|
"ir\xE0",
|
|
|
|
|
"ire",
|
|
|
|
|
"ir\xF2",
|
|
|
|
|
"ita",
|
|
|
|
|
"ite",
|
|
|
|
|
"iti",
|
|
|
|
|
"ito",
|
|
|
|
|
"iva",
|
|
|
|
|
"ivi",
|
|
|
|
|
"ivo",
|
|
|
|
|
"ono",
|
|
|
|
|
"uta",
|
|
|
|
|
"ute",
|
|
|
|
|
"uti",
|
|
|
|
|
"uto",
|
|
|
|
|
"ar",
|
|
|
|
|
"ir",
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
def stem(self, word):
|
|
|
|
|
"""
|
|
|
|
|
Stem an Italian word and return the stemmed form.
|
|
|
|
|
|
|
|
|
|
:param word: The word that is stemmed.
|
|
|
|
|
:type word: str or unicode
|
|
|
|
|
:return: The stemmed form.
|
|
|
|
|
:rtype: unicode
|
|
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
word = word.lower()
|
|
|
|
|
|
|
|
|
|
if word in self.stopwords:
|
|
|
|
|
return word
|
|
|
|
|
|
|
|
|
|
step1_success = False
|
|
|
|
|
|
|
|
|
|
# All acute accents are replaced by grave accents.
|
|
|
|
|
word = (
|
|
|
|
|
word.replace("\xE1", "\xE0")
|
|
|
|
|
.replace("\xE9", "\xE8")
|
|
|
|
|
.replace("\xED", "\xEC")
|
|
|
|
|
.replace("\xF3", "\xF2")
|
|
|
|
|
.replace("\xFA", "\xF9")
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
# Every occurrence of 'u' after 'q'
|
|
|
|
|
# is put into upper case.
|
|
|
|
|
for i in range(1, len(word)):
|
|
|
|
|
if word[i - 1] == "q" and word[i] == "u":
|
|
|
|
|
word = "".join((word[:i], "U", word[i + 1 :]))
|
|
|
|
|
|
|
|
|
|
# Every occurrence of 'u' and 'i'
|
|
|
|
|
# between vowels is put into upper case.
|
|
|
|
|
for i in range(1, len(word) - 1):
|
|
|
|
|
if word[i - 1] in self.__vowels and word[i + 1] in self.__vowels:
|
|
|
|
|
if word[i] == "u":
|
|
|
|
|
word = "".join((word[:i], "U", word[i + 1 :]))
|
|
|
|
|
|
|
|
|
|
elif word[i] == "i":
|
|
|
|
|
word = "".join((word[:i], "I", word[i + 1 :]))
|
|
|
|
|
|
|
|
|
|
r1, r2 = self._r1r2_standard(word, self.__vowels)
|
|
|
|
|
rv = self._rv_standard(word, self.__vowels)
|
|
|
|
|
|
|
|
|
|
# STEP 0: Attached pronoun
|
|
|
|
|
for suffix in self.__step0_suffixes:
|
|
|
|
|
if rv.endswith(suffix):
|
|
|
|
|
if rv[-len(suffix) - 4 : -len(suffix)] in ("ando", "endo"):
|
|
|
|
|
word = word[: -len(suffix)]
|
|
|
|
|
r1 = r1[: -len(suffix)]
|
|
|
|
|
r2 = r2[: -len(suffix)]
|
|
|
|
|
rv = rv[: -len(suffix)]
|
|
|
|
|
|
|
|
|
|
elif rv[-len(suffix) - 2 : -len(suffix)] in ("ar", "er", "ir"):
|
|
|
|
|
word = suffix_replace(word, suffix, "e")
|
|
|
|
|
r1 = suffix_replace(r1, suffix, "e")
|
|
|
|
|
r2 = suffix_replace(r2, suffix, "e")
|
|
|
|
|
rv = suffix_replace(rv, suffix, "e")
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
# STEP 1: Standard suffix removal
|
|
|
|
|
for suffix in self.__step1_suffixes:
|
|
|
|
|
if word.endswith(suffix):
|
|
|
|
|
if suffix == "amente" and r1.endswith(suffix):
|
|
|
|
|
step1_success = True
|
|
|
|
|
word = word[:-6]
|
|
|
|
|
r2 = r2[:-6]
|
|
|
|
|
rv = rv[:-6]
|
|
|
|
|
|
|
|
|
|
if r2.endswith("iv"):
|
|
|
|
|
word = word[:-2]
|
|
|
|
|
r2 = r2[:-2]
|
|
|
|
|
rv = rv[:-2]
|
|
|
|
|
|
|
|
|
|
if r2.endswith("at"):
|
|
|
|
|
word = word[:-2]
|
|
|
|
|
rv = rv[:-2]
|
|
|
|
|
|
|
|
|
|
elif r2.endswith(("os", "ic")):
|
|
|
|
|
word = word[:-2]
|
|
|
|
|
rv = rv[:-2]
|
|
|
|
|
|
|
|
|
|
elif r2.endswith("abil"):
|
|
|
|
|
word = word[:-4]
|
|
|
|
|
rv = rv[:-4]
|
|
|
|
|
|
|
|
|
|
elif suffix in ("amento", "amenti", "imento", "imenti") and rv.endswith(
|
|
|
|
|
suffix
|
|
|
|
|
):
|
|
|
|
|
step1_success = True
|
|
|
|
|
word = word[:-6]
|
|
|
|
|
rv = rv[:-6]
|
|
|
|
|
|
|
|
|
|
elif r2.endswith(suffix):
|
|
|
|
|
step1_success = True
|
|
|
|
|
if suffix in ("azione", "azioni", "atore", "atori"):
|
|
|
|
|
word = word[: -len(suffix)]
|
|
|
|
|
r2 = r2[: -len(suffix)]
|
|
|
|
|
rv = rv[: -len(suffix)]
|
|
|
|
|
|
|
|
|
|
if r2.endswith("ic"):
|
|
|
|
|
word = word[:-2]
|
|
|
|
|
rv = rv[:-2]
|
|
|
|
|
|
|
|
|
|
elif suffix in ("logia", "logie"):
|
|
|
|
|
word = word[:-2]
|
|
|
|
|
rv = word[:-2]
|
|
|
|
|
|
|
|
|
|
elif suffix in ("uzione", "uzioni", "usione", "usioni"):
|
|
|
|
|
word = word[:-5]
|
|
|
|
|
rv = rv[:-5]
|
|
|
|
|
|
|
|
|
|
elif suffix in ("enza", "enze"):
|
|
|
|
|
word = suffix_replace(word, suffix, "te")
|
|
|
|
|
rv = suffix_replace(rv, suffix, "te")
|
|
|
|
|
|
|
|
|
|
elif suffix == "it\xE0":
|
|
|
|
|
word = word[:-3]
|
|
|
|
|
r2 = r2[:-3]
|
|
|
|
|
rv = rv[:-3]
|
|
|
|
|
|
|
|
|
|
if r2.endswith(("ic", "iv")):
|
|
|
|
|
word = word[:-2]
|
|
|
|
|
rv = rv[:-2]
|
|
|
|
|
|
|
|
|
|
elif r2.endswith("abil"):
|
|
|
|
|
word = word[:-4]
|
|
|
|
|
rv = rv[:-4]
|
|
|
|
|
|
|
|
|
|
elif suffix in ("ivo", "ivi", "iva", "ive"):
|
|
|
|
|
word = word[:-3]
|
|
|
|
|
r2 = r2[:-3]
|
|
|
|
|
rv = rv[:-3]
|
|
|
|
|
|
|
|
|
|
if r2.endswith("at"):
|
|
|
|
|
word = word[:-2]
|
|
|
|
|
r2 = r2[:-2]
|
|
|
|
|
rv = rv[:-2]
|
|
|
|
|
|
|
|
|
|
if r2.endswith("ic"):
|
|
|
|
|
word = word[:-2]
|
|
|
|
|
rv = rv[:-2]
|
|
|
|
|
else:
|
|
|
|
|
word = word[: -len(suffix)]
|
|
|
|
|
rv = rv[: -len(suffix)]
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
# STEP 2: Verb suffixes
|
|
|
|
|
if not step1_success:
|
|
|
|
|
for suffix in self.__step2_suffixes:
|
|
|
|
|
if rv.endswith(suffix):
|
|
|
|
|
word = word[: -len(suffix)]
|
|
|
|
|
rv = rv[: -len(suffix)]
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
# STEP 3a
|
|
|
|
|
if rv.endswith(("a", "e", "i", "o", "\xE0", "\xE8", "\xEC", "\xF2")):
|
|
|
|
|
word = word[:-1]
|
|
|
|
|
rv = rv[:-1]
|
|
|
|
|
|
|
|
|
|
if rv.endswith("i"):
|
|
|
|
|
word = word[:-1]
|
|
|
|
|
rv = rv[:-1]
|
|
|
|
|
|
|
|
|
|
# STEP 3b
|
|
|
|
|
if rv.endswith(("ch", "gh")):
|
|
|
|
|
word = word[:-1]
|
|
|
|
|
|
|
|
|
|
word = word.replace("I", "i").replace("U", "u")
|
|
|
|
|
|
|
|
|
|
return word
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class NorwegianStemmer(_ScandinavianStemmer):
|
|
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
The Norwegian Snowball stemmer.
|
|
|
|
|
|
|
|
|
|
:cvar __vowels: The Norwegian vowels.
|
|
|
|
|
:type __vowels: unicode
|
|
|
|
|
:cvar __s_ending: Letters that may directly appear before a word final 's'.
|
|
|
|
|
:type __s_ending: unicode
|
|
|
|
|
:cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm.
|
|
|
|
|
:type __step1_suffixes: tuple
|
|
|
|
|
:cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm.
|
|
|
|
|
:type __step2_suffixes: tuple
|
|
|
|
|
:cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm.
|
|
|
|
|
:type __step3_suffixes: tuple
|
|
|
|
|
:note: A detailed description of the Norwegian
|
|
|
|
|
stemming algorithm can be found under
|
|
|
|
|
http://snowball.tartarus.org/algorithms/norwegian/stemmer.html
|
|
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
__vowels = "aeiouy\xE6\xE5\xF8"
|
|
|
|
|
__s_ending = "bcdfghjlmnoprtvyz"
|
|
|
|
|
__step1_suffixes = (
|
|
|
|
|
"hetenes",
|
|
|
|
|
"hetene",
|
|
|
|
|
"hetens",
|
|
|
|
|
"heter",
|
|
|
|
|
"heten",
|
|
|
|
|
"endes",
|
|
|
|
|
"ande",
|
|
|
|
|
"ende",
|
|
|
|
|
"edes",
|
|
|
|
|
"enes",
|
|
|
|
|
"erte",
|
|
|
|
|
"ede",
|
|
|
|
|
"ane",
|
|
|
|
|
"ene",
|
|
|
|
|
"ens",
|
|
|
|
|
"ers",
|
|
|
|
|
"ets",
|
|
|
|
|
"het",
|
|
|
|
|
"ast",
|
|
|
|
|
"ert",
|
|
|
|
|
"en",
|
|
|
|
|
"ar",
|
|
|
|
|
"er",
|
|
|
|
|
"as",
|
|
|
|
|
"es",
|
|
|
|
|
"et",
|
|
|
|
|
"a",
|
|
|
|
|
"e",
|
|
|
|
|
"s",
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
__step2_suffixes = ("dt", "vt")
|
|
|
|
|
|
|
|
|
|
__step3_suffixes = (
|
|
|
|
|
"hetslov",
|
|
|
|
|
"eleg",
|
|
|
|
|
"elig",
|
|
|
|
|
"elov",
|
|
|
|
|
"slov",
|
|
|
|
|
"leg",
|
|
|
|
|
"eig",
|
|
|
|
|
"lig",
|
|
|
|
|
"els",
|
|
|
|
|
"lov",
|
|
|
|
|
"ig",
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
def stem(self, word):
|
|
|
|
|
"""
|
|
|
|
|
Stem a Norwegian word and return the stemmed form.
|
|
|
|
|
|
|
|
|
|
:param word: The word that is stemmed.
|
|
|
|
|
:type word: str or unicode
|
|
|
|
|
:return: The stemmed form.
|
|
|
|
|
:rtype: unicode
|
|
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
word = word.lower()
|
|
|
|
|
|
|
|
|
|
if word in self.stopwords:
|
|
|
|
|
return word
|
|
|
|
|
|
|
|
|
|
r1 = self._r1_scandinavian(word, self.__vowels)
|
|
|
|
|
|
|
|
|
|
# STEP 1
|
|
|
|
|
for suffix in self.__step1_suffixes:
|
|
|
|
|
if r1.endswith(suffix):
|
|
|
|
|
if suffix in ("erte", "ert"):
|
|
|
|
|
word = suffix_replace(word, suffix, "er")
|
|
|
|
|
r1 = suffix_replace(r1, suffix, "er")
|
|
|
|
|
|
|
|
|
|
elif suffix == "s":
|
|
|
|
|
if word[-2] in self.__s_ending or (
|
|
|
|
|
word[-2] == "k" and word[-3] not in self.__vowels
|
|
|
|
|
):
|
|
|
|
|
word = word[:-1]
|
|
|
|
|
r1 = r1[:-1]
|
|
|
|
|
else:
|
|
|
|
|
word = word[: -len(suffix)]
|
|
|
|
|
r1 = r1[: -len(suffix)]
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
# STEP 2
|
|
|
|
|
for suffix in self.__step2_suffixes:
|
|
|
|
|
if r1.endswith(suffix):
|
|
|
|
|
word = word[:-1]
|
|
|
|
|
r1 = r1[:-1]
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
# STEP 3
|
|
|
|
|
for suffix in self.__step3_suffixes:
|
|
|
|
|
if r1.endswith(suffix):
|
|
|
|
|
word = word[: -len(suffix)]
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
return word
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class PortugueseStemmer(_StandardStemmer):
|
|
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
The Portuguese Snowball stemmer.
|
|
|
|
|
|
|
|
|
|
:cvar __vowels: The Portuguese vowels.
|
|
|
|
|
:type __vowels: unicode
|
|
|
|
|
:cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm.
|
|
|
|
|
:type __step1_suffixes: tuple
|
|
|
|
|
:cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm.
|
|
|
|
|
:type __step2_suffixes: tuple
|
|
|
|
|
:cvar __step4_suffixes: Suffixes to be deleted in step 4 of the algorithm.
|
|
|
|
|
:type __step4_suffixes: tuple
|
|
|
|
|
:note: A detailed description of the Portuguese
|
|
|
|
|
stemming algorithm can be found under
|
|
|
|
|
http://snowball.tartarus.org/algorithms/portuguese/stemmer.html
|
|
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
__vowels = "aeiou\xE1\xE9\xED\xF3\xFA\xE2\xEA\xF4"
|
|
|
|
|
__step1_suffixes = (
|
|
|
|
|
"amentos",
|
|
|
|
|
"imentos",
|
|
|
|
|
"uço~es",
|
|
|
|
|
"amento",
|
|
|
|
|
"imento",
|
|
|
|
|
"adoras",
|
|
|
|
|
"adores",
|
|
|
|
|
"a\xE7o~es",
|
|
|
|
|
"logias",
|
|
|
|
|
"\xEAncias",
|
|
|
|
|
"amente",
|
|
|
|
|
"idades",
|
|
|
|
|
"an\xE7as",
|
|
|
|
|
"ismos",
|
|
|
|
|
"istas",
|
|
|
|
|
"adora",
|
|
|
|
|
"a\xE7a~o",
|
|
|
|
|
"antes",
|
|
|
|
|
"\xE2ncia",
|
|
|
|
|
"logia",
|
|
|
|
|
"uça~o",
|
|
|
|
|
"\xEAncia",
|
|
|
|
|
"mente",
|
|
|
|
|
"idade",
|
|
|
|
|
"an\xE7a",
|
|
|
|
|
"ezas",
|
|
|
|
|
"icos",
|
|
|
|
|
"icas",
|
|
|
|
|
"ismo",
|
|
|
|
|
"\xE1vel",
|
|
|
|
|
"\xEDvel",
|
|
|
|
|
"ista",
|
|
|
|
|
"osos",
|
|
|
|
|
"osas",
|
|
|
|
|
"ador",
|
|
|
|
|
"ante",
|
|
|
|
|
"ivas",
|
|
|
|
|
"ivos",
|
|
|
|
|
"iras",
|
|
|
|
|
"eza",
|
|
|
|
|
"ico",
|
|
|
|
|
"ica",
|
|
|
|
|
"oso",
|
|
|
|
|
"osa",
|
|
|
|
|
"iva",
|
|
|
|
|
"ivo",
|
|
|
|
|
"ira",
|
|
|
|
|
)
|
|
|
|
|
__step2_suffixes = (
|
|
|
|
|
"ar\xEDamos",
|
|
|
|
|
"er\xEDamos",
|
|
|
|
|
"ir\xEDamos",
|
|
|
|
|
"\xE1ssemos",
|
|
|
|
|
"\xEAssemos",
|
|
|
|
|
"\xEDssemos",
|
|
|
|
|
"ar\xEDeis",
|
|
|
|
|
"er\xEDeis",
|
|
|
|
|
"ir\xEDeis",
|
|
|
|
|
"\xE1sseis",
|
|
|
|
|
"\xE9sseis",
|
|
|
|
|
"\xEDsseis",
|
|
|
|
|
"\xE1ramos",
|
|
|
|
|
"\xE9ramos",
|
|
|
|
|
"\xEDramos",
|
|
|
|
|
"\xE1vamos",
|
|
|
|
|
"aremos",
|
|
|
|
|
"eremos",
|
|
|
|
|
"iremos",
|
|
|
|
|
"ariam",
|
|
|
|
|
"eriam",
|
|
|
|
|
"iriam",
|
|
|
|
|
"assem",
|
|
|
|
|
"essem",
|
|
|
|
|
"issem",
|
|
|
|
|
"ara~o",
|
|
|
|
|
"era~o",
|
|
|
|
|
"ira~o",
|
|
|
|
|
"arias",
|
|
|
|
|
"erias",
|
|
|
|
|
"irias",
|
|
|
|
|
"ardes",
|
|
|
|
|
"erdes",
|
|
|
|
|
"irdes",
|
|
|
|
|
"asses",
|
|
|
|
|
"esses",
|
|
|
|
|
"isses",
|
|
|
|
|
"astes",
|
|
|
|
|
"estes",
|
|
|
|
|
"istes",
|
|
|
|
|
"\xE1reis",
|
|
|
|
|
"areis",
|
|
|
|
|
"\xE9reis",
|
|
|
|
|
"ereis",
|
|
|
|
|
"\xEDreis",
|
|
|
|
|
"ireis",
|
|
|
|
|
"\xE1veis",
|
|
|
|
|
"\xEDamos",
|
|
|
|
|
"armos",
|
|
|
|
|
"ermos",
|
|
|
|
|
"irmos",
|
|
|
|
|
"aria",
|
|
|
|
|
"eria",
|
|
|
|
|
"iria",
|
|
|
|
|
"asse",
|
|
|
|
|
"esse",
|
|
|
|
|
"isse",
|
|
|
|
|
"aste",
|
|
|
|
|
"este",
|
|
|
|
|
"iste",
|
|
|
|
|
"arei",
|
|
|
|
|
"erei",
|
|
|
|
|
"irei",
|
|
|
|
|
"aram",
|
|
|
|
|
"eram",
|
|
|
|
|
"iram",
|
|
|
|
|
"avam",
|
|
|
|
|
"arem",
|
|
|
|
|
"erem",
|
|
|
|
|
"irem",
|
|
|
|
|
"ando",
|
|
|
|
|
"endo",
|
|
|
|
|
"indo",
|
|
|
|
|
"adas",
|
|
|
|
|
"idas",
|
|
|
|
|
"ar\xE1s",
|
|
|
|
|
"aras",
|
|
|
|
|
"er\xE1s",
|
|
|
|
|
"eras",
|
|
|
|
|
"ir\xE1s",
|
|
|
|
|
"avas",
|
|
|
|
|
"ares",
|
|
|
|
|
"eres",
|
|
|
|
|
"ires",
|
|
|
|
|
"\xEDeis",
|
|
|
|
|
"ados",
|
|
|
|
|
"idos",
|
|
|
|
|
"\xE1mos",
|
|
|
|
|
"amos",
|
|
|
|
|
"emos",
|
|
|
|
|
"imos",
|
|
|
|
|
"iras",
|
|
|
|
|
"ada",
|
|
|
|
|
"ida",
|
|
|
|
|
"ar\xE1",
|
|
|
|
|
"ara",
|
|
|
|
|
"er\xE1",
|
|
|
|
|
"era",
|
|
|
|
|
"ir\xE1",
|
|
|
|
|
"ava",
|
|
|
|
|
"iam",
|
|
|
|
|
"ado",
|
|
|
|
|
"ido",
|
|
|
|
|
"ias",
|
|
|
|
|
"ais",
|
|
|
|
|
"eis",
|
|
|
|
|
"ira",
|
|
|
|
|
"ia",
|
|
|
|
|
"ei",
|
|
|
|
|
"am",
|
|
|
|
|
"em",
|
|
|
|
|
"ar",
|
|
|
|
|
"er",
|
|
|
|
|
"ir",
|
|
|
|
|
"as",
|
|
|
|
|
"es",
|
|
|
|
|
"is",
|
|
|
|
|
"eu",
|
|
|
|
|
"iu",
|
|
|
|
|
"ou",
|
|
|
|
|
)
|
|
|
|
|
__step4_suffixes = ("os", "a", "i", "o", "\xE1", "\xED", "\xF3")
|
|
|
|
|
|
|
|
|
|
def stem(self, word):
|
|
|
|
|
"""
|
|
|
|
|
Stem a Portuguese word and return the stemmed form.
|
|
|
|
|
|
|
|
|
|
:param word: The word that is stemmed.
|
|
|
|
|
:type word: str or unicode
|
|
|
|
|
:return: The stemmed form.
|
|
|
|
|
:rtype: unicode
|
|
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
word = word.lower()
|
|
|
|
|
|
|
|
|
|
if word in self.stopwords:
|
|
|
|
|
return word
|
|
|
|
|
|
|
|
|
|
step1_success = False
|
|
|
|
|
step2_success = False
|
|
|
|
|
|
|
|
|
|
word = (
|
|
|
|
|
word.replace("\xE3", "a~")
|
|
|
|
|
.replace("\xF5", "o~")
|
|
|
|
|
.replace("q\xFC", "qu")
|
|
|
|
|
.replace("g\xFC", "gu")
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
r1, r2 = self._r1r2_standard(word, self.__vowels)
|
|
|
|
|
rv = self._rv_standard(word, self.__vowels)
|
|
|
|
|
|
|
|
|
|
# STEP 1: Standard suffix removal
|
|
|
|
|
for suffix in self.__step1_suffixes:
|
|
|
|
|
if word.endswith(suffix):
|
|
|
|
|
if suffix == "amente" and r1.endswith(suffix):
|
|
|
|
|
step1_success = True
|
|
|
|
|
|
|
|
|
|
word = word[:-6]
|
|
|
|
|
r2 = r2[:-6]
|
|
|
|
|
rv = rv[:-6]
|
|
|
|
|
|
|
|
|
|
if r2.endswith("iv"):
|
|
|
|
|
word = word[:-2]
|
|
|
|
|
r2 = r2[:-2]
|
|
|
|
|
rv = rv[:-2]
|
|
|
|
|
|
|
|
|
|
if r2.endswith("at"):
|
|
|
|
|
word = word[:-2]
|
|
|
|
|
rv = rv[:-2]
|
|
|
|
|
|
|
|
|
|
elif r2.endswith(("os", "ic", "ad")):
|
|
|
|
|
word = word[:-2]
|
|
|
|
|
rv = rv[:-2]
|
|
|
|
|
|
|
|
|
|
elif (
|
|
|
|
|
suffix in ("ira", "iras")
|
|
|
|
|
and rv.endswith(suffix)
|
|
|
|
|
and word[-len(suffix) - 1 : -len(suffix)] == "e"
|
|
|
|
|
):
|
|
|
|
|
step1_success = True
|
|
|
|
|
|
|
|
|
|
word = suffix_replace(word, suffix, "ir")
|
|
|
|
|
rv = suffix_replace(rv, suffix, "ir")
|
|
|
|
|
|
|
|
|
|
elif r2.endswith(suffix):
|
|
|
|
|
step1_success = True
|
|
|
|
|
|
|
|
|
|
if suffix in ("logia", "logias"):
|
|
|
|
|
word = suffix_replace(word, suffix, "log")
|
|
|
|
|
rv = suffix_replace(rv, suffix, "log")
|
|
|
|
|
|
|
|
|
|
elif suffix in ("uça~o", "uço~es"):
|
|
|
|
|
word = suffix_replace(word, suffix, "u")
|
|
|
|
|
rv = suffix_replace(rv, suffix, "u")
|
|
|
|
|
|
|
|
|
|
elif suffix in ("\xEAncia", "\xEAncias"):
|
|
|
|
|
word = suffix_replace(word, suffix, "ente")
|
|
|
|
|
rv = suffix_replace(rv, suffix, "ente")
|
|
|
|
|
|
|
|
|
|
elif suffix == "mente":
|
|
|
|
|
word = word[:-5]
|
|
|
|
|
r2 = r2[:-5]
|
|
|
|
|
rv = rv[:-5]
|
|
|
|
|
|
|
|
|
|
if r2.endswith(("ante", "avel", "ivel")):
|
|
|
|
|
word = word[:-4]
|
|
|
|
|
rv = rv[:-4]
|
|
|
|
|
|
|
|
|
|
elif suffix in ("idade", "idades"):
|
|
|
|
|
word = word[: -len(suffix)]
|
|
|
|
|
r2 = r2[: -len(suffix)]
|
|
|
|
|
rv = rv[: -len(suffix)]
|
|
|
|
|
|
|
|
|
|
if r2.endswith(("ic", "iv")):
|
|
|
|
|
word = word[:-2]
|
|
|
|
|
rv = rv[:-2]
|
|
|
|
|
|
|
|
|
|
elif r2.endswith("abil"):
|
|
|
|
|
word = word[:-4]
|
|
|
|
|
rv = rv[:-4]
|
|
|
|
|
|
|
|
|
|
elif suffix in ("iva", "ivo", "ivas", "ivos"):
|
|
|
|
|
word = word[: -len(suffix)]
|
|
|
|
|
r2 = r2[: -len(suffix)]
|
|
|
|
|
rv = rv[: -len(suffix)]
|
|
|
|
|
|
|
|
|
|
if r2.endswith("at"):
|
|
|
|
|
word = word[:-2]
|
|
|
|
|
rv = rv[:-2]
|
|
|
|
|
else:
|
|
|
|
|
word = word[: -len(suffix)]
|
|
|
|
|
rv = rv[: -len(suffix)]
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
# STEP 2: Verb suffixes
|
|
|
|
|
if not step1_success:
|
|
|
|
|
for suffix in self.__step2_suffixes:
|
|
|
|
|
if rv.endswith(suffix):
|
|
|
|
|
step2_success = True
|
|
|
|
|
|
|
|
|
|
word = word[: -len(suffix)]
|
|
|
|
|
rv = rv[: -len(suffix)]
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
# STEP 3
|
|
|
|
|
if step1_success or step2_success:
|
|
|
|
|
if rv.endswith("i") and word[-2] == "c":
|
|
|
|
|
word = word[:-1]
|
|
|
|
|
rv = rv[:-1]
|
|
|
|
|
|
|
|
|
|
### STEP 4: Residual suffix
|
|
|
|
|
if not step1_success and not step2_success:
|
|
|
|
|
for suffix in self.__step4_suffixes:
|
|
|
|
|
if rv.endswith(suffix):
|
|
|
|
|
word = word[: -len(suffix)]
|
|
|
|
|
rv = rv[: -len(suffix)]
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
# STEP 5
|
|
|
|
|
if rv.endswith(("e", "\xE9", "\xEA")):
|
|
|
|
|
word = word[:-1]
|
|
|
|
|
rv = rv[:-1]
|
|
|
|
|
|
|
|
|
|
if (word.endswith("gu") and rv.endswith("u")) or (
|
|
|
|
|
word.endswith("ci") and rv.endswith("i")
|
|
|
|
|
):
|
|
|
|
|
word = word[:-1]
|
|
|
|
|
|
|
|
|
|
elif word.endswith("\xE7"):
|
|
|
|
|
word = suffix_replace(word, "\xE7", "c")
|
|
|
|
|
|
|
|
|
|
word = word.replace("a~", "\xE3").replace("o~", "\xF5")
|
|
|
|
|
|
|
|
|
|
return word
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class RomanianStemmer(_StandardStemmer):
|
|
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
The Romanian Snowball stemmer.
|
|
|
|
|
|
|
|
|
|
:cvar __vowels: The Romanian vowels.
|
|
|
|
|
:type __vowels: unicode
|
|
|
|
|
:cvar __step0_suffixes: Suffixes to be deleted in step 0 of the algorithm.
|
|
|
|
|
:type __step0_suffixes: tuple
|
|
|
|
|
:cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm.
|
|
|
|
|
:type __step1_suffixes: tuple
|
|
|
|
|
:cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm.
|
|
|
|
|
:type __step2_suffixes: tuple
|
|
|
|
|
:cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm.
|
|
|
|
|
:type __step3_suffixes: tuple
|
|
|
|
|
:note: A detailed description of the Romanian
|
|
|
|
|
stemming algorithm can be found under
|
|
|
|
|
http://snowball.tartarus.org/algorithms/romanian/stemmer.html
|
|
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
__vowels = "aeiou\u0103\xE2\xEE"
|
|
|
|
|
__step0_suffixes = (
|
|
|
|
|
"iilor",
|
|
|
|
|
"ului",
|
|
|
|
|
"elor",
|
|
|
|
|
"iile",
|
|
|
|
|
"ilor",
|
|
|
|
|
"atei",
|
|
|
|
|
"a\u0163ie",
|
|
|
|
|
"a\u0163ia",
|
|
|
|
|
"aua",
|
|
|
|
|
"ele",
|
|
|
|
|
"iua",
|
|
|
|
|
"iei",
|
|
|
|
|
"ile",
|
|
|
|
|
"ul",
|
|
|
|
|
"ea",
|
|
|
|
|
"ii",
|
|
|
|
|
)
|
|
|
|
|
__step1_suffixes = (
|
|
|
|
|
"abilitate",
|
|
|
|
|
"abilitati",
|
|
|
|
|
"abilit\u0103\u0163i",
|
|
|
|
|
"ibilitate",
|
|
|
|
|
"abilit\u0103i",
|
|
|
|
|
"ivitate",
|
|
|
|
|
"ivitati",
|
|
|
|
|
"ivit\u0103\u0163i",
|
|
|
|
|
"icitate",
|
|
|
|
|
"icitati",
|
|
|
|
|
"icit\u0103\u0163i",
|
|
|
|
|
"icatori",
|
|
|
|
|
"ivit\u0103i",
|
|
|
|
|
"icit\u0103i",
|
|
|
|
|
"icator",
|
|
|
|
|
"a\u0163iune",
|
|
|
|
|
"atoare",
|
|
|
|
|
"\u0103toare",
|
|
|
|
|
"i\u0163iune",
|
|
|
|
|
"itoare",
|
|
|
|
|
"iciva",
|
|
|
|
|
"icive",
|
|
|
|
|
"icivi",
|
|
|
|
|
"iciv\u0103",
|
|
|
|
|
"icala",
|
|
|
|
|
"icale",
|
|
|
|
|
"icali",
|
|
|
|
|
"ical\u0103",
|
|
|
|
|
"ativa",
|
|
|
|
|
"ative",
|
|
|
|
|
"ativi",
|
|
|
|
|
"ativ\u0103",
|
|
|
|
|
"atori",
|
|
|
|
|
"\u0103tori",
|
|
|
|
|
"itiva",
|
|
|
|
|
"itive",
|
|
|
|
|
"itivi",
|
|
|
|
|
"itiv\u0103",
|
|
|
|
|
"itori",
|
|
|
|
|
"iciv",
|
|
|
|
|
"ical",
|
|
|
|
|
"ativ",
|
|
|
|
|
"ator",
|
|
|
|
|
"\u0103tor",
|
|
|
|
|
"itiv",
|
|
|
|
|
"itor",
|
|
|
|
|
)
|
|
|
|
|
__step2_suffixes = (
|
|
|
|
|
"abila",
|
|
|
|
|
"abile",
|
|
|
|
|
"abili",
|
|
|
|
|
"abil\u0103",
|
|
|
|
|
"ibila",
|
|
|
|
|
"ibile",
|
|
|
|
|
"ibili",
|
|
|
|
|
"ibil\u0103",
|
|
|
|
|
"atori",
|
|
|
|
|
"itate",
|
|
|
|
|
"itati",
|
|
|
|
|
"it\u0103\u0163i",
|
|
|
|
|
"abil",
|
|
|
|
|
"ibil",
|
|
|
|
|
"oasa",
|
|
|
|
|
"oas\u0103",
|
|
|
|
|
"oase",
|
|
|
|
|
"anta",
|
|
|
|
|
"ante",
|
|
|
|
|
"anti",
|
|
|
|
|
"ant\u0103",
|
|
|
|
|
"ator",
|
|
|
|
|
"it\u0103i",
|
|
|
|
|
"iune",
|
|
|
|
|
"iuni",
|
|
|
|
|
"isme",
|
|
|
|
|
"ista",
|
|
|
|
|
"iste",
|
|
|
|
|
"isti",
|
|
|
|
|
"ist\u0103",
|
|
|
|
|
"i\u015Fti",
|
|
|
|
|
"ata",
|
|
|
|
|
"at\u0103",
|
|
|
|
|
"ati",
|
|
|
|
|
"ate",
|
|
|
|
|
"uta",
|
|
|
|
|
"ut\u0103",
|
|
|
|
|
"uti",
|
|
|
|
|
"ute",
|
|
|
|
|
"ita",
|
|
|
|
|
"it\u0103",
|
|
|
|
|
"iti",
|
|
|
|
|
"ite",
|
|
|
|
|
"ica",
|
|
|
|
|
"ice",
|
|
|
|
|
"ici",
|
|
|
|
|
"ic\u0103",
|
|
|
|
|
"osi",
|
|
|
|
|
"o\u015Fi",
|
|
|
|
|
"ant",
|
|
|
|
|
"iva",
|
|
|
|
|
"ive",
|
|
|
|
|
"ivi",
|
|
|
|
|
"iv\u0103",
|
|
|
|
|
"ism",
|
|
|
|
|
"ist",
|
|
|
|
|
"at",
|
|
|
|
|
"ut",
|
|
|
|
|
"it",
|
|
|
|
|
"ic",
|
|
|
|
|
"os",
|
|
|
|
|
"iv",
|
|
|
|
|
)
|
|
|
|
|
__step3_suffixes = (
|
|
|
|
|
"seser\u0103\u0163i",
|
|
|
|
|
"aser\u0103\u0163i",
|
|
|
|
|
"iser\u0103\u0163i",
|
|
|
|
|
"\xE2ser\u0103\u0163i",
|
|
|
|
|
"user\u0103\u0163i",
|
|
|
|
|
"seser\u0103m",
|
|
|
|
|
"aser\u0103m",
|
|
|
|
|
"iser\u0103m",
|
|
|
|
|
"\xE2ser\u0103m",
|
|
|
|
|
"user\u0103m",
|
|
|
|
|
"ser\u0103\u0163i",
|
|
|
|
|
"sese\u015Fi",
|
|
|
|
|
"seser\u0103",
|
|
|
|
|
"easc\u0103",
|
|
|
|
|
"ar\u0103\u0163i",
|
|
|
|
|
"ur\u0103\u0163i",
|
|
|
|
|
"ir\u0103\u0163i",
|
|
|
|
|
"\xE2r\u0103\u0163i",
|
|
|
|
|
"ase\u015Fi",
|
|
|
|
|
"aser\u0103",
|
|
|
|
|
"ise\u015Fi",
|
|
|
|
|
"iser\u0103",
|
|
|
|
|
"\xe2se\u015Fi",
|
|
|
|
|
"\xE2ser\u0103",
|
|
|
|
|
"use\u015Fi",
|
|
|
|
|
"user\u0103",
|
|
|
|
|
"ser\u0103m",
|
|
|
|
|
"sesem",
|
|
|
|
|
"indu",
|
|
|
|
|
"\xE2ndu",
|
|
|
|
|
"eaz\u0103",
|
|
|
|
|
"e\u015Fti",
|
|
|
|
|
"e\u015Fte",
|
|
|
|
|
"\u0103\u015Fti",
|
|
|
|
|
"\u0103\u015Fte",
|
|
|
|
|
"ea\u0163i",
|
|
|
|
|
"ia\u0163i",
|
|
|
|
|
"ar\u0103m",
|
|
|
|
|
"ur\u0103m",
|
|
|
|
|
"ir\u0103m",
|
|
|
|
|
"\xE2r\u0103m",
|
|
|
|
|
"asem",
|
|
|
|
|
"isem",
|
|
|
|
|
"\xE2sem",
|
|
|
|
|
"usem",
|
|
|
|
|
"se\u015Fi",
|
|
|
|
|
"ser\u0103",
|
|
|
|
|
"sese",
|
|
|
|
|
"are",
|
|
|
|
|
"ere",
|
|
|
|
|
"ire",
|
|
|
|
|
"\xE2re",
|
|
|
|
|
"ind",
|
|
|
|
|
"\xE2nd",
|
|
|
|
|
"eze",
|
|
|
|
|
"ezi",
|
|
|
|
|
"esc",
|
|
|
|
|
"\u0103sc",
|
|
|
|
|
"eam",
|
|
|
|
|
"eai",
|
|
|
|
|
"eau",
|
|
|
|
|
"iam",
|
|
|
|
|
"iai",
|
|
|
|
|
"iau",
|
|
|
|
|
"a\u015Fi",
|
|
|
|
|
"ar\u0103",
|
|
|
|
|
"u\u015Fi",
|
|
|
|
|
"ur\u0103",
|
|
|
|
|
"i\u015Fi",
|
|
|
|
|
"ir\u0103",
|
|
|
|
|
"\xE2\u015Fi",
|
|
|
|
|
"\xe2r\u0103",
|
|
|
|
|
"ase",
|
|
|
|
|
"ise",
|
|
|
|
|
"\xE2se",
|
|
|
|
|
"use",
|
|
|
|
|
"a\u0163i",
|
|
|
|
|
"e\u0163i",
|
|
|
|
|
"i\u0163i",
|
|
|
|
|
"\xe2\u0163i",
|
|
|
|
|
"sei",
|
|
|
|
|
"ez",
|
|
|
|
|
"am",
|
|
|
|
|
"ai",
|
|
|
|
|
"au",
|
|
|
|
|
"ea",
|
|
|
|
|
"ia",
|
|
|
|
|
"ui",
|
|
|
|
|
"\xE2i",
|
|
|
|
|
"\u0103m",
|
|
|
|
|
"em",
|
|
|
|
|
"im",
|
|
|
|
|
"\xE2m",
|
|
|
|
|
"se",
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
def stem(self, word):
|
|
|
|
|
"""
|
|
|
|
|
Stem a Romanian word and return the stemmed form.
|
|
|
|
|
|
|
|
|
|
:param word: The word that is stemmed.
|
|
|
|
|
:type word: str or unicode
|
|
|
|
|
:return: The stemmed form.
|
|
|
|
|
:rtype: unicode
|
|
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
word = word.lower()
|
|
|
|
|
|
|
|
|
|
if word in self.stopwords:
|
|
|
|
|
return word
|
|
|
|
|
|
|
|
|
|
step1_success = False
|
|
|
|
|
step2_success = False
|
|
|
|
|
|
|
|
|
|
for i in range(1, len(word) - 1):
|
|
|
|
|
if word[i - 1] in self.__vowels and word[i + 1] in self.__vowels:
|
|
|
|
|
if word[i] == "u":
|
|
|
|
|
word = "".join((word[:i], "U", word[i + 1 :]))
|
|
|
|
|
|
|
|
|
|
elif word[i] == "i":
|
|
|
|
|
word = "".join((word[:i], "I", word[i + 1 :]))
|
|
|
|
|
|
|
|
|
|
r1, r2 = self._r1r2_standard(word, self.__vowels)
|
|
|
|
|
rv = self._rv_standard(word, self.__vowels)
|
|
|
|
|
|
|
|
|
|
# STEP 0: Removal of plurals and other simplifications
|
|
|
|
|
for suffix in self.__step0_suffixes:
|
|
|
|
|
if word.endswith(suffix):
|
|
|
|
|
if suffix in r1:
|
|
|
|
|
if suffix in ("ul", "ului"):
|
|
|
|
|
word = word[: -len(suffix)]
|
|
|
|
|
|
|
|
|
|
if suffix in rv:
|
|
|
|
|
rv = rv[: -len(suffix)]
|
|
|
|
|
else:
|
|
|
|
|
rv = ""
|
|
|
|
|
|
|
|
|
|
elif (
|
|
|
|
|
suffix == "aua"
|
|
|
|
|
or suffix == "atei"
|
|
|
|
|
or (suffix == "ile" and word[-5:-3] != "ab")
|
|
|
|
|
):
|
|
|
|
|
word = word[:-2]
|
|
|
|
|
|
|
|
|
|
elif suffix in ("ea", "ele", "elor"):
|
|
|
|
|
word = suffix_replace(word, suffix, "e")
|
|
|
|
|
|
|
|
|
|
if suffix in rv:
|
|
|
|
|
rv = suffix_replace(rv, suffix, "e")
|
|
|
|
|
else:
|
|
|
|
|
rv = ""
|
|
|
|
|
|
|
|
|
|
elif suffix in ("ii", "iua", "iei", "iile", "iilor", "ilor"):
|
|
|
|
|
word = suffix_replace(word, suffix, "i")
|
|
|
|
|
|
|
|
|
|
if suffix in rv:
|
|
|
|
|
rv = suffix_replace(rv, suffix, "i")
|
|
|
|
|
else:
|
|
|
|
|
rv = ""
|
|
|
|
|
|
|
|
|
|
elif suffix in ("a\u0163ie", "a\u0163ia"):
|
|
|
|
|
word = word[:-1]
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
# STEP 1: Reduction of combining suffixes
|
|
|
|
|
while True:
|
|
|
|
|
|
|
|
|
|
replacement_done = False
|
|
|
|
|
|
|
|
|
|
for suffix in self.__step1_suffixes:
|
|
|
|
|
if word.endswith(suffix):
|
|
|
|
|
if suffix in r1:
|
|
|
|
|
step1_success = True
|
|
|
|
|
replacement_done = True
|
|
|
|
|
|
|
|
|
|
if suffix in (
|
|
|
|
|
"abilitate",
|
|
|
|
|
"abilitati",
|
|
|
|
|
"abilit\u0103i",
|
|
|
|
|
"abilit\u0103\u0163i",
|
|
|
|
|
):
|
|
|
|
|
word = suffix_replace(word, suffix, "abil")
|
|
|
|
|
|
|
|
|
|
elif suffix == "ibilitate":
|
|
|
|
|
word = word[:-5]
|
|
|
|
|
|
|
|
|
|
elif suffix in (
|
|
|
|
|
"ivitate",
|
|
|
|
|
"ivitati",
|
|
|
|
|
"ivit\u0103i",
|
|
|
|
|
"ivit\u0103\u0163i",
|
|
|
|
|
):
|
|
|
|
|
word = suffix_replace(word, suffix, "iv")
|
|
|
|
|
|
|
|
|
|
elif suffix in (
|
|
|
|
|
"icitate",
|
|
|
|
|
"icitati",
|
|
|
|
|
"icit\u0103i",
|
|
|
|
|
"icit\u0103\u0163i",
|
|
|
|
|
"icator",
|
|
|
|
|
"icatori",
|
|
|
|
|
"iciv",
|
|
|
|
|
"iciva",
|
|
|
|
|
"icive",
|
|
|
|
|
"icivi",
|
|
|
|
|
"iciv\u0103",
|
|
|
|
|
"ical",
|
|
|
|
|
"icala",
|
|
|
|
|
"icale",
|
|
|
|
|
"icali",
|
|
|
|
|
"ical\u0103",
|
|
|
|
|
):
|
|
|
|
|
word = suffix_replace(word, suffix, "ic")
|
|
|
|
|
|
|
|
|
|
elif suffix in (
|
|
|
|
|
"ativ",
|
|
|
|
|
"ativa",
|
|
|
|
|
"ative",
|
|
|
|
|
"ativi",
|
|
|
|
|
"ativ\u0103",
|
|
|
|
|
"a\u0163iune",
|
|
|
|
|
"atoare",
|
|
|
|
|
"ator",
|
|
|
|
|
"atori",
|
|
|
|
|
"\u0103toare",
|
|
|
|
|
"\u0103tor",
|
|
|
|
|
"\u0103tori",
|
|
|
|
|
):
|
|
|
|
|
word = suffix_replace(word, suffix, "at")
|
|
|
|
|
|
|
|
|
|
if suffix in r2:
|
|
|
|
|
r2 = suffix_replace(r2, suffix, "at")
|
|
|
|
|
|
|
|
|
|
elif suffix in (
|
|
|
|
|
"itiv",
|
|
|
|
|
"itiva",
|
|
|
|
|
"itive",
|
|
|
|
|
"itivi",
|
|
|
|
|
"itiv\u0103",
|
|
|
|
|
"i\u0163iune",
|
|
|
|
|
"itoare",
|
|
|
|
|
"itor",
|
|
|
|
|
"itori",
|
|
|
|
|
):
|
|
|
|
|
word = suffix_replace(word, suffix, "it")
|
|
|
|
|
|
|
|
|
|
if suffix in r2:
|
|
|
|
|
r2 = suffix_replace(r2, suffix, "it")
|
|
|
|
|
else:
|
|
|
|
|
step1_success = False
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
if not replacement_done:
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
# STEP 2: Removal of standard suffixes
|
|
|
|
|
for suffix in self.__step2_suffixes:
|
|
|
|
|
if word.endswith(suffix):
|
|
|
|
|
if suffix in r2:
|
|
|
|
|
step2_success = True
|
|
|
|
|
|
|
|
|
|
if suffix in ("iune", "iuni"):
|
|
|
|
|
if word[-5] == "\u0163":
|
|
|
|
|
word = "".join((word[:-5], "t"))
|
|
|
|
|
|
|
|
|
|
elif suffix in (
|
|
|
|
|
"ism",
|
|
|
|
|
"isme",
|
|
|
|
|
"ist",
|
|
|
|
|
"ista",
|
|
|
|
|
"iste",
|
|
|
|
|
"isti",
|
|
|
|
|
"ist\u0103",
|
|
|
|
|
"i\u015Fti",
|
|
|
|
|
):
|
|
|
|
|
word = suffix_replace(word, suffix, "ist")
|
|
|
|
|
|
|
|
|
|
else:
|
|
|
|
|
word = word[: -len(suffix)]
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
# STEP 3: Removal of verb suffixes
|
|
|
|
|
if not step1_success and not step2_success:
|
|
|
|
|
for suffix in self.__step3_suffixes:
|
|
|
|
|
if word.endswith(suffix):
|
|
|
|
|
if suffix in rv:
|
|
|
|
|
if suffix in (
|
|
|
|
|
"seser\u0103\u0163i",
|
|
|
|
|
"seser\u0103m",
|
|
|
|
|
"ser\u0103\u0163i",
|
|
|
|
|
"sese\u015Fi",
|
|
|
|
|
"seser\u0103",
|
|
|
|
|
"ser\u0103m",
|
|
|
|
|
"sesem",
|
|
|
|
|
"se\u015Fi",
|
|
|
|
|
"ser\u0103",
|
|
|
|
|
"sese",
|
|
|
|
|
"a\u0163i",
|
|
|
|
|
"e\u0163i",
|
|
|
|
|
"i\u0163i",
|
|
|
|
|
"\xE2\u0163i",
|
|
|
|
|
"sei",
|
|
|
|
|
"\u0103m",
|
|
|
|
|
"em",
|
|
|
|
|
"im",
|
|
|
|
|
"\xE2m",
|
|
|
|
|
"se",
|
|
|
|
|
):
|
|
|
|
|
word = word[: -len(suffix)]
|
|
|
|
|
rv = rv[: -len(suffix)]
|
|
|
|
|
else:
|
|
|
|
|
if (
|
|
|
|
|
not rv.startswith(suffix)
|
|
|
|
|
and rv[rv.index(suffix) - 1] not in "aeio\u0103\xE2\xEE"
|
|
|
|
|
):
|
|
|
|
|
word = word[: -len(suffix)]
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
# STEP 4: Removal of final vowel
|
|
|
|
|
for suffix in ("ie", "a", "e", "i", "\u0103"):
|
|
|
|
|
if word.endswith(suffix):
|
|
|
|
|
if suffix in rv:
|
|
|
|
|
word = word[: -len(suffix)]
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
word = word.replace("I", "i").replace("U", "u")
|
|
|
|
|
|
|
|
|
|
return word
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class RussianStemmer(_LanguageSpecificStemmer):
|
|
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
The Russian Snowball stemmer.
|
|
|
|
|
|
|
|
|
|
:cvar __perfective_gerund_suffixes: Suffixes to be deleted.
|
|
|
|
|
:type __perfective_gerund_suffixes: tuple
|
|
|
|
|
:cvar __adjectival_suffixes: Suffixes to be deleted.
|
|
|
|
|
:type __adjectival_suffixes: tuple
|
|
|
|
|
:cvar __reflexive_suffixes: Suffixes to be deleted.
|
|
|
|
|
:type __reflexive_suffixes: tuple
|
|
|
|
|
:cvar __verb_suffixes: Suffixes to be deleted.
|
|
|
|
|
:type __verb_suffixes: tuple
|
|
|
|
|
:cvar __noun_suffixes: Suffixes to be deleted.
|
|
|
|
|
:type __noun_suffixes: tuple
|
|
|
|
|
:cvar __superlative_suffixes: Suffixes to be deleted.
|
|
|
|
|
:type __superlative_suffixes: tuple
|
|
|
|
|
:cvar __derivational_suffixes: Suffixes to be deleted.
|
|
|
|
|
:type __derivational_suffixes: tuple
|
|
|
|
|
:note: A detailed description of the Russian
|
|
|
|
|
stemming algorithm can be found under
|
|
|
|
|
http://snowball.tartarus.org/algorithms/russian/stemmer.html
|
|
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
__perfective_gerund_suffixes = (
|
|
|
|
|
"ivshis'",
|
|
|
|
|
"yvshis'",
|
|
|
|
|
"vshis'",
|
|
|
|
|
"ivshi",
|
|
|
|
|
"yvshi",
|
|
|
|
|
"vshi",
|
|
|
|
|
"iv",
|
|
|
|
|
"yv",
|
|
|
|
|
"v",
|
|
|
|
|
)
|
|
|
|
|
__adjectival_suffixes = (
|
|
|
|
|
"ui^ushchi^ui^u",
|
|
|
|
|
"ui^ushchi^ai^a",
|
|
|
|
|
"ui^ushchimi",
|
|
|
|
|
"ui^ushchymi",
|
|
|
|
|
"ui^ushchego",
|
|
|
|
|
"ui^ushchogo",
|
|
|
|
|
"ui^ushchemu",
|
|
|
|
|
"ui^ushchomu",
|
|
|
|
|
"ui^ushchikh",
|
|
|
|
|
"ui^ushchykh",
|
|
|
|
|
"ui^ushchui^u",
|
|
|
|
|
"ui^ushchaia",
|
|
|
|
|
"ui^ushchoi^u",
|
|
|
|
|
"ui^ushchei^u",
|
|
|
|
|
"i^ushchi^ui^u",
|
|
|
|
|
"i^ushchi^ai^a",
|
|
|
|
|
"ui^ushchee",
|
|
|
|
|
"ui^ushchie",
|
|
|
|
|
"ui^ushchye",
|
|
|
|
|
"ui^ushchoe",
|
|
|
|
|
"ui^ushchei`",
|
|
|
|
|
"ui^ushchii`",
|
|
|
|
|
"ui^ushchyi`",
|
|
|
|
|
"ui^ushchoi`",
|
|
|
|
|
"ui^ushchem",
|
|
|
|
|
"ui^ushchim",
|
|
|
|
|
"ui^ushchym",
|
|
|
|
|
"ui^ushchom",
|
|
|
|
|
"i^ushchimi",
|
|
|
|
|
"i^ushchymi",
|
|
|
|
|
"i^ushchego",
|
|
|
|
|
"i^ushchogo",
|
|
|
|
|
"i^ushchemu",
|
|
|
|
|
"i^ushchomu",
|
|
|
|
|
"i^ushchikh",
|
|
|
|
|
"i^ushchykh",
|
|
|
|
|
"i^ushchui^u",
|
|
|
|
|
"i^ushchai^a",
|
|
|
|
|
"i^ushchoi^u",
|
|
|
|
|
"i^ushchei^u",
|
|
|
|
|
"i^ushchee",
|
|
|
|
|
"i^ushchie",
|
|
|
|
|
"i^ushchye",
|
|
|
|
|
"i^ushchoe",
|
|
|
|
|
"i^ushchei`",
|
|
|
|
|
"i^ushchii`",
|
|
|
|
|
"i^ushchyi`",
|
|
|
|
|
"i^ushchoi`",
|
|
|
|
|
"i^ushchem",
|
|
|
|
|
"i^ushchim",
|
|
|
|
|
"i^ushchym",
|
|
|
|
|
"i^ushchom",
|
|
|
|
|
"shchi^ui^u",
|
|
|
|
|
"shchi^ai^a",
|
|
|
|
|
"ivshi^ui^u",
|
|
|
|
|
"ivshi^ai^a",
|
|
|
|
|
"yvshi^ui^u",
|
|
|
|
|
"yvshi^ai^a",
|
|
|
|
|
"shchimi",
|
|
|
|
|
"shchymi",
|
|
|
|
|
"shchego",
|
|
|
|
|
"shchogo",
|
|
|
|
|
"shchemu",
|
|
|
|
|
"shchomu",
|
|
|
|
|
"shchikh",
|
|
|
|
|
"shchykh",
|
|
|
|
|
"shchui^u",
|
|
|
|
|
"shchai^a",
|
|
|
|
|
"shchoi^u",
|
|
|
|
|
"shchei^u",
|
|
|
|
|
"ivshimi",
|
|
|
|
|
"ivshymi",
|
|
|
|
|
"ivshego",
|
|
|
|
|
"ivshogo",
|
|
|
|
|
"ivshemu",
|
|
|
|
|
"ivshomu",
|
|
|
|
|
"ivshikh",
|
|
|
|
|
"ivshykh",
|
|
|
|
|
"ivshui^u",
|
|
|
|
|
"ivshai^a",
|
|
|
|
|
"ivshoi^u",
|
|
|
|
|
"ivshei^u",
|
|
|
|
|
"yvshimi",
|
|
|
|
|
"yvshymi",
|
|
|
|
|
"yvshego",
|
|
|
|
|
"yvshogo",
|
|
|
|
|
"yvshemu",
|
|
|
|
|
"yvshomu",
|
|
|
|
|
"yvshikh",
|
|
|
|
|
"yvshykh",
|
|
|
|
|
"yvshui^u",
|
|
|
|
|
"yvshai^a",
|
|
|
|
|
"yvshoi^u",
|
|
|
|
|
"yvshei^u",
|
|
|
|
|
"vshi^ui^u",
|
|
|
|
|
"vshi^ai^a",
|
|
|
|
|
"shchee",
|
|
|
|
|
"shchie",
|
|
|
|
|
"shchye",
|
|
|
|
|
"shchoe",
|
|
|
|
|
"shchei`",
|
|
|
|
|
"shchii`",
|
|
|
|
|
"shchyi`",
|
|
|
|
|
"shchoi`",
|
|
|
|
|
"shchem",
|
|
|
|
|
"shchim",
|
|
|
|
|
"shchym",
|
|
|
|
|
"shchom",
|
|
|
|
|
"ivshee",
|
|
|
|
|
"ivshie",
|
|
|
|
|
"ivshye",
|
|
|
|
|
"ivshoe",
|
|
|
|
|
"ivshei`",
|
|
|
|
|
"ivshii`",
|
|
|
|
|
"ivshyi`",
|
|
|
|
|
"ivshoi`",
|
|
|
|
|
"ivshem",
|
|
|
|
|
"ivshim",
|
|
|
|
|
"ivshym",
|
|
|
|
|
"ivshom",
|
|
|
|
|
"yvshee",
|
|
|
|
|
"yvshie",
|
|
|
|
|
"yvshye",
|
|
|
|
|
"yvshoe",
|
|
|
|
|
"yvshei`",
|
|
|
|
|
"yvshii`",
|
|
|
|
|
"yvshyi`",
|
|
|
|
|
"yvshoi`",
|
|
|
|
|
"yvshem",
|
|
|
|
|
"yvshim",
|
|
|
|
|
"yvshym",
|
|
|
|
|
"yvshom",
|
|
|
|
|
"vshimi",
|
|
|
|
|
"vshymi",
|
|
|
|
|
"vshego",
|
|
|
|
|
"vshogo",
|
|
|
|
|
"vshemu",
|
|
|
|
|
"vshomu",
|
|
|
|
|
"vshikh",
|
|
|
|
|
"vshykh",
|
|
|
|
|
"vshui^u",
|
|
|
|
|
"vshai^a",
|
|
|
|
|
"vshoi^u",
|
|
|
|
|
"vshei^u",
|
|
|
|
|
"emi^ui^u",
|
|
|
|
|
"emi^ai^a",
|
|
|
|
|
"nni^ui^u",
|
|
|
|
|
"nni^ai^a",
|
|
|
|
|
"vshee",
|
|
|
|
|
"vshie",
|
|
|
|
|
"vshye",
|
|
|
|
|
"vshoe",
|
|
|
|
|
"vshei`",
|
|
|
|
|
"vshii`",
|
|
|
|
|
"vshyi`",
|
|
|
|
|
"vshoi`",
|
|
|
|
|
"vshem",
|
|
|
|
|
"vshim",
|
|
|
|
|
"vshym",
|
|
|
|
|
"vshom",
|
|
|
|
|
"emimi",
|
|
|
|
|
"emymi",
|
|
|
|
|
"emego",
|
|
|
|
|
"emogo",
|
|
|
|
|
"ememu",
|
|
|
|
|
"emomu",
|
|
|
|
|
"emikh",
|
|
|
|
|
"emykh",
|
|
|
|
|
"emui^u",
|
|
|
|
|
"emai^a",
|
|
|
|
|
"emoi^u",
|
|
|
|
|
"emei^u",
|
|
|
|
|
"nnimi",
|
|
|
|
|
"nnymi",
|
|
|
|
|
"nnego",
|
|
|
|
|
"nnogo",
|
|
|
|
|
"nnemu",
|
|
|
|
|
"nnomu",
|
|
|
|
|
"nnikh",
|
|
|
|
|
"nnykh",
|
|
|
|
|
"nnui^u",
|
|
|
|
|
"nnai^a",
|
|
|
|
|
"nnoi^u",
|
|
|
|
|
"nnei^u",
|
|
|
|
|
"emee",
|
|
|
|
|
"emie",
|
|
|
|
|
"emye",
|
|
|
|
|
"emoe",
|
|
|
|
|
"emei`",
|
|
|
|
|
"emii`",
|
|
|
|
|
"emyi`",
|
|
|
|
|
"emoi`",
|
|
|
|
|
"emem",
|
|
|
|
|
"emim",
|
|
|
|
|
"emym",
|
|
|
|
|
"emom",
|
|
|
|
|
"nnee",
|
|
|
|
|
"nnie",
|
|
|
|
|
"nnye",
|
|
|
|
|
"nnoe",
|
|
|
|
|
"nnei`",
|
|
|
|
|
"nnii`",
|
|
|
|
|
"nnyi`",
|
|
|
|
|
"nnoi`",
|
|
|
|
|
"nnem",
|
|
|
|
|
"nnim",
|
|
|
|
|
"nnym",
|
|
|
|
|
"nnom",
|
|
|
|
|
"i^ui^u",
|
|
|
|
|
"i^ai^a",
|
|
|
|
|
"imi",
|
|
|
|
|
"ymi",
|
|
|
|
|
"ego",
|
|
|
|
|
"ogo",
|
|
|
|
|
"emu",
|
|
|
|
|
"omu",
|
|
|
|
|
"ikh",
|
|
|
|
|
"ykh",
|
|
|
|
|
"ui^u",
|
|
|
|
|
"ai^a",
|
|
|
|
|
"oi^u",
|
|
|
|
|
"ei^u",
|
|
|
|
|
"ee",
|
|
|
|
|
"ie",
|
|
|
|
|
"ye",
|
|
|
|
|
"oe",
|
|
|
|
|
"ei`",
|
|
|
|
|
"ii`",
|
|
|
|
|
"yi`",
|
|
|
|
|
"oi`",
|
|
|
|
|
"em",
|
|
|
|
|
"im",
|
|
|
|
|
"ym",
|
|
|
|
|
"om",
|
|
|
|
|
)
|
|
|
|
|
__reflexive_suffixes = ("si^a", "s'")
|
|
|
|
|
__verb_suffixes = (
|
|
|
|
|
"esh'",
|
|
|
|
|
"ei`te",
|
|
|
|
|
"ui`te",
|
|
|
|
|
"ui^ut",
|
|
|
|
|
"ish'",
|
|
|
|
|
"ete",
|
|
|
|
|
"i`te",
|
|
|
|
|
"i^ut",
|
|
|
|
|
"nno",
|
|
|
|
|
"ila",
|
|
|
|
|
"yla",
|
|
|
|
|
"ena",
|
|
|
|
|
"ite",
|
|
|
|
|
"ili",
|
|
|
|
|
"yli",
|
|
|
|
|
"ilo",
|
|
|
|
|
"ylo",
|
|
|
|
|
"eno",
|
|
|
|
|
"i^at",
|
|
|
|
|
"uet",
|
|
|
|
|
"eny",
|
|
|
|
|
"it'",
|
|
|
|
|
"yt'",
|
|
|
|
|
"ui^u",
|
|
|
|
|
"la",
|
|
|
|
|
"na",
|
|
|
|
|
"li",
|
|
|
|
|
"em",
|
|
|
|
|
"lo",
|
|
|
|
|
"no",
|
|
|
|
|
"et",
|
|
|
|
|
"ny",
|
|
|
|
|
"t'",
|
|
|
|
|
"ei`",
|
|
|
|
|
"ui`",
|
|
|
|
|
"il",
|
|
|
|
|
"yl",
|
|
|
|
|
"im",
|
|
|
|
|
"ym",
|
|
|
|
|
"en",
|
|
|
|
|
"it",
|
|
|
|
|
"yt",
|
|
|
|
|
"i^u",
|
|
|
|
|
"i`",
|
|
|
|
|
"l",
|
|
|
|
|
"n",
|
|
|
|
|
)
|
|
|
|
|
__noun_suffixes = (
|
|
|
|
|
"ii^ami",
|
|
|
|
|
"ii^akh",
|
|
|
|
|
"i^ami",
|
|
|
|
|
"ii^am",
|
|
|
|
|
"i^akh",
|
|
|
|
|
"ami",
|
|
|
|
|
"iei`",
|
|
|
|
|
"i^am",
|
|
|
|
|
"iem",
|
|
|
|
|
"akh",
|
|
|
|
|
"ii^u",
|
|
|
|
|
"'i^u",
|
|
|
|
|
"ii^a",
|
|
|
|
|
"'i^a",
|
|
|
|
|
"ev",
|
|
|
|
|
"ov",
|
|
|
|
|
"ie",
|
|
|
|
|
"'e",
|
|
|
|
|
"ei",
|
|
|
|
|
"ii",
|
|
|
|
|
"ei`",
|
|
|
|
|
"oi`",
|
|
|
|
|
"ii`",
|
|
|
|
|
"em",
|
|
|
|
|
"am",
|
|
|
|
|
"om",
|
|
|
|
|
"i^u",
|
|
|
|
|
"i^a",
|
|
|
|
|
"a",
|
|
|
|
|
"e",
|
|
|
|
|
"i",
|
|
|
|
|
"i`",
|
|
|
|
|
"o",
|
|
|
|
|
"u",
|
|
|
|
|
"y",
|
|
|
|
|
"'",
|
|
|
|
|
)
|
|
|
|
|
__superlative_suffixes = ("ei`she", "ei`sh")
|
|
|
|
|
__derivational_suffixes = ("ost'", "ost")
|
|
|
|
|
|
|
|
|
|
def stem(self, word):
|
|
|
|
|
"""
|
|
|
|
|
Stem a Russian word and return the stemmed form.
|
|
|
|
|
|
|
|
|
|
:param word: The word that is stemmed.
|
|
|
|
|
:type word: str or unicode
|
|
|
|
|
:return: The stemmed form.
|
|
|
|
|
:rtype: unicode
|
|
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
if word in self.stopwords:
|
|
|
|
|
return word
|
|
|
|
|
|
|
|
|
|
chr_exceeded = False
|
|
|
|
|
for i in range(len(word)):
|
|
|
|
|
if ord(word[i]) > 255:
|
|
|
|
|
chr_exceeded = True
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
if not chr_exceeded:
|
|
|
|
|
return word
|
|
|
|
|
|
|
|
|
|
word = self.__cyrillic_to_roman(word)
|
|
|
|
|
|
|
|
|
|
step1_success = False
|
|
|
|
|
adjectival_removed = False
|
|
|
|
|
verb_removed = False
|
|
|
|
|
undouble_success = False
|
|
|
|
|
superlative_removed = False
|
|
|
|
|
|
|
|
|
|
rv, r2 = self.__regions_russian(word)
|
|
|
|
|
|
|
|
|
|
# Step 1
|
|
|
|
|
for suffix in self.__perfective_gerund_suffixes:
|
|
|
|
|
if rv.endswith(suffix):
|
|
|
|
|
if suffix in ("v", "vshi", "vshis'"):
|
|
|
|
|
if (
|
|
|
|
|
rv[-len(suffix) - 3 : -len(suffix)] == "i^a"
|
|
|
|
|
or rv[-len(suffix) - 1 : -len(suffix)] == "a"
|
|
|
|
|
):
|
|
|
|
|
word = word[: -len(suffix)]
|
|
|
|
|
r2 = r2[: -len(suffix)]
|
|
|
|
|
rv = rv[: -len(suffix)]
|
|
|
|
|
step1_success = True
|
|
|
|
|
break
|
|
|
|
|
else:
|
|
|
|
|
word = word[: -len(suffix)]
|
|
|
|
|
r2 = r2[: -len(suffix)]
|
|
|
|
|
rv = rv[: -len(suffix)]
|
|
|
|
|
step1_success = True
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
if not step1_success:
|
|
|
|
|
for suffix in self.__reflexive_suffixes:
|
|
|
|
|
if rv.endswith(suffix):
|
|
|
|
|
word = word[: -len(suffix)]
|
|
|
|
|
r2 = r2[: -len(suffix)]
|
|
|
|
|
rv = rv[: -len(suffix)]
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
for suffix in self.__adjectival_suffixes:
|
|
|
|
|
if rv.endswith(suffix):
|
|
|
|
|
if suffix in (
|
|
|
|
|
"i^ushchi^ui^u",
|
|
|
|
|
"i^ushchi^ai^a",
|
|
|
|
|
"i^ushchui^u",
|
|
|
|
|
"i^ushchai^a",
|
|
|
|
|
"i^ushchoi^u",
|
|
|
|
|
"i^ushchei^u",
|
|
|
|
|
"i^ushchimi",
|
|
|
|
|
"i^ushchymi",
|
|
|
|
|
"i^ushchego",
|
|
|
|
|
"i^ushchogo",
|
|
|
|
|
"i^ushchemu",
|
|
|
|
|
"i^ushchomu",
|
|
|
|
|
"i^ushchikh",
|
|
|
|
|
"i^ushchykh",
|
|
|
|
|
"shchi^ui^u",
|
|
|
|
|
"shchi^ai^a",
|
|
|
|
|
"i^ushchee",
|
|
|
|
|
"i^ushchie",
|
|
|
|
|
"i^ushchye",
|
|
|
|
|
"i^ushchoe",
|
|
|
|
|
"i^ushchei`",
|
|
|
|
|
"i^ushchii`",
|
|
|
|
|
"i^ushchyi`",
|
|
|
|
|
"i^ushchoi`",
|
|
|
|
|
"i^ushchem",
|
|
|
|
|
"i^ushchim",
|
|
|
|
|
"i^ushchym",
|
|
|
|
|
"i^ushchom",
|
|
|
|
|
"vshi^ui^u",
|
|
|
|
|
"vshi^ai^a",
|
|
|
|
|
"shchui^u",
|
|
|
|
|
"shchai^a",
|
|
|
|
|
"shchoi^u",
|
|
|
|
|
"shchei^u",
|
|
|
|
|
"emi^ui^u",
|
|
|
|
|
"emi^ai^a",
|
|
|
|
|
"nni^ui^u",
|
|
|
|
|
"nni^ai^a",
|
|
|
|
|
"shchimi",
|
|
|
|
|
"shchymi",
|
|
|
|
|
"shchego",
|
|
|
|
|
"shchogo",
|
|
|
|
|
"shchemu",
|
|
|
|
|
"shchomu",
|
|
|
|
|
"shchikh",
|
|
|
|
|
"shchykh",
|
|
|
|
|
"vshui^u",
|
|
|
|
|
"vshai^a",
|
|
|
|
|
"vshoi^u",
|
|
|
|
|
"vshei^u",
|
|
|
|
|
"shchee",
|
|
|
|
|
"shchie",
|
|
|
|
|
"shchye",
|
|
|
|
|
"shchoe",
|
|
|
|
|
"shchei`",
|
|
|
|
|
"shchii`",
|
|
|
|
|
"shchyi`",
|
|
|
|
|
"shchoi`",
|
|
|
|
|
"shchem",
|
|
|
|
|
"shchim",
|
|
|
|
|
"shchym",
|
|
|
|
|
"shchom",
|
|
|
|
|
"vshimi",
|
|
|
|
|
"vshymi",
|
|
|
|
|
"vshego",
|
|
|
|
|
"vshogo",
|
|
|
|
|
"vshemu",
|
|
|
|
|
"vshomu",
|
|
|
|
|
"vshikh",
|
|
|
|
|
"vshykh",
|
|
|
|
|
"emui^u",
|
|
|
|
|
"emai^a",
|
|
|
|
|
"emoi^u",
|
|
|
|
|
"emei^u",
|
|
|
|
|
"nnui^u",
|
|
|
|
|
"nnai^a",
|
|
|
|
|
"nnoi^u",
|
|
|
|
|
"nnei^u",
|
|
|
|
|
"vshee",
|
|
|
|
|
"vshie",
|
|
|
|
|
"vshye",
|
|
|
|
|
"vshoe",
|
|
|
|
|
"vshei`",
|
|
|
|
|
"vshii`",
|
|
|
|
|
"vshyi`",
|
|
|
|
|
"vshoi`",
|
|
|
|
|
"vshem",
|
|
|
|
|
"vshim",
|
|
|
|
|
"vshym",
|
|
|
|
|
"vshom",
|
|
|
|
|
"emimi",
|
|
|
|
|
"emymi",
|
|
|
|
|
"emego",
|
|
|
|
|
"emogo",
|
|
|
|
|
"ememu",
|
|
|
|
|
"emomu",
|
|
|
|
|
"emikh",
|
|
|
|
|
"emykh",
|
|
|
|
|
"nnimi",
|
|
|
|
|
"nnymi",
|
|
|
|
|
"nnego",
|
|
|
|
|
"nnogo",
|
|
|
|
|
"nnemu",
|
|
|
|
|
"nnomu",
|
|
|
|
|
"nnikh",
|
|
|
|
|
"nnykh",
|
|
|
|
|
"emee",
|
|
|
|
|
"emie",
|
|
|
|
|
"emye",
|
|
|
|
|
"emoe",
|
|
|
|
|
"emei`",
|
|
|
|
|
"emii`",
|
|
|
|
|
"emyi`",
|
|
|
|
|
"emoi`",
|
|
|
|
|
"emem",
|
|
|
|
|
"emim",
|
|
|
|
|
"emym",
|
|
|
|
|
"emom",
|
|
|
|
|
"nnee",
|
|
|
|
|
"nnie",
|
|
|
|
|
"nnye",
|
|
|
|
|
"nnoe",
|
|
|
|
|
"nnei`",
|
|
|
|
|
"nnii`",
|
|
|
|
|
"nnyi`",
|
|
|
|
|
"nnoi`",
|
|
|
|
|
"nnem",
|
|
|
|
|
"nnim",
|
|
|
|
|
"nnym",
|
|
|
|
|
"nnom",
|
|
|
|
|
):
|
|
|
|
|
if (
|
|
|
|
|
rv[-len(suffix) - 3 : -len(suffix)] == "i^a"
|
|
|
|
|
or rv[-len(suffix) - 1 : -len(suffix)] == "a"
|
|
|
|
|
):
|
|
|
|
|
word = word[: -len(suffix)]
|
|
|
|
|
r2 = r2[: -len(suffix)]
|
|
|
|
|
rv = rv[: -len(suffix)]
|
|
|
|
|
adjectival_removed = True
|
|
|
|
|
break
|
|
|
|
|
else:
|
|
|
|
|
word = word[: -len(suffix)]
|
|
|
|
|
r2 = r2[: -len(suffix)]
|
|
|
|
|
rv = rv[: -len(suffix)]
|
|
|
|
|
adjectival_removed = True
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
if not adjectival_removed:
|
|
|
|
|
for suffix in self.__verb_suffixes:
|
|
|
|
|
if rv.endswith(suffix):
|
|
|
|
|
if suffix in (
|
|
|
|
|
"la",
|
|
|
|
|
"na",
|
|
|
|
|
"ete",
|
|
|
|
|
"i`te",
|
|
|
|
|
"li",
|
|
|
|
|
"i`",
|
|
|
|
|
"l",
|
|
|
|
|
"em",
|
|
|
|
|
"n",
|
|
|
|
|
"lo",
|
|
|
|
|
"no",
|
|
|
|
|
"et",
|
|
|
|
|
"i^ut",
|
|
|
|
|
"ny",
|
|
|
|
|
"t'",
|
|
|
|
|
"esh'",
|
|
|
|
|
"nno",
|
|
|
|
|
):
|
|
|
|
|
if (
|
|
|
|
|
rv[-len(suffix) - 3 : -len(suffix)] == "i^a"
|
|
|
|
|
or rv[-len(suffix) - 1 : -len(suffix)] == "a"
|
|
|
|
|
):
|
|
|
|
|
word = word[: -len(suffix)]
|
|
|
|
|
r2 = r2[: -len(suffix)]
|
|
|
|
|
rv = rv[: -len(suffix)]
|
|
|
|
|
verb_removed = True
|
|
|
|
|
break
|
|
|
|
|
else:
|
|
|
|
|
word = word[: -len(suffix)]
|
|
|
|
|
r2 = r2[: -len(suffix)]
|
|
|
|
|
rv = rv[: -len(suffix)]
|
|
|
|
|
verb_removed = True
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
if not adjectival_removed and not verb_removed:
|
|
|
|
|
for suffix in self.__noun_suffixes:
|
|
|
|
|
if rv.endswith(suffix):
|
|
|
|
|
word = word[: -len(suffix)]
|
|
|
|
|
r2 = r2[: -len(suffix)]
|
|
|
|
|
rv = rv[: -len(suffix)]
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
# Step 2
|
|
|
|
|
if rv.endswith("i"):
|
|
|
|
|
word = word[:-1]
|
|
|
|
|
r2 = r2[:-1]
|
|
|
|
|
|
|
|
|
|
# Step 3
|
|
|
|
|
for suffix in self.__derivational_suffixes:
|
|
|
|
|
if r2.endswith(suffix):
|
|
|
|
|
word = word[: -len(suffix)]
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
# Step 4
|
|
|
|
|
if word.endswith("nn"):
|
|
|
|
|
word = word[:-1]
|
|
|
|
|
undouble_success = True
|
|
|
|
|
|
|
|
|
|
if not undouble_success:
|
|
|
|
|
for suffix in self.__superlative_suffixes:
|
|
|
|
|
if word.endswith(suffix):
|
|
|
|
|
word = word[: -len(suffix)]
|
|
|
|
|
superlative_removed = True
|
|
|
|
|
break
|
|
|
|
|
if word.endswith("nn"):
|
|
|
|
|
word = word[:-1]
|
|
|
|
|
|
|
|
|
|
if not undouble_success and not superlative_removed:
|
|
|
|
|
if word.endswith("'"):
|
|
|
|
|
word = word[:-1]
|
|
|
|
|
|
|
|
|
|
word = self.__roman_to_cyrillic(word)
|
|
|
|
|
|
|
|
|
|
return word
|
|
|
|
|
|
|
|
|
|
def __regions_russian(self, word):
|
|
|
|
|
"""
|
|
|
|
|
Return the regions RV and R2 which are used by the Russian stemmer.
|
|
|
|
|
|
|
|
|
|
In any word, RV is the region after the first vowel,
|
|
|
|
|
or the end of the word if it contains no vowel.
|
|
|
|
|
|
|
|
|
|
R2 is the region after the first non-vowel following
|
|
|
|
|
a vowel in R1, or the end of the word if there is no such non-vowel.
|
|
|
|
|
|
|
|
|
|
R1 is the region after the first non-vowel following a vowel,
|
|
|
|
|
or the end of the word if there is no such non-vowel.
|
|
|
|
|
|
|
|
|
|
:param word: The Russian word whose regions RV and R2 are determined.
|
|
|
|
|
:type word: str or unicode
|
|
|
|
|
:return: the regions RV and R2 for the respective Russian word.
|
|
|
|
|
:rtype: tuple
|
|
|
|
|
:note: This helper method is invoked by the stem method of the subclass
|
|
|
|
|
RussianStemmer. It is not to be invoked directly!
|
|
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
r1 = ""
|
|
|
|
|
r2 = ""
|
|
|
|
|
rv = ""
|
|
|
|
|
|
|
|
|
|
vowels = ("A", "U", "E", "a", "e", "i", "o", "u", "y")
|
|
|
|
|
word = word.replace("i^a", "A").replace("i^u", "U").replace("e`", "E")
|
|
|
|
|
|
|
|
|
|
for i in range(1, len(word)):
|
|
|
|
|
if word[i] not in vowels and word[i - 1] in vowels:
|
|
|
|
|
r1 = word[i + 1 :]
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
for i in range(1, len(r1)):
|
|
|
|
|
if r1[i] not in vowels and r1[i - 1] in vowels:
|
|
|
|
|
r2 = r1[i + 1 :]
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
for i in range(len(word)):
|
|
|
|
|
if word[i] in vowels:
|
|
|
|
|
rv = word[i + 1 :]
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
r2 = r2.replace("A", "i^a").replace("U", "i^u").replace("E", "e`")
|
|
|
|
|
rv = rv.replace("A", "i^a").replace("U", "i^u").replace("E", "e`")
|
|
|
|
|
|
|
|
|
|
return (rv, r2)
|
|
|
|
|
|
|
|
|
|
def __cyrillic_to_roman(self, word):
|
|
|
|
|
"""
|
|
|
|
|
Transliterate a Russian word into the Roman alphabet.
|
|
|
|
|
|
|
|
|
|
A Russian word whose letters consist of the Cyrillic
|
|
|
|
|
alphabet are transliterated into the Roman alphabet
|
|
|
|
|
in order to ease the forthcoming stemming process.
|
|
|
|
|
|
|
|
|
|
:param word: The word that is transliterated.
|
|
|
|
|
:type word: unicode
|
|
|
|
|
:return: the transliterated word.
|
|
|
|
|
:rtype: unicode
|
|
|
|
|
:note: This helper method is invoked by the stem method of the subclass
|
|
|
|
|
RussianStemmer. It is not to be invoked directly!
|
|
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
word = (
|
|
|
|
|
word.replace("\u0410", "a")
|
|
|
|
|
.replace("\u0430", "a")
|
|
|
|
|
.replace("\u0411", "b")
|
|
|
|
|
.replace("\u0431", "b")
|
|
|
|
|
.replace("\u0412", "v")
|
|
|
|
|
.replace("\u0432", "v")
|
|
|
|
|
.replace("\u0413", "g")
|
|
|
|
|
.replace("\u0433", "g")
|
|
|
|
|
.replace("\u0414", "d")
|
|
|
|
|
.replace("\u0434", "d")
|
|
|
|
|
.replace("\u0415", "e")
|
|
|
|
|
.replace("\u0435", "e")
|
|
|
|
|
.replace("\u0401", "e")
|
|
|
|
|
.replace("\u0451", "e")
|
|
|
|
|
.replace("\u0416", "zh")
|
|
|
|
|
.replace("\u0436", "zh")
|
|
|
|
|
.replace("\u0417", "z")
|
|
|
|
|
.replace("\u0437", "z")
|
|
|
|
|
.replace("\u0418", "i")
|
|
|
|
|
.replace("\u0438", "i")
|
|
|
|
|
.replace("\u0419", "i`")
|
|
|
|
|
.replace("\u0439", "i`")
|
|
|
|
|
.replace("\u041A", "k")
|
|
|
|
|
.replace("\u043A", "k")
|
|
|
|
|
.replace("\u041B", "l")
|
|
|
|
|
.replace("\u043B", "l")
|
|
|
|
|
.replace("\u041C", "m")
|
|
|
|
|
.replace("\u043C", "m")
|
|
|
|
|
.replace("\u041D", "n")
|
|
|
|
|
.replace("\u043D", "n")
|
|
|
|
|
.replace("\u041E", "o")
|
|
|
|
|
.replace("\u043E", "o")
|
|
|
|
|
.replace("\u041F", "p")
|
|
|
|
|
.replace("\u043F", "p")
|
|
|
|
|
.replace("\u0420", "r")
|
|
|
|
|
.replace("\u0440", "r")
|
|
|
|
|
.replace("\u0421", "s")
|
|
|
|
|
.replace("\u0441", "s")
|
|
|
|
|
.replace("\u0422", "t")
|
|
|
|
|
.replace("\u0442", "t")
|
|
|
|
|
.replace("\u0423", "u")
|
|
|
|
|
.replace("\u0443", "u")
|
|
|
|
|
.replace("\u0424", "f")
|
|
|
|
|
.replace("\u0444", "f")
|
|
|
|
|
.replace("\u0425", "kh")
|
|
|
|
|
.replace("\u0445", "kh")
|
|
|
|
|
.replace("\u0426", "t^s")
|
|
|
|
|
.replace("\u0446", "t^s")
|
|
|
|
|
.replace("\u0427", "ch")
|
|
|
|
|
.replace("\u0447", "ch")
|
|
|
|
|
.replace("\u0428", "sh")
|
|
|
|
|
.replace("\u0448", "sh")
|
|
|
|
|
.replace("\u0429", "shch")
|
|
|
|
|
.replace("\u0449", "shch")
|
|
|
|
|
.replace("\u042A", "''")
|
|
|
|
|
.replace("\u044A", "''")
|
|
|
|
|
.replace("\u042B", "y")
|
|
|
|
|
.replace("\u044B", "y")
|
|
|
|
|
.replace("\u042C", "'")
|
|
|
|
|
.replace("\u044C", "'")
|
|
|
|
|
.replace("\u042D", "e`")
|
|
|
|
|
.replace("\u044D", "e`")
|
|
|
|
|
.replace("\u042E", "i^u")
|
|
|
|
|
.replace("\u044E", "i^u")
|
|
|
|
|
.replace("\u042F", "i^a")
|
|
|
|
|
.replace("\u044F", "i^a")
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
return word
|
|
|
|
|
|
|
|
|
|
def __roman_to_cyrillic(self, word):
|
|
|
|
|
"""
|
|
|
|
|
Transliterate a Russian word back into the Cyrillic alphabet.
|
|
|
|
|
|
|
|
|
|
A Russian word formerly transliterated into the Roman alphabet
|
|
|
|
|
in order to ease the stemming process, is transliterated back
|
|
|
|
|
into the Cyrillic alphabet, its original form.
|
|
|
|
|
|
|
|
|
|
:param word: The word that is transliterated.
|
|
|
|
|
:type word: str or unicode
|
|
|
|
|
:return: word, the transliterated word.
|
|
|
|
|
:rtype: unicode
|
|
|
|
|
:note: This helper method is invoked by the stem method of the subclass
|
|
|
|
|
RussianStemmer. It is not to be invoked directly!
|
|
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
word = (
|
|
|
|
|
word.replace("i^u", "\u044E")
|
|
|
|
|
.replace("i^a", "\u044F")
|
|
|
|
|
.replace("shch", "\u0449")
|
|
|
|
|
.replace("kh", "\u0445")
|
|
|
|
|
.replace("t^s", "\u0446")
|
|
|
|
|
.replace("ch", "\u0447")
|
|
|
|
|
.replace("e`", "\u044D")
|
|
|
|
|
.replace("i`", "\u0439")
|
|
|
|
|
.replace("sh", "\u0448")
|
|
|
|
|
.replace("k", "\u043A")
|
|
|
|
|
.replace("e", "\u0435")
|
|
|
|
|
.replace("zh", "\u0436")
|
|
|
|
|
.replace("a", "\u0430")
|
|
|
|
|
.replace("b", "\u0431")
|
|
|
|
|
.replace("v", "\u0432")
|
|
|
|
|
.replace("g", "\u0433")
|
|
|
|
|
.replace("d", "\u0434")
|
|
|
|
|
.replace("e", "\u0435")
|
|
|
|
|
.replace("z", "\u0437")
|
|
|
|
|
.replace("i", "\u0438")
|
|
|
|
|
.replace("l", "\u043B")
|
|
|
|
|
.replace("m", "\u043C")
|
|
|
|
|
.replace("n", "\u043D")
|
|
|
|
|
.replace("o", "\u043E")
|
|
|
|
|
.replace("p", "\u043F")
|
|
|
|
|
.replace("r", "\u0440")
|
|
|
|
|
.replace("s", "\u0441")
|
|
|
|
|
.replace("t", "\u0442")
|
|
|
|
|
.replace("u", "\u0443")
|
|
|
|
|
.replace("f", "\u0444")
|
|
|
|
|
.replace("''", "\u044A")
|
|
|
|
|
.replace("y", "\u044B")
|
|
|
|
|
.replace("'", "\u044C")
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
return word
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class SpanishStemmer(_StandardStemmer):
|
|
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
The Spanish Snowball stemmer.
|
|
|
|
|
|
|
|
|
|
:cvar __vowels: The Spanish vowels.
|
|
|
|
|
:type __vowels: unicode
|
|
|
|
|
:cvar __step0_suffixes: Suffixes to be deleted in step 0 of the algorithm.
|
|
|
|
|
:type __step0_suffixes: tuple
|
|
|
|
|
:cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm.
|
|
|
|
|
:type __step1_suffixes: tuple
|
|
|
|
|
:cvar __step2a_suffixes: Suffixes to be deleted in step 2a of the algorithm.
|
|
|
|
|
:type __step2a_suffixes: tuple
|
|
|
|
|
:cvar __step2b_suffixes: Suffixes to be deleted in step 2b of the algorithm.
|
|
|
|
|
:type __step2b_suffixes: tuple
|
|
|
|
|
:cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm.
|
|
|
|
|
:type __step3_suffixes: tuple
|
|
|
|
|
:note: A detailed description of the Spanish
|
|
|
|
|
stemming algorithm can be found under
|
|
|
|
|
http://snowball.tartarus.org/algorithms/spanish/stemmer.html
|
|
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
__vowels = "aeiou\xE1\xE9\xED\xF3\xFA\xFC"
|
|
|
|
|
__step0_suffixes = (
|
|
|
|
|
"selas",
|
|
|
|
|
"selos",
|
|
|
|
|
"sela",
|
|
|
|
|
"selo",
|
|
|
|
|
"las",
|
|
|
|
|
"les",
|
|
|
|
|
"los",
|
|
|
|
|
"nos",
|
|
|
|
|
"me",
|
|
|
|
|
"se",
|
|
|
|
|
"la",
|
|
|
|
|
"le",
|
|
|
|
|
"lo",
|
|
|
|
|
)
|
|
|
|
|
__step1_suffixes = (
|
|
|
|
|
"amientos",
|
|
|
|
|
"imientos",
|
|
|
|
|
"amiento",
|
|
|
|
|
"imiento",
|
|
|
|
|
"aciones",
|
|
|
|
|
"uciones",
|
|
|
|
|
"adoras",
|
|
|
|
|
"adores",
|
|
|
|
|
"ancias",
|
|
|
|
|
"log\xEDas",
|
|
|
|
|
"encias",
|
|
|
|
|
"amente",
|
|
|
|
|
"idades",
|
|
|
|
|
"anzas",
|
|
|
|
|
"ismos",
|
|
|
|
|
"ables",
|
|
|
|
|
"ibles",
|
|
|
|
|
"istas",
|
|
|
|
|
"adora",
|
|
|
|
|
"aci\xF3n",
|
|
|
|
|
"antes",
|
|
|
|
|
"ancia",
|
|
|
|
|
"log\xEDa",
|
|
|
|
|
"uci\xf3n",
|
|
|
|
|
"encia",
|
|
|
|
|
"mente",
|
|
|
|
|
"anza",
|
|
|
|
|
"icos",
|
|
|
|
|
"icas",
|
|
|
|
|
"ismo",
|
|
|
|
|
"able",
|
|
|
|
|
"ible",
|
|
|
|
|
"ista",
|
|
|
|
|
"osos",
|
|
|
|
|
"osas",
|
|
|
|
|
"ador",
|
|
|
|
|
"ante",
|
|
|
|
|
"idad",
|
|
|
|
|
"ivas",
|
|
|
|
|
"ivos",
|
|
|
|
|
"ico",
|
|
|
|
|
"ica",
|
|
|
|
|
"oso",
|
|
|
|
|
"osa",
|
|
|
|
|
"iva",
|
|
|
|
|
"ivo",
|
|
|
|
|
)
|
|
|
|
|
__step2a_suffixes = (
|
|
|
|
|
"yeron",
|
|
|
|
|
"yendo",
|
|
|
|
|
"yamos",
|
|
|
|
|
"yais",
|
|
|
|
|
"yan",
|
|
|
|
|
"yen",
|
|
|
|
|
"yas",
|
|
|
|
|
"yes",
|
|
|
|
|
"ya",
|
|
|
|
|
"ye",
|
|
|
|
|
"yo",
|
|
|
|
|
"y\xF3",
|
|
|
|
|
)
|
|
|
|
|
__step2b_suffixes = (
|
|
|
|
|
"ar\xEDamos",
|
|
|
|
|
"er\xEDamos",
|
|
|
|
|
"ir\xEDamos",
|
|
|
|
|
"i\xE9ramos",
|
|
|
|
|
"i\xE9semos",
|
|
|
|
|
"ar\xEDais",
|
|
|
|
|
"aremos",
|
|
|
|
|
"er\xEDais",
|
|
|
|
|
"eremos",
|
|
|
|
|
"ir\xEDais",
|
|
|
|
|
"iremos",
|
|
|
|
|
"ierais",
|
|
|
|
|
"ieseis",
|
|
|
|
|
"asteis",
|
|
|
|
|
"isteis",
|
|
|
|
|
"\xE1bamos",
|
|
|
|
|
"\xE1ramos",
|
|
|
|
|
"\xE1semos",
|
|
|
|
|
"ar\xEDan",
|
|
|
|
|
"ar\xEDas",
|
|
|
|
|
"ar\xE9is",
|
|
|
|
|
"er\xEDan",
|
|
|
|
|
"er\xEDas",
|
|
|
|
|
"er\xE9is",
|
|
|
|
|
"ir\xEDan",
|
|
|
|
|
"ir\xEDas",
|
|
|
|
|
"ir\xE9is",
|
|
|
|
|
"ieran",
|
|
|
|
|
"iesen",
|
|
|
|
|
"ieron",
|
|
|
|
|
"iendo",
|
|
|
|
|
"ieras",
|
|
|
|
|
"ieses",
|
|
|
|
|
"abais",
|
|
|
|
|
"arais",
|
|
|
|
|
"aseis",
|
|
|
|
|
"\xE9amos",
|
|
|
|
|
"ar\xE1n",
|
|
|
|
|
"ar\xE1s",
|
|
|
|
|
"ar\xEDa",
|
|
|
|
|
"er\xE1n",
|
|
|
|
|
"er\xE1s",
|
|
|
|
|
"er\xEDa",
|
|
|
|
|
"ir\xE1n",
|
|
|
|
|
"ir\xE1s",
|
|
|
|
|
"ir\xEDa",
|
|
|
|
|
"iera",
|
|
|
|
|
"iese",
|
|
|
|
|
"aste",
|
|
|
|
|
"iste",
|
|
|
|
|
"aban",
|
|
|
|
|
"aran",
|
|
|
|
|
"asen",
|
|
|
|
|
"aron",
|
|
|
|
|
"ando",
|
|
|
|
|
"abas",
|
|
|
|
|
"adas",
|
|
|
|
|
"idas",
|
|
|
|
|
"aras",
|
|
|
|
|
"ases",
|
|
|
|
|
"\xEDais",
|
|
|
|
|
"ados",
|
|
|
|
|
"idos",
|
|
|
|
|
"amos",
|
|
|
|
|
"imos",
|
|
|
|
|
"emos",
|
|
|
|
|
"ar\xE1",
|
|
|
|
|
"ar\xE9",
|
|
|
|
|
"er\xE1",
|
|
|
|
|
"er\xE9",
|
|
|
|
|
"ir\xE1",
|
|
|
|
|
"ir\xE9",
|
|
|
|
|
"aba",
|
|
|
|
|
"ada",
|
|
|
|
|
"ida",
|
|
|
|
|
"ara",
|
|
|
|
|
"ase",
|
|
|
|
|
"\xEDan",
|
|
|
|
|
"ado",
|
|
|
|
|
"ido",
|
|
|
|
|
"\xEDas",
|
|
|
|
|
"\xE1is",
|
|
|
|
|
"\xE9is",
|
|
|
|
|
"\xEDa",
|
|
|
|
|
"ad",
|
|
|
|
|
"ed",
|
|
|
|
|
"id",
|
|
|
|
|
"an",
|
|
|
|
|
"i\xF3",
|
|
|
|
|
"ar",
|
|
|
|
|
"er",
|
|
|
|
|
"ir",
|
|
|
|
|
"as",
|
|
|
|
|
"\xEDs",
|
|
|
|
|
"en",
|
|
|
|
|
"es",
|
|
|
|
|
)
|
|
|
|
|
__step3_suffixes = ("os", "a", "e", "o", "\xE1", "\xE9", "\xED", "\xF3")
|
|
|
|
|
|
|
|
|
|
def stem(self, word):
|
|
|
|
|
"""
|
|
|
|
|
Stem a Spanish word and return the stemmed form.
|
|
|
|
|
|
|
|
|
|
:param word: The word that is stemmed.
|
|
|
|
|
:type word: str or unicode
|
|
|
|
|
:return: The stemmed form.
|
|
|
|
|
:rtype: unicode
|
|
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
word = word.lower()
|
|
|
|
|
|
|
|
|
|
if word in self.stopwords:
|
|
|
|
|
return word
|
|
|
|
|
|
|
|
|
|
step1_success = False
|
|
|
|
|
|
|
|
|
|
r1, r2 = self._r1r2_standard(word, self.__vowels)
|
|
|
|
|
rv = self._rv_standard(word, self.__vowels)
|
|
|
|
|
|
|
|
|
|
# STEP 0: Attached pronoun
|
|
|
|
|
for suffix in self.__step0_suffixes:
|
|
|
|
|
if not (word.endswith(suffix) and rv.endswith(suffix)):
|
|
|
|
|
continue
|
|
|
|
|
|
|
|
|
|
if (
|
|
|
|
|
rv[: -len(suffix)].endswith(
|
|
|
|
|
(
|
|
|
|
|
"ando",
|
|
|
|
|
"\xE1ndo",
|
|
|
|
|
"ar",
|
|
|
|
|
"\xE1r",
|
|
|
|
|
"er",
|
|
|
|
|
"\xE9r",
|
|
|
|
|
"iendo",
|
|
|
|
|
"i\xE9ndo",
|
|
|
|
|
"ir",
|
|
|
|
|
"\xEDr",
|
|
|
|
|
)
|
|
|
|
|
)
|
|
|
|
|
) or (
|
|
|
|
|
rv[: -len(suffix)].endswith("yendo")
|
|
|
|
|
and word[: -len(suffix)].endswith("uyendo")
|
|
|
|
|
):
|
|
|
|
|
|
|
|
|
|
word = self.__replace_accented(word[: -len(suffix)])
|
|
|
|
|
r1 = self.__replace_accented(r1[: -len(suffix)])
|
|
|
|
|
r2 = self.__replace_accented(r2[: -len(suffix)])
|
|
|
|
|
rv = self.__replace_accented(rv[: -len(suffix)])
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
# STEP 1: Standard suffix removal
|
|
|
|
|
for suffix in self.__step1_suffixes:
|
|
|
|
|
if not word.endswith(suffix):
|
|
|
|
|
continue
|
|
|
|
|
|
|
|
|
|
if suffix == "amente" and r1.endswith(suffix):
|
|
|
|
|
step1_success = True
|
|
|
|
|
word = word[:-6]
|
|
|
|
|
r2 = r2[:-6]
|
|
|
|
|
rv = rv[:-6]
|
|
|
|
|
|
|
|
|
|
if r2.endswith("iv"):
|
|
|
|
|
word = word[:-2]
|
|
|
|
|
r2 = r2[:-2]
|
|
|
|
|
rv = rv[:-2]
|
|
|
|
|
|
|
|
|
|
if r2.endswith("at"):
|
|
|
|
|
word = word[:-2]
|
|
|
|
|
rv = rv[:-2]
|
|
|
|
|
|
|
|
|
|
elif r2.endswith(("os", "ic", "ad")):
|
|
|
|
|
word = word[:-2]
|
|
|
|
|
rv = rv[:-2]
|
|
|
|
|
|
|
|
|
|
elif r2.endswith(suffix):
|
|
|
|
|
step1_success = True
|
|
|
|
|
if suffix in (
|
|
|
|
|
"adora",
|
|
|
|
|
"ador",
|
|
|
|
|
"aci\xF3n",
|
|
|
|
|
"adoras",
|
|
|
|
|
"adores",
|
|
|
|
|
"aciones",
|
|
|
|
|
"ante",
|
|
|
|
|
"antes",
|
|
|
|
|
"ancia",
|
|
|
|
|
"ancias",
|
|
|
|
|
):
|
|
|
|
|
word = word[: -len(suffix)]
|
|
|
|
|
r2 = r2[: -len(suffix)]
|
|
|
|
|
rv = rv[: -len(suffix)]
|
|
|
|
|
|
|
|
|
|
if r2.endswith("ic"):
|
|
|
|
|
word = word[:-2]
|
|
|
|
|
rv = rv[:-2]
|
|
|
|
|
|
|
|
|
|
elif suffix in ("log\xEDa", "log\xEDas"):
|
|
|
|
|
word = suffix_replace(word, suffix, "log")
|
|
|
|
|
rv = suffix_replace(rv, suffix, "log")
|
|
|
|
|
|
|
|
|
|
elif suffix in ("uci\xF3n", "uciones"):
|
|
|
|
|
word = suffix_replace(word, suffix, "u")
|
|
|
|
|
rv = suffix_replace(rv, suffix, "u")
|
|
|
|
|
|
|
|
|
|
elif suffix in ("encia", "encias"):
|
|
|
|
|
word = suffix_replace(word, suffix, "ente")
|
|
|
|
|
rv = suffix_replace(rv, suffix, "ente")
|
|
|
|
|
|
|
|
|
|
elif suffix == "mente":
|
|
|
|
|
word = word[: -len(suffix)]
|
|
|
|
|
r2 = r2[: -len(suffix)]
|
|
|
|
|
rv = rv[: -len(suffix)]
|
|
|
|
|
|
|
|
|
|
if r2.endswith(("ante", "able", "ible")):
|
|
|
|
|
word = word[:-4]
|
|
|
|
|
rv = rv[:-4]
|
|
|
|
|
|
|
|
|
|
elif suffix in ("idad", "idades"):
|
|
|
|
|
word = word[: -len(suffix)]
|
|
|
|
|
r2 = r2[: -len(suffix)]
|
|
|
|
|
rv = rv[: -len(suffix)]
|
|
|
|
|
|
|
|
|
|
for pre_suff in ("abil", "ic", "iv"):
|
|
|
|
|
if r2.endswith(pre_suff):
|
|
|
|
|
word = word[: -len(pre_suff)]
|
|
|
|
|
rv = rv[: -len(pre_suff)]
|
|
|
|
|
|
|
|
|
|
elif suffix in ("ivo", "iva", "ivos", "ivas"):
|
|
|
|
|
word = word[: -len(suffix)]
|
|
|
|
|
r2 = r2[: -len(suffix)]
|
|
|
|
|
rv = rv[: -len(suffix)]
|
|
|
|
|
if r2.endswith("at"):
|
|
|
|
|
word = word[:-2]
|
|
|
|
|
rv = rv[:-2]
|
|
|
|
|
else:
|
|
|
|
|
word = word[: -len(suffix)]
|
|
|
|
|
rv = rv[: -len(suffix)]
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
# STEP 2a: Verb suffixes beginning 'y'
|
|
|
|
|
if not step1_success:
|
|
|
|
|
for suffix in self.__step2a_suffixes:
|
|
|
|
|
if rv.endswith(suffix) and word[-len(suffix) - 1 : -len(suffix)] == "u":
|
|
|
|
|
word = word[: -len(suffix)]
|
|
|
|
|
rv = rv[: -len(suffix)]
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
# STEP 2b: Other verb suffixes
|
|
|
|
|
for suffix in self.__step2b_suffixes:
|
|
|
|
|
if rv.endswith(suffix):
|
|
|
|
|
word = word[: -len(suffix)]
|
|
|
|
|
rv = rv[: -len(suffix)]
|
|
|
|
|
if suffix in ("en", "es", "\xE9is", "emos"):
|
|
|
|
|
if word.endswith("gu"):
|
|
|
|
|
word = word[:-1]
|
|
|
|
|
|
|
|
|
|
if rv.endswith("gu"):
|
|
|
|
|
rv = rv[:-1]
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
# STEP 3: Residual suffix
|
|
|
|
|
for suffix in self.__step3_suffixes:
|
|
|
|
|
if rv.endswith(suffix):
|
|
|
|
|
word = word[: -len(suffix)]
|
|
|
|
|
if suffix in ("e", "\xE9"):
|
|
|
|
|
rv = rv[: -len(suffix)]
|
|
|
|
|
|
|
|
|
|
if word[-2:] == "gu" and rv.endswith("u"):
|
|
|
|
|
word = word[:-1]
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
word = self.__replace_accented(word)
|
|
|
|
|
|
|
|
|
|
return word
|
|
|
|
|
|
|
|
|
|
def __replace_accented(self, word):
|
|
|
|
|
"""
|
|
|
|
|
Replaces all accented letters on a word with their non-accented
|
|
|
|
|
counterparts.
|
|
|
|
|
|
|
|
|
|
:param word: A spanish word, with or without accents
|
|
|
|
|
:type word: str or unicode
|
|
|
|
|
:return: a word with the accented letters (á, é, í, ó, ú) replaced with
|
|
|
|
|
their non-accented counterparts (a, e, i, o, u)
|
|
|
|
|
:rtype: str or unicode
|
|
|
|
|
"""
|
|
|
|
|
return (
|
|
|
|
|
word.replace("\xE1", "a")
|
|
|
|
|
.replace("\xE9", "e")
|
|
|
|
|
.replace("\xED", "i")
|
|
|
|
|
.replace("\xF3", "o")
|
|
|
|
|
.replace("\xFA", "u")
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class SwedishStemmer(_ScandinavianStemmer):
|
|
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
The Swedish Snowball stemmer.
|
|
|
|
|
|
|
|
|
|
:cvar __vowels: The Swedish vowels.
|
|
|
|
|
:type __vowels: unicode
|
|
|
|
|
:cvar __s_ending: Letters that may directly appear before a word final 's'.
|
|
|
|
|
:type __s_ending: unicode
|
|
|
|
|
:cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm.
|
|
|
|
|
:type __step1_suffixes: tuple
|
|
|
|
|
:cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm.
|
|
|
|
|
:type __step2_suffixes: tuple
|
|
|
|
|
:cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm.
|
|
|
|
|
:type __step3_suffixes: tuple
|
|
|
|
|
:note: A detailed description of the Swedish
|
|
|
|
|
stemming algorithm can be found under
|
|
|
|
|
http://snowball.tartarus.org/algorithms/swedish/stemmer.html
|
|
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
__vowels = "aeiouy\xE4\xE5\xF6"
|
|
|
|
|
__s_ending = "bcdfghjklmnoprtvy"
|
|
|
|
|
__step1_suffixes = (
|
|
|
|
|
"heterna",
|
|
|
|
|
"hetens",
|
|
|
|
|
"heter",
|
|
|
|
|
"heten",
|
|
|
|
|
"anden",
|
|
|
|
|
"arnas",
|
|
|
|
|
"ernas",
|
|
|
|
|
"ornas",
|
|
|
|
|
"andes",
|
|
|
|
|
"andet",
|
|
|
|
|
"arens",
|
|
|
|
|
"arna",
|
|
|
|
|
"erna",
|
|
|
|
|
"orna",
|
|
|
|
|
"ande",
|
|
|
|
|
"arne",
|
|
|
|
|
"aste",
|
|
|
|
|
"aren",
|
|
|
|
|
"ades",
|
|
|
|
|
"erns",
|
|
|
|
|
"ade",
|
|
|
|
|
"are",
|
|
|
|
|
"ern",
|
|
|
|
|
"ens",
|
|
|
|
|
"het",
|
|
|
|
|
"ast",
|
|
|
|
|
"ad",
|
|
|
|
|
"en",
|
|
|
|
|
"ar",
|
|
|
|
|
"er",
|
|
|
|
|
"or",
|
|
|
|
|
"as",
|
|
|
|
|
"es",
|
|
|
|
|
"at",
|
|
|
|
|
"a",
|
|
|
|
|
"e",
|
|
|
|
|
"s",
|
|
|
|
|
)
|
|
|
|
|
__step2_suffixes = ("dd", "gd", "nn", "dt", "gt", "kt", "tt")
|
|
|
|
|
__step3_suffixes = ("fullt", "l\xF6st", "els", "lig", "ig")
|
|
|
|
|
|
|
|
|
|
def stem(self, word):
|
|
|
|
|
"""
|
|
|
|
|
Stem a Swedish word and return the stemmed form.
|
|
|
|
|
|
|
|
|
|
:param word: The word that is stemmed.
|
|
|
|
|
:type word: str or unicode
|
|
|
|
|
:return: The stemmed form.
|
|
|
|
|
:rtype: unicode
|
|
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
word = word.lower()
|
|
|
|
|
|
|
|
|
|
if word in self.stopwords:
|
|
|
|
|
return word
|
|
|
|
|
|
|
|
|
|
r1 = self._r1_scandinavian(word, self.__vowels)
|
|
|
|
|
|
|
|
|
|
# STEP 1
|
|
|
|
|
for suffix in self.__step1_suffixes:
|
|
|
|
|
if r1.endswith(suffix):
|
|
|
|
|
if suffix == "s":
|
|
|
|
|
if word[-2] in self.__s_ending:
|
|
|
|
|
word = word[:-1]
|
|
|
|
|
r1 = r1[:-1]
|
|
|
|
|
else:
|
|
|
|
|
word = word[: -len(suffix)]
|
|
|
|
|
r1 = r1[: -len(suffix)]
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
# STEP 2
|
|
|
|
|
for suffix in self.__step2_suffixes:
|
|
|
|
|
if r1.endswith(suffix):
|
|
|
|
|
word = word[:-1]
|
|
|
|
|
r1 = r1[:-1]
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
# STEP 3
|
|
|
|
|
for suffix in self.__step3_suffixes:
|
|
|
|
|
if r1.endswith(suffix):
|
|
|
|
|
if suffix in ("els", "lig", "ig"):
|
|
|
|
|
word = word[: -len(suffix)]
|
|
|
|
|
elif suffix in ("fullt", "l\xF6st"):
|
|
|
|
|
word = word[:-1]
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
return word
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def demo():
|
|
|
|
|
"""
|
|
|
|
|
This function provides a demonstration of the Snowball stemmers.
|
|
|
|
|
|
|
|
|
|
After invoking this function and specifying a language,
|
|
|
|
|
it stems an excerpt of the Universal Declaration of Human Rights
|
|
|
|
|
(which is a part of the NLTK corpus collection) and then prints
|
|
|
|
|
out the original and the stemmed text.
|
|
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
from nltk.corpus import udhr
|
|
|
|
|
|
|
|
|
|
udhr_corpus = {
|
|
|
|
|
"arabic": "Arabic_Alarabia-Arabic",
|
|
|
|
|
"danish": "Danish_Dansk-Latin1",
|
|
|
|
|
"dutch": "Dutch_Nederlands-Latin1",
|
|
|
|
|
"english": "English-Latin1",
|
|
|
|
|
"finnish": "Finnish_Suomi-Latin1",
|
|
|
|
|
"french": "French_Francais-Latin1",
|
|
|
|
|
"german": "German_Deutsch-Latin1",
|
|
|
|
|
"hungarian": "Hungarian_Magyar-UTF8",
|
|
|
|
|
"italian": "Italian_Italiano-Latin1",
|
|
|
|
|
"norwegian": "Norwegian-Latin1",
|
|
|
|
|
"porter": "English-Latin1",
|
|
|
|
|
"portuguese": "Portuguese_Portugues-Latin1",
|
|
|
|
|
"romanian": "Romanian_Romana-Latin2",
|
|
|
|
|
"russian": "Russian-UTF8",
|
|
|
|
|
"spanish": "Spanish-Latin1",
|
|
|
|
|
"swedish": "Swedish_Svenska-Latin1",
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
print("\n")
|
|
|
|
|
print("******************************")
|
|
|
|
|
print("Demo for the Snowball stemmers")
|
|
|
|
|
print("******************************")
|
|
|
|
|
|
|
|
|
|
while True:
|
|
|
|
|
|
|
|
|
|
language = input(
|
|
|
|
|
"Please enter the name of the language "
|
|
|
|
|
+ "to be demonstrated\n"
|
|
|
|
|
+ "/".join(SnowballStemmer.languages)
|
|
|
|
|
+ "\n"
|
|
|
|
|
+ "(enter 'exit' in order to leave): "
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
if language == "exit":
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
if language not in SnowballStemmer.languages:
|
|
|
|
|
print(
|
|
|
|
|
(
|
|
|
|
|
"\nOops, there is no stemmer for this language. "
|
|
|
|
|
+ "Please try again.\n"
|
|
|
|
|
)
|
|
|
|
|
)
|
|
|
|
|
continue
|
|
|
|
|
|
|
|
|
|
stemmer = SnowballStemmer(language)
|
|
|
|
|
excerpt = udhr.words(udhr_corpus[language])[:300]
|
|
|
|
|
|
|
|
|
|
stemmed = " ".join(stemmer.stem(word) for word in excerpt)
|
|
|
|
|
stemmed = re.sub(r"(.{,70})\s", r"\1\n", stemmed + " ").rstrip()
|
|
|
|
|
excerpt = " ".join(excerpt)
|
|
|
|
|
excerpt = re.sub(r"(.{,70})\s", r"\1\n", excerpt + " ").rstrip()
|
|
|
|
|
|
|
|
|
|
print("\n")
|
|
|
|
|
print("-" * 70)
|
|
|
|
|
print("ORIGINAL".center(70))
|
|
|
|
|
print(excerpt)
|
|
|
|
|
print("\n\n")
|
|
|
|
|
print("STEMMED RESULTS".center(70))
|
|
|
|
|
print(stemmed)
|
|
|
|
|
print("-" * 70)
|
|
|
|
|
print("\n")
|