Spaces:
Sleeping
Sleeping
# | |
# Natural Language Toolkit: Snowball Stemmer | |
# | |
# Copyright (C) 2001-2023 NLTK Project | |
# Author: Peter Michael Stahl <pemistahl@gmail.com> | |
# Peter Ljunglof <peter.ljunglof@heatherleaf.se> (revisions) | |
# Lakhdar Benzahia <lakhdar.benzahia@gmail.com> (co-writer) | |
# Assem Chelli <assem.ch@gmail.com> (reviewer arabicstemmer) | |
# Abdelkrim Aries <ab_aries@esi.dz> (reviewer arabicstemmer) | |
# Algorithms: Dr Martin Porter <martin@tartarus.org> | |
# Assem Chelli <assem.ch@gmail.com> arabic stemming algorithm | |
# Benzahia Lakhdar <lakhdar.benzahia@gmail.com> | |
# URL: <https://www.nltk.org/> | |
# For license information, see LICENSE.TXT | |
""" | |
Snowball stemmers | |
This module provides a port of the Snowball stemmers | |
developed by Martin Porter. | |
There is also a demo function: `snowball.demo()`. | |
""" | |
import re | |
from nltk.corpus import stopwords | |
from nltk.stem import porter | |
from nltk.stem.api import StemmerI | |
from nltk.stem.util import prefix_replace, suffix_replace | |
class SnowballStemmer(StemmerI):
    """
    Snowball Stemmer

    Language-independent front end that delegates to one of the
    language-specific stemmer classes defined in this module.

    The following languages are supported:
    Arabic, Danish, Dutch, English, Finnish, French, German,
    Hungarian, Italian, Norwegian, Portuguese, Romanian, Russian,
    Spanish and Swedish.

    The algorithm for English is documented here:

        Porter, M. "An algorithm for suffix stripping."
        Program 14.3 (1980): 130-137.

    The algorithms have been developed by Martin Porter.
    These stemmers are called Snowball, because Porter created
    a programming language with this name for creating
    new stemming algorithms. There is more information available
    at http://snowball.tartarus.org/

    The stemmer is invoked as shown below:

    >>> from nltk.stem import SnowballStemmer # See which languages are supported
    >>> print(" ".join(SnowballStemmer.languages)) # doctest: +NORMALIZE_WHITESPACE
    arabic danish dutch english finnish french german hungarian
    italian norwegian porter portuguese romanian russian
    spanish swedish
    >>> stemmer = SnowballStemmer("german") # Choose a language
    >>> stemmer.stem("Autobahnen") # Stem a word
    'autobahn'

    Invoking the stemmers that way is useful if you do not know the
    language to be stemmed at runtime. Alternatively, if you already know
    the language, then you can invoke the language specific stemmer directly:

    >>> from nltk.stem.snowball import GermanStemmer
    >>> stemmer = GermanStemmer()
    >>> stemmer.stem("Autobahnen")
    'autobahn'

    :param language: The language whose subclass is instantiated.
    :type language: str or unicode
    :param ignore_stopwords: If set to True, stopwords are
                             not stemmed and returned unchanged.
                             Set to False by default.
    :type ignore_stopwords: bool
    :raise ValueError: If there is no stemmer for the specified
                       language, a ValueError is raised.
    """

    # Names accepted by the constructor. Each one is capitalised and
    # suffixed with "Stemmer" to find the implementing class.
    languages = (
        "arabic",
        "danish",
        "dutch",
        "english",
        "finnish",
        "french",
        "german",
        "hungarian",
        "italian",
        "norwegian",
        "porter",
        "portuguese",
        "romanian",
        "russian",
        "spanish",
        "swedish",
    )

    def __init__(self, language, ignore_stopwords=False):
        if language not in self.languages:
            raise ValueError(f"The language '{language}' is not supported.")
        # e.g. "german" -> GermanStemmer, looked up among this module's globals.
        stemmerclass = globals()[language.capitalize() + "Stemmer"]
        self.stemmer = stemmerclass(ignore_stopwords)
        # Bind the delegate's bound method directly on the instance so the
        # common call path ``obj.stem(token)`` skips one level of indirection.
        self.stem = self.stemmer.stem
        self.stopwords = self.stemmer.stopwords

    def stem(self, token):
        """
        Stem ``token`` with the language-specific stemmer chosen at
        construction time.

        Instances normally shadow this method with the delegate's bound
        ``stem`` (see ``__init__``); this definition is what runs when the
        method is reached through the class instead, e.g.
        ``SnowballStemmer.stem(obj, token)``.

        :param token: The word to stem.
        :return: The stemmed form returned by the underlying stemmer.
        """
        # ``self.stemmer.stem`` is already bound to the delegate, so only the
        # token is passed; forwarding ``self`` as well would raise TypeError.
        return self.stemmer.stem(token)
class _LanguageSpecificStemmer(StemmerI):
    """
    Common base for the per-language stemmer classes.

    It lets a caller that already knows the language instantiate the
    matching stemmer directly, and it takes care of loading the optional
    stopword list for that language.

    :param ignore_stopwords: If set to True, stopwords are
                             not stemmed and returned unchanged.
                             Set to False by default.
    :type ignore_stopwords: bool
    """

    def __init__(self, ignore_stopwords=False):
        # The language is the lower-cased class name without its trailing
        # "Stemmer", e.g. GermanStemmer -> "german".
        marker = "stemmer"
        language = type(self).__name__.lower()
        if language.endswith(marker):
            language = language[: -len(marker)]

        # Always present, and empty unless stopword handling was requested.
        self.stopwords = set()
        if not ignore_stopwords:
            return

        try:
            self.stopwords.update(stopwords.words(language))
        except OSError as e:
            # Reported as ValueError, chained to the underlying OSError.
            raise ValueError(
                f"{self!r} has no list of stopwords. Please set"
                " 'ignore_stopwords' to 'False'."
            ) from e

    def __repr__(self):
        """
        Return the class name wrapped in angle brackets, e.g. ``<GermanStemmer>``.
        """
        return "<" + type(self).__name__ + ">"
class PorterStemmer(_LanguageSpecificStemmer, porter.PorterStemmer):
    """
    A word stemmer based on the original Porter stemming algorithm.

        Porter, M. "An algorithm for suffix stripping."
        Program 14.3 (1980): 130-137.

    A few minor modifications have been made to Porter's basic
    algorithm. See the source code of the module
    nltk.stem.porter for more information.
    """

    def __init__(self, ignore_stopwords=False):
        # Each base is initialised explicitly: the first sets up the stopword
        # list, the second the Porter stemmer's own state.
        _LanguageSpecificStemmer.__init__(self, ignore_stopwords=ignore_stopwords)
        porter.PorterStemmer.__init__(self)
class _ScandinavianStemmer(_LanguageSpecificStemmer):
    """
    This subclass encapsulates a method for defining the string region R1.
    It is used by the Danish, Norwegian, and Swedish stemmer.
    """

    def _r1_scandinavian(self, word, vowels):
        """
        Return the region R1 that is used by the Scandinavian stemmers.

        R1 is the region after the first non-vowel following a vowel,
        or is the null region at the end of the word if there is no
        such non-vowel. But then R1 is adjusted so that the region
        before it contains at least three letters.

        :param word: The word whose region R1 is determined.
        :type word: str or unicode
        :param vowels: The vowels of the respective language that are
                       used to determine the region R1.
        :type vowels: unicode
        :return: the region R1 for the respective word.
        :rtype: unicode
        :note: This helper method is invoked by the respective stem method of
               the subclasses DanishStemmer, NorwegianStemmer, and
               SwedishStemmer. It is not to be invoked directly!
        """
        for i in range(1, len(word)):
            if word[i - 1] in vowels and word[i] not in vowels:
                # R1 would start at i + 1; pushing the start to index 3 at
                # the earliest guarantees three letters before the region.
                return word[max(i + 1, 3) :]
        # No vowel/non-vowel pair: R1 is the empty region.
        return ""
class _StandardStemmer(_LanguageSpecificStemmer):
    """
    This subclass encapsulates two methods for defining the standard versions
    of the string regions R1, R2, and RV.
    """

    def _r1r2_standard(self, word, vowels):
        """
        Return the standard interpretations of the string regions R1 and R2.

        R1 is the region after the first non-vowel following a vowel,
        or is the null region at the end of the word if there is no
        such non-vowel.

        R2 is the region after the first non-vowel following a vowel
        in R1, or is the null region at the end of the word if there
        is no such non-vowel.

        :param word: The word whose regions R1 and R2 are determined.
        :type word: str or unicode
        :param vowels: The vowels of the respective language that are
                       used to determine the regions R1 and R2.
        :type vowels: unicode
        :return: (r1,r2), the regions R1 and R2 for the respective word.
        :rtype: tuple
        :note: This helper method is invoked by the respective stem method of
               the subclasses DutchStemmer, FinnishStemmer,
               FrenchStemmer, GermanStemmer, ItalianStemmer,
               PortugueseStemmer, RomanianStemmer, and SpanishStemmer.
               It is not to be invoked directly!
        :note: A detailed description of how to define R1 and R2
               can be found at http://snowball.tartarus.org/texts/r1r2.html
        """

        def region_after_vowel_consonant(text):
            # Text following the first non-vowel that comes directly after a
            # vowel, or "" when there is no such pair.
            for pos in range(1, len(text)):
                if text[pos - 1] in vowels and text[pos] not in vowels:
                    return text[pos + 1 :]
            return ""

        # R2 is found by applying the same rule to R1.
        r1 = region_after_vowel_consonant(word)
        r2 = region_after_vowel_consonant(r1)
        return (r1, r2)

    def _rv_standard(self, word, vowels):
        """
        Return the standard interpretation of the string region RV.

        If the second letter is a consonant, RV is the region after the
        next following vowel. If the first two letters are vowels, RV is
        the region after the next following consonant. Otherwise, RV is
        the region after the third letter.

        :param word: The word whose region RV is determined.
        :type word: str or unicode
        :param vowels: The vowels of the respective language that are
                       used to determine the region RV.
        :type vowels: unicode
        :return: the region RV for the respective word.
        :rtype: unicode
        :note: This helper method is invoked by the respective stem method of
               the subclasses ItalianStemmer, PortugueseStemmer,
               RomanianStemmer, and SpanishStemmer. It is not to be
               invoked directly!
        """
        if len(word) < 2:
            return ""

        if word[1] not in vowels:
            # Second letter is a consonant: look for the next vowel.
            stop_on_vowel = True
        elif word[0] in vowels:
            # Both leading letters are vowels: look for the next consonant.
            stop_on_vowel = False
        else:
            # Consonant followed by vowel: RV starts after the third letter.
            return word[3:]

        for pos in range(2, len(word)):
            if (word[pos] in vowels) == stop_on_vowel:
                return word[pos + 1 :]
        return ""
class ArabicStemmer(_StandardStemmer):
    """
    The Snowball Arabic light stemmer.

    Original algorithm:
    https://github.com/snowballstem/snowball/blob/master/algorithms/arabic/stem_Unicode.sbl

    Algorithm:

       - Assem Chelli
       - Abdelkrim Aries
       - Lakhdar Benzahia

    NLTK Version Author:

       - Lakhdar Benzahia

    The ``is_*`` and ``*_success`` attributes below are per-word scratch
    state: ``stem`` resets them at the start of every call and the private
    step methods update them as they run.
    """

    # Normalize_pre steps
    # One contiguous range U+064B..U+0652. No hyphens between the code points:
    # inside a character class they would also match a literal "-".
    __vocalization = re.compile(r"[\u064b-\u0652]")  # ً، ٌ، ٍ، َ، ُ، ِ، ّ، ْ

    __kasheeda = re.compile(r"[\u0640]")  # ـ tatweel/kasheeda

    # Exactly the three marks listed, not a range between them.
    __arabic_punctuation_marks = re.compile(r"[\u060C\u061B\u061F]")  # ؛ ، ؟

    # Normalize_post
    __last_hamzat = ("\u0623", "\u0625", "\u0622", "\u0624", "\u0626")  # أ، إ، آ، ؤ، ئ

    # normalize other hamza's
    __initial_hamzat = re.compile(r"^[\u0622\u0623\u0625]")  # أ، إ، آ
    __waw_hamza = re.compile(r"[\u0624]")  # ؤ
    __yeh_hamza = re.compile(r"[\u0626]")  # ئ
    __alefat = re.compile(r"[\u0623\u0622\u0625]")  # أ، إ، آ

    # Checks
    __checks1 = (
        "\u0643\u0627\u0644",
        "\u0628\u0627\u0644",  # بال، كال
        "\u0627\u0644",
        "\u0644\u0644",  # لل، ال
    )
    __checks2 = ("\u0629", "\u0627\u062a")  # ة  # female plural ات

    # Suffixes
    __suffix_noun_step1a = (
        "\u064a",
        "\u0643",
        "\u0647",  # ي، ك، ه
        "\u0646\u0627",
        "\u0643\u0645",
        "\u0647\u0627",
        "\u0647\u0646",
        "\u0647\u0645",  # نا، كم، ها، هن، هم
        "\u0643\u0645\u0627",
        "\u0647\u0645\u0627",  # كما، هما
    )
    __suffix_noun_step1b = "\u0646"  # ن
    __suffix_noun_step2a = ("\u0627", "\u064a", "\u0648")  # ا، ي، و
    __suffix_noun_step2b = "\u0627\u062a"  # ات
    __suffix_noun_step2c1 = "\u062a"  # ت
    __suffix_noun_step2c2 = "\u0629"  # ة
    __suffix_noun_step3 = "\u064a"  # ي
    __suffix_verb_step1 = (
        "\u0647",
        "\u0643",  # ه، ك
        "\u0646\u064a",
        "\u0646\u0627",
        "\u0647\u0627",
        "\u0647\u0645",  # ني، نا، ها، هم
        "\u0647\u0646",
        "\u0643\u0645",
        "\u0643\u0646",  # هن، كم، كن
        "\u0647\u0645\u0627",
        "\u0643\u0645\u0627",
        "\u0643\u0645\u0648",  # هما، كما، كمو
    )
    __suffix_verb_step2a = (
        "\u062a",
        "\u0627",
        "\u0646",
        "\u064a",  # ت، ا، ن، ي
        "\u0646\u0627",
        "\u062a\u0627",
        "\u062a\u0646",  # نا، تا، تن Past
        "\u0627\u0646",
        "\u0648\u0646",
        "\u064a\u0646",  # ان، ون، ين Present
        "\u062a\u0645\u0627",  # تما
    )
    __suffix_verb_step2b = ("\u0648\u0627", "\u062a\u0645")  # وا، تم
    __suffix_verb_step2c = ("\u0648", "\u062a\u0645\u0648")  # و  # تمو
    __suffix_all_alef_maqsura = "\u0649"  # ى

    # Prefixes
    __prefix_step1 = (
        "\u0623",  # أ
        "\u0623\u0623",
        "\u0623\u0622",
        "\u0623\u0624",
        "\u0623\u0627",
        "\u0623\u0625",  # أأ، أآ، أؤ، أا، أإ
    )
    __prefix_step2a = ("\u0641\u0627\u0644", "\u0648\u0627\u0644")  # فال، وال
    __prefix_step2b = ("\u0641", "\u0648")  # ف، و
    __prefix_step3a_noun = (
        "\u0627\u0644",
        "\u0644\u0644",  # لل، ال
        "\u0643\u0627\u0644",
        "\u0628\u0627\u0644",  # بال، كال
    )
    __prefix_step3b_noun = (
        "\u0628",
        "\u0643",
        "\u0644",  # ب، ك، ل
        "\u0628\u0628",
        "\u0643\u0643",  # بب، كك
    )
    __prefix_step3_verb = (
        "\u0633\u064a",
        "\u0633\u062a",
        "\u0633\u0646",
        "\u0633\u0623",
    )  # سي، ست، سن، سأ
    __prefix_step4_verb = (
        "\u064a\u0633\u062a",
        "\u0646\u0633\u062a",
        "\u062a\u0633\u062a",
    )  # يست، نست، تست

    # Suffixes added due to Conjugation Verbs
    __conjugation_suffix_verb_1 = ("\u0647", "\u0643")  # ه، ك
    __conjugation_suffix_verb_2 = (
        "\u0646\u064a",
        "\u0646\u0627",
        "\u0647\u0627",  # ني، نا، ها
        "\u0647\u0645",
        "\u0647\u0646",
        "\u0643\u0645",  # هم، هن، كم
        "\u0643\u0646",  # كن
    )
    __conjugation_suffix_verb_3 = (
        "\u0647\u0645\u0627",
        "\u0643\u0645\u0627",
        "\u0643\u0645\u0648",
    )  # هما، كما، كمو
    __conjugation_suffix_verb_4 = ("\u0627", "\u0646", "\u064a")  # ا، ن، ي
    __conjugation_suffix_verb_past = (
        "\u0646\u0627",
        "\u062a\u0627",
        "\u062a\u0646",
    )  # نا، تا، تن
    __conjugation_suffix_verb_present = (
        "\u0627\u0646",
        "\u0648\u0646",
        "\u064a\u0646",
    )  # ان، ون، ين

    # Suffixes added due to derivation Names
    __conjugation_suffix_noun_1 = ("\u064a", "\u0643", "\u0647")  # ي، ك، ه
    __conjugation_suffix_noun_2 = (
        "\u0646\u0627",
        "\u0643\u0645",  # نا، كم
        "\u0647\u0627",
        "\u0647\u0646",
        "\u0647\u0645",  # ها، هن، هم
    )
    __conjugation_suffix_noun_3 = (
        "\u0643\u0645\u0627",
        "\u0647\u0645\u0627",
    )  # كما، هما

    # Prefixes added due to derivation Names
    __prefixes1 = ("\u0648\u0627", "\u0641\u0627")  # فا، وا

    __articles_3len = ("\u0643\u0627\u0644", "\u0628\u0627\u0644")  # بال كال

    __articles_2len = ("\u0627\u0644", "\u0644\u0644")  # ال لل

    # Prepositions letters
    __prepositions1 = ("\u0643", "\u0644")  # ك، ل
    __prepositions2 = ("\u0628\u0628", "\u0643\u0643")  # بب، كك

    # Class-level defaults for the per-word state; ``stem`` overwrites them
    # on the instance at the start of each call.
    is_verb = True
    is_noun = True
    is_defined = False

    suffixes_verb_step1_success = False
    suffix_verb_step2a_success = False
    suffix_verb_step2b_success = False
    suffix_noun_step2c2_success = False
    suffix_noun_step1a_success = False
    suffix_noun_step2a_success = False
    suffix_noun_step2b_success = False
    suffixe_noun_step1b_success = False

    prefix_step2a_success = False
    prefix_step3a_noun_success = False
    prefix_step3b_noun_success = False

    def __normalize_pre(self, token):
        """
        Remove diacritics, tatweel and Arabic punctuation marks from ``token``.

        :param token: string
        :return: normalized token type string
        """
        # strip diacritics
        token = self.__vocalization.sub("", token)
        # strip kasheeda
        token = self.__kasheeda.sub("", token)
        # strip punctuation marks
        token = self.__arabic_punctuation_marks.sub("", token)
        return token

    def __normalize_post(self, token):
        """
        Normalize the hamza forms that remain after stemming.

        :param token: string
        :return: normalized token type string
        """
        # normalize last hamza
        for hamza in self.__last_hamzat:
            if token.endswith(hamza):
                token = suffix_replace(token, hamza, "\u0621")
                break
        # normalize other hamzat
        token = self.__initial_hamzat.sub("\u0627", token)
        token = self.__waw_hamza.sub("\u0648", token)
        token = self.__yeh_hamza.sub("\u064a", token)
        token = self.__alefat.sub("\u0627", token)
        return token

    def __checks_1(self, token):
        """Mark the word as a defined noun when it starts with an article."""
        for prefix in self.__checks1:
            if token.startswith(prefix):
                if prefix in self.__articles_3len and len(token) > 4:
                    self.is_noun = True
                    self.is_verb = False
                    self.is_defined = True
                    break

                if prefix in self.__articles_2len and len(token) > 3:
                    self.is_noun = True
                    self.is_verb = False
                    self.is_defined = True
                    break

    def __checks_2(self, token):
        """Mark the word as a noun when it carries a feminine ending."""
        for suffix in self.__checks2:
            if token.endswith(suffix):
                if suffix == "\u0629" and len(token) > 2:
                    self.is_noun = True
                    self.is_verb = False
                    break

                if suffix == "\u0627\u062a" and len(token) > 3:
                    self.is_noun = True
                    self.is_verb = False
                    break

    def __Suffix_Verb_Step1(self, token):
        """Strip an attached object pronoun (one to three letters) from a verb."""
        for suffix in self.__suffix_verb_step1:
            if token.endswith(suffix):
                if suffix in self.__conjugation_suffix_verb_1 and len(token) >= 4:
                    token = token[:-1]
                    self.suffixes_verb_step1_success = True
                    break

                if suffix in self.__conjugation_suffix_verb_2 and len(token) >= 5:
                    token = token[:-2]
                    self.suffixes_verb_step1_success = True
                    break

                if suffix in self.__conjugation_suffix_verb_3 and len(token) >= 6:
                    token = token[:-3]
                    self.suffixes_verb_step1_success = True
                    break
        return token

    def __Suffix_Verb_Step2a(self, token):
        """Strip a verb conjugation suffix."""
        # NOTE(review): the one-letter suffixes come first in the tuple, and
        # every multi-letter suffix here ends in one of them, so the longer
        # branches below are never reached. Confirm whether longest-match
        # was intended before reordering.
        for suffix in self.__suffix_verb_step2a:
            if token.endswith(suffix) and len(token) > 3:
                if suffix == "\u062a" and len(token) >= 4:
                    token = token[:-1]
                    self.suffix_verb_step2a_success = True
                    break

                if suffix in self.__conjugation_suffix_verb_4 and len(token) >= 4:
                    token = token[:-1]
                    self.suffix_verb_step2a_success = True
                    break

                if suffix in self.__conjugation_suffix_verb_past and len(token) >= 5:
                    token = token[:-2]  # past
                    self.suffix_verb_step2a_success = True
                    break

                if suffix in self.__conjugation_suffix_verb_present and len(token) > 5:
                    token = token[:-2]  # present
                    self.suffix_verb_step2a_success = True
                    break

                if suffix == "\u062a\u0645\u0627" and len(token) >= 6:
                    token = token[:-3]
                    self.suffix_verb_step2a_success = True
                    break
        return token

    def __Suffix_Verb_Step2c(self, token):
        """Strip the verb endings listed in ``__suffix_verb_step2c``."""
        # NOTE(review): the one-letter suffix is tried first and the
        # three-letter one ends with it, so the three-letter branch is never
        # reached. Confirm intent before reordering.
        for suffix in self.__suffix_verb_step2c:
            if token.endswith(suffix):
                if suffix == "\u062a\u0645\u0648" and len(token) >= 6:
                    token = token[:-3]
                    break
                if suffix == "\u0648" and len(token) >= 4:
                    token = token[:-1]
                    break
        return token

    def __Suffix_Verb_Step2b(self, token):
        """Strip a two-letter verb ending from words of five or more letters."""
        for suffix in self.__suffix_verb_step2b:
            if token.endswith(suffix) and len(token) >= 5:
                token = token[:-2]
                self.suffix_verb_step2b_success = True
                break
        return token

    def __Suffix_Noun_Step2c2(self, token):
        """Strip a final teh marbuta from words of three or more letters."""
        if token.endswith(self.__suffix_noun_step2c2) and len(token) >= 3:
            token = token[:-1]
            self.suffix_noun_step2c2_success = True
        return token

    def __Suffix_Noun_Step1a(self, token):
        """Strip an attached possessive pronoun (one to three letters) from a noun."""
        for suffix in self.__suffix_noun_step1a:
            if token.endswith(suffix):
                if suffix in self.__conjugation_suffix_noun_1 and len(token) >= 4:
                    token = token[:-1]
                    self.suffix_noun_step1a_success = True
                    break

                if suffix in self.__conjugation_suffix_noun_2 and len(token) >= 5:
                    token = token[:-2]
                    self.suffix_noun_step1a_success = True
                    break

                if suffix in self.__conjugation_suffix_noun_3 and len(token) >= 6:
                    token = token[:-3]
                    self.suffix_noun_step1a_success = True
                    break
        return token

    def __Suffix_Noun_Step2a(self, token):
        """Strip a final alef, yeh or waw from words longer than four letters."""
        for suffix in self.__suffix_noun_step2a:
            if token.endswith(suffix) and len(token) > 4:
                token = token[:-1]
                self.suffix_noun_step2a_success = True
                break
        return token

    def __Suffix_Noun_Step2b(self, token):
        """Strip the two-letter ending alef+teh from words of five or more letters."""
        # The suffix is one two-letter string and must be matched whole.
        # Iterating over it would test each letter on its own and remove two
        # letters from any word ending in just one of them.
        if token.endswith(self.__suffix_noun_step2b) and len(token) >= 5:
            token = token[:-2]
            self.suffix_noun_step2b_success = True
        return token

    def __Suffix_Noun_Step2c1(self, token):
        """Strip a final teh from words of four or more letters."""
        if token.endswith(self.__suffix_noun_step2c1) and len(token) >= 4:
            token = token[:-1]
        return token

    def __Suffix_Noun_Step1b(self, token):
        """Strip a final noon from words longer than five letters."""
        if token.endswith(self.__suffix_noun_step1b) and len(token) > 5:
            token = token[:-1]
            self.suffixe_noun_step1b_success = True
        return token

    def __Suffix_Noun_Step3(self, token):
        """Strip a final yeh from words of three or more letters."""
        if token.endswith(self.__suffix_noun_step3) and len(token) >= 3:
            token = token[:-1]  # ya' nisbiya
        return token

    def __Suffix_All_alef_maqsura(self, token):
        """Replace a final alef maqsura with yeh."""
        suffix = self.__suffix_all_alef_maqsura
        if token.endswith(suffix):
            token = suffix_replace(token, suffix, "\u064a")
        return token

    def __Prefix_Step1(self, token):
        """Collapse a doubled leading hamza form onto its second letter."""
        if len(token) > 3:
            for prefix in self.__prefix_step1:
                # Only the two-letter prefixes have a replacement; each one
                # is replaced by its own second letter. The lone one-letter
                # entry never triggers a change.
                if len(prefix) == 2 and token.startswith(prefix):
                    token = prefix_replace(token, prefix, prefix[1])
                    break
        return token

    def __Prefix_Step2a(self, token):
        """Strip a conjunction+article prefix from words longer than five letters."""
        for prefix in self.__prefix_step2a:
            if token.startswith(prefix) and len(token) > 5:
                token = token[len(prefix) :]
                self.prefix_step2a_success = True
                break
        return token

    def __Prefix_Step2b(self, token):
        """Strip a leading one-letter conjunction unless alef follows it."""
        for prefix in self.__prefix_step2b:
            if token.startswith(prefix) and len(token) > 3:
                if token[:2] not in self.__prefixes1:
                    token = token[len(prefix) :]
                    break
        return token

    def __Prefix_Step3a_Noun(self, token):
        """Strip a leading definite article (two- or three-letter form)."""
        for prefix in self.__prefix_step3a_noun:
            if token.startswith(prefix):
                if prefix in self.__articles_2len and len(token) > 4:
                    token = token[len(prefix) :]
                    self.prefix_step3a_noun_success = True
                    break
                # The three-letter branch does not set the success flag.
                if prefix in self.__articles_3len and len(token) > 5:
                    token = token[len(prefix) :]
                    break
        return token

    def __Prefix_Step3b_Noun(self, token):
        """Strip or shorten a leading preposition letter on a noun."""
        # NOTE(review): the one-letter prefixes are tried before the doubled
        # ones, so a doubled prefix is normally handled by its one-letter
        # form first. Confirm intent before reordering.
        for prefix in self.__prefix_step3b_noun:
            if token.startswith(prefix):
                if len(token) > 3:
                    if prefix == "\u0628":
                        token = token[len(prefix) :]
                        self.prefix_step3b_noun_success = True
                        break

                    if prefix in self.__prepositions2:
                        token = prefix_replace(token, prefix, prefix[1])
                        self.prefix_step3b_noun_success = True
                        break

                if prefix in self.__prepositions1 and len(token) > 4:
                    token = token[len(prefix) :]  # BUG: cause confusion
                    self.prefix_step3b_noun_success = True
                    break
        return token

    def __Prefix_Step3_Verb(self, token):
        """Drop the leading seen from a two-letter verb prefix."""
        for prefix in self.__prefix_step3_verb:
            if token.startswith(prefix) and len(token) > 4:
                token = prefix_replace(token, prefix, prefix[1])
                break
        return token

    def __Prefix_Step4_Verb(self, token):
        """Rewrite a three-letter verb prefix to alef+seen+teh and mark a verb."""
        for prefix in self.__prefix_step4_verb:
            if token.startswith(prefix) and len(token) > 4:
                token = prefix_replace(token, prefix, "\u0627\u0633\u062a")
                self.is_verb = True
                self.is_noun = False
                break
        return token

    def stem(self, word):
        """
        Stem an Arabic word and return the stemmed form.

        Stopwords and words of at most two letters (after pre-normalization)
        are returned without further stemming.

        :param word: string
        :return: string
        """
        # set initial values
        self.is_verb = True
        self.is_noun = True
        self.is_defined = False

        # Every success flag is cleared, so the outcome of a previous call on
        # this instance cannot steer the branches taken for this word.
        self.suffixes_verb_step1_success = False
        self.suffix_verb_step2a_success = False
        self.suffix_verb_step2b_success = False
        self.suffix_noun_step2c2_success = False
        self.suffix_noun_step1a_success = False
        self.suffix_noun_step2a_success = False
        self.suffix_noun_step2b_success = False
        self.suffixe_noun_step1b_success = False

        self.prefix_step2a_success = False
        self.prefix_step3a_noun_success = False
        self.prefix_step3b_noun_success = False

        modified_word = word
        # guess type and properties
        # checks1
        self.__checks_1(modified_word)
        # checks2
        self.__checks_2(modified_word)
        # Pre_Normalization
        modified_word = self.__normalize_pre(modified_word)
        # Avoid stopwords
        if modified_word in self.stopwords or len(modified_word) <= 2:
            return modified_word
        # Start stemming
        if self.is_verb:
            modified_word = self.__Suffix_Verb_Step1(modified_word)
            if self.suffixes_verb_step1_success:
                modified_word = self.__Suffix_Verb_Step2a(modified_word)
                if not self.suffix_verb_step2a_success:
                    modified_word = self.__Suffix_Verb_Step2c(modified_word)
                # or next TODO: How to deal with or next instruction
            else:
                modified_word = self.__Suffix_Verb_Step2b(modified_word)
                if not self.suffix_verb_step2b_success:
                    modified_word = self.__Suffix_Verb_Step2a(modified_word)
        if self.is_noun:
            modified_word = self.__Suffix_Noun_Step2c2(modified_word)
            if not self.suffix_noun_step2c2_success:
                if not self.is_defined:
                    modified_word = self.__Suffix_Noun_Step1a(modified_word)
                    # if self.suffix_noun_step1a_success:
                    modified_word = self.__Suffix_Noun_Step2a(modified_word)
                    if not self.suffix_noun_step2a_success:
                        modified_word = self.__Suffix_Noun_Step2b(modified_word)
                    if (
                        not self.suffix_noun_step2b_success
                        and not self.suffix_noun_step2a_success
                    ):
                        modified_word = self.__Suffix_Noun_Step2c1(modified_word)
                    # or next ? todo : how to deal with or next
                else:
                    modified_word = self.__Suffix_Noun_Step1b(modified_word)
                    if self.suffixe_noun_step1b_success:
                        modified_word = self.__Suffix_Noun_Step2a(modified_word)
                        if not self.suffix_noun_step2a_success:
                            modified_word = self.__Suffix_Noun_Step2b(modified_word)
                        if (
                            not self.suffix_noun_step2b_success
                            and not self.suffix_noun_step2a_success
                        ):
                            modified_word = self.__Suffix_Noun_Step2c1(modified_word)
                    else:
                        if not self.is_defined:
                            modified_word = self.__Suffix_Noun_Step2a(modified_word)
                        modified_word = self.__Suffix_Noun_Step2b(modified_word)
            modified_word = self.__Suffix_Noun_Step3(modified_word)
        if not self.is_noun and self.is_verb:
            modified_word = self.__Suffix_All_alef_maqsura(modified_word)

        # prefixes
        modified_word = self.__Prefix_Step1(modified_word)
        modified_word = self.__Prefix_Step2a(modified_word)
        if not self.prefix_step2a_success:
            modified_word = self.__Prefix_Step2b(modified_word)
        modified_word = self.__Prefix_Step3a_Noun(modified_word)
        if not self.prefix_step3a_noun_success and self.is_noun:
            modified_word = self.__Prefix_Step3b_Noun(modified_word)
        else:
            if not self.prefix_step3b_noun_success and self.is_verb:
                modified_word = self.__Prefix_Step3_Verb(modified_word)
                modified_word = self.__Prefix_Step4_Verb(modified_word)

        # post normalization stemming
        modified_word = self.__normalize_post(modified_word)
        stemmed_word = modified_word
        return stemmed_word
class DanishStemmer(_ScandinavianStemmer):
    """
    The Danish Snowball stemmer.

    :cvar __vowels: The Danish vowels.
    :type __vowels: unicode
    :cvar __consonants: The Danish consonants.
    :type __consonants: unicode
    :cvar __double_consonants: The Danish double consonants.
    :type __double_consonants: tuple
    :cvar __s_ending: Letters that may directly appear before a word final 's'.
    :type __s_ending: unicode
    :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm.
    :type __step1_suffixes: tuple
    :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm.
    :type __step2_suffixes: tuple
    :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm.
    :type __step3_suffixes: tuple
    :note: A detailed description of the Danish
           stemming algorithm can be found under
           http://snowball.tartarus.org/algorithms/danish/stemmer.html
    """

    # The language's vowels and other important characters are defined.
    __vowels = "aeiouy\xE6\xE5\xF8"
    __consonants = "bcdfghjklmnpqrstvwxz"
    __double_consonants = (
        "bb",
        "cc",
        "dd",
        "ff",
        "gg",
        "hh",
        "jj",
        "kk",
        "ll",
        "mm",
        "nn",
        "pp",
        "qq",
        "rr",
        "ss",
        "tt",
        "vv",
        "ww",
        "xx",
        "zz",
    )
    __s_ending = "abcdfghjklmnoprtvyz\xE5"

    # The different suffixes, divided into the algorithm's steps
    # and organized by length, are listed in tuples.
    __step1_suffixes = (
        "erendes",
        "erende",
        "hedens",
        "ethed",
        "erede",
        "heden",
        "heder",
        "endes",
        "ernes",
        "erens",
        "erets",
        "ered",
        "ende",
        "erne",
        "eren",
        "erer",
        "heds",
        "enes",
        "eres",
        "eret",
        "hed",
        "ene",
        "ere",
        "ens",
        "ers",
        "ets",
        "en",
        "er",
        "es",
        "et",
        "e",
        "s",
    )
    __step2_suffixes = ("gd", "dt", "gt", "kt")
    __step3_suffixes = ("elig", "l\xF8st", "lig", "els", "ig")

    def stem(self, word):
        """
        Stem a Danish word and return the stemmed form.

        :param word: The word that is stemmed.
        :type word: str or unicode
        :return: The stemmed form.
        :rtype: unicode
        """
        # Lower-case first; stopwords are handed back as they are.
        word = word.lower()
        if word in self.stopwords:
            return word

        # All suffix tests below look at R1 only; ``word`` and ``r1`` are
        # shortened together so they stay aligned.
        r1 = self._r1_scandinavian(word, self.__vowels)

        # STEP 1: the first listed suffix that ends R1 (longest come first).
        found = next((s for s in self.__step1_suffixes if r1.endswith(s)), None)
        if found == "s":
            # A bare 's' is only dropped after one of the permitted letters.
            if word[-2] in self.__s_ending:
                word, r1 = word[:-1], r1[:-1]
        elif found is not None:
            cut = len(found)
            word, r1 = word[:-cut], r1[:-cut]

        # STEP 2: drop the last letter of gd/dt/gt/kt.
        if r1.endswith(self.__step2_suffixes):
            word, r1 = word[:-1], r1[:-1]

        # STEP 3
        if r1.endswith("igst"):
            word, r1 = word[:-2], r1[:-2]

        found = next((s for s in self.__step3_suffixes if r1.endswith(s)), None)
        if found == "l\xF8st":
            # Only the final 't' is removed.
            word, r1 = word[:-1], r1[:-1]
        elif found is not None:
            cut = len(found)
            word, r1 = word[:-cut], r1[:-cut]
            # After the deletion, step 2 is applied once more.
            if r1.endswith(self.__step2_suffixes):
                word, r1 = word[:-1], r1[:-1]

        # STEP 4: Undouble
        if len(word) > 3 and word.endswith(self.__double_consonants):
            word = word[:-1]

        return word
class DutchStemmer(_StandardStemmer):
    """
    The Dutch Snowball stemmer.

    :cvar __vowels: The Dutch vowels.
    :type __vowels: unicode
    :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm.
    :type __step1_suffixes: tuple
    :cvar __step3b_suffixes: Suffixes to be deleted in step 3b of the algorithm.
    :type __step3b_suffixes: tuple
    :cvar __undouble_endings: Doubled consonants that are reduced to a single
                              consonant after a suffix has been removed.
    :type __undouble_endings: tuple
    :cvar __accent_table: Translation table mapping accented vowels to
                          their unaccented form.
    :type __accent_table: dict
    :note: A detailed description of the Dutch
           stemming algorithm can be found under
           http://snowball.tartarus.org/algorithms/dutch/stemmer.html
    """

    __vowels = "aeiouy\xE8"
    __step1_suffixes = ("heden", "ene", "en", "se", "s")
    __step3b_suffixes = ("baar", "lijk", "bar", "end", "ing", "ig")
    __undouble_endings = ("kk", "dd", "tt")
    __accent_table = str.maketrans(
        "\xE4\xE1\xEB\xE9\xED\xEF\xF6\xF3\xFC\xFA", "aaeeiioouu"
    )

    def stem(self, word):
        """
        Stem a Dutch word and return the stemmed form.

        The word is lower-cased first; stopwords are returned unchanged.
        The regions R1 and R2 are kept as strings and are shortened in
        parallel with the word whenever a suffix is removed.

        :param word: The word that is stemmed.
        :type word: str or unicode
        :return: The stemmed form.
        :rtype: unicode
        """
        word = word.lower()

        if word in self.stopwords:
            return word

        step2_success = False

        # Vowel accents are removed in a single pass.
        word = word.translate(self.__accent_table)

        # An initial 'y', a 'y' after a vowel,
        # and an 'i' between vowels is put into upper case.
        # As from now these are treated as consonants.
        if word.startswith("y"):
            word = "".join(("Y", word[1:]))

        for i in range(1, len(word)):
            if word[i - 1] in self.__vowels and word[i] == "y":
                word = "".join((word[:i], "Y", word[i + 1 :]))

        for i in range(1, len(word) - 1):
            if (
                word[i - 1] in self.__vowels
                and word[i] == "i"
                and word[i + 1] in self.__vowels
            ):
                word = "".join((word[:i], "I", word[i + 1 :]))

        r1, r2 = self._r1r2_standard(word, self.__vowels)

        # R1 is adjusted so that the region before it
        # contains at least 3 letters. The first vowel/non-vowel pair ends
        # at index i, so the unadjusted region before R1 has i + 1 letters.
        for i in range(1, len(word)):
            if word[i] not in self.__vowels and word[i - 1] in self.__vowels:
                if i + 1 < 3:
                    r1 = word[3:]
                break

        # STEP 1
        for suffix in self.__step1_suffixes:
            if r1.endswith(suffix):
                if suffix == "heden":
                    word = suffix_replace(word, suffix, "heid")
                    r1 = suffix_replace(r1, suffix, "heid")
                    if r2.endswith("heden"):
                        r2 = suffix_replace(r2, suffix, "heid")

                elif (
                    suffix in ("ene", "en")
                    and not word.endswith("heden")
                    and word[-len(suffix) - 1] not in self.__vowels
                    and word[-len(suffix) - 3 : -len(suffix)] != "gem"
                ):
                    word = word[: -len(suffix)]
                    r1 = r1[: -len(suffix)]
                    r2 = r2[: -len(suffix)]
                    if word.endswith(self.__undouble_endings):
                        word = word[:-1]
                        r1 = r1[:-1]
                        r2 = r2[:-1]

                elif (
                    suffix in ("se", "s")
                    and word[-len(suffix) - 1] not in self.__vowels
                    and word[-len(suffix) - 1] != "j"
                ):
                    word = word[: -len(suffix)]
                    r1 = r1[: -len(suffix)]
                    r2 = r2[: -len(suffix)]
                # Only the first (longest) suffix found in R1 is considered.
                break

        # STEP 2
        if r1.endswith("e") and word[-2] not in self.__vowels:
            # Remembered because step 3b only deletes 'bar' after this step
            # removed an 'e'.
            step2_success = True
            word = word[:-1]
            r1 = r1[:-1]
            r2 = r2[:-1]

            if word.endswith(self.__undouble_endings):
                word = word[:-1]
                r1 = r1[:-1]
                r2 = r2[:-1]

        # STEP 3a
        if r2.endswith("heid") and word[-5] != "c":
            word = word[:-4]
            r1 = r1[:-4]
            r2 = r2[:-4]

            if (
                r1.endswith("en")
                and word[-3] not in self.__vowels
                and word[-5:-2] != "gem"
            ):
                word = word[:-2]
                r1 = r1[:-2]
                r2 = r2[:-2]

                if word.endswith(self.__undouble_endings):
                    word = word[:-1]
                    r1 = r1[:-1]
                    r2 = r2[:-1]

        # STEP 3b: Derivational suffixes
        for suffix in self.__step3b_suffixes:
            if r2.endswith(suffix):
                if suffix in ("end", "ing"):
                    word = word[:-3]
                    r2 = r2[:-3]

                    if r2.endswith("ig") and word[-3] != "e":
                        word = word[:-2]
                    else:
                        if word.endswith(self.__undouble_endings):
                            word = word[:-1]

                elif suffix == "ig" and word[-3] != "e":
                    word = word[:-2]

                elif suffix == "lijk":
                    word = word[:-4]
                    r1 = r1[:-4]

                    # Step 2 is repeated on the shortened word.
                    if r1.endswith("e") and word[-2] not in self.__vowels:
                        word = word[:-1]
                        if word.endswith(self.__undouble_endings):
                            word = word[:-1]

                elif suffix == "baar":
                    word = word[:-4]

                elif suffix == "bar" and step2_success:
                    word = word[:-3]
                break

        # STEP 4: Undouble vowel
        # A final consonant (other than 'I') preceded by a doubled vowel,
        # itself preceded by a non-vowel, loses one of the two vowels.
        if len(word) >= 4:
            if word[-1] not in self.__vowels and word[-1] != "I":
                if word[-3:-1] in ("aa", "ee", "oo", "uu"):
                    if word[-4] not in self.__vowels:
                        word = "".join((word[:-3], word[-3], word[-1]))

        # All occurrences of 'I' and 'Y' are put back into lower case.
        word = word.replace("I", "i").replace("Y", "y")

        return word
class EnglishStemmer(_StandardStemmer):
    """
    The English Snowball stemmer.

    :cvar __vowels: The English vowels.
    :type __vowels: unicode
    :cvar __double_consonants: The English double consonants.
    :type __double_consonants: tuple
    :cvar __li_ending: Letters that may directly appear before a word final 'li'.
    :type __li_ending: unicode
    :cvar __step0_suffixes: Suffixes to be deleted in step 0 of the algorithm.
    :type __step0_suffixes: tuple
    :cvar __step1a_suffixes: Suffixes to be deleted in step 1a of the algorithm.
    :type __step1a_suffixes: tuple
    :cvar __step1b_suffixes: Suffixes to be deleted in step 1b of the algorithm.
    :type __step1b_suffixes: tuple
    :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm.
    :type __step2_suffixes: tuple
    :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm.
    :type __step3_suffixes: tuple
    :cvar __step4_suffixes: Suffixes to be deleted in step 4 of the algorithm.
    :type __step4_suffixes: tuple
    :cvar __step5_suffixes: Suffixes to be deleted in step 5 of the algorithm.
    :type __step5_suffixes: tuple
    :cvar __special_words: A dictionary containing words
                           which have to be stemmed specially.
    :type __special_words: dict
    :cvar __step2_rewrites: Step 2 suffixes that are replaced rather than
                            truncated, mapped to ``(replacement, short_r2)``
                            where ``short_r2`` is what remains of R2 when R2
                            was shorter than the suffix.
    :type __step2_rewrites: dict
    :cvar __step3_rewrites: The same mapping for step 3.
    :type __step3_rewrites: dict
    :note: A detailed description of the English
           stemming algorithm can be found under
           http://snowball.tartarus.org/algorithms/english/stemmer.html
    """

    __vowels = "aeiouy"
    __double_consonants = ("bb", "dd", "ff", "gg", "mm", "nn", "pp", "rr", "tt")
    __li_ending = "cdeghkmnrt"
    __step0_suffixes = ("'s'", "'s", "'")
    __step1a_suffixes = ("sses", "ied", "ies", "us", "ss", "s")
    __step1b_suffixes = ("eedly", "ingly", "edly", "eed", "ing", "ed")
    __step2_suffixes = (
        "ization",
        "ational",
        "fulness",
        "ousness",
        "iveness",
        "tional",
        "biliti",
        "lessli",
        "entli",
        "ation",
        "alism",
        "aliti",
        "ousli",
        "iviti",
        "fulli",
        "enci",
        "anci",
        "abli",
        "izer",
        "ator",
        "alli",
        "bli",
        "ogi",
        "li",
    )
    __step3_suffixes = (
        "ational",
        "tional",
        "alize",
        "icate",
        "iciti",
        "ative",
        "ical",
        "ness",
        "ful",
    )
    __step4_suffixes = (
        "ement",
        "ance",
        "ence",
        "able",
        "ible",
        "ment",
        "ant",
        "ent",
        "ism",
        "ate",
        "iti",
        "ous",
        "ive",
        "ize",
        "ion",
        "al",
        "er",
        "ic",
    )
    __step5_suffixes = ("e", "l")
    __special_words = {
        "skis": "ski",
        "skies": "sky",
        "dying": "die",
        "lying": "lie",
        "tying": "tie",
        "idly": "idl",
        "gently": "gentl",
        "ugly": "ugli",
        "early": "earli",
        "only": "onli",
        "singly": "singl",
        "sky": "sky",
        "news": "news",
        "howe": "howe",
        "atlas": "atlas",
        "cosmos": "cosmos",
        "bias": "bias",
        "andes": "andes",
        "inning": "inning",
        "innings": "inning",
        "outing": "outing",
        "outings": "outing",
        "canning": "canning",
        "cannings": "canning",
        "herring": "herring",
        "herrings": "herring",
        "earring": "earring",
        "earrings": "earring",
        "proceed": "proceed",
        "proceeds": "proceed",
        "proceeded": "proceed",
        "proceeding": "proceed",
        "exceed": "exceed",
        "exceeds": "exceed",
        "exceeded": "exceed",
        "exceeding": "exceed",
        "succeed": "succeed",
        "succeeds": "succeed",
        "succeeded": "succeed",
        "succeeding": "succeed",
    }

    # When R1 ends with one of these suffixes but R2 is shorter than the
    # suffix, R2 must begin right after the suffix's first vowel/non-vowel
    # pair. For replacements ending in '<vowel><consonant>e' ('ize', 'ate',
    # 'ive') exactly the final 'e' is left in R2; for the others nothing is.
    __step2_rewrites = {
        "izer": ("ize", "e"),
        "ization": ("ize", "e"),
        "ational": ("ate", "e"),
        "ation": ("ate", "e"),
        "ator": ("ate", "e"),
        "alism": ("al", ""),
        "aliti": ("al", ""),
        "alli": ("al", ""),
        "ousli": ("ous", ""),
        "ousness": ("ous", ""),
        "iveness": ("ive", "e"),
        "iviti": ("ive", "e"),
        "biliti": ("ble", ""),
        "bli": ("ble", ""),
    }
    __step3_rewrites = {
        "ational": ("ate", "e"),
        "icate": ("ic", ""),
        "iciti": ("ic", ""),
        "ical": ("ic", ""),
    }

    @staticmethod
    def __swap(region, suffix, replacement, fallback=""):
        """
        Replace the trailing ``suffix`` of a region string.

        :param region: The region (R1 or R2) as a string.
        :param suffix: The suffix that was matched at the end of the word.
        :param replacement: The text that takes the place of the suffix.
        :param fallback: Value returned when the region is shorter than
                         the suffix and therefore cannot contain it.
        :return: The updated region.
        """
        if len(region) >= len(suffix):
            return suffix_replace(region, suffix, replacement)
        return fallback

    def stem(self, word):
        """
        Stem an English word and return the stemmed form.

        The word is lower-cased first. Stopwords and words of at most two
        letters are returned unchanged, and the entries of the special-word
        table are returned directly. R1 and R2 are tracked as strings and
        kept in step with every change made to the end of the word.

        :param word: The word that is stemmed.
        :type word: str or unicode
        :return: The stemmed form.
        :rtype: unicode
        """
        word = word.lower()

        if word in self.stopwords or len(word) <= 2:
            return word

        elif word in self.__special_words:
            return self.__special_words[word]

        # Map the different apostrophe characters to a single consistent one
        word = (
            word.replace("\u2019", "\x27")
            .replace("\u2018", "\x27")
            .replace("\u201B", "\x27")
        )

        if word.startswith("\x27"):
            word = word[1:]

        # An initial 'y' and a 'y' after a vowel are marked as consonants.
        if word.startswith("y"):
            word = "".join(("Y", word[1:]))

        for i in range(1, len(word)):
            if word[i - 1] in self.__vowels and word[i] == "y":
                word = "".join((word[:i], "Y", word[i + 1 :]))

        r1 = ""
        r2 = ""

        # Words with these prefixes get a fixed R1 that starts right after
        # the prefix; R2 is then derived from that R1.
        if word.startswith(("gener", "commun", "arsen")):
            if word.startswith(("gener", "arsen")):
                r1 = word[5:]
            else:
                r1 = word[6:]

            for i in range(1, len(r1)):
                if r1[i] not in self.__vowels and r1[i - 1] in self.__vowels:
                    r2 = r1[i + 1 :]
                    break
        else:
            r1, r2 = self._r1r2_standard(word, self.__vowels)

        # STEP 0
        for suffix in self.__step0_suffixes:
            if word.endswith(suffix):
                word = word[: -len(suffix)]
                r1 = r1[: -len(suffix)]
                r2 = r2[: -len(suffix)]
                break

        # STEP 1a
        for suffix in self.__step1a_suffixes:
            if word.endswith(suffix):
                if suffix == "sses":
                    word = word[:-2]
                    r1 = r1[:-2]
                    r2 = r2[:-2]

                elif suffix in ("ied", "ies"):
                    # 'i' is kept after more than one letter, 'ie' otherwise.
                    if len(word[: -len(suffix)]) > 1:
                        word = word[:-2]
                        r1 = r1[:-2]
                        r2 = r2[:-2]
                    else:
                        word = word[:-1]
                        r1 = r1[:-1]
                        r2 = r2[:-1]

                elif suffix == "s":
                    # The 's' goes only if a vowel occurs somewhere before
                    # the letter that immediately precedes it.
                    if any(letter in self.__vowels for letter in word[:-2]):
                        word = word[:-1]
                        r1 = r1[:-1]
                        r2 = r2[:-1]
                # 'us' and 'ss' are matched but left untouched.
                break

        # STEP 1b
        for suffix in self.__step1b_suffixes:
            if word.endswith(suffix):
                if suffix in ("eed", "eedly"):
                    if r1.endswith(suffix):
                        word = suffix_replace(word, suffix, "ee")
                        r1 = suffix_replace(r1, suffix, "ee")
                        r2 = self.__swap(r2, suffix, "ee")
                else:
                    stem_has_vowel = any(
                        letter in self.__vowels for letter in word[: -len(suffix)]
                    )

                    if stem_has_vowel:
                        word = word[: -len(suffix)]
                        r1 = r1[: -len(suffix)]
                        r2 = r2[: -len(suffix)]

                        if word.endswith(("at", "bl", "iz")):
                            word = "".join((word, "e"))
                            r1 = "".join((r1, "e"))

                            if len(word) > 5 or len(r1) >= 3:
                                r2 = "".join((r2, "e"))

                        elif word.endswith(self.__double_consonants):
                            word = word[:-1]
                            r1 = r1[:-1]
                            r2 = r2[:-1]

                        elif (
                            r1 == ""
                            and len(word) >= 3
                            and word[-1] not in self.__vowels
                            and word[-1] not in "wxY"
                            and word[-2] in self.__vowels
                            and word[-3] not in self.__vowels
                        ) or (
                            r1 == ""
                            and len(word) == 2
                            and word[0] in self.__vowels
                            and word[1] not in self.__vowels
                        ):
                            # The word is short: restore a final 'e'.
                            word = "".join((word, "e"))

                            if len(r1) > 0:
                                r1 = "".join((r1, "e"))

                            if len(r2) > 0:
                                r2 = "".join((r2, "e"))
                break

        # STEP 1c
        if len(word) > 2 and word[-1] in "yY" and word[-2] not in self.__vowels:
            word = "".join((word[:-1], "i"))
            r1 = self.__swap(r1, "y", "i")
            r2 = self.__swap(r2, "y", "i")

        # STEP 2
        for suffix in self.__step2_suffixes:
            if word.endswith(suffix):
                if r1.endswith(suffix):
                    if suffix in self.__step2_rewrites:
                        replacement, short_r2 = self.__step2_rewrites[suffix]
                        word = suffix_replace(word, suffix, replacement)
                        r1 = suffix_replace(r1, suffix, replacement)
                        r2 = self.__swap(r2, suffix, replacement, short_r2)

                    elif suffix in ("enci", "anci", "abli"):
                        # Only the final 'i' changes, to 'e'.
                        word = "".join((word[:-1], "e"))
                        r1 = "".join((r1[:-1], "e"))
                        r2 = self.__swap(r2, "i", "e")

                    elif suffix in ("tional", "entli", "fulli", "lessli"):
                        word = word[:-2]
                        r1 = r1[:-2]
                        r2 = r2[:-2]

                    elif suffix == "fulness":
                        word = word[:-4]
                        r1 = r1[:-4]
                        r2 = r2[:-4]

                    elif suffix == "ogi":
                        if word[-4] == "l":
                            word = word[:-1]
                            r1 = r1[:-1]
                            r2 = r2[:-1]

                    elif suffix == "li":
                        if word[-3] in self.__li_ending:
                            word = word[:-2]
                            r1 = r1[:-2]
                            r2 = r2[:-2]
                break

        # STEP 3
        for suffix in self.__step3_suffixes:
            if word.endswith(suffix):
                if r1.endswith(suffix):
                    if suffix in self.__step3_rewrites:
                        replacement, short_r2 = self.__step3_rewrites[suffix]
                        word = suffix_replace(word, suffix, replacement)
                        r1 = suffix_replace(r1, suffix, replacement)
                        r2 = self.__swap(r2, suffix, replacement, short_r2)

                    elif suffix == "tional":
                        word = word[:-2]
                        r1 = r1[:-2]
                        r2 = r2[:-2]

                    elif suffix == "alize":
                        word = word[:-3]
                        r1 = r1[:-3]
                        r2 = r2[:-3]

                    elif suffix in ("ful", "ness"):
                        word = word[: -len(suffix)]
                        r1 = r1[: -len(suffix)]
                        r2 = r2[: -len(suffix)]

                    elif suffix == "ative" and r2.endswith(suffix):
                        word = word[:-5]
                        r1 = r1[:-5]
                        r2 = r2[:-5]
                break

        # STEP 4
        for suffix in self.__step4_suffixes:
            if word.endswith(suffix):
                if r2.endswith(suffix):
                    # 'ion' is removed only after 's' or 't'.
                    if suffix != "ion" or word[-4] in "st":
                        word = word[: -len(suffix)]
                        r1 = r1[: -len(suffix)]
                        r2 = r2[: -len(suffix)]
                break

        # STEP 5
        if r2.endswith("l") and word[-2] == "l":
            word = word[:-1]
        elif r2.endswith("e"):
            word = word[:-1]
        elif r1.endswith("e"):
            # In R1 the 'e' goes only if it does not follow a short syllable.
            if len(word) >= 4 and (
                word[-2] in self.__vowels
                or word[-2] in "wxY"
                or word[-3] not in self.__vowels
                or word[-4] in self.__vowels
            ):
                word = word[:-1]

        word = word.replace("Y", "y")

        return word
class FinnishStemmer(_StandardStemmer):
    """
    The Finnish Snowball stemmer.

    :cvar __vowels: The Finnish vowels.
    :type __vowels: unicode
    :cvar __restricted_vowels: A subset of the Finnish vowels.
    :type __restricted_vowels: unicode
    :cvar __long_vowels: The Finnish vowels in their long forms.
    :type __long_vowels: tuple
    :cvar __consonants: The Finnish consonants.
    :type __consonants: unicode
    :cvar __double_consonants: The Finnish double consonants.
    :type __double_consonants: tuple
    :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm.
    :type __step1_suffixes: tuple
    :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm.
    :type __step2_suffixes: tuple
    :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm.
    :type __step3_suffixes: tuple
    :cvar __step4_suffixes: Suffixes to be deleted in step 4 of the algorithm.
    :type __step4_suffixes: tuple
    :note: A detailed description of the Finnish
           stemming algorithm can be found under
           http://snowball.tartarus.org/algorithms/finnish/stemmer.html
    """

    __vowels = "aeiouy\xE4\xF6"
    __restricted_vowels = "aeiou\xE4\xF6"
    __long_vowels = ("aa", "ee", "ii", "oo", "uu", "\xE4\xE4", "\xF6\xF6")
    __consonants = "bcdfghjklmnpqrstvwxz"
    __double_consonants = (
        "bb",
        "cc",
        "dd",
        "ff",
        "gg",
        "hh",
        "jj",
        "kk",
        "ll",
        "mm",
        "nn",
        "pp",
        "qq",
        "rr",
        "ss",
        "tt",
        "vv",
        "ww",
        "xx",
        "zz",
    )
    __step1_suffixes = (
        "kaan",
        "k\xE4\xE4n",
        "sti",
        "kin",
        "han",
        "h\xE4n",
        "ko",
        "k\xF6",
        "pa",
        "p\xE4",
    )
    __step2_suffixes = ("nsa", "ns\xE4", "mme", "nne", "si", "ni", "an", "\xE4n", "en")
    __step3_suffixes = (
        "siin",
        "tten",
        "seen",
        "han",
        "hen",
        "hin",
        "hon",
        "h\xE4n",
        "h\xF6n",
        "den",
        "tta",
        "tt\xE4",
        "ssa",
        "ss\xE4",
        "sta",
        "st\xE4",
        "lla",
        "ll\xE4",
        "lta",
        "lt\xE4",
        "lle",
        "ksi",
        "ine",
        "ta",
        "t\xE4",
        "na",
        "n\xE4",
        "a",
        "\xE4",
        "n",
    )
    __step4_suffixes = (
        "impi",
        "impa",
        "imp\xE4",
        "immi",
        "imma",
        "imm\xE4",
        "mpi",
        "mpa",
        "mp\xE4",
        "mmi",
        "mma",
        "mm\xE4",
        "eja",
        "ej\xE4",
    )

    def stem(self, word):
        """
        Stem a Finnish word and return the stemmed form.

        The word is lower-cased first and stopwords are returned as they
        are. The regions R1 and R2 are held as strings and truncated
        together with the word as endings are stripped.

        :param word: The word that is stemmed.
        :type word: str or unicode
        :return: The stemmed form.
        :rtype: unicode
        """
        word = word.lower()

        if word in self.stopwords:
            return word

        # Step 5 depends on whether step 3 removed a case ending.
        case_removed = False

        r1, r2 = self._r1r2_standard(word, self.__vowels)

        # STEP 1: Particles etc.
        for particle in self.__step1_suffixes:
            if not r1.endswith(particle):
                continue

            size = len(particle)
            if particle == "sti":
                if particle in r2:
                    word, r1, r2 = word[:-3], r1[:-3], r2[:-3]
            elif word[-size - 1] in "ntaeiouy\xE4\xF6":
                word, r1, r2 = word[:-size], r1[:-size], r2[:-size]
            break

        # STEP 2: Possessives
        for possessive in self.__step2_suffixes:
            if not r1.endswith(possessive):
                continue

            if possessive == "si":
                if word[-3] != "k":
                    word, r1, r2 = word[:-2], r1[:-2], r2[:-2]

            elif possessive == "ni":
                word, r1, r2 = word[:-2], r1[:-2], r2[:-2]

                # A remaining 'kse' is turned into 'ksi' wherever it ends
                # the word or one of its regions.
                if word.endswith("kse"):
                    word = suffix_replace(word, "kse", "ksi")

                if r1.endswith("kse"):
                    r1 = suffix_replace(r1, "kse", "ksi")

                if r2.endswith("kse"):
                    r2 = suffix_replace(r2, "kse", "ksi")

            elif possessive == "an":
                short_before = word[-4:-2] in ("ta", "na")
                if short_before or word[-5:-2] in ("ssa", "sta", "lla", "lta"):
                    word, r1, r2 = word[:-2], r1[:-2], r2[:-2]

            elif possessive == "\xE4n":
                short_before = word[-4:-2] in ("t\xE4", "n\xE4")
                if short_before or word[-5:-2] in (
                    "ss\xE4",
                    "st\xE4",
                    "ll\xE4",
                    "lt\xE4",
                ):
                    word, r1, r2 = word[:-2], r1[:-2], r2[:-2]

            elif possessive == "en":
                if word[-5:-2] in ("lle", "ine"):
                    word, r1, r2 = word[:-2], r1[:-2], r2[:-2]

            else:
                # 'nsa', 'ns\xE4', 'mme' and 'nne' are always removed.
                word, r1, r2 = word[:-3], r1[:-3], r2[:-3]
            break

        # STEP 3: Cases
        for case in self.__step3_suffixes:
            if not r1.endswith(case):
                continue

            size = len(case)

            if case in ("han", "hen", "hin", "hon", "h\xE4n", "h\xF6n"):
                # Removed only when preceded by the vowel it contains.
                if word[-4] == case[1]:
                    word, r1, r2 = word[:-3], r1[:-3], r2[:-3]
                    case_removed = True

            elif case in ("siin", "den", "tten"):
                if (
                    word[-size - 1] == "i"
                    and word[-size - 2] in self.__restricted_vowels
                ):
                    word, r1, r2 = word[:-size], r1[:-size], r2[:-size]
                    case_removed = True
                else:
                    # Try the remaining, shorter endings instead.
                    continue

            elif case == "seen":
                if word[-6:-4] in self.__long_vowels:
                    word, r1, r2 = word[:-4], r1[:-4], r2[:-4]
                    case_removed = True
                else:
                    continue

            elif case in ("a", "\xE4"):
                if word[-2] in self.__vowels and word[-3] in self.__consonants:
                    word, r1, r2 = word[:-1], r1[:-1], r2[:-1]
                    case_removed = True

            elif case in ("tta", "tt\xE4"):
                if word[-4] == "e":
                    word, r1, r2 = word[:-3], r1[:-3], r2[:-3]
                    case_removed = True

            elif case == "n":
                word, r1, r2 = word[:-1], r1[:-1], r2[:-1]
                case_removed = True

                # A long vowel or 'ie' left at the end loses its last letter.
                if word[-2:] == "ie" or word[-2:] in self.__long_vowels:
                    word, r1, r2 = word[:-1], r1[:-1], r2[:-1]

            else:
                word, r1, r2 = word[:-size], r1[:-size], r2[:-size]
                case_removed = True
            break

        # STEP 4: Other endings
        for ending in self.__step4_suffixes:
            if not r2.endswith(ending):
                continue

            if ending in ("mpi", "mpa", "mp\xE4", "mmi", "mma", "mm\xE4"):
                if word[-5:-3] != "po":
                    word, r1, r2 = word[:-3], r1[:-3], r2[:-3]
            else:
                size = len(ending)
                word, r1, r2 = word[:-size], r1[:-size], r2[:-size]
            break

        # STEP 5: Plurals
        if case_removed and len(r1) >= 1 and r1[-1] in "ij":
            word, r1 = word[:-1], r1[:-1]

        elif (
            not case_removed
            and len(r1) >= 2
            and r1[-1] == "t"
            and r1[-2] in self.__vowels
        ):
            word, r1, r2 = word[:-1], r1[:-1], r2[:-1]

            if r2.endswith("imma"):
                word, r1 = word[:-4], r1[:-4]
            elif r2.endswith("mma") and r2[-5:-3] != "po":
                word, r1 = word[:-3], r1[:-3]

        # STEP 6: Tidying up
        if r1[-2:] in self.__long_vowels:
            word, r1 = word[:-1], r1[:-1]

        if len(r1) >= 2 and r1[-2] in self.__consonants and r1[-1] in "a\xE4ei":
            word, r1 = word[:-1], r1[:-1]

        if r1.endswith(("oj", "uj")):
            word, r1 = word[:-1], r1[:-1]

        if r1.endswith("jo"):
            word, r1 = word[:-1], r1[:-1]

        # If the word ends with a double consonant
        # followed by zero or more vowels, the last consonant is removed.
        # The scan runs backwards and never looks at the first letter.
        for back in range(1, len(word)):
            if word[-back] in self.__vowels:
                continue

            if back == 1:
                # The slice must be open-ended here because -back + 1 is 0.
                if word[-2:] in self.__double_consonants:
                    word = word[:-1]
            elif word[-back - 1 : -back + 1] in self.__double_consonants:
                word = "".join((word[:-back], word[-back + 1 :]))
            break

        return word
class FrenchStemmer(_StandardStemmer): | |
""" | |
The French Snowball stemmer. | |
:cvar __vowels: The French vowels. | |
:type __vowels: unicode | |
:cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm. | |
:type __step1_suffixes: tuple | |
:cvar __step2a_suffixes: Suffixes to be deleted in step 2a of the algorithm. | |
:type __step2a_suffixes: tuple | |
:cvar __step2b_suffixes: Suffixes to be deleted in step 2b of the algorithm. | |
:type __step2b_suffixes: tuple | |
:cvar __step4_suffixes: Suffixes to be deleted in step 4 of the algorithm. | |
:type __step4_suffixes: tuple | |
:note: A detailed description of the French | |
stemming algorithm can be found under | |
http://snowball.tartarus.org/algorithms/french/stemmer.html | |
""" | |
__vowels = "aeiouy\xE2\xE0\xEB\xE9\xEA\xE8\xEF\xEE\xF4\xFB\xF9" | |
__step1_suffixes = ( | |
"issements", | |
"issement", | |
"atrices", | |
"atrice", | |
"ateurs", | |
"ations", | |
"logies", | |
"usions", | |
"utions", | |
"ements", | |
"amment", | |
"emment", | |
"ances", | |
"iqUes", | |
"ismes", | |
"ables", | |
"istes", | |
"ateur", | |
"ation", | |
"logie", | |
"usion", | |
"ution", | |
"ences", | |
"ement", | |
"euses", | |
"ments", | |
"ance", | |
"iqUe", | |
"isme", | |
"able", | |
"iste", | |
"ence", | |
"it\xE9s", | |
"ives", | |
"eaux", | |
"euse", | |
"ment", | |
"eux", | |
"it\xE9", | |
"ive", | |
"ifs", | |
"aux", | |
"if", | |
) | |
__step2a_suffixes = ( | |
"issaIent", | |
"issantes", | |
"iraIent", | |
"issante", | |
"issants", | |
"issions", | |
"irions", | |
"issais", | |
"issait", | |
"issant", | |
"issent", | |
"issiez", | |
"issons", | |
"irais", | |
"irait", | |
"irent", | |
"iriez", | |
"irons", | |
"iront", | |
"isses", | |
"issez", | |
"\xEEmes", | |
"\xEEtes", | |
"irai", | |
"iras", | |
"irez", | |
"isse", | |
"ies", | |
"ira", | |
"\xEEt", | |
"ie", | |
"ir", | |
"is", | |
"it", | |
"i", | |
) | |
__step2b_suffixes = ( | |
"eraIent", | |
"assions", | |
"erions", | |
"assent", | |
"assiez", | |
"\xE8rent", | |
"erais", | |
"erait", | |
"eriez", | |
"erons", | |
"eront", | |
"aIent", | |
"antes", | |
"asses", | |
"ions", | |
"erai", | |
"eras", | |
"erez", | |
"\xE2mes", | |
"\xE2tes", | |
"ante", | |
"ants", | |
"asse", | |
"\xE9es", | |
"era", | |
"iez", | |
"ais", | |
"ait", | |
"ant", | |
"\xE9e", | |
"\xE9s", | |
"er", | |
"ez", | |
"\xE2t", | |
"ai", | |
"as", | |
"\xE9", | |
"a", | |
) | |
__step4_suffixes = ("i\xE8re", "I\xE8re", "ion", "ier", "Ier", "e", "\xEB") | |
def stem(self, word): | |
""" | |
Stem a French word and return the stemmed form. | |
:param word: The word that is stemmed. | |
:type word: str or unicode | |
:return: The stemmed form. | |
:rtype: unicode | |
""" | |
word = word.lower() | |
if word in self.stopwords: | |
return word | |
step1_success = False | |
rv_ending_found = False | |
step2a_success = False | |
step2b_success = False | |
# Every occurrence of 'u' after 'q' is put into upper case. | |
for i in range(1, len(word)): | |
if word[i - 1] == "q" and word[i] == "u": | |
word = "".join((word[:i], "U", word[i + 1 :])) | |
# Every occurrence of 'u' and 'i' | |
# between vowels is put into upper case. | |
# Every occurrence of 'y' preceded or | |
# followed by a vowel is also put into upper case. | |
for i in range(1, len(word) - 1): | |
if word[i - 1] in self.__vowels and word[i + 1] in self.__vowels: | |
if word[i] == "u": | |
word = "".join((word[:i], "U", word[i + 1 :])) | |
elif word[i] == "i": | |
word = "".join((word[:i], "I", word[i + 1 :])) | |
if word[i - 1] in self.__vowels or word[i + 1] in self.__vowels: | |
if word[i] == "y": | |
word = "".join((word[:i], "Y", word[i + 1 :])) | |
r1, r2 = self._r1r2_standard(word, self.__vowels) | |
rv = self.__rv_french(word, self.__vowels) | |
# STEP 1: Standard suffix removal | |
for suffix in self.__step1_suffixes: | |
if word.endswith(suffix): | |
if suffix == "eaux": | |
word = word[:-1] | |
step1_success = True | |
elif suffix in ("euse", "euses"): | |
if suffix in r2: | |
word = word[: -len(suffix)] | |
step1_success = True | |
elif suffix in r1: | |
word = suffix_replace(word, suffix, "eux") | |
step1_success = True | |
elif suffix in ("ement", "ements") and suffix in rv: | |
word = word[: -len(suffix)] | |
step1_success = True | |
if word[-2:] == "iv" and "iv" in r2: | |
word = word[:-2] | |
if word[-2:] == "at" and "at" in r2: | |
word = word[:-2] | |
elif word[-3:] == "eus": | |
if "eus" in r2: | |
word = word[:-3] | |
elif "eus" in r1: | |
word = "".join((word[:-1], "x")) | |
elif word[-3:] in ("abl", "iqU"): | |
if "abl" in r2 or "iqU" in r2: | |
word = word[:-3] | |
elif word[-3:] in ("i\xE8r", "I\xE8r"): | |
if "i\xE8r" in rv or "I\xE8r" in rv: | |
word = "".join((word[:-3], "i")) | |
elif suffix == "amment" and suffix in rv: | |
word = suffix_replace(word, "amment", "ant") | |
rv = suffix_replace(rv, "amment", "ant") | |
rv_ending_found = True | |
elif suffix == "emment" and suffix in rv: | |
word = suffix_replace(word, "emment", "ent") | |
rv_ending_found = True | |
elif ( | |
suffix in ("ment", "ments") | |
and suffix in rv | |
and not rv.startswith(suffix) | |
and rv[rv.rindex(suffix) - 1] in self.__vowels | |
): | |
word = word[: -len(suffix)] | |
rv = rv[: -len(suffix)] | |
rv_ending_found = True | |
elif suffix == "aux" and suffix in r1: | |
word = "".join((word[:-2], "l")) | |
step1_success = True | |
elif ( | |
suffix in ("issement", "issements") | |
and suffix in r1 | |
and word[-len(suffix) - 1] not in self.__vowels | |
): | |
word = word[: -len(suffix)] | |
step1_success = True | |
elif ( | |
suffix | |
in ( | |
"ance", | |
"iqUe", | |
"isme", | |
"able", | |
"iste", | |
"eux", | |
"ances", | |
"iqUes", | |
"ismes", | |
"ables", | |
"istes", | |
) | |
and suffix in r2 | |
): | |
word = word[: -len(suffix)] | |
step1_success = True | |
elif ( | |
suffix | |
in ("atrice", "ateur", "ation", "atrices", "ateurs", "ations") | |
and suffix in r2 | |
): | |
word = word[: -len(suffix)] | |
step1_success = True | |
if word[-2:] == "ic": | |
if "ic" in r2: | |
word = word[:-2] | |
else: | |
word = "".join((word[:-2], "iqU")) | |
elif suffix in ("logie", "logies") and suffix in r2: | |
word = suffix_replace(word, suffix, "log") | |
step1_success = True | |
elif suffix in ("usion", "ution", "usions", "utions") and suffix in r2: | |
word = suffix_replace(word, suffix, "u") | |
step1_success = True | |
elif suffix in ("ence", "ences") and suffix in r2: | |
word = suffix_replace(word, suffix, "ent") | |
step1_success = True | |
elif suffix in ("it\xE9", "it\xE9s") and suffix in r2: | |
word = word[: -len(suffix)] | |
step1_success = True | |
if word[-4:] == "abil": | |
if "abil" in r2: | |
word = word[:-4] | |
else: | |
word = "".join((word[:-2], "l")) | |
elif word[-2:] == "ic": | |
if "ic" in r2: | |
word = word[:-2] | |
else: | |
word = "".join((word[:-2], "iqU")) | |
elif word[-2:] == "iv": | |
if "iv" in r2: | |
word = word[:-2] | |
elif suffix in ("if", "ive", "ifs", "ives") and suffix in r2: | |
word = word[: -len(suffix)] | |
step1_success = True | |
if word[-2:] == "at" and "at" in r2: | |
word = word[:-2] | |
if word[-2:] == "ic": | |
if "ic" in r2: | |
word = word[:-2] | |
else: | |
word = "".join((word[:-2], "iqU")) | |
break | |
# STEP 2a: Verb suffixes beginning 'i' | |
if not step1_success or rv_ending_found: | |
for suffix in self.__step2a_suffixes: | |
if word.endswith(suffix): | |
if ( | |
suffix in rv | |
and len(rv) > len(suffix) | |
and rv[rv.rindex(suffix) - 1] not in self.__vowels | |
): | |
word = word[: -len(suffix)] | |
step2a_success = True | |
break | |
# STEP 2b: Other verb suffixes | |
if not step2a_success: | |
for suffix in self.__step2b_suffixes: | |
if rv.endswith(suffix): | |
if suffix == "ions" and "ions" in r2: | |
word = word[:-4] | |
step2b_success = True | |
elif suffix in ( | |
"eraIent", | |
"erions", | |
"\xE8rent", | |
"erais", | |
"erait", | |
"eriez", | |
"erons", | |
"eront", | |
"erai", | |
"eras", | |
"erez", | |
"\xE9es", | |
"era", | |
"iez", | |
"\xE9e", | |
"\xE9s", | |
"er", | |
"ez", | |
"\xE9", | |
): | |
word = word[: -len(suffix)] | |
step2b_success = True | |
elif suffix in ( | |
"assions", | |
"assent", | |
"assiez", | |
"aIent", | |
"antes", | |
"asses", | |
"\xE2mes", | |
"\xE2tes", | |
"ante", | |
"ants", | |
"asse", | |
"ais", | |
"ait", | |
"ant", | |
"\xE2t", | |
"ai", | |
"as", | |
"a", | |
): | |
word = word[: -len(suffix)] | |
rv = rv[: -len(suffix)] | |
step2b_success = True | |
if rv.endswith("e"): | |
word = word[:-1] | |
break | |
# STEP 3 | |
if step1_success or step2a_success or step2b_success: | |
if word[-1] == "Y": | |
word = "".join((word[:-1], "i")) | |
elif word[-1] == "\xE7": | |
word = "".join((word[:-1], "c")) | |
# STEP 4: Residual suffixes | |
else: | |
if len(word) >= 2 and word[-1] == "s" and word[-2] not in "aiou\xE8s": | |
word = word[:-1] | |
for suffix in self.__step4_suffixes: | |
if word.endswith(suffix): | |
if suffix in rv: | |
if suffix == "ion" and suffix in r2 and rv[-4] in "st": | |
word = word[:-3] | |
elif suffix in ("ier", "i\xE8re", "Ier", "I\xE8re"): | |
word = suffix_replace(word, suffix, "i") | |
elif suffix == "e": | |
word = word[:-1] | |
elif suffix == "\xEB" and word[-3:-1] == "gu": | |
word = word[:-1] | |
break | |
# STEP 5: Undouble | |
if word.endswith(("enn", "onn", "ett", "ell", "eill")): | |
word = word[:-1] | |
# STEP 6: Un-accent | |
for i in range(1, len(word)): | |
if word[-i] not in self.__vowels: | |
i += 1 | |
else: | |
if i != 1 and word[-i] in ("\xE9", "\xE8"): | |
word = "".join((word[:-i], "e", word[-i + 1 :])) | |
break | |
word = word.replace("I", "i").replace("U", "u").replace("Y", "y") | |
return word | |
def __rv_french(self, word, vowels):
    """
    Compute the region RV used by the French stemmer.

    RV is the part of the word after its third letter when the word
    starts with two vowels or with one of the prefixes u'par', u'col'
    or u'tap'. Otherwise RV is everything after the first vowel that
    is not the word's first letter. When no such position exists
    (including words shorter than two letters) RV is empty.

    :param word: The French word whose region RV is determined.
    :type word: str or unicode
    :param vowels: The French vowels that are used to determine
                   the region RV.
    :type vowels: unicode
    :return: the region RV for the respective French word.
    :rtype: unicode
    :note: This helper method is invoked by the stem method of
           the subclass FrenchStemmer. It is not to be invoked directly!

    """
    if len(word) < 2:
        return ""

    two_leading_vowels = word[0] in vowels and word[1] in vowels
    if two_leading_vowels or word.startswith(("par", "col", "tap")):
        return word[3:]

    # Skip the first letter: only a vowel in a later position opens RV.
    for position, letter in enumerate(word[1:], start=1):
        if letter in vowels:
            return word[position + 1 :]

    return ""
class GermanStemmer(_StandardStemmer):
    """
    Snowball stemmer for German.

    :cvar __vowels: The German vowels.
    :type __vowels: unicode
    :cvar __s_ending: Letters that may directly appear before a word final 's'.
    :type __s_ending: unicode
    :cvar __st_ending: Letter that may directly appear before a word final 'st'.
    :type __st_ending: unicode
    :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm.
    :type __step1_suffixes: tuple
    :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm.
    :type __step2_suffixes: tuple
    :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm.
    :type __step3_suffixes: tuple
    :note: A detailed description of the German
           stemming algorithm can be found under
           http://snowball.tartarus.org/algorithms/german/stemmer.html

    """

    __vowels = "aeiouy\xE4\xF6\xFC"
    __s_ending = "bdfghklmnrt"
    __st_ending = "bdfghklmnt"

    # Each tuple is scanned in order and only the first match is handled.
    __step1_suffixes = ("ern", "em", "er", "en", "es", "e", "s")
    __step2_suffixes = ("est", "en", "er", "st")
    __step3_suffixes = ("isch", "lich", "heit", "keit", "end", "ung", "ig", "ik")

    def stem(self, word):
        """
        Stem a German word and return the stemmed form.

        :param word: The word that is stemmed.
        :type word: str or unicode
        :return: The stemmed form.
        :rtype: unicode

        """
        word = word.lower()

        if word in self.stopwords:
            return word

        word = word.replace("\xDF", "ss")

        # 'u' and 'y' standing between two vowels are marked by upper
        # casing them. The scan is sequential on purpose: a letter that
        # was just upper-cased no longer counts as a vowel neighbour.
        letters = list(word)
        for pos in range(1, len(letters) - 1):
            if (
                letters[pos - 1] in self.__vowels
                and letters[pos + 1] in self.__vowels
            ):
                if letters[pos] == "u":
                    letters[pos] = "U"
                elif letters[pos] == "y":
                    letters[pos] = "Y"
        word = "".join(letters)

        r1, r2 = self._r1r2_standard(word, self.__vowels)

        # R1 must be preceded by at least three letters. Only the first
        # vowel/non-vowel boundary is inspected.
        for pos in range(1, len(word)):
            if word[pos - 1] in self.__vowels and word[pos] not in self.__vowels:
                if pos + 1 < 3:
                    r1 = word[3:]
                break

        # STEP 1
        for suffix in self.__step1_suffixes:
            if not r1.endswith(suffix):
                continue

            size = len(suffix)
            if suffix in ("en", "es", "e") and word[-size - 4 : -size] == "niss":
                # "niss" + ending: also drop the final 's' of "niss".
                cut = size + 1
            elif suffix == "s":
                # A lone 's' goes only after a valid s-ending letter.
                cut = 1 if word[-2] in self.__s_ending else 0
            else:
                cut = size

            if cut:
                word = word[:-cut]
                r1 = r1[:-cut]
                r2 = r2[:-cut]
            break

        # STEP 2
        for suffix in self.__step2_suffixes:
            if not r1.endswith(suffix):
                continue

            if suffix == "st":
                # 'st' needs a valid st-ending letter before it, itself
                # preceded by at least three letters.
                removable = word[-3] in self.__st_ending and len(word) - 3 >= 3
                cut = 2 if removable else 0
            else:
                cut = len(suffix)

            if cut:
                word = word[:-cut]
                r1 = r1[:-cut]
                r2 = r2[:-cut]
            break

        # STEP 3: Derivational suffixes
        for suffix in self.__step3_suffixes:
            if not r2.endswith(suffix):
                continue

            size = len(suffix)
            r2_before = r2[:-size]

            if suffix in ("end", "ung"):
                # A preceding 'ig' in R2 goes too, unless it follows 'e'.
                if r2_before.endswith("ig") and not r2_before[:-2].endswith("e"):
                    size += 2
                word = word[:-size]

            elif suffix in ("ig", "ik", "isch"):
                if not r2_before.endswith("e"):
                    word = word[:-size]

            elif suffix in ("lich", "heit"):
                # A preceding 'er' / 'en' in R1 is removed as well.
                if r1[:-size].endswith(("er", "en")):
                    size += 2
                word = word[:-size]

            elif suffix == "keit":
                if r2_before.endswith("lich"):
                    size += 4
                elif r2_before.endswith("ig"):
                    size += 2
                word = word[:-size]
            break

        # Umlaut accents are removed and
        # 'u' and 'y' are put back into lower case.
        return word.translate(str.maketrans("\xE4\xF6\xFCUY", "aouuy"))
class HungarianStemmer(_LanguageSpecificStemmer):
    """
    The Hungarian Snowball stemmer.

    :cvar __vowels: The Hungarian vowels.
    :type __vowels: unicode
    :cvar __digraphs: The Hungarian digraphs.
    :type __digraphs: tuple
    :cvar __double_consonants: The Hungarian double consonants.
    :type __double_consonants: tuple
    :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm.
    :type __step1_suffixes: tuple
    :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm.
    :type __step2_suffixes: tuple
    :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm.
    :type __step3_suffixes: tuple
    :cvar __step4_suffixes: Suffixes to be deleted in step 4 of the algorithm.
    :type __step4_suffixes: tuple
    :cvar __step5_suffixes: Suffixes to be deleted in step 5 of the algorithm.
    :type __step5_suffixes: tuple
    :cvar __step6_suffixes: Suffixes to be deleted in step 6 of the algorithm.
    :type __step6_suffixes: tuple
    :cvar __step7_suffixes: Suffixes to be deleted in step 7 of the algorithm.
    :type __step7_suffixes: tuple
    :cvar __step8_suffixes: Suffixes to be deleted in step 8 of the algorithm.
    :type __step8_suffixes: tuple
    :cvar __step9_suffixes: Suffixes to be deleted in step 9 of the algorithm.
    :type __step9_suffixes: tuple
    :note: A detailed description of the Hungarian
           stemming algorithm can be found under
           http://snowball.tartarus.org/algorithms/hungarian/stemmer.html

    """

    __vowels = "aeiou\xF6\xFC\xE1\xE9\xED\xF3\xF5\xFA\xFB"
    __digraphs = ("cs", "dz", "dzs", "gy", "ly", "ny", "ty", "zs")
    __double_consonants = (
        "bb", "cc", "ccs", "dd", "ff", "gg", "ggy", "jj", "kk", "ll", "lly",
        "mm", "nn", "nny", "pp", "rr", "ss", "ssz", "tt", "tty", "vv", "zz",
        "zzs",
    )

    # The suffix tuples are scanned in order; the first match wins, so
    # longer suffixes must stay ahead of the shorter ones they contain.
    __step1_suffixes = ("al", "el")
    __step2_suffixes = (
        "k\xE9ppen", "onk\xE9nt", "enk\xE9nt", "ank\xE9nt", "k\xE9pp",
        "k\xE9nt", "ban", "ben", "nak", "nek", "val", "vel", "t\xF3l",
        "t\xF5l", "r\xF3l", "r\xF5l", "b\xF3l", "b\xF5l", "hoz", "hez",
        "h\xF6z", "n\xE1l", "n\xE9l", "\xE9rt", "kor", "ba", "be", "ra",
        "re", "ig", "at", "et", "ot", "\xF6t", "ul", "\xFCl", "v\xE1",
        "v\xE9", "en", "on", "an", "\xF6n", "n", "t",
    )
    __step3_suffixes = ("\xE1nk\xE9nt", "\xE1n", "\xE9n")
    __step4_suffixes = (
        "astul", "est\xFCl", "\xE1stul", "\xE9st\xFCl", "stul", "st\xFCl",
    )
    __step5_suffixes = ("\xE1", "\xE9")
    __step6_suffixes = (
        "ok\xE9", "\xF6k\xE9", "ak\xE9", "ek\xE9", "\xE1k\xE9", "\xE1\xE9i",
        "\xE9k\xE9", "\xE9\xE9i", "k\xE9", "\xE9i", "\xE9\xE9", "\xE9",
    )
    __step7_suffixes = (
        "\xE1juk", "\xE9j\xFCk", "\xFCnk", "unk", "juk", "j\xFCk", "\xE1nk",
        "\xE9nk", "nk", "uk", "\xFCk", "em", "om", "am", "od", "ed", "ad",
        "\xF6d", "ja", "je", "\xE1m", "\xE1d", "\xE9m", "\xE9d", "m", "d",
        "a", "e", "o", "\xE1", "\xE9",
    )
    __step8_suffixes = (
        "jaitok", "jeitek", "jaink", "jeink", "aitok", "eitek", "\xE1itok",
        "\xE9itek", "jaim", "jeim", "jaid", "jeid", "eink", "aink", "itek",
        "jeik", "jaik", "\xE1ink", "\xE9ink", "aim", "eim", "aid", "eid",
        "jai", "jei", "ink", "aik", "eik", "\xE1im", "\xE1id", "\xE1ik",
        "\xE9im", "\xE9id", "\xE9ik", "im", "id", "ai", "ei", "ik", "\xE1i",
        "\xE9i", "i",
    )
    __step9_suffixes = ("\xE1k", "\xE9k", "\xF6k", "ok", "ek", "ak", "k")

    def stem(self, word):
        """
        Stem a Hungarian word and return the stemmed form.

        :param word: The word that is stemmed.
        :type word: str or unicode
        :return: The stemmed form.
        :rtype: unicode

        """
        word = word.lower()

        if word in self.stopwords:
            return word

        r1 = self.__r1_hungarian(word, self.__vowels, self.__digraphs)

        # STEP 1: Remove instrumental case
        if r1.endswith(self.__step1_suffixes):
            for double_cons in self.__double_consonants:
                if word[-2 - len(double_cons) : -2] == double_cons:
                    # Drop the two-letter suffix and undouble the
                    # consonant: of the last four letters only the one
                    # at [-3] survives.
                    word = "".join((word[:-4], word[-3]))

                    # Apply the same edit to r1 even when the double
                    # consonant lies partly before R1; otherwise r1
                    # would keep a suffix that word no longer has.
                    if len(r1) >= 3:
                        r1 = "".join((r1[:-4], r1[-3]))
                    else:
                        r1 = ""
                    break

        # STEP 2: Remove frequent cases
        for suffix in self.__step2_suffixes:
            if word.endswith(suffix):
                if r1.endswith(suffix):
                    word = word[: -len(suffix)]
                    r1 = r1[: -len(suffix)]

                    # A long final vowel left behind is shortened.
                    if r1.endswith("\xE1"):
                        word = "".join((word[:-1], "a"))
                        r1 = suffix_replace(r1, "\xE1", "a")

                    elif r1.endswith("\xE9"):
                        word = "".join((word[:-1], "e"))
                        r1 = suffix_replace(r1, "\xE9", "e")
                break

        # STEP 3: Remove special cases
        for suffix in self.__step3_suffixes:
            if r1.endswith(suffix):
                if suffix == "\xE9n":
                    word = suffix_replace(word, suffix, "e")
                    r1 = suffix_replace(r1, suffix, "e")
                else:
                    word = suffix_replace(word, suffix, "a")
                    r1 = suffix_replace(r1, suffix, "a")
                break

        # STEP 4: Remove other cases
        for suffix in self.__step4_suffixes:
            if r1.endswith(suffix):
                if suffix == "\xE1stul":
                    word = suffix_replace(word, suffix, "a")
                    r1 = suffix_replace(r1, suffix, "a")

                elif suffix == "\xE9st\xFCl":
                    word = suffix_replace(word, suffix, "e")
                    r1 = suffix_replace(r1, suffix, "e")
                else:
                    word = word[: -len(suffix)]
                    r1 = r1[: -len(suffix)]
                break

        # STEP 5: Remove factive case
        for suffix in self.__step5_suffixes:
            if r1.endswith(suffix):
                for double_cons in self.__double_consonants:
                    if word[-1 - len(double_cons) : -1] == double_cons:
                        # Drop the one-letter suffix and undouble the
                        # consonant: of the last three letters only
                        # the one at [-2] survives.
                        word = "".join((word[:-3], word[-2]))

                        # Keep r1 aligned with word (see step 1); a
                        # stale r1 still ending in the accented vowel
                        # would make step 6 strip further letters.
                        if len(r1) >= 2:
                            r1 = "".join((r1[:-3], r1[-2]))
                        else:
                            r1 = ""
                        break
                # r1 can end with at most one of the two suffixes.
                break

        # STEP 6: Remove owned
        for suffix in self.__step6_suffixes:
            if r1.endswith(suffix):
                if suffix in ("\xE1k\xE9", "\xE1\xE9i"):
                    word = suffix_replace(word, suffix, "a")
                    r1 = suffix_replace(r1, suffix, "a")

                elif suffix in ("\xE9k\xE9", "\xE9\xE9i", "\xE9\xE9"):
                    word = suffix_replace(word, suffix, "e")
                    r1 = suffix_replace(r1, suffix, "e")
                else:
                    word = word[: -len(suffix)]
                    r1 = r1[: -len(suffix)]
                break

        # STEP 7: Remove singular owner suffixes
        for suffix in self.__step7_suffixes:
            if word.endswith(suffix):
                if r1.endswith(suffix):
                    if suffix in ("\xE1nk", "\xE1juk", "\xE1m", "\xE1d", "\xE1"):
                        word = suffix_replace(word, suffix, "a")
                        r1 = suffix_replace(r1, suffix, "a")

                    elif suffix in ("\xE9nk", "\xE9j\xFCk", "\xE9m", "\xE9d", "\xE9"):
                        word = suffix_replace(word, suffix, "e")
                        r1 = suffix_replace(r1, suffix, "e")
                    else:
                        word = word[: -len(suffix)]
                        r1 = r1[: -len(suffix)]
                break

        # STEP 8: Remove plural owner suffixes
        for suffix in self.__step8_suffixes:
            if word.endswith(suffix):
                if r1.endswith(suffix):
                    if suffix in (
                        "\xE1im",
                        "\xE1id",
                        "\xE1i",
                        "\xE1ink",
                        "\xE1itok",
                        "\xE1ik",
                    ):
                        word = suffix_replace(word, suffix, "a")
                        r1 = suffix_replace(r1, suffix, "a")

                    elif suffix in (
                        "\xE9im",
                        "\xE9id",
                        "\xE9i",
                        "\xE9ink",
                        "\xE9itek",
                        "\xE9ik",
                    ):
                        word = suffix_replace(word, suffix, "e")
                        r1 = suffix_replace(r1, suffix, "e")
                    else:
                        word = word[: -len(suffix)]
                        r1 = r1[: -len(suffix)]
                break

        # STEP 9: Remove plural suffixes
        for suffix in self.__step9_suffixes:
            if word.endswith(suffix):
                if r1.endswith(suffix):
                    if suffix == "\xE1k":
                        word = suffix_replace(word, suffix, "a")
                    elif suffix == "\xE9k":
                        word = suffix_replace(word, suffix, "e")
                    else:
                        word = word[: -len(suffix)]
                break

        return word

    def __r1_hungarian(self, word, vowels, digraphs):
        """
        Return the region R1 that is used by the Hungarian stemmer.

        If the word begins with a vowel, R1 is defined as the region
        after the first consonant or digraph (= two letters stand for
        one phoneme) in the word. If the word begins with a consonant,
        it is defined as the region after the first vowel in the word.
        If the word does not contain both a vowel and consonant, R1
        is the null region at the end of the word. The empty word
        yields the empty region.

        :param word: The Hungarian word whose region R1 is determined.
        :type word: str or unicode
        :param vowels: The Hungarian vowels that are used to determine
                       the region R1.
        :type vowels: unicode
        :param digraphs: The digraphs that are used to determine the
                         region R1.
        :type digraphs: tuple
        :return: the region R1 for the respective word.
        :rtype: unicode
        :note: This helper method is invoked by the stem method of the subclass
               HungarianStemmer. It is not to be invoked directly!

        """
        if not word:
            return ""

        if word[0] in vowels:
            for i in range(1, len(word)):
                if word[i] not in vowels:
                    # Only a digraph starting at this first consonant
                    # counts. Longest first, so that 'dzs' wins over
                    # its prefix 'dz'.
                    for digraph in sorted(digraphs, key=len, reverse=True):
                        if word.startswith(digraph, i):
                            return word[i + len(digraph) :]
                    return word[i + 1 :]
            return ""

        for i in range(1, len(word)):
            if word[i] in vowels:
                return word[i + 1 :]
        return ""
class ItalianStemmer(_StandardStemmer):
    """
    The Italian Snowball stemmer.

    :cvar __vowels: The Italian vowels.
    :type __vowels: unicode
    :cvar __step0_suffixes: Suffixes to be deleted in step 0 of the algorithm.
    :type __step0_suffixes: tuple
    :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm.
    :type __step1_suffixes: tuple
    :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm.
    :type __step2_suffixes: tuple
    :note: A detailed description of the Italian
           stemming algorithm can be found under
           http://snowball.tartarus.org/algorithms/italian/stemmer.html

    """

    __vowels = "aeiou\xE0\xE8\xEC\xF2\xF9"

    # The suffix tuples are scanned in order; the first match wins, so
    # longer suffixes must stay ahead of the shorter ones they contain.
    __step0_suffixes = (
        "gliela", "gliele", "glieli", "glielo", "gliene", "sene", "mela",
        "mele", "meli", "melo", "mene", "tela", "tele", "teli", "telo",
        "tene", "cela", "cele", "celi", "celo", "cene", "vela", "vele",
        "veli", "velo", "vene", "gli", "ci", "la", "le", "li", "lo", "mi",
        "ne", "si", "ti", "vi",
    )
    __step1_suffixes = (
        "atrice", "atrici", "azione", "azioni", "uzione", "uzioni", "usione",
        "usioni", "amento", "amenti", "imento", "imenti", "amente", "abile",
        "abili", "ibile", "ibili", "mente", "atore", "atori", "logia",
        "logie", "anza", "anze", "iche", "ichi", "ismo", "ismi", "ista",
        "iste", "isti", "ist\xE0", "ist\xE8", "ist\xEC", "ante", "anti",
        "enza", "enze", "ico", "ici", "ica", "ice", "oso", "osi", "osa",
        "ose", "it\xE0", "ivo", "ivi", "iva", "ive",
    )
    __step2_suffixes = (
        "erebbero", "irebbero", "assero", "assimo", "eranno", "erebbe",
        "eremmo", "ereste", "eresti", "essero", "iranno", "irebbe", "iremmo",
        "ireste", "iresti", "iscano", "iscono", "issero", "arono", "avamo",
        "avano", "avate", "eremo", "erete", "erono", "evamo", "evano",
        "evate", "iremo", "irete", "irono", "ivamo", "ivano", "ivate",
        "ammo", "ando", "asse", "assi", "emmo", "enda", "ende", "endi",
        "endo", "erai", "erei", "Yamo", "iamo", "immo", "irai", "irei",
        "isca", "isce", "isci", "isco", "ano", "are", "ata", "ate", "ati",
        "ato", "ava", "avi", "avo", "er\xE0", "ere", "er\xF2", "ete", "eva",
        "evi", "evo", "ir\xE0", "ire", "ir\xF2", "ita", "ite", "iti", "ito",
        "iva", "ivi", "ivo", "ono", "uta", "ute", "uti", "uto", "ar", "ir",
    )

    def stem(self, word):
        """
        Stem an Italian word and return the stemmed form.

        :param word: The word that is stemmed.
        :type word: str or unicode
        :return: The stemmed form.
        :rtype: unicode

        """
        word = word.lower()

        if word in self.stopwords:
            return word

        step1_success = False

        # All acute accents are replaced by grave accents.
        word = (
            word.replace("\xE1", "\xE0")
            .replace("\xE9", "\xE8")
            .replace("\xED", "\xEC")
            .replace("\xF3", "\xF2")
            .replace("\xFA", "\xF9")
        )

        # Every occurrence of 'u' after 'q'
        # is put into upper case.
        for i in range(1, len(word)):
            if word[i - 1] == "q" and word[i] == "u":
                word = "".join((word[:i], "U", word[i + 1 :]))

        # Every occurrence of 'u' and 'i'
        # between vowels is put into upper case.
        for i in range(1, len(word) - 1):
            if word[i - 1] in self.__vowels and word[i + 1] in self.__vowels:
                if word[i] == "u":
                    word = "".join((word[:i], "U", word[i + 1 :]))

                elif word[i] == "i":
                    word = "".join((word[:i], "I", word[i + 1 :]))

        r1, r2 = self._r1r2_standard(word, self.__vowels)
        rv = self._rv_standard(word, self.__vowels)

        # STEP 0: Attached pronoun
        for suffix in self.__step0_suffixes:
            if rv.endswith(suffix):
                # After a gerund the pronoun is simply dropped.
                if rv[-len(suffix) - 4 : -len(suffix)] in ("ando", "endo"):
                    word = word[: -len(suffix)]
                    r1 = r1[: -len(suffix)]
                    r2 = r2[: -len(suffix)]
                    rv = rv[: -len(suffix)]

                # After a truncated infinitive it is replaced by 'e'.
                elif rv[-len(suffix) - 2 : -len(suffix)] in ("ar", "er", "ir"):
                    word = suffix_replace(word, suffix, "e")
                    r1 = suffix_replace(r1, suffix, "e")
                    r2 = suffix_replace(r2, suffix, "e")
                    rv = suffix_replace(rv, suffix, "e")
                break

        # STEP 1: Standard suffix removal
        for suffix in self.__step1_suffixes:
            if word.endswith(suffix):
                if suffix == "amente" and r1.endswith(suffix):
                    step1_success = True
                    word = word[:-6]
                    r2 = r2[:-6]
                    rv = rv[:-6]

                    if r2.endswith("iv"):
                        word = word[:-2]
                        r2 = r2[:-2]
                        rv = rv[:-2]

                        if r2.endswith("at"):
                            word = word[:-2]
                            rv = rv[:-2]

                    elif r2.endswith(("os", "ic")):
                        word = word[:-2]
                        rv = rv[:-2]

                    elif r2.endswith("abil"):
                        word = word[:-4]
                        rv = rv[:-4]

                elif suffix in ("amento", "amenti", "imento", "imenti") and rv.endswith(
                    suffix
                ):
                    step1_success = True
                    word = word[:-6]
                    rv = rv[:-6]

                elif r2.endswith(suffix):
                    step1_success = True
                    if suffix in ("azione", "azioni", "atore", "atori"):
                        word = word[: -len(suffix)]
                        r2 = r2[: -len(suffix)]
                        rv = rv[: -len(suffix)]

                        if r2.endswith("ic"):
                            word = word[:-2]
                            rv = rv[:-2]

                    elif suffix in ("logia", "logie"):
                        # Replace with 'log': drop the last two letters
                        # from the word and from its RV region alike.
                        word = word[:-2]
                        rv = rv[:-2]

                    elif suffix in ("uzione", "uzioni", "usione", "usioni"):
                        # Replace with 'u'.
                        word = word[:-5]
                        rv = rv[:-5]

                    elif suffix in ("enza", "enze"):
                        # Replace with 'ente' ("differenza" ->
                        # "differente"); step 3a then drops the final
                        # vowel.
                        word = suffix_replace(word, suffix, "ente")
                        rv = suffix_replace(rv, suffix, "ente")

                    elif suffix == "it\xE0":
                        word = word[:-3]
                        r2 = r2[:-3]
                        rv = rv[:-3]

                        if r2.endswith(("ic", "iv")):
                            word = word[:-2]
                            rv = rv[:-2]

                        elif r2.endswith("abil"):
                            word = word[:-4]
                            rv = rv[:-4]

                    elif suffix in ("ivo", "ivi", "iva", "ive"):
                        word = word[:-3]
                        r2 = r2[:-3]
                        rv = rv[:-3]

                        if r2.endswith("at"):
                            word = word[:-2]
                            r2 = r2[:-2]
                            rv = rv[:-2]

                            if r2.endswith("ic"):
                                word = word[:-2]
                                rv = rv[:-2]
                    else:
                        word = word[: -len(suffix)]
                        rv = rv[: -len(suffix)]
                # Only the first suffix the word ends with is considered,
                # whether or not its region condition was met.
                break

        # STEP 2: Verb suffixes
        if not step1_success:
            for suffix in self.__step2_suffixes:
                if rv.endswith(suffix):
                    word = word[: -len(suffix)]
                    rv = rv[: -len(suffix)]
                    break

        # STEP 3a
        if rv.endswith(("a", "e", "i", "o", "\xE0", "\xE8", "\xEC", "\xF2")):
            word = word[:-1]
            rv = rv[:-1]

            if rv.endswith("i"):
                word = word[:-1]
                rv = rv[:-1]

        # STEP 3b
        if rv.endswith(("ch", "gh")):
            word = word[:-1]

        # Undo the upper-case markers set before region detection.
        word = word.replace("I", "i").replace("U", "u")

        return word
class NorwegianStemmer(_ScandinavianStemmer):
    """
    Snowball stemmer for Norwegian.

    :cvar __vowels: The Norwegian vowels.
    :type __vowels: unicode
    :cvar __s_ending: Letters that may directly appear before a word final 's'.
    :type __s_ending: unicode
    :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm.
    :type __step1_suffixes: tuple
    :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm.
    :type __step2_suffixes: tuple
    :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm.
    :type __step3_suffixes: tuple
    :note: A detailed description of the Norwegian
           stemming algorithm can be found under
           http://snowball.tartarus.org/algorithms/norwegian/stemmer.html

    """

    __vowels = "aeiouy\xE6\xE5\xF8"
    __s_ending = "bcdfghjlmnoprtvyz"

    # Scanned in order; the first suffix found at the end of R1 wins.
    __step1_suffixes = (
        "hetenes", "hetene", "hetens", "heter", "heten", "endes", "ande",
        "ende", "edes", "enes", "erte", "ede", "ane", "ene", "ens", "ers",
        "ets", "het", "ast", "ert", "en", "ar", "er", "as", "es", "et",
        "a", "e", "s",
    )
    __step2_suffixes = ("dt", "vt")
    __step3_suffixes = (
        "hetslov", "eleg", "elig", "elov", "slov", "leg", "eig", "lig",
        "els", "lov", "ig",
    )

    def stem(self, word):
        """
        Stem a Norwegian word and return the stemmed form.

        :param word: The word that is stemmed.
        :type word: str or unicode
        :return: The stemmed form.
        :rtype: unicode

        """
        word = word.lower()

        if word in self.stopwords:
            return word

        r1 = self._r1_scandinavian(word, self.__vowels)

        # STEP 1
        ending = next((s for s in self.__step1_suffixes if r1.endswith(s)), None)
        if ending in ("erte", "ert"):
            word = suffix_replace(word, ending, "er")
            r1 = suffix_replace(r1, ending, "er")
        elif ending == "s":
            # A lone 's' is removed only after a valid s-ending letter,
            # or after a 'k' that does not itself follow a vowel.
            before = word[-2]
            if before in self.__s_ending or (
                before == "k" and word[-3] not in self.__vowels
            ):
                word = word[:-1]
                r1 = r1[:-1]
        elif ending is not None:
            word = word[: -len(ending)]
            r1 = r1[: -len(ending)]

        # STEP 2: 'dt' / 'vt' lose their final 't'.
        if r1.endswith(self.__step2_suffixes):
            word = word[:-1]
            r1 = r1[:-1]

        # STEP 3
        ending = next((s for s in self.__step3_suffixes if r1.endswith(s)), None)
        if ending is not None:
            word = word[: -len(ending)]

        return word
class PortugueseStemmer(_StandardStemmer):
    """
    The Portuguese Snowball stemmer.

    Nasal vowels are rewritten as two-character sequences (a~, o~)
    while stemming and restored before the stem is returned, which is
    why several suffixes below contain a tilde.

    :cvar __vowels: The Portuguese vowels.
    :type __vowels: unicode
    :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm.
    :type __step1_suffixes: tuple
    :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm.
    :type __step2_suffixes: tuple
    :cvar __step4_suffixes: Suffixes to be deleted in step 4 of the algorithm.
    :type __step4_suffixes: tuple
    :note: A detailed description of the Portuguese
           stemming algorithm can be found under
           http://snowball.tartarus.org/algorithms/portuguese/stemmer.html

    """

    __vowels = "aeiou\xE1\xE9\xED\xF3\xFA\xE2\xEA\xF4"
    # Step 1 stops at the first suffix the word ends with, so the order
    # of this tuple is significant.
    __step1_suffixes = (
        "amentos",
        "imentos",
        "uço~es",
        "amento",
        "imento",
        "adoras",
        "adores",
        "a\xE7o~es",
        "logias",
        "\xEAncias",
        "amente",
        "idades",
        "an\xE7as",
        "ismos",
        "istas",
        "adora",
        "a\xE7a~o",
        "antes",
        "\xE2ncia",
        "logia",
        "uça~o",
        "\xEAncia",
        "mente",
        "idade",
        "an\xE7a",
        "ezas",
        "icos",
        "icas",
        "ismo",
        "\xE1vel",
        "\xEDvel",
        "ista",
        "osos",
        "osas",
        "ador",
        "ante",
        "ivas",
        "ivos",
        "iras",
        "eza",
        "ico",
        "ica",
        "oso",
        "osa",
        "iva",
        "ivo",
        "ira",
    )
    # Verb endings; step 2 removes the first one found at the end of RV.
    __step2_suffixes = (
        "ar\xEDamos",
        "er\xEDamos",
        "ir\xEDamos",
        "\xE1ssemos",
        "\xEAssemos",
        "\xEDssemos",
        "ar\xEDeis",
        "er\xEDeis",
        "ir\xEDeis",
        "\xE1sseis",
        "\xE9sseis",
        "\xEDsseis",
        "\xE1ramos",
        "\xE9ramos",
        "\xEDramos",
        "\xE1vamos",
        "aremos",
        "eremos",
        "iremos",
        "ariam",
        "eriam",
        "iriam",
        "assem",
        "essem",
        "issem",
        "ara~o",
        "era~o",
        "ira~o",
        "arias",
        "erias",
        "irias",
        "ardes",
        "erdes",
        "irdes",
        "asses",
        "esses",
        "isses",
        "astes",
        "estes",
        "istes",
        "\xE1reis",
        "areis",
        "\xE9reis",
        "ereis",
        "\xEDreis",
        "ireis",
        "\xE1veis",
        "\xEDamos",
        "armos",
        "ermos",
        "irmos",
        "aria",
        "eria",
        "iria",
        "asse",
        "esse",
        "isse",
        "aste",
        "este",
        "iste",
        "arei",
        "erei",
        "irei",
        "aram",
        "eram",
        "iram",
        "avam",
        "arem",
        "erem",
        "irem",
        "ando",
        "endo",
        "indo",
        "adas",
        "idas",
        "ar\xE1s",
        "aras",
        "er\xE1s",
        "eras",
        "ir\xE1s",
        "avas",
        "ares",
        "eres",
        "ires",
        "\xEDeis",
        "ados",
        "idos",
        "\xE1mos",
        "amos",
        "emos",
        "imos",
        "iras",
        "ada",
        "ida",
        "ar\xE1",
        "ara",
        "er\xE1",
        "era",
        "ir\xE1",
        "ava",
        "iam",
        "ado",
        "ido",
        "ias",
        "ais",
        "eis",
        "ira",
        "ia",
        "ei",
        "am",
        "em",
        "ar",
        "er",
        "ir",
        "as",
        "es",
        "is",
        "eu",
        "iu",
        "ou",
    )
    # Residual endings removed in step 4 when steps 1 and 2 did nothing.
    __step4_suffixes = ("os", "a", "i", "o", "\xE1", "\xED", "\xF3")

    def stem(self, word):
        """
        Stem a Portuguese word and return the stemmed form.

        The word is lower-cased first; stopwords are returned
        unchanged.

        :param word: The word that is stemmed.
        :type word: str or unicode
        :return: The stemmed form.
        :rtype: unicode

        """
        word = word.lower()

        if word in self.stopwords:
            return word

        # Track whether steps 1 / 2 changed the word; steps 3 and 4
        # are chosen based on these flags.
        step1_success = False
        step2_success = False

        # Rewrite nasal vowels as 'a~' / 'o~' and drop the diaeresis
        # from 'qü' / 'gü' before the regions are computed.
        word = (
            word.replace("\xE3", "a~")
            .replace("\xF5", "o~")
            .replace("q\xFC", "qu")
            .replace("g\xFC", "gu")
        )

        r1, r2 = self._r1r2_standard(word, self.__vowels)
        rv = self._rv_standard(word, self.__vowels)

        # STEP 1: Standard suffix removal
        # Only the first suffix the word ends with is examined (see the
        # break below), whether or not its region condition holds.
        # word, r2 and rv are truncated in parallel wherever a later
        # check still reads them.
        for suffix in self.__step1_suffixes:
            if word.endswith(suffix):
                if suffix == "amente" and r1.endswith(suffix):
                    step1_success = True

                    word = word[:-6]
                    r2 = r2[:-6]
                    rv = rv[:-6]

                    # A preceding 'iv' in R2 is removed too, and an
                    # 'at' in R2 before that as well.
                    if r2.endswith("iv"):
                        word = word[:-2]
                        r2 = r2[:-2]
                        rv = rv[:-2]

                        if r2.endswith("at"):
                            word = word[:-2]
                            rv = rv[:-2]

                    elif r2.endswith(("os", "ic", "ad")):
                        word = word[:-2]
                        rv = rv[:-2]

                elif (
                    suffix in ("ira", "iras")
                    and rv.endswith(suffix)
                    and word[-len(suffix) - 1 : -len(suffix)] == "e"
                ):
                    # 'eira(s)' -> 'eir': needs the suffix in RV and a
                    # preceding 'e'.
                    step1_success = True

                    word = suffix_replace(word, suffix, "ir")
                    rv = suffix_replace(rv, suffix, "ir")

                elif r2.endswith(suffix):
                    step1_success = True

                    if suffix in ("logia", "logias"):
                        word = suffix_replace(word, suffix, "log")
                        rv = suffix_replace(rv, suffix, "log")

                    elif suffix in ("uça~o", "uço~es"):
                        word = suffix_replace(word, suffix, "u")
                        rv = suffix_replace(rv, suffix, "u")

                    elif suffix in ("\xEAncia", "\xEAncias"):
                        word = suffix_replace(word, suffix, "ente")
                        rv = suffix_replace(rv, suffix, "ente")

                    elif suffix == "mente":
                        word = word[:-5]
                        r2 = r2[:-5]
                        rv = rv[:-5]

                        # NOTE(review): the published algorithm appears
                        # to list the accented form of 'ivel' here --
                        # confirm whether the unaccented spelling is
                        # intentional.
                        if r2.endswith(("ante", "avel", "ivel")):
                            word = word[:-4]
                            rv = rv[:-4]

                    elif suffix in ("idade", "idades"):
                        word = word[: -len(suffix)]
                        r2 = r2[: -len(suffix)]
                        rv = rv[: -len(suffix)]

                        if r2.endswith(("ic", "iv")):
                            word = word[:-2]
                            rv = rv[:-2]

                        elif r2.endswith("abil"):
                            word = word[:-4]
                            rv = rv[:-4]

                    elif suffix in ("iva", "ivo", "ivas", "ivos"):
                        word = word[: -len(suffix)]
                        r2 = r2[: -len(suffix)]
                        rv = rv[: -len(suffix)]

                        if r2.endswith("at"):
                            word = word[:-2]
                            rv = rv[:-2]

                    else:
                        # Every other suffix in R2 is simply deleted.
                        word = word[: -len(suffix)]
                        rv = rv[: -len(suffix)]
                break

        # STEP 2: Verb suffixes
        # Runs only when step 1 removed nothing.
        if not step1_success:
            for suffix in self.__step2_suffixes:
                if rv.endswith(suffix):
                    step2_success = True

                    word = word[: -len(suffix)]
                    rv = rv[: -len(suffix)]
                    break

        # STEP 3
        # After a successful step 1 or 2, drop a final 'i' in RV that
        # follows 'c'.
        if step1_success or step2_success:
            if rv.endswith("i") and word[-2] == "c":
                word = word[:-1]
                rv = rv[:-1]

        # STEP 4: Residual suffix
        # Runs only when neither step 1 nor step 2 changed the word.
        if not step1_success and not step2_success:
            for suffix in self.__step4_suffixes:
                if rv.endswith(suffix):
                    word = word[: -len(suffix)]
                    rv = rv[: -len(suffix)]
                    break

        # STEP 5
        # Drop a final 'e', 'é' or 'ê' in RV, then the 'u' of 'gu' or
        # the 'i' of 'ci' if that letter is in RV; otherwise turn a
        # final 'ç' into 'c'.
        if rv.endswith(("e", "\xE9", "\xEA")):
            word = word[:-1]
            rv = rv[:-1]

            if (word.endswith("gu") and rv.endswith("u")) or (
                word.endswith("ci") and rv.endswith("i")
            ):
                word = word[:-1]

        elif word.endswith("\xE7"):
            word = suffix_replace(word, "\xE7", "c")

        # Restore the nasal vowels rewritten at the start.
        word = word.replace("a~", "\xE3").replace("o~", "\xF5")

        return word
class RomanianStemmer(_StandardStemmer):
    """
    The Romanian Snowball stemmer.

    :cvar __vowels: The Romanian vowels.
    :type __vowels: unicode
    :cvar __step0_suffixes: Suffixes to be deleted in step 0 of the algorithm.
    :type __step0_suffixes: tuple
    :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm.
    :type __step1_suffixes: tuple
    :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm.
    :type __step2_suffixes: tuple
    :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm.
    :type __step3_suffixes: tuple
    :note: A detailed description of the Romanian
           stemming algorithm can be found under
           http://snowball.tartarus.org/algorithms/romanian/stemmer.html
    """

    __vowels = "aeiou\u0103\xE2\xEE"

    # NOTE: the order of every suffix table matters. The loops in stem()
    # stop at the first entry the word ends with, so a longer suffix must
    # be listed before any shorter suffix it ends with.
    __step0_suffixes = (
        "iilor", "ului", "elor", "iile", "ilor", "atei",
        "a\u0163ie", "a\u0163ia",
        "aua", "ele", "iua", "iei", "ile",
        "ul", "ea", "ii",
    )

    __step1_suffixes = (
        "abilitate", "abilitati", "abilit\u0103\u0163i", "ibilitate",
        "abilit\u0103i",
        "ivitate", "ivitati", "ivit\u0103\u0163i",
        "icitate", "icitati", "icit\u0103\u0163i",
        "icatori", "ivit\u0103i", "icit\u0103i", "icator",
        "a\u0163iune", "atoare", "\u0103toare", "i\u0163iune", "itoare",
        "iciva", "icive", "icivi", "iciv\u0103",
        "icala", "icale", "icali", "ical\u0103",
        "ativa", "ative", "ativi", "ativ\u0103",
        "atori", "\u0103tori",
        "itiva", "itive", "itivi", "itiv\u0103",
        "itori",
        "iciv", "ical", "ativ", "ator", "\u0103tor", "itiv", "itor",
    )

    __step2_suffixes = (
        "abila", "abile", "abili", "abil\u0103",
        "ibila", "ibile", "ibili", "ibil\u0103",
        "atori", "itate", "itati", "it\u0103\u0163i",
        "abil", "ibil",
        "oasa", "oas\u0103", "oase",
        "anta", "ante", "anti", "ant\u0103",
        "ator", "it\u0103i", "iune", "iuni", "isme",
        "ista", "iste", "isti", "ist\u0103", "i\u015Fti",
        "ata", "at\u0103", "ati", "ate",
        "uta", "ut\u0103", "uti", "ute",
        "ita", "it\u0103", "iti", "ite",
        "ica", "ice", "ici", "ic\u0103",
        "osi", "o\u015Fi", "ant",
        "iva", "ive", "ivi", "iv\u0103",
        "ism", "ist",
        "at", "ut", "it", "ic", "os", "iv",
    )

    __step3_suffixes = (
        "seser\u0103\u0163i", "aser\u0103\u0163i", "iser\u0103\u0163i",
        "\xE2ser\u0103\u0163i", "user\u0103\u0163i",
        "seser\u0103m", "aser\u0103m", "iser\u0103m", "\xE2ser\u0103m",
        "user\u0103m",
        "ser\u0103\u0163i", "sese\u015Fi", "seser\u0103", "easc\u0103",
        "ar\u0103\u0163i", "ur\u0103\u0163i", "ir\u0103\u0163i",
        "\xE2r\u0103\u0163i",
        "ase\u015Fi", "aser\u0103", "ise\u015Fi", "iser\u0103",
        "\xE2se\u015Fi", "\xE2ser\u0103", "use\u015Fi", "user\u0103",
        "ser\u0103m", "sesem", "indu", "\xE2ndu", "eaz\u0103",
        "e\u015Fti", "e\u015Fte", "\u0103\u015Fti", "\u0103\u015Fte",
        "ea\u0163i", "ia\u0163i",
        "ar\u0103m", "ur\u0103m", "ir\u0103m", "\xE2r\u0103m",
        "asem", "isem", "\xE2sem", "usem",
        "se\u015Fi", "ser\u0103", "sese",
        "are", "ere", "ire", "\xE2re",
        "ind", "\xE2nd", "eze", "ezi", "esc", "\u0103sc",
        "eam", "eai", "eau", "iam", "iai", "iau",
        "a\u015Fi", "ar\u0103", "u\u015Fi", "ur\u0103",
        "i\u015Fi", "ir\u0103", "\xE2\u015Fi", "\xE2r\u0103",
        "ase", "ise", "\xE2se", "use",
        "a\u0163i", "e\u0163i", "i\u0163i", "\xE2\u0163i",
        "sei", "ez", "am", "ai", "au", "ea", "ia", "ui", "\xE2i",
        "\u0103m", "em", "im", "\xE2m", "se",
    )

    def stem(self, word):
        """
        Stem a Romanian word and return the stemmed form.

        The word is lower-cased first; a word found in ``self.stopwords``
        is returned (lower-cased) without further processing.

        :param word: The word that is stemmed.
        :type word: str or unicode
        :return: The stemmed form.
        :rtype: unicode
        """
        word = word.lower()

        if word in self.stopwords:
            return word

        step1_success = False
        step2_success = False

        # Upper-case every 'u' and 'i' that stands between two vowels so the
        # region helpers and the suffix tests below do not see them as
        # vowels; the marks are reverted just before returning.
        for i in range(1, len(word) - 1):
            if word[i - 1] in self.__vowels and word[i + 1] in self.__vowels:
                if word[i] == "u":
                    word = "".join((word[:i], "U", word[i + 1 :]))
                elif word[i] == "i":
                    word = "".join((word[:i], "I", word[i + 1 :]))

        # Regions as computed by the inherited helpers.
        # NOTE: r1 is never updated after this point, and r2 only in the
        # "at"/"it" branches of step 1.
        r1, r2 = self._r1r2_standard(word, self.__vowels)
        rv = self._rv_standard(word, self.__vowels)

        # STEP 0: Removal of plurals and other simplifications
        for suffix in self.__step0_suffixes:
            if word.endswith(suffix):
                if suffix in r1:
                    if suffix in ("ul", "ului"):
                        word = word[: -len(suffix)]
                        if suffix in rv:
                            rv = rv[: -len(suffix)]
                        else:
                            rv = ""

                    elif suffix in ("aua", "atei") or (
                        suffix == "ile" and word[-5:-3] != "ab"
                    ):
                        # "aua" -> "a", "atei" -> "at", "ile" -> "i"
                        word = word[:-2]

                    elif suffix in ("ea", "ele", "elor"):
                        word = suffix_replace(word, suffix, "e")
                        if suffix in rv:
                            rv = suffix_replace(rv, suffix, "e")
                        else:
                            rv = ""

                    elif suffix in ("ii", "iua", "iei", "iile", "iilor", "ilor"):
                        word = suffix_replace(word, suffix, "i")
                        if suffix in rv:
                            rv = suffix_replace(rv, suffix, "i")
                        else:
                            rv = ""

                    elif suffix in ("a\u0163ie", "a\u0163ia"):
                        # Both become "a\u0163i": drop the final letter.
                        word = word[:-1]
                # Only the first suffix the word ends with is considered.
                break

        # STEP 1: Reduction of combining suffixes
        # Repeated until a pass performs no replacement.
        while True:
            replacement_done = False

            for suffix in self.__step1_suffixes:
                if word.endswith(suffix):
                    if suffix in r1:
                        step1_success = True
                        replacement_done = True

                        if suffix in (
                            "abilitate",
                            "abilitati",
                            "abilit\u0103i",
                            "abilit\u0103\u0163i",
                        ):
                            word = suffix_replace(word, suffix, "abil")

                        elif suffix == "ibilitate":
                            # Keeps the leading "ibil" of the suffix.
                            word = word[:-5]

                        elif suffix in (
                            "ivitate",
                            "ivitati",
                            "ivit\u0103i",
                            "ivit\u0103\u0163i",
                        ):
                            word = suffix_replace(word, suffix, "iv")

                        elif suffix in (
                            "icitate",
                            "icitati",
                            "icit\u0103i",
                            "icit\u0103\u0163i",
                            "icator",
                            "icatori",
                            "iciv",
                            "iciva",
                            "icive",
                            "icivi",
                            "iciv\u0103",
                            "ical",
                            "icala",
                            "icale",
                            "icali",
                            "ical\u0103",
                        ):
                            word = suffix_replace(word, suffix, "ic")

                        elif suffix in (
                            "ativ",
                            "ativa",
                            "ative",
                            "ativi",
                            "ativ\u0103",
                            "a\u0163iune",
                            "atoare",
                            "ator",
                            "atori",
                            "\u0103toare",
                            "\u0103tor",
                            "\u0103tori",
                        ):
                            word = suffix_replace(word, suffix, "at")
                            if suffix in r2:
                                r2 = suffix_replace(r2, suffix, "at")

                        elif suffix in (
                            "itiv",
                            "itiva",
                            "itive",
                            "itivi",
                            "itiv\u0103",
                            "i\u0163iune",
                            "itoare",
                            "itor",
                            "itori",
                        ):
                            word = suffix_replace(word, suffix, "it")
                            if suffix in r2:
                                r2 = suffix_replace(r2, suffix, "it")
                    else:
                        # The matched suffix is not in r1: this also clears
                        # a success recorded by an earlier pass.
                        step1_success = False
                    break

            if not replacement_done:
                break

        # STEP 2: Removal of standard suffixes
        for suffix in self.__step2_suffixes:
            if word.endswith(suffix):
                if suffix in r2:
                    step2_success = True

                    if suffix in ("iune", "iuni"):
                        # Only changed when preceded by t-cedilla, which is
                        # then replaced by a plain "t".
                        if word[-5] == "\u0163":
                            word = "".join((word[:-5], "t"))

                    elif suffix in (
                        "ism",
                        "isme",
                        "ist",
                        "ista",
                        "iste",
                        "isti",
                        "ist\u0103",
                        "i\u015Fti",
                    ):
                        word = suffix_replace(word, suffix, "ist")

                    else:
                        word = word[: -len(suffix)]
                break

        # STEP 3: Removal of verb suffixes
        # Runs only if neither step 1 nor step 2 reported success.
        if not step1_success and not step2_success:
            for suffix in self.__step3_suffixes:
                if word.endswith(suffix):
                    if suffix in rv:
                        if suffix in (
                            "seser\u0103\u0163i",
                            "seser\u0103m",
                            "ser\u0103\u0163i",
                            "sese\u015Fi",
                            "seser\u0103",
                            "ser\u0103m",
                            "sesem",
                            "se\u015Fi",
                            "ser\u0103",
                            "sese",
                            "a\u0163i",
                            "e\u0163i",
                            "i\u0163i",
                            "\xE2\u0163i",
                            "sei",
                            "\u0103m",
                            "em",
                            "im",
                            "\xE2m",
                            "se",
                        ):
                            # These are deleted unconditionally.
                            word = word[: -len(suffix)]
                            rv = rv[: -len(suffix)]
                        else:
                            # The remaining suffixes are deleted only when
                            # the character before their first occurrence
                            # in rv is none of "aeio\u0103\xE2\xEE".
                            if (
                                not rv.startswith(suffix)
                                and rv[rv.index(suffix) - 1]
                                not in "aeio\u0103\xE2\xEE"
                            ):
                                word = word[: -len(suffix)]
                        break

        # STEP 4: Removal of final vowel
        for suffix in ("ie", "a", "e", "i", "\u0103"):
            if word.endswith(suffix):
                if suffix in rv:
                    word = word[: -len(suffix)]
                break

        # Undo the consonant marks set at the start.
        word = word.replace("I", "i").replace("U", "u")

        return word
class RussianStemmer(_LanguageSpecificStemmer):
    """
    The Russian Snowball stemmer.

    Words are transliterated into a Roman representation, stemmed, and
    transliterated back into Cyrillic.

    :cvar __perfective_gerund_suffixes: Suffixes to be deleted.
    :type __perfective_gerund_suffixes: tuple
    :cvar __adjectival_suffixes: Suffixes to be deleted.
    :type __adjectival_suffixes: tuple
    :cvar __restricted_adjectival_suffixes: The adjectival suffixes that are
        deleted only when preceded by "a" or "i^a".
    :type __restricted_adjectival_suffixes: frozenset
    :cvar __reflexive_suffixes: Suffixes to be deleted.
    :type __reflexive_suffixes: tuple
    :cvar __verb_suffixes: Suffixes to be deleted.
    :type __verb_suffixes: tuple
    :cvar __restricted_verb_suffixes: The verb suffixes that are deleted
        only when preceded by "a" or "i^a".
    :type __restricted_verb_suffixes: frozenset
    :cvar __noun_suffixes: Suffixes to be deleted.
    :type __noun_suffixes: tuple
    :cvar __superlative_suffixes: Suffixes to be deleted.
    :type __superlative_suffixes: tuple
    :cvar __derivational_suffixes: Suffixes to be deleted.
    :type __derivational_suffixes: tuple
    :cvar __cyrillic_to_roman_table: Translation table used to transliterate
        Cyrillic letters (both cases) into lower-case Roman strings.
    :type __cyrillic_to_roman_table: dict
    :cvar __roman_to_cyrillic_pairs: Ordered (roman, cyrillic) replacements
        used to transliterate a stem back into Cyrillic.
    :type __roman_to_cyrillic_pairs: tuple
    :note: A detailed description of the Russian
           stemming algorithm can be found under
           http://snowball.tartarus.org/algorithms/russian/stemmer.html
    """

    # NOTE: the order of the suffix tuples matters. The loops in stem()
    # walk them front to back, so a longer suffix must be listed before any
    # shorter suffix it ends with.
    __perfective_gerund_suffixes = (
        "ivshis'", "yvshis'", "vshis'",
        "ivshi", "yvshi", "vshi",
        "iv", "yv", "v",
    )

    __adjectival_suffixes = (
        "ui^ushchi^ui^u", "ui^ushchi^ai^a",
        "ui^ushchimi", "ui^ushchymi", "ui^ushchego", "ui^ushchogo",
        "ui^ushchemu", "ui^ushchomu", "ui^ushchikh", "ui^ushchykh",
        # Second entry was misspelled "ui^ushchaia"; every parallel entry
        # in this table uses "ai^a".
        "ui^ushchui^u", "ui^ushchai^a", "ui^ushchoi^u", "ui^ushchei^u",
        "i^ushchi^ui^u", "i^ushchi^ai^a",
        "ui^ushchee", "ui^ushchie", "ui^ushchye", "ui^ushchoe",
        "ui^ushchei`", "ui^ushchii`", "ui^ushchyi`", "ui^ushchoi`",
        "ui^ushchem", "ui^ushchim", "ui^ushchym", "ui^ushchom",
        "i^ushchimi", "i^ushchymi", "i^ushchego", "i^ushchogo",
        "i^ushchemu", "i^ushchomu", "i^ushchikh", "i^ushchykh",
        "i^ushchui^u", "i^ushchai^a", "i^ushchoi^u", "i^ushchei^u",
        "i^ushchee", "i^ushchie", "i^ushchye", "i^ushchoe",
        "i^ushchei`", "i^ushchii`", "i^ushchyi`", "i^ushchoi`",
        "i^ushchem", "i^ushchim", "i^ushchym", "i^ushchom",
        "shchi^ui^u", "shchi^ai^a",
        "ivshi^ui^u", "ivshi^ai^a",
        "yvshi^ui^u", "yvshi^ai^a",
        "shchimi", "shchymi", "shchego", "shchogo",
        "shchemu", "shchomu", "shchikh", "shchykh",
        "shchui^u", "shchai^a", "shchoi^u", "shchei^u",
        "ivshimi", "ivshymi", "ivshego", "ivshogo",
        "ivshemu", "ivshomu", "ivshikh", "ivshykh",
        "ivshui^u", "ivshai^a", "ivshoi^u", "ivshei^u",
        "yvshimi", "yvshymi", "yvshego", "yvshogo",
        "yvshemu", "yvshomu", "yvshikh", "yvshykh",
        "yvshui^u", "yvshai^a", "yvshoi^u", "yvshei^u",
        "vshi^ui^u", "vshi^ai^a",
        "shchee", "shchie", "shchye", "shchoe",
        "shchei`", "shchii`", "shchyi`", "shchoi`",
        "shchem", "shchim", "shchym", "shchom",
        "ivshee", "ivshie", "ivshye", "ivshoe",
        "ivshei`", "ivshii`", "ivshyi`", "ivshoi`",
        "ivshem", "ivshim", "ivshym", "ivshom",
        "yvshee", "yvshie", "yvshye", "yvshoe",
        "yvshei`", "yvshii`", "yvshyi`", "yvshoi`",
        "yvshem", "yvshim", "yvshym", "yvshom",
        "vshimi", "vshymi", "vshego", "vshogo",
        "vshemu", "vshomu", "vshikh", "vshykh",
        "vshui^u", "vshai^a", "vshoi^u", "vshei^u",
        "emi^ui^u", "emi^ai^a", "nni^ui^u", "nni^ai^a",
        "vshee", "vshie", "vshye", "vshoe",
        "vshei`", "vshii`", "vshyi`", "vshoi`",
        "vshem", "vshim", "vshym", "vshom",
        "emimi", "emymi", "emego", "emogo",
        "ememu", "emomu", "emikh", "emykh",
        "emui^u", "emai^a", "emoi^u", "emei^u",
        "nnimi", "nnymi", "nnego", "nnogo",
        "nnemu", "nnomu", "nnikh", "nnykh",
        "nnui^u", "nnai^a", "nnoi^u", "nnei^u",
        "emee", "emie", "emye", "emoe",
        "emei`", "emii`", "emyi`", "emoi`",
        "emem", "emim", "emym", "emom",
        "nnee", "nnie", "nnye", "nnoe",
        "nnei`", "nnii`", "nnyi`", "nnoi`",
        "nnem", "nnim", "nnym", "nnom",
        "i^ui^u", "i^ai^a",
        "imi", "ymi", "ego", "ogo", "emu", "omu", "ikh", "ykh",
        "ui^u", "ai^a", "oi^u", "ei^u",
        "ee", "ie", "ye", "oe",
        "ei`", "ii`", "yi`", "oi`",
        "em", "im", "ym", "om",
    )

    # Adjectival suffixes made of one of five participle markers followed by
    # an adjective ending (5 x 26 = 130 entries). stem() deletes these only
    # when they are preceded by "a" or "i^a".
    __restricted_adjectival_suffixes = frozenset(
        marker + ending
        for marker in ("i^ushch", "shch", "vsh", "em", "nn")
        for ending in (
            "i^ui^u", "i^ai^a",
            "imi", "ymi", "ego", "ogo", "emu", "omu", "ikh", "ykh",
            "ui^u", "ai^a", "oi^u", "ei^u",
            "ee", "ie", "ye", "oe",
            "ei`", "ii`", "yi`", "oi`",
            "em", "im", "ym", "om",
        )
    )

    __reflexive_suffixes = ("si^a", "s'")

    __verb_suffixes = (
        "esh'", "ei`te", "ui`te", "ui^ut", "ish'", "ete", "i`te", "i^ut",
        "nno", "ila", "yla", "ena", "ite", "ili", "yli", "ilo", "ylo",
        "eno", "i^at", "uet", "eny", "it'", "yt'", "ui^u",
        "la", "na", "li", "em", "lo", "no", "et", "ny", "t'",
        "ei`", "ui`", "il", "yl", "im", "ym", "en", "it", "yt",
        "i^u", "i`", "l", "n",
    )

    # Verb suffixes that stem() deletes only when preceded by "a" or "i^a".
    __restricted_verb_suffixes = frozenset(
        (
            "la", "na", "ete", "i`te", "li", "i`", "l", "em", "n",
            "lo", "no", "et", "i^ut", "ny", "t'", "esh'", "nno",
        )
    )

    __noun_suffixes = (
        "ii^ami", "ii^akh", "i^ami", "ii^am", "i^akh", "ami", "iei`",
        "i^am", "iem", "akh", "ii^u", "'i^u", "ii^a", "'i^a",
        "ev", "ov", "ie", "'e", "ei", "ii", "ei`", "oi`", "ii`",
        "em", "am", "om", "i^u", "i^a",
        "a", "e", "i", "i`", "o", "u", "y", "'",
    )

    __superlative_suffixes = ("ei`she", "ei`sh")

    __derivational_suffixes = ("ost'", "ost")

    # Each Cyrillic letter, in lower and upper case, maps to the same
    # lower-case Roman string.
    __cyrillic_to_roman_table = str.maketrans(
        {
            letter: roman
            for lower, roman in (
                ("\u0430", "a"),
                ("\u0431", "b"),
                ("\u0432", "v"),
                ("\u0433", "g"),
                ("\u0434", "d"),
                ("\u0435", "e"),
                ("\u0451", "e"),
                ("\u0436", "zh"),
                ("\u0437", "z"),
                ("\u0438", "i"),
                ("\u0439", "i`"),
                ("\u043A", "k"),
                ("\u043B", "l"),
                ("\u043C", "m"),
                ("\u043D", "n"),
                ("\u043E", "o"),
                ("\u043F", "p"),
                ("\u0440", "r"),
                ("\u0441", "s"),
                ("\u0442", "t"),
                ("\u0443", "u"),
                ("\u0444", "f"),
                ("\u0445", "kh"),
                ("\u0446", "t^s"),
                ("\u0447", "ch"),
                ("\u0448", "sh"),
                ("\u0449", "shch"),
                ("\u044A", "''"),
                ("\u044B", "y"),
                ("\u044C", "'"),
                ("\u044D", "e`"),
                ("\u044E", "i^u"),
                ("\u044F", "i^a"),
            )
            for letter in (lower, lower.upper())
        }
    )

    # Applied strictly in this order: multi-character sequences come before
    # the single letters they contain (e.g. "shch" before "sh" and "ch",
    # "e`" before "e", "''" before "'").
    __roman_to_cyrillic_pairs = (
        ("i^u", "\u044E"),
        ("i^a", "\u044F"),
        ("shch", "\u0449"),
        ("kh", "\u0445"),
        ("t^s", "\u0446"),
        ("ch", "\u0447"),
        ("e`", "\u044D"),
        ("i`", "\u0439"),
        ("sh", "\u0448"),
        ("k", "\u043A"),
        ("e", "\u0435"),
        ("zh", "\u0436"),
        ("a", "\u0430"),
        ("b", "\u0431"),
        ("v", "\u0432"),
        ("g", "\u0433"),
        ("d", "\u0434"),
        ("z", "\u0437"),
        ("i", "\u0438"),
        ("l", "\u043B"),
        ("m", "\u043C"),
        ("n", "\u043D"),
        ("o", "\u043E"),
        ("p", "\u043F"),
        ("r", "\u0440"),
        ("s", "\u0441"),
        ("t", "\u0442"),
        ("u", "\u0443"),
        ("f", "\u0444"),
        ("''", "\u044A"),
        ("y", "\u044B"),
        ("'", "\u044C"),
    )

    def stem(self, word):
        """
        Stem a Russian word and return the stemmed form.

        A word found in ``self.stopwords``, or one that contains no
        character above U+00FF, is returned unchanged.

        :param word: The word that is stemmed.
        :type word: str or unicode
        :return: The stemmed form.
        :rtype: unicode
        """
        if word in self.stopwords:
            return word

        # Only words with at least one character outside Latin-1 are stemmed.
        if not any(ord(char) > 255 for char in word):
            return word

        word = self.__cyrillic_to_roman(word)

        step1_success = False
        adjectival_removed = False
        verb_removed = False
        undouble_success = False
        superlative_removed = False

        rv, r2 = self.__regions_russian(word)

        # Step 1
        for suffix in self.__perfective_gerund_suffixes:
            if not rv.endswith(suffix):
                continue
            if suffix in ("v", "vshi", "vshis'") and not self.__follows_a_or_ya(
                rv, suffix
            ):
                # Restricted suffix without "a"/"i^a" before it: keep
                # looking at the remaining (shorter) suffixes.
                continue
            word = word[: -len(suffix)]
            r2 = r2[: -len(suffix)]
            rv = rv[: -len(suffix)]
            step1_success = True
            break

        if not step1_success:
            for suffix in self.__reflexive_suffixes:
                if rv.endswith(suffix):
                    word = word[: -len(suffix)]
                    r2 = r2[: -len(suffix)]
                    rv = rv[: -len(suffix)]
                    break

            for suffix in self.__adjectival_suffixes:
                if not rv.endswith(suffix):
                    continue
                if (
                    suffix in self.__restricted_adjectival_suffixes
                    and not self.__follows_a_or_ya(rv, suffix)
                ):
                    continue
                word = word[: -len(suffix)]
                r2 = r2[: -len(suffix)]
                rv = rv[: -len(suffix)]
                adjectival_removed = True
                break

            if not adjectival_removed:
                for suffix in self.__verb_suffixes:
                    if not rv.endswith(suffix):
                        continue
                    if (
                        suffix in self.__restricted_verb_suffixes
                        and not self.__follows_a_or_ya(rv, suffix)
                    ):
                        continue
                    word = word[: -len(suffix)]
                    r2 = r2[: -len(suffix)]
                    rv = rv[: -len(suffix)]
                    verb_removed = True
                    break

            if not adjectival_removed and not verb_removed:
                for suffix in self.__noun_suffixes:
                    if rv.endswith(suffix):
                        word = word[: -len(suffix)]
                        r2 = r2[: -len(suffix)]
                        rv = rv[: -len(suffix)]
                        break

        # Step 2
        if rv.endswith("i"):
            word = word[:-1]
            r2 = r2[:-1]

        # Step 3
        for suffix in self.__derivational_suffixes:
            if r2.endswith(suffix):
                word = word[: -len(suffix)]
                break

        # Step 4
        if word.endswith("nn"):
            word = word[:-1]
            undouble_success = True

        if not undouble_success:
            for suffix in self.__superlative_suffixes:
                if word.endswith(suffix):
                    word = word[: -len(suffix)]
                    superlative_removed = True
                    break
            # Removing a superlative suffix may expose a double "n".
            if word.endswith("nn"):
                word = word[:-1]

        if not undouble_success and not superlative_removed:
            if word.endswith("'"):
                word = word[:-1]

        word = self.__roman_to_cyrillic(word)

        return word

    @staticmethod
    def __follows_a_or_ya(rv, suffix):
        """
        Tell whether the suffix at the end of RV is preceded by "a" or "i^a".

        :param rv: The (transliterated) region RV, which ends with suffix.
        :type rv: str or unicode
        :param suffix: The suffix found at the end of rv.
        :type suffix: str or unicode
        :return: True if the text right before the suffix is "i^a" or "a".
        :rtype: bool
        :note: This helper method is invoked by the stem method of the subclass
               RussianStemmer. It is not to be invoked directly!
        """
        end = -len(suffix)
        return rv[end - 3 : end] == "i^a" or rv[end - 1 : end] == "a"

    def __regions_russian(self, word):
        """
        Return the regions RV and R2 which are used by the Russian stemmer.

        In any word, RV is the region after the first vowel,
        or the end of the word if it contains no vowel.

        R2 is the region after the first non-vowel following
        a vowel in R1, or the end of the word if there is no such non-vowel.

        R1 is the region after the first non-vowel following a vowel,
        or the end of the word if there is no such non-vowel.

        :param word: The Russian word whose regions RV and R2 are determined.
        :type word: str or unicode
        :return: the regions RV and R2 for the respective Russian word.
        :rtype: tuple
        :note: This helper method is invoked by the stem method of the subclass
               RussianStemmer. It is not to be invoked directly!
        """
        r1 = ""
        r2 = ""
        rv = ""

        vowels = ("A", "U", "E", "a", "e", "i", "o", "u", "y")
        # Collapse the three-/two-character vowel transliterations into
        # single placeholder letters so each vowel is one character while
        # scanning; they are expanded again before returning.
        word = word.replace("i^a", "A").replace("i^u", "U").replace("e`", "E")

        for i in range(1, len(word)):
            if word[i] not in vowels and word[i - 1] in vowels:
                r1 = word[i + 1 :]
                break

        for i in range(1, len(r1)):
            if r1[i] not in vowels and r1[i - 1] in vowels:
                r2 = r1[i + 1 :]
                break

        for i in range(len(word)):
            if word[i] in vowels:
                rv = word[i + 1 :]
                break

        r2 = r2.replace("A", "i^a").replace("U", "i^u").replace("E", "e`")
        rv = rv.replace("A", "i^a").replace("U", "i^u").replace("E", "e`")

        return (rv, r2)

    def __cyrillic_to_roman(self, word):
        """
        Transliterate a Russian word into the Roman alphabet.

        A Russian word whose letters consist of the Cyrillic
        alphabet are transliterated into the Roman alphabet
        in order to ease the forthcoming stemming process.
        Upper- and lower-case Cyrillic letters yield the same
        lower-case Roman output; other characters are left as they are.

        :param word: The word that is transliterated.
        :type word: unicode
        :return: the transliterated word.
        :rtype: unicode
        :note: This helper method is invoked by the stem method of the subclass
               RussianStemmer. It is not to be invoked directly!
        """
        return word.translate(self.__cyrillic_to_roman_table)

    def __roman_to_cyrillic(self, word):
        """
        Transliterate a Russian word back into the Cyrillic alphabet.

        A Russian word formerly transliterated into the Roman alphabet
        in order to ease the stemming process, is transliterated back
        into the Cyrillic alphabet, its original form.
        The output uses lower-case Cyrillic letters only.

        :param word: The word that is transliterated.
        :type word: str or unicode
        :return: word, the transliterated word.
        :rtype: unicode
        :note: This helper method is invoked by the stem method of the subclass
               RussianStemmer. It is not to be invoked directly!
        """
        for roman, cyrillic in self.__roman_to_cyrillic_pairs:
            word = word.replace(roman, cyrillic)

        return word
class SpanishStemmer(_StandardStemmer): | |
""" | |
The Spanish Snowball stemmer. | |
:cvar __vowels: The Spanish vowels. | |
:type __vowels: unicode | |
:cvar __step0_suffixes: Suffixes to be deleted in step 0 of the algorithm. | |
:type __step0_suffixes: tuple | |
:cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm. | |
:type __step1_suffixes: tuple | |
:cvar __step2a_suffixes: Suffixes to be deleted in step 2a of the algorithm. | |
:type __step2a_suffixes: tuple | |
:cvar __step2b_suffixes: Suffixes to be deleted in step 2b of the algorithm. | |
:type __step2b_suffixes: tuple | |
:cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm. | |
:type __step3_suffixes: tuple | |
:note: A detailed description of the Spanish | |
stemming algorithm can be found under | |
http://snowball.tartarus.org/algorithms/spanish/stemmer.html | |
""" | |
__vowels = "aeiou\xE1\xE9\xED\xF3\xFA\xFC" | |
__step0_suffixes = ( | |
"selas", | |
"selos", | |
"sela", | |
"selo", | |
"las", | |
"les", | |
"los", | |
"nos", | |
"me", | |
"se", | |
"la", | |
"le", | |
"lo", | |
) | |
__step1_suffixes = ( | |
"amientos", | |
"imientos", | |
"amiento", | |
"imiento", | |
"acion", | |
"aciones", | |
"uciones", | |
"adoras", | |
"adores", | |
"ancias", | |
"log\xEDas", | |
"encias", | |
"amente", | |
"idades", | |
"anzas", | |
"ismos", | |
"ables", | |
"ibles", | |
"istas", | |
"adora", | |
"aci\xF3n", | |
"antes", | |
"ancia", | |
"log\xEDa", | |
"uci\xf3n", | |
"encia", | |
"mente", | |
"anza", | |
"icos", | |
"icas", | |
"ismo", | |
"able", | |
"ible", | |
"ista", | |
"osos", | |
"osas", | |
"ador", | |
"ante", | |
"idad", | |
"ivas", | |
"ivos", | |
"ico", | |
"ica", | |
"oso", | |
"osa", | |
"iva", | |
"ivo", | |
) | |
__step2a_suffixes = ( | |
"yeron", | |
"yendo", | |
"yamos", | |
"yais", | |
"yan", | |
"yen", | |
"yas", | |
"yes", | |
"ya", | |
"ye", | |
"yo", | |
"y\xF3", | |
) | |
__step2b_suffixes = ( | |
"ar\xEDamos", | |
"er\xEDamos", | |
"ir\xEDamos", | |
"i\xE9ramos", | |
"i\xE9semos", | |
"ar\xEDais", | |
"aremos", | |
"er\xEDais", | |
"eremos", | |
"ir\xEDais", | |
"iremos", | |
"ierais", | |
"ieseis", | |
"asteis", | |
"isteis", | |
"\xE1bamos", | |
"\xE1ramos", | |
"\xE1semos", | |
"ar\xEDan", | |
"ar\xEDas", | |
"ar\xE9is", | |
"er\xEDan", | |
"er\xEDas", | |
"er\xE9is", | |
"ir\xEDan", | |
"ir\xEDas", | |
"ir\xE9is", | |
"ieran", | |
"iesen", | |
"ieron", | |
"iendo", | |
"ieras", | |
"ieses", | |
"abais", | |
"arais", | |
"aseis", | |
"\xE9amos", | |
"ar\xE1n", | |
"ar\xE1s", | |
"ar\xEDa", | |
"er\xE1n", | |
"er\xE1s", | |
"er\xEDa", | |
"ir\xE1n", | |
"ir\xE1s", | |
"ir\xEDa", | |
"iera", | |
"iese", | |
"aste", | |
"iste", | |
"aban", | |
"aran", | |
"asen", | |
"aron", | |
"ando", | |
"abas", | |
"adas", | |
"idas", | |
"aras", | |
"ases", | |
"\xEDais", | |
"ados", | |
"idos", | |
"amos", | |
"imos", | |
"emos", | |
"ar\xE1", | |
"ar\xE9", | |
"er\xE1", | |
"er\xE9", | |
"ir\xE1", | |
"ir\xE9", | |
"aba", | |
"ada", | |
"ida", | |
"ara", | |
"ase", | |
"\xEDan", | |
"ado", | |
"ido", | |
"\xEDas", | |
"\xE1is", | |
"\xE9is", | |
"\xEDa", | |
"ad", | |
"ed", | |
"id", | |
"an", | |
"i\xF3", | |
"ar", | |
"er", | |
"ir", | |
"as", | |
"\xEDs", | |
"en", | |
"es", | |
) | |
__step3_suffixes = ("os", "a", "e", "o", "\xE1", "\xE9", "\xED", "\xF3") | |
def stem(self, word): | |
""" | |
Stem a Spanish word and return the stemmed form. | |
:param word: The word that is stemmed. | |
:type word: str or unicode | |
:return: The stemmed form. | |
:rtype: unicode | |
""" | |
word = word.lower() | |
if word in self.stopwords: | |
return word | |
step1_success = False | |
r1, r2 = self._r1r2_standard(word, self.__vowels) | |
rv = self._rv_standard(word, self.__vowels) | |
# STEP 0: Attached pronoun | |
for suffix in self.__step0_suffixes: | |
if not (word.endswith(suffix) and rv.endswith(suffix)): | |
continue | |
if ( | |
rv[: -len(suffix)].endswith( | |
( | |
"ando", | |
"\xE1ndo", | |
"ar", | |
"\xE1r", | |
"er", | |
"\xE9r", | |
"iendo", | |
"i\xE9ndo", | |
"ir", | |
"\xEDr", | |
) | |
) | |
) or ( | |
rv[: -len(suffix)].endswith("yendo") | |
and word[: -len(suffix)].endswith("uyendo") | |
): | |
word = self.__replace_accented(word[: -len(suffix)]) | |
r1 = self.__replace_accented(r1[: -len(suffix)]) | |
r2 = self.__replace_accented(r2[: -len(suffix)]) | |
rv = self.__replace_accented(rv[: -len(suffix)]) | |
break | |
# STEP 1: Standard suffix removal | |
for suffix in self.__step1_suffixes: | |
if not word.endswith(suffix): | |
continue | |
if suffix == "amente" and r1.endswith(suffix): | |
step1_success = True | |
word = word[:-6] | |
r2 = r2[:-6] | |
rv = rv[:-6] | |
if r2.endswith("iv"): | |
word = word[:-2] | |
r2 = r2[:-2] | |
rv = rv[:-2] | |
if r2.endswith("at"): | |
word = word[:-2] | |
rv = rv[:-2] | |
elif r2.endswith(("os", "ic", "ad")): | |
word = word[:-2] | |
rv = rv[:-2] | |
elif r2.endswith(suffix): | |
step1_success = True | |
if suffix in ( | |
"adora", | |
"ador", | |
"aci\xF3n", | |
"adoras", | |
"adores", | |
"acion", | |
"aciones", | |
"ante", | |
"antes", | |
"ancia", | |
"ancias", | |
): | |
word = word[: -len(suffix)] | |
r2 = r2[: -len(suffix)] | |
rv = rv[: -len(suffix)] | |
if r2.endswith("ic"): | |
word = word[:-2] | |
rv = rv[:-2] | |
elif suffix in ("log\xEDa", "log\xEDas"): | |
word = suffix_replace(word, suffix, "log") | |
rv = suffix_replace(rv, suffix, "log") | |
elif suffix in ("uci\xF3n", "uciones"): | |
word = suffix_replace(word, suffix, "u") | |
rv = suffix_replace(rv, suffix, "u") | |
elif suffix in ("encia", "encias"): | |
word = suffix_replace(word, suffix, "ente") | |
rv = suffix_replace(rv, suffix, "ente") | |
elif suffix == "mente": | |
word = word[: -len(suffix)] | |
r2 = r2[: -len(suffix)] | |
rv = rv[: -len(suffix)] | |
if r2.endswith(("ante", "able", "ible")): | |
word = word[:-4] | |
rv = rv[:-4] | |
elif suffix in ("idad", "idades"): | |
word = word[: -len(suffix)] | |
r2 = r2[: -len(suffix)] | |
rv = rv[: -len(suffix)] | |
for pre_suff in ("abil", "ic", "iv"): | |
if r2.endswith(pre_suff): | |
word = word[: -len(pre_suff)] | |
rv = rv[: -len(pre_suff)] | |
elif suffix in ("ivo", "iva", "ivos", "ivas"): | |
word = word[: -len(suffix)] | |
r2 = r2[: -len(suffix)] | |
rv = rv[: -len(suffix)] | |
if r2.endswith("at"): | |
word = word[:-2] | |
rv = rv[:-2] | |
else: | |
word = word[: -len(suffix)] | |
rv = rv[: -len(suffix)] | |
break | |
# STEP 2a: Verb suffixes beginning 'y' | |
if not step1_success: | |
for suffix in self.__step2a_suffixes: | |
if rv.endswith(suffix) and word[-len(suffix) - 1 : -len(suffix)] == "u": | |
word = word[: -len(suffix)] | |
rv = rv[: -len(suffix)] | |
break | |
# STEP 2b: Other verb suffixes | |
for suffix in self.__step2b_suffixes: | |
if rv.endswith(suffix): | |
word = word[: -len(suffix)] | |
rv = rv[: -len(suffix)] | |
if suffix in ("en", "es", "\xE9is", "emos"): | |
if word.endswith("gu"): | |
word = word[:-1] | |
if rv.endswith("gu"): | |
rv = rv[:-1] | |
break | |
# STEP 3: Residual suffix | |
for suffix in self.__step3_suffixes: | |
if rv.endswith(suffix): | |
word = word[: -len(suffix)] | |
if suffix in ("e", "\xE9"): | |
rv = rv[: -len(suffix)] | |
if word[-2:] == "gu" and rv.endswith("u"): | |
word = word[:-1] | |
break | |
word = self.__replace_accented(word) | |
return word | |
def __replace_accented(self, word):
    """
    Strip the acute accent from every accented vowel in a word.

    :param word: A spanish word, with or without accents
    :type word: str or unicode
    :return: the word with each of the letters á, é, í, ó, ú swapped for
             its plain counterpart (a, e, i, o, u); every other character
             is left untouched
    :rtype: str or unicode
    """
    # One translation pass: each accented vowel maps to the plain vowel
    # at the same position in the second string.
    accent_table = str.maketrans("\xE1\xE9\xED\xF3\xFA", "aeiou")
    return word.translate(accent_table)
class SwedishStemmer(_ScandinavianStemmer):
    """
    Snowball stemmer for the Swedish language.

    :cvar __vowels: The letters treated as Swedish vowels.
    :type __vowels: unicode
    :cvar __s_ending: Letters after which a word final 's' may be removed.
    :type __s_ending: unicode
    :cvar __step1_suffixes: Suffixes removed in step 1 of the algorithm,
                            listed from longest to shortest.
    :type __step1_suffixes: tuple
    :cvar __step2_suffixes: Suffixes that lose their last letter in step 2
                            of the algorithm.
    :type __step2_suffixes: tuple
    :cvar __step3_suffixes: Suffixes removed or shortened in step 3 of the
                            algorithm.
    :type __step3_suffixes: tuple
    :note: The Swedish stemming algorithm is described in detail at
           http://snowball.tartarus.org/algorithms/swedish/stemmer.html

    """

    __vowels = "aeiouy\xE4\xE5\xF6"
    __s_ending = "bcdfghjklmnoprtvy"
    __step1_suffixes = (
        "heterna",
        "hetens",
        "heter",
        "heten",
        "anden",
        "arnas",
        "ernas",
        "ornas",
        "andes",
        "andet",
        "arens",
        "arna",
        "erna",
        "orna",
        "ande",
        "arne",
        "aste",
        "aren",
        "ades",
        "erns",
        "ade",
        "are",
        "ern",
        "ens",
        "het",
        "ast",
        "ad",
        "en",
        "ar",
        "er",
        "or",
        "as",
        "es",
        "at",
        "a",
        "e",
        "s",
    )
    __step2_suffixes = ("dd", "gd", "nn", "dt", "gt", "kt", "tt")
    __step3_suffixes = ("fullt", "l\xF6st", "els", "lig", "ig")

    def stem(self, word):
        """
        Reduce a Swedish word to its stem.

        :param word: The word to stem.
        :type word: str or unicode
        :return: The stem of the word. A stopword is returned lowercased
                 but otherwise unchanged.
        :rtype: unicode
        """
        word = word.lower()
        if word in self.stopwords:
            return word

        r1 = self._r1_scandinavian(word, self.__vowels)

        # STEP 1: only the first listed suffix that ends R1 is considered.
        hit = next(
            (sfx for sfx in self.__step1_suffixes if r1.endswith(sfx)), None
        )
        if hit == "s":
            # A bare 's' is dropped only after one of the permitted letters.
            if word[-2] in self.__s_ending:
                word, r1 = word[:-1], r1[:-1]
        elif hit is not None:
            cut = len(hit)
            word, r1 = word[:-cut], r1[:-cut]

        # STEP 2: a matching ending loses just its final letter.
        if r1.endswith(self.__step2_suffixes):
            word, r1 = word[:-1], r1[:-1]

        # STEP 3: R1 is not updated here because no later step reads it.
        hit = next(
            (sfx for sfx in self.__step3_suffixes if r1.endswith(sfx)), None
        )
        if hit in ("els", "lig", "ig"):
            word = word[: -len(hit)]
        elif hit in ("fullt", "l\xF6st"):
            # Only the trailing 't' is removed from these two endings.
            word = word[:-1]

        return word
def demo():
    """
    Run an interactive demonstration of the Snowball stemmers.

    The user is asked repeatedly for a language name. For a supported
    language, the first 300 words of the matching Universal Declaration
    of Human Rights text (part of the NLTK corpus collection) are
    stemmed, and the original and the stemmed text are printed one after
    the other. Entering 'exit' ends the demonstration.
    """

    from nltk.corpus import udhr

    # Maps each supported language name to its UDHR corpus file id.
    udhr_corpus = {
        "arabic": "Arabic_Alarabia-Arabic",
        "danish": "Danish_Dansk-Latin1",
        "dutch": "Dutch_Nederlands-Latin1",
        "english": "English-Latin1",
        "finnish": "Finnish_Suomi-Latin1",
        "french": "French_Francais-Latin1",
        "german": "German_Deutsch-Latin1",
        "hungarian": "Hungarian_Magyar-UTF8",
        "italian": "Italian_Italiano-Latin1",
        "norwegian": "Norwegian-Latin1",
        "porter": "English-Latin1",
        "portuguese": "Portuguese_Portugues-Latin1",
        "romanian": "Romanian_Romana-Latin2",
        "russian": "Russian-UTF8",
        "spanish": "Spanish-Latin1",
        "swedish": "Swedish_Svenska-Latin1",
    }

    def wrap(text):
        # Replace the whitespace that follows each run of at most 70
        # characters with a newline, then drop trailing whitespace.
        return re.sub(r"(.{,70})\s", r"\1\n", text + " ").rstrip()

    prompt = (
        "Please enter the name of the language to be demonstrated\n"
        + "/".join(SnowballStemmer.languages)
        + "\n(enter 'exit' in order to leave): "
    )

    for banner_line in (
        "\n",
        "******************************",
        "Demo for the Snowball stemmers",
        "******************************",
    ):
        print(banner_line)

    # Keep prompting until the user types the sentinel 'exit'.
    for language in iter(lambda: input(prompt), "exit"):
        if language not in SnowballStemmer.languages:
            print(
                "\nOops, there is no stemmer for this language. Please try again.\n"
            )
            continue

        stemmer = SnowballStemmer(language)
        words = udhr.words(udhr_corpus[language])[:300]

        stemmed_text = wrap(" ".join(stemmer.stem(token) for token in words))
        original_text = wrap(" ".join(words))

        for output_line in (
            "\n",
            "-" * 70,
            "ORIGINAL".center(70),
            original_text,
            "\n\n",
            "STEMMED RESULTS".center(70),
            stemmed_text,
            "-" * 70,
            "\n",
        ):
            print(output_line)