- """
- Porter Stemmer
- This is the Porter stemming algorithm. It follows the algorithm
- presented in
- Porter, M. "An algorithm for suffix stripping." Program 14.3 (1980): 130-137.
- with some optional deviations that can be turned on or off with the
- `mode` argument to the constructor.
- Martin Porter, the algorithm's inventor, maintains a web page about the
- algorithm at
- http://www.tartarus.org/~martin/PorterStemmer/
- which includes another Python implementation and other implementations
- in many languages.
- """
- __docformat__ = "plaintext"
- import re
- from nltk.stem.api import StemmerI
- class PorterStemmer(StemmerI):
- """
- A word stemmer based on the Porter stemming algorithm.
- Porter, M. "An algorithm for suffix stripping."
- Program 14.3 (1980): 130-137.
- See http://www.tartarus.org/~martin/PorterStemmer/ for the homepage
- of the algorithm.
- Martin Porter has endorsed several modifications to the Porter
- algorithm since writing his original paper, and those extensions are
- included in the implementations on his website. Additionally, others
- have proposed further improvements to the algorithm, including NLTK
- contributors. There are thus three modes that can be selected by
- passing the appropriate constant to the class constructor's `mode`
- attribute:
- PorterStemmer.ORIGINAL_ALGORITHM
- - Implementation that is faithful to the original paper.
- Note that Martin Porter has deprecated this version of the
- algorithm. Martin distributes implementations of the Porter
- Stemmer in many languages, hosted at:
- http://www.tartarus.org/~martin/PorterStemmer/
- and all of these implementations include his extensions. He
- strongly recommends against using the original, published
- version of the algorithm; only use this mode if you clearly
- understand why you are choosing to do so.
- PorterStemmer.MARTIN_EXTENSIONS
- - Implementation that only uses the modifications to the
- algorithm that are included in the implementations on Martin
- Porter's website. He has declared Porter frozen, so the
- behaviour of those implementations should never change.
- PorterStemmer.NLTK_EXTENSIONS (default)
- - Implementation that includes further improvements devised by
- NLTK contributors or taken from other modified implementations
- found on the web.
- For the best stemming, you should use the default NLTK_EXTENSIONS
- version. However, if you need to get the same results as either the
- original algorithm or one of Martin Porter's hosted versions for
- compatibility with an existing implementation or dataset, you can use
- one of the other modes instead.
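
    Illustrative usage (doctest-style; the outputs shown are those
    produced by this implementation):

        >>> from nltk.stem.porter import PorterStemmer
        >>> PorterStemmer().stem("fly")  # default NLTK_EXTENSIONS mode
        'fli'
        >>> PorterStemmer(PorterStemmer.ORIGINAL_ALGORITHM).stem("fly")
        'fly'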
- """

    # Modes the Stemmer can be instantiated in
    NLTK_EXTENSIONS = "NLTK_EXTENSIONS"
    MARTIN_EXTENSIONS = "MARTIN_EXTENSIONS"
    ORIGINAL_ALGORITHM = "ORIGINAL_ALGORITHM"

    def __init__(self, mode=NLTK_EXTENSIONS):
        if mode not in (
            self.NLTK_EXTENSIONS,
            self.MARTIN_EXTENSIONS,
            self.ORIGINAL_ALGORITHM,
        ):
            raise ValueError(
                "Mode must be one of PorterStemmer.NLTK_EXTENSIONS, "
                "PorterStemmer.MARTIN_EXTENSIONS, or "
                "PorterStemmer.ORIGINAL_ALGORITHM"
            )

        self.mode = mode

        if self.mode == self.NLTK_EXTENSIONS:
            # This is a table of irregular forms. It is quite short,
            # but still reflects the errors actually drawn to Martin
            # Porter's attention over a 20 year period!
            irregular_forms = {
                "sky": ["sky", "skies"],
                "die": ["dying"],
                "lie": ["lying"],
                "tie": ["tying"],
                "news": ["news"],
                "inning": ["innings", "inning"],
                "outing": ["outings", "outing"],
                "canning": ["cannings", "canning"],
                "howe": ["howe"],
                "proceed": ["proceed"],
                "exceed": ["exceed"],
                "succeed": ["succeed"],
            }

            self.pool = {}
            for key in irregular_forms:
                for val in irregular_forms[key]:
                    self.pool[val] = key

        self.vowels = frozenset(["a", "e", "i", "o", "u"])

    def _is_consonant(self, word, i):
        """Returns True if word[i] is a consonant, False otherwise

        A consonant is defined in the paper as follows:

            A consonant in a word is a letter other than A, E, I, O or
            U, and other than Y preceded by a consonant. (The fact that
            the term `consonant' is defined to some extent in terms of
            itself does not make it ambiguous.) So in TOY the consonants
            are T and Y, and in SYZYGY they are S, Z and G. If a letter
            is not a consonant it is a vowel.
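
        Illustrative example, matching the paper's TOY and SYZYGY cases
        (a private helper, shown here only to clarify the definition):

            >>> stemmer = PorterStemmer()
            >>> stemmer._is_consonant("toy", 2)     # Y preceded by a vowel
            True
            >>> stemmer._is_consonant("syzygy", 1)  # Y preceded by a consonant
            False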
- """
- if word[i] in self.vowels:
- return False
- if word[i] == "y":
- if i == 0:
- return True
- else:
- return not self._is_consonant(word, i - 1)
- return True

    def _measure(self, stem):
        """Returns the 'measure' of stem, per definition in the paper

        From the paper:

            A consonant will be denoted by c, a vowel by v. A list
            ccc... of length greater than 0 will be denoted by C, and a
            list vvv... of length greater than 0 will be denoted by V.
            Any word, or part of a word, therefore has one of the four
            forms:

                CVCV ... C
                CVCV ... V
                VCVC ... C
                VCVC ... V

            These may all be represented by the single form

                [C]VCVC ... [V]

            where the square brackets denote arbitrary presence of their
            contents. Using (VC){m} to denote VC repeated m times, this
            may again be written as

                [C](VC){m}[V].

            m will be called the "measure" of any word or word part when
            represented in this form. The case m = 0 covers the null
            word. Here are some examples:

                m=0    TR,  EE,  TREE,  Y,  BY.
                m=1    TROUBLE,  OATS,  TREES,  IVY.
                m=2    TROUBLES,  PRIVATE,  OATEN,  ORRERY.
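
        Illustrative example, matching the paper's cases above:

            >>> stemmer = PorterStemmer()
            >>> stemmer._measure("tree")
            0
            >>> stemmer._measure("trouble")
            1
            >>> stemmer._measure("troubles")
            2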
- """
- cv_sequence = ""
- # Construct a string of 'c's and 'v's representing whether each
- # character in `stem` is a consonant or a vowel.
- # e.g. 'falafel' becomes 'cvcvcvc',
- # 'architecture' becomes 'vcccvcvccvcv'
- for i in range(len(stem)):
- if self._is_consonant(stem, i):
- cv_sequence += "c"
- else:
- cv_sequence += "v"
- # Count the number of 'vc' occurences, which is equivalent to
- # the number of 'VC' occurrences in Porter's reduced form in the
- # docstring above, which is in turn equivalent to `m`
- return cv_sequence.count("vc")

    def _has_positive_measure(self, stem):
        return self._measure(stem) > 0

    def _contains_vowel(self, stem):
        """Returns True if stem contains a vowel, else False"""
        for i in range(len(stem)):
            if not self._is_consonant(stem, i):
                return True
        return False

    def _ends_double_consonant(self, word):
        """Implements condition *d from the paper

        Returns True if word ends with a double consonant
        """
        return (
            len(word) >= 2
            and word[-1] == word[-2]
            and self._is_consonant(word, len(word) - 1)
        )

    def _ends_cvc(self, word):
        """Implements condition *o from the paper

        From the paper:

            *o  - the stem ends cvc, where the second c is not W, X or Y
                  (e.g. -WIL, -HOP).
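
        Illustrative example ('how' fails because its final consonant
        is W):

            >>> stemmer = PorterStemmer()
            >>> stemmer._ends_cvc("hop")
            True
            >>> stemmer._ends_cvc("how")
            False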
- """
- return (
- len(word) >= 3
- and self._is_consonant(word, len(word) - 3)
- and not self._is_consonant(word, len(word) - 2)
- and self._is_consonant(word, len(word) - 1)
- and word[-1] not in ("w", "x", "y")
- ) or (
- self.mode == self.NLTK_EXTENSIONS
- and len(word) == 2
- and not self._is_consonant(word, 0)
- and self._is_consonant(word, 1)
- )

    def _replace_suffix(self, word, suffix, replacement):
        """Replaces `suffix` of `word` with `replacement`"""
        assert word.endswith(suffix), "Given word doesn't end with given suffix"
        if suffix == "":
            return word + replacement
        else:
            return word[: -len(suffix)] + replacement

    def _apply_rule_list(self, word, rules):
        """Applies the first applicable suffix-removal rule to the word

        Takes a word and a list of suffix-removal rules represented as
        3-tuples, with the first element being the suffix to remove,
        the second element being the string to replace it with, and the
        final element being the condition for the rule to be applicable,
        or None if the rule is unconditional.
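
        Illustrative example, using the SSES -> SS rule from Step 1a:

            >>> stemmer = PorterStemmer()
            >>> stemmer._apply_rule_list("caresses", [("sses", "ss", None)])
            'caress'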
- """
- for rule in rules:
- suffix, replacement, condition = rule
- if suffix == "*d" and self._ends_double_consonant(word):
- stem = word[:-2]
- if condition is None or condition(stem):
- return stem + replacement
- else:
- # Don't try any further rules
- return word
- if word.endswith(suffix):
- stem = self._replace_suffix(word, suffix, "")
- if condition is None or condition(stem):
- return stem + replacement
- else:
- # Don't try any further rules
- return word
- return word

    def _step1a(self, word):
        """Implements Step 1a from "An algorithm for suffix stripping"

        From the paper:

            SSES -> SS                         caresses  ->  caress
            IES  -> I                          ponies    ->  poni
                                               ties      ->  ti
            SS   -> SS                         caress    ->  caress
            S    ->                            cats      ->  cat
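
        Illustrative example (note that in the default NLTK_EXTENSIONS
        mode, short -IES words keep their E, so 'ties' -> 'tie' rather
        than the paper's 'ti'):

            >>> stemmer = PorterStemmer()
            >>> stemmer._step1a("ponies")
            'poni'
            >>> stemmer._step1a("ties")
            'tie'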
- """
- # this NLTK-only rule extends the original algorithm, so
- # that 'flies'->'fli' but 'dies'->'die' etc
- if self.mode == self.NLTK_EXTENSIONS:
- if word.endswith("ies") and len(word) == 4:
- return self._replace_suffix(word, "ies", "ie")
- return self._apply_rule_list(
- word,
- [
- ("sses", "ss", None), # SSES -> SS
- ("ies", "i", None), # IES -> I
- ("ss", "ss", None), # SS -> SS
- ("s", "", None), # S ->
- ],
- )

    def _step1b(self, word):
        """Implements Step 1b from "An algorithm for suffix stripping"

        From the paper:

            (m>0) EED -> EE                    feed      ->  feed
                                               agreed    ->  agree
            (*v*) ED  ->                       plastered ->  plaster
                                               bled      ->  bled
            (*v*) ING ->                       motoring  ->  motor
                                               sing      ->  sing

        If the second or third of the rules in Step 1b is successful,
        the following is done:

            AT -> ATE                       conflat(ed)  ->  conflate
            BL -> BLE                       troubl(ed)   ->  trouble
            IZ -> IZE                       siz(ed)      ->  size
            (*d and not (*L or *S or *Z))
               -> single letter
                                            hopp(ing)    ->  hop
                                            tann(ed)     ->  tan
                                            fall(ing)    ->  fall
                                            hiss(ing)    ->  hiss
                                            fizz(ed)     ->  fizz
            (m=1 and *o) -> E               fail(ing)    ->  fail
                                            fil(ing)     ->  file

        The rule to map to a single letter causes the removal of one of
        the double letter pair. The -E is put back on -AT, -BL and -IZ,
        so that the suffixes -ATE, -BLE and -IZE can be recognised
        later. This E may be removed in step 4.
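
        Illustrative example ('hopping' exercises the double-letter
        rule, 'filing' the (m=1 and *o) rule):

            >>> stemmer = PorterStemmer()
            >>> stemmer._step1b("hopping")
            'hop'
            >>> stemmer._step1b("filing")
            'file'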
- """
- # this NLTK-only block extends the original algorithm, so that
- # 'spied'->'spi' but 'died'->'die' etc
- if self.mode == self.NLTK_EXTENSIONS:
- if word.endswith("ied"):
- if len(word) == 4:
- return self._replace_suffix(word, "ied", "ie")
- else:
- return self._replace_suffix(word, "ied", "i")
- # (m>0) EED -> EE
- if word.endswith("eed"):
- stem = self._replace_suffix(word, "eed", "")
- if self._measure(stem) > 0:
- return stem + "ee"
- else:
- return word
- rule_2_or_3_succeeded = False
- for suffix in ["ed", "ing"]:
- if word.endswith(suffix):
- intermediate_stem = self._replace_suffix(word, suffix, "")
- if self._contains_vowel(intermediate_stem):
- rule_2_or_3_succeeded = True
- break
- if not rule_2_or_3_succeeded:
- return word
- return self._apply_rule_list(
- intermediate_stem,
- [
- ("at", "ate", None), # AT -> ATE
- ("bl", "ble", None), # BL -> BLE
- ("iz", "ize", None), # IZ -> IZE
- # (*d and not (*L or *S or *Z))
- # -> single letter
- (
- "*d",
- intermediate_stem[-1],
- lambda stem: intermediate_stem[-1] not in ("l", "s", "z"),
- ),
- # (m=1 and *o) -> E
- (
- "",
- "e",
- lambda stem: (self._measure(stem) == 1 and self._ends_cvc(stem)),
- ),
- ],
- )

    def _step1c(self, word):
        """Implements Step 1c from "An algorithm for suffix stripping"

        From the paper:

            Step 1c

                (*v*) Y -> I                    happy        ->  happi
                                                sky          ->  sky
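
        Illustrative example (in the default NLTK_EXTENSIONS mode; see
        `nltk_condition` below):

            >>> stemmer = PorterStemmer()
            >>> stemmer._step1c("happy")
            'happi'
            >>> stemmer._step1c("enjoy")
            'enjoy'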
- """
- def nltk_condition(stem):
- """
- This has been modified from the original Porter algorithm so
- that y->i is only done when y is preceded by a consonant,
- but not if the stem is only a single consonant, i.e.
- (*c and not c) Y -> I
- So 'happy' -> 'happi', but
- 'enjoy' -> 'enjoy' etc
- This is a much better rule. Formerly 'enjoy'->'enjoi' and
- 'enjoyment'->'enjoy'. Step 1c is perhaps done too soon; but
- with this modification that no longer really matters.
- Also, the removal of the contains_vowel(z) condition means
- that 'spy', 'fly', 'try' ... stem to 'spi', 'fli', 'tri' and
- conflate with 'spied', 'tried', 'flies' ...
- """
- return len(stem) > 1 and self._is_consonant(stem, len(stem) - 1)
- def original_condition(stem):
- return self._contains_vowel(stem)
- return self._apply_rule_list(
- word,
- [
- (
- "y",
- "i",
- nltk_condition
- if self.mode == self.NLTK_EXTENSIONS
- else original_condition,
- )
- ],
- )

    def _step2(self, word):
        """Implements Step 2 from "An algorithm for suffix stripping"

        From the paper:

            Step 2

                (m>0) ATIONAL ->  ATE       relational     ->  relate
                (m>0) TIONAL  ->  TION      conditional    ->  condition
                                            rational       ->  rational
                (m>0) ENCI    ->  ENCE      valenci        ->  valence
                (m>0) ANCI    ->  ANCE      hesitanci      ->  hesitance
                (m>0) IZER    ->  IZE       digitizer      ->  digitize
                (m>0) ABLI    ->  ABLE      conformabli    ->  conformable
                (m>0) ALLI    ->  AL        radicalli      ->  radical
                (m>0) ENTLI   ->  ENT       differentli    ->  different
                (m>0) ELI     ->  E         vileli         ->  vile
                (m>0) OUSLI   ->  OUS       analogousli    ->  analogous
                (m>0) IZATION ->  IZE       vietnamization ->  vietnamize
                (m>0) ATION   ->  ATE       predication    ->  predicate
                (m>0) ATOR    ->  ATE       operator       ->  operate
                (m>0) ALISM   ->  AL        feudalism      ->  feudal
                (m>0) IVENESS ->  IVE       decisiveness   ->  decisive
                (m>0) FULNESS ->  FUL       hopefulness    ->  hopeful
                (m>0) OUSNESS ->  OUS       callousness    ->  callous
                (m>0) ALITI   ->  AL        formaliti      ->  formal
                (m>0) IVITI   ->  IVE       sensitiviti    ->  sensitive
                (m>0) BILITI  ->  BLE       sensibiliti    ->  sensible
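
        Illustrative example:

            >>> stemmer = PorterStemmer()
            >>> stemmer._step2("relational")
            'relate'
            >>> stemmer._step2("hesitanci")
            'hesitance'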
- """
- if self.mode == self.NLTK_EXTENSIONS:
- # Instead of applying the ALLI -> AL rule after '(a)bli' per
- # the published algorithm, instead we apply it first, and,
- # if it succeeds, run the result through step2 again.
- if word.endswith("alli") and self._has_positive_measure(
- self._replace_suffix(word, "alli", "")
- ):
- return self._step2(self._replace_suffix(word, "alli", "al"))
- bli_rule = ("bli", "ble", self._has_positive_measure)
- abli_rule = ("abli", "able", self._has_positive_measure)
- rules = [
- ("ational", "ate", self._has_positive_measure),
- ("tional", "tion", self._has_positive_measure),
- ("enci", "ence", self._has_positive_measure),
- ("anci", "ance", self._has_positive_measure),
- ("izer", "ize", self._has_positive_measure),
- abli_rule if self.mode == self.ORIGINAL_ALGORITHM else bli_rule,
- ("alli", "al", self._has_positive_measure),
- ("entli", "ent", self._has_positive_measure),
- ("eli", "e", self._has_positive_measure),
- ("ousli", "ous", self._has_positive_measure),
- ("ization", "ize", self._has_positive_measure),
- ("ation", "ate", self._has_positive_measure),
- ("ator", "ate", self._has_positive_measure),
- ("alism", "al", self._has_positive_measure),
- ("iveness", "ive", self._has_positive_measure),
- ("fulness", "ful", self._has_positive_measure),
- ("ousness", "ous", self._has_positive_measure),
- ("aliti", "al", self._has_positive_measure),
- ("iviti", "ive", self._has_positive_measure),
- ("biliti", "ble", self._has_positive_measure),
- ]
- if self.mode == self.NLTK_EXTENSIONS:
- rules.append(("fulli", "ful", self._has_positive_measure))
- # The 'l' of the 'logi' -> 'log' rule is put with the stem,
- # so that short stems like 'geo' 'theo' etc work like
- # 'archaeo' 'philo' etc.
- rules.append(
- ("logi", "log", lambda stem: self._has_positive_measure(word[:-3]))
- )
- if self.mode == self.MARTIN_EXTENSIONS:
- rules.append(("logi", "log", self._has_positive_measure))
- return self._apply_rule_list(word, rules)

    def _step3(self, word):
        """Implements Step 3 from "An algorithm for suffix stripping"

        From the paper:

            Step 3

                (m>0) ICATE ->  IC              triplicate     ->  triplic
                (m>0) ATIVE ->                  formative      ->  form
                (m>0) ALIZE ->  AL              formalize      ->  formal
                (m>0) ICITI ->  IC              electriciti    ->  electric
                (m>0) ICAL  ->  IC              electrical     ->  electric
                (m>0) FUL   ->                  hopeful        ->  hope
                (m>0) NESS  ->                  goodness       ->  good
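
        Illustrative example:

            >>> stemmer = PorterStemmer()
            >>> stemmer._step3("formative")
            'form'
            >>> stemmer._step3("hopeful")
            'hope'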
- """
- return self._apply_rule_list(
- word,
- [
- ("icate", "ic", self._has_positive_measure),
- ("ative", "", self._has_positive_measure),
- ("alize", "al", self._has_positive_measure),
- ("iciti", "ic", self._has_positive_measure),
- ("ical", "ic", self._has_positive_measure),
- ("ful", "", self._has_positive_measure),
- ("ness", "", self._has_positive_measure),
- ],
- )

    def _step4(self, word):
        """Implements Step 4 from "An algorithm for suffix stripping"

        From the paper:

            Step 4

                (m>1) AL    ->                  revival        ->  reviv
                (m>1) ANCE  ->                  allowance      ->  allow
                (m>1) ENCE  ->                  inference      ->  infer
                (m>1) ER    ->                  airliner       ->  airlin
                (m>1) IC    ->                  gyroscopic     ->  gyroscop
                (m>1) ABLE  ->                  adjustable     ->  adjust
                (m>1) IBLE  ->                  defensible     ->  defens
                (m>1) ANT   ->                  irritant       ->  irrit
                (m>1) EMENT ->                  replacement    ->  replac
                (m>1) MENT  ->                  adjustment     ->  adjust
                (m>1) ENT   ->                  dependent      ->  depend
                (m>1 and (*S or *T)) ION ->     adoption       ->  adopt
                (m>1) OU    ->                  homologou      ->  homolog
                (m>1) ISM   ->                  communism      ->  commun
                (m>1) ATE   ->                  activate       ->  activ
                (m>1) ITI   ->                  angulariti     ->  angular
                (m>1) OUS   ->                  homologous     ->  homolog
                (m>1) IVE   ->                  effective      ->  effect
                (m>1) IZE   ->                  bowdlerize     ->  bowdler

        The suffixes are now removed. All that remains is a little
        tidying up.
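
        Illustrative example ('adoption' exercises the (*S or *T)
        condition on the ION rule):

            >>> stemmer = PorterStemmer()
            >>> stemmer._step4("adoption")
            'adopt'
            >>> stemmer._step4("communism")
            'commun'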
- """
- measure_gt_1 = lambda stem: self._measure(stem) > 1
- return self._apply_rule_list(
- word,
- [
- ("al", "", measure_gt_1),
- ("ance", "", measure_gt_1),
- ("ence", "", measure_gt_1),
- ("er", "", measure_gt_1),
- ("ic", "", measure_gt_1),
- ("able", "", measure_gt_1),
- ("ible", "", measure_gt_1),
- ("ant", "", measure_gt_1),
- ("ement", "", measure_gt_1),
- ("ment", "", measure_gt_1),
- ("ent", "", measure_gt_1),
- # (m>1 and (*S or *T)) ION ->
- (
- "ion",
- "",
- lambda stem: self._measure(stem) > 1 and stem[-1] in ("s", "t"),
- ),
- ("ou", "", measure_gt_1),
- ("ism", "", measure_gt_1),
- ("ate", "", measure_gt_1),
- ("iti", "", measure_gt_1),
- ("ous", "", measure_gt_1),
- ("ive", "", measure_gt_1),
- ("ize", "", measure_gt_1),
- ],
- )

    def _step5a(self, word):
        """Implements Step 5a from "An algorithm for suffix stripping"

        From the paper:

            Step 5a

                (m>1) E     ->                  probate        ->  probat
                                                rate           ->  rate
                (m=1 and not *o) E ->           cease          ->  ceas
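
        Illustrative example, matching the paper's cases above:

            >>> stemmer = PorterStemmer()
            >>> stemmer._step5a("probate")
            'probat'
            >>> stemmer._step5a("rate")
            'rate'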
- """
- # Note that Martin's test vocabulary and reference
- # implementations are inconsistent in how they handle the case
- # where two rules both refer to a suffix that matches the word
- # to be stemmed, but only the condition of the second one is
- # true.
- # Earlier in step2b we had the rules:
- # (m>0) EED -> EE
- # (*v*) ED ->
- # but the examples in the paper included "feed"->"feed", even
- # though (*v*) is true for "fe" and therefore the second rule
- # alone would map "feed"->"fe".
- # However, in THIS case, we need to handle the consecutive rules
- # differently and try both conditions (obviously; the second
- # rule here would be redundant otherwise). Martin's paper makes
- # no explicit mention of the inconsistency; you have to infer it
- # from the examples.
- # For this reason, we can't use _apply_rule_list here.
- if word.endswith("e"):
- stem = self._replace_suffix(word, "e", "")
- if self._measure(stem) > 1:
- return stem
- if self._measure(stem) == 1 and not self._ends_cvc(stem):
- return stem
- return word

    def _step5b(self, word):
        """Implements Step 5b from "An algorithm for suffix stripping"

        From the paper:

            Step 5b

                (m > 1 and *d and *L) -> single letter
                                                controll       ->  control
                                                roll           ->  roll
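
        Illustrative example, matching the paper's cases above:

            >>> stemmer = PorterStemmer()
            >>> stemmer._step5b("controll")
            'control'
            >>> stemmer._step5b("roll")
            'roll'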
- """
- return self._apply_rule_list(
- word, [("ll", "l", lambda stem: self._measure(word[:-1]) > 1)]
- )

    def stem(self, word):
        stem = word.lower()

        # Look the lowercased form up in the table of irregular forms,
        # so that mixed-case input still hits the pool.
        if self.mode == self.NLTK_EXTENSIONS and stem in self.pool:
            return self.pool[stem]

        if self.mode != self.ORIGINAL_ALGORITHM and len(stem) <= 2:
            # With this line, strings of length 1 or 2 don't go through
            # the stemming process, although no mention is made of this
            # in the published algorithm.
            return stem

        stem = self._step1a(stem)
        stem = self._step1b(stem)
        stem = self._step1c(stem)
        stem = self._step2(stem)
        stem = self._step3(stem)
        stem = self._step4(stem)
        stem = self._step5a(stem)
        stem = self._step5b(stem)

        return stem

    def __repr__(self):
        return "<PorterStemmer>"


def demo():
    """
    A demonstration of the porter stemmer on a sample from
    the Penn Treebank corpus.
    """
    from nltk.corpus import treebank
    from nltk import stem

    stemmer = stem.PorterStemmer()

    orig = []
    stemmed = []
    for item in treebank.fileids()[:3]:
        for (word, tag) in treebank.tagged_words(item):
            orig.append(word)
            stemmed.append(stemmer.stem(word))

    # Convert the results to a string, and word-wrap them.
    results = " ".join(stemmed)
    results = re.sub(r"(.{,70})\s", r"\1\n", results + " ").rstrip()

    # Convert the original to a string, and word wrap it.
    original = " ".join(orig)
    original = re.sub(r"(.{,70})\s", r"\1\n", original + " ").rstrip()

    # Print the results.
    print("-Original-".center(70).replace(" ", "*").replace("-", " "))
    print(original)
    print("-Results-".center(70).replace(" ", "*").replace("-", " "))
    print(results)
    print("*" * 70)