| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449 |
- from __future__ import unicode_literals
- """
- Implementation of Porter Stemming Algorithm from
- https://tartarus.org/martin/PorterStemmer/python.txt
- Note: The Python implementation returns different results than the JS
- version:
- - Stemming "lay" returns "lai" in Python, but "lay" in JS
- - Stemming "try" returns "try" in Python, but "tri" in JS
- Porter Stemming Algorithm
- This is the Porter stemming algorithm, ported to Python from the
- version coded up in ANSI C by the author. It may be be regarded
- as canonical, in that it follows the algorithm presented in
- Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
- no. 3, pp 130-137,
- only differing from it at the points maked --DEPARTURE-- below.
- See also http://www.tartarus.org/~martin/PorterStemmer
- The algorithm as described in the paper could be exactly replicated
- by adjusting the points of DEPARTURE, but this is barely necessary,
- because (a) the points of DEPARTURE are definitely improvements, and
- (b) no encoding of the Porter stemmer I have seen is anything like
- as exact as this version, even with the points of DEPARTURE!
- Vivake Gupta (v@nano.com)
- Release 1: January 2001
- Further adjustments by Santiago Bruno (bananabruno@gmail.com)
- to allow word input not restricted to one word per line, leading
- to:
- release 2: July 2008
- """
- from lunr.pipeline import Pipeline
- class PorterStemmer:
- def __init__(self):
- """The main part of the stemming algorithm starts here.
- b is a buffer holding a word to be stemmed. The letters are in b[k0],
- b[k0+1] ... ending at b[k]. In fact k0 = 0 in this demo program. k is
- readjusted downwards as the stemming progresses. Zero termination is
- not in fact used in the algorithm.
- Note that only lower case sequences are stemmed. Forcing to lower case
- should be done before stem(...) is called.
- """
- self.b = "" # buffer for word to be stemmed
- self.k = 0
- self.k0 = 0
- self.j = 0 # j is a general offset into the string
- def cons(self, i):
- """cons(i) is TRUE <=> b[i] is a consonant."""
- if (
- self.b[i] == "a"
- or self.b[i] == "e"
- or self.b[i] == "i"
- or self.b[i] == "o"
- or self.b[i] == "u"
- ):
- return 0
- if self.b[i] == "y":
- if i == self.k0:
- return 1
- else:
- return not self.cons(i - 1)
- return 1
- def m(self):
- """m() measures the number of consonant sequences between k0 and j.
- if c is a consonant sequence and v a vowel sequence, and <..>
- indicates arbitrary presence,
- <c><v> gives 0
- <c>vc<v> gives 1
- <c>vcvc<v> gives 2
- <c>vcvcvc<v> gives 3
- ....
- """
- n = 0
- i = self.k0
- while 1:
- if i > self.j:
- return n
- if not self.cons(i):
- break
- i = i + 1
- i = i + 1
- while 1:
- while 1:
- if i > self.j:
- return n
- if self.cons(i):
- break
- i = i + 1
- i = i + 1
- n = n + 1
- while 1:
- if i > self.j:
- return n
- if not self.cons(i):
- break
- i = i + 1
- i = i + 1
- def vowelinstem(self):
- """vowelinstem() is TRUE <=> k0,...j contains a vowel"""
- for i in range(self.k0, self.j + 1):
- if not self.cons(i):
- return 1
- return 0
- def doublec(self, j):
- """doublec(j) is TRUE <=> j,(j-1) contain a double consonant."""
- if j < (self.k0 + 1):
- return 0
- if self.b[j] != self.b[j - 1]:
- return 0
- return self.cons(j)
- def cvc(self, i):
- """cvc(i) is TRUE <=> i-2,i-1,i has the form consonant - vowel - consonant
- and also if the second c is not w,x or y. this is used when trying to
- restore an e at the end of a short e.g.
- cav(e), lov(e), hop(e), crim(e), but
- snow, box, tray.
- """
- if (
- i < (self.k0 + 2)
- or not self.cons(i)
- or self.cons(i - 1)
- or not self.cons(i - 2)
- ):
- return 0
- ch = self.b[i]
- if ch == "w" or ch == "x" or ch == "y":
- return 0
- return 1
- def ends(self, s):
- """ends(s) is TRUE <=> k0,...k ends with the string s."""
- length = len(s)
- if s[length - 1] != self.b[self.k]: # tiny speed-up
- return 0
- if length > (self.k - self.k0 + 1):
- return 0
- if self.b[self.k - length + 1 : self.k + 1] != s:
- return 0
- self.j = self.k - length
- return 1
- def setto(self, s):
- """setto(s) sets (j+1),...k to the characters in the string s, readjusting k."""
- length = len(s)
- self.b = self.b[: self.j + 1] + s + self.b[self.j + length + 1 :]
- self.k = self.j + length
- def r(self, s):
- """r(s) is used further down."""
- if self.m() > 0:
- self.setto(s)
- def step1ab(self):
- """step1ab() gets rid of plurals and -ed or -ing. e.g.
- caresses -> caress
- ponies -> poni
- ties -> ti
- caress -> caress
- cats -> cat
- feed -> feed
- agreed -> agree
- disabled -> disable
- matting -> mat
- mating -> mate
- meeting -> meet
- milling -> mill
- messing -> mess
- meetings -> meet
- """
- if self.b[self.k] == "s":
- if self.ends("sses"):
- self.k = self.k - 2
- elif self.ends("ies"):
- self.setto("i")
- elif self.b[self.k - 1] != "s":
- self.k = self.k - 1
- if self.ends("eed"):
- if self.m() > 0:
- self.k = self.k - 1
- elif (self.ends("ed") or self.ends("ing")) and self.vowelinstem():
- self.k = self.j
- if self.ends("at"):
- self.setto("ate")
- elif self.ends("bl"):
- self.setto("ble")
- elif self.ends("iz"):
- self.setto("ize")
- elif self.doublec(self.k):
- self.k = self.k - 1
- ch = self.b[self.k]
- if ch == "l" or ch == "s" or ch == "z":
- self.k = self.k + 1
- elif self.m() == 1 and self.cvc(self.k):
- self.setto("e")
- def step1c(self):
- """step1c() turns terminal y to i when there is another vowel in the stem."""
- if self.ends("y") and self.vowelinstem():
- self.b = self.b[: self.k] + "i" + self.b[self.k + 1 :]
- def step2(self):
- """step2() maps double suffices to single ones.
- so -ization ( = -ize plus -ation) maps to -ize etc. note that the
- string before the suffix must give m() > 0.
- """
- if self.b[self.k - 1] == "a":
- if self.ends("ational"):
- self.r("ate")
- elif self.ends("tional"):
- self.r("tion")
- elif self.b[self.k - 1] == "c":
- if self.ends("enci"):
- self.r("ence")
- elif self.ends("anci"):
- self.r("ance")
- elif self.b[self.k - 1] == "e":
- if self.ends("izer"):
- self.r("ize")
- elif self.b[self.k - 1] == "l":
- if self.ends("bli"):
- self.r("ble") # --DEPARTURE--
- # To match the published algorithm, replace this phrase with
- # if self.ends("abli"): self.r("able")
- elif self.ends("alli"):
- self.r("al")
- elif self.ends("entli"):
- self.r("ent")
- elif self.ends("eli"):
- self.r("e")
- elif self.ends("ousli"):
- self.r("ous")
- elif self.b[self.k - 1] == "o":
- if self.ends("ization"):
- self.r("ize")
- elif self.ends("ation"):
- self.r("ate")
- elif self.ends("ator"):
- self.r("ate")
- elif self.b[self.k - 1] == "s":
- if self.ends("alism"):
- self.r("al")
- elif self.ends("iveness"):
- self.r("ive")
- elif self.ends("fulness"):
- self.r("ful")
- elif self.ends("ousness"):
- self.r("ous")
- elif self.b[self.k - 1] == "t":
- if self.ends("aliti"):
- self.r("al")
- elif self.ends("iviti"):
- self.r("ive")
- elif self.ends("biliti"):
- self.r("ble")
- elif self.b[self.k - 1] == "g": # --DEPARTURE--
- if self.ends("logi"):
- self.r("log")
- # To match the published algorithm, delete this phrase
- def step3(self):
- """step3() dels with -ic-, -full, -ness etc. similar strategy to step2."""
- if self.b[self.k] == "e":
- if self.ends("icate"):
- self.r("ic")
- elif self.ends("ative"):
- self.r("")
- elif self.ends("alize"):
- self.r("al")
- elif self.b[self.k] == "i":
- if self.ends("iciti"):
- self.r("ic")
- elif self.b[self.k] == "l":
- if self.ends("ical"):
- self.r("ic")
- elif self.ends("ful"):
- self.r("")
- elif self.b[self.k] == "s":
- if self.ends("ness"):
- self.r("")
- def step4(self):
- """step4() takes off -ant, -ence etc., in context <c>vcvc<v>."""
- if self.b[self.k - 1] == "a":
- if self.ends("al"):
- pass
- else:
- return
- elif self.b[self.k - 1] == "c":
- if self.ends("ance"):
- pass
- elif self.ends("ence"):
- pass
- else:
- return
- elif self.b[self.k - 1] == "e":
- if self.ends("er"):
- pass
- else:
- return
- elif self.b[self.k - 1] == "i":
- if self.ends("ic"):
- pass
- else:
- return
- elif self.b[self.k - 1] == "l":
- if self.ends("able"):
- pass
- elif self.ends("ible"):
- pass
- else:
- return
- elif self.b[self.k - 1] == "n":
- if self.ends("ant"):
- pass
- elif self.ends("ement"):
- pass
- elif self.ends("ment"):
- pass
- elif self.ends("ent"):
- pass
- else:
- return
- elif self.b[self.k - 1] == "o":
- if self.ends("ion") and (self.b[self.j] == "s" or self.b[self.j] == "t"):
- pass
- elif self.ends("ou"):
- pass
- # takes care of -ous
- else:
- return
- elif self.b[self.k - 1] == "s":
- if self.ends("ism"):
- pass
- else:
- return
- elif self.b[self.k - 1] == "t":
- if self.ends("ate"):
- pass
- elif self.ends("iti"):
- pass
- else:
- return
- elif self.b[self.k - 1] == "u":
- if self.ends("ous"):
- pass
- else:
- return
- elif self.b[self.k - 1] == "v":
- if self.ends("ive"):
- pass
- else:
- return
- elif self.b[self.k - 1] == "z":
- if self.ends("ize"):
- pass
- else:
- return
- else:
- return
- if self.m() > 1:
- self.k = self.j
- def step5(self):
- """step5() removes a final -e if m() > 1, and changes -ll to -l if
- m() > 1.
- """
- self.j = self.k
- if self.b[self.k] == "e":
- a = self.m()
- if a > 1 or (a == 1 and not self.cvc(self.k - 1)):
- self.k = self.k - 1
- if self.b[self.k] == "l" and self.doublec(self.k) and self.m() > 1:
- self.k = self.k - 1
- def stem(self, p, metadata=None):
- """In stem(p,i,j), p is a char pointer, and the string to be stemmed
- is from p[i] to p[j] inclusive. Typically i is zero and j is the
- offset to the last character of a string, (p[j+1] == '\0'). The
- stemmer adjusts the characters p[i] ... p[j] and returns the new
- end-point of the string, k. Stemming never increases word length, so
- i <= k <= j. To turn the stemmer into a module, declare 'stem' as
- extern, and delete the remainder of this file.
- """
- # TODO: removed i and j from the original implementation
- # to comply with the `token.update` API
- i = 0
- j = len(p) - 1
- # copy the parameters into statics
- self.b = p
- self.k = j
- self.k0 = i
- if self.k <= self.k0 + 1:
- return self.b # --DEPARTURE--
- # With this line, strings of length 1 or 2 don't go through the
- # stemming process, although no mention is made of this in the
- # published algorithm. Remove the line to match the published
- # algorithm.
- self.step1ab()
- self.step1c()
- self.step2()
- self.step3()
- self.step4()
- self.step5()
- return self.b[self.k0 : self.k + 1]
- porter_stemmer = PorterStemmer()
- def stemmer(token, i=None, tokens=None):
- """Wrapper around the PorterStemmer for inclusion in pipeline.
- Args:
- language (str): ISO-639-1 code of the language.
- token (lunr.Token): The token to stem.
- i (int): The index of the token in a set.
- tokens (list): A list of tokens representing the set.
- """
- return token.update(porter_stemmer.stem)
- Pipeline.register_function(stemmer, "stemmer")
|