# Natural Language Toolkit: Stemmers
#
# Copyright (C) 2001-2020 NLTK Project
# Author: Steven Tomcavage <stomcava@law.upenn.edu>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""
A word stemmer based on the Lancaster (Paice/Husk) stemming algorithm.
Paice, Chris D. "Another Stemmer." ACM SIGIR Forum 24.3 (1990): 56-61.
"""
import re

from nltk.stem.api import StemmerI
  13. class LancasterStemmer(StemmerI):
  14. """
  15. Lancaster Stemmer
  16. >>> from nltk.stem.lancaster import LancasterStemmer
  17. >>> st = LancasterStemmer()
  18. >>> st.stem('maximum') # Remove "-um" when word is intact
  19. 'maxim'
  20. >>> st.stem('presumably') # Don't remove "-um" when word is not intact
  21. 'presum'
  22. >>> st.stem('multiply') # No action taken if word ends with "-ply"
  23. 'multiply'
  24. >>> st.stem('provision') # Replace "-sion" with "-j" to trigger "j" set of rules
  25. 'provid'
  26. >>> st.stem('owed') # Word starting with vowel must contain at least 2 letters
  27. 'ow'
  28. >>> st.stem('ear') # ditto
  29. 'ear'
  30. >>> st.stem('saying') # Words starting with consonant must contain at least 3
  31. 'say'
  32. >>> st.stem('crying') # letters and one of those letters must be a vowel
  33. 'cry'
  34. >>> st.stem('string') # ditto
  35. 'string'
  36. >>> st.stem('meant') # ditto
  37. 'meant'
  38. >>> st.stem('cement') # ditto
  39. 'cem'
  40. >>> st_pre = LancasterStemmer(strip_prefix_flag=True)
  41. >>> st_pre.stem('kilometer') # Test Prefix
  42. 'met'
  43. >>> st_custom = LancasterStemmer(rule_tuple=("ssen4>", "s1t."))
  44. >>> st_custom.stem("ness") # Change s to t
  45. 'nest'
  46. """
  47. # The rule list is static since it doesn't change between instances
  48. default_rule_tuple = (
  49. "ai*2.", # -ia > - if intact
  50. "a*1.", # -a > - if intact
  51. "bb1.", # -bb > -b
  52. "city3s.", # -ytic > -ys
  53. "ci2>", # -ic > -
  54. "cn1t>", # -nc > -nt
  55. "dd1.", # -dd > -d
  56. "dei3y>", # -ied > -y
  57. "deec2ss.", # -ceed >", -cess
  58. "dee1.", # -eed > -ee
  59. "de2>", # -ed > -
  60. "dooh4>", # -hood > -
  61. "e1>", # -e > -
  62. "feil1v.", # -lief > -liev
  63. "fi2>", # -if > -
  64. "gni3>", # -ing > -
  65. "gai3y.", # -iag > -y
  66. "ga2>", # -ag > -
  67. "gg1.", # -gg > -g
  68. "ht*2.", # -th > - if intact
  69. "hsiug5ct.", # -guish > -ct
  70. "hsi3>", # -ish > -
  71. "i*1.", # -i > - if intact
  72. "i1y>", # -i > -y
  73. "ji1d.", # -ij > -id -- see nois4j> & vis3j>
  74. "juf1s.", # -fuj > -fus
  75. "ju1d.", # -uj > -ud
  76. "jo1d.", # -oj > -od
  77. "jeh1r.", # -hej > -her
  78. "jrev1t.", # -verj > -vert
  79. "jsim2t.", # -misj > -mit
  80. "jn1d.", # -nj > -nd
  81. "j1s.", # -j > -s
  82. "lbaifi6.", # -ifiabl > -
  83. "lbai4y.", # -iabl > -y
  84. "lba3>", # -abl > -
  85. "lbi3.", # -ibl > -
  86. "lib2l>", # -bil > -bl
  87. "lc1.", # -cl > c
  88. "lufi4y.", # -iful > -y
  89. "luf3>", # -ful > -
  90. "lu2.", # -ul > -
  91. "lai3>", # -ial > -
  92. "lau3>", # -ual > -
  93. "la2>", # -al > -
  94. "ll1.", # -ll > -l
  95. "mui3.", # -ium > -
  96. "mu*2.", # -um > - if intact
  97. "msi3>", # -ism > -
  98. "mm1.", # -mm > -m
  99. "nois4j>", # -sion > -j
  100. "noix4ct.", # -xion > -ct
  101. "noi3>", # -ion > -
  102. "nai3>", # -ian > -
  103. "na2>", # -an > -
  104. "nee0.", # protect -een
  105. "ne2>", # -en > -
  106. "nn1.", # -nn > -n
  107. "pihs4>", # -ship > -
  108. "pp1.", # -pp > -p
  109. "re2>", # -er > -
  110. "rae0.", # protect -ear
  111. "ra2.", # -ar > -
  112. "ro2>", # -or > -
  113. "ru2>", # -ur > -
  114. "rr1.", # -rr > -r
  115. "rt1>", # -tr > -t
  116. "rei3y>", # -ier > -y
  117. "sei3y>", # -ies > -y
  118. "sis2.", # -sis > -s
  119. "si2>", # -is > -
  120. "ssen4>", # -ness > -
  121. "ss0.", # protect -ss
  122. "suo3>", # -ous > -
  123. "su*2.", # -us > - if intact
  124. "s*1>", # -s > - if intact
  125. "s0.", # -s > -s
  126. "tacilp4y.", # -plicat > -ply
  127. "ta2>", # -at > -
  128. "tnem4>", # -ment > -
  129. "tne3>", # -ent > -
  130. "tna3>", # -ant > -
  131. "tpir2b.", # -ript > -rib
  132. "tpro2b.", # -orpt > -orb
  133. "tcud1.", # -duct > -duc
  134. "tpmus2.", # -sumpt > -sum
  135. "tpec2iv.", # -cept > -ceiv
  136. "tulo2v.", # -olut > -olv
  137. "tsis0.", # protect -sist
  138. "tsi3>", # -ist > -
  139. "tt1.", # -tt > -t
  140. "uqi3.", # -iqu > -
  141. "ugo1.", # -ogu > -og
  142. "vis3j>", # -siv > -j
  143. "vie0.", # protect -eiv
  144. "vi2>", # -iv > -
  145. "ylb1>", # -bly > -bl
  146. "yli3y>", # -ily > -y
  147. "ylp0.", # protect -ply
  148. "yl2>", # -ly > -
  149. "ygo1.", # -ogy > -og
  150. "yhp1.", # -phy > -ph
  151. "ymo1.", # -omy > -om
  152. "ypo1.", # -opy > -op
  153. "yti3>", # -ity > -
  154. "yte3>", # -ety > -
  155. "ytl2.", # -lty > -l
  156. "yrtsi5.", # -istry > -
  157. "yra3>", # -ary > -
  158. "yro3>", # -ory > -
  159. "yfi3.", # -ify > -
  160. "ycn2t>", # -ncy > -nt
  161. "yca3>", # -acy > -
  162. "zi2>", # -iz > -
  163. "zy1s.", # -yz > -ys
  164. )
  165. def __init__(self, rule_tuple=None, strip_prefix_flag=False):
  166. """Create an instance of the Lancaster stemmer.
  167. """
  168. # Setup an empty rule dictionary - this will be filled in later
  169. self.rule_dictionary = {}
  170. # Check if a user wants to strip prefix
  171. self._strip_prefix = strip_prefix_flag
  172. # Check if a user wants to use his/her own rule tuples.
  173. self._rule_tuple = rule_tuple if rule_tuple else self.default_rule_tuple
  174. def parseRules(self, rule_tuple=None):
  175. """Validate the set of rules used in this stemmer.
  176. If this function is called as an individual method, without using stem
  177. method, rule_tuple argument will be compiled into self.rule_dictionary.
  178. If this function is called within stem, self._rule_tuple will be used.
  179. """
  180. # If there is no argument for the function, use class' own rule tuple.
  181. rule_tuple = rule_tuple if rule_tuple else self._rule_tuple
  182. valid_rule = re.compile("^[a-z]+\*?\d[a-z]*[>\.]?$")
  183. # Empty any old rules from the rule set before adding new ones
  184. self.rule_dictionary = {}
  185. for rule in rule_tuple:
  186. if not valid_rule.match(rule):
  187. raise ValueError("The rule {0} is invalid".format(rule))
  188. first_letter = rule[0:1]
  189. if first_letter in self.rule_dictionary:
  190. self.rule_dictionary[first_letter].append(rule)
  191. else:
  192. self.rule_dictionary[first_letter] = [rule]
  193. def stem(self, word):
  194. """Stem a word using the Lancaster stemmer.
  195. """
  196. # Lower-case the word, since all the rules are lower-cased
  197. word = word.lower()
  198. word = self.__stripPrefix(word) if self._strip_prefix else word
  199. # Save a copy of the original word
  200. intact_word = word
  201. # If rule dictionary is empty, parse rule tuple.
  202. if not self.rule_dictionary:
  203. self.parseRules()
  204. return self.__doStemming(word, intact_word)
  205. def __doStemming(self, word, intact_word):
  206. """Perform the actual word stemming
  207. """
  208. valid_rule = re.compile("^([a-z]+)(\*?)(\d)([a-z]*)([>\.]?)$")
  209. proceed = True
  210. while proceed:
  211. # Find the position of the last letter of the word to be stemmed
  212. last_letter_position = self.__getLastLetter(word)
  213. # Only stem the word if it has a last letter and a rule matching that last letter
  214. if (
  215. last_letter_position < 0
  216. or word[last_letter_position] not in self.rule_dictionary
  217. ):
  218. proceed = False
  219. else:
  220. rule_was_applied = False
  221. # Go through each rule that matches the word's final letter
  222. for rule in self.rule_dictionary[word[last_letter_position]]:
  223. rule_match = valid_rule.match(rule)
  224. if rule_match:
  225. (
  226. ending_string,
  227. intact_flag,
  228. remove_total,
  229. append_string,
  230. cont_flag,
  231. ) = rule_match.groups()
  232. # Convert the number of chars to remove when stemming
  233. # from a string to an integer
  234. remove_total = int(remove_total)
  235. # Proceed if word's ending matches rule's word ending
  236. if word.endswith(ending_string[::-1]):
  237. if intact_flag:
  238. if word == intact_word and self.__isAcceptable(
  239. word, remove_total
  240. ):
  241. word = self.__applyRule(
  242. word, remove_total, append_string
  243. )
  244. rule_was_applied = True
  245. if cont_flag == ".":
  246. proceed = False
  247. break
  248. elif self.__isAcceptable(word, remove_total):
  249. word = self.__applyRule(
  250. word, remove_total, append_string
  251. )
  252. rule_was_applied = True
  253. if cont_flag == ".":
  254. proceed = False
  255. break
  256. # If no rules apply, the word doesn't need any more stemming
  257. if rule_was_applied == False:
  258. proceed = False
  259. return word
  260. def __getLastLetter(self, word):
  261. """Get the zero-based index of the last alphabetic character in this string
  262. """
  263. last_letter = -1
  264. for position in range(len(word)):
  265. if word[position].isalpha():
  266. last_letter = position
  267. else:
  268. break
  269. return last_letter
  270. def __isAcceptable(self, word, remove_total):
  271. """Determine if the word is acceptable for stemming.
  272. """
  273. word_is_acceptable = False
  274. # If the word starts with a vowel, it must be at least 2
  275. # characters long to be stemmed
  276. if word[0] in "aeiouy":
  277. if len(word) - remove_total >= 2:
  278. word_is_acceptable = True
  279. # If the word starts with a consonant, it must be at least 3
  280. # characters long (including one vowel) to be stemmed
  281. elif len(word) - remove_total >= 3:
  282. if word[1] in "aeiouy":
  283. word_is_acceptable = True
  284. elif word[2] in "aeiouy":
  285. word_is_acceptable = True
  286. return word_is_acceptable
  287. def __applyRule(self, word, remove_total, append_string):
  288. """Apply the stemming rule to the word
  289. """
  290. # Remove letters from the end of the word
  291. new_word_length = len(word) - remove_total
  292. word = word[0:new_word_length]
  293. # And add new letters to the end of the truncated word
  294. if append_string:
  295. word += append_string
  296. return word
  297. def __stripPrefix(self, word):
  298. """Remove prefix from a word.
  299. This function originally taken from Whoosh.
  300. """
  301. for prefix in (
  302. "kilo",
  303. "micro",
  304. "milli",
  305. "intra",
  306. "ultra",
  307. "mega",
  308. "nano",
  309. "pico",
  310. "pseudo",
  311. ):
  312. if word.startswith(prefix):
  313. return word[len(prefix) :]
  314. return word
  315. def __repr__(self):
  316. return "<LancasterStemmer>"