  1. """
  2. Porter Stemmer
  3. This is the Porter stemming algorithm. It follows the algorithm
  4. presented in
  5. Porter, M. "An algorithm for suffix stripping." Program 14.3 (1980): 130-137.
  6. with some optional deviations that can be turned on or off with the
  7. `mode` argument to the constructor.
  8. Martin Porter, the algorithm's inventor, maintains a web page about the
  9. algorithm at
  10. http://www.tartarus.org/~martin/PorterStemmer/
  11. which includes another Python implementation and other implementations
  12. in many languages.
  13. """
__docformat__ = "plaintext"

import re

from nltk.stem.api import StemmerI


class PorterStemmer(StemmerI):
    """
    A word stemmer based on the Porter stemming algorithm.

        Porter, M. "An algorithm for suffix stripping."
        Program 14.3 (1980): 130-137.

    See http://www.tartarus.org/~martin/PorterStemmer/ for the homepage
    of the algorithm.

    Martin Porter has endorsed several modifications to the Porter
    algorithm since writing his original paper, and those extensions are
    included in the implementations on his website. Additionally, others
    have proposed further improvements to the algorithm, including NLTK
    contributors. There are thus three modes that can be selected by
    passing the appropriate constant to the class constructor's `mode`
    parameter:

        PorterStemmer.ORIGINAL_ALGORITHM

        - Implementation that is faithful to the original paper.

          Note that Martin Porter has deprecated this version of the
          algorithm. Martin distributes implementations of the Porter
          Stemmer in many languages, hosted at

              http://www.tartarus.org/~martin/PorterStemmer/

          and all of these implementations include his extensions. He
          strongly recommends against using the original, published
          version of the algorithm; only use this mode if you clearly
          understand why you are choosing to do so.

        PorterStemmer.MARTIN_EXTENSIONS

        - Implementation that only uses the modifications to the
          algorithm that are included in the implementations on Martin
          Porter's website. He has declared the Porter algorithm frozen,
          so the behaviour of those implementations should never change.

        PorterStemmer.NLTK_EXTENSIONS (default)

        - Implementation that includes further improvements devised by
          NLTK contributors or taken from other modified implementations
          found on the web.

    For the best stemming, you should use the default NLTK_EXTENSIONS
    version. However, if you need to get the same results as either the
    original algorithm or one of Martin Porter's hosted versions for
    compatibility with an existing implementation or dataset, you can use
    one of the other modes instead.
    """

    # Modes the Stemmer can be instantiated in
    NLTK_EXTENSIONS = "NLTK_EXTENSIONS"
    MARTIN_EXTENSIONS = "MARTIN_EXTENSIONS"
    ORIGINAL_ALGORITHM = "ORIGINAL_ALGORITHM"

    def __init__(self, mode=NLTK_EXTENSIONS):
        if mode not in (
            self.NLTK_EXTENSIONS,
            self.MARTIN_EXTENSIONS,
            self.ORIGINAL_ALGORITHM,
        ):
            raise ValueError(
                "Mode must be one of PorterStemmer.NLTK_EXTENSIONS, "
                "PorterStemmer.MARTIN_EXTENSIONS, or "
                "PorterStemmer.ORIGINAL_ALGORITHM"
            )

        self.mode = mode

        if self.mode == self.NLTK_EXTENSIONS:
            # This is a table of irregular forms. It is quite short,
            # but still reflects the errors actually drawn to Martin
            # Porter's attention over a 20 year period!
            irregular_forms = {
                "sky": ["sky", "skies"],
                "die": ["dying"],
                "lie": ["lying"],
                "tie": ["tying"],
                "news": ["news"],
                "inning": ["innings", "inning"],
                "outing": ["outings", "outing"],
                "canning": ["cannings", "canning"],
                "howe": ["howe"],
                "proceed": ["proceed"],
                "exceed": ["exceed"],
                "succeed": ["succeed"],
            }

            self.pool = {}
            for key in irregular_forms:
                for val in irregular_forms[key]:
                    self.pool[val] = key

        self.vowels = frozenset(["a", "e", "i", "o", "u"])

    def _is_consonant(self, word, i):
        """Returns True if word[i] is a consonant, False otherwise

        A consonant is defined in the paper as follows:

            A consonant in a word is a letter other than A, E, I, O or
            U, and other than Y preceded by a consonant. (The fact that
            the term `consonant' is defined to some extent in terms of
            itself does not make it ambiguous.) So in TOY the consonants
            are T and Y, and in SYZYGY they are S, Z and G. If a letter
            is not a consonant it is a vowel.
        """
        if word[i] in self.vowels:
            return False
        if word[i] == "y":
            if i == 0:
                return True
            else:
                return not self._is_consonant(word, i - 1)
        return True
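
    # Worked examples for the definition above (a sketch, not in the
    # original module): in "toy", 'y' follows the vowel 'o' and so counts
    # as a consonant; in "syzygy", each 'y' follows a consonant and so
    # counts as a vowel:
    #
    #     >>> PorterStemmer()._is_consonant("toy", 2)
    #     True
    #     >>> PorterStemmer()._is_consonant("syzygy", 1)
    #     False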

    def _measure(self, stem):
        """Returns the 'measure' of stem, per definition in the paper

        From the paper:

            A consonant will be denoted by c, a vowel by v. A list
            ccc... of length greater than 0 will be denoted by C, and a
            list vvv... of length greater than 0 will be denoted by V.
            Any word, or part of a word, therefore has one of the four
            forms:

                CVCV ... C
                CVCV ... V
                VCVC ... C
                VCVC ... V

            These may all be represented by the single form

                [C]VCVC ... [V]

            where the square brackets denote arbitrary presence of their
            contents. Using (VC){m} to denote VC repeated m times, this
            may again be written as

                [C](VC){m}[V].

            m will be called the *measure* of any word or word part when
            represented in this form. The case m = 0 covers the null
            word. Here are some examples:

                m=0    TR,  EE,  TREE,  Y,  BY.
                m=1    TROUBLE,  OATS,  TREES,  IVY.
                m=2    TROUBLES,  PRIVATE,  OATEN,  ORRERY.
        """
        cv_sequence = ""

        # Construct a string of 'c's and 'v's representing whether each
        # character in `stem` is a consonant or a vowel.
        # e.g. 'falafel' becomes 'cvcvcvc',
        #      'architecture' becomes 'vcccvcvccvcv'
        for i in range(len(stem)):
            if self._is_consonant(stem, i):
                cv_sequence += "c"
            else:
                cv_sequence += "v"

        # Count the number of 'vc' occurrences, which is equivalent to
        # the number of 'VC' occurrences in Porter's reduced form in the
        # docstring above, which is in turn equivalent to `m`
        return cv_sequence.count("vc")
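
    # Worked examples matching the paper's table (illustrative only):
    # "tree" maps to the c/v string 'ccvv', which contains no 'vc' pair,
    # so m=0; "trouble" maps to 'ccvvccv' (one 'vc'), so m=1; and
    # "troubles" maps to 'ccvvccvc' (two 'vc's), so m=2.
    #
    #     >>> PorterStemmer()._measure("troubles")
    #     2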

    def _has_positive_measure(self, stem):
        return self._measure(stem) > 0

    def _contains_vowel(self, stem):
        """Returns True if stem contains a vowel, else False"""
        for i in range(len(stem)):
            if not self._is_consonant(stem, i):
                return True
        return False

    def _ends_double_consonant(self, word):
        """Implements condition *d from the paper

        Returns True if word ends with a double consonant
        """
        return (
            len(word) >= 2
            and word[-1] == word[-2]
            and self._is_consonant(word, len(word) - 1)
        )

    def _ends_cvc(self, word):
        """Implements condition *o from the paper

        From the paper:

            *o - the stem ends cvc, where the second c is not W, X or Y
                 (e.g. -WIL, -HOP).
        """
        return (
            len(word) >= 3
            and self._is_consonant(word, len(word) - 3)
            and not self._is_consonant(word, len(word) - 2)
            and self._is_consonant(word, len(word) - 1)
            and word[-1] not in ("w", "x", "y")
        ) or (
            self.mode == self.NLTK_EXTENSIONS
            and len(word) == 2
            and not self._is_consonant(word, 0)
            and self._is_consonant(word, 1)
        )
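
    # Illustrative checks (not in the original module): "hop" ends
    # consonant-vowel-consonant with a final letter outside {w, x, y},
    # so *o holds; "box" fails because it ends in 'x'. The NLTK-only
    # branch additionally accepts two-letter vowel-consonant words
    # such as "at".
    #
    #     >>> PorterStemmer()._ends_cvc("hop")
    #     True
    #     >>> PorterStemmer()._ends_cvc("box")
    #     False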

    def _replace_suffix(self, word, suffix, replacement):
        """Replaces `suffix` of `word` with `replacement`"""
        assert word.endswith(suffix), "Given word doesn't end with given suffix"
        if suffix == "":
            return word + replacement
        else:
            return word[: -len(suffix)] + replacement

    def _apply_rule_list(self, word, rules):
        """Applies the first applicable suffix-removal rule to the word

        Takes a word and a list of suffix-removal rules represented as
        3-tuples, with the first element being the suffix to remove,
        the second element being the string to replace it with, and the
        final element being the condition for the rule to be applicable,
        or None if the rule is unconditional.
        """
        for rule in rules:
            suffix, replacement, condition = rule
            if suffix == "*d" and self._ends_double_consonant(word):
                stem = word[:-2]
                if condition is None or condition(stem):
                    return stem + replacement
                else:
                    # Don't try any further rules
                    return word
            if word.endswith(suffix):
                stem = self._replace_suffix(word, suffix, "")
                if condition is None or condition(stem):
                    return stem + replacement
                else:
                    # Don't try any further rules
                    return word

        return word
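
    # A sketch of how a rule list is applied (illustrative only): given
    # the rule ('sses', 'ss', None), "caresses" matches the suffix, the
    # condition is vacuously satisfied, and the word becomes "caress".
    # The special suffix '*d' instead matches any double consonant, so
    # a rule like ('*d', 'p', condition) can rewrite "hopp" to "hop".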

    def _step1a(self, word):
        """Implements Step 1a from "An algorithm for suffix stripping"

        From the paper:

            SSES -> SS                         caresses  ->  caress
            IES  -> I                          ponies    ->  poni
                                               ties      ->  ti
            SS   -> SS                         caress    ->  caress
            S    ->                            cats      ->  cat
        """
        # this NLTK-only rule extends the original algorithm, so
        # that 'flies'->'fli' but 'dies'->'die' etc
        if self.mode == self.NLTK_EXTENSIONS:
            if word.endswith("ies") and len(word) == 4:
                return self._replace_suffix(word, "ies", "ie")

        return self._apply_rule_list(
            word,
            [
                ("sses", "ss", None),  # SSES -> SS
                ("ies", "i", None),  # IES  -> I
                ("ss", "ss", None),  # SS   -> SS
                ("s", "", None),  # S    ->
            ],
        )
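
    # Illustrative results (a sketch; outputs assume the stated modes):
    #
    #     >>> PorterStemmer()._step1a("caresses")
    #     'caress'
    #     >>> PorterStemmer()._step1a("ties")  # NLTK-only 4-letter -ies rule
    #     'tie'
    #     >>> PorterStemmer(PorterStemmer.ORIGINAL_ALGORITHM)._step1a("ties")
    #     'ti'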

    def _step1b(self, word):
        """Implements Step 1b from "An algorithm for suffix stripping"

        From the paper:

            (m>0) EED -> EE                    feed      ->  feed
                                               agreed    ->  agree
            (*v*) ED  ->                       plastered ->  plaster
                                               bled      ->  bled
            (*v*) ING ->                       motoring  ->  motor
                                               sing      ->  sing

        If the second or third of the rules in Step 1b is successful,
        the following is done:

            AT -> ATE                       conflat(ed)  ->  conflate
            BL -> BLE                       troubl(ed)   ->  trouble
            IZ -> IZE                       siz(ed)      ->  size
            (*d and not (*L or *S or *Z))
               -> single letter
                                            hopp(ing)    ->  hop
                                            tann(ed)     ->  tan
                                            fall(ing)    ->  fall
                                            hiss(ing)    ->  hiss
                                            fizz(ed)     ->  fizz
            (m=1 and *o) -> E               fail(ing)    ->  fail
                                            fil(ing)     ->  file

        The rule to map to a single letter causes the removal of one of
        the double letter pair. The -E is put back on -AT, -BL and -IZ,
        so that the suffixes -ATE, -BLE and -IZE can be recognised
        later. This E may be removed in step 4.
        """
        # this NLTK-only block extends the original algorithm, so that
        # 'spied'->'spi' but 'died'->'die' etc
        if self.mode == self.NLTK_EXTENSIONS:
            if word.endswith("ied"):
                if len(word) == 4:
                    return self._replace_suffix(word, "ied", "ie")
                else:
                    return self._replace_suffix(word, "ied", "i")

        # (m>0) EED -> EE
        if word.endswith("eed"):
            stem = self._replace_suffix(word, "eed", "")
            if self._measure(stem) > 0:
                return stem + "ee"
            else:
                return word

        rule_2_or_3_succeeded = False

        for suffix in ["ed", "ing"]:
            if word.endswith(suffix):
                intermediate_stem = self._replace_suffix(word, suffix, "")
                if self._contains_vowel(intermediate_stem):
                    rule_2_or_3_succeeded = True
                    break

        if not rule_2_or_3_succeeded:
            return word

        return self._apply_rule_list(
            intermediate_stem,
            [
                ("at", "ate", None),  # AT -> ATE
                ("bl", "ble", None),  # BL -> BLE
                ("iz", "ize", None),  # IZ -> IZE
                # (*d and not (*L or *S or *Z))
                # -> single letter
                (
                    "*d",
                    intermediate_stem[-1],
                    lambda stem: intermediate_stem[-1] not in ("l", "s", "z"),
                ),
                # (m=1 and *o) -> E
                (
                    "",
                    "e",
                    lambda stem: (self._measure(stem) == 1 and self._ends_cvc(stem)),
                ),
            ],
        )
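
    # Illustrative results (a sketch, not in the original module):
    # "plastered" loses -ED ("plaster"), "hopping" has its double
    # consonant undoubled ("hop"), and "filing" gets an -E restored
    # because the stem "fil" has m=1 and ends cvc:
    #
    #     >>> PorterStemmer()._step1b("filing")
    #     'file'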

    def _step1c(self, word):
        """Implements Step 1c from "An algorithm for suffix stripping"

        From the paper:

            Step 1c

            (*v*) Y -> I                    happy        ->  happi
                                            sky          ->  sky
        """

        def nltk_condition(stem):
            """
            This has been modified from the original Porter algorithm so
            that y->i is only done when y is preceded by a consonant,
            but not if the stem is only a single consonant, i.e.

                (*c and not c) Y -> I

            So 'happy' -> 'happi', but
               'enjoy' -> 'enjoy' etc

            This is a much better rule. Formerly 'enjoy'->'enjoi' and
            'enjoyment'->'enjoy'. Step 1c is perhaps done too soon; but
            with this modification that no longer really matters.

            Also, the removal of the contains_vowel(z) condition means
            that 'spy', 'fly', 'try' ... stem to 'spi', 'fli', 'tri' and
            conflate with 'spied', 'tried', 'flies' ...
            """
            return len(stem) > 1 and self._is_consonant(stem, len(stem) - 1)

        def original_condition(stem):
            return self._contains_vowel(stem)

        return self._apply_rule_list(
            word,
            [
                (
                    "y",
                    "i",
                    nltk_condition
                    if self.mode == self.NLTK_EXTENSIONS
                    else original_condition,
                )
            ],
        )
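
    # Illustrative contrast between the two conditions (sketch only):
    # with NLTK_EXTENSIONS, "enjoy" is left alone because its 'y'
    # follows a vowel, while "happy" becomes "happi"; under the original
    # condition, "enjoy" would become "enjoi" because the stem "enjo"
    # contains a vowel.
    #
    #     >>> PorterStemmer()._step1c("enjoy")
    #     'enjoy'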

    def _step2(self, word):
        """Implements Step 2 from "An algorithm for suffix stripping"

        From the paper:

            Step 2

            (m>0) ATIONAL ->  ATE       relational     ->  relate
            (m>0) TIONAL  ->  TION      conditional    ->  condition
                                        rational       ->  rational
            (m>0) ENCI    ->  ENCE      valenci        ->  valence
            (m>0) ANCI    ->  ANCE      hesitanci      ->  hesitance
            (m>0) IZER    ->  IZE       digitizer      ->  digitize
            (m>0) ABLI    ->  ABLE      conformabli    ->  conformable
            (m>0) ALLI    ->  AL        radicalli      ->  radical
            (m>0) ENTLI   ->  ENT       differentli    ->  different
            (m>0) ELI     ->  E         vileli         ->  vile
            (m>0) OUSLI   ->  OUS       analogousli    ->  analogous
            (m>0) IZATION ->  IZE       vietnamization ->  vietnamize
            (m>0) ATION   ->  ATE       predication    ->  predicate
            (m>0) ATOR    ->  ATE       operator       ->  operate
            (m>0) ALISM   ->  AL        feudalism      ->  feudal
            (m>0) IVENESS ->  IVE       decisiveness   ->  decisive
            (m>0) FULNESS ->  FUL       hopefulness    ->  hopeful
            (m>0) OUSNESS ->  OUS       callousness    ->  callous
            (m>0) ALITI   ->  AL        formaliti      ->  formal
            (m>0) IVITI   ->  IVE       sensitiviti    ->  sensitive
            (m>0) BILITI  ->  BLE       sensibiliti    ->  sensible
        """
        if self.mode == self.NLTK_EXTENSIONS:
            # Instead of applying the ALLI -> AL rule after '(a)bli' per
            # the published algorithm, we apply it first, and, if it
            # succeeds, run the result through step 2 again.
            if word.endswith("alli") and self._has_positive_measure(
                self._replace_suffix(word, "alli", "")
            ):
                return self._step2(self._replace_suffix(word, "alli", "al"))

        bli_rule = ("bli", "ble", self._has_positive_measure)
        abli_rule = ("abli", "able", self._has_positive_measure)

        rules = [
            ("ational", "ate", self._has_positive_measure),
            ("tional", "tion", self._has_positive_measure),
            ("enci", "ence", self._has_positive_measure),
            ("anci", "ance", self._has_positive_measure),
            ("izer", "ize", self._has_positive_measure),
            abli_rule if self.mode == self.ORIGINAL_ALGORITHM else bli_rule,
            ("alli", "al", self._has_positive_measure),
            ("entli", "ent", self._has_positive_measure),
            ("eli", "e", self._has_positive_measure),
            ("ousli", "ous", self._has_positive_measure),
            ("ization", "ize", self._has_positive_measure),
            ("ation", "ate", self._has_positive_measure),
            ("ator", "ate", self._has_positive_measure),
            ("alism", "al", self._has_positive_measure),
            ("iveness", "ive", self._has_positive_measure),
            ("fulness", "ful", self._has_positive_measure),
            ("ousness", "ous", self._has_positive_measure),
            ("aliti", "al", self._has_positive_measure),
            ("iviti", "ive", self._has_positive_measure),
            ("biliti", "ble", self._has_positive_measure),
        ]

        if self.mode == self.NLTK_EXTENSIONS:
            rules.append(("fulli", "ful", self._has_positive_measure))

            # The 'l' of the 'logi' -> 'log' rule is put with the stem,
            # so that short stems like 'geo' 'theo' etc work like
            # 'archaeo' 'philo' etc.
            rules.append(
                ("logi", "log", lambda stem: self._has_positive_measure(word[:-3]))
            )

        if self.mode == self.MARTIN_EXTENSIONS:
            rules.append(("logi", "log", self._has_positive_measure))

        return self._apply_rule_list(word, rules)
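
    # Illustrative results (sketch only): "relational" -> "relate" via
    # the ATIONAL rule; under NLTK_EXTENSIONS, "radicalli" first becomes
    # "radical" via the early ALLI rule and is then re-run through this
    # step (where no further rule fires).
    #
    #     >>> PorterStemmer()._step2("relational")
    #     'relate'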

    def _step3(self, word):
        """Implements Step 3 from "An algorithm for suffix stripping"

        From the paper:

            Step 3

            (m>0) ICATE ->  IC          triplicate     ->  triplic
            (m>0) ATIVE ->              formative      ->  form
            (m>0) ALIZE ->  AL          formalize      ->  formal
            (m>0) ICITI ->  IC          electriciti    ->  electric
            (m>0) ICAL  ->  IC          electrical     ->  electric
            (m>0) FUL   ->              hopeful        ->  hope
            (m>0) NESS  ->              goodness       ->  good
        """
        return self._apply_rule_list(
            word,
            [
                ("icate", "ic", self._has_positive_measure),
                ("ative", "", self._has_positive_measure),
                ("alize", "al", self._has_positive_measure),
                ("iciti", "ic", self._has_positive_measure),
                ("ical", "ic", self._has_positive_measure),
                ("ful", "", self._has_positive_measure),
                ("ness", "", self._has_positive_measure),
            ],
        )
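
    # Illustrative results (sketch only):
    #
    #     >>> PorterStemmer()._step3("electriciti")
    #     'electric'
    #     >>> PorterStemmer()._step3("hopeful")
    #     'hope'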

    def _step4(self, word):
        """Implements Step 4 from "An algorithm for suffix stripping"

        From the paper:

            Step 4

            (m>1) AL    ->              revival        ->  reviv
            (m>1) ANCE  ->              allowance      ->  allow
            (m>1) ENCE  ->              inference      ->  infer
            (m>1) ER    ->              airliner       ->  airlin
            (m>1) IC    ->              gyroscopic     ->  gyroscop
            (m>1) ABLE  ->              adjustable     ->  adjust
            (m>1) IBLE  ->              defensible     ->  defens
            (m>1) ANT   ->              irritant       ->  irrit
            (m>1) EMENT ->              replacement    ->  replac
            (m>1) MENT  ->              adjustment     ->  adjust
            (m>1) ENT   ->              dependent      ->  depend
            (m>1 and (*S or *T)) ION -> adoption       ->  adopt
            (m>1) OU    ->              homologou      ->  homolog
            (m>1) ISM   ->              communism      ->  commun
            (m>1) ATE   ->              activate       ->  activ
            (m>1) ITI   ->              angulariti     ->  angular
            (m>1) OUS   ->              homologous     ->  homolog
            (m>1) IVE   ->              effective      ->  effect
            (m>1) IZE   ->              bowdlerize     ->  bowdler

        The suffixes are now removed. All that remains is a little
        tidying up.
        """
        measure_gt_1 = lambda stem: self._measure(stem) > 1

        return self._apply_rule_list(
            word,
            [
                ("al", "", measure_gt_1),
                ("ance", "", measure_gt_1),
                ("ence", "", measure_gt_1),
                ("er", "", measure_gt_1),
                ("ic", "", measure_gt_1),
                ("able", "", measure_gt_1),
                ("ible", "", measure_gt_1),
                ("ant", "", measure_gt_1),
                ("ement", "", measure_gt_1),
                ("ment", "", measure_gt_1),
                ("ent", "", measure_gt_1),
                # (m>1 and (*S or *T)) ION ->
                (
                    "ion",
                    "",
                    lambda stem: self._measure(stem) > 1 and stem[-1] in ("s", "t"),
                ),
                ("ou", "", measure_gt_1),
                ("ism", "", measure_gt_1),
                ("ate", "", measure_gt_1),
                ("iti", "", measure_gt_1),
                ("ous", "", measure_gt_1),
                ("ive", "", measure_gt_1),
                ("ize", "", measure_gt_1),
            ],
        )
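
    # Illustrative results (sketch only): the ION rule needs the stem to
    # end in 's' or 't' as well as m>1, so "adoption" -> "adopt".
    #
    #     >>> PorterStemmer()._step4("adoption")
    #     'adopt'
    #     >>> PorterStemmer()._step4("allowance")
    #     'allow'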

    def _step5a(self, word):
        """Implements Step 5a from "An algorithm for suffix stripping"

        From the paper:

            Step 5a

            (m>1) E     ->              probate        ->  probat
                                        rate           ->  rate
            (m=1 and not *o) E ->       cease          ->  ceas
        """
        # Note that Martin's test vocabulary and reference
        # implementations are inconsistent in how they handle the case
        # where two rules both refer to a suffix that matches the word
        # to be stemmed, but only the condition of the second one is
        # true.
        # Earlier in step 1b we had the rules:
        #     (m>0) EED -> EE
        #     (*v*) ED  ->
        # but the examples in the paper included "feed"->"feed", even
        # though (*v*) is true for "fe" and therefore the second rule
        # alone would map "feed"->"fe".
        # However, in THIS case, we need to handle the consecutive rules
        # differently and try both conditions (obviously; the second
        # rule here would be redundant otherwise). Martin's paper makes
        # no explicit mention of the inconsistency; you have to infer it
        # from the examples.
        # For this reason, we can't use _apply_rule_list here.
        if word.endswith("e"):
            stem = self._replace_suffix(word, "e", "")
            if self._measure(stem) > 1:
                return stem
            if self._measure(stem) == 1 and not self._ends_cvc(stem):
                return stem
        return word
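
    # Illustrative results (sketch only): "probate" -> "probat" because
    # the stem has m=2, while "rate" keeps its -E because the stem "rat"
    # has m=1 and ends cvc.
    #
    #     >>> PorterStemmer()._step5a("rate")
    #     'rate'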

    def _step5b(self, word):
        """Implements Step 5b from "An algorithm for suffix stripping"

        From the paper:

            Step 5b

            (m > 1 and *d and *L) -> single letter
                                        controll       ->  control
                                        roll           ->  roll
        """
        return self._apply_rule_list(
            word, [("ll", "l", lambda stem: self._measure(word[:-1]) > 1)]
        )
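
    # Illustrative results (sketch only): "controll" -> "control"
    # because m("control") is 2, while "roll" is left alone because
    # m("rol") is only 1.
    #
    #     >>> PorterStemmer()._step5b("controll")
    #     'control'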

    def stem(self, word):
        stem = word.lower()

        if self.mode == self.NLTK_EXTENSIONS and stem in self.pool:
            return self.pool[stem]

        if self.mode != self.ORIGINAL_ALGORITHM and len(word) <= 2:
            # With this line, strings of length 1 or 2 don't go through
            # the stemming process, although no mention is made of this
            # in the published algorithm.
            return word

        stem = self._step1a(stem)
        stem = self._step1b(stem)
        stem = self._step1c(stem)
        stem = self._step2(stem)
        stem = self._step3(stem)
        stem = self._step4(stem)
        stem = self._step5a(stem)
        stem = self._step5b(stem)

        return stem
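
    # End-to-end trace (a sketch, not in the original module): the
    # paper's running example "generalizations" passes through the steps
    # as generalizations -> generalization (1a) -> generalize (2)
    # -> general (3) -> gener (4).
    #
    #     >>> PorterStemmer().stem("generalizations")
    #     'gener'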

    def __repr__(self):
        return "<PorterStemmer>"


def demo():
    """
    A demonstration of the porter stemmer on a sample from
    the Penn Treebank corpus.
    """
    from nltk.corpus import treebank
    from nltk import stem

    stemmer = stem.PorterStemmer()

    orig = []
    stemmed = []
    for item in treebank.fileids()[:3]:
        for (word, tag) in treebank.tagged_words(item):
            orig.append(word)
            stemmed.append(stemmer.stem(word))

    # Convert the results to a string, and word-wrap them.
    results = " ".join(stemmed)
    results = re.sub(r"(.{,70})\s", r"\1\n", results + " ").rstrip()

    # Convert the original to a string, and word wrap it.
    original = " ".join(orig)
    original = re.sub(r"(.{,70})\s", r"\1\n", original + " ").rstrip()

    # Print the results.
    print("-Original-".center(70).replace(" ", "*").replace("-", " "))
    print(original)
    print("-Results-".center(70).replace(" ", "*").replace("-", " "))
    print(results)
    print("*" * 70)