rslp.py

# -*- coding: utf-8 -*-
# Natural Language Toolkit: RSLP Stemmer
#
# Copyright (C) 2001-2020 NLTK Project
# Author: Tiago Tresoldi <tresoldi@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT

# This code is based on the algorithm presented in the paper "A Stemming
# Algorithm for the Portuguese Language" by Viviane Moreira Orengo and
# Christian Huyck, to which I unfortunately had no access. The code is a
# Python version, with some minor modifications of mine, of the description
# presented at http://www.webcitation.org/5NnvdIzOb and of the C source code
# available at http://www.inf.ufrgs.br/~arcoelho/rslp/integrando_rslp.html.
# Please note that this stemmer is intended for demonstration and educational
# purposes only. Feel free to write me for any comments, including the
# development of a different and/or better stemmer for Portuguese. I also
# suggest using NLTK's mailing list for Portuguese for any discussion.
from nltk.data import load
from nltk.stem.api import StemmerI


class RSLPStemmer(StemmerI):
    """
    A stemmer for Portuguese.

        >>> from nltk.stem import RSLPStemmer
        >>> st = RSLPStemmer()
        >>> # opening lines of Erico Verissimo's "Música ao Longe"
        >>> text = '''
        ... Clarissa risca com giz no quadro-negro a paisagem que os alunos
        ... devem copiar . Uma casinha de porta e janela , em cima duma
        ... coxilha .'''
        >>> for token in text.split():
        ...     print(st.stem(token))
        clariss risc com giz no quadro-negr a pais que os alun dev copi .
        uma cas de port e janel , em cim dum coxilh .
    """

    def __init__(self):
        self._model = []

        self._model.append(self.read_rule("step0.pt"))
        self._model.append(self.read_rule("step1.pt"))
        self._model.append(self.read_rule("step2.pt"))
        self._model.append(self.read_rule("step3.pt"))
        self._model.append(self.read_rule("step4.pt"))
        self._model.append(self.read_rule("step5.pt"))
        self._model.append(self.read_rule("step6.pt"))
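
    # For reference (an added note, inferred from the comments in stem()
    # below, not from the paper itself): the seven rule files correspond
    # to the steps of the RSLP algorithm, namely
    #   step0.pt plural, step1.pt feminine, step2.pt adverb,
    #   step3.pt augmentative, step4.pt noun suffixes,
    #   step5.pt verb suffixes, step6.pt vowel removal.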

    def read_rule(self, filename):
        rules = load("nltk:stemmers/rslp/" + filename, format="raw").decode("utf8")
        lines = rules.split("\n")

        lines = [line for line in lines if line != ""]  # remove blank lines
        lines = [line for line in lines if line[0] != "#"]  # remove comments

        # NOTE: a simple but ugly hack to make this parser happy with double '\t's
        lines = [line.replace("\t\t", "\t") for line in lines]

        # parse rules
        rules = []
        for line in lines:
            rule = []
            tokens = line.split("\t")

            # text to be searched for at the end of the string
            rule.append(tokens[0][1:-1])  # remove quotes

            # minimum stem size to perform the replacement
            rule.append(int(tokens[1]))

            # text to be replaced into
            rule.append(tokens[2][1:-1])  # remove quotes

            # exceptions to this rule
            rule.append([token[1:-1] for token in tokens[3].split(",")])

            # append to the results
            rules.append(rule)

        return rules
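
    # Illustrative only, not part of the original module: judging from the
    # parser above, a line in the step*.pt rule files is assumed to be
    # tab-separated, with the suffix, the replacement and each exception
    # quoted, e.g.
    #
    #   "inha"<TAB>3<TAB>""<TAB>"linha","rainha"
    #
    # (the exception words here are hypothetical). read_rule() would turn
    # this into the list ["inha", 3, "", ["linha", "rainha"]]: strip the
    # suffix "inha" and append "" whenever at least 3 characters of stem
    # remain and the word is not one of the listed exceptions.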

    def stem(self, word):
        word = word.lower()

        # the word ends in 's'? apply rule for plural reduction
        if word[-1] == "s":
            word = self.apply_rule(word, 0)

        # the word ends in 'a'? apply rule for feminine reduction
        if word[-1] == "a":
            word = self.apply_rule(word, 1)

        # augmentative reduction
        word = self.apply_rule(word, 3)

        # adverb reduction
        word = self.apply_rule(word, 2)

        # noun reduction
        prev_word = word
        word = self.apply_rule(word, 4)

        if word == prev_word:
            # verb reduction
            prev_word = word
            word = self.apply_rule(word, 5)

            if word == prev_word:
                # vowel removal
                word = self.apply_rule(word, 6)

        return word

    def apply_rule(self, word, rule_index):
        rules = self._model[rule_index]
        for rule in rules:
            suffix_length = len(rule[0])
            if word[-suffix_length:] == rule[0]:  # if suffix matches
                if len(word) >= suffix_length + rule[1]:  # if we have minimum size
                    if word not in rule[3]:  # if not an exception
                        word = word[:-suffix_length] + rule[2]
                        break

        return word
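

# A minimal usage sketch (an addition, not part of the original module); it
# assumes the RSLP rule files are available, e.g. after nltk.download("rslp").
# Expected output, per the doctest above:
#   casinha -> cas, alunos -> alun, copiar -> copi, paisagem -> pais
if __name__ == "__main__":
    stemmer = RSLPStemmer()
    for word in ["casinha", "alunos", "copiar", "paisagem"]:
        print(word, "->", stemmer.stem(word))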