rule.py

# -*- coding: utf-8 -*-
# Natural Language Toolkit: Transformation-based learning
#
# Copyright (C) 2001-2020 NLTK Project
# Author: Marcus Uneson <marcus.uneson@gmail.com>
#   based on previous (nltk2) version by
#   Christopher Maloof, Edward Loper, Steven Bird
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT

from abc import ABCMeta, abstractmethod

from nltk import jsontags

######################################################################
# Tag Rules
######################################################################
class TagRule(metaclass=ABCMeta):
    """
    An interface for tag transformations on a tagged corpus, as
    performed by tbl taggers. Each transformation finds all tokens
    in the corpus that are tagged with a specific original tag and
    satisfy a specific condition, and replaces their tags with a
    replacement tag. For any given transformation, the original
    tag, replacement tag, and condition are fixed. Conditions may
    depend on the token under consideration, as well as any other
    tokens in the corpus.

    Tag rules must be comparable and hashable.
    """

    def __init__(self, original_tag, replacement_tag):

        self.original_tag = original_tag
        """The tag which this TagRule may cause to be replaced."""

        self.replacement_tag = replacement_tag
        """The tag with which this TagRule may replace another tag."""

    def apply(self, tokens, positions=None):
        """
        Apply this rule at every position in *positions* where it
        applies to the given sentence. I.e., for each position p
        in *positions*, if *tokens[p]* is tagged with this rule's
        original tag, and satisfies this rule's condition, then set
        its tag to be this rule's replacement tag.

        :param tokens: The tagged sentence
        :type tokens: list(tuple(str, str))
        :param positions: The positions where the transformation is to
            be tried. If not specified, try it at all positions.
        :type positions: list(int)
        :return: The indices of tokens whose tags were changed by this
            rule.
        :rtype: list(int)
        """
        if positions is None:
            positions = list(range(len(tokens)))

        # Determine the indices at which this rule applies.
        change = [i for i in positions if self.applies(tokens, i)]

        # Make the changes. Note: this must be done in a separate
        # step from finding applicable locations, since we don't want
        # the rule to interact with itself.
        for i in change:
            tokens[i] = (tokens[i][0], self.replacement_tag)

        return change

    @abstractmethod
    def applies(self, tokens, index):
        """
        :return: True if the rule would change the tag of
            ``tokens[index]``, False otherwise
        :rtype: bool
        :param tokens: A tagged sentence
        :type tokens: list(tuple(str, str))
        :param index: The index to check
        :type index: int
        """

    # Rules must be comparable and hashable for the algorithm to work
    def __eq__(self, other):
        raise TypeError("Rules must implement __eq__()")

    def __ne__(self, other):
        raise TypeError("Rules must implement __ne__()")

    def __hash__(self):
        raise TypeError("Rules must implement __hash__()")
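

# The sketch below is an illustrative addition, not part of the original
# module: the smallest possible concrete TagRule hard-wires a single
# condition in applies() and supplies the __eq__/__ne__/__hash__ that the
# interface demands. The class name and its `word` parameter are invented
# for the example; the general-purpose implementation used by the toolkit
# is the Rule class that follows.
class _ExamplePrevWordRule(TagRule):
    """Replace original_tag with replacement_tag when the previous word equals `word`."""

    def __init__(self, original_tag, replacement_tag, word):
        super().__init__(original_tag, replacement_tag)
        self.word = word

    def applies(self, tokens, index):
        # Fire only on tokens carrying the original tag whose left
        # neighbour is the trigger word.
        return (
            tokens[index][1] == self.original_tag
            and index > 0
            and tokens[index - 1][0] == self.word
        )

    def __eq__(self, other):
        return isinstance(other, _ExamplePrevWordRule) and (
            self.original_tag,
            self.replacement_tag,
            self.word,
        ) == (other.original_tag, other.replacement_tag, other.word)

    def __ne__(self, other):
        return not self == other

    def __hash__(self):
        return hash((type(self), self.original_tag, self.replacement_tag, self.word))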


@jsontags.register_tag
class Rule(TagRule):
    """
    A Rule checks the current corpus position for a certain set of conditions;
    if they are all fulfilled, the Rule is triggered, meaning that it
    will change tag A to tag B. Tokens with any tag other than A are
    left unchanged.

    The conditions are parameters to the Rule instance. Each condition is a
    feature-value pair, with a set of positions to check for the value of the
    corresponding feature. Conceptually, the positions are joined by logical
    OR, and the feature set by logical AND.

    More formally, the Rule is applicable to the M{n}th token iff:

      - The M{n}th token is tagged with the Rule's original tag; and

      - For each (Feature(positions), M{value}) tuple:

        - The value of Feature of at least one token in {n+p for p in positions}
          is M{value}.
    """

    json_tag = "nltk.tbl.Rule"

    def __init__(self, templateid, original_tag, replacement_tag, conditions):
        """
        Construct a new Rule that changes a token's tag from
        *original_tag* to *replacement_tag* if all of the properties
        specified in *conditions* hold.

        :param templateid: the template id (a zero-padded string, '001' etc,
            so it will sort nicely)
        :type templateid: str
        :param conditions: A list of (Feature(positions), value) pairs, each of
            which specifies that the property (computed by
            Feature.extract_property()) of at least one token at position
            n + p, for p in positions, is value.
        :type conditions: iterable of (Feature, value) tuples
        """
        TagRule.__init__(self, original_tag, replacement_tag)
        self._conditions = conditions
        self.templateid = templateid

    def encode_json_obj(self):
        return {
            "templateid": self.templateid,
            "original": self.original_tag,
            "replacement": self.replacement_tag,
            "conditions": self._conditions,
        }

    @classmethod
    def decode_json_obj(cls, obj):
        return cls(
            obj["templateid"],
            obj["original"],
            obj["replacement"],
            tuple(tuple(feat) for feat in obj["conditions"]),
        )

    def applies(self, tokens, index):
        # Inherit docs from TagRule

        # Does the given token have this Rule's "original tag"?
        if tokens[index][1] != self.original_tag:
            return False

        # Check to make sure that every condition holds.
        for (feature, val) in self._conditions:

            # Look for *any* token that satisfies the condition.
            for pos in feature.positions:
                if not (0 <= index + pos < len(tokens)):
                    continue
                if feature.extract_property(tokens, index + pos) == val:
                    break
            else:
                # No token satisfied the condition; return false.
                return False

        # Every condition checked out, so the Rule is applicable.
        return True

    def __eq__(self, other):
        return self is other or (
            other is not None
            and other.__class__ == self.__class__
            and self.original_tag == other.original_tag
            and self.replacement_tag == other.replacement_tag
            and self._conditions == other._conditions
        )

    def __ne__(self, other):
        return not (self == other)

    def __hash__(self):

        # Cache our hash value (justified by profiling.)
        try:
            return self.__hash
        except AttributeError:
            self.__hash = hash(repr(self))
            return self.__hash

    def __repr__(self):
        # Cache the repr (justified by profiling -- this is used as
        # a sort key when deterministic=True.)
        try:
            return self.__repr
        except AttributeError:
            self.__repr = "{0}('{1}', {2}, {3}, [{4}])".format(
                self.__class__.__name__,
                self.templateid,
                repr(self.original_tag),
                repr(self.replacement_tag),
                # list(self._conditions) would be simpler but will not generate
                # the same Rule.__repr__ in python 2 and 3 and thus break some tests
                ", ".join(
                    "({0},{1})".format(f, repr(v)) for (f, v) in self._conditions
                ),
            )
            return self.__repr

    def __str__(self):
        def _condition_to_logic(feature, value):
            """
            Return a compact, predicate-logic styled string representation
            of the given condition.
            """
            return "{0}:{1}@[{2}]".format(
                feature.PROPERTY_NAME,
                value,
                ",".join(str(w) for w in feature.positions),
            )

        conditions = " & ".join(
            [_condition_to_logic(f, v) for (f, v) in self._conditions]
        )
        s = "{0}->{1} if {2}".format(
            self.original_tag, self.replacement_tag, conditions
        )

        return s

    def format(self, fmt):
        """
        Return a string representation of this rule.

        >>> from nltk.tbl.rule import Rule
        >>> from nltk.tag.brill import Pos

        >>> r = Rule("23", "VB", "NN", [(Pos([-2,-1]), 'DT')])

        >>> r.format("str") == str(r)
        True
        >>> r.format("str")
        'VB->NN if Pos:DT@[-2,-1]'

        >>> r.format("repr") == repr(r)
        True
        >>> r.format("repr")
        "Rule('23', 'VB', 'NN', [(Pos([-2, -1]),'DT')])"

        >>> r.format("verbose")
        'VB -> NN if the Pos of words i-2...i-1 is "DT"'

        >>> r.format("not_found")
        Traceback (most recent call last):
          File "<stdin>", line 1, in <module>
          File "nltk/tbl/rule.py", line 256, in format
            raise ValueError("unknown rule format spec: {0}".format(fmt))
        ValueError: unknown rule format spec: not_found
        >>>

        :param fmt: format specification
        :type fmt: str
        :return: string representation
        :rtype: str
        """
        if fmt == "str":
            return self.__str__()
        elif fmt == "repr":
            return self.__repr__()
        elif fmt == "verbose":
            return self._verbose_format()
        else:
            raise ValueError("unknown rule format spec: {0}".format(fmt))

    def _verbose_format(self):
        """
        Return a wordy, human-readable string representation
        of the given rule.

        Not sure how useful this is.
        """

        def condition_to_str(feature, value):
            return 'the %s of %s is "%s"' % (
                feature.PROPERTY_NAME,
                range_to_str(feature.positions),
                value,
            )

        def range_to_str(positions):
            if len(positions) == 1:
                p = positions[0]
                if p == 0:
                    return "this word"
                if p == -1:
                    return "the preceding word"
                elif p == 1:
                    return "the following word"
                elif p < 0:
                    return "word i-%d" % -p
                elif p > 0:
                    return "word i+%d" % p
            else:
                # for complete compatibility with the wordy format of nltk2
                mx = max(positions)
                mn = min(positions)
                if mx - mn == len(positions) - 1:
                    return "words i%+d...i%+d" % (mn, mx)
                else:
                    return "words {%s}" % (",".join("i%+d" % d for d in positions),)

        replacement = "%s -> %s" % (self.original_tag, self.replacement_tag)
        conditions = (" if " if self._conditions else "") + ", and ".join(
            condition_to_str(f, v) for (f, v) in self._conditions
        )
        return replacement + conditions
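

# A minimal usage sketch (an illustrative addition, not part of the original
# module): build a Rule with the Pos feature from nltk.tag.brill and apply it
# in place to a small tagged sentence. The example sentence and the template
# id "001" are invented for the demonstration.
if __name__ == "__main__":
    from nltk.tag.brill import Pos

    # Change VB to NN when one of the two preceding tokens is tagged DT.
    example_rule = Rule("001", "VB", "NN", [(Pos([-2, -1]), "DT")])
    sentence = [("The", "DT"), ("cat", "VB"), ("sat", "VBD")]

    changed = example_rule.apply(sentence)
    print(changed)   # [1] -- only the token at index 1 had tag VB and a DT neighbour
    print(sentence)  # [('The', 'DT'), ('cat', 'NN'), ('sat', 'VBD')]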