feature.py 9.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Transformation-based learning
#
# Copyright (C) 2001-2020 NLTK Project
# Author: Marcus Uneson <marcus.uneson@gmail.com>
# based on previous (nltk2) version by
# Christopher Maloof, Edward Loper, Steven Bird
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
  10. from abc import ABCMeta, abstractmethod
  11. class Feature(metaclass=ABCMeta):
  12. """
  13. An abstract base class for Features. A Feature is a combination of
  14. a specific property-computing method and a list of relative positions
  15. to apply that method to.
  16. The property-computing method, M{extract_property(tokens, index)},
  17. must be implemented by every subclass. It extracts or computes a specific
  18. property for the token at the current index. Typical extract_property()
  19. methods return features such as the token text or tag; but more involved
  20. methods may consider the entire sequence M{tokens} and
  21. for instance compute the length of the sentence the token belongs to.
  22. In addition, the subclass may have a PROPERTY_NAME, which is how
  23. it will be printed (in Rules and Templates, etc). If not given, defaults
  24. to the classname.
  25. """
  26. json_tag = "nltk.tbl.Feature"
  27. PROPERTY_NAME = None
  28. def __init__(self, positions, end=None):
  29. """
  30. Construct a Feature which may apply at C{positions}.
  31. #For instance, importing some concrete subclasses (Feature is abstract)
  32. >>> from nltk.tag.brill import Word, Pos
  33. #Feature Word, applying at one of [-2, -1]
  34. >>> Word([-2,-1])
  35. Word([-2, -1])
  36. #Positions need not be contiguous
  37. >>> Word([-2,-1, 1])
  38. Word([-2, -1, 1])
  39. #Contiguous ranges can alternatively be specified giving the
  40. #two endpoints (inclusive)
  41. >>> Pos(-3, -1)
  42. Pos([-3, -2, -1])
  43. #In two-arg form, start <= end is enforced
  44. >>> Pos(2, 1)
  45. Traceback (most recent call last):
  46. File "<stdin>", line 1, in <module>
  47. File "nltk/tbl/template.py", line 306, in __init__
  48. raise TypeError
  49. ValueError: illegal interval specification: (start=2, end=1)
  50. :type positions: list of int
  51. :param positions: the positions at which this features should apply
  52. :raises ValueError: illegal position specifications
  53. An alternative calling convention, for contiguous positions only,
  54. is Feature(start, end):
  55. :type start: int
  56. :param start: start of range where this feature should apply
  57. :type end: int
  58. :param end: end of range (NOTE: inclusive!) where this feature should apply
  59. """
  60. self.positions = None # to avoid warnings
  61. if end is None:
  62. self.positions = tuple(sorted(set(int(i) for i in positions)))
  63. else: # positions was actually not a list, but only the start index
  64. try:
  65. if positions > end:
  66. raise TypeError
  67. self.positions = tuple(range(positions, end + 1))
  68. except TypeError:
  69. # let any kind of erroneous spec raise ValueError
  70. raise ValueError(
  71. "illegal interval specification: (start={0}, end={1})".format(
  72. positions, end
  73. )
  74. )
  75. # set property name given in subclass, or otherwise name of subclass
  76. self.PROPERTY_NAME = self.__class__.PROPERTY_NAME or self.__class__.__name__
  77. def encode_json_obj(self):
  78. return self.positions
  79. @classmethod
  80. def decode_json_obj(cls, obj):
  81. positions = obj
  82. return cls(positions)
  83. def __repr__(self):
  84. return "%s(%r)" % (self.__class__.__name__, list(self.positions))
  85. @classmethod
  86. def expand(cls, starts, winlens, excludezero=False):
  87. """
  88. Return a list of features, one for each start point in starts
  89. and for each window length in winlen. If excludezero is True,
  90. no Features containing 0 in its positions will be generated
  91. (many tbl trainers have a special representation for the
  92. target feature at [0])
  93. For instance, importing a concrete subclass (Feature is abstract)
  94. >>> from nltk.tag.brill import Word
  95. First argument gives the possible start positions, second the
  96. possible window lengths
  97. >>> Word.expand([-3,-2,-1], [1])
  98. [Word([-3]), Word([-2]), Word([-1])]
  99. >>> Word.expand([-2,-1], [1])
  100. [Word([-2]), Word([-1])]
  101. >>> Word.expand([-3,-2,-1], [1,2])
  102. [Word([-3]), Word([-2]), Word([-1]), Word([-3, -2]), Word([-2, -1])]
  103. >>> Word.expand([-2,-1], [1])
  104. [Word([-2]), Word([-1])]
  105. a third optional argument excludes all Features whose positions contain zero
  106. >>> Word.expand([-2,-1,0], [1,2], excludezero=False)
  107. [Word([-2]), Word([-1]), Word([0]), Word([-2, -1]), Word([-1, 0])]
  108. >>> Word.expand([-2,-1,0], [1,2], excludezero=True)
  109. [Word([-2]), Word([-1]), Word([-2, -1])]
  110. All window lengths must be positive
  111. >>> Word.expand([-2,-1], [0])
  112. Traceback (most recent call last):
  113. File "<stdin>", line 1, in <module>
  114. File "nltk/tag/tbl/template.py", line 371, in expand
  115. :param starts: where to start looking for Feature
  116. ValueError: non-positive window length in [0]
  117. :param starts: where to start looking for Feature
  118. :type starts: list of ints
  119. :param winlens: window lengths where to look for Feature
  120. :type starts: list of ints
  121. :param excludezero: do not output any Feature with 0 in any of its positions.
  122. :type excludezero: bool
  123. :returns: list of Features
  124. :raises ValueError: for non-positive window lengths
  125. """
  126. if not all(x > 0 for x in winlens):
  127. raise ValueError("non-positive window length in {0}".format(winlens))
  128. xs = (starts[i : i + w] for w in winlens for i in range(len(starts) - w + 1))
  129. return [cls(x) for x in xs if not (excludezero and 0 in x)]
  130. def issuperset(self, other):
  131. """
  132. Return True if this Feature always returns True when other does
  133. More precisely, return True if this feature refers to the same property as other;
  134. and this Feature looks at all positions that other does (and possibly
  135. other positions in addition).
  136. #For instance, importing a concrete subclass (Feature is abstract)
  137. >>> from nltk.tag.brill import Word, Pos
  138. >>> Word([-3,-2,-1]).issuperset(Word([-3,-2]))
  139. True
  140. >>> Word([-3,-2,-1]).issuperset(Word([-3,-2, 0]))
  141. False
  142. #Feature subclasses must agree
  143. >>> Word([-3,-2,-1]).issuperset(Pos([-3,-2]))
  144. False
  145. :param other: feature with which to compare
  146. :type other: (subclass of) Feature
  147. :return: True if this feature is superset, otherwise False
  148. :rtype: bool
  149. """
  150. return self.__class__ is other.__class__ and set(self.positions) >= set(
  151. other.positions
  152. )
  153. def intersects(self, other):
  154. """
  155. Return True if the positions of this Feature intersects with those of other
  156. More precisely, return True if this feature refers to the same property as other;
  157. and there is some overlap in the positions they look at.
  158. #For instance, importing a concrete subclass (Feature is abstract)
  159. >>> from nltk.tag.brill import Word, Pos
  160. >>> Word([-3,-2,-1]).intersects(Word([-3,-2]))
  161. True
  162. >>> Word([-3,-2,-1]).intersects(Word([-3,-2, 0]))
  163. True
  164. >>> Word([-3,-2,-1]).intersects(Word([0]))
  165. False
  166. #Feature subclasses must agree
  167. >>> Word([-3,-2,-1]).intersects(Pos([-3,-2]))
  168. False
  169. :param other: feature with which to compare
  170. :type other: (subclass of) Feature
  171. :return: True if feature classes agree and there is some overlap in the positions they look at
  172. :rtype: bool
  173. """
  174. return bool(
  175. (
  176. self.__class__ is other.__class__
  177. and set(self.positions) & set(other.positions)
  178. )
  179. )
  180. # Rich comparisons for Features. With @functools.total_ordering (Python 2.7+),
  181. # it will be enough to define __lt__ and __eq__
  182. def __eq__(self, other):
  183. return self.__class__ is other.__class__ and self.positions == other.positions
  184. def __lt__(self, other):
  185. return (
  186. self.__class__.__name__ < other.__class__.__name__
  187. or
  188. # self.positions is a sorted tuple of ints
  189. self.positions < other.positions
  190. )
  191. def __ne__(self, other):
  192. return not (self == other)
  193. def __gt__(self, other):
  194. return other < self
  195. def __ge__(self, other):
  196. return not self < other
  197. def __le__(self, other):
  198. return self < other or self == other
  199. @staticmethod
  200. @abstractmethod
  201. def extract_property(tokens, index):
  202. """
  203. Any subclass of Feature must define static method extract_property(tokens, index)
  204. :param tokens: the sequence of tokens
  205. :type tokens: list of tokens
  206. :param index: the current index
  207. :type index: int
  208. :return: feature value
  209. :rtype: any (but usually scalar)
  210. """