token_set.py

from __future__ import unicode_literals

from builtins import str

import six


@six.python_2_unicode_compatible
class TokenSet:
    """
    A token set is used to store the unique list of all tokens
    within an index. Token sets are also used to represent an
    incoming query to the index; the query token set and the index
    token set are then intersected to find which tokens to look
    up in the inverted index.

    A token set can hold multiple tokens, as in the case of the
    index token set, or it can hold a single token, as in the
    case of a simple query token set.

    Additionally, token sets are used to perform wildcard matching.
    Leading, contained and trailing wildcards are supported, and
    from this edit distance matching can also be provided.

    Token sets are implemented as a minimal finite state automaton,
    where both common prefixes and suffixes are shared between tokens.
    This helps to reduce the space used for storing the token set.

    TODO: consider https://github.com/glyph/automat
    """
    _next_id = 1

    def __init__(self):
        self.final = False
        self.edges = {}
        self.id = self._next_id
        self.__class__._next_id += 1
    def __str__(self):
        try:
            return self._string
        except AttributeError:
            pass

        string = "1" if self.final else "0"
        for label in sorted(list(self.edges.keys())):
            node = self.edges[label]
            try:
                node_id = str(node.id)
            except AttributeError:
                # TODO: JS seems to rely on undefined for the id attribute?
                node_id = ""
            string = string + label + node_id

        return string

    def __repr__(self):
        return '<TokenSet "{}">'.format(str(self))
    @classmethod
    def from_string(cls, string):
        """Creates a TokenSet from a string.

        The string may contain one or more wildcard characters (*) that will
        allow wildcard matching when intersecting with another TokenSet.
        """
        node = TokenSet()
        root = node

        # Iterates through all characters in the passed string, appending
        # a node for each character.
        # When a wildcard character is found then a self referencing edge
        # is introduced to continually match any number of characters.
        for i, char in enumerate(string):
            final = i == len(string) - 1

            if char == "*":
                node.edges[char] = node
                node.final = final
            else:
                next_ = TokenSet()
                next_.final = final

                node.edges[char] = next_
                node = next_

        return root
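
    # Example (added for illustration; not in the original source):
    # ``TokenSet.from_string("fo*")`` above builds root -f-> . -o-> (final
    # node with a "*" self loop), so when intersected with an index token
    # set it matches "fo" itself as well as any token starting with "fo".
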
    @classmethod
    def from_fuzzy_string(cls, string, edit_distance):
        """Creates a token set representing a single string with a specified
        edit distance.

        Insertions, deletions, substitutions and transpositions are each
        treated as an edit distance of 1.

        Increasing the allowed edit distance will have a dramatic impact
        on the performance of both creating and intersecting these TokenSets.
        It is advised to keep the edit distance less than 3.
        """
        root = TokenSet()
        stack = [{"node": root, "edits_remaining": edit_distance, "string": string}]

        while stack:
            frame = stack.pop()

            # no edit
            if len(frame["string"]) > 0:
                char = frame["string"][0]
                no_edit_node = None

                if char in frame["node"].edges:
                    no_edit_node = frame["node"].edges[char]
                else:
                    no_edit_node = TokenSet()
                    frame["node"].edges[char] = no_edit_node

                if len(frame["string"]) == 1:
                    no_edit_node.final = True

                stack.append(
                    {
                        "node": no_edit_node,
                        "edits_remaining": frame["edits_remaining"],
                        "string": frame["string"][1:],
                    }
                )

            if frame["edits_remaining"] == 0:
                continue

            # insertion, can only do insertion if there are edits remaining
            if "*" in frame["node"].edges:
                insertion_node = frame["node"].edges["*"]
            else:
                insertion_node = TokenSet()
                frame["node"].edges["*"] = insertion_node

            if len(frame["string"]) == 0:
                insertion_node.final = True

            stack.append(
                {
                    "node": insertion_node,
                    "edits_remaining": frame["edits_remaining"] - 1,
                    "string": frame["string"],
                }
            )

            # deletion, can only do a deletion if we have enough edits
            # remaining and if there are characters left to delete in the string
            if len(frame["string"]) > 1:
                stack.append(
                    {
                        "node": frame["node"],
                        "edits_remaining": frame["edits_remaining"] - 1,
                        "string": frame["string"][1:],
                    }
                )

            # deletion, just removing the last character of the string
            if len(frame["string"]) == 1:
                frame["node"].final = True

            # substitution, can only do a substitution if we have enough edits
            # remaining and there are characters left to substitute
            if len(frame["string"]) >= 1:
                if "*" in frame["node"].edges:
                    substitution_node = frame["node"].edges["*"]
                else:
                    substitution_node = TokenSet()
                    frame["node"].edges["*"] = substitution_node

                if len(frame["string"]) == 1:
                    substitution_node.final = True

                stack.append(
                    {
                        "node": substitution_node,
                        "edits_remaining": frame["edits_remaining"] - 1,
                        "string": frame["string"][1:],
                    }
                )

            # transposition, can only do a transposition if there are edits
            # remaining and there are enough characters to transpose
            if frame["edits_remaining"] and len(frame["string"]) > 1:
                char_a = frame["string"][0]
                char_b = frame["string"][1]
                transpose_node = None

                if char_b in frame["node"].edges:
                    transpose_node = frame["node"].edges[char_b]
                else:
                    transpose_node = TokenSet()
                    frame["node"].edges[char_b] = transpose_node

                if len(frame["string"]) == 1:
                    transpose_node.final = True

                stack.append(
                    {
                        "node": transpose_node,
                        "edits_remaining": frame["edits_remaining"] - 1,
                        "string": char_a + frame["string"][2:],
                    }
                )

        return root
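
    # Example (added for illustration; not in the original source):
    # ``TokenSet.from_fuzzy_string("cat", 1)`` above accepts "cat" plus any
    # string one edit away, e.g. "at" or "ct" (deletion), "cats" or "cart"
    # (insertion, via a "*" edge), "cut" (substitution, also via "*") and
    # "act" or "cta" (transposition).
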
    @classmethod
    def from_list(cls, list_of_words):
        from lunr.token_set_builder import TokenSetBuilder

        builder = TokenSetBuilder()
        for word in list_of_words:
            builder.insert(word)

        builder.finish()
        return builder.root

    @classmethod
    def from_clause(cls, clause):
        if clause.edit_distance:
            return cls.from_fuzzy_string(clause.term, clause.edit_distance)
        else:
            return cls.from_string(clause.term)
    def to_list(self):
        words = []
        stack = [{"prefix": "", "node": self}]

        while stack:
            frame = stack.pop()

            if frame["node"].final:
                words.append(frame["prefix"])

            for edge in frame["node"].edges.keys():
                stack.append(
                    {
                        "prefix": frame["prefix"] + str(edge),
                        "node": frame["node"].edges[edge],
                    }
                )

        return words
    def intersect(self, other):
        """Returns a new TokenSet that is the intersection of this TokenSet
        and the passed TokenSet.

        This intersection will take into account any wildcards contained
        within the TokenSet.
        """
        output = TokenSet()
        stack = [{"node": self, "q_node": other, "output": output}]

        while stack:
            frame = stack.pop()

            for q_edge in frame["q_node"].edges.keys():
                for n_edge in frame["node"].edges.keys():
                    if n_edge == q_edge or q_edge == "*":
                        node = frame["node"].edges[n_edge]
                        q_node = frame["q_node"].edges[q_edge]
                        final = node.final and q_node.final
                        next_ = None

                        if n_edge in frame["output"].edges:
                            next_ = frame["output"].edges[n_edge]
                            next_.final = next_.final or final
                        else:
                            next_ = TokenSet()
                            next_.final = final
                            frame["output"].edges[n_edge] = next_

                        stack.append({"node": node, "q_node": q_node, "output": next_})

        return output
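

# A minimal, self-contained usage sketch (added for illustration; this demo
# is not part of the original module). The example tokens and the expected
# results in the comments are assumptions based on the methods defined above,
# with the wildcard/fuzzy set passed as the query side of ``intersect``.
if __name__ == "__main__":
    # Exact index entry intersected with a trailing-wildcard query.
    index_set = TokenSet.from_string("border")
    query_set = TokenSet.from_string("b*")
    print(index_set.intersect(query_set).to_list())  # expected: ['border']

    # Fuzzy query: "cart" is one insertion away from "cat".
    fuzzy_query = TokenSet.from_fuzzy_string("cat", 1)
    print(TokenSet.from_string("cart").intersect(fuzzy_query).to_list())
    # expected: ['cart']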