# token_set_builder.py
  1. from __future__ import unicode_literals
  2. from builtins import str
  3. from lunr.token_set import TokenSet
  4. from lunr.exceptions import BaseLunrException
  5. class TokenSetBuilder:
  6. def __init__(self):
  7. self.previous_word = ""
  8. self.root = TokenSet()
  9. self.unchecked_nodes = []
  10. self.minimized_nodes = {}
  11. def insert(self, word):
  12. if word < self.previous_word:
  13. raise BaseLunrException("Out of order word insertion")
  14. common_prefix = 0
  15. for i in range(min(len(word), len(self.previous_word))):
  16. if word[i] != self.previous_word[i]:
  17. break
  18. common_prefix += 1
  19. self.minimize(common_prefix)
  20. node = (
  21. self.root if not self.unchecked_nodes else self.unchecked_nodes[-1]["child"]
  22. )
  23. for i in range(common_prefix, len(word)):
  24. next_node = TokenSet()
  25. char = word[i]
  26. node.edges[char] = next_node
  27. self.unchecked_nodes.append(
  28. {"parent": node, "char": char, "child": next_node}
  29. )
  30. node = next_node
  31. node.final = True
  32. self.previous_word = word
  33. def finish(self):
  34. self.minimize(0)
  35. def minimize(self, down_to):
  36. for i in range(len(self.unchecked_nodes) - 1, down_to - 1, -1):
  37. node = self.unchecked_nodes[i]
  38. child_key = str(node["child"])
  39. if child_key in self.minimized_nodes:
  40. node["parent"].edges[node["char"]] = self.minimized_nodes[child_key]
  41. else:
  42. node["child"]._str = child_key
  43. self.minimized_nodes[child_key] = node["child"]
  44. self.unchecked_nodes.pop()