| 1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162 |
- from __future__ import unicode_literals
- from builtins import str
- from lunr.token_set import TokenSet
- from lunr.exceptions import BaseLunrException
- class TokenSetBuilder:
- def __init__(self):
- self.previous_word = ""
- self.root = TokenSet()
- self.unchecked_nodes = []
- self.minimized_nodes = {}
- def insert(self, word):
- if word < self.previous_word:
- raise BaseLunrException("Out of order word insertion")
- common_prefix = 0
- for i in range(min(len(word), len(self.previous_word))):
- if word[i] != self.previous_word[i]:
- break
- common_prefix += 1
- self.minimize(common_prefix)
- node = (
- self.root if not self.unchecked_nodes else self.unchecked_nodes[-1]["child"]
- )
- for i in range(common_prefix, len(word)):
- next_node = TokenSet()
- char = word[i]
- node.edges[char] = next_node
- self.unchecked_nodes.append(
- {"parent": node, "char": char, "child": next_node}
- )
- node = next_node
- node.final = True
- self.previous_word = word
- def finish(self):
- self.minimize(0)
- def minimize(self, down_to):
- for i in range(len(self.unchecked_nodes) - 1, down_to - 1, -1):
- node = self.unchecked_nodes[i]
- child_key = str(node["child"])
- if child_key in self.minimized_nodes:
- node["parent"].edges[node["char"]] = self.minimized_nodes[child_key]
- else:
- node["child"]._str = child_key
- self.minimized_nodes[child_key] = node["child"]
- self.unchecked_nodes.pop()
|