# builder.py -- index builder for lunr
from __future__ import unicode_literals, division

from collections import defaultdict

from builtins import str, dict  # noqa

from lunr.field_ref import FieldRef
from lunr.idf import idf as Idf
from lunr.index import Index
from lunr.pipeline import Pipeline
from lunr.token_set import TokenSet
from lunr.tokenizer import Tokenizer
from lunr.vector import Vector
  11. class Field:
  12. """Represents a field with boost and extractor functions."""
  13. def __init__(self, field_name, boost=1, extractor=None):
  14. self.name = field_name
  15. self.boost = boost
  16. self.extractor = extractor
  17. def __repr__(self):
  18. return '<Field "{0.name}" boost="{0.boost}">'.format(self)
  19. def __hash__(self):
  20. return hash(self.name)
  21. class Builder:
  22. """Performs indexing on a set of documents and returns instances of
  23. lunr.Index ready for querying.
  24. All configuration of the index is done via the builder, the fields to
  25. index, the document reference, the text processing pipeline and document
  26. scoring parameters are all set on the builder before indexing.
  27. """
  28. def __init__(self):
  29. self._ref = "id"
  30. self._fields = {}
  31. self.inverted_index = {}
  32. self.field_term_frequencies = {}
  33. self.field_lengths = {}
  34. self.pipeline = Pipeline()
  35. self.search_pipeline = Pipeline()
  36. self._documents = {}
  37. self.document_count = 0
  38. self._b = 0.75
  39. self._k1 = 1.2
  40. self.term_index = 0
  41. self.metadata_whitelist = []
  42. def ref(self, ref):
  43. """Sets the document field used as the document reference.
  44. Every document must have this field. The type of this field in the
  45. document should be a string, if it is not a string it will be coerced
  46. into a string by calling `str`.
  47. The default ref is 'id'. The ref should _not_ be changed during
  48. indexing, it should be set before any documents are added to the index.
  49. Changing it during indexing can lead to inconsistent results.
  50. """
  51. self._ref = ref
  52. def field(self, field_name, boost=1, extractor=None):
  53. """Adds a field to the list of document fields that will be indexed.
  54. Every document being indexed should have this field. None values for
  55. this field in indexed documents will not cause errors but will limit
  56. the chance of that document being retrieved by searches.
  57. All fields should be added before adding documents to the index. Adding
  58. fields after a document has been indexed will have no effect on already
  59. indexed documents.
  60. Fields can be boosted at build time. This allows terms within that
  61. field to have more importance on search results. Use a field boost to
  62. specify that matches within one field are more important that other
  63. fields.
  64. Args:
  65. field_name (str): Name of the field to be added, must not include
  66. a forward slash '/'.
  67. boost (int): Optional boost factor to apply to field.
  68. extractor (callable): Optional function to extract a field from
  69. the document.
  70. Raises:
  71. ValueError: If the field name contains a `/`.
  72. """
  73. if "/" in field_name:
  74. raise ValueError("Field {} contains illegal character `/`")
  75. self._fields[field_name] = Field(field_name, boost, extractor)
  76. def b(self, number):
  77. """A parameter to tune the amount of field length normalisation that is
  78. applied when calculating relevance scores.
  79. A value of 0 will completely disable any normalisation and a value of 1
  80. will fully normalise field lengths. The default is 0.75. Values of b
  81. will be clamped to the range 0 - 1.
  82. """
  83. if number < 0:
  84. self._b = 0
  85. elif number > 1:
  86. self._b = 1
  87. else:
  88. self._b = number
  89. def k1(self, number):
  90. """ A parameter that controls the speed at which a rise in term
  91. frequency results in term frequency saturation.
  92. The default value is 1.2. Setting this to a higher value will give
  93. slower saturation levels, a lower value will result in quicker
  94. saturation.
  95. """
  96. self._k1 = number
  97. def add(self, doc, attributes=None):
  98. """Adds a document to the index.
  99. Before adding documents to the index it should have been fully
  100. setup, with the document ref and all fields to index already having
  101. been specified.
  102. The document must have a field name as specified by the ref (by default
  103. this is 'id') and it should have all fields defined for indexing,
  104. though None values will not cause errors.
  105. Args:
  106. - doc (dict): The document to be added to the index.
  107. - attributes (dict, optional): A set of attributes corresponding
  108. to the document, currently a single `boost` -> int will be
  109. taken into account.
  110. """
  111. doc_ref = str(doc[self._ref])
  112. self._documents[doc_ref] = attributes or {}
  113. self.document_count += 1
  114. for field_name, field in self._fields.items():
  115. extractor = field.extractor
  116. field_value = doc[field_name] if extractor is None else extractor(doc)
  117. tokens = Tokenizer(field_value)
  118. terms = self.pipeline.run(tokens)
  119. field_ref = FieldRef(doc_ref, field_name)
  120. field_terms = defaultdict(int)
  121. # TODO: field_refs are casted to strings in JS, should we allow
  122. # FieldRef as keys?
  123. self.field_term_frequencies[str(field_ref)] = field_terms
  124. self.field_lengths[str(field_ref)] = len(terms)
  125. for term in terms:
  126. # TODO: term is a Token, should we allow Tokens as keys?
  127. term_key = str(term)
  128. field_terms[term_key] += 1
  129. if term_key not in self.inverted_index:
  130. posting = {_field_name: {} for _field_name in self._fields}
  131. posting["_index"] = self.term_index
  132. self.term_index += 1
  133. self.inverted_index[term_key] = posting
  134. if doc_ref not in self.inverted_index[term_key][field_name]:
  135. self.inverted_index[term_key][field_name][doc_ref] = defaultdict(
  136. list
  137. )
  138. for metadata_key in self.metadata_whitelist:
  139. metadata = term.metadata[metadata_key]
  140. self.inverted_index[term_key][field_name][doc_ref][
  141. metadata_key
  142. ].append(metadata)
  143. def build(self):
  144. """Builds the index, creating an instance of `lunr.Index`.
  145. This completes the indexing process and should only be called once all
  146. documents have been added to the index.
  147. """
  148. self._calculate_average_field_lengths()
  149. self._create_field_vectors()
  150. self._create_token_set()
  151. return Index(
  152. inverted_index=self.inverted_index,
  153. field_vectors=self.field_vectors,
  154. token_set=self.token_set,
  155. fields=list(self._fields.keys()),
  156. pipeline=self.search_pipeline,
  157. )
  158. def _create_token_set(self):
  159. """Creates a token set of all tokens in the index using `lunr.TokenSet`
  160. """
  161. self.token_set = TokenSet.from_list(sorted(list(self.inverted_index.keys())))
  162. def _calculate_average_field_lengths(self):
  163. """Calculates the average document length for this index"""
  164. accumulator = defaultdict(int)
  165. documents_with_field = defaultdict(int)
  166. for field_ref, length in self.field_lengths.items():
  167. _field_ref = FieldRef.from_string(field_ref)
  168. field = _field_ref.field_name
  169. documents_with_field[field] += 1
  170. accumulator[field] += length
  171. for field_name in self._fields:
  172. accumulator[field_name] /= documents_with_field[field_name]
  173. self.average_field_length = accumulator
  174. def _create_field_vectors(self):
  175. """Builds a vector space model of every document using lunr.Vector."""
  176. field_vectors = {}
  177. term_idf_cache = {}
  178. for field_ref, term_frequencies in self.field_term_frequencies.items():
  179. _field_ref = FieldRef.from_string(field_ref)
  180. field_name = _field_ref.field_name
  181. field_length = self.field_lengths[field_ref]
  182. field_vector = Vector()
  183. field_boost = self._fields[field_name].boost
  184. doc_boost = self._documents[_field_ref.doc_ref].get("boost", 1)
  185. for term, tf in term_frequencies.items():
  186. term_index = self.inverted_index[term]["_index"]
  187. if term not in term_idf_cache:
  188. idf = Idf(self.inverted_index[term], self.document_count)
  189. term_idf_cache[term] = idf
  190. else:
  191. idf = term_idf_cache[term]
  192. score = (
  193. idf
  194. * ((self._k1 + 1) * tf)
  195. / (
  196. self._k1
  197. * (
  198. 1
  199. - self._b
  200. + self._b
  201. * (field_length / self.average_field_length[field_name])
  202. )
  203. + tf
  204. )
  205. )
  206. score *= field_boost
  207. score *= doc_boost
  208. score_with_precision = round(score, 3)
  209. field_vector.insert(term_index, score_with_precision)
  210. field_vectors[field_ref] = field_vector
  211. self.field_vectors = field_vectors
  212. def use(self, fn, *args, **kwargs):
  213. """Applies a plugin to the index builder.
  214. A plugin is a function that is called with the index builder as its
  215. context. Plugins can be used to customise or extend the behaviour of
  216. the index in some way.
  217. A plugin is just a function, that encapsulated the custom behaviour
  218. that should be applied when building the index. The plugin function
  219. will be called with the index builder as its argument, additional
  220. arguments can also be passed when calling use.
  221. """
  222. fn(self, *args, **kwargs)