__main__.py 2.4 KB

from __future__ import unicode_literals

from past.builtins import basestring

from lunr import languages as lang
from lunr.builder import Builder
from lunr.stemmer import stemmer
from lunr.trimmer import trimmer
from lunr.stop_word_filter import stop_word_filter


def lunr(ref, fields, documents, languages=None):
  9. """A convenience function to configure and construct a lunr.Index.
  10. Args:
  11. ref (str): The key in the documents to be used a the reference.
  12. fields (list): A list of strings defining fields in the documents to
  13. index. Optionally a list of dictionaries with three keys:
  14. `field_name` defining the document's field, `boost` an integer
  15. defining a boost to be applied to the field, and `extractor`
  16. a callable taking the document as a single argument and returning
  17. a string located in the document in a particular way.
  18. documents (list): The list of dictonaries representing the documents
  19. to index. Optionally a 2-tuple of dicts, the first one being
  20. the document and the second the associated attributes to it.
  21. languages (str or list, optional): The languages to use if using
  22. NLTK language support, ignored if NLTK is not available.
  23. Returns:
  24. Index: The populated Index ready to search against.
  25. """
    if languages is not None and lang.LANGUAGE_SUPPORT:
        # Validate the requested languages and build an NLTK-backed pipeline.
        if isinstance(languages, basestring):
            languages = [languages]
        unsupported_languages = set(languages) - set(lang.SUPPORTED_LANGUAGES)
        if unsupported_languages:
            raise RuntimeError(
                "The specified languages {} are not supported, "
                "please choose one of {}".format(
                    ", ".join(unsupported_languages),
                    ", ".join(lang.SUPPORTED_LANGUAGES.keys()),
                )
            )
        builder = lang.get_nltk_builder(languages)
    else:
        # Default pipeline: trim tokens, drop stop words, then stem.
        builder = Builder()
        builder.pipeline.add(trimmer, stop_word_filter, stemmer)
        builder.search_pipeline.add(stemmer)

    builder.ref(ref)
    for field in fields:
        # Fields may be plain names or dicts of Builder.field() arguments.
        if isinstance(field, dict):
            builder.field(**field)
        else:
            builder.field(field)
    for document in documents:
        # Documents may be plain dicts or (document, attributes) 2-tuples.
        if isinstance(document, (tuple, list)):
            builder.add(document[0], attributes=document[1])
        else:
            builder.add(document)
    return builder.build()
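
As a usage sketch (not part of the module above), the snippet below shows how `lunr()` ties these pieces together: a plain field name alongside a dict field carrying a `boost`, documents passed as dicts keyed by the `ref`, and the returned `Index` queried with `search()`. The document contents and field names are illustrative assumptions.

from lunr import lunr

documents = [
    {"id": "a", "title": "Mr. Green kills Colonel Mustard", "body": "..."},
    {"id": "b", "title": "Plumb waters plant", "body": "..."},
]

# "id" is the reference key; "title" is boosted relative to "body".
idx = lunr(
    ref="id",
    fields=[dict(field_name="title", boost=10), "body"],
    documents=documents,
)

# Query the built index; matches are returned ordered by score.
results = idx.search("plant")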