__main__.py 2.4 KB

from __future__ import unicode_literals

from past.builtins import basestring

from lunr import languages as lang
from lunr.builder import Builder
from lunr.stemmer import stemmer
from lunr.trimmer import trimmer
from lunr.stop_word_filter import stop_word_filter


def lunr(ref, fields, documents, languages=None):
  9. """A convenience function to configure and construct a lunr.Index.
  10. Args:
  11. ref (str): The key in the documents to be used a the reference.
  12. fields (list): A list of strings defining fields in the documents to
  13. index. Optionally a list of dictionaries with three keys:
  14. `field_name` defining the document's field, `boost` an integer
  15. defining a boost to be applied to the field, and `extractor`
  16. a callable taking the document as a single argument and returning
  17. a string located in the document in a particular way.
  18. documents (list): The list of dictonaries representing the documents
  19. to index. Optionally a 2-tuple of dicts, the first one being
  20. the document and the second the associated attributes to it.
  21. languages (str or list, optional): The languages to use if using
  22. NLTK language support, ignored if NLTK is not available.
  23. Returns:
  24. Index: The populated Index ready to search against.
  25. """
    if languages is not None and lang.LANGUAGE_SUPPORT:
        # Validate the requested languages and build an NLTK-backed pipeline.
        if isinstance(languages, basestring):
            languages = [languages]
        unsupported_languages = set(languages) - set(lang.SUPPORTED_LANGUAGES)
        if unsupported_languages:
            raise RuntimeError(
                "The specified languages {} are not supported, "
                "please choose one of {}".format(
                    ", ".join(unsupported_languages),
                    ", ".join(lang.SUPPORTED_LANGUAGES.keys()),
                )
            )
        builder = lang.get_nltk_builder(languages)
    else:
        # Default pipeline: trim tokens, drop stop words, then stem.
        builder = Builder()
        builder.pipeline.add(trimmer, stop_word_filter, stemmer)
        builder.search_pipeline.add(stemmer)

    builder.ref(ref)
    for field in fields:
        # Fields may be plain names or dicts of Builder.field() arguments.
        if isinstance(field, dict):
            builder.field(**field)
        else:
            builder.field(field)
    for document in documents:
        # Documents may be plain dicts or (document, attributes) 2-tuples.
        if isinstance(document, (tuple, list)):
            builder.add(document[0], attributes=document[1])
        else:
            builder.add(document)
    return builder.build()
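
As a usage sketch (not part of the module above), the snippet below shows how `lunr()` ties these pieces together: a plain field name alongside a dict field carrying a `boost`, documents passed as dicts keyed by the `ref`, and the returned `Index` queried with `search()`. The document contents and field names are illustrative assumptions.

from lunr import lunr

documents = [
    {"id": "a", "title": "Mr. Green kills Colonel Mustard", "body": "..."},
    {"id": "b", "title": "Plumb waters plant", "body": "..."},
]

# "id" is the reference key; "title" is boosted relative to "body".
idx = lunr(
    ref="id",
    fields=[dict(field_name="title", boost=10), "body"],
    documents=documents,
)

# Query the built index; matches are returned ordered by score.
results = idx.search("plant")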