| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172 |
- from __future__ import unicode_literals
- from functools import partial
- from lunr.pipeline import Pipeline
# ISO-639-1 language code -> language name passed to NLTK's SnowballStemmer.
# Each key is also used as the suffix of a registered pipeline function
# label ('stemmer-<code>').
SUPPORTED_LANGUAGES = {
    "ar": "arabic",
    "da": "danish",
    "nl": "dutch",
    "en": "english",
    "fi": "finnish",
    "fr": "french",
    "de": "german",
    "hu": "hungarian",
    "it": "italian",
    "no": "norwegian",
    "pt": "portuguese",
    "ro": "romanian",
    "ru": "russian",
    "es": "spanish",
    "sv": "swedish",
}
# nltk is an optional dependency: LANGUAGE_SUPPORT records whether the
# SnowballStemmer import succeeded, and SnowballStemmer is only bound when
# it did.
try:  # pragma: no cover
    from nltk.stem.snowball import SnowballStemmer
except ImportError:  # pragma: no cover
    LANGUAGE_SUPPORT = False
else:  # pragma: no cover
    LANGUAGE_SUPPORT = True
def get_language_stemmer(language):
    """Build the NLTK SnowballStemmer for a given language code.

    Args:
        language (str): ISO-639-1 code of the language; it is looked up in
            SUPPORTED_LANGUAGES to obtain the name handed to the stemmer.

    Returns:
        SnowballStemmer: A stemmer constructed for the mapped language name.

    Raises:
        KeyError: If `language` is not a key of SUPPORTED_LANGUAGES.
    """
    snowball_name = SUPPORTED_LANGUAGES[language]
    return SnowballStemmer(snowball_name)
def nltk_stemmer(stemmer, token, i=None, tokens=None):
    """Stem a token in place using the given stemmer.

    The token's `update` method is called with a callback that passes the
    value it receives to `stemmer.stem` and returns the result.

    Args:
        stemmer (SnowballStemmer): Stemmer instance that performs the stemming.
        token (lunr.Token): The token to stem.
        i (int): The index of the token in a set. Accepted but not used here.
        tokens (list): A list of tokens representing the set. Accepted but
            not used here.

    Returns:
        Whatever `token.update` returns.
    """
    def apply_stem(text, metadata=None):
        # The metadata argument is accepted but does not influence stemming.
        return stemmer.stem(text)

    updated = token.update(apply_stem)
    return updated
def register_languages():
    """Register a stemmer pipeline function for every supported language.

    For each code in SUPPORTED_LANGUAGES, `nltk_stemmer` is bound to that
    language's stemmer and registered on Pipeline under the label
    'stemmer-<code>'.
    """
    for code in SUPPORTED_LANGUAGES:
        # Bind the language-specific stemmer as nltk_stemmer's first argument.
        bound_stemmer = partial(nltk_stemmer, get_language_stemmer(code))
        label = 'stemmer-{}'.format(code)
        Pipeline.register_function(bound_stemmer, label)
# Import-time side effect: register every supported stemmer, but only when the
# optional nltk import succeeded (LANGUAGE_SUPPORT is True).
if LANGUAGE_SUPPORT:  # pragma: no cover
    # TODO: registering all possible stemmers feels unnecessary, but it solves
    # deserializing with arbitrary language functions. Ideally the schema would
    # provide the language(s) for the index and we could register the stemmers
    # as needed.
    register_languages()
|