tokenizer.py

from __future__ import unicode_literals

from builtins import str
from copy import deepcopy

from lunr.token import Token
from lunr.utils import as_string

# Whitespace, the non-breaking space (\xa0) and the hyphen, so that
# hyphenated words are split into separate tokens.
SEPARATOR_CHARS = " \t\n\r\f\v\xa0-"


def default_separator(char):
    return char and char in SEPARATOR_CHARS


def Tokenizer(obj, metadata=None, separator=None):
    """Splits a string into tokens ready to be inserted into the search index.

    The object to be tokenized is converted to a string by calling `str` and
    then split on characters for which `separator` is True. Lists have their
    elements converted to strings and wrapped in a lunr `Token`.

    Args:
        metadata (dict): Optional metadata; it is cloned and added as metadata
            to every token created from the object being tokenized.
        separator (callable or compiled regex): Predicate, or compiled pattern,
            selecting the characters on which the string is split. Defaults to
            whitespace, the non-breaking space and the hyphen.

    Returns:
        List of Token instances.
    """
    if obj is None:
        return []

    metadata = metadata or {}

    if isinstance(obj, (list, tuple)):
        return [
            Token(as_string(element).lower(), deepcopy(metadata)) for element in obj
        ]

    if separator is None:
        is_separator = default_separator
    elif callable(separator):
        is_separator = separator
    else:  # must be a regex, remove when dropping support for 2.7
        is_separator = lambda c: separator.match(c)  # noqa
    string = str(obj).lower()
    length = len(string)

    tokens = []
    slice_start = 0
    # Scan one position past the end of the string so the final token is
    # flushed even when the input does not end with a separator.
    for slice_end in range(length + 1):
        char = string[slice_end] if slice_end != length else ""
        slice_length = slice_end - slice_start
        if is_separator(char) or slice_end == length:
            if slice_length > 0:
                # Record where the token starts and how long it is, plus its
                # index in the output list, then merge in the shared metadata.
                token_metadata = {}
                token_metadata["position"] = [slice_start, slice_length]
                token_metadata["index"] = len(tokens)
                token_metadata.update(metadata)

                sl = slice(slice_start, slice_end)
                tokens.append(Token(string[sl], token_metadata))

            # The next token starts right after this separator.
            slice_start = slice_end + 1

    return tokens
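

# A minimal usage sketch, not part of the original module. It assumes lunr's
# `Token` keeps the metadata dict it was constructed with on a `metadata`
# attribute and renders its underlying string via `str()`. Positions are
# [start, length] offsets into the lowercased input string.
if __name__ == "__main__":
    import re

    # Default separator: whitespace, the non-breaking space and the hyphen.
    for token in Tokenizer("The Quick\tbrown-fox"):
        print(token, token.metadata)
    # the {'position': [0, 3], 'index': 0}
    # quick {'position': [4, 5], 'index': 1}
    # brown {'position': [10, 5], 'index': 2}
    # fox {'position': [16, 3], 'index': 3}

    # A compiled regex can also be passed; it is matched against one character
    # at a time by the lambda in the separator-selection branch above.
    for token in Tokenizer("alpha,beta;gamma", separator=re.compile(r"[,;]")):
        print(token, token.metadata)
    # alpha {'position': [0, 5], 'index': 0}
    # beta {'position': [6, 4], 'index': 1}
    # gamma {'position': [11, 5], 'index': 2}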