trimmer.py 660 B

1234567891011121314151617181920212223
  1. from __future__ import unicode_literals
  2. import re
  3. def generate_trimmer(word_characters):
  4. """Returns a trimmer function from a string of word characters.
  5. TODO: lunr-languages ships with lists of word characters for each language
  6. I haven't found an equivalent in Python, we may need to copy it.
  7. """
  8. full_re = re.compile(r"^[^{0}]*?([{0}]+)[^{0}]*?$".format(word_characters))
  9. def trimmer(token, i=None, tokens=None):
  10. def trim(s, metadata=None):
  11. match = full_re.match(s)
  12. if match is None:
  13. return s
  14. return match.group(1)
  15. return token.update(trim)
  16. return trimmer