from __future__ import unicode_literals

import re


def generate_trimmer(word_characters):
    """Return a trimmer function built from a string of word characters.

    The returned pipeline function strips every leading and every trailing
    character that is *not* a word character from a token's string. Characters
    in the interior of the token are never touched, so ``(hello-world)``
    becomes ``hello-world`` even when ``-`` is not a word character. A token
    made up solely of non-word characters is trimmed to the empty string.

    TODO: lunr-languages ships with lists of word characters for each
    language. I haven't found an equivalent in Python, we may need to copy it.

    Args:
        word_characters: Text interpolated verbatim inside a regular
            expression character class (``[...]``), e.g. ``r"\\w"`` or an
            explicit range such as ``"a-z"``.

    Returns:
        A function ``trimmer(token, i=None, tokens=None)`` that calls
        ``token.update`` with the trimming callback and returns its result.
    """
    # Compile both patterns once per trimmer rather than once per token.
    # Leading and trailing runs are stripped independently: a single anchored
    # pattern would fail to match (and so skip trimming) whenever the token
    # contains a non-word character in its interior.
    leading_re = re.compile(r"^[^{0}]+".format(word_characters))
    trailing_re = re.compile(r"[^{0}]+$".format(word_characters))

    def trimmer(token, i=None, tokens=None):
        # ``i`` and ``tokens`` are accepted but not used by this function.
        def trim(s, metadata=None):
            return trailing_re.sub("", leading_re.sub("", s))

        return token.update(trim)

    return trimmer