# Natural Language Toolkit: Tokenizer Interface
#
# Copyright (C) 2001-2020 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
#         Steven Bird <stevenbird1@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT

"""
Tokenizer Interface
"""

from abc import ABC, abstractmethod

from nltk.internals import overridden
from nltk.tokenize.util import string_span_tokenize
  14. class TokenizerI(ABC):
  15. """
  16. A processing interface for tokenizing a string.
  17. Subclasses must define ``tokenize()`` or ``tokenize_sents()`` (or both).
  18. """
  19. @abstractmethod
  20. def tokenize(self, s):
  21. """
  22. Return a tokenized copy of *s*.
  23. :rtype: list of str
  24. """
  25. if overridden(self.tokenize_sents):
  26. return self.tokenize_sents([s])[0]
  27. def span_tokenize(self, s):
  28. """
  29. Identify the tokens using integer offsets ``(start_i, end_i)``,
  30. where ``s[start_i:end_i]`` is the corresponding token.
  31. :rtype: iter(tuple(int, int))
  32. """
  33. raise NotImplementedError()
  34. def tokenize_sents(self, strings):
  35. """
  36. Apply ``self.tokenize()`` to each element of ``strings``. I.e.:
  37. return [self.tokenize(s) for s in strings]
  38. :rtype: list(list(str))
  39. """
  40. return [self.tokenize(s) for s in strings]
  41. def span_tokenize_sents(self, strings):
  42. """
  43. Apply ``self.span_tokenize()`` to each element of ``strings``. I.e.:
  44. return [self.span_tokenize(s) for s in strings]
  45. :rtype: iter(list(tuple(int, int)))
  46. """
  47. for s in strings:
  48. yield list(self.span_tokenize(s))
  49. class StringTokenizer(TokenizerI):
  50. """A tokenizer that divides a string into substrings by splitting
  51. on the specified string (defined in subclasses).
  52. """
  53. @property
  54. @abstractmethod
  55. def _string(self):
  56. raise NotImplementedError
  57. def tokenize(self, s):
  58. return s.split(self._string)
  59. def span_tokenize(self, s):
  60. for span in string_span_tokenize(s, self._string):
  61. yield span