# -*- coding: utf-8 -*-
# Natural Language Toolkit: Tokenizers
#
# Copyright (C) 2001-2020 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
#         Steven Bird <stevenbird1@gmail.com> (minor additions)
# Contributors: matthewmc, clouds56
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
  10. r"""
  11. NLTK Tokenizer Package
  12. Tokenizers divide strings into lists of substrings. For example,
  13. tokenizers can be used to find the words and punctuation in a string:
  14. >>> from nltk.tokenize import word_tokenize
  15. >>> s = '''Good muffins cost $3.88\nin New York. Please buy me
  16. ... two of them.\n\nThanks.'''
  17. >>> word_tokenize(s)
  18. ['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.',
  19. 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']
  20. This particular tokenizer requires the Punkt sentence tokenization
  21. models to be installed. NLTK also provides a simpler,
  22. regular-expression based tokenizer, which splits text on whitespace
  23. and punctuation:
  24. >>> from nltk.tokenize import wordpunct_tokenize
  25. >>> wordpunct_tokenize(s)
  26. ['Good', 'muffins', 'cost', '$', '3', '.', '88', 'in', 'New', 'York', '.',
  27. 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']
  28. We can also operate at the level of sentences, using the sentence
  29. tokenizer directly as follows:
  30. >>> from nltk.tokenize import sent_tokenize, word_tokenize
  31. >>> sent_tokenize(s)
  32. ['Good muffins cost $3.88\nin New York.', 'Please buy me\ntwo of them.', 'Thanks.']
  33. >>> [word_tokenize(t) for t in sent_tokenize(s)]
  34. [['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.'],
  35. ['Please', 'buy', 'me', 'two', 'of', 'them', '.'], ['Thanks', '.']]

Caution: when tokenizing a Unicode string, make sure you are not
using an encoded version of the string (it may be necessary to
decode it first, e.g. with ``s.decode("utf8")``).

NLTK tokenizers can produce token-spans, represented as tuples of integers
having the same semantics as string slices, to support efficient comparison
of tokenizers. (These methods are implemented as generators.)

    >>> from nltk.tokenize import WhitespaceTokenizer
    >>> list(WhitespaceTokenizer().span_tokenize(s))
    [(0, 4), (5, 12), (13, 17), (18, 23), (24, 26), (27, 30), (31, 36), (38, 44),
    (45, 48), (49, 51), (52, 55), (56, 58), (59, 64), (66, 73)]

There are numerous ways to tokenize text. If you need more control over
tokenization, see the other methods provided in this package.

For further information, please see Chapter 3 of the NLTK book.
"""

import re

from nltk.data import load
from nltk.tokenize.casual import TweetTokenizer, casual_tokenize
from nltk.tokenize.mwe import MWETokenizer
from nltk.tokenize.destructive import NLTKWordTokenizer
from nltk.tokenize.punkt import PunktSentenceTokenizer
from nltk.tokenize.regexp import (
    RegexpTokenizer,
    WhitespaceTokenizer,
    BlanklineTokenizer,
    WordPunctTokenizer,
    wordpunct_tokenize,
    regexp_tokenize,
    blankline_tokenize,
)
from nltk.tokenize.repp import ReppTokenizer
from nltk.tokenize.sexpr import SExprTokenizer, sexpr_tokenize
from nltk.tokenize.simple import (
    SpaceTokenizer,
    TabTokenizer,
    LineTokenizer,
    line_tokenize,
)
from nltk.tokenize.texttiling import TextTilingTokenizer
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.tokenize.treebank import TreebankWordTokenizer
from nltk.tokenize.util import string_span_tokenize, regexp_span_tokenize
from nltk.tokenize.stanford_segmenter import StanfordSegmenter
from nltk.tokenize.sonority_sequencing import SyllableTokenizer


# Standard sentence tokenizer.
def sent_tokenize(text, language="english"):
    """
    Return a sentence-tokenized copy of *text*,
    using NLTK's recommended sentence tokenizer
    (currently :class:`.PunktSentenceTokenizer`
    for the specified language).

    :param text: text to split into sentences
    :param language: the model name in the Punkt corpus
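
    Example (requires the Punkt sentence tokenization models, e.g. via
    ``nltk.download('punkt')``; the output shown is what the default
    English model is expected to produce):

        >>> from nltk.tokenize import sent_tokenize
        >>> sent_tokenize("Good muffins cost $3.88 in New York. Please buy me two of them.")
        ['Good muffins cost $3.88 in New York.', 'Please buy me two of them.']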
  88. """
    tokenizer = load("tokenizers/punkt/{0}.pickle".format(language))
    return tokenizer.tokenize(text)


# Standard word tokenizer.
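# NLTKWordTokenizer is the improved Treebank-style word tokenizer referred to in
# the word_tokenize() docstring below; one module-level instance is reused across calls.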
_treebank_word_tokenizer = NLTKWordTokenizer()


def word_tokenize(text, language="english", preserve_line=False):
    """
    Return a tokenized copy of *text*,
    using NLTK's recommended word tokenizer
    (currently an improved :class:`.TreebankWordTokenizer`
    along with :class:`.PunktSentenceTokenizer`
    for the specified language).

    :param text: text to split into words
    :type text: str
    :param language: the model name in the Punkt corpus
    :type language: str
    :param preserve_line: if True, skip sentence tokenization and treat *text* as a single sentence
    :type preserve_line: bool
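
    Example (requires the Punkt models for sentence splitting, e.g. via
    ``nltk.download('punkt')``; the output shown follows the example in the
    module docstring above):

        >>> from nltk.tokenize import word_tokenize
        >>> word_tokenize("Good muffins cost $3.88 in New York.")
        ['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.']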
  106. """
    sentences = [text] if preserve_line else sent_tokenize(text, language)
    return [
        token for sent in sentences for token in _treebank_word_tokenizer.tokenize(sent)
    ]