.. Copyright (C) 2001-2020 NLTK Project
.. For license information, see LICENSE.TXT

    >>> from nltk.tokenize import *

Regression Tests: Treebank Tokenizer
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Some test strings.
    >>> s1 = "On a $50,000 mortgage of 30 years at 8 percent, the monthly payment would be $366.88."
    >>> word_tokenize(s1)
    ['On', 'a', '$', '50,000', 'mortgage', 'of', '30', 'years', 'at', '8', 'percent', ',', 'the', 'monthly', 'payment', 'would', 'be', '$', '366.88', '.']

    >>> s2 = "\"We beat some pretty good teams to get here,\" Slocum said."
    >>> word_tokenize(s2)
    ['``', 'We', 'beat', 'some', 'pretty', 'good', 'teams', 'to', 'get', 'here', ',', "''", 'Slocum', 'said', '.']

    >>> s3 = "Well, we couldn't have this predictable, cliche-ridden, \"Touched by an Angel\" (a show creator John Masius worked on) wanna-be if she didn't."
    >>> word_tokenize(s3)
    ['Well', ',', 'we', 'could', "n't", 'have', 'this', 'predictable', ',', 'cliche-ridden', ',', '``', 'Touched', 'by', 'an', 'Angel', "''", '(', 'a', 'show', 'creator', 'John', 'Masius', 'worked', 'on', ')', 'wanna-be', 'if', 'she', 'did', "n't", '.']

    >>> s4 = "I cannot cannot work under these conditions!"
    >>> word_tokenize(s4)
    ['I', 'can', 'not', 'can', 'not', 'work', 'under', 'these', 'conditions', '!']

    >>> s5 = "The company spent $30,000,000 last year."
    >>> word_tokenize(s5)
    ['The', 'company', 'spent', '$', '30,000,000', 'last', 'year', '.']

    >>> s6 = "The company spent 40.75% of its income last year."
    >>> word_tokenize(s6)
    ['The', 'company', 'spent', '40.75', '%', 'of', 'its', 'income', 'last', 'year', '.']

    >>> s7 = "He arrived at 3:00 pm."
    >>> word_tokenize(s7)
    ['He', 'arrived', 'at', '3:00', 'pm', '.']

    >>> s8 = "I bought these items: books, pencils, and pens."
    >>> word_tokenize(s8)
    ['I', 'bought', 'these', 'items', ':', 'books', ',', 'pencils', ',', 'and', 'pens', '.']

    >>> s9 = "Though there were 150, 100 of them were old."
    >>> word_tokenize(s9)
    ['Though', 'there', 'were', '150', ',', '100', 'of', 'them', 'were', 'old', '.']

    >>> s10 = "There were 300,000, but that wasn't enough."
    >>> word_tokenize(s10)
    ['There', 'were', '300,000', ',', 'but', 'that', 'was', "n't", 'enough', '.']
Testing improvement made to the TreebankWordTokenizer

    >>> sx1 = '\xabNow that I can do.\xbb'
    >>> expected = ['\xab', 'Now', 'that', 'I', 'can', 'do', '.', '\xbb']
    >>> word_tokenize(sx1) == expected
    True

    >>> sx2 = 'The unicode 201C and 201D \u201cLEFT(RIGHT) DOUBLE QUOTATION MARK\u201d is also OPEN_PUNCT and CLOSE_PUNCT.'
    >>> expected = ['The', 'unicode', '201C', 'and', '201D', '\u201c', 'LEFT', '(', 'RIGHT', ')', 'DOUBLE', 'QUOTATION', 'MARK', '\u201d', 'is', 'also', 'OPEN_PUNCT', 'and', 'CLOSE_PUNCT', '.']
    >>> word_tokenize(sx2) == expected
    True
Sentence tokenization in word_tokenize:

    >>> s11 = "I called Dr. Jones. I called Dr. Jones."
    >>> word_tokenize(s11)
    ['I', 'called', 'Dr.', 'Jones', '.', 'I', 'called', 'Dr.', 'Jones', '.']

    >>> s12 = ("Ich muss unbedingt daran denken, Mehl, usw. fur einen "
    ...        "Kuchen einzukaufen. Ich muss.")
    >>> word_tokenize(s12)
    ['Ich', 'muss', 'unbedingt', 'daran', 'denken', ',', 'Mehl', ',', 'usw',
    '.', 'fur', 'einen', 'Kuchen', 'einzukaufen', '.', 'Ich', 'muss', '.']
    >>> word_tokenize(s12, 'german')
    ['Ich', 'muss', 'unbedingt', 'daran', 'denken', ',', 'Mehl', ',', 'usw.',
    'fur', 'einen', 'Kuchen', 'einzukaufen', '.', 'Ich', 'muss', '.']
Regression Tests: Regexp Tokenizer
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Some additional test strings.

    >>> s = ("Good muffins cost $3.88\nin New York. Please buy me\n"
    ...      "two of them.\n\nThanks.")
    >>> s2 = ("Alas, it has not rained today. When, do you think, "
    ...       "will it rain again?")
    >>> s3 = ("<p>Although this is <b>not</b> the case here, we must "
    ...       "not relax our vigilance!</p>")

    >>> regexp_tokenize(s2, r'[,\.\?!"]\s*', gaps=False)
    [', ', '. ', ', ', ', ', '?']
    >>> regexp_tokenize(s2, r'[,\.\?!"]\s*', gaps=True)
    ['Alas', 'it has not rained today', 'When', 'do you think',
    'will it rain again']

Take care to avoid using capturing groups:

    >>> regexp_tokenize(s3, r'</?[bp]>', gaps=False)
    ['<p>', '<b>', '</b>', '</p>']
    >>> regexp_tokenize(s3, r'</?(?:b|p)>', gaps=False)
    ['<p>', '<b>', '</b>', '</p>']
    >>> regexp_tokenize(s3, r'</?(?:b|p)>', gaps=True)
    ['Although this is ', 'not',
    ' the case here, we must not relax our vigilance!']

Named groups are capturing groups, and confuse the tokenizer:

    >>> regexp_tokenize(s3, r'</?(?P<named>b|p)>', gaps=False)
    ['p', 'b', 'b', 'p']
    >>> regexp_tokenize(s3, r'</?(?P<named>b|p)>', gaps=True)
    ['p', 'Although this is ', 'b', 'not', 'b',
    ' the case here, we must not relax our vigilance!', 'p']

Make sure that nested groups don't confuse the tokenizer:

    >>> regexp_tokenize(s2, r'(?:h|r|l)a(?:s|(?:i|n0))', gaps=False)
    ['las', 'has', 'rai', 'rai']
    >>> regexp_tokenize(s2, r'(?:h|r|l)a(?:s|(?:i|n0))', gaps=True)
    ['A', ', it ', ' not ', 'ned today. When, do you think, will it ',
    'n again?']

Back-references require capturing groups, and these are not supported:

    >>> regexp_tokenize("aabbbcccc", r'(.)\1')
    ['a', 'b', 'c', 'c']

A simple sentence tokenizer '\.(\s+|$)'

    >>> regexp_tokenize(s, pattern=r'\.(?:\s+|$)', gaps=True)
    ['Good muffins cost $3.88\nin New York',
    'Please buy me\ntwo of them', 'Thanks']
Regression Tests: TweetTokenizer
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

TweetTokenizer is a tokenizer specifically designed for micro-blogging tokenization tasks.

    >>> from nltk.tokenize import TweetTokenizer
    >>> tknzr = TweetTokenizer()
    >>> s0 = "This is a cooool #dummysmiley: :-) :-P <3 and some arrows < > -> <--"
    >>> tknzr.tokenize(s0)
    ['This', 'is', 'a', 'cooool', '#dummysmiley', ':', ':-)', ':-P', '<3', 'and', 'some', 'arrows', '<', '>', '->', '<--']
    >>> s1 = "@Joyster2012 @CathStaincliffe Good for you, girl!! Best wishes :-)"
    >>> tknzr.tokenize(s1)
    ['@Joyster2012', '@CathStaincliffe', 'Good', 'for', 'you', ',', 'girl', '!', '!', 'Best', 'wishes', ':-)']
    >>> s2 = "3Points for #DreamTeam Gooo BAILEY! :) #PBB737Gold @PBBabscbn"
    >>> tknzr.tokenize(s2)
    ['3Points', 'for', '#DreamTeam', 'Gooo', 'BAILEY', '!', ':)', '#PBB737Gold', '@PBBabscbn']
    >>> s3 = "@Insanomania They do... Their mentality doesn't :("
    >>> tknzr.tokenize(s3)
    ['@Insanomania', 'They', 'do', '...', 'Their', 'mentality', "doesn't", ':(']
    >>> s4 = "RT @facugambande: Ya por arrancar a grabar !!! #TirenTirenTiren vamoo !!"
    >>> tknzr.tokenize(s4)
    ['RT', '@facugambande', ':', 'Ya', 'por', 'arrancar', 'a', 'grabar', '!', '!', '!', '#TirenTirenTiren', 'vamoo', '!', '!']
    >>> tknzr = TweetTokenizer(reduce_len=True)
    >>> s5 = "@crushinghes the summer holidays are great but I'm so bored already :("
    >>> tknzr.tokenize(s5)
    ['@crushinghes', 'the', 'summer', 'holidays', 'are', 'great', 'but', "I'm", 'so', 'bored', 'already', ':(']

It is possible to specify `strip_handles` and `reduce_len` parameters for a TweetTokenizer instance. Setting `strip_handles` to True, the tokenizer will remove Twitter handles (e.g. usernames). Setting `reduce_len` to True, repeated character sequences of length 3 or greater will be replaced with sequences of length 3.

    >>> tknzr = TweetTokenizer(strip_handles=True, reduce_len=True)
    >>> s6 = '@remy: This is waaaaayyyy too much for you!!!!!!'
    >>> tknzr.tokenize(s6)
    [':', 'This', 'is', 'waaayyy', 'too', 'much', 'for', 'you', '!', '!', '!']
    >>> s7 = '@_willy65: No place for @chuck tonight. Sorry.'
    >>> tknzr.tokenize(s7)
    [':', 'No', 'place', 'for', 'tonight', '.', 'Sorry', '.']
    >>> s8 = '@mar_tin is a great developer. Contact him at mar_tin@email.com.'
    >>> tknzr.tokenize(s8)
    ['is', 'a', 'great', 'developer', '.', 'Contact', 'him', 'at', 'mar_tin@email.com', '.']

The `preserve_case` parameter (default: True) allows converting uppercase tokens to lowercase tokens. Emoticons are not affected:

    >>> tknzr = TweetTokenizer(preserve_case=False)
    >>> s9 = "@jrmy: I'm REALLY HAPPYYY about that! NICEEEE :D :P"
    >>> tknzr.tokenize(s9)
    ['@jrmy', ':', "i'm", 'really', 'happyyy', 'about', 'that', '!', 'niceeee', ':D', ':P']

It should not hang on long sequences of the same punctuation character.

    >>> tknzr = TweetTokenizer()
    >>> s10 = "Photo: Aujourd'hui sur http://t.co/0gebOFDUzn Projet... http://t.co/bKfIUbydz2.............................. http://fb.me/3b6uXpz0L"
    >>> tknzr.tokenize(s10)
    ['Photo', ':', "Aujourd'hui", 'sur', 'http://t.co/0gebOFDUzn', 'Projet', '...', 'http://t.co/bKfIUbydz2', '...', 'http://fb.me/3b6uXpz0L']
Regression Tests: PunktSentenceTokenizer
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

The sentence splitter should remove whitespace following the sentence boundary.

    >>> pst = PunktSentenceTokenizer()
    >>> pst.tokenize('See Section 3). Or Section 2). ')
    ['See Section 3).', 'Or Section 2).']
    >>> pst.tokenize('See Section 3.) Or Section 2.) ')
    ['See Section 3.)', 'Or Section 2.)']
    >>> pst.tokenize('See Section 3.) Or Section 2.) ', realign_boundaries=False)
    ['See Section 3.', ') Or Section 2.', ')']

Two instances of PunktSentenceTokenizer should not share PunktParameters.

    >>> pst = PunktSentenceTokenizer()
    >>> pst2 = PunktSentenceTokenizer()
    >>> pst._params is pst2._params
    False

Testing mutable default arguments for https://github.com/nltk/nltk/pull/2067

    >>> from nltk.tokenize.punkt import PunktBaseClass, PunktTrainer, PunktSentenceTokenizer
    >>> from nltk.tokenize.punkt import PunktLanguageVars, PunktParameters
    >>> pbc = PunktBaseClass(lang_vars=None, params=None)
    >>> type(pbc._params)
    <class 'nltk.tokenize.punkt.PunktParameters'>
    >>> type(pbc._lang_vars)
    <class 'nltk.tokenize.punkt.PunktLanguageVars'>
    >>> pt = PunktTrainer(lang_vars=None)
    >>> type(pt._lang_vars)
    <class 'nltk.tokenize.punkt.PunktLanguageVars'>
    >>> pst = PunktSentenceTokenizer(lang_vars=None)
    >>> type(pst._lang_vars)
    <class 'nltk.tokenize.punkt.PunktLanguageVars'>
Regression Tests: align_tokens
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Post-hoc alignment of tokens with a source string

    >>> from nltk.tokenize.util import align_tokens
    >>> list(align_tokens([''], ""))
    [(0, 0)]
    >>> list(align_tokens([''], " "))
    [(0, 0)]
    >>> list(align_tokens([], ""))
    []
    >>> list(align_tokens([], " "))
    []
    >>> list(align_tokens(['a'], "a"))
    [(0, 1)]
    >>> list(align_tokens(['abc', 'def'], "abcdef"))
    [(0, 3), (3, 6)]
    >>> list(align_tokens(['abc', 'def'], "abc def"))
    [(0, 3), (4, 7)]
    >>> list(align_tokens(['ab', 'cd'], "ab cd ef"))
    [(0, 2), (3, 5)]
    >>> list(align_tokens(['ab', 'cd', 'ef'], "ab cd ef"))
    [(0, 2), (3, 5), (6, 8)]
    >>> list(align_tokens(['ab', 'cd', 'efg'], "ab cd ef"))
    Traceback (most recent call last):
    ...
    ValueError: substring "efg" not found in "ab cd ef"
    >>> list(align_tokens(['ab', 'cd', 'ef', 'gh'], "ab cd ef"))
    Traceback (most recent call last):
    ...
    ValueError: substring "gh" not found in "ab cd ef"
    >>> list(align_tokens(['The', 'plane', ',', 'bound', 'for', 'St', 'Petersburg', ',', 'crashed', 'in', 'Egypt', "'s", 'Sinai', 'desert', 'just', '23', 'minutes', 'after', 'take-off', 'from', 'Sharm', 'el-Sheikh', 'on', 'Saturday', '.'], "The plane, bound for St Petersburg, crashed in Egypt's Sinai desert just 23 minutes after take-off from Sharm el-Sheikh on Saturday."))
    [(0, 3), (4, 9), (9, 10), (11, 16), (17, 20), (21, 23), (24, 34), (34, 35), (36, 43), (44, 46), (47, 52), (52, 54), (55, 60), (61, 67), (68, 72), (73, 75), (76, 83), (84, 89), (90, 98), (99, 103), (104, 109), (110, 119), (120, 122), (123, 131), (131, 132)]
Regression Tests: MWETokenizer
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Pickle an MWETokenizer

    >>> from nltk.tokenize import MWETokenizer
    >>> import pickle

    >>> tokenizer = MWETokenizer([('hors', "d'oeuvre")], separator='+')
    >>> p = pickle.dumps(tokenizer)
    >>> unpickled = pickle.loads(p)
    >>> unpickled.tokenize("An hors d'oeuvre tonight, sir?".split())
    ['An', "hors+d'oeuvre", 'tonight,', 'sir?']
Regression Tests: TextTilingTokenizer
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

TextTilingTokenizer tokenizes text into coherent subtopic chunks based upon Hearst's TextTiling algorithm.

    >>> from nltk.tokenize import TextTilingTokenizer
    >>> from nltk.corpus import brown
    >>> tt = TextTilingTokenizer()
    >>> tt.tokenize(brown.raw()[0:1000])
    ["\n\n\tThe/at Fulton/np-tl County/nn-tl Grand/jj-tl Jury/nn-tl said/vbd Friday/nr an/at investigation/nn of/in Atlanta's/np$ recent/jj primary/nn election/nn produced/vbd ``/`` no/at evidence/nn ''/'' that/cs any/dti irregularities/nns took/vbd place/nn ./.\n\n\n\tThe/at jury/nn further/rbr said/vbd in/in term-end/nn presentments/nns that/cs the/at City/nn-tl Executive/jj-tl Committee/nn-tl ,/, which/wdt had/hvd over-all/jj charge/nn of/in the/at election/nn ,/, ``/`` deserves/vbz the/at praise/nn and/cc thanks/nns of/in the/at City/nn-tl of/in-tl Atlanta/np-tl ''/'' for/in the/at manner/nn in/in which/wdt the/at election/nn was/bedz conducted/vbn ./.\n\n\n\tThe/at September-October/np term/nn jury/nn had/hvd been/ben charged/vbn by/in Fulton/np-tl Superior/jj-tl Court/nn-tl Judge/nn-tl Durwood/np Pye/np to/to investigate/vb reports/nns of/in possible/jj ``/`` irregularities/nns ''/'' in/in the/at hard-fought/jj primary/nn which/wdt was/bedz won/vbn by/in Mayor-nominate/nn-tl Ivan/np Allen/np Jr./"]

Test that `ValueError` exceptions are raised when illegal arguments are used.

    >>> TextTilingTokenizer(similarity_method='foo').tokenize(brown.raw()[0:1000])
    Traceback (most recent call last):
    ...
    ValueError: Similarity method foo not recognized
    >>> TextTilingTokenizer(smoothing_method='bar').tokenize(brown.raw()[0:1000])
    Traceback (most recent call last):
    ...
    ValueError: Smoothing method bar not recognized