# -*- coding: utf-8 -*-
"""
Unit tests for nltk.tokenize.
See also nltk/test/tokenize.doctest
"""
import unittest

from nose import SkipTest
from nose.tools import assert_equal

from nltk.tokenize import (
    punkt,
    word_tokenize,
    TweetTokenizer,
    StanfordSegmenter,
    TreebankWordTokenizer,
    SyllableTokenizer,
)


class TestTokenize(unittest.TestCase):

    def test_tweet_tokenizer(self):
        """
        Test TweetTokenizer using words with special and accented characters.
        """
        tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True)
        s9 = "@myke: Let's test these words: resumé España München français"
        tokens = tokenizer.tokenize(s9)
        expected = [
            ':',
            "Let's",
            'test',
            'these',
            'words',
            ':',
            'resumé',
            'España',
            'München',
            'français',
        ]
        self.assertEqual(tokens, expected)
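        # Note that strip_handles=True removes the leading "@myke" handle
        # entirely, while the accented tokens ('resumé', 'España', 'München',
        # 'français') pass through unchanged.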

    def test_sonority_sequencing_syllable_tokenizer(self):
        """
        Test the sonority-sequencing SyllableTokenizer.
        """
        tokenizer = SyllableTokenizer()
        tokens = tokenizer.tokenize('justification')
        self.assertEqual(tokens, ['jus', 'ti', 'fi', 'ca', 'tion'])
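        # SyllableTokenizer segments words according to the Sonority
        # Sequencing Principle, hence the five syllables for 'justification'.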

    def test_stanford_segmenter_arabic(self):
        """
        Test the Stanford Word Segmenter for Arabic (default config)
        """
        try:
            seg = StanfordSegmenter()
            seg.default_config('ar')
            sent = u'يبحث علم الحاسوب استخدام الحوسبة بجميع اشكالها لحل المشكلات'
            segmented_sent = seg.segment(sent.split())
            assert segmented_sent.split() == [
                'يبحث',
                'علم',
                'الحاسوب',
                'استخدام',
                'الحوسبة',
                'ب',
                'جميع',
                'اشكال',
                'ها',
                'ل',
                'حل',
                'المشكلات',
            ]
        except LookupError as e:
            raise SkipTest(str(e))

    def test_stanford_segmenter_chinese(self):
        """
        Test the Stanford Word Segmenter for Chinese (default config)
        """
        try:
            seg = StanfordSegmenter()
            seg.default_config('zh')
            sent = u"这是斯坦福中文分词器测试"
            segmented_sent = seg.segment(sent.split())
            assert segmented_sent.split() == ['这', '是', '斯坦福', '中文', '分词器', '测试']
        except LookupError as e:
            raise SkipTest(str(e))
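        # Both Stanford segmenter tests need the external Stanford segmenter
        # jar/models to be available; the LookupError typically raised when
        # they cannot be located is turned into a test skip rather than a
        # failure.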

    def test_phone_tokenizer(self):
        """
        Test strings that resemble phone numbers, with and without an
        embedded newline.
        """
        # Should be recognized as a phone number, albeit one with multiple spaces
        tokenizer = TweetTokenizer()
        test1 = "(393) 928 -3010"
        expected = ['(393) 928 -3010']
        result = tokenizer.tokenize(test1)
        self.assertEqual(result, expected)

        # Due to the newline, the first three elements aren't part of a phone
        # number; the fourth is
        test2 = "(393)\n928 -3010"
        expected = ['(', '393', ')', "928 -3010"]
        result = tokenizer.tokenize(test2)
        self.assertEqual(result, expected)

    def test_pad_asterisk(self):
        """
        Test padding of asterisk for word tokenization.
        """
        text = "This is a, *weird sentence with *asterisks in it."
        expected = ['This', 'is', 'a', ',', '*', 'weird', 'sentence',
                    'with', '*', 'asterisks', 'in', 'it', '.']
        self.assertEqual(word_tokenize(text), expected)

    def test_pad_dotdot(self):
        """
        Test padding of dotdot* for word tokenization.
        """
        text = "Why did dotdot.. not get tokenized but dotdotdot... did? How about manydots....."
        expected = ['Why', 'did', 'dotdot', '..', 'not', 'get',
                    'tokenized', 'but', 'dotdotdot', '...', 'did', '?',
                    'How', 'about', 'manydots', '.....']
        self.assertEqual(word_tokenize(text), expected)
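        # Runs of two or more dots ('..', '...', '.....') are padded off the
        # preceding word but kept together as single tokens rather than being
        # split into individual periods.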

    def test_remove_handle(self):
        """
        Test remove_handle() from casual.py with specially crafted edge cases
        """
        tokenizer = TweetTokenizer(strip_handles=True)

        # Simple example. Handles with just numbers should be allowed
        test1 = "@twitter hello @twi_tter_. hi @12345 @123news"
        expected = ['hello', '.', 'hi']
        result = tokenizer.tokenize(test1)
        self.assertEqual(result, expected)

        # Handles are allowed to follow any of the following characters
        test2 = "@n`@n~@n(@n)@n-@n=@n+@n\\@n|@n[@n]@n{@n}@n;@n:@n'@n\"@n/@n?@n.@n,@n<@n>@n @n\n@n ñ@n.ü@n.ç@n."
        expected = [
            '`',
            '~',
            '(',
            ')',
            '-',
            '=',
            '+',
            '\\',
            '|',
            '[',
            ']',
            '{',
            '}',
            ';',
            ':',
            "'",
            '"',
            '/',
            '?',
            '.',
            ',',
            '<',
            '>',
            'ñ',
            '.',
            'ü',
            '.',
            'ç',
            '.',
        ]
        result = tokenizer.tokenize(test2)
        self.assertEqual(result, expected)

        # Handles are NOT allowed to follow any of the following characters
        test3 = "a@n j@n z@n A@n L@n Z@n 1@n 4@n 7@n 9@n 0@n _@n !@n @@n #@n $@n %@n &@n *@n"
        expected = [
            'a',
            '@n',
            'j',
            '@n',
            'z',
            '@n',
            'A',
            '@n',
            'L',
            '@n',
            'Z',
            '@n',
            '1',
            '@n',
            '4',
            '@n',
            '7',
            '@n',
            '9',
            '@n',
            '0',
            '@n',
            '_',
            '@n',
            '!',
            '@n',
            '@',
            '@n',
            '#',
            '@n',
            '$',
            '@n',
            '%',
            '@n',
            '&',
            '@n',
            '*',
            '@n',
        ]
        result = tokenizer.tokenize(test3)
        self.assertEqual(result, expected)

        # Handles are allowed to precede the following characters
        test4 = "@n!a @n#a @n$a @n%a @n&a @n*a"
        expected = ['!', 'a', '#', 'a', '$', 'a', '%', 'a', '&', 'a', '*', 'a']
        result = tokenizer.tokenize(test4)
        self.assertEqual(result, expected)

        # Tests interactions with special symbols and multiple @
        test5 = "@n!@n @n#@n @n$@n @n%@n @n&@n @n*@n @n@n @@n @n@@n @n_@n @n7@n @nj@n"
        expected = [
            '!',
            '@n',
            '#',
            '@n',
            '$',
            '@n',
            '%',
            '@n',
            '&',
            '@n',
            '*',
            '@n',
            '@n',
            '@n',
            '@',
            '@n',
            '@n',
            '@',
            '@n',
            '@n_',
            '@n',
            '@n7',
            '@n',
            '@nj',
            '@n',
        ]
        result = tokenizer.tokenize(test5)
        self.assertEqual(result, expected)

        # Tests that handles can have a max length of 20
        test6 = "@abcdefghijklmnopqrstuvwxyz @abcdefghijklmnopqrst1234 @abcdefghijklmnopqrst_ @abcdefghijklmnopqrstendofhandle"
        expected = ['uvwxyz', '1234', '_', 'endofhandle']
        result = tokenizer.tokenize(test6)
        self.assertEqual(result, expected)
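        # In test6 each handle longer than 20 characters only has its first
        # 20 characters (after the '@') stripped, so the remainder survives
        # as an ordinary token.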

        # Edge case where an @ comes directly after a long handle
        test7 = "@abcdefghijklmnopqrstu@abcde @abcdefghijklmnopqrst@abcde @abcdefghijklmnopqrst_@abcde @abcdefghijklmnopqrst5@abcde"
        expected = [
            'u',
            '@abcde',
            '@abcdefghijklmnopqrst',
            '@abcde',
            '_',
            '@abcde',
            '5',
            '@abcde',
        ]
        result = tokenizer.tokenize(test7)
        self.assertEqual(result, expected)

    def test_treebank_span_tokenizer(self):
        """
        Test TreebankWordTokenizer.span_tokenize function
        """
        tokenizer = TreebankWordTokenizer()

        # Test case in the docstring (note the two spaces after "(York).",
        # which the spans below depend on)
        test1 = "Good muffins cost $3.88\nin New (York).  Please (buy) me\ntwo of them.\n(Thanks)."
        expected = [
            (0, 4),
            (5, 12),
            (13, 17),
            (18, 19),
            (19, 23),
            (24, 26),
            (27, 30),
            (31, 32),
            (32, 36),
            (36, 37),
            (37, 38),
            (40, 46),
            (47, 48),
            (48, 51),
            (51, 52),
            (53, 55),
            (56, 59),
            (60, 62),
            (63, 68),
            (69, 70),
            (70, 76),
            (76, 77),
            (77, 78),
        ]
        result = list(tokenizer.span_tokenize(test1))
        self.assertEqual(result, expected)

        # Test case with double quotation
        test2 = "The DUP is similar to the \"religious right\" in the United States and takes a hardline stance on social issues"
        expected = [
            (0, 3),
            (4, 7),
            (8, 10),
            (11, 18),
            (19, 21),
            (22, 25),
            (26, 27),
            (27, 36),
            (37, 42),
            (42, 43),
            (44, 46),
            (47, 50),
            (51, 57),
            (58, 64),
            (65, 68),
            (69, 74),
            (75, 76),
            (77, 85),
            (86, 92),
            (93, 95),
            (96, 102),
            (103, 109),
        ]
        result = list(tokenizer.span_tokenize(test2))
        self.assertEqual(result, expected)

        # Test case with double quotation marks as well as converted quotations
        test3 = "The DUP is similar to the \"religious right\" in the United States and takes a ``hardline'' stance on social issues"
        expected = [
            (0, 3),
            (4, 7),
            (8, 10),
            (11, 18),
            (19, 21),
            (22, 25),
            (26, 27),
            (27, 36),
            (37, 42),
            (42, 43),
            (44, 46),
            (47, 50),
            (51, 57),
            (58, 64),
            (65, 68),
            (69, 74),
            (75, 76),
            (77, 79),
            (79, 87),
            (87, 89),
            (90, 96),
            (97, 99),
            (100, 106),
            (107, 113),
        ]
        result = list(tokenizer.span_tokenize(test3))
        self.assertEqual(result, expected)

    def test_word_tokenize(self):
        """
        Test word_tokenize function
        """
        sentence = "The 'v', I've been fooled but I'll seek revenge."
        expected = ['The', "'", 'v', "'", ',', 'I', "'ve", 'been', 'fooled',
                    'but', 'I', "'ll", 'seek', 'revenge', '.']
        self.assertEqual(word_tokenize(sentence), expected)

        sentence = "'v' 're'"
        expected = ["'", 'v', "'", "'re", "'"]
        self.assertEqual(word_tokenize(sentence), expected)
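        # Quote characters around a bare letter are split off as separate
        # tokens, while clitics such as 've, 'll and 're keep the apostrophe
        # attached to the contraction token.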

    def test_punkt_pair_iter(self):
        test_cases = [
            ('12', [('1', '2'), ('2', None)]),
            ('123', [('1', '2'), ('2', '3'), ('3', None)]),
            ('1234', [('1', '2'), ('2', '3'), ('3', '4'), ('4', None)]),
        ]
        for (test_input, expected_output) in test_cases:
            actual_output = [x for x in punkt._pair_iter(test_input)]
            assert_equal(actual_output, expected_output)
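        # These cases pin down the contract of the private helper
        # punkt._pair_iter: it yields (item, next_item) pairs over the input
        # and pads the final pair with None.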

    def test_punkt_pair_iter_handles_stop_iteration_exception(self):
        # test input to trigger StopIteration from next()
        it = iter([])
        # call method under test and produce a generator
        gen = punkt._pair_iter(it)
        # unpack generator, ensure that no error is raised
        list(gen)
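        # An empty iterator makes the first next() call raise StopIteration;
        # under PEP 479 an unhandled StopIteration inside a generator would
        # surface as a RuntimeError, which is presumably what this test (and
        # the one below) guard against.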

    def test_punkt_tokenize_words_handles_stop_iteration_exception(self):
        obj = punkt.PunktBaseClass()

        class TestPunktTokenizeWordsMock:
            def word_tokenize(self, s):
                return iter([])

        obj._lang_vars = TestPunktTokenizeWordsMock()
        # unpack generator, ensure that no error is raised
        list(obj._tokenize_words('test'))
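

# Convenience entry point (an addition here, not part of the nose-collected
# NLTK suite): it simply lets this module be run directly with the standard
# unittest runner.
if __name__ == '__main__':
    unittest.main()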