# -*- coding: utf-8 -*-
"""
Unit tests for nltk.tokenize.
See also nltk/test/tokenize.doctest
"""
import unittest

from nose import SkipTest
from nose.tools import assert_equal

from nltk.tokenize import (
    punkt,
    word_tokenize,
    TweetTokenizer,
    StanfordSegmenter,
    TreebankWordTokenizer,
    SyllableTokenizer,
)


class TestTokenize(unittest.TestCase):
    def test_tweet_tokenizer(self):
        """
        Test TweetTokenizer using words with special and accented characters.
        """
        tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True)
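        # strip_handles removes Twitter @mentions from the output, while
        # reduce_len shortens exaggerated character repetitions (the latter
        # is not exercised by this input).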
        s9 = "@myke: Let's test these words: resumé España München français"
        tokens = tokenizer.tokenize(s9)
        expected = [':', "Let's", 'test', 'these', 'words', ':',
                    'resumé', 'España', 'München', 'français']
        self.assertEqual(tokens, expected)

    def test_sonority_sequencing_syllable_tokenizer(self):
        """
        Test SyllableTokenizer.
        """
        tokenizer = SyllableTokenizer()
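        # SyllableTokenizer splits a word into syllables based on the
        # Sonority Sequencing Principle (SSP).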
        tokens = tokenizer.tokenize('justification')
        self.assertEqual(tokens, ['jus', 'ti', 'fi', 'ca', 'tion'])

    def test_stanford_segmenter_arabic(self):
        """
        Test the Stanford Word Segmenter for Arabic (default config)
        """
        try:
            seg = StanfordSegmenter()
            seg.default_config('ar')
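            # StanfordSegmenter wraps the external Stanford segmenter jar and
            # models; if they cannot be located, a LookupError is raised and
            # the test is skipped below.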
            sent = u'يبحث علم الحاسوب استخدام الحوسبة بجميع اشكالها لحل المشكلات'
            segmented_sent = seg.segment(sent.split())
            assert segmented_sent.split() == [
                'يبحث',
                'علم',
                'الحاسوب',
                'استخدام',
                'الحوسبة',
                'ب',
                'جميع',
                'اشكال',
                'ها',
                'ل',
                'حل',
                'المشكلات',
            ]
        except LookupError as e:
            raise SkipTest(str(e))

    def test_stanford_segmenter_chinese(self):
        """
        Test the Stanford Word Segmenter for Chinese (default config)
        """
        try:
            seg = StanfordSegmenter()
            seg.default_config('zh')
            sent = u"这是斯坦福中文分词器测试"
            segmented_sent = seg.segment(sent.split())
            assert segmented_sent.split() == ['这', '是', '斯坦福', '中文', '分词器', '测试']
        except LookupError as e:
            raise SkipTest(str(e))

    def test_phone_tokenizer(self):
        """
        Test strings that resemble phone numbers, with and without an embedded newline.
        """
        # Should be recognized as a phone number, albeit one with multiple spaces
        tokenizer = TweetTokenizer()
        test1 = "(393) 928 -3010"
        expected = ['(393) 928 -3010']
        result = tokenizer.tokenize(test1)
        self.assertEqual(result, expected)

        # Due to the newline, the first three elements aren't part of a phone
        # number; the fourth is
        test2 = "(393)\n928 -3010"
        expected = ['(', '393', ')', "928 -3010"]
        result = tokenizer.tokenize(test2)
        self.assertEqual(result, expected)

    def test_pad_asterisk(self):
        """
        Test padding of asterisks for word tokenization.
        """
        text = "This is a, *weird sentence with *asterisks in it."
        expected = ['This', 'is', 'a', ',', '*', 'weird', 'sentence',
                    'with', '*', 'asterisks', 'in', 'it', '.']
        self.assertEqual(word_tokenize(text), expected)

    def test_pad_dotdot(self):
        """
        Test padding of dotdot* for word tokenization.
        """
        text = "Why did dotdot.. not get tokenized but dotdotdot... did? How about manydots....."
        expected = ['Why', 'did', 'dotdot', '..', 'not', 'get',
                    'tokenized', 'but', 'dotdotdot', '...', 'did', '?',
                    'How', 'about', 'manydots', '.....']
        self.assertEqual(word_tokenize(text), expected)

    def test_remove_handle(self):
        """
        Test remove_handle() from casual.py with specially crafted edge cases
        """
        tokenizer = TweetTokenizer(strip_handles=True)

        # Simple example. Handles with just numbers should be allowed
        test1 = "@twitter hello @twi_tter_. hi @12345 @123news"
        expected = ['hello', '.', 'hi']
        result = tokenizer.tokenize(test1)
        self.assertEqual(result, expected)

        # Handles are allowed to follow any of the following characters
        test2 = "@n`@n~@n(@n)@n-@n=@n+@n\\@n|@n[@n]@n{@n}@n;@n:@n'@n\"@n/@n?@n.@n,@n<@n>@n @n\n@n ñ@n.ü@n.ç@n."
        expected = [
            '`', '~', '(', ')', '-', '=', '+', '\\', '|', '[', ']', '{', '}',
            ';', ':', "'", '"', '/', '?', '.', ',', '<', '>', 'ñ', '.', 'ü',
            '.', 'ç', '.',
        ]
        result = tokenizer.tokenize(test2)
        self.assertEqual(result, expected)

        # Handles are NOT allowed to follow any of the following characters
        test3 = "a@n j@n z@n A@n L@n Z@n 1@n 4@n 7@n 9@n 0@n _@n !@n @@n #@n $@n %@n &@n *@n"
        expected = [
            'a', '@n', 'j', '@n', 'z', '@n', 'A', '@n', 'L', '@n', 'Z', '@n',
            '1', '@n', '4', '@n', '7', '@n', '9', '@n', '0', '@n', '_', '@n',
            '!', '@n', '@', '@n', '#', '@n', '$', '@n', '%', '@n', '&', '@n',
            '*', '@n',
        ]
        result = tokenizer.tokenize(test3)
        self.assertEqual(result, expected)

        # Handles are allowed to precede the following characters
        test4 = "@n!a @n#a @n$a @n%a @n&a @n*a"
        expected = ['!', 'a', '#', 'a', '$', 'a', '%', 'a', '&', 'a', '*', 'a']
        result = tokenizer.tokenize(test4)
        self.assertEqual(result, expected)

        # Tests interactions with special symbols and multiple @
        test5 = "@n!@n @n#@n @n$@n @n%@n @n&@n @n*@n @n@n @@n @n@@n @n_@n @n7@n @nj@n"
        expected = [
            '!', '@n', '#', '@n', '$', '@n', '%', '@n', '&', '@n', '*', '@n',
            '@n', '@n', '@', '@n', '@n', '@', '@n', '@n_', '@n', '@n7', '@n',
            '@nj', '@n',
        ]
        result = tokenizer.tokenize(test5)
        self.assertEqual(result, expected)

        # Tests that handles can have a max length of 20 characters
        test6 = "@abcdefghijklmnopqrstuvwxyz @abcdefghijklmnopqrst1234 @abcdefghijklmnopqrst_ @abcdefghijklmnopqrstendofhandle"
        expected = ['uvwxyz', '1234', '_', 'endofhandle']
        result = tokenizer.tokenize(test6)
        self.assertEqual(result, expected)

        # Edge case where an @ comes directly after a long handle
        test7 = "@abcdefghijklmnopqrstu@abcde @abcdefghijklmnopqrst@abcde @abcdefghijklmnopqrst_@abcde @abcdefghijklmnopqrst5@abcde"
        expected = [
            'u', '@abcde', '@abcdefghijklmnopqrst', '@abcde',
            '_', '@abcde', '5', '@abcde',
        ]
        result = tokenizer.tokenize(test7)
        self.assertEqual(result, expected)

    def test_treebank_span_tokenizer(self):
        """
        Test TreebankWordTokenizer.span_tokenize function
        """
        tokenizer = TreebankWordTokenizer()
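        # span_tokenize() yields (start, end) character offsets into the
        # input string rather than the token strings themselves.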

        # Test case in the docstring
        test1 = "Good muffins cost $3.88\nin New (York).  Please (buy) me\ntwo of them.\n(Thanks)."
        expected = [
            (0, 4), (5, 12), (13, 17), (18, 19), (19, 23), (24, 26), (27, 30),
            (31, 32), (32, 36), (36, 37), (37, 38), (40, 46), (47, 48),
            (48, 51), (51, 52), (53, 55), (56, 59), (60, 62), (63, 68),
            (69, 70), (70, 76), (76, 77), (77, 78),
        ]
        result = list(tokenizer.span_tokenize(test1))
        self.assertEqual(result, expected)

        # Test case with double quotation marks
        test2 = "The DUP is similar to the \"religious right\" in the United States and takes a hardline stance on social issues"
        expected = [
            (0, 3), (4, 7), (8, 10), (11, 18), (19, 21), (22, 25), (26, 27),
            (27, 36), (37, 42), (42, 43), (44, 46), (47, 50), (51, 57),
            (58, 64), (65, 68), (69, 74), (75, 76), (77, 85), (86, 92),
            (93, 95), (96, 102), (103, 109),
        ]
        result = list(tokenizer.span_tokenize(test2))
        self.assertEqual(result, expected)

        # Test case with double quotation marks as well as converted quotations
        test3 = "The DUP is similar to the \"religious right\" in the United States and takes a ``hardline'' stance on social issues"
        expected = [
            (0, 3), (4, 7), (8, 10), (11, 18), (19, 21), (22, 25), (26, 27),
            (27, 36), (37, 42), (42, 43), (44, 46), (47, 50), (51, 57),
            (58, 64), (65, 68), (69, 74), (75, 76), (77, 79), (79, 87),
            (87, 89), (90, 96), (97, 99), (100, 106), (107, 113),
        ]
        result = list(tokenizer.span_tokenize(test3))
        self.assertEqual(result, expected)

    def test_word_tokenize(self):
        """
        Test word_tokenize function
        """
        sentence = "The 'v', I've been fooled but I'll seek revenge."
        expected = ['The', "'", 'v', "'", ',', 'I', "'ve", 'been', 'fooled',
                    'but', 'I', "'ll", 'seek', 'revenge', '.']
        self.assertEqual(word_tokenize(sentence), expected)

        sentence = "'v' 're'"
        expected = ["'", 'v', "'", "'re", "'"]
        self.assertEqual(word_tokenize(sentence), expected)

    def test_punkt_pair_iter(self):
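        # punkt._pair_iter pairs each item of an iterable with its successor,
        # padding the final pair with None, as the expected outputs below show.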
        test_cases = [
            ('12', [('1', '2'), ('2', None)]),
            ('123', [('1', '2'), ('2', '3'), ('3', None)]),
            ('1234', [('1', '2'), ('2', '3'), ('3', '4'), ('4', None)]),
        ]
        for (test_input, expected_output) in test_cases:
            actual_output = [x for x in punkt._pair_iter(test_input)]
            assert_equal(actual_output, expected_output)

    def test_punkt_pair_iter_handles_stop_iteration_exception(self):
        # test input to trigger StopIteration from next()
        it = iter([])
        # call method under test and produce a generator
        gen = punkt._pair_iter(it)
        # unpack generator, ensure that no error is raised
        list(gen)

    def test_punkt_tokenize_words_handles_stop_iteration_exception(self):
        obj = punkt.PunktBaseClass()
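
        # Minimal stand-in for the Punkt language vars: its word_tokenize()
        # yields nothing, so _tokenize_words() must cope with an empty stream
        # without letting StopIteration escape.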
        class TestPunktTokenizeWordsMock:
            def word_tokenize(self, s):
                return iter([])

        obj._lang_vars = TestPunktTokenizeWordsMock()
        # unpack generator, ensure that no error is raised
        list(obj._tokenize_words('test'))
|