| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115 |
- # -*- coding: utf-8 -*-
- """
- Unit tests for Senna
- """
- from os import environ, path, sep
- import logging
- import unittest
- from nltk.classify import Senna
- from nltk.tag import SennaTagger, SennaChunkTagger, SennaNERTagger
# Locate the Senna installation used by these tests: the SENNA environment
# variable takes precedence; otherwise fall back to the default directory.
SENNA_EXECUTABLE_PATH = (
    path.normpath(environ['SENNA']) + sep
    if 'SENNA' in environ
    else '/usr/share/senna-v3.0'
)
# Flag consumed by the skipUnless decorators on the test classes.
senna_is_installed = path.exists(SENNA_EXECUTABLE_PATH)
@unittest.skipUnless(senna_is_installed, "Requires Senna executable")
class TestSennaPipeline(unittest.TestCase):
    """Exercise the Senna pipeline class imported from nltk.classify.

    The whole class is skipped unless ``senna_is_installed`` is true.
    """

    def test_senna_pipeline(self):
        """Senna pipeline interface"""
        # Request the three operations in one pipeline run.
        senna = Senna(SENNA_EXECUTABLE_PATH, ['pos', 'chk', 'ner'])
        words = 'Dusseldorf is an international business center'.split()

        # Flatten each per-token mapping into a (word, chk, ner, pos) tuple
        # so it can be compared against plain literals.
        observed = []
        for token in senna.tag(words):
            observed.append(
                (token['word'], token['chk'], token['ner'], token['pos'])
            )

        wanted = [
            ('Dusseldorf', 'B-NP', 'B-LOC', 'NNP'),
            ('is', 'B-VP', 'O', 'VBZ'),
            ('an', 'B-NP', 'O', 'DT'),
            ('international', 'I-NP', 'O', 'JJ'),
            ('business', 'I-NP', 'O', 'NN'),
            ('center', 'I-NP', 'O', 'NN'),
        ]
        self.assertEqual(observed, wanted)
@unittest.skipUnless(senna_is_installed, "Requires Senna executable")
class TestSennaTagger(unittest.TestCase):
    """Check the POS, chunk and NER taggers imported from nltk.tag.

    The whole class is skipped unless ``senna_is_installed`` is true.
    """

    def test_senna_tagger(self):
        # Part-of-speech tagging of a short question.
        pos_tagger = SennaTagger(SENNA_EXECUTABLE_PATH)
        question = 'What is the airspeed of an unladen swallow ?'
        tagged = pos_tagger.tag(question.split())
        wanted = [
            ('What', 'WP'), ('is', 'VBZ'), ('the', 'DT'),
            ('airspeed', 'NN'), ('of', 'IN'), ('an', 'DT'),
            ('unladen', 'NN'), ('swallow', 'NN'), ('?', '.'),
        ]
        self.assertEqual(tagged, wanted)

    def test_senna_chunk_tagger(self):
        # First stage: BIO chunk tags for every token of the question.
        chunker = SennaChunkTagger(SENNA_EXECUTABLE_PATH)
        question = 'What is the airspeed of an unladen swallow ?'
        bio_tags = chunker.tag(question.split())
        wanted_bio = [
            ('What', 'B-NP'), ('is', 'B-VP'), ('the', 'B-NP'),
            ('airspeed', 'I-NP'), ('of', 'B-PP'), ('an', 'B-NP'),
            ('unladen', 'I-NP'), ('swallow', 'I-NP'), ('?', 'O'),
        ]

        # Second stage: collapse the BIO tags into NP chunks. Each expected
        # entry pairs the chunk text with a hyphen-joined string that lines
        # up with the positions of its tokens in the sentence.
        np_chunks = list(chunker.bio_to_chunks(bio_tags, chunk_type='NP'))
        wanted_chunks = [
            ('What', '0'),
            ('the airspeed', '2-3'),
            ('an unladen swallow', '5-6-7'),
        ]

        self.assertEqual(bio_tags, wanted_bio)
        self.assertEqual(np_chunks, wanted_chunks)

    def test_senna_ner_tagger(self):
        # Two sentences are tagged before either result is asserted.
        ner = SennaNERTagger(SENNA_EXECUTABLE_PATH)

        first_sentence = 'Shakespeare theatre was in London .'
        first_tags = ner.tag(first_sentence.split())
        wanted_first = [
            ('Shakespeare', 'B-PER'), ('theatre', 'O'), ('was', 'O'),
            ('in', 'O'), ('London', 'B-LOC'), ('.', 'O'),
        ]

        second_sentence = 'UN headquarters are in NY , USA .'
        second_tags = ner.tag(second_sentence.split())
        wanted_second = [
            ('UN', 'B-ORG'), ('headquarters', 'O'), ('are', 'O'),
            ('in', 'O'), ('NY', 'B-LOC'), (',', 'O'),
            ('USA', 'B-LOC'), ('.', 'O'),
        ]

        self.assertEqual(first_tags, wanted_first)
        self.assertEqual(second_tags, wanted_second)
|