| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702
70370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761
2771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416 |
- # -*- coding: utf-8 -*-
- """
- Mock test for Stanford CoreNLP wrappers.
- """
- import sys
- from itertools import chain
- from unittest import TestCase, SkipTest
- from unittest.mock import MagicMock
- from nltk.tree import Tree
- from nltk.parse import corenlp
class TestTokenizerAPI(TestCase):
    """Check ``CoreNLPParser.tokenize`` against a canned server response."""

    def test_tokenize(self):
        """Words of all returned sentences come back as one flat sequence.

        ``api_call`` is replaced by a mock, so no CoreNLP server is needed.
        The test also pins the arguments passed to ``api_call``.
        """
        raw_text = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\nThanks."

        # One inner list per sentence; each row is
        # (before, word, after, characterOffsetBegin, characterOffsetEnd).
        sentence_rows = [
            [
                ('', 'Good', ' ', 0, 4),
                (' ', 'muffins', ' ', 5, 12),
                (' ', 'cost', ' ', 13, 17),
                (' ', '$', '', 18, 19),
                ('', '3.88', '\n', 19, 23),
                ('\n', 'in', ' ', 24, 26),
                (' ', 'New', ' ', 27, 30),
                (' ', 'York', '', 31, 35),
                ('', '.', ' ', 35, 36),
            ],
            [
                (' ', 'Please', ' ', 38, 44),
                (' ', 'buy', ' ', 45, 48),
                (' ', 'me', '\n', 49, 51),
                ('\n', 'two', ' ', 52, 55),
                (' ', 'of', ' ', 56, 58),
                (' ', 'them', '', 59, 63),
                ('', '.', '\n', 63, 64),
            ],
            [
                ('\n', 'Thanks', '', 65, 71),
                ('', '.', '', 71, 72),
            ],
        ]

        def as_token(position, row):
            # Token indices are 1-based within their sentence.
            before, word, after, begin, end = row
            return {
                'after': after,
                'before': before,
                'characterOffsetBegin': begin,
                'characterOffsetEnd': end,
                'index': position,
                'originalText': word,
                'word': word,
            }

        canned_response = {
            'sentences': [
                {
                    'index': sentence_index,
                    'tokens': [
                        as_token(position, row)
                        for position, row in enumerate(rows, start=1)
                    ],
                }
                for sentence_index, rows in enumerate(sentence_rows)
            ]
        }

        expected_words = [
            'Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.',
            'Please', 'buy', 'me', 'two', 'of', 'them', '.',
            'Thanks', '.',
        ]

        tokenizer = corenlp.CoreNLPParser()
        tokenizer.api_call = MagicMock(return_value=canned_response)

        actual_words = list(tokenizer.tokenize(raw_text))

        tokenizer.api_call.assert_called_once_with(
            raw_text,
            properties={'annotators': 'tokenize,ssplit'},
        )
        self.assertEqual(expected_words, actual_words)
class TestTaggerAPI(TestCase):
    """Exercise ``CoreNLPParser.tag`` with the server call replaced by a mock.

    Each test builds a canned response, installs it as the return value of
    ``api_call`` and then checks both the tagger output and the arguments
    that ``api_call`` received.
    """

    @staticmethod
    def _dependencies(words, arcs):
        """Expand ``(dep, dependent, governor)`` triples into response dicts.

        :param words: the sentence's words, in order.
        :param arcs: iterable of ``(label, dependent, governor)`` where the
            two indices are 1-based positions in *words*; governor ``0``
            stands for the artificial ``ROOT`` node.
        :return: a fresh list of dependency dicts.
        """
        def gloss(position):
            return 'ROOT' if position == 0 else words[position - 1]

        return [
            {
                'dep': label,
                'dependent': dependent,
                'dependentGloss': gloss(dependent),
                'governor': governor,
                'governorGloss': gloss(governor),
            }
            for label, dependent, governor in arcs
        ]

    @staticmethod
    def _tokens(rows):
        """Build token dicts for words separated by exactly one space.

        :param rows: sequence of ``(word, lemma, pos)`` or
            ``(word, lemma, pos, ner)`` tuples.
        :return: list of token dicts; character offsets are computed from the
            word lengths, and the ``ner`` key is present only for 4-tuples.
        """
        tokens = []
        cursor = 0
        last = len(rows)
        for position, row in enumerate(rows, start=1):
            word, lemma, pos = row[:3]
            end = cursor + len(word)
            token = {
                # No whitespace before the first or after the last word.
                'after': '' if position == last else ' ',
                'before': '' if position == 1 else ' ',
                'characterOffsetBegin': cursor,
                'characterOffsetEnd': end,
                'index': position,
                'lemma': lemma,
            }
            if len(row) > 3:
                token['ner'] = row[3]
            token['originalText'] = word
            token['pos'] = pos
            token['word'] = word
            tokens.append(token)
            cursor = end + 1  # skip the single separating space
        return tokens

    def test_pos_tagger(self):
        """``tagtype='pos'`` returns ``(word, pos)`` pairs from the response."""
        sentence = 'What is the airspeed of an unladen swallow ?'
        words = sentence.split()

        def arcs(nmod_label):
            # Only the label of the nmod arc differs between the basic and
            # the enhanced dependency lists.
            return [
                ('ROOT', 1, 0),
                ('cop', 2, 1),
                ('det', 3, 4),
                ('nsubj', 4, 1),
                ('case', 5, 8),
                ('det', 6, 8),
                ('compound', 7, 8),
                (nmod_label, 8, 4),
                ('punct', 9, 1),
            ]

        canned_response = {
            'sentences': [
                {
                    'basicDependencies': self._dependencies(words, arcs('nmod')),
                    'enhancedDependencies': self._dependencies(words, arcs('nmod:of')),
                    'enhancedPlusPlusDependencies': self._dependencies(words, arcs('nmod:of')),
                    'index': 0,
                    'parse': '(ROOT\n (SBARQ\n (WHNP (WP What))\n (SQ (VBZ is)\n (NP\n (NP (DT the) (NN airspeed))\n (PP (IN of)\n (NP (DT an) (NN unladen) (NN swallow)))))\n (. ?)))',
                    'tokens': self._tokens(
                        [
                            ('What', 'what', 'WP'),
                            ('is', 'be', 'VBZ'),
                            ('the', 'the', 'DT'),
                            ('airspeed', 'airspeed', 'NN'),
                            ('of', 'of', 'IN'),
                            ('an', 'a', 'DT'),
                            ('unladen', 'unladen', 'JJ'),
                            ('swallow', 'swallow', 'VB'),
                            ('?', '?', '.'),
                        ]
                    ),
                }
            ]
        }

        expected_output = [
            ('What', 'WP'),
            ('is', 'VBZ'),
            ('the', 'DT'),
            ('airspeed', 'NN'),
            ('of', 'IN'),
            ('an', 'DT'),
            ('unladen', 'JJ'),
            ('swallow', 'VB'),
            ('?', '.'),
        ]

        corenlp_tagger = corenlp.CoreNLPParser(tagtype='pos')
        corenlp_tagger.api_call = MagicMock(return_value=canned_response)

        tagged_output = corenlp_tagger.tag(words)

        corenlp_tagger.api_call.assert_called_once_with(
            sentence,
            properties={
                'ssplit.isOneSentence': 'true',
                'annotators': 'tokenize,ssplit,pos',
            },
        )
        self.assertEqual(expected_output, tagged_output)

    def test_ner_tagger(self):
        """``tagtype='ner'`` returns ``(word, ner)`` pairs from the response."""
        sentence = 'Rami Eid is studying at Stony Brook University in NY'

        canned_response = {
            'sentences': [
                {
                    'index': 0,
                    'tokens': self._tokens(
                        [
                            ('Rami', 'Rami', 'NNP', 'PERSON'),
                            ('Eid', 'Eid', 'NNP', 'PERSON'),
                            ('is', 'be', 'VBZ', 'O'),
                            ('studying', 'study', 'VBG', 'O'),
                            ('at', 'at', 'IN', 'O'),
                            ('Stony', 'Stony', 'NNP', 'ORGANIZATION'),
                            ('Brook', 'Brook', 'NNP', 'ORGANIZATION'),
                            ('University', 'University', 'NNP', 'ORGANIZATION'),
                            ('in', 'in', 'IN', 'O'),
                            ('NY', 'NY', 'NNP', 'O'),
                        ]
                    ),
                }
            ]
        }

        expected_output = [
            ('Rami', 'PERSON'),
            ('Eid', 'PERSON'),
            ('is', 'O'),
            ('studying', 'O'),
            ('at', 'O'),
            ('Stony', 'ORGANIZATION'),
            ('Brook', 'ORGANIZATION'),
            ('University', 'ORGANIZATION'),
            ('in', 'O'),
            ('NY', 'O'),
        ]

        corenlp_tagger = corenlp.CoreNLPParser(tagtype='ner')
        corenlp_tagger.api_call = MagicMock(return_value=canned_response)

        tagged_output = corenlp_tagger.tag(sentence.split())

        corenlp_tagger.api_call.assert_called_once_with(
            sentence,
            properties={
                'ssplit.isOneSentence': 'true',
                'annotators': 'tokenize,ssplit,ner',
            },
        )
        self.assertEqual(expected_output, tagged_output)

    def test_unexpected_tagtype(self):
        """Constructing the parser with an unknown ``tagtype`` raises ``ValueError``."""
        with self.assertRaises(ValueError):
            # The constructor call itself must raise; nothing is bound because
            # no instance is expected to exist afterwards.
            corenlp.CoreNLPParser(tagtype='test')
class TestParserAPI(TestCase):
    """Check the constituency and dependency parsers against canned responses.

    Both tests use the same nine-word sentence; ``api_call`` is replaced by a
    mock, so no CoreNLP server is contacted.
    """

    _SENTENCE = "The quick brown fox jumps over the lazy dog"

    # (word, lemma, pos) for every word of _SENTENCE, in order.
    _ANNOTATED = (
        ('The', 'the', 'DT'),
        ('quick', 'quick', 'JJ'),
        ('brown', 'brown', 'JJ'),
        ('fox', 'fox', 'NN'),
        ('jumps', 'jump', 'VBZ'),
        ('over', 'over', 'IN'),
        ('the', 'the', 'DT'),
        ('lazy', 'lazy', 'JJ'),
        ('dog', 'dog', 'NN'),
    )

    @classmethod
    def _tokens(cls):
        """Return fresh token dicts for ``_SENTENCE``.

        Words are separated by single spaces, so the character offsets are
        derived from the word lengths.
        """
        tokens = []
        cursor = 0
        last = len(cls._ANNOTATED)
        for position, (word, lemma, pos) in enumerate(cls._ANNOTATED, start=1):
            end = cursor + len(word)
            tokens.append(
                {
                    # No whitespace before the first or after the last word.
                    'after': '' if position == last else ' ',
                    'before': '' if position == 1 else ' ',
                    'characterOffsetBegin': cursor,
                    'characterOffsetEnd': end,
                    'index': position,
                    'lemma': lemma,
                    'originalText': word,
                    'pos': pos,
                    'word': word,
                }
            )
            cursor = end + 1  # skip the single separating space
        return tokens

    @classmethod
    def _dependencies(cls, arcs):
        """Expand ``(dep, dependent, governor)`` triples into response dicts.

        Indices are 1-based positions in ``_SENTENCE``; governor ``0`` stands
        for the artificial ``ROOT`` node.
        """
        words = cls._SENTENCE.split()

        def gloss(position):
            return 'ROOT' if position == 0 else words[position - 1]

        return [
            {
                'dep': label,
                'dependent': dependent,
                'dependentGloss': gloss(dependent),
                'governor': governor,
                'governorGloss': gloss(governor),
            }
            for label, dependent, governor in arcs
        ]

    def test_parse(self):
        """``CoreNLPParser.parse`` turns the ``parse`` string into a ``Tree``."""

        def arcs(nmod_label):
            # Only the label of the final arc differs between the basic and
            # the enhanced dependency lists.
            return [
                ('ROOT', 4, 0),
                ('det', 1, 4),
                ('amod', 2, 4),
                ('amod', 3, 4),
                ('dep', 5, 4),
                ('case', 6, 9),
                ('det', 7, 9),
                ('amod', 8, 9),
                (nmod_label, 9, 5),
            ]

        canned_response = {
            'sentences': [
                {
                    'basicDependencies': self._dependencies(arcs('nmod')),
                    'enhancedDependencies': self._dependencies(arcs('nmod:over')),
                    'enhancedPlusPlusDependencies': self._dependencies(arcs('nmod:over')),
                    'index': 0,
                    'parse': '(ROOT\n (NP\n (NP (DT The) (JJ quick) (JJ brown) (NN fox))\n (NP\n (NP (NNS jumps))\n (PP (IN over)\n (NP (DT the) (JJ lazy) (NN dog))))))',
                    'tokens': self._tokens(),
                }
            ]
        }

        # Expected tree, assembled bottom-up.
        fox_np = Tree(
            'NP',
            [
                Tree('DT', ['The']),
                Tree('JJ', ['quick']),
                Tree('JJ', ['brown']),
                Tree('NN', ['fox']),
            ],
        )
        dog_np = Tree(
            'NP',
            [
                Tree('DT', ['the']),
                Tree('JJ', ['lazy']),
                Tree('NN', ['dog']),
            ],
        )
        over_pp = Tree('PP', [Tree('IN', ['over']), dog_np])
        jumps_np = Tree('NP', [Tree('NP', [Tree('NNS', ['jumps'])]), over_pp])
        expected_tree = Tree('ROOT', [Tree('NP', [fox_np, jumps_np])])

        parser = corenlp.CoreNLPParser()
        parser.api_call = MagicMock(return_value=canned_response)

        # parse() is given a list of tokens and yields trees; take the first.
        first_tree = next(parser.parse(self._SENTENCE.split()))

        parser.api_call.assert_called_once_with(
            self._SENTENCE,
            properties={'ssplit.eolonly': 'true'},
        )
        self.assertEqual(expected_tree, first_tree)

    def test_dependency_parser(self):
        """``CoreNLPDependencyParser.parse`` yields a graph whose ``tree()`` matches."""

        def arcs(nmod_label):
            # Only the label of the final arc differs between the basic and
            # the enhanced dependency lists.
            return [
                ('ROOT', 5, 0),
                ('det', 1, 4),
                ('amod', 2, 4),
                ('amod', 3, 4),
                ('nsubj', 4, 5),
                ('case', 6, 9),
                ('det', 7, 9),
                ('amod', 8, 9),
                (nmod_label, 9, 5),
            ]

        canned_response = {
            'sentences': [
                {
                    'basicDependencies': self._dependencies(arcs('nmod')),
                    'enhancedDependencies': self._dependencies(arcs('nmod:over')),
                    'enhancedPlusPlusDependencies': self._dependencies(arcs('nmod:over')),
                    'index': 0,
                    'tokens': self._tokens(),
                }
            ]
        }

        expected_tree = Tree(
            'jumps',
            [
                Tree('fox', ['The', 'quick', 'brown']),
                Tree('dog', ['over', 'the', 'lazy']),
            ],
        )

        parser = corenlp.CoreNLPDependencyParser()
        parser.api_call = MagicMock(return_value=canned_response)

        # parse() is given a list of tokens; take the first result.
        first_graph = next(parser.parse(self._SENTENCE.split()))

        parser.api_call.assert_called_once_with(
            self._SENTENCE,
            properties={'ssplit.eolonly': 'true'},
        )
        self.assertEqual(expected_tree, first_graph.tree())
|