test_corenlp.py 56 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416
  1. # -*- coding: utf-8 -*-
  2. """
  3. Mock test for Stanford CoreNLP wrappers.
  4. """
  5. import sys
  6. from itertools import chain
  7. from unittest import TestCase, SkipTest
  8. from unittest.mock import MagicMock
  9. from nltk.tree import Tree
  10. from nltk.parse import corenlp
  11. class TestTokenizerAPI(TestCase):
  12. def test_tokenize(self):
  13. corenlp_tokenizer = corenlp.CoreNLPParser()
  14. api_return_value = {
  15. u'sentences': [
  16. {
  17. u'index': 0,
  18. u'tokens': [
  19. {
  20. u'after': u' ',
  21. u'before': u'',
  22. u'characterOffsetBegin': 0,
  23. u'characterOffsetEnd': 4,
  24. u'index': 1,
  25. u'originalText': u'Good',
  26. u'word': u'Good',
  27. },
  28. {
  29. u'after': u' ',
  30. u'before': u' ',
  31. u'characterOffsetBegin': 5,
  32. u'characterOffsetEnd': 12,
  33. u'index': 2,
  34. u'originalText': u'muffins',
  35. u'word': u'muffins',
  36. },
  37. {
  38. u'after': u' ',
  39. u'before': u' ',
  40. u'characterOffsetBegin': 13,
  41. u'characterOffsetEnd': 17,
  42. u'index': 3,
  43. u'originalText': u'cost',
  44. u'word': u'cost',
  45. },
  46. {
  47. u'after': u'',
  48. u'before': u' ',
  49. u'characterOffsetBegin': 18,
  50. u'characterOffsetEnd': 19,
  51. u'index': 4,
  52. u'originalText': u'$',
  53. u'word': u'$',
  54. },
  55. {
  56. u'after': u'\n',
  57. u'before': u'',
  58. u'characterOffsetBegin': 19,
  59. u'characterOffsetEnd': 23,
  60. u'index': 5,
  61. u'originalText': u'3.88',
  62. u'word': u'3.88',
  63. },
  64. {
  65. u'after': u' ',
  66. u'before': u'\n',
  67. u'characterOffsetBegin': 24,
  68. u'characterOffsetEnd': 26,
  69. u'index': 6,
  70. u'originalText': u'in',
  71. u'word': u'in',
  72. },
  73. {
  74. u'after': u' ',
  75. u'before': u' ',
  76. u'characterOffsetBegin': 27,
  77. u'characterOffsetEnd': 30,
  78. u'index': 7,
  79. u'originalText': u'New',
  80. u'word': u'New',
  81. },
  82. {
  83. u'after': u'',
  84. u'before': u' ',
  85. u'characterOffsetBegin': 31,
  86. u'characterOffsetEnd': 35,
  87. u'index': 8,
  88. u'originalText': u'York',
  89. u'word': u'York',
  90. },
  91. {
  92. u'after': u' ',
  93. u'before': u'',
  94. u'characterOffsetBegin': 35,
  95. u'characterOffsetEnd': 36,
  96. u'index': 9,
  97. u'originalText': u'.',
  98. u'word': u'.',
  99. },
  100. ],
  101. },
  102. {
  103. u'index': 1,
  104. u'tokens': [
  105. {
  106. u'after': u' ',
  107. u'before': u' ',
  108. u'characterOffsetBegin': 38,
  109. u'characterOffsetEnd': 44,
  110. u'index': 1,
  111. u'originalText': u'Please',
  112. u'word': u'Please',
  113. },
  114. {
  115. u'after': u' ',
  116. u'before': u' ',
  117. u'characterOffsetBegin': 45,
  118. u'characterOffsetEnd': 48,
  119. u'index': 2,
  120. u'originalText': u'buy',
  121. u'word': u'buy',
  122. },
  123. {
  124. u'after': u'\n',
  125. u'before': u' ',
  126. u'characterOffsetBegin': 49,
  127. u'characterOffsetEnd': 51,
  128. u'index': 3,
  129. u'originalText': u'me',
  130. u'word': u'me',
  131. },
  132. {
  133. u'after': u' ',
  134. u'before': u'\n',
  135. u'characterOffsetBegin': 52,
  136. u'characterOffsetEnd': 55,
  137. u'index': 4,
  138. u'originalText': u'two',
  139. u'word': u'two',
  140. },
  141. {
  142. u'after': u' ',
  143. u'before': u' ',
  144. u'characterOffsetBegin': 56,
  145. u'characterOffsetEnd': 58,
  146. u'index': 5,
  147. u'originalText': u'of',
  148. u'word': u'of',
  149. },
  150. {
  151. u'after': u'',
  152. u'before': u' ',
  153. u'characterOffsetBegin': 59,
  154. u'characterOffsetEnd': 63,
  155. u'index': 6,
  156. u'originalText': u'them',
  157. u'word': u'them',
  158. },
  159. {
  160. u'after': u'\n',
  161. u'before': u'',
  162. u'characterOffsetBegin': 63,
  163. u'characterOffsetEnd': 64,
  164. u'index': 7,
  165. u'originalText': u'.',
  166. u'word': u'.',
  167. },
  168. ],
  169. },
  170. {
  171. u'index': 2,
  172. u'tokens': [
  173. {
  174. u'after': u'',
  175. u'before': u'\n',
  176. u'characterOffsetBegin': 65,
  177. u'characterOffsetEnd': 71,
  178. u'index': 1,
  179. u'originalText': u'Thanks',
  180. u'word': u'Thanks',
  181. },
  182. {
  183. u'after': u'',
  184. u'before': u'',
  185. u'characterOffsetBegin': 71,
  186. u'characterOffsetEnd': 72,
  187. u'index': 2,
  188. u'originalText': u'.',
  189. u'word': u'.',
  190. },
  191. ],
  192. },
  193. ]
  194. }
  195. corenlp_tokenizer.api_call = MagicMock(return_value=api_return_value)
  196. input_string = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\nThanks."
  197. expected_output = [
  198. u'Good',
  199. u'muffins',
  200. u'cost',
  201. u'$',
  202. u'3.88',
  203. u'in',
  204. u'New',
  205. u'York',
  206. u'.',
  207. u'Please',
  208. u'buy',
  209. u'me',
  210. u'two',
  211. u'of',
  212. u'them',
  213. u'.',
  214. u'Thanks',
  215. u'.',
  216. ]
  217. tokenized_output = list(corenlp_tokenizer.tokenize(input_string))
  218. corenlp_tokenizer.api_call.assert_called_once_with(
  219. 'Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\nThanks.',
  220. properties={'annotators': 'tokenize,ssplit'},
  221. )
  222. self.assertEqual(expected_output, tokenized_output)
  223. class TestTaggerAPI(TestCase):
  224. def test_pos_tagger(self):
  225. corenlp_tagger = corenlp.CoreNLPParser(tagtype='pos')
  226. api_return_value = {
  227. u'sentences': [
  228. {
  229. u'basicDependencies': [
  230. {
  231. u'dep': u'ROOT',
  232. u'dependent': 1,
  233. u'dependentGloss': u'What',
  234. u'governor': 0,
  235. u'governorGloss': u'ROOT',
  236. },
  237. {
  238. u'dep': u'cop',
  239. u'dependent': 2,
  240. u'dependentGloss': u'is',
  241. u'governor': 1,
  242. u'governorGloss': u'What',
  243. },
  244. {
  245. u'dep': u'det',
  246. u'dependent': 3,
  247. u'dependentGloss': u'the',
  248. u'governor': 4,
  249. u'governorGloss': u'airspeed',
  250. },
  251. {
  252. u'dep': u'nsubj',
  253. u'dependent': 4,
  254. u'dependentGloss': u'airspeed',
  255. u'governor': 1,
  256. u'governorGloss': u'What',
  257. },
  258. {
  259. u'dep': u'case',
  260. u'dependent': 5,
  261. u'dependentGloss': u'of',
  262. u'governor': 8,
  263. u'governorGloss': u'swallow',
  264. },
  265. {
  266. u'dep': u'det',
  267. u'dependent': 6,
  268. u'dependentGloss': u'an',
  269. u'governor': 8,
  270. u'governorGloss': u'swallow',
  271. },
  272. {
  273. u'dep': u'compound',
  274. u'dependent': 7,
  275. u'dependentGloss': u'unladen',
  276. u'governor': 8,
  277. u'governorGloss': u'swallow',
  278. },
  279. {
  280. u'dep': u'nmod',
  281. u'dependent': 8,
  282. u'dependentGloss': u'swallow',
  283. u'governor': 4,
  284. u'governorGloss': u'airspeed',
  285. },
  286. {
  287. u'dep': u'punct',
  288. u'dependent': 9,
  289. u'dependentGloss': u'?',
  290. u'governor': 1,
  291. u'governorGloss': u'What',
  292. },
  293. ],
  294. u'enhancedDependencies': [
  295. {
  296. u'dep': u'ROOT',
  297. u'dependent': 1,
  298. u'dependentGloss': u'What',
  299. u'governor': 0,
  300. u'governorGloss': u'ROOT',
  301. },
  302. {
  303. u'dep': u'cop',
  304. u'dependent': 2,
  305. u'dependentGloss': u'is',
  306. u'governor': 1,
  307. u'governorGloss': u'What',
  308. },
  309. {
  310. u'dep': u'det',
  311. u'dependent': 3,
  312. u'dependentGloss': u'the',
  313. u'governor': 4,
  314. u'governorGloss': u'airspeed',
  315. },
  316. {
  317. u'dep': u'nsubj',
  318. u'dependent': 4,
  319. u'dependentGloss': u'airspeed',
  320. u'governor': 1,
  321. u'governorGloss': u'What',
  322. },
  323. {
  324. u'dep': u'case',
  325. u'dependent': 5,
  326. u'dependentGloss': u'of',
  327. u'governor': 8,
  328. u'governorGloss': u'swallow',
  329. },
  330. {
  331. u'dep': u'det',
  332. u'dependent': 6,
  333. u'dependentGloss': u'an',
  334. u'governor': 8,
  335. u'governorGloss': u'swallow',
  336. },
  337. {
  338. u'dep': u'compound',
  339. u'dependent': 7,
  340. u'dependentGloss': u'unladen',
  341. u'governor': 8,
  342. u'governorGloss': u'swallow',
  343. },
  344. {
  345. u'dep': u'nmod:of',
  346. u'dependent': 8,
  347. u'dependentGloss': u'swallow',
  348. u'governor': 4,
  349. u'governorGloss': u'airspeed',
  350. },
  351. {
  352. u'dep': u'punct',
  353. u'dependent': 9,
  354. u'dependentGloss': u'?',
  355. u'governor': 1,
  356. u'governorGloss': u'What',
  357. },
  358. ],
  359. u'enhancedPlusPlusDependencies': [
  360. {
  361. u'dep': u'ROOT',
  362. u'dependent': 1,
  363. u'dependentGloss': u'What',
  364. u'governor': 0,
  365. u'governorGloss': u'ROOT',
  366. },
  367. {
  368. u'dep': u'cop',
  369. u'dependent': 2,
  370. u'dependentGloss': u'is',
  371. u'governor': 1,
  372. u'governorGloss': u'What',
  373. },
  374. {
  375. u'dep': u'det',
  376. u'dependent': 3,
  377. u'dependentGloss': u'the',
  378. u'governor': 4,
  379. u'governorGloss': u'airspeed',
  380. },
  381. {
  382. u'dep': u'nsubj',
  383. u'dependent': 4,
  384. u'dependentGloss': u'airspeed',
  385. u'governor': 1,
  386. u'governorGloss': u'What',
  387. },
  388. {
  389. u'dep': u'case',
  390. u'dependent': 5,
  391. u'dependentGloss': u'of',
  392. u'governor': 8,
  393. u'governorGloss': u'swallow',
  394. },
  395. {
  396. u'dep': u'det',
  397. u'dependent': 6,
  398. u'dependentGloss': u'an',
  399. u'governor': 8,
  400. u'governorGloss': u'swallow',
  401. },
  402. {
  403. u'dep': u'compound',
  404. u'dependent': 7,
  405. u'dependentGloss': u'unladen',
  406. u'governor': 8,
  407. u'governorGloss': u'swallow',
  408. },
  409. {
  410. u'dep': u'nmod:of',
  411. u'dependent': 8,
  412. u'dependentGloss': u'swallow',
  413. u'governor': 4,
  414. u'governorGloss': u'airspeed',
  415. },
  416. {
  417. u'dep': u'punct',
  418. u'dependent': 9,
  419. u'dependentGloss': u'?',
  420. u'governor': 1,
  421. u'governorGloss': u'What',
  422. },
  423. ],
  424. u'index': 0,
  425. u'parse': u'(ROOT\n (SBARQ\n (WHNP (WP What))\n (SQ (VBZ is)\n (NP\n (NP (DT the) (NN airspeed))\n (PP (IN of)\n (NP (DT an) (NN unladen) (NN swallow)))))\n (. ?)))',
  426. u'tokens': [
  427. {
  428. u'after': u' ',
  429. u'before': u'',
  430. u'characterOffsetBegin': 0,
  431. u'characterOffsetEnd': 4,
  432. u'index': 1,
  433. u'lemma': u'what',
  434. u'originalText': u'What',
  435. u'pos': u'WP',
  436. u'word': u'What',
  437. },
  438. {
  439. u'after': u' ',
  440. u'before': u' ',
  441. u'characterOffsetBegin': 5,
  442. u'characterOffsetEnd': 7,
  443. u'index': 2,
  444. u'lemma': u'be',
  445. u'originalText': u'is',
  446. u'pos': u'VBZ',
  447. u'word': u'is',
  448. },
  449. {
  450. u'after': u' ',
  451. u'before': u' ',
  452. u'characterOffsetBegin': 8,
  453. u'characterOffsetEnd': 11,
  454. u'index': 3,
  455. u'lemma': u'the',
  456. u'originalText': u'the',
  457. u'pos': u'DT',
  458. u'word': u'the',
  459. },
  460. {
  461. u'after': u' ',
  462. u'before': u' ',
  463. u'characterOffsetBegin': 12,
  464. u'characterOffsetEnd': 20,
  465. u'index': 4,
  466. u'lemma': u'airspeed',
  467. u'originalText': u'airspeed',
  468. u'pos': u'NN',
  469. u'word': u'airspeed',
  470. },
  471. {
  472. u'after': u' ',
  473. u'before': u' ',
  474. u'characterOffsetBegin': 21,
  475. u'characterOffsetEnd': 23,
  476. u'index': 5,
  477. u'lemma': u'of',
  478. u'originalText': u'of',
  479. u'pos': u'IN',
  480. u'word': u'of',
  481. },
  482. {
  483. u'after': u' ',
  484. u'before': u' ',
  485. u'characterOffsetBegin': 24,
  486. u'characterOffsetEnd': 26,
  487. u'index': 6,
  488. u'lemma': u'a',
  489. u'originalText': u'an',
  490. u'pos': u'DT',
  491. u'word': u'an',
  492. },
  493. {
  494. u'after': u' ',
  495. u'before': u' ',
  496. u'characterOffsetBegin': 27,
  497. u'characterOffsetEnd': 34,
  498. u'index': 7,
  499. u'lemma': u'unladen',
  500. u'originalText': u'unladen',
  501. u'pos': u'JJ',
  502. u'word': u'unladen',
  503. },
  504. {
  505. u'after': u' ',
  506. u'before': u' ',
  507. u'characterOffsetBegin': 35,
  508. u'characterOffsetEnd': 42,
  509. u'index': 8,
  510. u'lemma': u'swallow',
  511. u'originalText': u'swallow',
  512. u'pos': u'VB',
  513. u'word': u'swallow',
  514. },
  515. {
  516. u'after': u'',
  517. u'before': u' ',
  518. u'characterOffsetBegin': 43,
  519. u'characterOffsetEnd': 44,
  520. u'index': 9,
  521. u'lemma': u'?',
  522. u'originalText': u'?',
  523. u'pos': u'.',
  524. u'word': u'?',
  525. },
  526. ],
  527. }
  528. ]
  529. }
  530. corenlp_tagger.api_call = MagicMock(return_value=api_return_value)
  531. input_tokens = 'What is the airspeed of an unladen swallow ?'.split()
  532. expected_output = [
  533. ('What', 'WP'),
  534. ('is', 'VBZ'),
  535. ('the', 'DT'),
  536. ('airspeed', 'NN'),
  537. ('of', 'IN'),
  538. ('an', 'DT'),
  539. ('unladen', 'JJ'),
  540. ('swallow', 'VB'),
  541. ('?', '.'),
  542. ]
  543. tagged_output = corenlp_tagger.tag(input_tokens)
  544. corenlp_tagger.api_call.assert_called_once_with(
  545. 'What is the airspeed of an unladen swallow ?',
  546. properties={
  547. 'ssplit.isOneSentence': 'true',
  548. 'annotators': 'tokenize,ssplit,pos',
  549. },
  550. )
  551. self.assertEqual(expected_output, tagged_output)
  552. def test_ner_tagger(self):
  553. corenlp_tagger = corenlp.CoreNLPParser(tagtype='ner')
  554. api_return_value = {
  555. 'sentences': [
  556. {
  557. 'index': 0,
  558. 'tokens': [
  559. {
  560. 'after': ' ',
  561. 'before': '',
  562. 'characterOffsetBegin': 0,
  563. 'characterOffsetEnd': 4,
  564. 'index': 1,
  565. 'lemma': 'Rami',
  566. 'ner': 'PERSON',
  567. 'originalText': 'Rami',
  568. 'pos': 'NNP',
  569. 'word': 'Rami',
  570. },
  571. {
  572. 'after': ' ',
  573. 'before': ' ',
  574. 'characterOffsetBegin': 5,
  575. 'characterOffsetEnd': 8,
  576. 'index': 2,
  577. 'lemma': 'Eid',
  578. 'ner': 'PERSON',
  579. 'originalText': 'Eid',
  580. 'pos': 'NNP',
  581. 'word': 'Eid',
  582. },
  583. {
  584. 'after': ' ',
  585. 'before': ' ',
  586. 'characterOffsetBegin': 9,
  587. 'characterOffsetEnd': 11,
  588. 'index': 3,
  589. 'lemma': 'be',
  590. 'ner': 'O',
  591. 'originalText': 'is',
  592. 'pos': 'VBZ',
  593. 'word': 'is',
  594. },
  595. {
  596. 'after': ' ',
  597. 'before': ' ',
  598. 'characterOffsetBegin': 12,
  599. 'characterOffsetEnd': 20,
  600. 'index': 4,
  601. 'lemma': 'study',
  602. 'ner': 'O',
  603. 'originalText': 'studying',
  604. 'pos': 'VBG',
  605. 'word': 'studying',
  606. },
  607. {
  608. 'after': ' ',
  609. 'before': ' ',
  610. 'characterOffsetBegin': 21,
  611. 'characterOffsetEnd': 23,
  612. 'index': 5,
  613. 'lemma': 'at',
  614. 'ner': 'O',
  615. 'originalText': 'at',
  616. 'pos': 'IN',
  617. 'word': 'at',
  618. },
  619. {
  620. 'after': ' ',
  621. 'before': ' ',
  622. 'characterOffsetBegin': 24,
  623. 'characterOffsetEnd': 29,
  624. 'index': 6,
  625. 'lemma': 'Stony',
  626. 'ner': 'ORGANIZATION',
  627. 'originalText': 'Stony',
  628. 'pos': 'NNP',
  629. 'word': 'Stony',
  630. },
  631. {
  632. 'after': ' ',
  633. 'before': ' ',
  634. 'characterOffsetBegin': 30,
  635. 'characterOffsetEnd': 35,
  636. 'index': 7,
  637. 'lemma': 'Brook',
  638. 'ner': 'ORGANIZATION',
  639. 'originalText': 'Brook',
  640. 'pos': 'NNP',
  641. 'word': 'Brook',
  642. },
  643. {
  644. 'after': ' ',
  645. 'before': ' ',
  646. 'characterOffsetBegin': 36,
  647. 'characterOffsetEnd': 46,
  648. 'index': 8,
  649. 'lemma': 'University',
  650. 'ner': 'ORGANIZATION',
  651. 'originalText': 'University',
  652. 'pos': 'NNP',
  653. 'word': 'University',
  654. },
  655. {
  656. 'after': ' ',
  657. 'before': ' ',
  658. 'characterOffsetBegin': 47,
  659. 'characterOffsetEnd': 49,
  660. 'index': 9,
  661. 'lemma': 'in',
  662. 'ner': 'O',
  663. 'originalText': 'in',
  664. 'pos': 'IN',
  665. 'word': 'in',
  666. },
  667. {
  668. 'after': '',
  669. 'before': ' ',
  670. 'characterOffsetBegin': 50,
  671. 'characterOffsetEnd': 52,
  672. 'index': 10,
  673. 'lemma': 'NY',
  674. 'ner': 'O',
  675. 'originalText': 'NY',
  676. 'pos': 'NNP',
  677. 'word': 'NY',
  678. },
  679. ],
  680. }
  681. ]
  682. }
  683. corenlp_tagger.api_call = MagicMock(return_value=api_return_value)
  684. input_tokens = 'Rami Eid is studying at Stony Brook University in NY'.split()
  685. expected_output = [
  686. ('Rami', 'PERSON'),
  687. ('Eid', 'PERSON'),
  688. ('is', 'O'),
  689. ('studying', 'O'),
  690. ('at', 'O'),
  691. ('Stony', 'ORGANIZATION'),
  692. ('Brook', 'ORGANIZATION'),
  693. ('University', 'ORGANIZATION'),
  694. ('in', 'O'),
  695. ('NY', 'O'),
  696. ]
  697. tagged_output = corenlp_tagger.tag(input_tokens)
  698. corenlp_tagger.api_call.assert_called_once_with(
  699. 'Rami Eid is studying at Stony Brook University in NY',
  700. properties={
  701. 'ssplit.isOneSentence': 'true',
  702. 'annotators': 'tokenize,ssplit,ner',
  703. },
  704. )
  705. self.assertEqual(expected_output, tagged_output)
  706. def test_unexpected_tagtype(self):
  707. with self.assertRaises(ValueError):
  708. corenlp_tagger = corenlp.CoreNLPParser(tagtype='test')
  709. class TestParserAPI(TestCase):
  710. def test_parse(self):
  711. corenlp_parser = corenlp.CoreNLPParser()
  712. api_return_value = {
  713. 'sentences': [
  714. {
  715. 'basicDependencies': [
  716. {
  717. 'dep': 'ROOT',
  718. 'dependent': 4,
  719. 'dependentGloss': 'fox',
  720. 'governor': 0,
  721. 'governorGloss': 'ROOT',
  722. },
  723. {
  724. 'dep': 'det',
  725. 'dependent': 1,
  726. 'dependentGloss': 'The',
  727. 'governor': 4,
  728. 'governorGloss': 'fox',
  729. },
  730. {
  731. 'dep': 'amod',
  732. 'dependent': 2,
  733. 'dependentGloss': 'quick',
  734. 'governor': 4,
  735. 'governorGloss': 'fox',
  736. },
  737. {
  738. 'dep': 'amod',
  739. 'dependent': 3,
  740. 'dependentGloss': 'brown',
  741. 'governor': 4,
  742. 'governorGloss': 'fox',
  743. },
  744. {
  745. 'dep': 'dep',
  746. 'dependent': 5,
  747. 'dependentGloss': 'jumps',
  748. 'governor': 4,
  749. 'governorGloss': 'fox',
  750. },
  751. {
  752. 'dep': 'case',
  753. 'dependent': 6,
  754. 'dependentGloss': 'over',
  755. 'governor': 9,
  756. 'governorGloss': 'dog',
  757. },
  758. {
  759. 'dep': 'det',
  760. 'dependent': 7,
  761. 'dependentGloss': 'the',
  762. 'governor': 9,
  763. 'governorGloss': 'dog',
  764. },
  765. {
  766. 'dep': 'amod',
  767. 'dependent': 8,
  768. 'dependentGloss': 'lazy',
  769. 'governor': 9,
  770. 'governorGloss': 'dog',
  771. },
  772. {
  773. 'dep': 'nmod',
  774. 'dependent': 9,
  775. 'dependentGloss': 'dog',
  776. 'governor': 5,
  777. 'governorGloss': 'jumps',
  778. },
  779. ],
  780. 'enhancedDependencies': [
  781. {
  782. 'dep': 'ROOT',
  783. 'dependent': 4,
  784. 'dependentGloss': 'fox',
  785. 'governor': 0,
  786. 'governorGloss': 'ROOT',
  787. },
  788. {
  789. 'dep': 'det',
  790. 'dependent': 1,
  791. 'dependentGloss': 'The',
  792. 'governor': 4,
  793. 'governorGloss': 'fox',
  794. },
  795. {
  796. 'dep': 'amod',
  797. 'dependent': 2,
  798. 'dependentGloss': 'quick',
  799. 'governor': 4,
  800. 'governorGloss': 'fox',
  801. },
  802. {
  803. 'dep': 'amod',
  804. 'dependent': 3,
  805. 'dependentGloss': 'brown',
  806. 'governor': 4,
  807. 'governorGloss': 'fox',
  808. },
  809. {
  810. 'dep': 'dep',
  811. 'dependent': 5,
  812. 'dependentGloss': 'jumps',
  813. 'governor': 4,
  814. 'governorGloss': 'fox',
  815. },
  816. {
  817. 'dep': 'case',
  818. 'dependent': 6,
  819. 'dependentGloss': 'over',
  820. 'governor': 9,
  821. 'governorGloss': 'dog',
  822. },
  823. {
  824. 'dep': 'det',
  825. 'dependent': 7,
  826. 'dependentGloss': 'the',
  827. 'governor': 9,
  828. 'governorGloss': 'dog',
  829. },
  830. {
  831. 'dep': 'amod',
  832. 'dependent': 8,
  833. 'dependentGloss': 'lazy',
  834. 'governor': 9,
  835. 'governorGloss': 'dog',
  836. },
  837. {
  838. 'dep': 'nmod:over',
  839. 'dependent': 9,
  840. 'dependentGloss': 'dog',
  841. 'governor': 5,
  842. 'governorGloss': 'jumps',
  843. },
  844. ],
  845. 'enhancedPlusPlusDependencies': [
  846. {
  847. 'dep': 'ROOT',
  848. 'dependent': 4,
  849. 'dependentGloss': 'fox',
  850. 'governor': 0,
  851. 'governorGloss': 'ROOT',
  852. },
  853. {
  854. 'dep': 'det',
  855. 'dependent': 1,
  856. 'dependentGloss': 'The',
  857. 'governor': 4,
  858. 'governorGloss': 'fox',
  859. },
  860. {
  861. 'dep': 'amod',
  862. 'dependent': 2,
  863. 'dependentGloss': 'quick',
  864. 'governor': 4,
  865. 'governorGloss': 'fox',
  866. },
  867. {
  868. 'dep': 'amod',
  869. 'dependent': 3,
  870. 'dependentGloss': 'brown',
  871. 'governor': 4,
  872. 'governorGloss': 'fox',
  873. },
  874. {
  875. 'dep': 'dep',
  876. 'dependent': 5,
  877. 'dependentGloss': 'jumps',
  878. 'governor': 4,
  879. 'governorGloss': 'fox',
  880. },
  881. {
  882. 'dep': 'case',
  883. 'dependent': 6,
  884. 'dependentGloss': 'over',
  885. 'governor': 9,
  886. 'governorGloss': 'dog',
  887. },
  888. {
  889. 'dep': 'det',
  890. 'dependent': 7,
  891. 'dependentGloss': 'the',
  892. 'governor': 9,
  893. 'governorGloss': 'dog',
  894. },
  895. {
  896. 'dep': 'amod',
  897. 'dependent': 8,
  898. 'dependentGloss': 'lazy',
  899. 'governor': 9,
  900. 'governorGloss': 'dog',
  901. },
  902. {
  903. 'dep': 'nmod:over',
  904. 'dependent': 9,
  905. 'dependentGloss': 'dog',
  906. 'governor': 5,
  907. 'governorGloss': 'jumps',
  908. },
  909. ],
  910. 'index': 0,
  911. 'parse': '(ROOT\n (NP\n (NP (DT The) (JJ quick) (JJ brown) (NN fox))\n (NP\n (NP (NNS jumps))\n (PP (IN over)\n (NP (DT the) (JJ lazy) (NN dog))))))',
  912. 'tokens': [
  913. {
  914. 'after': ' ',
  915. 'before': '',
  916. 'characterOffsetBegin': 0,
  917. 'characterOffsetEnd': 3,
  918. 'index': 1,
  919. 'lemma': 'the',
  920. 'originalText': 'The',
  921. 'pos': 'DT',
  922. 'word': 'The',
  923. },
  924. {
  925. 'after': ' ',
  926. 'before': ' ',
  927. 'characterOffsetBegin': 4,
  928. 'characterOffsetEnd': 9,
  929. 'index': 2,
  930. 'lemma': 'quick',
  931. 'originalText': 'quick',
  932. 'pos': 'JJ',
  933. 'word': 'quick',
  934. },
  935. {
  936. 'after': ' ',
  937. 'before': ' ',
  938. 'characterOffsetBegin': 10,
  939. 'characterOffsetEnd': 15,
  940. 'index': 3,
  941. 'lemma': 'brown',
  942. 'originalText': 'brown',
  943. 'pos': 'JJ',
  944. 'word': 'brown',
  945. },
  946. {
  947. 'after': ' ',
  948. 'before': ' ',
  949. 'characterOffsetBegin': 16,
  950. 'characterOffsetEnd': 19,
  951. 'index': 4,
  952. 'lemma': 'fox',
  953. 'originalText': 'fox',
  954. 'pos': 'NN',
  955. 'word': 'fox',
  956. },
  957. {
  958. 'after': ' ',
  959. 'before': ' ',
  960. 'characterOffsetBegin': 20,
  961. 'characterOffsetEnd': 25,
  962. 'index': 5,
  963. 'lemma': 'jump',
  964. 'originalText': 'jumps',
  965. 'pos': 'VBZ',
  966. 'word': 'jumps',
  967. },
  968. {
  969. 'after': ' ',
  970. 'before': ' ',
  971. 'characterOffsetBegin': 26,
  972. 'characterOffsetEnd': 30,
  973. 'index': 6,
  974. 'lemma': 'over',
  975. 'originalText': 'over',
  976. 'pos': 'IN',
  977. 'word': 'over',
  978. },
  979. {
  980. 'after': ' ',
  981. 'before': ' ',
  982. 'characterOffsetBegin': 31,
  983. 'characterOffsetEnd': 34,
  984. 'index': 7,
  985. 'lemma': 'the',
  986. 'originalText': 'the',
  987. 'pos': 'DT',
  988. 'word': 'the',
  989. },
  990. {
  991. 'after': ' ',
  992. 'before': ' ',
  993. 'characterOffsetBegin': 35,
  994. 'characterOffsetEnd': 39,
  995. 'index': 8,
  996. 'lemma': 'lazy',
  997. 'originalText': 'lazy',
  998. 'pos': 'JJ',
  999. 'word': 'lazy',
  1000. },
  1001. {
  1002. 'after': '',
  1003. 'before': ' ',
  1004. 'characterOffsetBegin': 40,
  1005. 'characterOffsetEnd': 43,
  1006. 'index': 9,
  1007. 'lemma': 'dog',
  1008. 'originalText': 'dog',
  1009. 'pos': 'NN',
  1010. 'word': 'dog',
  1011. },
  1012. ],
  1013. }
  1014. ]
  1015. }
  1016. corenlp_parser.api_call = MagicMock(return_value=api_return_value)
  1017. input_string = "The quick brown fox jumps over the lazy dog".split()
  1018. expected_output = Tree(
  1019. 'ROOT',
  1020. [
  1021. Tree(
  1022. 'NP',
  1023. [
  1024. Tree(
  1025. 'NP',
  1026. [
  1027. Tree('DT', ['The']),
  1028. Tree('JJ', ['quick']),
  1029. Tree('JJ', ['brown']),
  1030. Tree('NN', ['fox']),
  1031. ],
  1032. ),
  1033. Tree(
  1034. 'NP',
  1035. [
  1036. Tree('NP', [Tree('NNS', ['jumps'])]),
  1037. Tree(
  1038. 'PP',
  1039. [
  1040. Tree('IN', ['over']),
  1041. Tree(
  1042. 'NP',
  1043. [
  1044. Tree('DT', ['the']),
  1045. Tree('JJ', ['lazy']),
  1046. Tree('NN', ['dog']),
  1047. ],
  1048. ),
  1049. ],
  1050. ),
  1051. ],
  1052. ),
  1053. ],
  1054. )
  1055. ],
  1056. )
  1057. parsed_data = next(corenlp_parser.parse(input_string))
  1058. corenlp_parser.api_call.assert_called_once_with(
  1059. "The quick brown fox jumps over the lazy dog",
  1060. properties={'ssplit.eolonly': 'true'},
  1061. )
  1062. self.assertEqual(expected_output, parsed_data)
  1063. def test_dependency_parser(self):
  1064. corenlp_parser = corenlp.CoreNLPDependencyParser()
  1065. api_return_value = {
  1066. 'sentences': [
  1067. {
  1068. 'basicDependencies': [
  1069. {
  1070. 'dep': 'ROOT',
  1071. 'dependent': 5,
  1072. 'dependentGloss': 'jumps',
  1073. 'governor': 0,
  1074. 'governorGloss': 'ROOT',
  1075. },
  1076. {
  1077. 'dep': 'det',
  1078. 'dependent': 1,
  1079. 'dependentGloss': 'The',
  1080. 'governor': 4,
  1081. 'governorGloss': 'fox',
  1082. },
  1083. {
  1084. 'dep': 'amod',
  1085. 'dependent': 2,
  1086. 'dependentGloss': 'quick',
  1087. 'governor': 4,
  1088. 'governorGloss': 'fox',
  1089. },
  1090. {
  1091. 'dep': 'amod',
  1092. 'dependent': 3,
  1093. 'dependentGloss': 'brown',
  1094. 'governor': 4,
  1095. 'governorGloss': 'fox',
  1096. },
  1097. {
  1098. 'dep': 'nsubj',
  1099. 'dependent': 4,
  1100. 'dependentGloss': 'fox',
  1101. 'governor': 5,
  1102. 'governorGloss': 'jumps',
  1103. },
  1104. {
  1105. 'dep': 'case',
  1106. 'dependent': 6,
  1107. 'dependentGloss': 'over',
  1108. 'governor': 9,
  1109. 'governorGloss': 'dog',
  1110. },
  1111. {
  1112. 'dep': 'det',
  1113. 'dependent': 7,
  1114. 'dependentGloss': 'the',
  1115. 'governor': 9,
  1116. 'governorGloss': 'dog',
  1117. },
  1118. {
  1119. 'dep': 'amod',
  1120. 'dependent': 8,
  1121. 'dependentGloss': 'lazy',
  1122. 'governor': 9,
  1123. 'governorGloss': 'dog',
  1124. },
  1125. {
  1126. 'dep': 'nmod',
  1127. 'dependent': 9,
  1128. 'dependentGloss': 'dog',
  1129. 'governor': 5,
  1130. 'governorGloss': 'jumps',
  1131. },
  1132. ],
  1133. 'enhancedDependencies': [
  1134. {
  1135. 'dep': 'ROOT',
  1136. 'dependent': 5,
  1137. 'dependentGloss': 'jumps',
  1138. 'governor': 0,
  1139. 'governorGloss': 'ROOT',
  1140. },
  1141. {
  1142. 'dep': 'det',
  1143. 'dependent': 1,
  1144. 'dependentGloss': 'The',
  1145. 'governor': 4,
  1146. 'governorGloss': 'fox',
  1147. },
  1148. {
  1149. 'dep': 'amod',
  1150. 'dependent': 2,
  1151. 'dependentGloss': 'quick',
  1152. 'governor': 4,
  1153. 'governorGloss': 'fox',
  1154. },
  1155. {
  1156. 'dep': 'amod',
  1157. 'dependent': 3,
  1158. 'dependentGloss': 'brown',
  1159. 'governor': 4,
  1160. 'governorGloss': 'fox',
  1161. },
  1162. {
  1163. 'dep': 'nsubj',
  1164. 'dependent': 4,
  1165. 'dependentGloss': 'fox',
  1166. 'governor': 5,
  1167. 'governorGloss': 'jumps',
  1168. },
  1169. {
  1170. 'dep': 'case',
  1171. 'dependent': 6,
  1172. 'dependentGloss': 'over',
  1173. 'governor': 9,
  1174. 'governorGloss': 'dog',
  1175. },
  1176. {
  1177. 'dep': 'det',
  1178. 'dependent': 7,
  1179. 'dependentGloss': 'the',
  1180. 'governor': 9,
  1181. 'governorGloss': 'dog',
  1182. },
  1183. {
  1184. 'dep': 'amod',
  1185. 'dependent': 8,
  1186. 'dependentGloss': 'lazy',
  1187. 'governor': 9,
  1188. 'governorGloss': 'dog',
  1189. },
  1190. {
  1191. 'dep': 'nmod:over',
  1192. 'dependent': 9,
  1193. 'dependentGloss': 'dog',
  1194. 'governor': 5,
  1195. 'governorGloss': 'jumps',
  1196. },
  1197. ],
  1198. 'enhancedPlusPlusDependencies': [
  1199. {
  1200. 'dep': 'ROOT',
  1201. 'dependent': 5,
  1202. 'dependentGloss': 'jumps',
  1203. 'governor': 0,
  1204. 'governorGloss': 'ROOT',
  1205. },
  1206. {
  1207. 'dep': 'det',
  1208. 'dependent': 1,
  1209. 'dependentGloss': 'The',
  1210. 'governor': 4,
  1211. 'governorGloss': 'fox',
  1212. },
  1213. {
  1214. 'dep': 'amod',
  1215. 'dependent': 2,
  1216. 'dependentGloss': 'quick',
  1217. 'governor': 4,
  1218. 'governorGloss': 'fox',
  1219. },
  1220. {
  1221. 'dep': 'amod',
  1222. 'dependent': 3,
  1223. 'dependentGloss': 'brown',
  1224. 'governor': 4,
  1225. 'governorGloss': 'fox',
  1226. },
  1227. {
  1228. 'dep': 'nsubj',
  1229. 'dependent': 4,
  1230. 'dependentGloss': 'fox',
  1231. 'governor': 5,
  1232. 'governorGloss': 'jumps',
  1233. },
  1234. {
  1235. 'dep': 'case',
  1236. 'dependent': 6,
  1237. 'dependentGloss': 'over',
  1238. 'governor': 9,
  1239. 'governorGloss': 'dog',
  1240. },
  1241. {
  1242. 'dep': 'det',
  1243. 'dependent': 7,
  1244. 'dependentGloss': 'the',
  1245. 'governor': 9,
  1246. 'governorGloss': 'dog',
  1247. },
  1248. {
  1249. 'dep': 'amod',
  1250. 'dependent': 8,
  1251. 'dependentGloss': 'lazy',
  1252. 'governor': 9,
  1253. 'governorGloss': 'dog',
  1254. },
  1255. {
  1256. 'dep': 'nmod:over',
  1257. 'dependent': 9,
  1258. 'dependentGloss': 'dog',
  1259. 'governor': 5,
  1260. 'governorGloss': 'jumps',
  1261. },
  1262. ],
  1263. 'index': 0,
  1264. 'tokens': [
  1265. {
  1266. 'after': ' ',
  1267. 'before': '',
  1268. 'characterOffsetBegin': 0,
  1269. 'characterOffsetEnd': 3,
  1270. 'index': 1,
  1271. 'lemma': 'the',
  1272. 'originalText': 'The',
  1273. 'pos': 'DT',
  1274. 'word': 'The',
  1275. },
  1276. {
  1277. 'after': ' ',
  1278. 'before': ' ',
  1279. 'characterOffsetBegin': 4,
  1280. 'characterOffsetEnd': 9,
  1281. 'index': 2,
  1282. 'lemma': 'quick',
  1283. 'originalText': 'quick',
  1284. 'pos': 'JJ',
  1285. 'word': 'quick',
  1286. },
  1287. {
  1288. 'after': ' ',
  1289. 'before': ' ',
  1290. 'characterOffsetBegin': 10,
  1291. 'characterOffsetEnd': 15,
  1292. 'index': 3,
  1293. 'lemma': 'brown',
  1294. 'originalText': 'brown',
  1295. 'pos': 'JJ',
  1296. 'word': 'brown',
  1297. },
  1298. {
  1299. 'after': ' ',
  1300. 'before': ' ',
  1301. 'characterOffsetBegin': 16,
  1302. 'characterOffsetEnd': 19,
  1303. 'index': 4,
  1304. 'lemma': 'fox',
  1305. 'originalText': 'fox',
  1306. 'pos': 'NN',
  1307. 'word': 'fox',
  1308. },
  1309. {
  1310. 'after': ' ',
  1311. 'before': ' ',
  1312. 'characterOffsetBegin': 20,
  1313. 'characterOffsetEnd': 25,
  1314. 'index': 5,
  1315. 'lemma': 'jump',
  1316. 'originalText': 'jumps',
  1317. 'pos': 'VBZ',
  1318. 'word': 'jumps',
  1319. },
  1320. {
  1321. 'after': ' ',
  1322. 'before': ' ',
  1323. 'characterOffsetBegin': 26,
  1324. 'characterOffsetEnd': 30,
  1325. 'index': 6,
  1326. 'lemma': 'over',
  1327. 'originalText': 'over',
  1328. 'pos': 'IN',
  1329. 'word': 'over',
  1330. },
  1331. {
  1332. 'after': ' ',
  1333. 'before': ' ',
  1334. 'characterOffsetBegin': 31,
  1335. 'characterOffsetEnd': 34,
  1336. 'index': 7,
  1337. 'lemma': 'the',
  1338. 'originalText': 'the',
  1339. 'pos': 'DT',
  1340. 'word': 'the',
  1341. },
  1342. {
  1343. 'after': ' ',
  1344. 'before': ' ',
  1345. 'characterOffsetBegin': 35,
  1346. 'characterOffsetEnd': 39,
  1347. 'index': 8,
  1348. 'lemma': 'lazy',
  1349. 'originalText': 'lazy',
  1350. 'pos': 'JJ',
  1351. 'word': 'lazy',
  1352. },
  1353. {
  1354. 'after': '',
  1355. 'before': ' ',
  1356. 'characterOffsetBegin': 40,
  1357. 'characterOffsetEnd': 43,
  1358. 'index': 9,
  1359. 'lemma': 'dog',
  1360. 'originalText': 'dog',
  1361. 'pos': 'NN',
  1362. 'word': 'dog',
  1363. },
  1364. ],
  1365. }
  1366. ]
  1367. }
  1368. corenlp_parser.api_call = MagicMock(return_value=api_return_value)
  1369. input_string = "The quick brown fox jumps over the lazy dog".split()
  1370. expected_output = Tree(
  1371. 'jumps',
  1372. [
  1373. Tree('fox', ['The', 'quick', 'brown']),
  1374. Tree('dog', ['over', 'the', 'lazy']),
  1375. ],
  1376. )
  1377. parsed_data = next(corenlp_parser.parse(input_string))
  1378. corenlp_parser.api_call.assert_called_once_with(
  1379. "The quick brown fox jumps over the lazy dog",
  1380. properties={'ssplit.eolonly': 'true'},
  1381. )
  1382. self.assertEqual(expected_output, parsed_data.tree())