test_tgrep.py 30 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787
  1. #!/usr/bin/env python
  2. # -*- coding: utf-8 -*-
  3. #
  4. # Natural Language Toolkit: TGrep search
  5. #
  6. # Copyright (C) 2001-2020 NLTK Project
  7. # Author: Will Roberts <wildwilhelm@gmail.com>
  8. # URL: <http://nltk.org/>
  9. # For license information, see LICENSE.TXT
  10. '''
  11. Unit tests for nltk.tgrep.
  12. '''
  13. import unittest
  14. from nltk.tree import ParentedTree
  15. from nltk import tgrep
  16. class TestSequenceFunctions(unittest.TestCase):
  17. '''
  18. Class containing unit tests for nltk.tgrep.
  19. '''
  20. def test_tokenize_simple(self):
  21. '''
  22. Simple test of tokenization.
  23. '''
  24. tokens = tgrep.tgrep_tokenize('A .. (B !< C . D) | ![<< (E , F) $ G]')
  25. self.assertEqual(
  26. tokens,
  27. [
  28. 'A',
  29. '..',
  30. '(',
  31. 'B',
  32. '!',
  33. '<',
  34. 'C',
  35. '.',
  36. 'D',
  37. ')',
  38. '|',
  39. '!',
  40. '[',
  41. '<<',
  42. '(',
  43. 'E',
  44. ',',
  45. 'F',
  46. ')',
  47. '$',
  48. 'G',
  49. ']',
  50. ],
  51. )
  52. def test_tokenize_encoding(self):
  53. '''
  54. Test that tokenization handles bytes and strs the same way.
  55. '''
  56. self.assertEqual(
  57. tgrep.tgrep_tokenize(b'A .. (B !< C . D) | ![<< (E , F) $ G]'),
  58. tgrep.tgrep_tokenize('A .. (B !< C . D) | ![<< (E , F) $ G]'),
  59. )
  60. def test_tokenize_link_types(self):
  61. '''
  62. Test tokenization of basic link types.
  63. '''
  64. self.assertEqual(tgrep.tgrep_tokenize('A<B'), ['A', '<', 'B'])
  65. self.assertEqual(tgrep.tgrep_tokenize('A>B'), ['A', '>', 'B'])
  66. self.assertEqual(tgrep.tgrep_tokenize('A<3B'), ['A', '<3', 'B'])
  67. self.assertEqual(tgrep.tgrep_tokenize('A>3B'), ['A', '>3', 'B'])
  68. self.assertEqual(tgrep.tgrep_tokenize('A<,B'), ['A', '<,', 'B'])
  69. self.assertEqual(tgrep.tgrep_tokenize('A>,B'), ['A', '>,', 'B'])
  70. self.assertEqual(tgrep.tgrep_tokenize('A<-3B'), ['A', '<-3', 'B'])
  71. self.assertEqual(tgrep.tgrep_tokenize('A>-3B'), ['A', '>-3', 'B'])
  72. self.assertEqual(tgrep.tgrep_tokenize('A<-B'), ['A', '<-', 'B'])
  73. self.assertEqual(tgrep.tgrep_tokenize('A>-B'), ['A', '>-', 'B'])
  74. self.assertEqual(tgrep.tgrep_tokenize('A<\'B'), ['A', '<\'', 'B'])
  75. self.assertEqual(tgrep.tgrep_tokenize('A>\'B'), ['A', '>\'', 'B'])
  76. self.assertEqual(tgrep.tgrep_tokenize('A<:B'), ['A', '<:', 'B'])
  77. self.assertEqual(tgrep.tgrep_tokenize('A>:B'), ['A', '>:', 'B'])
  78. self.assertEqual(tgrep.tgrep_tokenize('A<<B'), ['A', '<<', 'B'])
  79. self.assertEqual(tgrep.tgrep_tokenize('A>>B'), ['A', '>>', 'B'])
  80. self.assertEqual(tgrep.tgrep_tokenize('A<<,B'), ['A', '<<,', 'B'])
  81. self.assertEqual(tgrep.tgrep_tokenize('A>>,B'), ['A', '>>,', 'B'])
  82. self.assertEqual(tgrep.tgrep_tokenize('A<<\'B'), ['A', '<<\'', 'B'])
  83. self.assertEqual(tgrep.tgrep_tokenize('A>>\'B'), ['A', '>>\'', 'B'])
  84. self.assertEqual(tgrep.tgrep_tokenize('A<<:B'), ['A', '<<:', 'B'])
  85. self.assertEqual(tgrep.tgrep_tokenize('A>>:B'), ['A', '>>:', 'B'])
  86. self.assertEqual(tgrep.tgrep_tokenize('A.B'), ['A', '.', 'B'])
  87. self.assertEqual(tgrep.tgrep_tokenize('A,B'), ['A', ',', 'B'])
  88. self.assertEqual(tgrep.tgrep_tokenize('A..B'), ['A', '..', 'B'])
  89. self.assertEqual(tgrep.tgrep_tokenize('A,,B'), ['A', ',,', 'B'])
  90. self.assertEqual(tgrep.tgrep_tokenize('A$B'), ['A', '$', 'B'])
  91. self.assertEqual(tgrep.tgrep_tokenize('A$.B'), ['A', '$.', 'B'])
  92. self.assertEqual(tgrep.tgrep_tokenize('A$,B'), ['A', '$,', 'B'])
  93. self.assertEqual(tgrep.tgrep_tokenize('A$..B'), ['A', '$..', 'B'])
  94. self.assertEqual(tgrep.tgrep_tokenize('A$,,B'), ['A', '$,,', 'B'])
  95. self.assertEqual(tgrep.tgrep_tokenize('A!<B'), ['A', '!', '<', 'B'])
  96. self.assertEqual(tgrep.tgrep_tokenize('A!>B'), ['A', '!', '>', 'B'])
  97. self.assertEqual(tgrep.tgrep_tokenize('A!<3B'), ['A', '!', '<3', 'B'])
  98. self.assertEqual(tgrep.tgrep_tokenize('A!>3B'), ['A', '!', '>3', 'B'])
  99. self.assertEqual(tgrep.tgrep_tokenize('A!<,B'), ['A', '!', '<,', 'B'])
  100. self.assertEqual(tgrep.tgrep_tokenize('A!>,B'), ['A', '!', '>,', 'B'])
  101. self.assertEqual(tgrep.tgrep_tokenize('A!<-3B'), ['A', '!', '<-3', 'B'])
  102. self.assertEqual(tgrep.tgrep_tokenize('A!>-3B'), ['A', '!', '>-3', 'B'])
  103. self.assertEqual(tgrep.tgrep_tokenize('A!<-B'), ['A', '!', '<-', 'B'])
  104. self.assertEqual(tgrep.tgrep_tokenize('A!>-B'), ['A', '!', '>-', 'B'])
  105. self.assertEqual(tgrep.tgrep_tokenize('A!<\'B'), ['A', '!', '<\'', 'B'])
  106. self.assertEqual(tgrep.tgrep_tokenize('A!>\'B'), ['A', '!', '>\'', 'B'])
  107. self.assertEqual(tgrep.tgrep_tokenize('A!<:B'), ['A', '!', '<:', 'B'])
  108. self.assertEqual(tgrep.tgrep_tokenize('A!>:B'), ['A', '!', '>:', 'B'])
  109. self.assertEqual(tgrep.tgrep_tokenize('A!<<B'), ['A', '!', '<<', 'B'])
  110. self.assertEqual(tgrep.tgrep_tokenize('A!>>B'), ['A', '!', '>>', 'B'])
  111. self.assertEqual(tgrep.tgrep_tokenize('A!<<,B'), ['A', '!', '<<,', 'B'])
  112. self.assertEqual(tgrep.tgrep_tokenize('A!>>,B'), ['A', '!', '>>,', 'B'])
  113. self.assertEqual(tgrep.tgrep_tokenize('A!<<\'B'), ['A', '!', '<<\'', 'B'])
  114. self.assertEqual(tgrep.tgrep_tokenize('A!>>\'B'), ['A', '!', '>>\'', 'B'])
  115. self.assertEqual(tgrep.tgrep_tokenize('A!<<:B'), ['A', '!', '<<:', 'B'])
  116. self.assertEqual(tgrep.tgrep_tokenize('A!>>:B'), ['A', '!', '>>:', 'B'])
  117. self.assertEqual(tgrep.tgrep_tokenize('A!.B'), ['A', '!', '.', 'B'])
  118. self.assertEqual(tgrep.tgrep_tokenize('A!,B'), ['A', '!', ',', 'B'])
  119. self.assertEqual(tgrep.tgrep_tokenize('A!..B'), ['A', '!', '..', 'B'])
  120. self.assertEqual(tgrep.tgrep_tokenize('A!,,B'), ['A', '!', ',,', 'B'])
  121. self.assertEqual(tgrep.tgrep_tokenize('A!$B'), ['A', '!', '$', 'B'])
  122. self.assertEqual(tgrep.tgrep_tokenize('A!$.B'), ['A', '!', '$.', 'B'])
  123. self.assertEqual(tgrep.tgrep_tokenize('A!$,B'), ['A', '!', '$,', 'B'])
  124. self.assertEqual(tgrep.tgrep_tokenize('A!$..B'), ['A', '!', '$..', 'B'])
  125. self.assertEqual(tgrep.tgrep_tokenize('A!$,,B'), ['A', '!', '$,,', 'B'])
  126. def test_tokenize_examples(self):
  127. '''
  128. Test tokenization of the TGrep2 manual example patterns.
  129. '''
  130. self.assertEqual(tgrep.tgrep_tokenize('NP < PP'), ['NP', '<', 'PP'])
  131. self.assertEqual(tgrep.tgrep_tokenize('/^NP/'), ['/^NP/'])
  132. self.assertEqual(
  133. tgrep.tgrep_tokenize('NP << PP . VP'), ['NP', '<<', 'PP', '.', 'VP']
  134. )
  135. self.assertEqual(
  136. tgrep.tgrep_tokenize('NP << PP | . VP'), ['NP', '<<', 'PP', '|', '.', 'VP']
  137. )
  138. self.assertEqual(
  139. tgrep.tgrep_tokenize('NP !<< PP [> NP | >> VP]'),
  140. ['NP', '!', '<<', 'PP', '[', '>', 'NP', '|', '>>', 'VP', ']'],
  141. )
  142. self.assertEqual(
  143. tgrep.tgrep_tokenize('NP << (PP . VP)'),
  144. ['NP', '<<', '(', 'PP', '.', 'VP', ')'],
  145. )
  146. self.assertEqual(
  147. tgrep.tgrep_tokenize('NP <\' (PP <, (IN < on))'),
  148. ['NP', '<\'', '(', 'PP', '<,', '(', 'IN', '<', 'on', ')', ')'],
  149. )
  150. self.assertEqual(
  151. tgrep.tgrep_tokenize('S < (A < B) < C'),
  152. ['S', '<', '(', 'A', '<', 'B', ')', '<', 'C'],
  153. )
  154. self.assertEqual(
  155. tgrep.tgrep_tokenize('S < ((A < B) < C)'),
  156. ['S', '<', '(', '(', 'A', '<', 'B', ')', '<', 'C', ')'],
  157. )
  158. self.assertEqual(
  159. tgrep.tgrep_tokenize('S < (A < B < C)'),
  160. ['S', '<', '(', 'A', '<', 'B', '<', 'C', ')'],
  161. )
  162. self.assertEqual(tgrep.tgrep_tokenize('A<B&.C'), ['A', '<', 'B', '&', '.', 'C'])
  163. def test_tokenize_quoting(self):
  164. '''
  165. Test tokenization of quoting.
  166. '''
  167. self.assertEqual(
  168. tgrep.tgrep_tokenize('"A<<:B"<<:"A $.. B"<"A>3B"<C'),
  169. ['"A<<:B"', '<<:', '"A $.. B"', '<', '"A>3B"', '<', 'C'],
  170. )
  171. def test_tokenize_nodenames(self):
  172. '''
  173. Test tokenization of node names.
  174. '''
  175. self.assertEqual(tgrep.tgrep_tokenize('Robert'), ['Robert'])
  176. self.assertEqual(tgrep.tgrep_tokenize('/^[Bb]ob/'), ['/^[Bb]ob/'])
  177. self.assertEqual(tgrep.tgrep_tokenize('*'), ['*'])
  178. self.assertEqual(tgrep.tgrep_tokenize('__'), ['__'])
  179. # test tokenization of NLTK tree position syntax
  180. self.assertEqual(tgrep.tgrep_tokenize('N()'), ['N(', ')'])
  181. self.assertEqual(tgrep.tgrep_tokenize('N(0,)'), ['N(', '0', ',', ')'])
  182. self.assertEqual(tgrep.tgrep_tokenize('N(0,0)'), ['N(', '0', ',', '0', ')'])
  183. self.assertEqual(
  184. tgrep.tgrep_tokenize('N(0,0,)'), ['N(', '0', ',', '0', ',', ')']
  185. )
  186. def test_tokenize_macros(self):
  187. '''
  188. Test tokenization of macro definitions.
  189. '''
  190. self.assertEqual(
  191. tgrep.tgrep_tokenize(
  192. '@ NP /^NP/;\n@ NN /^NN/;\n@NP [!< NP | < @NN] !$.. @NN'
  193. ),
  194. [
  195. '@',
  196. 'NP',
  197. '/^NP/',
  198. ';',
  199. '@',
  200. 'NN',
  201. '/^NN/',
  202. ';',
  203. '@NP',
  204. '[',
  205. '!',
  206. '<',
  207. 'NP',
  208. '|',
  209. '<',
  210. '@NN',
  211. ']',
  212. '!',
  213. '$..',
  214. '@NN',
  215. ],
  216. )
  217. def test_node_simple(self):
  218. '''
  219. Test a simple use of tgrep for finding nodes matching a given
  220. pattern.
  221. '''
  222. tree = ParentedTree.fromstring(
  223. '(S (NP (DT the) (JJ big) (NN dog)) ' '(VP bit) (NP (DT a) (NN cat)))'
  224. )
  225. self.assertEqual(list(tgrep.tgrep_positions('NN', [tree])), [[(0, 2), (2, 1)]])
  226. self.assertEqual(
  227. list(tgrep.tgrep_nodes('NN', [tree])), [[tree[0, 2], tree[2, 1]]]
  228. )
  229. self.assertEqual(
  230. list(tgrep.tgrep_positions('NN|JJ', [tree])), [[(0, 1), (0, 2), (2, 1)]]
  231. )
  232. def test_node_printing(self):
  233. '''Test that the tgrep print operator ' is properly ignored.'''
  234. tree = ParentedTree.fromstring('(S (n x) (N x))')
  235. self.assertEqual(
  236. list(tgrep.tgrep_positions('N', [tree])),
  237. list(tgrep.tgrep_positions('\'N', [tree])),
  238. )
  239. self.assertEqual(
  240. list(tgrep.tgrep_positions('/[Nn]/', [tree])),
  241. list(tgrep.tgrep_positions('\'/[Nn]/', [tree])),
  242. )
  243. def test_node_encoding(self):
  244. '''
  245. Test that tgrep search strings handles bytes and strs the same
  246. way.
  247. '''
  248. tree = ParentedTree.fromstring(
  249. '(S (NP (DT the) (JJ big) (NN dog)) ' '(VP bit) (NP (DT a) (NN cat)))'
  250. )
  251. self.assertEqual(
  252. list(tgrep.tgrep_positions(b'NN', [tree])),
  253. list(tgrep.tgrep_positions(b'NN', [tree])),
  254. )
  255. self.assertEqual(
  256. list(tgrep.tgrep_nodes(b'NN', [tree])),
  257. list(tgrep.tgrep_nodes('NN', [tree])),
  258. )
  259. self.assertEqual(
  260. list(tgrep.tgrep_positions(b'NN|JJ', [tree])),
  261. list(tgrep.tgrep_positions('NN|JJ', [tree])),
  262. )
  263. def test_node_nocase(self):
  264. '''
  265. Test selecting nodes using case insensitive node names.
  266. '''
  267. tree = ParentedTree.fromstring('(S (n x) (N x))')
  268. self.assertEqual(list(tgrep.tgrep_positions('"N"', [tree])), [[(1,)]])
  269. self.assertEqual(list(tgrep.tgrep_positions('i@"N"', [tree])), [[(0,), (1,)]])
  270. def test_node_quoted(self):
  271. '''
  272. Test selecting nodes using quoted node names.
  273. '''
  274. tree = ParentedTree.fromstring('(N ("N" x) (N" x) ("\\" x))')
  275. self.assertEqual(list(tgrep.tgrep_positions('"N"', [tree])), [[()]])
  276. self.assertEqual(list(tgrep.tgrep_positions('"\\"N\\""', [tree])), [[(0,)]])
  277. self.assertEqual(list(tgrep.tgrep_positions('"N\\""', [tree])), [[(1,)]])
  278. self.assertEqual(list(tgrep.tgrep_positions('"\\"\\\\\\""', [tree])), [[(2,)]])
  279. def test_node_regex(self):
  280. '''
  281. Test regex matching on nodes.
  282. '''
  283. tree = ParentedTree.fromstring('(S (NP-SBJ x) (NP x) (NNP x) (VP x))')
  284. # This is a regular expression that matches any node whose
  285. # name starts with NP, including NP-SBJ:
  286. self.assertEqual(list(tgrep.tgrep_positions('/^NP/', [tree])), [[(0,), (1,)]])
  287. def test_node_regex_2(self):
  288. '''
  289. Test regex matching on nodes.
  290. '''
  291. tree = ParentedTree.fromstring('(S (SBJ x) (SBJ1 x) (NP-SBJ x))')
  292. self.assertEqual(list(tgrep.tgrep_positions('/^SBJ/', [tree])), [[(0,), (1,)]])
  293. # This is a regular expression that matches any node whose
  294. # name includes SBJ, including NP-SBJ:
  295. self.assertEqual(
  296. list(tgrep.tgrep_positions('/SBJ/', [tree])), [[(0,), (1,), (2,)]]
  297. )
  298. def test_node_tree_position(self):
  299. '''
  300. Test matching on nodes based on NLTK tree position.
  301. '''
  302. tree = ParentedTree.fromstring('(S (NP-SBJ x) (NP x) (NNP x) (VP x))')
  303. # test all tree positions that are not leaves
  304. leaf_positions = set(
  305. tree.leaf_treeposition(x) for x in range(len(tree.leaves()))
  306. )
  307. tree_positions = [x for x in tree.treepositions() if x not in leaf_positions]
  308. for position in tree_positions:
  309. node_id = 'N{0}'.format(position)
  310. tgrep_positions = list(tgrep.tgrep_positions(node_id, [tree]))
  311. self.assertEqual(len(tgrep_positions[0]), 1)
  312. self.assertEqual(tgrep_positions[0][0], position)
  313. def test_node_noleaves(self):
  314. '''
  315. Test node name matching with the search_leaves flag set to False.
  316. '''
  317. tree = ParentedTree.fromstring('(S (A (T x)) (B (N x)))')
  318. self.assertEqual(
  319. list(tgrep.tgrep_positions('x', [tree])), [[(0, 0, 0), (1, 0, 0)]]
  320. )
  321. self.assertEqual(list(tgrep.tgrep_positions('x', [tree], False)), [[]])
  322. def tests_rel_dominance(self):
  323. '''
  324. Test matching nodes based on dominance relations.
  325. '''
  326. tree = ParentedTree.fromstring('(S (A (T x)) (B (N x)))')
  327. self.assertEqual(list(tgrep.tgrep_positions('* < T', [tree])), [[(0,)]])
  328. self.assertEqual(list(tgrep.tgrep_positions('* < T > S', [tree])), [[(0,)]])
  329. self.assertEqual(
  330. list(tgrep.tgrep_positions('* !< T', [tree])),
  331. [[(), (0, 0), (0, 0, 0), (1,), (1, 0), (1, 0, 0)]],
  332. )
  333. self.assertEqual(list(tgrep.tgrep_positions('* !< T > S', [tree])), [[(1,)]])
  334. self.assertEqual(list(tgrep.tgrep_positions('* > A', [tree])), [[(0, 0)]])
  335. self.assertEqual(list(tgrep.tgrep_positions('* > B', [tree])), [[(1, 0)]])
  336. self.assertEqual(
  337. list(tgrep.tgrep_positions('* !> B', [tree])),
  338. [[(), (0,), (0, 0), (0, 0, 0), (1,), (1, 0, 0)]],
  339. )
  340. self.assertEqual(
  341. list(tgrep.tgrep_positions('* !> B >> S', [tree])), [[(0,), (0, 0), (1,)]]
  342. )
  343. self.assertEqual(
  344. list(tgrep.tgrep_positions('* >> S', [tree])),
  345. [[(0,), (0, 0), (1,), (1, 0)]],
  346. )
  347. self.assertEqual(
  348. list(tgrep.tgrep_positions('* >>, S', [tree])), [[(0,), (0, 0)]]
  349. )
  350. self.assertEqual(
  351. list(tgrep.tgrep_positions('* >>\' S', [tree])), [[(1,), (1, 0)]]
  352. )
  353. # Known issue:
  354. # self.assertEqual(list(tgrep.tgrep_positions('* !>> S', [tree])),
  355. # [[()]])
  356. self.assertEqual(list(tgrep.tgrep_positions('* << T', [tree])), [[(), (0,)]])
  357. self.assertEqual(list(tgrep.tgrep_positions('* <<\' T', [tree])), [[(0,)]])
  358. self.assertEqual(list(tgrep.tgrep_positions('* <<1 N', [tree])), [[(1,)]])
  359. self.assertEqual(
  360. list(tgrep.tgrep_positions('* !<< T', [tree])),
  361. [[(0, 0), (0, 0, 0), (1,), (1, 0), (1, 0, 0)]],
  362. )
  363. tree = ParentedTree.fromstring('(S (A (T x)) (B (T x) (N x )))')
  364. self.assertEqual(list(tgrep.tgrep_positions('* <: T', [tree])), [[(0,)]])
  365. self.assertEqual(list(tgrep.tgrep_positions('* < T', [tree])), [[(0,), (1,)]])
  366. self.assertEqual(
  367. list(tgrep.tgrep_positions('* !<: T', [tree])),
  368. [[(), (0, 0), (0, 0, 0), (1,), (1, 0), (1, 0, 0), (1, 1), (1, 1, 0)]],
  369. )
  370. self.assertEqual(list(tgrep.tgrep_positions('* !<: T > S', [tree])), [[(1,)]])
  371. tree = ParentedTree.fromstring('(S (T (A x) (B x)) (T (C x)))')
  372. self.assertEqual(list(tgrep.tgrep_positions('* >: T', [tree])), [[(1, 0)]])
  373. self.assertEqual(
  374. list(tgrep.tgrep_positions('* !>: T', [tree])),
  375. [[(), (0,), (0, 0), (0, 0, 0), (0, 1), (0, 1, 0), (1,), (1, 0, 0)]],
  376. )
  377. tree = ParentedTree.fromstring(
  378. '(S (A (B (C (D (E (T x))))))' ' (A (B (C (D (E (T x))) (N x)))))'
  379. )
  380. self.assertEqual(
  381. list(tgrep.tgrep_positions('* <<: T', [tree])),
  382. [
  383. [
  384. (0,),
  385. (0, 0),
  386. (0, 0, 0),
  387. (0, 0, 0, 0),
  388. (0, 0, 0, 0, 0),
  389. (1, 0, 0, 0),
  390. (1, 0, 0, 0, 0),
  391. ]
  392. ],
  393. )
  394. self.assertEqual(
  395. list(tgrep.tgrep_positions('* >>: A', [tree])),
  396. [
  397. [
  398. (0, 0),
  399. (0, 0, 0),
  400. (0, 0, 0, 0),
  401. (0, 0, 0, 0, 0),
  402. (0, 0, 0, 0, 0, 0),
  403. (1, 0),
  404. (1, 0, 0),
  405. ]
  406. ],
  407. )
  408. def test_bad_operator(self):
  409. '''
  410. Test error handling of undefined tgrep operators.
  411. '''
  412. tree = ParentedTree.fromstring('(S (A (T x)) (B (N x)))')
  413. self.assertRaises(
  414. tgrep.TgrepException, list, tgrep.tgrep_positions('* >>> S', [tree])
  415. )
  416. def test_comments(self):
  417. '''
  418. Test that comments are correctly filtered out of tgrep search
  419. strings.
  420. '''
  421. tree = ParentedTree.fromstring('(S (NN x) (NP x) (NN x))')
  422. search1 = '''
  423. @ NP /^NP/;
  424. @ NN /^NN/;
  425. @NN
  426. '''
  427. self.assertEqual(list(tgrep.tgrep_positions(search1, [tree])), [[(0,), (2,)]])
  428. search2 = '''
  429. # macros
  430. @ NP /^NP/;
  431. @ NN /^NN/;
  432. # search string
  433. @NN
  434. '''
  435. self.assertEqual(list(tgrep.tgrep_positions(search2, [tree])), [[(0,), (2,)]])
  436. def test_rel_sister_nodes(self):
  437. '''
  438. Test matching sister nodes in a tree.
  439. '''
  440. tree = ParentedTree.fromstring('(S (A x) (B x) (C x))')
  441. self.assertEqual(list(tgrep.tgrep_positions('* $. B', [tree])), [[(0,)]])
  442. self.assertEqual(list(tgrep.tgrep_positions('* $.. B', [tree])), [[(0,)]])
  443. self.assertEqual(list(tgrep.tgrep_positions('* $, B', [tree])), [[(2,)]])
  444. self.assertEqual(list(tgrep.tgrep_positions('* $,, B', [tree])), [[(2,)]])
  445. self.assertEqual(list(tgrep.tgrep_positions('* $ B', [tree])), [[(0,), (2,)]])
  446. def tests_rel_indexed_children(self):
  447. '''
  448. Test matching nodes based on their index in their parent node.
  449. '''
  450. tree = ParentedTree.fromstring('(S (A x) (B x) (C x))')
  451. self.assertEqual(list(tgrep.tgrep_positions('* >, S', [tree])), [[(0,)]])
  452. self.assertEqual(list(tgrep.tgrep_positions('* >1 S', [tree])), [[(0,)]])
  453. self.assertEqual(list(tgrep.tgrep_positions('* >2 S', [tree])), [[(1,)]])
  454. self.assertEqual(list(tgrep.tgrep_positions('* >3 S', [tree])), [[(2,)]])
  455. self.assertEqual(list(tgrep.tgrep_positions('* >\' S', [tree])), [[(2,)]])
  456. self.assertEqual(list(tgrep.tgrep_positions('* >-1 S', [tree])), [[(2,)]])
  457. self.assertEqual(list(tgrep.tgrep_positions('* >-2 S', [tree])), [[(1,)]])
  458. self.assertEqual(list(tgrep.tgrep_positions('* >-3 S', [tree])), [[(0,)]])
  459. tree = ParentedTree.fromstring(
  460. '(S (D (A x) (B x) (C x)) (E (B x) (C x) (A x)) ' '(F (C x) (A x) (B x)))'
  461. )
  462. self.assertEqual(list(tgrep.tgrep_positions('* <, A', [tree])), [[(0,)]])
  463. self.assertEqual(list(tgrep.tgrep_positions('* <1 A', [tree])), [[(0,)]])
  464. self.assertEqual(list(tgrep.tgrep_positions('* <2 A', [tree])), [[(2,)]])
  465. self.assertEqual(list(tgrep.tgrep_positions('* <3 A', [tree])), [[(1,)]])
  466. self.assertEqual(list(tgrep.tgrep_positions('* <\' A', [tree])), [[(1,)]])
  467. self.assertEqual(list(tgrep.tgrep_positions('* <-1 A', [tree])), [[(1,)]])
  468. self.assertEqual(list(tgrep.tgrep_positions('* <-2 A', [tree])), [[(2,)]])
  469. self.assertEqual(list(tgrep.tgrep_positions('* <-3 A', [tree])), [[(0,)]])
  470. def test_rel_precedence(self):
  471. '''
  472. Test matching nodes based on precedence relations.
  473. '''
  474. tree = ParentedTree.fromstring(
  475. '(S (NP (NP (PP x)) (NP (AP x)))'
  476. ' (VP (AP (X (PP x)) (Y (AP x))))'
  477. ' (NP (RC (NP (AP x)))))'
  478. )
  479. self.assertEqual(
  480. list(tgrep.tgrep_positions('* . X', [tree])), [[(0,), (0, 1), (0, 1, 0)]]
  481. )
  482. self.assertEqual(
  483. list(tgrep.tgrep_positions('* . Y', [tree])), [[(1, 0, 0), (1, 0, 0, 0)]]
  484. )
  485. self.assertEqual(
  486. list(tgrep.tgrep_positions('* .. X', [tree])),
  487. [[(0,), (0, 0), (0, 0, 0), (0, 1), (0, 1, 0)]],
  488. )
  489. self.assertEqual(
  490. list(tgrep.tgrep_positions('* .. Y', [tree])),
  491. [[(0,), (0, 0), (0, 0, 0), (0, 1), (0, 1, 0), (1, 0, 0), (1, 0, 0, 0)]],
  492. )
  493. self.assertEqual(
  494. list(tgrep.tgrep_positions('* , X', [tree])), [[(1, 0, 1), (1, 0, 1, 0)]]
  495. )
  496. self.assertEqual(
  497. list(tgrep.tgrep_positions('* , Y', [tree])),
  498. [[(2,), (2, 0), (2, 0, 0), (2, 0, 0, 0)]],
  499. )
  500. self.assertEqual(
  501. list(tgrep.tgrep_positions('* ,, X', [tree])),
  502. [[(1, 0, 1), (1, 0, 1, 0), (2,), (2, 0), (2, 0, 0), (2, 0, 0, 0)]],
  503. )
  504. self.assertEqual(
  505. list(tgrep.tgrep_positions('* ,, Y', [tree])),
  506. [[(2,), (2, 0), (2, 0, 0), (2, 0, 0, 0)]],
  507. )
  508. def test_examples(self):
  509. '''
  510. Test the Basic Examples from the TGrep2 manual.
  511. '''
  512. tree = ParentedTree.fromstring('(S (NP (AP x)) (NP (PP x)))')
  513. # This matches any NP node that immediately dominates a PP:
  514. self.assertEqual(list(tgrep.tgrep_positions('NP < PP', [tree])), [[(1,)]])
  515. tree = ParentedTree.fromstring('(S (NP x) (VP x) (NP (PP x)) (VP x))')
  516. # This matches an NP that dominates a PP and is immediately
  517. # followed by a VP:
  518. self.assertEqual(list(tgrep.tgrep_positions('NP << PP . VP', [tree])), [[(2,)]])
  519. tree = ParentedTree.fromstring(
  520. '(S (NP (AP x)) (NP (PP x)) ' '(NP (DET x) (NN x)) (VP x))'
  521. )
  522. # This matches an NP that dominates a PP or is immediately
  523. # followed by a VP:
  524. self.assertEqual(
  525. list(tgrep.tgrep_positions('NP << PP | . VP', [tree])), [[(1,), (2,)]]
  526. )
  527. tree = ParentedTree.fromstring(
  528. '(S (NP (NP (PP x)) (NP (AP x)))'
  529. ' (VP (AP (NP (PP x)) (NP (AP x))))'
  530. ' (NP (RC (NP (AP x)))))'
  531. )
  532. # This matches an NP that does not dominate a PP. Also, the NP
  533. # must either have a parent that is an NP or be dominated by a
  534. # VP:
  535. self.assertEqual(
  536. list(tgrep.tgrep_positions('NP !<< PP [> NP | >> VP]', [tree])),
  537. [[(0, 1), (1, 0, 1)]],
  538. )
  539. tree = ParentedTree.fromstring(
  540. '(S (NP (AP (PP x) (VP x))) ' '(NP (AP (PP x) (NP x))) (NP x))'
  541. )
  542. # This matches an NP that dominates a PP which itself is
  543. # immediately followed by a VP. Note the use of parentheses to
  544. # group ". VP" with the PP rather than with the NP:
  545. self.assertEqual(
  546. list(tgrep.tgrep_positions('NP << (PP . VP)', [tree])), [[(0,)]]
  547. )
  548. tree = ParentedTree.fromstring(
  549. '(S (NP (DET a) (NN cat) (PP (IN on) (NP x)))'
  550. ' (NP (DET a) (NN cat) (PP (IN on) (NP x)) (PP x))'
  551. ' (NP x))'
  552. )
  553. # This matches an NP whose last child is a PP that begins with
  554. # the preposition "on":
  555. self.assertEqual(
  556. list(tgrep.tgrep_positions('NP <\' (PP <, (IN < on))', [tree])), [[(0,)]]
  557. )
  558. tree = ParentedTree.fromstring(
  559. '(S (S (C x) (A (B x))) (S (C x) (A x)) ' '(S (D x) (A (B x))))'
  560. )
  561. # The following pattern matches an S which has a child A and
  562. # another child that is a C and that the A has a child B:
  563. self.assertEqual(
  564. list(tgrep.tgrep_positions('S < (A < B) < C', [tree])), [[(0,)]]
  565. )
  566. tree = ParentedTree.fromstring(
  567. '(S (S (A (B x) (C x))) (S (S (C x) (A (B x)))))'
  568. )
  569. # However, this pattern means that S has child A and that A
  570. # has children B and C:
  571. self.assertEqual(
  572. list(tgrep.tgrep_positions('S < ((A < B) < C)', [tree])), [[(0,)]]
  573. )
  574. # It is equivalent to this:
  575. self.assertEqual(
  576. list(tgrep.tgrep_positions('S < (A < B < C)', [tree])), [[(0,)]]
  577. )
  578. def test_use_macros(self):
  579. '''
  580. Test defining and using tgrep2 macros.
  581. '''
  582. tree = ParentedTree.fromstring(
  583. '(VP (VB sold) (NP (DET the) '
  584. '(NN heiress)) (NP (NN deed) (PREP to) '
  585. '(NP (DET the) (NN school) (NN house))))'
  586. )
  587. self.assertEqual(
  588. list(
  589. tgrep.tgrep_positions(
  590. '@ NP /^NP/;\n@ NN /^NN/;\n@NP !< @NP !$.. @NN', [tree]
  591. )
  592. ),
  593. [[(1,), (2, 2)]],
  594. )
  595. # use undefined macro @CNP
  596. self.assertRaises(
  597. tgrep.TgrepException,
  598. list,
  599. tgrep.tgrep_positions(
  600. '@ NP /^NP/;\n@ NN /^NN/;\n@CNP !< @NP !$.. @NN', [tree]
  601. ),
  602. )
  603. def test_tokenize_node_labels(self):
  604. '''Test tokenization of labeled nodes.'''
  605. self.assertEqual(
  606. tgrep.tgrep_tokenize('S < @SBJ < (@VP < (@VB $.. @OBJ))'),
  607. [
  608. 'S',
  609. '<',
  610. '@SBJ',
  611. '<',
  612. '(',
  613. '@VP',
  614. '<',
  615. '(',
  616. '@VB',
  617. '$..',
  618. '@OBJ',
  619. ')',
  620. ')',
  621. ],
  622. )
  623. self.assertEqual(
  624. tgrep.tgrep_tokenize('S < @SBJ=s < (@VP=v < (@VB $.. @OBJ))'),
  625. [
  626. 'S',
  627. '<',
  628. '@SBJ',
  629. '=',
  630. 's',
  631. '<',
  632. '(',
  633. '@VP',
  634. '=',
  635. 'v',
  636. '<',
  637. '(',
  638. '@VB',
  639. '$..',
  640. '@OBJ',
  641. ')',
  642. ')',
  643. ],
  644. )
  645. def test_tokenize_segmented_patterns(self):
  646. '''Test tokenization of segmented patterns.'''
  647. self.assertEqual(
  648. tgrep.tgrep_tokenize('S < @SBJ=s < (@VP=v < (@VB $.. @OBJ)) : =s .. =v'),
  649. [
  650. 'S',
  651. '<',
  652. '@SBJ',
  653. '=',
  654. 's',
  655. '<',
  656. '(',
  657. '@VP',
  658. '=',
  659. 'v',
  660. '<',
  661. '(',
  662. '@VB',
  663. '$..',
  664. '@OBJ',
  665. ')',
  666. ')',
  667. ':',
  668. '=s',
  669. '..',
  670. '=v',
  671. ],
  672. )
  673. def test_labeled_nodes(self):
  674. '''
  675. Test labeled nodes.
  676. Test case from Emily M. Bender.
  677. '''
  678. search = '''
  679. # macros
  680. @ SBJ /SBJ/;
  681. @ VP /VP/;
  682. @ VB /VB/;
  683. @ VPoB /V[PB]/;
  684. @ OBJ /OBJ/;
  685. # 1 svo
  686. S < @SBJ=s < (@VP=v < (@VB $.. @OBJ)) : =s .. =v'''
  687. sent1 = ParentedTree.fromstring(
  688. '(S (NP-SBJ I) (VP (VB eat) (NP-OBJ (NNS apples))))'
  689. )
  690. sent2 = ParentedTree.fromstring(
  691. '(S (VP (VB eat) (NP-OBJ (NNS apples))) (NP-SBJ I))'
  692. )
  693. search_firsthalf = search.split('\n\n')[0] + 'S < @SBJ < (@VP < (@VB $.. @OBJ))'
  694. search_rewrite = 'S < (/.*SBJ/ $.. (/VP/ < (/VB/ $.. /.*OBJ/)))'
  695. self.assertTrue(list(tgrep.tgrep_positions(search_firsthalf, [sent1]))[0])
  696. self.assertTrue(list(tgrep.tgrep_positions(search, [sent1]))[0])
  697. self.assertTrue(list(tgrep.tgrep_positions(search_rewrite, [sent1]))[0])
  698. self.assertEqual(
  699. list(tgrep.tgrep_positions(search, [sent1])),
  700. list(tgrep.tgrep_positions(search_rewrite, [sent1])),
  701. )
  702. self.assertTrue(list(tgrep.tgrep_positions(search_firsthalf, [sent2]))[0])
  703. self.assertFalse(list(tgrep.tgrep_positions(search, [sent2]))[0])
  704. self.assertFalse(list(tgrep.tgrep_positions(search_rewrite, [sent2]))[0])
  705. self.assertEqual(
  706. list(tgrep.tgrep_positions(search, [sent2])),
  707. list(tgrep.tgrep_positions(search_rewrite, [sent2])),
  708. )
  709. def test_multiple_conjs(self):
  710. '''
  711. Test that multiple (3 or more) conjunctions of node relations are
  712. handled properly.
  713. '''
  714. sent = ParentedTree.fromstring('((A (B b) (C c)) (A (B b) (C c) (D d)))')
  715. # search = '(A < B < C < D)'
  716. # search_tworels = '(A < B < C)'
  717. self.assertEqual(
  718. list(tgrep.tgrep_positions('(A < B < C < D)', [sent])), [[(1,)]]
  719. )
  720. self.assertEqual(
  721. list(tgrep.tgrep_positions('(A < B < C)', [sent])), [[(0,), (1,)]]
  722. )
  723. def test_trailing_semicolon(self):
  724. '''
  725. Test that semicolons at the end of a tgrep2 search string won't
  726. cause a parse failure.
  727. '''
  728. tree = ParentedTree.fromstring(
  729. '(S (NP (DT the) (JJ big) (NN dog)) ' '(VP bit) (NP (DT a) (NN cat)))'
  730. )
  731. self.assertEqual(list(tgrep.tgrep_positions('NN', [tree])), [[(0, 2), (2, 1)]])
  732. self.assertEqual(list(tgrep.tgrep_positions('NN;', [tree])), [[(0, 2), (2, 1)]])
  733. self.assertEqual(
  734. list(tgrep.tgrep_positions('NN;;', [tree])), [[(0, 2), (2, 1)]]
  735. )
# Run the test suite when this module is executed directly as a script.
if __name__ == '__main__':
    unittest.main()