recursivedescent.py 25 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688
  1. # Natural Language Toolkit: Recursive Descent Parser
  2. #
  3. # Copyright (C) 2001-2020 NLTK Project
  4. # Author: Edward Loper <edloper@gmail.com>
  5. # Steven Bird <stevenbird1@gmail.com>
  6. # URL: <http://nltk.org/>
  7. # For license information, see LICENSE.TXT
  8. from nltk.grammar import Nonterminal
  9. from nltk.tree import Tree, ImmutableTree
  10. from nltk.parse.api import ParserI
  11. ##//////////////////////////////////////////////////////
  12. ## Recursive Descent Parser
  13. ##//////////////////////////////////////////////////////
  14. class RecursiveDescentParser(ParserI):
  15. """
  16. A simple top-down CFG parser that parses texts by recursively
  17. expanding the fringe of a Tree, and matching it against a
  18. text.
  19. ``RecursiveDescentParser`` uses a list of tree locations called a
  20. "frontier" to remember which subtrees have not yet been expanded
  21. and which leaves have not yet been matched against the text. Each
  22. tree location consists of a list of child indices specifying the
  23. path from the root of the tree to a subtree or a leaf; see the
  24. reference documentation for Tree for more information
  25. about tree locations.
  26. When the parser begins parsing a text, it constructs a tree
  27. containing only the start symbol, and a frontier containing the
  28. location of the tree's root node. It then extends the tree to
  29. cover the text, using the following recursive procedure:
  30. - If the frontier is empty, and the text is covered by the tree,
  31. then return the tree as a possible parse.
  32. - If the frontier is empty, and the text is not covered by the
  33. tree, then return no parses.
  34. - If the first element of the frontier is a subtree, then
  35. use CFG productions to "expand" it. For each applicable
  36. production, add the expanded subtree's children to the
  37. frontier, and recursively find all parses that can be
  38. generated by the new tree and frontier.
  39. - If the first element of the frontier is a token, then "match"
  40. it against the next token from the text. Remove the token
  41. from the frontier, and recursively find all parses that can be
  42. generated by the new tree and frontier.
  43. :see: ``nltk.grammar``
  44. """
  45. def __init__(self, grammar, trace=0):
  46. """
  47. Create a new ``RecursiveDescentParser``, that uses ``grammar``
  48. to parse texts.
  49. :type grammar: CFG
  50. :param grammar: The grammar used to parse texts.
  51. :type trace: int
  52. :param trace: The level of tracing that should be used when
  53. parsing a text. ``0`` will generate no tracing output;
  54. and higher numbers will produce more verbose tracing
  55. output.
  56. """
  57. self._grammar = grammar
  58. self._trace = trace
  59. def grammar(self):
  60. return self._grammar
  61. def parse(self, tokens):
  62. # Inherit docs from ParserI
  63. tokens = list(tokens)
  64. self._grammar.check_coverage(tokens)
  65. # Start a recursive descent parse, with an initial tree
  66. # containing just the start symbol.
  67. start = self._grammar.start().symbol()
  68. initial_tree = Tree(start, [])
  69. frontier = [()]
  70. if self._trace:
  71. self._trace_start(initial_tree, frontier, tokens)
  72. return self._parse(tokens, initial_tree, frontier)
  73. def _parse(self, remaining_text, tree, frontier):
  74. """
  75. Recursively expand and match each elements of ``tree``
  76. specified by ``frontier``, to cover ``remaining_text``. Return
  77. a list of all parses found.
  78. :return: An iterator of all parses that can be generated by
  79. matching and expanding the elements of ``tree``
  80. specified by ``frontier``.
  81. :rtype: iter(Tree)
  82. :type tree: Tree
  83. :param tree: A partial structure for the text that is
  84. currently being parsed. The elements of ``tree``
  85. that are specified by ``frontier`` have not yet been
  86. expanded or matched.
  87. :type remaining_text: list(str)
  88. :param remaining_text: The portion of the text that is not yet
  89. covered by ``tree``.
  90. :type frontier: list(tuple(int))
  91. :param frontier: A list of the locations within ``tree`` of
  92. all subtrees that have not yet been expanded, and all
  93. leaves that have not yet been matched. This list sorted
  94. in left-to-right order of location within the tree.
  95. """
  96. # If the tree covers the text, and there's nothing left to
  97. # expand, then we've found a complete parse; return it.
  98. if len(remaining_text) == 0 and len(frontier) == 0:
  99. if self._trace:
  100. self._trace_succeed(tree, frontier)
  101. yield tree
  102. # If there's still text, but nothing left to expand, we failed.
  103. elif len(frontier) == 0:
  104. if self._trace:
  105. self._trace_backtrack(tree, frontier)
  106. # If the next element on the frontier is a tree, expand it.
  107. elif isinstance(tree[frontier[0]], Tree):
  108. for result in self._expand(remaining_text, tree, frontier):
  109. yield result
  110. # If the next element on the frontier is a token, match it.
  111. else:
  112. for result in self._match(remaining_text, tree, frontier):
  113. yield result
  114. def _match(self, rtext, tree, frontier):
  115. """
  116. :rtype: iter(Tree)
  117. :return: an iterator of all parses that can be generated by
  118. matching the first element of ``frontier`` against the
  119. first token in ``rtext``. In particular, if the first
  120. element of ``frontier`` has the same type as the first
  121. token in ``rtext``, then substitute the token into
  122. ``tree``; and return all parses that can be generated by
  123. matching and expanding the remaining elements of
  124. ``frontier``. If the first element of ``frontier`` does not
  125. have the same type as the first token in ``rtext``, then
  126. return empty list.
  127. :type tree: Tree
  128. :param tree: A partial structure for the text that is
  129. currently being parsed. The elements of ``tree``
  130. that are specified by ``frontier`` have not yet been
  131. expanded or matched.
  132. :type rtext: list(str)
  133. :param rtext: The portion of the text that is not yet
  134. covered by ``tree``.
  135. :type frontier: list of tuple of int
  136. :param frontier: A list of the locations within ``tree`` of
  137. all subtrees that have not yet been expanded, and all
  138. leaves that have not yet been matched.
  139. """
  140. tree_leaf = tree[frontier[0]]
  141. if len(rtext) > 0 and tree_leaf == rtext[0]:
  142. # If it's a terminal that matches rtext[0], then substitute
  143. # in the token, and continue parsing.
  144. newtree = tree.copy(deep=True)
  145. newtree[frontier[0]] = rtext[0]
  146. if self._trace:
  147. self._trace_match(newtree, frontier[1:], rtext[0])
  148. for result in self._parse(rtext[1:], newtree, frontier[1:]):
  149. yield result
  150. else:
  151. # If it's a non-matching terminal, fail.
  152. if self._trace:
  153. self._trace_backtrack(tree, frontier, rtext[:1])
  154. def _expand(self, remaining_text, tree, frontier, production=None):
  155. """
  156. :rtype: iter(Tree)
  157. :return: An iterator of all parses that can be generated by
  158. expanding the first element of ``frontier`` with
  159. ``production``. In particular, if the first element of
  160. ``frontier`` is a subtree whose node type is equal to
  161. ``production``'s left hand side, then add a child to that
  162. subtree for each element of ``production``'s right hand
  163. side; and return all parses that can be generated by
  164. matching and expanding the remaining elements of
  165. ``frontier``. If the first element of ``frontier`` is not a
  166. subtree whose node type is equal to ``production``'s left
  167. hand side, then return an empty list. If ``production`` is
  168. not specified, then return a list of all parses that can
  169. be generated by expanding the first element of ``frontier``
  170. with *any* CFG production.
  171. :type tree: Tree
  172. :param tree: A partial structure for the text that is
  173. currently being parsed. The elements of ``tree``
  174. that are specified by ``frontier`` have not yet been
  175. expanded or matched.
  176. :type remaining_text: list(str)
  177. :param remaining_text: The portion of the text that is not yet
  178. covered by ``tree``.
  179. :type frontier: list(tuple(int))
  180. :param frontier: A list of the locations within ``tree`` of
  181. all subtrees that have not yet been expanded, and all
  182. leaves that have not yet been matched.
  183. """
  184. if production is None:
  185. productions = self._grammar.productions()
  186. else:
  187. productions = [production]
  188. for production in productions:
  189. lhs = production.lhs().symbol()
  190. if lhs == tree[frontier[0]].label():
  191. subtree = self._production_to_tree(production)
  192. if frontier[0] == ():
  193. newtree = subtree
  194. else:
  195. newtree = tree.copy(deep=True)
  196. newtree[frontier[0]] = subtree
  197. new_frontier = [
  198. frontier[0] + (i,) for i in range(len(production.rhs()))
  199. ]
  200. if self._trace:
  201. self._trace_expand(newtree, new_frontier, production)
  202. for result in self._parse(
  203. remaining_text, newtree, new_frontier + frontier[1:]
  204. ):
  205. yield result
  206. def _production_to_tree(self, production):
  207. """
  208. :rtype: Tree
  209. :return: The Tree that is licensed by ``production``.
  210. In particular, given the production ``[lhs -> elt[1] ... elt[n]]``
  211. return a tree that has a node ``lhs.symbol``, and
  212. ``n`` children. For each nonterminal element
  213. ``elt[i]`` in the production, the tree token has a
  214. childless subtree with node value ``elt[i].symbol``; and
  215. for each terminal element ``elt[j]``, the tree token has
  216. a leaf token with type ``elt[j]``.
  217. :param production: The CFG production that licenses the tree
  218. token that should be returned.
  219. :type production: Production
  220. """
  221. children = []
  222. for elt in production.rhs():
  223. if isinstance(elt, Nonterminal):
  224. children.append(Tree(elt.symbol(), []))
  225. else:
  226. # This will be matched.
  227. children.append(elt)
  228. return Tree(production.lhs().symbol(), children)
  229. def trace(self, trace=2):
  230. """
  231. Set the level of tracing output that should be generated when
  232. parsing a text.
  233. :type trace: int
  234. :param trace: The trace level. A trace level of ``0`` will
  235. generate no tracing output; and higher trace levels will
  236. produce more verbose tracing output.
  237. :rtype: None
  238. """
  239. self._trace = trace
  240. def _trace_fringe(self, tree, treeloc=None):
  241. """
  242. Print trace output displaying the fringe of ``tree``. The
  243. fringe of ``tree`` consists of all of its leaves and all of
  244. its childless subtrees.
  245. :rtype: None
  246. """
  247. if treeloc == ():
  248. print("*", end=" ")
  249. if isinstance(tree, Tree):
  250. if len(tree) == 0:
  251. print(repr(Nonterminal(tree.label())), end=" ")
  252. for i in range(len(tree)):
  253. if treeloc is not None and i == treeloc[0]:
  254. self._trace_fringe(tree[i], treeloc[1:])
  255. else:
  256. self._trace_fringe(tree[i])
  257. else:
  258. print(repr(tree), end=" ")
  259. def _trace_tree(self, tree, frontier, operation):
  260. """
  261. Print trace output displaying the parser's current state.
  262. :param operation: A character identifying the operation that
  263. generated the current state.
  264. :rtype: None
  265. """
  266. if self._trace == 2:
  267. print(" %c [" % operation, end=" ")
  268. else:
  269. print(" [", end=" ")
  270. if len(frontier) > 0:
  271. self._trace_fringe(tree, frontier[0])
  272. else:
  273. self._trace_fringe(tree)
  274. print("]")
  275. def _trace_start(self, tree, frontier, text):
  276. print("Parsing %r" % " ".join(text))
  277. if self._trace > 2:
  278. print("Start:")
  279. if self._trace > 1:
  280. self._trace_tree(tree, frontier, " ")
  281. def _trace_expand(self, tree, frontier, production):
  282. if self._trace > 2:
  283. print("Expand: %s" % production)
  284. if self._trace > 1:
  285. self._trace_tree(tree, frontier, "E")
  286. def _trace_match(self, tree, frontier, tok):
  287. if self._trace > 2:
  288. print("Match: %r" % tok)
  289. if self._trace > 1:
  290. self._trace_tree(tree, frontier, "M")
  291. def _trace_succeed(self, tree, frontier):
  292. if self._trace > 2:
  293. print("GOOD PARSE:")
  294. if self._trace == 1:
  295. print("Found a parse:\n%s" % tree)
  296. if self._trace > 1:
  297. self._trace_tree(tree, frontier, "+")
  298. def _trace_backtrack(self, tree, frontier, toks=None):
  299. if self._trace > 2:
  300. if toks:
  301. print("Backtrack: %r match failed" % toks[0])
  302. else:
  303. print("Backtrack")
  304. ##//////////////////////////////////////////////////////
  305. ## Stepping Recursive Descent Parser
  306. ##//////////////////////////////////////////////////////
  307. class SteppingRecursiveDescentParser(RecursiveDescentParser):
  308. """
  309. A ``RecursiveDescentParser`` that allows you to step through the
  310. parsing process, performing a single operation at a time.
  311. The ``initialize`` method is used to start parsing a text.
  312. ``expand`` expands the first element on the frontier using a single
  313. CFG production, and ``match`` matches the first element on the
  314. frontier against the next text token. ``backtrack`` undoes the most
  315. recent expand or match operation. ``step`` performs a single
  316. expand, match, or backtrack operation. ``parses`` returns the set
  317. of parses that have been found by the parser.
  318. :ivar _history: A list of ``(rtext, tree, frontier)`` tripples,
  319. containing the previous states of the parser. This history is
  320. used to implement the ``backtrack`` operation.
  321. :ivar _tried_e: A record of all productions that have been tried
  322. for a given tree. This record is used by ``expand`` to perform
  323. the next untried production.
  324. :ivar _tried_m: A record of what tokens have been matched for a
  325. given tree. This record is used by ``step`` to decide whether
  326. or not to match a token.
  327. :see: ``nltk.grammar``
  328. """
  329. def __init__(self, grammar, trace=0):
  330. super(SteppingRecursiveDescentParser, self).__init__(grammar, trace)
  331. self._rtext = None
  332. self._tree = None
  333. self._frontier = [()]
  334. self._tried_e = {}
  335. self._tried_m = {}
  336. self._history = []
  337. self._parses = []
  338. # [XX] TEMPORARY HACK WARNING! This should be replaced with
  339. # something nicer when we get the chance.
  340. def _freeze(self, tree):
  341. c = tree.copy()
  342. # for pos in c.treepositions('leaves'):
  343. # c[pos] = c[pos].freeze()
  344. return ImmutableTree.convert(c)
  345. def parse(self, tokens):
  346. tokens = list(tokens)
  347. self.initialize(tokens)
  348. while self.step() is not None:
  349. pass
  350. return self.parses()
  351. def initialize(self, tokens):
  352. """
  353. Start parsing a given text. This sets the parser's tree to
  354. the start symbol, its frontier to the root node, and its
  355. remaining text to ``token['SUBTOKENS']``.
  356. """
  357. self._rtext = tokens
  358. start = self._grammar.start().symbol()
  359. self._tree = Tree(start, [])
  360. self._frontier = [()]
  361. self._tried_e = {}
  362. self._tried_m = {}
  363. self._history = []
  364. self._parses = []
  365. if self._trace:
  366. self._trace_start(self._tree, self._frontier, self._rtext)
  367. def remaining_text(self):
  368. """
  369. :return: The portion of the text that is not yet covered by the
  370. tree.
  371. :rtype: list(str)
  372. """
  373. return self._rtext
  374. def frontier(self):
  375. """
  376. :return: A list of the tree locations of all subtrees that
  377. have not yet been expanded, and all leaves that have not
  378. yet been matched.
  379. :rtype: list(tuple(int))
  380. """
  381. return self._frontier
  382. def tree(self):
  383. """
  384. :return: A partial structure for the text that is
  385. currently being parsed. The elements specified by the
  386. frontier have not yet been expanded or matched.
  387. :rtype: Tree
  388. """
  389. return self._tree
  390. def step(self):
  391. """
  392. Perform a single parsing operation. If an untried match is
  393. possible, then perform the match, and return the matched
  394. token. If an untried expansion is possible, then perform the
  395. expansion, and return the production that it is based on. If
  396. backtracking is possible, then backtrack, and return True.
  397. Otherwise, return None.
  398. :return: None if no operation was performed; a token if a match
  399. was performed; a production if an expansion was performed;
  400. and True if a backtrack operation was performed.
  401. :rtype: Production or String or bool
  402. """
  403. # Try matching (if we haven't already)
  404. if self.untried_match():
  405. token = self.match()
  406. if token is not None:
  407. return token
  408. # Try expanding.
  409. production = self.expand()
  410. if production is not None:
  411. return production
  412. # Try backtracking
  413. if self.backtrack():
  414. self._trace_backtrack(self._tree, self._frontier)
  415. return True
  416. # Nothing left to do.
  417. return None
  418. def expand(self, production=None):
  419. """
  420. Expand the first element of the frontier. In particular, if
  421. the first element of the frontier is a subtree whose node type
  422. is equal to ``production``'s left hand side, then add a child
  423. to that subtree for each element of ``production``'s right hand
  424. side. If ``production`` is not specified, then use the first
  425. untried expandable production. If all expandable productions
  426. have been tried, do nothing.
  427. :return: The production used to expand the frontier, if an
  428. expansion was performed. If no expansion was performed,
  429. return None.
  430. :rtype: Production or None
  431. """
  432. # Make sure we *can* expand.
  433. if len(self._frontier) == 0:
  434. return None
  435. if not isinstance(self._tree[self._frontier[0]], Tree):
  436. return None
  437. # If they didn't specify a production, check all untried ones.
  438. if production is None:
  439. productions = self.untried_expandable_productions()
  440. else:
  441. productions = [production]
  442. parses = []
  443. for prod in productions:
  444. # Record that we've tried this production now.
  445. self._tried_e.setdefault(self._freeze(self._tree), []).append(prod)
  446. # Try expanding.
  447. for _result in self._expand(self._rtext, self._tree, self._frontier, prod):
  448. return prod
  449. # We didn't expand anything.
  450. return None
  451. def match(self):
  452. """
  453. Match the first element of the frontier. In particular, if
  454. the first element of the frontier has the same type as the
  455. next text token, then substitute the text token into the tree.
  456. :return: The token matched, if a match operation was
  457. performed. If no match was performed, return None
  458. :rtype: str or None
  459. """
  460. # Record that we've tried matching this token.
  461. tok = self._rtext[0]
  462. self._tried_m.setdefault(self._freeze(self._tree), []).append(tok)
  463. # Make sure we *can* match.
  464. if len(self._frontier) == 0:
  465. return None
  466. if isinstance(self._tree[self._frontier[0]], Tree):
  467. return None
  468. for _result in self._match(self._rtext, self._tree, self._frontier):
  469. # Return the token we just matched.
  470. return self._history[-1][0][0]
  471. return None
  472. def backtrack(self):
  473. """
  474. Return the parser to its state before the most recent
  475. match or expand operation. Calling ``undo`` repeatedly return
  476. the parser to successively earlier states. If no match or
  477. expand operations have been performed, ``undo`` will make no
  478. changes.
  479. :return: true if an operation was successfully undone.
  480. :rtype: bool
  481. """
  482. if len(self._history) == 0:
  483. return False
  484. (self._rtext, self._tree, self._frontier) = self._history.pop()
  485. return True
  486. def expandable_productions(self):
  487. """
  488. :return: A list of all the productions for which expansions
  489. are available for the current parser state.
  490. :rtype: list(Production)
  491. """
  492. # Make sure we *can* expand.
  493. if len(self._frontier) == 0:
  494. return []
  495. frontier_child = self._tree[self._frontier[0]]
  496. if len(self._frontier) == 0 or not isinstance(frontier_child, Tree):
  497. return []
  498. return [
  499. p
  500. for p in self._grammar.productions()
  501. if p.lhs().symbol() == frontier_child.label()
  502. ]
  503. def untried_expandable_productions(self):
  504. """
  505. :return: A list of all the untried productions for which
  506. expansions are available for the current parser state.
  507. :rtype: list(Production)
  508. """
  509. tried_expansions = self._tried_e.get(self._freeze(self._tree), [])
  510. return [p for p in self.expandable_productions() if p not in tried_expansions]
  511. def untried_match(self):
  512. """
  513. :return: Whether the first element of the frontier is a token
  514. that has not yet been matched.
  515. :rtype: bool
  516. """
  517. if len(self._rtext) == 0:
  518. return False
  519. tried_matches = self._tried_m.get(self._freeze(self._tree), [])
  520. return self._rtext[0] not in tried_matches
  521. def currently_complete(self):
  522. """
  523. :return: Whether the parser's current state represents a
  524. complete parse.
  525. :rtype: bool
  526. """
  527. return len(self._frontier) == 0 and len(self._rtext) == 0
  528. def _parse(self, remaining_text, tree, frontier):
  529. """
  530. A stub version of ``_parse`` that sets the parsers current
  531. state to the given arguments. In ``RecursiveDescentParser``,
  532. the ``_parse`` method is used to recursively continue parsing a
  533. text. ``SteppingRecursiveDescentParser`` overrides it to
  534. capture these recursive calls. It records the parser's old
  535. state in the history (to allow for backtracking), and updates
  536. the parser's new state using the given arguments. Finally, it
  537. returns ``[1]``, which is used by ``match`` and ``expand`` to
  538. detect whether their operations were successful.
  539. :return: ``[1]``
  540. :rtype: list of int
  541. """
  542. self._history.append((self._rtext, self._tree, self._frontier))
  543. self._rtext = remaining_text
  544. self._tree = tree
  545. self._frontier = frontier
  546. # Is it a good parse? If so, record it.
  547. if len(frontier) == 0 and len(remaining_text) == 0:
  548. self._parses.append(tree)
  549. self._trace_succeed(self._tree, self._frontier)
  550. return [1]
  551. def parses(self):
  552. """
  553. :return: An iterator of the parses that have been found by this
  554. parser so far.
  555. :rtype: list of Tree
  556. """
  557. return iter(self._parses)
  558. def set_grammar(self, grammar):
  559. """
  560. Change the grammar used to parse texts.
  561. :param grammar: The new grammar.
  562. :type grammar: CFG
  563. """
  564. self._grammar = grammar
  565. ##//////////////////////////////////////////////////////
  566. ## Demonstration Code
  567. ##//////////////////////////////////////////////////////
  568. def demo():
  569. """
  570. A demonstration of the recursive descent parser.
  571. """
  572. from nltk import parse, CFG
  573. grammar = CFG.fromstring(
  574. """
  575. S -> NP VP
  576. NP -> Det N | Det N PP
  577. VP -> V NP | V NP PP
  578. PP -> P NP
  579. NP -> 'I'
  580. N -> 'man' | 'park' | 'telescope' | 'dog'
  581. Det -> 'the' | 'a'
  582. P -> 'in' | 'with'
  583. V -> 'saw'
  584. """
  585. )
  586. for prod in grammar.productions():
  587. print(prod)
  588. sent = "I saw a man in the park".split()
  589. parser = parse.RecursiveDescentParser(grammar, trace=2)
  590. for p in parser.parse(sent):
  591. print(p)
  592. if __name__ == "__main__":
  593. demo()