featurechart.py 21 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677
  1. # -*- coding: utf-8 -*-
  2. # Natural Language Toolkit: Chart Parser for Feature-Based Grammars
  3. #
  4. # Copyright (C) 2001-2020 NLTK Project
  5. # Author: Rob Speer <rspeer@mit.edu>
  6. # Peter Ljunglöf <peter.ljunglof@heatherleaf.se>
  7. # URL: <http://nltk.org/>
  8. # For license information, see LICENSE.TXT
  9. """
  10. Extension of chart parsing implementation to handle grammars with
  11. feature structures as nodes.
  12. """
  13. from time import perf_counter
  14. from nltk.featstruct import FeatStruct, unify, TYPE, find_variables
  15. from nltk.sem import logic
  16. from nltk.tree import Tree
  17. from nltk.grammar import (
  18. Nonterminal,
  19. Production,
  20. CFG,
  21. FeatStructNonterminal,
  22. is_nonterminal,
  23. is_terminal,
  24. )
  25. from nltk.parse.chart import (
  26. TreeEdge,
  27. Chart,
  28. ChartParser,
  29. EdgeI,
  30. FundamentalRule,
  31. LeafInitRule,
  32. EmptyPredictRule,
  33. BottomUpPredictRule,
  34. SingleEdgeFundamentalRule,
  35. BottomUpPredictCombineRule,
  36. CachedTopDownPredictRule,
  37. TopDownInitRule,
  38. )
  39. # ////////////////////////////////////////////////////////////
  40. # Tree Edge
  41. # ////////////////////////////////////////////////////////////
  42. class FeatureTreeEdge(TreeEdge):
  43. """
  44. A specialized tree edge that allows shared variable bindings
  45. between nonterminals on the left-hand side and right-hand side.
  46. Each ``FeatureTreeEdge`` contains a set of ``bindings``, i.e., a
  47. dictionary mapping from variables to values. If the edge is not
  48. complete, then these bindings are simply stored. However, if the
  49. edge is complete, then the constructor applies these bindings to
  50. every nonterminal in the edge whose symbol implements the
  51. interface ``SubstituteBindingsI``.
  52. """
  53. def __init__(self, span, lhs, rhs, dot=0, bindings=None):
  54. """
  55. Construct a new edge. If the edge is incomplete (i.e., if
  56. ``dot<len(rhs)``), then store the bindings as-is. If the edge
  57. is complete (i.e., if ``dot==len(rhs)``), then apply the
  58. bindings to all nonterminals in ``lhs`` and ``rhs``, and then
  59. clear the bindings. See ``TreeEdge`` for a description of
  60. the other arguments.
  61. """
  62. if bindings is None:
  63. bindings = {}
  64. # If the edge is complete, then substitute in the bindings,
  65. # and then throw them away. (If we didn't throw them away, we
  66. # might think that 2 complete edges are different just because
  67. # they have different bindings, even though all bindings have
  68. # already been applied.)
  69. if dot == len(rhs) and bindings:
  70. lhs = self._bind(lhs, bindings)
  71. rhs = [self._bind(elt, bindings) for elt in rhs]
  72. bindings = {}
  73. # Initialize the edge.
  74. TreeEdge.__init__(self, span, lhs, rhs, dot)
  75. self._bindings = bindings
  76. self._comparison_key = (self._comparison_key, tuple(sorted(bindings.items())))
  77. @staticmethod
  78. def from_production(production, index):
  79. """
  80. :return: A new ``TreeEdge`` formed from the given production.
  81. The new edge's left-hand side and right-hand side will
  82. be taken from ``production``; its span will be
  83. ``(index,index)``; and its dot position will be ``0``.
  84. :rtype: TreeEdge
  85. """
  86. return FeatureTreeEdge(
  87. span=(index, index), lhs=production.lhs(), rhs=production.rhs(), dot=0
  88. )
  89. def move_dot_forward(self, new_end, bindings=None):
  90. """
  91. :return: A new ``FeatureTreeEdge`` formed from this edge.
  92. The new edge's dot position is increased by ``1``,
  93. and its end index will be replaced by ``new_end``.
  94. :rtype: FeatureTreeEdge
  95. :param new_end: The new end index.
  96. :type new_end: int
  97. :param bindings: Bindings for the new edge.
  98. :type bindings: dict
  99. """
  100. return FeatureTreeEdge(
  101. span=(self._span[0], new_end),
  102. lhs=self._lhs,
  103. rhs=self._rhs,
  104. dot=self._dot + 1,
  105. bindings=bindings,
  106. )
  107. def _bind(self, nt, bindings):
  108. if not isinstance(nt, FeatStructNonterminal):
  109. return nt
  110. return nt.substitute_bindings(bindings)
  111. def next_with_bindings(self):
  112. return self._bind(self.nextsym(), self._bindings)
  113. def bindings(self):
  114. """
  115. Return a copy of this edge's bindings dictionary.
  116. """
  117. return self._bindings.copy()
  118. def variables(self):
  119. """
  120. :return: The set of variables used by this edge.
  121. :rtype: set(Variable)
  122. """
  123. return find_variables(
  124. [self._lhs]
  125. + list(self._rhs)
  126. + list(self._bindings.keys())
  127. + list(self._bindings.values()),
  128. fs_class=FeatStruct,
  129. )
  130. def __str__(self):
  131. if self.is_complete():
  132. return super().__str__()
  133. else:
  134. bindings = "{%s}" % ", ".join(
  135. "%s: %r" % item for item in sorted(self._bindings.items())
  136. )
  137. return "%s %s" % (super().__str__(), bindings)
  138. # ////////////////////////////////////////////////////////////
  139. # A specialized Chart for feature grammars
  140. # ////////////////////////////////////////////////////////////
  141. # TODO: subsumes check when adding new edges
  142. class FeatureChart(Chart):
  143. """
  144. A Chart for feature grammars.
  145. :see: ``Chart`` for more information.
  146. """
  147. def select(self, **restrictions):
  148. """
  149. Returns an iterator over the edges in this chart.
  150. See ``Chart.select`` for more information about the
  151. ``restrictions`` on the edges.
  152. """
  153. # If there are no restrictions, then return all edges.
  154. if restrictions == {}:
  155. return iter(self._edges)
  156. # Find the index corresponding to the given restrictions.
  157. restr_keys = sorted(restrictions.keys())
  158. restr_keys = tuple(restr_keys)
  159. # If it doesn't exist, then create it.
  160. if restr_keys not in self._indexes:
  161. self._add_index(restr_keys)
  162. vals = tuple(
  163. self._get_type_if_possible(restrictions[key]) for key in restr_keys
  164. )
  165. return iter(self._indexes[restr_keys].get(vals, []))
  166. def _add_index(self, restr_keys):
  167. """
  168. A helper function for ``select``, which creates a new index for
  169. a given set of attributes (aka restriction keys).
  170. """
  171. # Make sure it's a valid index.
  172. for key in restr_keys:
  173. if not hasattr(EdgeI, key):
  174. raise ValueError("Bad restriction: %s" % key)
  175. # Create the index.
  176. index = self._indexes[restr_keys] = {}
  177. # Add all existing edges to the index.
  178. for edge in self._edges:
  179. vals = tuple(
  180. self._get_type_if_possible(getattr(edge, key)()) for key in restr_keys
  181. )
  182. index.setdefault(vals, []).append(edge)
  183. def _register_with_indexes(self, edge):
  184. """
  185. A helper function for ``insert``, which registers the new
  186. edge with all existing indexes.
  187. """
  188. for (restr_keys, index) in self._indexes.items():
  189. vals = tuple(
  190. self._get_type_if_possible(getattr(edge, key)()) for key in restr_keys
  191. )
  192. index.setdefault(vals, []).append(edge)
  193. def _get_type_if_possible(self, item):
  194. """
  195. Helper function which returns the ``TYPE`` feature of the ``item``,
  196. if it exists, otherwise it returns the ``item`` itself
  197. """
  198. if isinstance(item, dict) and TYPE in item:
  199. return item[TYPE]
  200. else:
  201. return item
  202. def parses(self, start, tree_class=Tree):
  203. for edge in self.select(start=0, end=self._num_leaves):
  204. if (
  205. (isinstance(edge, FeatureTreeEdge))
  206. and (edge.lhs()[TYPE] == start[TYPE])
  207. and (unify(edge.lhs(), start, rename_vars=True))
  208. ):
  209. for tree in self.trees(edge, complete=True, tree_class=tree_class):
  210. yield tree
  211. # ////////////////////////////////////////////////////////////
  212. # Fundamental Rule
  213. # ////////////////////////////////////////////////////////////
  214. class FeatureFundamentalRule(FundamentalRule):
  215. """
  216. A specialized version of the fundamental rule that operates on
  217. nonterminals whose symbols are ``FeatStructNonterminal``s. Rather
  218. tha simply comparing the nonterminals for equality, they are
  219. unified. Variable bindings from these unifications are collected
  220. and stored in the chart using a ``FeatureTreeEdge``. When a
  221. complete edge is generated, these bindings are applied to all
  222. nonterminals in the edge.
  223. The fundamental rule states that:
  224. - ``[A -> alpha \* B1 beta][i:j]``
  225. - ``[B2 -> gamma \*][j:k]``
  226. licenses the edge:
  227. - ``[A -> alpha B3 \* beta][i:j]``
  228. assuming that B1 and B2 can be unified to generate B3.
  229. """
  230. def apply(self, chart, grammar, left_edge, right_edge):
  231. # Make sure the rule is applicable.
  232. if not (
  233. left_edge.end() == right_edge.start()
  234. and left_edge.is_incomplete()
  235. and right_edge.is_complete()
  236. and isinstance(left_edge, FeatureTreeEdge)
  237. ):
  238. return
  239. found = right_edge.lhs()
  240. nextsym = left_edge.nextsym()
  241. if isinstance(right_edge, FeatureTreeEdge):
  242. if not is_nonterminal(nextsym):
  243. return
  244. if left_edge.nextsym()[TYPE] != right_edge.lhs()[TYPE]:
  245. return
  246. # Create a copy of the bindings.
  247. bindings = left_edge.bindings()
  248. # We rename vars here, because we don't want variables
  249. # from the two different productions to match.
  250. found = found.rename_variables(used_vars=left_edge.variables())
  251. # Unify B1 (left_edge.nextsym) with B2 (right_edge.lhs) to
  252. # generate B3 (result).
  253. result = unify(nextsym, found, bindings, rename_vars=False)
  254. if result is None:
  255. return
  256. else:
  257. if nextsym != found:
  258. return
  259. # Create a copy of the bindings.
  260. bindings = left_edge.bindings()
  261. # Construct the new edge.
  262. new_edge = left_edge.move_dot_forward(right_edge.end(), bindings)
  263. # Add it to the chart, with appropriate child pointers.
  264. if chart.insert_with_backpointer(new_edge, left_edge, right_edge):
  265. yield new_edge
  266. class FeatureSingleEdgeFundamentalRule(SingleEdgeFundamentalRule):
  267. """
  268. A specialized version of the completer / single edge fundamental rule
  269. that operates on nonterminals whose symbols are ``FeatStructNonterminal``s.
  270. Rather than simply comparing the nonterminals for equality, they are
  271. unified.
  272. """
  273. _fundamental_rule = FeatureFundamentalRule()
  274. def _apply_complete(self, chart, grammar, right_edge):
  275. fr = self._fundamental_rule
  276. for left_edge in chart.select(
  277. end=right_edge.start(), is_complete=False, nextsym=right_edge.lhs()
  278. ):
  279. for new_edge in fr.apply(chart, grammar, left_edge, right_edge):
  280. yield new_edge
  281. def _apply_incomplete(self, chart, grammar, left_edge):
  282. fr = self._fundamental_rule
  283. for right_edge in chart.select(
  284. start=left_edge.end(), is_complete=True, lhs=left_edge.nextsym()
  285. ):
  286. for new_edge in fr.apply(chart, grammar, left_edge, right_edge):
  287. yield new_edge
  288. # ////////////////////////////////////////////////////////////
  289. # Top-Down Prediction
  290. # ////////////////////////////////////////////////////////////
  291. class FeatureTopDownInitRule(TopDownInitRule):
  292. def apply(self, chart, grammar):
  293. for prod in grammar.productions(lhs=grammar.start()):
  294. new_edge = FeatureTreeEdge.from_production(prod, 0)
  295. if chart.insert(new_edge, ()):
  296. yield new_edge
  297. class FeatureTopDownPredictRule(CachedTopDownPredictRule):
  298. """
  299. A specialized version of the (cached) top down predict rule that operates
  300. on nonterminals whose symbols are ``FeatStructNonterminal``s. Rather
  301. than simply comparing the nonterminals for equality, they are
  302. unified.
  303. The top down expand rule states that:
  304. - ``[A -> alpha \* B1 beta][i:j]``
  305. licenses the edge:
  306. - ``[B2 -> \* gamma][j:j]``
  307. for each grammar production ``B2 -> gamma``, assuming that B1
  308. and B2 can be unified.
  309. """
  310. def apply(self, chart, grammar, edge):
  311. if edge.is_complete():
  312. return
  313. nextsym, index = edge.nextsym(), edge.end()
  314. if not is_nonterminal(nextsym):
  315. return
  316. # If we've already applied this rule to an edge with the same
  317. # next & end, and the chart & grammar have not changed, then
  318. # just return (no new edges to add).
  319. nextsym_with_bindings = edge.next_with_bindings()
  320. done = self._done.get((nextsym_with_bindings, index), (None, None))
  321. if done[0] is chart and done[1] is grammar:
  322. return
  323. for prod in grammar.productions(lhs=nextsym):
  324. # If the left corner in the predicted production is
  325. # leaf, it must match with the input.
  326. if prod.rhs():
  327. first = prod.rhs()[0]
  328. if is_terminal(first):
  329. if index >= chart.num_leaves():
  330. continue
  331. if first != chart.leaf(index):
  332. continue
  333. # We rename vars here, because we don't want variables
  334. # from the two different productions to match.
  335. if unify(prod.lhs(), nextsym_with_bindings, rename_vars=True):
  336. new_edge = FeatureTreeEdge.from_production(prod, edge.end())
  337. if chart.insert(new_edge, ()):
  338. yield new_edge
  339. # Record the fact that we've applied this rule.
  340. self._done[nextsym_with_bindings, index] = (chart, grammar)
  341. # ////////////////////////////////////////////////////////////
  342. # Bottom-Up Prediction
  343. # ////////////////////////////////////////////////////////////
  344. class FeatureBottomUpPredictRule(BottomUpPredictRule):
  345. def apply(self, chart, grammar, edge):
  346. if edge.is_incomplete():
  347. return
  348. for prod in grammar.productions(rhs=edge.lhs()):
  349. if isinstance(edge, FeatureTreeEdge):
  350. _next = prod.rhs()[0]
  351. if not is_nonterminal(_next):
  352. continue
  353. new_edge = FeatureTreeEdge.from_production(prod, edge.start())
  354. if chart.insert(new_edge, ()):
  355. yield new_edge
  356. class FeatureBottomUpPredictCombineRule(BottomUpPredictCombineRule):
  357. def apply(self, chart, grammar, edge):
  358. if edge.is_incomplete():
  359. return
  360. found = edge.lhs()
  361. for prod in grammar.productions(rhs=found):
  362. bindings = {}
  363. if isinstance(edge, FeatureTreeEdge):
  364. _next = prod.rhs()[0]
  365. if not is_nonterminal(_next):
  366. continue
  367. # We rename vars here, because we don't want variables
  368. # from the two different productions to match.
  369. used_vars = find_variables(
  370. (prod.lhs(),) + prod.rhs(), fs_class=FeatStruct
  371. )
  372. found = found.rename_variables(used_vars=used_vars)
  373. result = unify(_next, found, bindings, rename_vars=False)
  374. if result is None:
  375. continue
  376. new_edge = FeatureTreeEdge.from_production(
  377. prod, edge.start()
  378. ).move_dot_forward(edge.end(), bindings)
  379. if chart.insert(new_edge, (edge,)):
  380. yield new_edge
  381. class FeatureEmptyPredictRule(EmptyPredictRule):
  382. def apply(self, chart, grammar):
  383. for prod in grammar.productions(empty=True):
  384. for index in range(chart.num_leaves() + 1):
  385. new_edge = FeatureTreeEdge.from_production(prod, index)
  386. if chart.insert(new_edge, ()):
  387. yield new_edge
  388. # ////////////////////////////////////////////////////////////
  389. # Feature Chart Parser
  390. # ////////////////////////////////////////////////////////////
# Rule list passed as the strategy by ``FeatureTopDownChartParser``.
TD_FEATURE_STRATEGY = [
    LeafInitRule(),
    FeatureTopDownInitRule(),
    FeatureTopDownPredictRule(),
    FeatureSingleEdgeFundamentalRule(),
]
# Rule list passed as the strategy by ``FeatureBottomUpChartParser``.
BU_FEATURE_STRATEGY = [
    LeafInitRule(),
    FeatureEmptyPredictRule(),
    FeatureBottomUpPredictRule(),
    FeatureSingleEdgeFundamentalRule(),
]
# Rule list passed as the strategy by ``FeatureBottomUpLeftCornerChartParser``;
# also the default strategy of ``FeatureChartParser``.
BU_LC_FEATURE_STRATEGY = [
    LeafInitRule(),
    FeatureEmptyPredictRule(),
    FeatureBottomUpPredictCombineRule(),
    FeatureSingleEdgeFundamentalRule(),
]
  409. class FeatureChartParser(ChartParser):
  410. def __init__(
  411. self,
  412. grammar,
  413. strategy=BU_LC_FEATURE_STRATEGY,
  414. trace_chart_width=20,
  415. chart_class=FeatureChart,
  416. **parser_args
  417. ):
  418. ChartParser.__init__(
  419. self,
  420. grammar,
  421. strategy=strategy,
  422. trace_chart_width=trace_chart_width,
  423. chart_class=chart_class,
  424. **parser_args
  425. )
  426. class FeatureTopDownChartParser(FeatureChartParser):
  427. def __init__(self, grammar, **parser_args):
  428. FeatureChartParser.__init__(self, grammar, TD_FEATURE_STRATEGY, **parser_args)
  429. class FeatureBottomUpChartParser(FeatureChartParser):
  430. def __init__(self, grammar, **parser_args):
  431. FeatureChartParser.__init__(self, grammar, BU_FEATURE_STRATEGY, **parser_args)
  432. class FeatureBottomUpLeftCornerChartParser(FeatureChartParser):
  433. def __init__(self, grammar, **parser_args):
  434. FeatureChartParser.__init__(
  435. self, grammar, BU_LC_FEATURE_STRATEGY, **parser_args
  436. )
  437. # ////////////////////////////////////////////////////////////
  438. # Instantiate Variable Chart
  439. # ////////////////////////////////////////////////////////////
  440. class InstantiateVarsChart(FeatureChart):
  441. """
  442. A specialized chart that 'instantiates' variables whose names
  443. start with '@', by replacing them with unique new variables.
  444. In particular, whenever a complete edge is added to the chart, any
  445. variables in the edge's ``lhs`` whose names start with '@' will be
  446. replaced by unique new ``Variable``s.
  447. """
  448. def __init__(self, tokens):
  449. FeatureChart.__init__(self, tokens)
  450. def initialize(self):
  451. self._instantiated = set()
  452. FeatureChart.initialize(self)
  453. def insert(self, edge, child_pointer_list):
  454. if edge in self._instantiated:
  455. return False
  456. self.instantiate_edge(edge)
  457. return FeatureChart.insert(self, edge, child_pointer_list)
  458. def instantiate_edge(self, edge):
  459. """
  460. If the edge is a ``FeatureTreeEdge``, and it is complete,
  461. then instantiate all variables whose names start with '@',
  462. by replacing them with unique new variables.
  463. Note that instantiation is done in-place, since the
  464. parsing algorithms might already hold a reference to
  465. the edge for future use.
  466. """
  467. # If the edge is a leaf, or is not complete, or is
  468. # already in the chart, then just return it as-is.
  469. if not isinstance(edge, FeatureTreeEdge):
  470. return
  471. if not edge.is_complete():
  472. return
  473. if edge in self._edge_to_cpls:
  474. return
  475. # Get a list of variables that need to be instantiated.
  476. # If there are none, then return as-is.
  477. inst_vars = self.inst_vars(edge)
  478. if not inst_vars:
  479. return
  480. # Instantiate the edge!
  481. self._instantiated.add(edge)
  482. edge._lhs = edge.lhs().substitute_bindings(inst_vars)
  483. def inst_vars(self, edge):
  484. return dict(
  485. (var, logic.unique_variable())
  486. for var in edge.lhs().variables()
  487. if var.name.startswith("@")
  488. )
  489. # ////////////////////////////////////////////////////////////
  490. # Demo
  491. # ////////////////////////////////////////////////////////////
  492. def demo_grammar():
  493. from nltk.grammar import FeatureGrammar
  494. return FeatureGrammar.fromstring(
  495. """
  496. S -> NP VP
  497. PP -> Prep NP
  498. NP -> NP PP
  499. VP -> VP PP
  500. VP -> Verb NP
  501. VP -> Verb
  502. NP -> Det[pl=?x] Noun[pl=?x]
  503. NP -> "John"
  504. NP -> "I"
  505. Det -> "the"
  506. Det -> "my"
  507. Det[-pl] -> "a"
  508. Noun[-pl] -> "dog"
  509. Noun[-pl] -> "cookie"
  510. Verb -> "ate"
  511. Verb -> "saw"
  512. Prep -> "with"
  513. Prep -> "under"
  514. """
  515. )
  516. def demo(
  517. print_times=True,
  518. print_grammar=True,
  519. print_trees=True,
  520. print_sentence=True,
  521. trace=1,
  522. parser=FeatureChartParser,
  523. sent="I saw John with a dog with my cookie",
  524. ):
  525. import sys, time
  526. print()
  527. grammar = demo_grammar()
  528. if print_grammar:
  529. print(grammar)
  530. print()
  531. print("*", parser.__name__)
  532. if print_sentence:
  533. print("Sentence:", sent)
  534. tokens = sent.split()
  535. t = perf_counter()
  536. cp = parser(grammar, trace=trace)
  537. chart = cp.chart_parse(tokens)
  538. trees = list(chart.parses(grammar.start()))
  539. if print_times:
  540. print("Time: %s" % (perf_counter() - t))
  541. if print_trees:
  542. for tree in trees:
  543. print(tree)
  544. else:
  545. print("Nr trees:", len(trees))
  546. def run_profile():
  547. import profile
  548. profile.run("for i in range(1): demo()", "/tmp/profile.out")
  549. import pstats
  550. p = pstats.Stats("/tmp/profile.out")
  551. p.strip_dirs().sort_stats("time", "cum").print_stats(60)
  552. p.strip_dirs().sort_stats("cum", "time").print_stats(60)
  553. if __name__ == "__main__":
  554. from nltk.data import load
  555. demo()
  556. print()
  557. grammar = load("grammars/book_grammars/feat0.fcfg")
  558. cp = FeatureChartParser(grammar, trace=2)
  559. sent = "Kim likes children"
  560. tokens = sent.split()
  561. trees = cp.parse(tokens)
  562. for tree in trees:
  563. print(tree)