earleychart.py 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555
  1. # -*- coding: utf-8 -*-
  2. # Natural Language Toolkit: An Incremental Earley Chart Parser
  3. #
  4. # Copyright (C) 2001-2020 NLTK Project
  5. # Author: Peter Ljunglöf <peter.ljunglof@heatherleaf.se>
  6. # Rob Speer <rspeer@mit.edu>
  7. # Edward Loper <edloper@gmail.com>
  8. # Steven Bird <stevenbird1@gmail.com>
  9. # Jean Mark Gawron <gawron@mail.sdsu.edu>
  10. # URL: <http://nltk.org/>
  11. # For license information, see LICENSE.TXT
  12. """
  13. Data classes and parser implementations for *incremental* chart
  14. parsers, which use dynamic programming to efficiently parse a text.
  15. A "chart parser" derives parse trees for a text by iteratively adding
  16. \"edges\" to a \"chart\". Each "edge" represents a hypothesis about the tree
  17. structure for a subsequence of the text. The "chart" is a
  18. \"blackboard\" for composing and combining these hypotheses.
  19. A parser is "incremental", if it guarantees that for all i, j where i < j,
  20. all edges ending at i are built before any edges ending at j.
  21. This is appealing for, say, speech recognizer hypothesis filtering.
  22. The main parser class is ``EarleyChartParser``, which is a top-down
  23. algorithm, originally formulated by Jay Earley (1970).
  24. """
  25. from time import perf_counter
  26. from nltk.parse.chart import (
  27. Chart,
  28. ChartParser,
  29. EdgeI,
  30. LeafEdge,
  31. LeafInitRule,
  32. BottomUpPredictRule,
  33. BottomUpPredictCombineRule,
  34. TopDownInitRule,
  35. SingleEdgeFundamentalRule,
  36. EmptyPredictRule,
  37. CachedTopDownPredictRule,
  38. FilteredSingleEdgeFundamentalRule,
  39. FilteredBottomUpPredictCombineRule,
  40. )
  41. from nltk.parse.featurechart import (
  42. FeatureChart,
  43. FeatureChartParser,
  44. FeatureTopDownInitRule,
  45. FeatureTopDownPredictRule,
  46. FeatureEmptyPredictRule,
  47. FeatureBottomUpPredictRule,
  48. FeatureBottomUpPredictCombineRule,
  49. FeatureSingleEdgeFundamentalRule,
  50. )
  51. # ////////////////////////////////////////////////////////////
  52. # Incremental Chart
  53. # ////////////////////////////////////////////////////////////
  54. class IncrementalChart(Chart):
  55. def initialize(self):
  56. # A sequence of edge lists contained in this chart.
  57. self._edgelists = tuple([] for x in self._positions())
  58. # The set of child pointer lists associated with each edge.
  59. self._edge_to_cpls = {}
  60. # Indexes mapping attribute values to lists of edges
  61. # (used by select()).
  62. self._indexes = {}
  63. def edges(self):
  64. return list(self.iteredges())
  65. def iteredges(self):
  66. return (edge for edgelist in self._edgelists for edge in edgelist)
  67. def select(self, end, **restrictions):
  68. edgelist = self._edgelists[end]
  69. # If there are no restrictions, then return all edges.
  70. if restrictions == {}:
  71. return iter(edgelist)
  72. # Find the index corresponding to the given restrictions.
  73. restr_keys = sorted(restrictions.keys())
  74. restr_keys = tuple(restr_keys)
  75. # If it doesn't exist, then create it.
  76. if restr_keys not in self._indexes:
  77. self._add_index(restr_keys)
  78. vals = tuple(restrictions[key] for key in restr_keys)
  79. return iter(self._indexes[restr_keys][end].get(vals, []))
  80. def _add_index(self, restr_keys):
  81. # Make sure it's a valid index.
  82. for key in restr_keys:
  83. if not hasattr(EdgeI, key):
  84. raise ValueError("Bad restriction: %s" % key)
  85. # Create the index.
  86. index = self._indexes[restr_keys] = tuple({} for x in self._positions())
  87. # Add all existing edges to the index.
  88. for end, edgelist in enumerate(self._edgelists):
  89. this_index = index[end]
  90. for edge in edgelist:
  91. vals = tuple(getattr(edge, key)() for key in restr_keys)
  92. this_index.setdefault(vals, []).append(edge)
  93. def _register_with_indexes(self, edge):
  94. end = edge.end()
  95. for (restr_keys, index) in self._indexes.items():
  96. vals = tuple(getattr(edge, key)() for key in restr_keys)
  97. index[end].setdefault(vals, []).append(edge)
  98. def _append_edge(self, edge):
  99. self._edgelists[edge.end()].append(edge)
  100. def _positions(self):
  101. return range(self.num_leaves() + 1)
  102. class FeatureIncrementalChart(IncrementalChart, FeatureChart):
  103. def select(self, end, **restrictions):
  104. edgelist = self._edgelists[end]
  105. # If there are no restrictions, then return all edges.
  106. if restrictions == {}:
  107. return iter(edgelist)
  108. # Find the index corresponding to the given restrictions.
  109. restr_keys = sorted(restrictions.keys())
  110. restr_keys = tuple(restr_keys)
  111. # If it doesn't exist, then create it.
  112. if restr_keys not in self._indexes:
  113. self._add_index(restr_keys)
  114. vals = tuple(
  115. self._get_type_if_possible(restrictions[key]) for key in restr_keys
  116. )
  117. return iter(self._indexes[restr_keys][end].get(vals, []))
  118. def _add_index(self, restr_keys):
  119. # Make sure it's a valid index.
  120. for key in restr_keys:
  121. if not hasattr(EdgeI, key):
  122. raise ValueError("Bad restriction: %s" % key)
  123. # Create the index.
  124. index = self._indexes[restr_keys] = tuple({} for x in self._positions())
  125. # Add all existing edges to the index.
  126. for end, edgelist in enumerate(self._edgelists):
  127. this_index = index[end]
  128. for edge in edgelist:
  129. vals = tuple(
  130. self._get_type_if_possible(getattr(edge, key)())
  131. for key in restr_keys
  132. )
  133. this_index.setdefault(vals, []).append(edge)
  134. def _register_with_indexes(self, edge):
  135. end = edge.end()
  136. for (restr_keys, index) in self._indexes.items():
  137. vals = tuple(
  138. self._get_type_if_possible(getattr(edge, key)()) for key in restr_keys
  139. )
  140. index[end].setdefault(vals, []).append(edge)
  141. # ////////////////////////////////////////////////////////////
  142. # Incremental CFG Rules
  143. # ////////////////////////////////////////////////////////////
  144. class CompleteFundamentalRule(SingleEdgeFundamentalRule):
  145. def _apply_incomplete(self, chart, grammar, left_edge):
  146. end = left_edge.end()
  147. # When the chart is incremental, we only have to look for
  148. # empty complete edges here.
  149. for right_edge in chart.select(
  150. start=end, end=end, is_complete=True, lhs=left_edge.nextsym()
  151. ):
  152. new_edge = left_edge.move_dot_forward(right_edge.end())
  153. if chart.insert_with_backpointer(new_edge, left_edge, right_edge):
  154. yield new_edge
  155. class CompleterRule(CompleteFundamentalRule):
  156. _fundamental_rule = CompleteFundamentalRule()
  157. def apply(self, chart, grammar, edge):
  158. if not isinstance(edge, LeafEdge):
  159. for new_edge in self._fundamental_rule.apply(chart, grammar, edge):
  160. yield new_edge
  161. class ScannerRule(CompleteFundamentalRule):
  162. _fundamental_rule = CompleteFundamentalRule()
  163. def apply(self, chart, grammar, edge):
  164. if isinstance(edge, LeafEdge):
  165. for new_edge in self._fundamental_rule.apply(chart, grammar, edge):
  166. yield new_edge
  167. class PredictorRule(CachedTopDownPredictRule):
  168. pass
  169. class FilteredCompleteFundamentalRule(FilteredSingleEdgeFundamentalRule):
  170. def apply(self, chart, grammar, edge):
  171. # Since the Filtered rule only works for grammars without empty productions,
  172. # we only have to bother with complete edges here.
  173. if edge.is_complete():
  174. for new_edge in self._apply_complete(chart, grammar, edge):
  175. yield new_edge
  176. # ////////////////////////////////////////////////////////////
  177. # Incremental FCFG Rules
  178. # ////////////////////////////////////////////////////////////
  179. class FeatureCompleteFundamentalRule(FeatureSingleEdgeFundamentalRule):
  180. def _apply_incomplete(self, chart, grammar, left_edge):
  181. fr = self._fundamental_rule
  182. end = left_edge.end()
  183. # When the chart is incremental, we only have to look for
  184. # empty complete edges here.
  185. for right_edge in chart.select(
  186. start=end, end=end, is_complete=True, lhs=left_edge.nextsym()
  187. ):
  188. for new_edge in fr.apply(chart, grammar, left_edge, right_edge):
  189. yield new_edge
  190. class FeatureCompleterRule(CompleterRule):
  191. _fundamental_rule = FeatureCompleteFundamentalRule()
  192. class FeatureScannerRule(ScannerRule):
  193. _fundamental_rule = FeatureCompleteFundamentalRule()
  194. class FeaturePredictorRule(FeatureTopDownPredictRule):
  195. pass
  196. # ////////////////////////////////////////////////////////////
  197. # Incremental CFG Chart Parsers
  198. # ////////////////////////////////////////////////////////////
  199. EARLEY_STRATEGY = [
  200. LeafInitRule(),
  201. TopDownInitRule(),
  202. CompleterRule(),
  203. ScannerRule(),
  204. PredictorRule(),
  205. ]
  206. TD_INCREMENTAL_STRATEGY = [
  207. LeafInitRule(),
  208. TopDownInitRule(),
  209. CachedTopDownPredictRule(),
  210. CompleteFundamentalRule(),
  211. ]
  212. BU_INCREMENTAL_STRATEGY = [
  213. LeafInitRule(),
  214. EmptyPredictRule(),
  215. BottomUpPredictRule(),
  216. CompleteFundamentalRule(),
  217. ]
  218. BU_LC_INCREMENTAL_STRATEGY = [
  219. LeafInitRule(),
  220. EmptyPredictRule(),
  221. BottomUpPredictCombineRule(),
  222. CompleteFundamentalRule(),
  223. ]
  224. LC_INCREMENTAL_STRATEGY = [
  225. LeafInitRule(),
  226. FilteredBottomUpPredictCombineRule(),
  227. FilteredCompleteFundamentalRule(),
  228. ]
  229. class IncrementalChartParser(ChartParser):
  230. """
  231. An *incremental* chart parser implementing Jay Earley's
  232. parsing algorithm:
  233. | For each index end in [0, 1, ..., N]:
  234. | For each edge such that edge.end = end:
  235. | If edge is incomplete and edge.next is not a part of speech:
  236. | Apply PredictorRule to edge
  237. | If edge is incomplete and edge.next is a part of speech:
  238. | Apply ScannerRule to edge
  239. | If edge is complete:
  240. | Apply CompleterRule to edge
  241. | Return any complete parses in the chart
  242. """
  243. def __init__(
  244. self,
  245. grammar,
  246. strategy=BU_LC_INCREMENTAL_STRATEGY,
  247. trace=0,
  248. trace_chart_width=50,
  249. chart_class=IncrementalChart,
  250. ):
  251. """
  252. Create a new Earley chart parser, that uses ``grammar`` to
  253. parse texts.
  254. :type grammar: CFG
  255. :param grammar: The grammar used to parse texts.
  256. :type trace: int
  257. :param trace: The level of tracing that should be used when
  258. parsing a text. ``0`` will generate no tracing output;
  259. and higher numbers will produce more verbose tracing
  260. output.
  261. :type trace_chart_width: int
  262. :param trace_chart_width: The default total width reserved for
  263. the chart in trace output. The remainder of each line will
  264. be used to display edges.
  265. :param chart_class: The class that should be used to create
  266. the charts used by this parser.
  267. """
  268. self._grammar = grammar
  269. self._trace = trace
  270. self._trace_chart_width = trace_chart_width
  271. self._chart_class = chart_class
  272. self._axioms = []
  273. self._inference_rules = []
  274. for rule in strategy:
  275. if rule.NUM_EDGES == 0:
  276. self._axioms.append(rule)
  277. elif rule.NUM_EDGES == 1:
  278. self._inference_rules.append(rule)
  279. else:
  280. raise ValueError(
  281. "Incremental inference rules must have " "NUM_EDGES == 0 or 1"
  282. )
  283. def chart_parse(self, tokens, trace=None):
  284. if trace is None:
  285. trace = self._trace
  286. trace_new_edges = self._trace_new_edges
  287. tokens = list(tokens)
  288. self._grammar.check_coverage(tokens)
  289. chart = self._chart_class(tokens)
  290. grammar = self._grammar
  291. # Width, for printing trace edges.
  292. trace_edge_width = self._trace_chart_width // (chart.num_leaves() + 1)
  293. if trace:
  294. print(chart.pretty_format_leaves(trace_edge_width))
  295. for axiom in self._axioms:
  296. new_edges = list(axiom.apply(chart, grammar))
  297. trace_new_edges(chart, axiom, new_edges, trace, trace_edge_width)
  298. inference_rules = self._inference_rules
  299. for end in range(chart.num_leaves() + 1):
  300. if trace > 1:
  301. print("\n* Processing queue:", end, "\n")
  302. agenda = list(chart.select(end=end))
  303. while agenda:
  304. edge = agenda.pop()
  305. for rule in inference_rules:
  306. new_edges = list(rule.apply(chart, grammar, edge))
  307. trace_new_edges(chart, rule, new_edges, trace, trace_edge_width)
  308. for new_edge in new_edges:
  309. if new_edge.end() == end:
  310. agenda.append(new_edge)
  311. return chart
  312. class EarleyChartParser(IncrementalChartParser):
  313. def __init__(self, grammar, **parser_args):
  314. IncrementalChartParser.__init__(self, grammar, EARLEY_STRATEGY, **parser_args)
  315. class IncrementalTopDownChartParser(IncrementalChartParser):
  316. def __init__(self, grammar, **parser_args):
  317. IncrementalChartParser.__init__(
  318. self, grammar, TD_INCREMENTAL_STRATEGY, **parser_args
  319. )
  320. class IncrementalBottomUpChartParser(IncrementalChartParser):
  321. def __init__(self, grammar, **parser_args):
  322. IncrementalChartParser.__init__(
  323. self, grammar, BU_INCREMENTAL_STRATEGY, **parser_args
  324. )
  325. class IncrementalBottomUpLeftCornerChartParser(IncrementalChartParser):
  326. def __init__(self, grammar, **parser_args):
  327. IncrementalChartParser.__init__(
  328. self, grammar, BU_LC_INCREMENTAL_STRATEGY, **parser_args
  329. )
  330. class IncrementalLeftCornerChartParser(IncrementalChartParser):
  331. def __init__(self, grammar, **parser_args):
  332. if not grammar.is_nonempty():
  333. raise ValueError(
  334. "IncrementalLeftCornerParser only works for grammars "
  335. "without empty productions."
  336. )
  337. IncrementalChartParser.__init__(
  338. self, grammar, LC_INCREMENTAL_STRATEGY, **parser_args
  339. )
  340. # ////////////////////////////////////////////////////////////
  341. # Incremental FCFG Chart Parsers
  342. # ////////////////////////////////////////////////////////////
  343. EARLEY_FEATURE_STRATEGY = [
  344. LeafInitRule(),
  345. FeatureTopDownInitRule(),
  346. FeatureCompleterRule(),
  347. FeatureScannerRule(),
  348. FeaturePredictorRule(),
  349. ]
  350. TD_INCREMENTAL_FEATURE_STRATEGY = [
  351. LeafInitRule(),
  352. FeatureTopDownInitRule(),
  353. FeatureTopDownPredictRule(),
  354. FeatureCompleteFundamentalRule(),
  355. ]
  356. BU_INCREMENTAL_FEATURE_STRATEGY = [
  357. LeafInitRule(),
  358. FeatureEmptyPredictRule(),
  359. FeatureBottomUpPredictRule(),
  360. FeatureCompleteFundamentalRule(),
  361. ]
  362. BU_LC_INCREMENTAL_FEATURE_STRATEGY = [
  363. LeafInitRule(),
  364. FeatureEmptyPredictRule(),
  365. FeatureBottomUpPredictCombineRule(),
  366. FeatureCompleteFundamentalRule(),
  367. ]
  368. class FeatureIncrementalChartParser(IncrementalChartParser, FeatureChartParser):
  369. def __init__(
  370. self,
  371. grammar,
  372. strategy=BU_LC_INCREMENTAL_FEATURE_STRATEGY,
  373. trace_chart_width=20,
  374. chart_class=FeatureIncrementalChart,
  375. **parser_args
  376. ):
  377. IncrementalChartParser.__init__(
  378. self,
  379. grammar,
  380. strategy=strategy,
  381. trace_chart_width=trace_chart_width,
  382. chart_class=chart_class,
  383. **parser_args
  384. )
  385. class FeatureEarleyChartParser(FeatureIncrementalChartParser):
  386. def __init__(self, grammar, **parser_args):
  387. FeatureIncrementalChartParser.__init__(
  388. self, grammar, EARLEY_FEATURE_STRATEGY, **parser_args
  389. )
  390. class FeatureIncrementalTopDownChartParser(FeatureIncrementalChartParser):
  391. def __init__(self, grammar, **parser_args):
  392. FeatureIncrementalChartParser.__init__(
  393. self, grammar, TD_INCREMENTAL_FEATURE_STRATEGY, **parser_args
  394. )
  395. class FeatureIncrementalBottomUpChartParser(FeatureIncrementalChartParser):
  396. def __init__(self, grammar, **parser_args):
  397. FeatureIncrementalChartParser.__init__(
  398. self, grammar, BU_INCREMENTAL_FEATURE_STRATEGY, **parser_args
  399. )
  400. class FeatureIncrementalBottomUpLeftCornerChartParser(FeatureIncrementalChartParser):
  401. def __init__(self, grammar, **parser_args):
  402. FeatureIncrementalChartParser.__init__(
  403. self, grammar, BU_LC_INCREMENTAL_FEATURE_STRATEGY, **parser_args
  404. )
  405. # ////////////////////////////////////////////////////////////
  406. # Demonstration
  407. # ////////////////////////////////////////////////////////////
  408. def demo(
  409. print_times=True,
  410. print_grammar=False,
  411. print_trees=True,
  412. trace=2,
  413. sent="I saw John with a dog with my cookie",
  414. numparses=5,
  415. ):
  416. """
  417. A demonstration of the Earley parsers.
  418. """
  419. import sys, time
  420. from nltk.parse.chart import demo_grammar
  421. # The grammar for ChartParser and SteppingChartParser:
  422. grammar = demo_grammar()
  423. if print_grammar:
  424. print("* Grammar")
  425. print(grammar)
  426. # Tokenize the sample sentence.
  427. print("* Sentence:")
  428. print(sent)
  429. tokens = sent.split()
  430. print(tokens)
  431. print()
  432. # Do the parsing.
  433. earley = EarleyChartParser(grammar, trace=trace)
  434. t = perf_counter()
  435. chart = earley.chart_parse(tokens)
  436. parses = list(chart.parses(grammar.start()))
  437. t = perf_counter() - t
  438. # Print results.
  439. if numparses:
  440. assert len(parses) == numparses, "Not all parses found"
  441. if print_trees:
  442. for tree in parses:
  443. print(tree)
  444. else:
  445. print("Nr trees:", len(parses))
  446. if print_times:
  447. print("Time:", t)
  448. if __name__ == "__main__":
  449. demo()