pchart.py 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578
  1. # Natural Language Toolkit: Probabilistic Chart Parsers
  2. #
  3. # Copyright (C) 2001-2020 NLTK Project
  4. # Author: Edward Loper <edloper@gmail.com>
  5. # Steven Bird <stevenbird1@gmail.com>
  6. # URL: <http://nltk.org/>
  7. # For license information, see LICENSE.TXT
  8. """
  9. Classes and interfaces for associating probabilities with tree
  10. structures that represent the internal organization of a text. The
  11. probabilistic parser module defines ``BottomUpProbabilisticChartParser``.
  12. ``BottomUpProbabilisticChartParser`` is an abstract class that implements
  13. a bottom-up chart parser for ``PCFG`` grammars. It maintains a queue of edges,
  14. and adds them to the chart one at a time. The ordering of this queue
  15. is based on the probabilities associated with the edges, allowing the
  16. parser to expand more likely edges before less likely ones. Each
  17. subclass implements a different queue ordering, producing different
  18. search strategies. Currently the following subclasses are defined:
  19. - ``InsideChartParser`` searches edges in decreasing order of
  20. their trees' inside probabilities.
  21. - ``RandomChartParser`` searches edges in random order.
  22. - ``LongestChartParser`` searches edges in decreasing order of their
  23. location's length.
  24. The ``BottomUpProbabilisticChartParser`` constructor has an optional
  25. argument beam_size. If non-zero, this controls the size of the beam
  26. (aka the edge queue). This option is most useful with InsideChartParser.
  27. """
  28. ##//////////////////////////////////////////////////////
  29. ## Bottom-Up PCFG Chart Parser
  30. ##//////////////////////////////////////////////////////
  31. # [XX] This might not be implemented quite right -- it would be better
  32. # to associate probabilities with child pointer lists.
  33. import random
  34. from functools import reduce
  35. from nltk.tree import Tree, ProbabilisticTree
  36. from nltk.grammar import Nonterminal, PCFG
  37. from nltk.parse.api import ParserI
  38. from nltk.parse.chart import Chart, LeafEdge, TreeEdge, AbstractChartRule
  39. # Probabilistic edges
  40. class ProbabilisticLeafEdge(LeafEdge):
  41. def prob(self):
  42. return 1.0
  43. class ProbabilisticTreeEdge(TreeEdge):
  44. def __init__(self, prob, *args, **kwargs):
  45. TreeEdge.__init__(self, *args, **kwargs)
  46. self._prob = prob
  47. # two edges with different probabilities are not equal.
  48. self._comparison_key = (self._comparison_key, prob)
  49. def prob(self):
  50. return self._prob
  51. @staticmethod
  52. def from_production(production, index, p):
  53. return ProbabilisticTreeEdge(
  54. p, (index, index), production.lhs(), production.rhs(), 0
  55. )
  56. # Rules using probabilistic edges
  57. class ProbabilisticBottomUpInitRule(AbstractChartRule):
  58. NUM_EDGES = 0
  59. def apply(self, chart, grammar):
  60. for index in range(chart.num_leaves()):
  61. new_edge = ProbabilisticLeafEdge(chart.leaf(index), index)
  62. if chart.insert(new_edge, ()):
  63. yield new_edge
  64. class ProbabilisticBottomUpPredictRule(AbstractChartRule):
  65. NUM_EDGES = 1
  66. def apply(self, chart, grammar, edge):
  67. if edge.is_incomplete():
  68. return
  69. for prod in grammar.productions():
  70. if edge.lhs() == prod.rhs()[0]:
  71. new_edge = ProbabilisticTreeEdge.from_production(
  72. prod, edge.start(), prod.prob()
  73. )
  74. if chart.insert(new_edge, ()):
  75. yield new_edge
  76. class ProbabilisticFundamentalRule(AbstractChartRule):
  77. NUM_EDGES = 2
  78. def apply(self, chart, grammar, left_edge, right_edge):
  79. # Make sure the rule is applicable.
  80. if not (
  81. left_edge.end() == right_edge.start()
  82. and left_edge.nextsym() == right_edge.lhs()
  83. and left_edge.is_incomplete()
  84. and right_edge.is_complete()
  85. ):
  86. return
  87. # Construct the new edge.
  88. p = left_edge.prob() * right_edge.prob()
  89. new_edge = ProbabilisticTreeEdge(
  90. p,
  91. span=(left_edge.start(), right_edge.end()),
  92. lhs=left_edge.lhs(),
  93. rhs=left_edge.rhs(),
  94. dot=left_edge.dot() + 1,
  95. )
  96. # Add it to the chart, with appropriate child pointers.
  97. changed_chart = False
  98. for cpl1 in chart.child_pointer_lists(left_edge):
  99. if chart.insert(new_edge, cpl1 + (right_edge,)):
  100. changed_chart = True
  101. # If we changed the chart, then generate the edge.
  102. if changed_chart:
  103. yield new_edge
  104. class SingleEdgeProbabilisticFundamentalRule(AbstractChartRule):
  105. NUM_EDGES = 1
  106. _fundamental_rule = ProbabilisticFundamentalRule()
  107. def apply(self, chart, grammar, edge1):
  108. fr = self._fundamental_rule
  109. if edge1.is_incomplete():
  110. # edge1 = left_edge; edge2 = right_edge
  111. for edge2 in chart.select(
  112. start=edge1.end(), is_complete=True, lhs=edge1.nextsym()
  113. ):
  114. for new_edge in fr.apply(chart, grammar, edge1, edge2):
  115. yield new_edge
  116. else:
  117. # edge2 = left_edge; edge1 = right_edge
  118. for edge2 in chart.select(
  119. end=edge1.start(), is_complete=False, nextsym=edge1.lhs()
  120. ):
  121. for new_edge in fr.apply(chart, grammar, edge2, edge1):
  122. yield new_edge
  123. def __str__(self):
  124. return "Fundamental Rule"
  125. class BottomUpProbabilisticChartParser(ParserI):
  126. """
  127. An abstract bottom-up parser for ``PCFG`` grammars that uses a ``Chart`` to
  128. record partial results. ``BottomUpProbabilisticChartParser`` maintains
  129. a queue of edges that can be added to the chart. This queue is
  130. initialized with edges for each token in the text that is being
  131. parsed. ``BottomUpProbabilisticChartParser`` inserts these edges into
  132. the chart one at a time, starting with the most likely edges, and
  133. proceeding to less likely edges. For each edge that is added to
  134. the chart, it may become possible to insert additional edges into
  135. the chart; these are added to the queue. This process continues
  136. until enough complete parses have been generated, or until the
  137. queue is empty.
  138. The sorting order for the queue is not specified by
  139. ``BottomUpProbabilisticChartParser``. Different sorting orders will
  140. result in different search strategies. The sorting order for the
  141. queue is defined by the method ``sort_queue``; subclasses are required
  142. to provide a definition for this method.
  143. :type _grammar: PCFG
  144. :ivar _grammar: The grammar used to parse sentences.
  145. :type _trace: int
  146. :ivar _trace: The level of tracing output that should be generated
  147. when parsing a text.
  148. """
  149. def __init__(self, grammar, beam_size=0, trace=0):
  150. """
  151. Create a new ``BottomUpProbabilisticChartParser``, that uses
  152. ``grammar`` to parse texts.
  153. :type grammar: PCFG
  154. :param grammar: The grammar used to parse texts.
  155. :type beam_size: int
  156. :param beam_size: The maximum length for the parser's edge queue.
  157. :type trace: int
  158. :param trace: The level of tracing that should be used when
  159. parsing a text. ``0`` will generate no tracing output;
  160. and higher numbers will produce more verbose tracing
  161. output.
  162. """
  163. if not isinstance(grammar, PCFG):
  164. raise ValueError("The grammar must be probabilistic PCFG")
  165. self._grammar = grammar
  166. self.beam_size = beam_size
  167. self._trace = trace
  168. def grammar(self):
  169. return self._grammar
  170. def trace(self, trace=2):
  171. """
  172. Set the level of tracing output that should be generated when
  173. parsing a text.
  174. :type trace: int
  175. :param trace: The trace level. A trace level of ``0`` will
  176. generate no tracing output; and higher trace levels will
  177. produce more verbose tracing output.
  178. :rtype: None
  179. """
  180. self._trace = trace
  181. # TODO: change this to conform more with the standard ChartParser
  182. def parse(self, tokens):
  183. self._grammar.check_coverage(tokens)
  184. chart = Chart(list(tokens))
  185. grammar = self._grammar
  186. # Chart parser rules.
  187. bu_init = ProbabilisticBottomUpInitRule()
  188. bu = ProbabilisticBottomUpPredictRule()
  189. fr = SingleEdgeProbabilisticFundamentalRule()
  190. # Our queue
  191. queue = []
  192. # Initialize the chart.
  193. for edge in bu_init.apply(chart, grammar):
  194. if self._trace > 1:
  195. print(
  196. " %-50s [%s]"
  197. % (chart.pretty_format_edge(edge, width=2), edge.prob())
  198. )
  199. queue.append(edge)
  200. while len(queue) > 0:
  201. # Re-sort the queue.
  202. self.sort_queue(queue, chart)
  203. # Prune the queue to the correct size if a beam was defined
  204. if self.beam_size:
  205. self._prune(queue, chart)
  206. # Get the best edge.
  207. edge = queue.pop()
  208. if self._trace > 0:
  209. print(
  210. " %-50s [%s]"
  211. % (chart.pretty_format_edge(edge, width=2), edge.prob())
  212. )
  213. # Apply BU & FR to it.
  214. queue.extend(bu.apply(chart, grammar, edge))
  215. queue.extend(fr.apply(chart, grammar, edge))
  216. # Get a list of complete parses.
  217. parses = list(chart.parses(grammar.start(), ProbabilisticTree))
  218. # Assign probabilities to the trees.
  219. prod_probs = {}
  220. for prod in grammar.productions():
  221. prod_probs[prod.lhs(), prod.rhs()] = prod.prob()
  222. for parse in parses:
  223. self._setprob(parse, prod_probs)
  224. # Sort by probability
  225. parses.sort(reverse=True, key=lambda tree: tree.prob())
  226. return iter(parses)
  227. def _setprob(self, tree, prod_probs):
  228. if tree.prob() is not None:
  229. return
  230. # Get the prob of the CFG production.
  231. lhs = Nonterminal(tree.label())
  232. rhs = []
  233. for child in tree:
  234. if isinstance(child, Tree):
  235. rhs.append(Nonterminal(child.label()))
  236. else:
  237. rhs.append(child)
  238. prob = prod_probs[lhs, tuple(rhs)]
  239. # Get the probs of children.
  240. for child in tree:
  241. if isinstance(child, Tree):
  242. self._setprob(child, prod_probs)
  243. prob *= child.prob()
  244. tree.set_prob(prob)
  245. def sort_queue(self, queue, chart):
  246. """
  247. Sort the given queue of ``Edge`` objects, placing the edge that should
  248. be tried first at the beginning of the queue. This method
  249. will be called after each ``Edge`` is added to the queue.
  250. :param queue: The queue of ``Edge`` objects to sort. Each edge in
  251. this queue is an edge that could be added to the chart by
  252. the fundamental rule; but that has not yet been added.
  253. :type queue: list(Edge)
  254. :param chart: The chart being used to parse the text. This
  255. chart can be used to provide extra information for sorting
  256. the queue.
  257. :type chart: Chart
  258. :rtype: None
  259. """
  260. raise NotImplementedError()
  261. def _prune(self, queue, chart):
  262. """ Discard items in the queue if the queue is longer than the beam."""
  263. if len(queue) > self.beam_size:
  264. split = len(queue) - self.beam_size
  265. if self._trace > 2:
  266. for edge in queue[:split]:
  267. print(" %-50s [DISCARDED]" % chart.pretty_format_edge(edge, 2))
  268. del queue[:split]
  269. class InsideChartParser(BottomUpProbabilisticChartParser):
  270. """
  271. A bottom-up parser for ``PCFG`` grammars that tries edges in descending
  272. order of the inside probabilities of their trees. The "inside
  273. probability" of a tree is simply the
  274. probability of the entire tree, ignoring its context. In
  275. particular, the inside probability of a tree generated by
  276. production *p* with children *c[1], c[2], ..., c[n]* is
  277. *P(p)P(c[1])P(c[2])...P(c[n])*; and the inside
  278. probability of a token is 1 if it is present in the text, and 0 if
  279. it is absent.
  280. This sorting order results in a type of lowest-cost-first search
  281. strategy.
  282. """
  283. # Inherit constructor.
  284. def sort_queue(self, queue, chart):
  285. """
  286. Sort the given queue of edges, in descending order of the
  287. inside probabilities of the edges' trees.
  288. :param queue: The queue of ``Edge`` objects to sort. Each edge in
  289. this queue is an edge that could be added to the chart by
  290. the fundamental rule; but that has not yet been added.
  291. :type queue: list(Edge)
  292. :param chart: The chart being used to parse the text. This
  293. chart can be used to provide extra information for sorting
  294. the queue.
  295. :type chart: Chart
  296. :rtype: None
  297. """
  298. queue.sort(key=lambda edge: edge.prob())
  299. # Eventually, this will become some sort of inside-outside parser:
  300. # class InsideOutsideParser(BottomUpProbabilisticChartParser):
  301. # def __init__(self, grammar, trace=0):
  302. # # Inherit docs.
  303. # BottomUpProbabilisticChartParser.__init__(self, grammar, trace)
  304. #
  305. # # Find the best path from S to each nonterminal
  306. # bestp = {}
  307. # for production in grammar.productions(): bestp[production.lhs()]=0
  308. # bestp[grammar.start()] = 1.0
  309. #
  310. # for i in range(len(grammar.productions())):
  311. # for production in grammar.productions():
  312. # lhs = production.lhs()
  313. # for elt in production.rhs():
  314. # bestp[elt] = max(bestp[lhs]*production.prob(),
  315. # bestp.get(elt,0))
  316. #
  317. # self._bestp = bestp
  318. # for (k,v) in self._bestp.items(): print(k,v)
  319. #
  320. # def _sortkey(self, edge):
  321. # return edge.structure()[PROB] * self._bestp[edge.lhs()]
  322. #
  323. # def sort_queue(self, queue, chart):
  324. # queue.sort(key=self._sortkey)
  325. class RandomChartParser(BottomUpProbabilisticChartParser):
  326. """
  327. A bottom-up parser for ``PCFG`` grammars that tries edges in random order.
  328. This sorting order results in a random search strategy.
  329. """
  330. # Inherit constructor
  331. def sort_queue(self, queue, chart):
  332. i = random.randint(0, len(queue) - 1)
  333. (queue[-1], queue[i]) = (queue[i], queue[-1])
  334. class UnsortedChartParser(BottomUpProbabilisticChartParser):
  335. """
  336. A bottom-up parser for ``PCFG`` grammars that tries edges in whatever order.
  337. """
  338. # Inherit constructor
  339. def sort_queue(self, queue, chart):
  340. return
  341. class LongestChartParser(BottomUpProbabilisticChartParser):
  342. """
  343. A bottom-up parser for ``PCFG`` grammars that tries longer edges before
  344. shorter ones. This sorting order results in a type of best-first
  345. search strategy.
  346. """
  347. # Inherit constructor
  348. def sort_queue(self, queue, chart):
  349. queue.sort(key=lambda edge: edge.length())
  350. ##//////////////////////////////////////////////////////
  351. ## Test Code
  352. ##//////////////////////////////////////////////////////
  353. def demo(choice=None, draw_parses=None, print_parses=None):
  354. """
  355. A demonstration of the probabilistic parsers. The user is
  356. prompted to select which demo to run, and how many parses should
  357. be found; and then each parser is run on the same demo, and a
  358. summary of the results are displayed.
  359. """
  360. import sys, time
  361. from nltk import tokenize
  362. from nltk.parse import pchart
  363. # Define two demos. Each demo has a sentence and a grammar.
  364. toy_pcfg1 = PCFG.fromstring(
  365. """
  366. S -> NP VP [1.0]
  367. NP -> Det N [0.5] | NP PP [0.25] | 'John' [0.1] | 'I' [0.15]
  368. Det -> 'the' [0.8] | 'my' [0.2]
  369. N -> 'man' [0.5] | 'telescope' [0.5]
  370. VP -> VP PP [0.1] | V NP [0.7] | V [0.2]
  371. V -> 'ate' [0.35] | 'saw' [0.65]
  372. PP -> P NP [1.0]
  373. P -> 'with' [0.61] | 'under' [0.39]
  374. """
  375. )
  376. toy_pcfg2 = PCFG.fromstring(
  377. """
  378. S -> NP VP [1.0]
  379. VP -> V NP [.59]
  380. VP -> V [.40]
  381. VP -> VP PP [.01]
  382. NP -> Det N [.41]
  383. NP -> Name [.28]
  384. NP -> NP PP [.31]
  385. PP -> P NP [1.0]
  386. V -> 'saw' [.21]
  387. V -> 'ate' [.51]
  388. V -> 'ran' [.28]
  389. N -> 'boy' [.11]
  390. N -> 'cookie' [.12]
  391. N -> 'table' [.13]
  392. N -> 'telescope' [.14]
  393. N -> 'hill' [.5]
  394. Name -> 'Jack' [.52]
  395. Name -> 'Bob' [.48]
  396. P -> 'with' [.61]
  397. P -> 'under' [.39]
  398. Det -> 'the' [.41]
  399. Det -> 'a' [.31]
  400. Det -> 'my' [.28]
  401. """
  402. )
  403. demos = [
  404. ("I saw John with my telescope", toy_pcfg1),
  405. ("the boy saw Jack with Bob under the table with a telescope", toy_pcfg2),
  406. ]
  407. if choice is None:
  408. # Ask the user which demo they want to use.
  409. print()
  410. for i in range(len(demos)):
  411. print("%3s: %s" % (i + 1, demos[i][0]))
  412. print(" %r" % demos[i][1])
  413. print()
  414. print("Which demo (%d-%d)? " % (1, len(demos)), end=" ")
  415. choice = int(sys.stdin.readline().strip()) - 1
  416. try:
  417. sent, grammar = demos[choice]
  418. except:
  419. print("Bad sentence number")
  420. return
  421. # Tokenize the sentence.
  422. tokens = sent.split()
  423. # Define a list of parsers. We'll use all parsers.
  424. parsers = [
  425. pchart.InsideChartParser(grammar),
  426. pchart.RandomChartParser(grammar),
  427. pchart.UnsortedChartParser(grammar),
  428. pchart.LongestChartParser(grammar),
  429. pchart.InsideChartParser(grammar, beam_size=len(tokens) + 1), # was BeamParser
  430. ]
  431. # Run the parsers on the tokenized sentence.
  432. times = []
  433. average_p = []
  434. num_parses = []
  435. all_parses = {}
  436. for parser in parsers:
  437. print("\ns: %s\nparser: %s\ngrammar: %s" % (sent, parser, grammar))
  438. parser.trace(3)
  439. t = time.time()
  440. parses = list(parser.parse(tokens))
  441. times.append(time.time() - t)
  442. p = reduce(lambda a, b: a + b.prob(), parses, 0) / len(parses) if parses else 0
  443. average_p.append(p)
  444. num_parses.append(len(parses))
  445. for p in parses:
  446. all_parses[p.freeze()] = 1
  447. # Print some summary statistics
  448. print()
  449. print(" Parser Beam | Time (secs) # Parses Average P(parse)")
  450. print("------------------------+------------------------------------------")
  451. for i in range(len(parsers)):
  452. print(
  453. "%18s %4d |%11.4f%11d%19.14f"
  454. % (
  455. parsers[i].__class__.__name__,
  456. parsers[i].beam_size,
  457. times[i],
  458. num_parses[i],
  459. average_p[i],
  460. )
  461. )
  462. parses = all_parses.keys()
  463. if parses:
  464. p = reduce(lambda a, b: a + b.prob(), parses, 0) / len(parses)
  465. else:
  466. p = 0
  467. print("------------------------+------------------------------------------")
  468. print("%18s |%11s%11d%19.14f" % ("(All Parses)", "n/a", len(parses), p))
  469. if draw_parses is None:
  470. # Ask the user if we should draw the parses.
  471. print()
  472. print("Draw parses (y/n)? ", end=" ")
  473. draw_parses = sys.stdin.readline().strip().lower().startswith("y")
  474. if draw_parses:
  475. from nltk.draw.tree import draw_trees
  476. print(" please wait...")
  477. draw_trees(*parses)
  478. if print_parses is None:
  479. # Ask the user if we should print the parses.
  480. print()
  481. print("Print parses (y/n)? ", end=" ")
  482. print_parses = sys.stdin.readline().strip().lower().startswith("y")
  483. if print_parses:
  484. for parse in parses:
  485. print(parse)
  486. if __name__ == "__main__":
  487. demo()