chart.py 61 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761277127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556155715581559156015611562156315641565156615671568156915701571157215731574157515761577157815791580158115821583158415851586158715881589159015911592159315941595159615971598159916001601160216031604160516061607160816091610161116121613161416151616161716181619162016211622162316241625162616271628162916301631163216331634163516361637163816391640164116421643164416451646164716481649165016511652165316541655165616571658165916601661166216631664166516661667166816691670167116721673167416751676167716781679168016811682168316841685168616871688168916901691169216931694169516961697169816991700170117021703170417051706170717081709171017111712171317141715171617171718171917201721172217231724172517261727172817291730173117321733173417351736173717381739174017411742174317441745174617471748174917501751175217531754175517561757175817591760176117621763176417651766176717681769177017711772177317741775177617771778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856
  1. # -*- coding: utf-8 -*-
  2. # Natural Language Toolkit: A Chart Parser
  3. #
  4. # Copyright (C) 2001-2020 NLTK Project
  5. # Author: Edward Loper <edloper@gmail.com>
  6. # Steven Bird <stevenbird1@gmail.com>
  7. # Jean Mark Gawron <gawron@mail.sdsu.edu>
  8. # Peter Ljunglöf <peter.ljunglof@heatherleaf.se>
  9. # URL: <http://nltk.org/>
  10. # For license information, see LICENSE.TXT
  11. """
  12. Data classes and parser implementations for "chart parsers", which
  13. use dynamic programming to efficiently parse a text. A chart
  14. parser derives parse trees for a text by iteratively adding "edges"
  15. to a "chart." Each edge represents a hypothesis about the tree
  16. structure for a subsequence of the text. The chart is a
  17. "blackboard" for composing and combining these hypotheses.
  18. When a chart parser begins parsing a text, it creates a new (empty)
  19. chart, spanning the text. It then incrementally adds new edges to the
  20. chart. A set of "chart rules" specifies the conditions under which
  21. new edges should be added to the chart. Once the chart reaches a
  22. stage where none of the chart rules adds any new edges, parsing is
  23. complete.
  24. Charts are encoded with the ``Chart`` class, and edges are encoded with
  25. the ``TreeEdge`` and ``LeafEdge`` classes. The chart parser module
  26. defines three chart parsers:
  27. - ``ChartParser`` is a simple and flexible chart parser. Given a
  28. set of chart rules, it will apply those rules to the chart until
  29. no more edges are added.
  30. - ``SteppingChartParser`` is a subclass of ``ChartParser`` that can
  31. be used to step through the parsing process.
  32. """
  33. import itertools
  34. import re
  35. import warnings
  36. from functools import total_ordering
  37. from nltk.tree import Tree
  38. from nltk.grammar import PCFG, is_nonterminal, is_terminal
  39. from nltk.util import OrderedDict
  40. from nltk.internals import raise_unorderable_types
  41. from nltk.parse.api import ParserI
  42. ########################################################################
  43. ## Edges
  44. ########################################################################
  45. @total_ordering
  46. class EdgeI(object):
  47. """
  48. A hypothesis about the structure of part of a sentence.
  49. Each edge records the fact that a structure is (partially)
  50. consistent with the sentence. An edge contains:
  51. - A span, indicating what part of the sentence is
  52. consistent with the hypothesized structure.
  53. - A left-hand side, specifying what kind of structure is
  54. hypothesized.
  55. - A right-hand side, specifying the contents of the
  56. hypothesized structure.
  57. - A dot position, indicating how much of the hypothesized
  58. structure is consistent with the sentence.
  59. Every edge is either complete or incomplete:
  60. - An edge is complete if its structure is fully consistent
  61. with the sentence.
  62. - An edge is incomplete if its structure is partially
  63. consistent with the sentence. For every incomplete edge, the
  64. span specifies a possible prefix for the edge's structure.
  65. There are two kinds of edge:
  66. - A ``TreeEdge`` records which trees have been found to
  67. be (partially) consistent with the text.
  68. - A ``LeafEdge`` records the tokens occurring in the text.
  69. The ``EdgeI`` interface provides a common interface to both types
  70. of edge, allowing chart parsers to treat them in a uniform manner.
  71. """
  72. def __init__(self):
  73. if self.__class__ == EdgeI:
  74. raise TypeError("Edge is an abstract interface")
  75. # ////////////////////////////////////////////////////////////
  76. # Span
  77. # ////////////////////////////////////////////////////////////
  78. def span(self):
  79. """
  80. Return a tuple ``(s, e)``, where ``tokens[s:e]`` is the
  81. portion of the sentence that is consistent with this
  82. edge's structure.
  83. :rtype: tuple(int, int)
  84. """
  85. raise NotImplementedError()
  86. def start(self):
  87. """
  88. Return the start index of this edge's span.
  89. :rtype: int
  90. """
  91. raise NotImplementedError()
  92. def end(self):
  93. """
  94. Return the end index of this edge's span.
  95. :rtype: int
  96. """
  97. raise NotImplementedError()
  98. def length(self):
  99. """
  100. Return the length of this edge's span.
  101. :rtype: int
  102. """
  103. raise NotImplementedError()
  104. # ////////////////////////////////////////////////////////////
  105. # Left Hand Side
  106. # ////////////////////////////////////////////////////////////
  107. def lhs(self):
  108. """
  109. Return this edge's left-hand side, which specifies what kind
  110. of structure is hypothesized by this edge.
  111. :see: ``TreeEdge`` and ``LeafEdge`` for a description of
  112. the left-hand side values for each edge type.
  113. """
  114. raise NotImplementedError()
  115. # ////////////////////////////////////////////////////////////
  116. # Right Hand Side
  117. # ////////////////////////////////////////////////////////////
  118. def rhs(self):
  119. """
  120. Return this edge's right-hand side, which specifies
  121. the content of the structure hypothesized by this edge.
  122. :see: ``TreeEdge`` and ``LeafEdge`` for a description of
  123. the right-hand side values for each edge type.
  124. """
  125. raise NotImplementedError()
  126. def dot(self):
  127. """
  128. Return this edge's dot position, which indicates how much of
  129. the hypothesized structure is consistent with the
  130. sentence. In particular, ``self.rhs[:dot]`` is consistent
  131. with ``tokens[self.start():self.end()]``.
  132. :rtype: int
  133. """
  134. raise NotImplementedError()
  135. def nextsym(self):
  136. """
  137. Return the element of this edge's right-hand side that
  138. immediately follows its dot.
  139. :rtype: Nonterminal or terminal or None
  140. """
  141. raise NotImplementedError()
  142. def is_complete(self):
  143. """
  144. Return True if this edge's structure is fully consistent
  145. with the text.
  146. :rtype: bool
  147. """
  148. raise NotImplementedError()
  149. def is_incomplete(self):
  150. """
  151. Return True if this edge's structure is partially consistent
  152. with the text.
  153. :rtype: bool
  154. """
  155. raise NotImplementedError()
  156. # ////////////////////////////////////////////////////////////
  157. # Comparisons & hashing
  158. # ////////////////////////////////////////////////////////////
  159. def __eq__(self, other):
  160. return (
  161. self.__class__ is other.__class__
  162. and self._comparison_key == other._comparison_key
  163. )
  164. def __ne__(self, other):
  165. return not self == other
  166. def __lt__(self, other):
  167. if not isinstance(other, EdgeI):
  168. raise_unorderable_types("<", self, other)
  169. if self.__class__ is other.__class__:
  170. return self._comparison_key < other._comparison_key
  171. else:
  172. return self.__class__.__name__ < other.__class__.__name__
  173. def __hash__(self):
  174. try:
  175. return self._hash
  176. except AttributeError:
  177. self._hash = hash(self._comparison_key)
  178. return self._hash
  179. class TreeEdge(EdgeI):
  180. """
  181. An edge that records the fact that a tree is (partially)
  182. consistent with the sentence. A tree edge consists of:
  183. - A span, indicating what part of the sentence is
  184. consistent with the hypothesized tree.
  185. - A left-hand side, specifying the hypothesized tree's node
  186. value.
  187. - A right-hand side, specifying the hypothesized tree's
  188. children. Each element of the right-hand side is either a
  189. terminal, specifying a token with that terminal as its leaf
  190. value; or a nonterminal, specifying a subtree with that
  191. nonterminal's symbol as its node value.
  192. - A dot position, indicating which children are consistent
  193. with part of the sentence. In particular, if ``dot`` is the
  194. dot position, ``rhs`` is the right-hand size, ``(start,end)``
  195. is the span, and ``sentence`` is the list of tokens in the
  196. sentence, then ``tokens[start:end]`` can be spanned by the
  197. children specified by ``rhs[:dot]``.
  198. For more information about edges, see the ``EdgeI`` interface.
  199. """
  200. def __init__(self, span, lhs, rhs, dot=0):
  201. """
  202. Construct a new ``TreeEdge``.
  203. :type span: tuple(int, int)
  204. :param span: A tuple ``(s, e)``, where ``tokens[s:e]`` is the
  205. portion of the sentence that is consistent with the new
  206. edge's structure.
  207. :type lhs: Nonterminal
  208. :param lhs: The new edge's left-hand side, specifying the
  209. hypothesized tree's node value.
  210. :type rhs: list(Nonterminal and str)
  211. :param rhs: The new edge's right-hand side, specifying the
  212. hypothesized tree's children.
  213. :type dot: int
  214. :param dot: The position of the new edge's dot. This position
  215. specifies what prefix of the production's right hand side
  216. is consistent with the text. In particular, if
  217. ``sentence`` is the list of tokens in the sentence, then
  218. ``okens[span[0]:span[1]]`` can be spanned by the
  219. children specified by ``rhs[:dot]``.
  220. """
  221. self._span = span
  222. self._lhs = lhs
  223. rhs = tuple(rhs)
  224. self._rhs = rhs
  225. self._dot = dot
  226. self._comparison_key = (span, lhs, rhs, dot)
  227. @staticmethod
  228. def from_production(production, index):
  229. """
  230. Return a new ``TreeEdge`` formed from the given production.
  231. The new edge's left-hand side and right-hand side will
  232. be taken from ``production``; its span will be
  233. ``(index,index)``; and its dot position will be ``0``.
  234. :rtype: TreeEdge
  235. """
  236. return TreeEdge(
  237. span=(index, index), lhs=production.lhs(), rhs=production.rhs(), dot=0
  238. )
  239. def move_dot_forward(self, new_end):
  240. """
  241. Return a new ``TreeEdge`` formed from this edge.
  242. The new edge's dot position is increased by ``1``,
  243. and its end index will be replaced by ``new_end``.
  244. :param new_end: The new end index.
  245. :type new_end: int
  246. :rtype: TreeEdge
  247. """
  248. return TreeEdge(
  249. span=(self._span[0], new_end),
  250. lhs=self._lhs,
  251. rhs=self._rhs,
  252. dot=self._dot + 1,
  253. )
  254. # Accessors
  255. def lhs(self):
  256. return self._lhs
  257. def span(self):
  258. return self._span
  259. def start(self):
  260. return self._span[0]
  261. def end(self):
  262. return self._span[1]
  263. def length(self):
  264. return self._span[1] - self._span[0]
  265. def rhs(self):
  266. return self._rhs
  267. def dot(self):
  268. return self._dot
  269. def is_complete(self):
  270. return self._dot == len(self._rhs)
  271. def is_incomplete(self):
  272. return self._dot != len(self._rhs)
  273. def nextsym(self):
  274. if self._dot >= len(self._rhs):
  275. return None
  276. else:
  277. return self._rhs[self._dot]
  278. # String representation
  279. def __str__(self):
  280. str = "[%s:%s] " % (self._span[0], self._span[1])
  281. str += "%-2r ->" % (self._lhs,)
  282. for i in range(len(self._rhs)):
  283. if i == self._dot:
  284. str += " *"
  285. str += " %s" % repr(self._rhs[i])
  286. if len(self._rhs) == self._dot:
  287. str += " *"
  288. return str
  289. def __repr__(self):
  290. return "[Edge: %s]" % self
  291. class LeafEdge(EdgeI):
  292. """
  293. An edge that records the fact that a leaf value is consistent with
  294. a word in the sentence. A leaf edge consists of:
  295. - An index, indicating the position of the word.
  296. - A leaf, specifying the word's content.
  297. A leaf edge's left-hand side is its leaf value, and its right hand
  298. side is ``()``. Its span is ``[index, index+1]``, and its dot
  299. position is ``0``.
  300. """
  301. def __init__(self, leaf, index):
  302. """
  303. Construct a new ``LeafEdge``.
  304. :param leaf: The new edge's leaf value, specifying the word
  305. that is recorded by this edge.
  306. :param index: The new edge's index, specifying the position of
  307. the word that is recorded by this edge.
  308. """
  309. self._leaf = leaf
  310. self._index = index
  311. self._comparison_key = (leaf, index)
  312. # Accessors
  313. def lhs(self):
  314. return self._leaf
  315. def span(self):
  316. return (self._index, self._index + 1)
  317. def start(self):
  318. return self._index
  319. def end(self):
  320. return self._index + 1
  321. def length(self):
  322. return 1
  323. def rhs(self):
  324. return ()
  325. def dot(self):
  326. return 0
  327. def is_complete(self):
  328. return True
  329. def is_incomplete(self):
  330. return False
  331. def nextsym(self):
  332. return None
  333. # String representations
  334. def __str__(self):
  335. return "[%s:%s] %s" % (self._index, self._index + 1, repr(self._leaf))
  336. def __repr__(self):
  337. return "[Edge: %s]" % (self)
  338. ########################################################################
  339. ## Chart
  340. ########################################################################
  341. class Chart(object):
  342. """
  343. A blackboard for hypotheses about the syntactic constituents of a
  344. sentence. A chart contains a set of edges, and each edge encodes
  345. a single hypothesis about the structure of some portion of the
  346. sentence.
  347. The ``select`` method can be used to select a specific collection
  348. of edges. For example ``chart.select(is_complete=True, start=0)``
  349. yields all complete edges whose start indices are 0. To ensure
  350. the efficiency of these selection operations, ``Chart`` dynamically
  351. creates and maintains an index for each set of attributes that
  352. have been selected on.
  353. In order to reconstruct the trees that are represented by an edge,
  354. the chart associates each edge with a set of child pointer lists.
  355. A child pointer list is a list of the edges that license an
  356. edge's right-hand side.
  357. :ivar _tokens: The sentence that the chart covers.
  358. :ivar _num_leaves: The number of tokens.
  359. :ivar _edges: A list of the edges in the chart
  360. :ivar _edge_to_cpls: A dictionary mapping each edge to a set
  361. of child pointer lists that are associated with that edge.
  362. :ivar _indexes: A dictionary mapping tuples of edge attributes
  363. to indices, where each index maps the corresponding edge
  364. attribute values to lists of edges.
  365. """
  366. def __init__(self, tokens):
  367. """
  368. Construct a new chart. The chart is initialized with the
  369. leaf edges corresponding to the terminal leaves.
  370. :type tokens: list
  371. :param tokens: The sentence that this chart will be used to parse.
  372. """
  373. # Record the sentence token and the sentence length.
  374. self._tokens = tuple(tokens)
  375. self._num_leaves = len(self._tokens)
  376. # Initialise the chart.
  377. self.initialize()
  378. def initialize(self):
  379. """
  380. Clear the chart.
  381. """
  382. # A list of edges contained in this chart.
  383. self._edges = []
  384. # The set of child pointer lists associated with each edge.
  385. self._edge_to_cpls = {}
  386. # Indexes mapping attribute values to lists of edges
  387. # (used by select()).
  388. self._indexes = {}
  389. # ////////////////////////////////////////////////////////////
  390. # Sentence Access
  391. # ////////////////////////////////////////////////////////////
  392. def num_leaves(self):
  393. """
  394. Return the number of words in this chart's sentence.
  395. :rtype: int
  396. """
  397. return self._num_leaves
  398. def leaf(self, index):
  399. """
  400. Return the leaf value of the word at the given index.
  401. :rtype: str
  402. """
  403. return self._tokens[index]
  404. def leaves(self):
  405. """
  406. Return a list of the leaf values of each word in the
  407. chart's sentence.
  408. :rtype: list(str)
  409. """
  410. return self._tokens
  411. # ////////////////////////////////////////////////////////////
  412. # Edge access
  413. # ////////////////////////////////////////////////////////////
  414. def edges(self):
  415. """
  416. Return a list of all edges in this chart. New edges
  417. that are added to the chart after the call to edges()
  418. will *not* be contained in this list.
  419. :rtype: list(EdgeI)
  420. :see: ``iteredges``, ``select``
  421. """
  422. return self._edges[:]
  423. def iteredges(self):
  424. """
  425. Return an iterator over the edges in this chart. It is
  426. not guaranteed that new edges which are added to the
  427. chart before the iterator is exhausted will also be generated.
  428. :rtype: iter(EdgeI)
  429. :see: ``edges``, ``select``
  430. """
  431. return iter(self._edges)
  432. # Iterating over the chart yields its edges.
  433. __iter__ = iteredges
  434. def num_edges(self):
  435. """
  436. Return the number of edges contained in this chart.
  437. :rtype: int
  438. """
  439. return len(self._edge_to_cpls)
  440. def select(self, **restrictions):
  441. """
  442. Return an iterator over the edges in this chart. Any
  443. new edges that are added to the chart before the iterator
  444. is exahusted will also be generated. ``restrictions``
  445. can be used to restrict the set of edges that will be
  446. generated.
  447. :param span: Only generate edges ``e`` where ``e.span()==span``
  448. :param start: Only generate edges ``e`` where ``e.start()==start``
  449. :param end: Only generate edges ``e`` where ``e.end()==end``
  450. :param length: Only generate edges ``e`` where ``e.length()==length``
  451. :param lhs: Only generate edges ``e`` where ``e.lhs()==lhs``
  452. :param rhs: Only generate edges ``e`` where ``e.rhs()==rhs``
  453. :param nextsym: Only generate edges ``e`` where
  454. ``e.nextsym()==nextsym``
  455. :param dot: Only generate edges ``e`` where ``e.dot()==dot``
  456. :param is_complete: Only generate edges ``e`` where
  457. ``e.is_complete()==is_complete``
  458. :param is_incomplete: Only generate edges ``e`` where
  459. ``e.is_incomplete()==is_incomplete``
  460. :rtype: iter(EdgeI)
  461. """
  462. # If there are no restrictions, then return all edges.
  463. if restrictions == {}:
  464. return iter(self._edges)
  465. # Find the index corresponding to the given restrictions.
  466. restr_keys = sorted(restrictions.keys())
  467. restr_keys = tuple(restr_keys)
  468. # If it doesn't exist, then create it.
  469. if restr_keys not in self._indexes:
  470. self._add_index(restr_keys)
  471. vals = tuple(restrictions[key] for key in restr_keys)
  472. return iter(self._indexes[restr_keys].get(vals, []))
  473. def _add_index(self, restr_keys):
  474. """
  475. A helper function for ``select``, which creates a new index for
  476. a given set of attributes (aka restriction keys).
  477. """
  478. # Make sure it's a valid index.
  479. for key in restr_keys:
  480. if not hasattr(EdgeI, key):
  481. raise ValueError("Bad restriction: %s" % key)
  482. # Create the index.
  483. index = self._indexes[restr_keys] = {}
  484. # Add all existing edges to the index.
  485. for edge in self._edges:
  486. vals = tuple(getattr(edge, key)() for key in restr_keys)
  487. index.setdefault(vals, []).append(edge)
  488. def _register_with_indexes(self, edge):
  489. """
  490. A helper function for ``insert``, which registers the new
  491. edge with all existing indexes.
  492. """
  493. for (restr_keys, index) in self._indexes.items():
  494. vals = tuple(getattr(edge, key)() for key in restr_keys)
  495. index.setdefault(vals, []).append(edge)
  496. # ////////////////////////////////////////////////////////////
  497. # Edge Insertion
  498. # ////////////////////////////////////////////////////////////
  499. def insert_with_backpointer(self, new_edge, previous_edge, child_edge):
  500. """
  501. Add a new edge to the chart, using a pointer to the previous edge.
  502. """
  503. cpls = self.child_pointer_lists(previous_edge)
  504. new_cpls = [cpl + (child_edge,) for cpl in cpls]
  505. return self.insert(new_edge, *new_cpls)
  506. def insert(self, edge, *child_pointer_lists):
  507. """
  508. Add a new edge to the chart, and return True if this operation
  509. modified the chart. In particular, return true iff the chart
  510. did not already contain ``edge``, or if it did not already associate
  511. ``child_pointer_lists`` with ``edge``.
  512. :type edge: EdgeI
  513. :param edge: The new edge
  514. :type child_pointer_lists: sequence of tuple(EdgeI)
  515. :param child_pointer_lists: A sequence of lists of the edges that
  516. were used to form this edge. This list is used to reconstruct
  517. the trees (or partial trees) that are associated with ``edge``.
  518. :rtype: bool
  519. """
  520. # Is it a new edge?
  521. if edge not in self._edge_to_cpls:
  522. # Add it to the list of edges.
  523. self._append_edge(edge)
  524. # Register with indexes.
  525. self._register_with_indexes(edge)
  526. # Get the set of child pointer lists for this edge.
  527. cpls = self._edge_to_cpls.setdefault(edge, OrderedDict())
  528. chart_was_modified = False
  529. for child_pointer_list in child_pointer_lists:
  530. child_pointer_list = tuple(child_pointer_list)
  531. if child_pointer_list not in cpls:
  532. # It's a new CPL; register it, and return true.
  533. cpls[child_pointer_list] = True
  534. chart_was_modified = True
  535. return chart_was_modified
  536. def _append_edge(self, edge):
  537. self._edges.append(edge)
  538. # ////////////////////////////////////////////////////////////
  539. # Tree extraction & child pointer lists
  540. # ////////////////////////////////////////////////////////////
  541. def parses(self, root, tree_class=Tree):
  542. """
  543. Return an iterator of the complete tree structures that span
  544. the entire chart, and whose root node is ``root``.
  545. """
  546. for edge in self.select(start=0, end=self._num_leaves, lhs=root):
  547. for tree in self.trees(edge, tree_class=tree_class, complete=True):
  548. yield tree
  549. def trees(self, edge, tree_class=Tree, complete=False):
  550. """
  551. Return an iterator of the tree structures that are associated
  552. with ``edge``.
  553. If ``edge`` is incomplete, then the unexpanded children will be
  554. encoded as childless subtrees, whose node value is the
  555. corresponding terminal or nonterminal.
  556. :rtype: list(Tree)
  557. :note: If two trees share a common subtree, then the same
  558. Tree may be used to encode that subtree in
  559. both trees. If you need to eliminate this subtree
  560. sharing, then create a deep copy of each tree.
  561. """
  562. return iter(self._trees(edge, complete, memo={}, tree_class=tree_class))
  563. def _trees(self, edge, complete, memo, tree_class):
  564. """
  565. A helper function for ``trees``.
  566. :param memo: A dictionary used to record the trees that we've
  567. generated for each edge, so that when we see an edge more
  568. than once, we can reuse the same trees.
  569. """
  570. # If we've seen this edge before, then reuse our old answer.
  571. if edge in memo:
  572. return memo[edge]
  573. # when we're reading trees off the chart, don't use incomplete edges
  574. if complete and edge.is_incomplete():
  575. return []
  576. # Leaf edges.
  577. if isinstance(edge, LeafEdge):
  578. leaf = self._tokens[edge.start()]
  579. memo[edge] = [leaf]
  580. return [leaf]
  581. # Until we're done computing the trees for edge, set
  582. # memo[edge] to be empty. This has the effect of filtering
  583. # out any cyclic trees (i.e., trees that contain themselves as
  584. # descendants), because if we reach this edge via a cycle,
  585. # then it will appear that the edge doesn't generate any trees.
  586. memo[edge] = []
  587. trees = []
  588. lhs = edge.lhs().symbol()
  589. # Each child pointer list can be used to form trees.
  590. for cpl in self.child_pointer_lists(edge):
  591. # Get the set of child choices for each child pointer.
  592. # child_choices[i] is the set of choices for the tree's
  593. # ith child.
  594. child_choices = [self._trees(cp, complete, memo, tree_class) for cp in cpl]
  595. # For each combination of children, add a tree.
  596. for children in itertools.product(*child_choices):
  597. trees.append(tree_class(lhs, children))
  598. # If the edge is incomplete, then extend it with "partial trees":
  599. if edge.is_incomplete():
  600. unexpanded = [tree_class(elt, []) for elt in edge.rhs()[edge.dot() :]]
  601. for tree in trees:
  602. tree.extend(unexpanded)
  603. # Update the memoization dictionary.
  604. memo[edge] = trees
  605. # Return the list of trees.
  606. return trees
  607. def child_pointer_lists(self, edge):
  608. """
  609. Return the set of child pointer lists for the given edge.
  610. Each child pointer list is a list of edges that have
  611. been used to form this edge.
  612. :rtype: list(list(EdgeI))
  613. """
  614. # Make a copy, in case they modify it.
  615. return self._edge_to_cpls.get(edge, {}).keys()
  616. # ////////////////////////////////////////////////////////////
  617. # Display
  618. # ////////////////////////////////////////////////////////////
  619. def pretty_format_edge(self, edge, width=None):
  620. """
  621. Return a pretty-printed string representation of a given edge
  622. in this chart.
  623. :rtype: str
  624. :param width: The number of characters allotted to each
  625. index in the sentence.
  626. """
  627. if width is None:
  628. width = 50 // (self.num_leaves() + 1)
  629. (start, end) = (edge.start(), edge.end())
  630. str = "|" + ("." + " " * (width - 1)) * start
  631. # Zero-width edges are "#" if complete, ">" if incomplete
  632. if start == end:
  633. if edge.is_complete():
  634. str += "#"
  635. else:
  636. str += ">"
  637. # Spanning complete edges are "[===]"; Other edges are
  638. # "[---]" if complete, "[--->" if incomplete
  639. elif edge.is_complete() and edge.span() == (0, self._num_leaves):
  640. str += "[" + ("=" * width) * (end - start - 1) + "=" * (width - 1) + "]"
  641. elif edge.is_complete():
  642. str += "[" + ("-" * width) * (end - start - 1) + "-" * (width - 1) + "]"
  643. else:
  644. str += "[" + ("-" * width) * (end - start - 1) + "-" * (width - 1) + ">"
  645. str += (" " * (width - 1) + ".") * (self._num_leaves - end)
  646. return str + "| %s" % edge
  647. def pretty_format_leaves(self, width=None):
  648. """
  649. Return a pretty-printed string representation of this
  650. chart's leaves. This string can be used as a header
  651. for calls to ``pretty_format_edge``.
  652. """
  653. if width is None:
  654. width = 50 // (self.num_leaves() + 1)
  655. if self._tokens is not None and width > 1:
  656. header = "|."
  657. for tok in self._tokens:
  658. header += tok[: width - 1].center(width - 1) + "."
  659. header += "|"
  660. else:
  661. header = ""
  662. return header
  663. def pretty_format(self, width=None):
  664. """
  665. Return a pretty-printed string representation of this chart.
  666. :param width: The number of characters allotted to each
  667. index in the sentence.
  668. :rtype: str
  669. """
  670. if width is None:
  671. width = 50 // (self.num_leaves() + 1)
  672. # sort edges: primary key=length, secondary key=start index.
  673. # (and filter out the token edges)
  674. edges = sorted([(e.length(), e.start(), e) for e in self])
  675. edges = [e for (_, _, e) in edges]
  676. return (
  677. self.pretty_format_leaves(width)
  678. + "\n"
  679. + "\n".join(self.pretty_format_edge(edge, width) for edge in edges)
  680. )
  681. # ////////////////////////////////////////////////////////////
  682. # Display: Dot (AT&T Graphviz)
  683. # ////////////////////////////////////////////////////////////
  684. def dot_digraph(self):
  685. # Header
  686. s = "digraph nltk_chart {\n"
  687. # s += ' size="5,5";\n'
  688. s += " rankdir=LR;\n"
  689. s += " node [height=0.1,width=0.1];\n"
  690. s += ' node [style=filled, color="lightgray"];\n'
  691. # Set up the nodes
  692. for y in range(self.num_edges(), -1, -1):
  693. if y == 0:
  694. s += ' node [style=filled, color="black"];\n'
  695. for x in range(self.num_leaves() + 1):
  696. if y == 0 or (
  697. x <= self._edges[y - 1].start() or x >= self._edges[y - 1].end()
  698. ):
  699. s += ' %04d.%04d [label=""];\n' % (x, y)
  700. # Add a spacer
  701. s += " x [style=invis]; x->0000.0000 [style=invis];\n"
  702. # Declare ranks.
  703. for x in range(self.num_leaves() + 1):
  704. s += " {rank=same;"
  705. for y in range(self.num_edges() + 1):
  706. if y == 0 or (
  707. x <= self._edges[y - 1].start() or x >= self._edges[y - 1].end()
  708. ):
  709. s += " %04d.%04d" % (x, y)
  710. s += "}\n"
  711. # Add the leaves
  712. s += " edge [style=invis, weight=100];\n"
  713. s += " node [shape=plaintext]\n"
  714. s += " 0000.0000"
  715. for x in range(self.num_leaves()):
  716. s += "->%s->%04d.0000" % (self.leaf(x), x + 1)
  717. s += ";\n\n"
  718. # Add the edges
  719. s += " edge [style=solid, weight=1];\n"
  720. for y, edge in enumerate(self):
  721. for x in range(edge.start()):
  722. s += ' %04d.%04d -> %04d.%04d [style="invis"];\n' % (
  723. x,
  724. y + 1,
  725. x + 1,
  726. y + 1,
  727. )
  728. s += ' %04d.%04d -> %04d.%04d [label="%s"];\n' % (
  729. edge.start(),
  730. y + 1,
  731. edge.end(),
  732. y + 1,
  733. edge,
  734. )
  735. for x in range(edge.end(), self.num_leaves()):
  736. s += ' %04d.%04d -> %04d.%04d [style="invis"];\n' % (
  737. x,
  738. y + 1,
  739. x + 1,
  740. y + 1,
  741. )
  742. s += "}\n"
  743. return s
  744. ########################################################################
  745. ## Chart Rules
  746. ########################################################################
  747. class ChartRuleI(object):
  748. """
  749. A rule that specifies what new edges are licensed by any given set
  750. of existing edges. Each chart rule expects a fixed number of
  751. edges, as indicated by the class variable ``NUM_EDGES``. In
  752. particular:
  753. - A chart rule with ``NUM_EDGES=0`` specifies what new edges are
  754. licensed, regardless of existing edges.
  755. - A chart rule with ``NUM_EDGES=1`` specifies what new edges are
  756. licensed by a single existing edge.
  757. - A chart rule with ``NUM_EDGES=2`` specifies what new edges are
  758. licensed by a pair of existing edges.
  759. :type NUM_EDGES: int
  760. :cvar NUM_EDGES: The number of existing edges that this rule uses
  761. to license new edges. Typically, this number ranges from zero
  762. to two.
  763. """
  764. def apply(self, chart, grammar, *edges):
  765. """
  766. Return a generator that will add edges licensed by this rule
  767. and the given edges to the chart, one at a time. Each
  768. time the generator is resumed, it will either add a new
  769. edge and yield that edge; or return.
  770. :type edges: list(EdgeI)
  771. :param edges: A set of existing edges. The number of edges
  772. that should be passed to ``apply()`` is specified by the
  773. ``NUM_EDGES`` class variable.
  774. :rtype: iter(EdgeI)
  775. """
  776. raise NotImplementedError()
  777. def apply_everywhere(self, chart, grammar):
  778. """
  779. Return a generator that will add all edges licensed by
  780. this rule, given the edges that are currently in the
  781. chart, one at a time. Each time the generator is resumed,
  782. it will either add a new edge and yield that edge; or return.
  783. :rtype: iter(EdgeI)
  784. """
  785. raise NotImplementedError()
  786. class AbstractChartRule(ChartRuleI):
  787. """
  788. An abstract base class for chart rules. ``AbstractChartRule``
  789. provides:
  790. - A default implementation for ``apply``.
  791. - A default implementation for ``apply_everywhere``,
  792. (Currently, this implementation assumes that ``NUM_EDGES``<=3.)
  793. - A default implementation for ``__str__``, which returns a
  794. name based on the rule's class name.
  795. """
  796. # Subclasses must define apply.
  797. def apply(self, chart, grammar, *edges):
  798. raise NotImplementedError()
  799. # Default: loop through the given number of edges, and call
  800. # self.apply() for each set of edges.
  801. def apply_everywhere(self, chart, grammar):
  802. if self.NUM_EDGES == 0:
  803. for new_edge in self.apply(chart, grammar):
  804. yield new_edge
  805. elif self.NUM_EDGES == 1:
  806. for e1 in chart:
  807. for new_edge in self.apply(chart, grammar, e1):
  808. yield new_edge
  809. elif self.NUM_EDGES == 2:
  810. for e1 in chart:
  811. for e2 in chart:
  812. for new_edge in self.apply(chart, grammar, e1, e2):
  813. yield new_edge
  814. elif self.NUM_EDGES == 3:
  815. for e1 in chart:
  816. for e2 in chart:
  817. for e3 in chart:
  818. for new_edge in self.apply(chart, grammar, e1, e2, e3):
  819. yield new_edge
  820. else:
  821. raise AssertionError("NUM_EDGES>3 is not currently supported")
  822. # Default: return a name based on the class name.
  823. def __str__(self):
  824. # Add spaces between InitialCapsWords.
  825. return re.sub("([a-z])([A-Z])", r"\1 \2", self.__class__.__name__)
  826. # ////////////////////////////////////////////////////////////
  827. # Fundamental Rule
  828. # ////////////////////////////////////////////////////////////
  829. class FundamentalRule(AbstractChartRule):
  830. """
  831. A rule that joins two adjacent edges to form a single combined
  832. edge. In particular, this rule specifies that any pair of edges
  833. - ``[A -> alpha \* B beta][i:j]``
  834. - ``[B -> gamma \*][j:k]``
  835. licenses the edge:
  836. - ``[A -> alpha B * beta][i:j]``
  837. """
  838. NUM_EDGES = 2
  839. def apply(self, chart, grammar, left_edge, right_edge):
  840. # Make sure the rule is applicable.
  841. if not (
  842. left_edge.is_incomplete()
  843. and right_edge.is_complete()
  844. and left_edge.end() == right_edge.start()
  845. and left_edge.nextsym() == right_edge.lhs()
  846. ):
  847. return
  848. # Construct the new edge.
  849. new_edge = left_edge.move_dot_forward(right_edge.end())
  850. # Insert it into the chart.
  851. if chart.insert_with_backpointer(new_edge, left_edge, right_edge):
  852. yield new_edge
  853. class SingleEdgeFundamentalRule(FundamentalRule):
  854. """
  855. A rule that joins a given edge with adjacent edges in the chart,
  856. to form combined edges. In particular, this rule specifies that
  857. either of the edges:
  858. - ``[A -> alpha \* B beta][i:j]``
  859. - ``[B -> gamma \*][j:k]``
  860. licenses the edge:
  861. - ``[A -> alpha B * beta][i:j]``
  862. if the other edge is already in the chart.
  863. :note: This is basically ``FundamentalRule``, with one edge left
  864. unspecified.
  865. """
  866. NUM_EDGES = 1
  867. def apply(self, chart, grammar, edge):
  868. if edge.is_incomplete():
  869. for new_edge in self._apply_incomplete(chart, grammar, edge):
  870. yield new_edge
  871. else:
  872. for new_edge in self._apply_complete(chart, grammar, edge):
  873. yield new_edge
  874. def _apply_complete(self, chart, grammar, right_edge):
  875. for left_edge in chart.select(
  876. end=right_edge.start(), is_complete=False, nextsym=right_edge.lhs()
  877. ):
  878. new_edge = left_edge.move_dot_forward(right_edge.end())
  879. if chart.insert_with_backpointer(new_edge, left_edge, right_edge):
  880. yield new_edge
  881. def _apply_incomplete(self, chart, grammar, left_edge):
  882. for right_edge in chart.select(
  883. start=left_edge.end(), is_complete=True, lhs=left_edge.nextsym()
  884. ):
  885. new_edge = left_edge.move_dot_forward(right_edge.end())
  886. if chart.insert_with_backpointer(new_edge, left_edge, right_edge):
  887. yield new_edge
  888. # ////////////////////////////////////////////////////////////
  889. # Inserting Terminal Leafs
  890. # ////////////////////////////////////////////////////////////
  891. class LeafInitRule(AbstractChartRule):
  892. NUM_EDGES = 0
  893. def apply(self, chart, grammar):
  894. for index in range(chart.num_leaves()):
  895. new_edge = LeafEdge(chart.leaf(index), index)
  896. if chart.insert(new_edge, ()):
  897. yield new_edge
  898. # ////////////////////////////////////////////////////////////
  899. # Top-Down Prediction
  900. # ////////////////////////////////////////////////////////////
  901. class TopDownInitRule(AbstractChartRule):
  902. """
  903. A rule licensing edges corresponding to the grammar productions for
  904. the grammar's start symbol. In particular, this rule specifies that
  905. ``[S -> \* alpha][0:i]`` is licensed for each grammar production
  906. ``S -> alpha``, where ``S`` is the grammar's start symbol.
  907. """
  908. NUM_EDGES = 0
  909. def apply(self, chart, grammar):
  910. for prod in grammar.productions(lhs=grammar.start()):
  911. new_edge = TreeEdge.from_production(prod, 0)
  912. if chart.insert(new_edge, ()):
  913. yield new_edge
  914. class TopDownPredictRule(AbstractChartRule):
  915. """
  916. A rule licensing edges corresponding to the grammar productions
  917. for the nonterminal following an incomplete edge's dot. In
  918. particular, this rule specifies that
  919. ``[A -> alpha \* B beta][i:j]`` licenses the edge
  920. ``[B -> \* gamma][j:j]`` for each grammar production ``B -> gamma``.
  921. :note: This rule corresponds to the Predictor Rule in Earley parsing.
  922. """
  923. NUM_EDGES = 1
  924. def apply(self, chart, grammar, edge):
  925. if edge.is_complete():
  926. return
  927. for prod in grammar.productions(lhs=edge.nextsym()):
  928. new_edge = TreeEdge.from_production(prod, edge.end())
  929. if chart.insert(new_edge, ()):
  930. yield new_edge
  931. class CachedTopDownPredictRule(TopDownPredictRule):
  932. """
  933. A cached version of ``TopDownPredictRule``. After the first time
  934. this rule is applied to an edge with a given ``end`` and ``next``,
  935. it will not generate any more edges for edges with that ``end`` and
  936. ``next``.
  937. If ``chart`` or ``grammar`` are changed, then the cache is flushed.
  938. """
  939. def __init__(self):
  940. TopDownPredictRule.__init__(self)
  941. self._done = {}
  942. def apply(self, chart, grammar, edge):
  943. if edge.is_complete():
  944. return
  945. nextsym, index = edge.nextsym(), edge.end()
  946. if not is_nonterminal(nextsym):
  947. return
  948. # If we've already applied this rule to an edge with the same
  949. # next & end, and the chart & grammar have not changed, then
  950. # just return (no new edges to add).
  951. done = self._done.get((nextsym, index), (None, None))
  952. if done[0] is chart and done[1] is grammar:
  953. return
  954. # Add all the edges indicated by the top down expand rule.
  955. for prod in grammar.productions(lhs=nextsym):
  956. # If the left corner in the predicted production is
  957. # leaf, it must match with the input.
  958. if prod.rhs():
  959. first = prod.rhs()[0]
  960. if is_terminal(first):
  961. if index >= chart.num_leaves() or first != chart.leaf(index):
  962. continue
  963. new_edge = TreeEdge.from_production(prod, index)
  964. if chart.insert(new_edge, ()):
  965. yield new_edge
  966. # Record the fact that we've applied this rule.
  967. self._done[nextsym, index] = (chart, grammar)
  968. # ////////////////////////////////////////////////////////////
  969. # Bottom-Up Prediction
  970. # ////////////////////////////////////////////////////////////
  971. class BottomUpPredictRule(AbstractChartRule):
  972. """
  973. A rule licensing any edge corresponding to a production whose
  974. right-hand side begins with a complete edge's left-hand side. In
  975. particular, this rule specifies that ``[A -> alpha \*]`` licenses
  976. the edge ``[B -> \* A beta]`` for each grammar production ``B -> A beta``.
  977. """
  978. NUM_EDGES = 1
  979. def apply(self, chart, grammar, edge):
  980. if edge.is_incomplete():
  981. return
  982. for prod in grammar.productions(rhs=edge.lhs()):
  983. new_edge = TreeEdge.from_production(prod, edge.start())
  984. if chart.insert(new_edge, ()):
  985. yield new_edge
  986. class BottomUpPredictCombineRule(BottomUpPredictRule):
  987. """
  988. A rule licensing any edge corresponding to a production whose
  989. right-hand side begins with a complete edge's left-hand side. In
  990. particular, this rule specifies that ``[A -> alpha \*]``
  991. licenses the edge ``[B -> A \* beta]`` for each grammar
  992. production ``B -> A beta``.
  993. :note: This is like ``BottomUpPredictRule``, but it also applies
  994. the ``FundamentalRule`` to the resulting edge.
  995. """
  996. NUM_EDGES = 1
  997. def apply(self, chart, grammar, edge):
  998. if edge.is_incomplete():
  999. return
  1000. for prod in grammar.productions(rhs=edge.lhs()):
  1001. new_edge = TreeEdge(edge.span(), prod.lhs(), prod.rhs(), 1)
  1002. if chart.insert(new_edge, (edge,)):
  1003. yield new_edge
  1004. class EmptyPredictRule(AbstractChartRule):
  1005. """
  1006. A rule that inserts all empty productions as passive edges,
  1007. in every position in the chart.
  1008. """
  1009. NUM_EDGES = 0
  1010. def apply(self, chart, grammar):
  1011. for prod in grammar.productions(empty=True):
  1012. for index in range(chart.num_leaves() + 1):
  1013. new_edge = TreeEdge.from_production(prod, index)
  1014. if chart.insert(new_edge, ()):
  1015. yield new_edge
  1016. ########################################################################
  1017. ## Filtered Bottom Up
  1018. ########################################################################
  1019. class FilteredSingleEdgeFundamentalRule(SingleEdgeFundamentalRule):
  1020. def _apply_complete(self, chart, grammar, right_edge):
  1021. end = right_edge.end()
  1022. nexttoken = end < chart.num_leaves() and chart.leaf(end)
  1023. for left_edge in chart.select(
  1024. end=right_edge.start(), is_complete=False, nextsym=right_edge.lhs()
  1025. ):
  1026. if _bottomup_filter(grammar, nexttoken, left_edge.rhs(), left_edge.dot()):
  1027. new_edge = left_edge.move_dot_forward(right_edge.end())
  1028. if chart.insert_with_backpointer(new_edge, left_edge, right_edge):
  1029. yield new_edge
  1030. def _apply_incomplete(self, chart, grammar, left_edge):
  1031. for right_edge in chart.select(
  1032. start=left_edge.end(), is_complete=True, lhs=left_edge.nextsym()
  1033. ):
  1034. end = right_edge.end()
  1035. nexttoken = end < chart.num_leaves() and chart.leaf(end)
  1036. if _bottomup_filter(grammar, nexttoken, left_edge.rhs(), left_edge.dot()):
  1037. new_edge = left_edge.move_dot_forward(right_edge.end())
  1038. if chart.insert_with_backpointer(new_edge, left_edge, right_edge):
  1039. yield new_edge
  1040. class FilteredBottomUpPredictCombineRule(BottomUpPredictCombineRule):
  1041. def apply(self, chart, grammar, edge):
  1042. if edge.is_incomplete():
  1043. return
  1044. end = edge.end()
  1045. nexttoken = end < chart.num_leaves() and chart.leaf(end)
  1046. for prod in grammar.productions(rhs=edge.lhs()):
  1047. if _bottomup_filter(grammar, nexttoken, prod.rhs()):
  1048. new_edge = TreeEdge(edge.span(), prod.lhs(), prod.rhs(), 1)
  1049. if chart.insert(new_edge, (edge,)):
  1050. yield new_edge
  1051. def _bottomup_filter(grammar, nexttoken, rhs, dot=0):
  1052. if len(rhs) <= dot + 1:
  1053. return True
  1054. _next = rhs[dot + 1]
  1055. if is_terminal(_next):
  1056. return nexttoken == _next
  1057. else:
  1058. return grammar.is_leftcorner(_next, nexttoken)
  1059. ########################################################################
  1060. ## Generic Chart Parser
  1061. ########################################################################
  1062. TD_STRATEGY = [
  1063. LeafInitRule(),
  1064. TopDownInitRule(),
  1065. CachedTopDownPredictRule(),
  1066. SingleEdgeFundamentalRule(),
  1067. ]
  1068. BU_STRATEGY = [
  1069. LeafInitRule(),
  1070. EmptyPredictRule(),
  1071. BottomUpPredictRule(),
  1072. SingleEdgeFundamentalRule(),
  1073. ]
  1074. BU_LC_STRATEGY = [
  1075. LeafInitRule(),
  1076. EmptyPredictRule(),
  1077. BottomUpPredictCombineRule(),
  1078. SingleEdgeFundamentalRule(),
  1079. ]
  1080. LC_STRATEGY = [
  1081. LeafInitRule(),
  1082. FilteredBottomUpPredictCombineRule(),
  1083. FilteredSingleEdgeFundamentalRule(),
  1084. ]
  1085. class ChartParser(ParserI):
  1086. """
  1087. A generic chart parser. A "strategy", or list of
  1088. ``ChartRuleI`` instances, is used to decide what edges to add to
  1089. the chart. In particular, ``ChartParser`` uses the following
  1090. algorithm to parse texts:
  1091. | Until no new edges are added:
  1092. | For each *rule* in *strategy*:
  1093. | Apply *rule* to any applicable edges in the chart.
  1094. | Return any complete parses in the chart
  1095. """
  1096. def __init__(
  1097. self,
  1098. grammar,
  1099. strategy=BU_LC_STRATEGY,
  1100. trace=0,
  1101. trace_chart_width=50,
  1102. use_agenda=True,
  1103. chart_class=Chart,
  1104. ):
  1105. """
  1106. Create a new chart parser, that uses ``grammar`` to parse
  1107. texts.
  1108. :type grammar: CFG
  1109. :param grammar: The grammar used to parse texts.
  1110. :type strategy: list(ChartRuleI)
  1111. :param strategy: A list of rules that should be used to decide
  1112. what edges to add to the chart (top-down strategy by default).
  1113. :type trace: int
  1114. :param trace: The level of tracing that should be used when
  1115. parsing a text. ``0`` will generate no tracing output;
  1116. and higher numbers will produce more verbose tracing
  1117. output.
  1118. :type trace_chart_width: int
  1119. :param trace_chart_width: The default total width reserved for
  1120. the chart in trace output. The remainder of each line will
  1121. be used to display edges.
  1122. :type use_agenda: bool
  1123. :param use_agenda: Use an optimized agenda-based algorithm,
  1124. if possible.
  1125. :param chart_class: The class that should be used to create
  1126. the parse charts.
  1127. """
  1128. self._grammar = grammar
  1129. self._strategy = strategy
  1130. self._trace = trace
  1131. self._trace_chart_width = trace_chart_width
  1132. # If the strategy only consists of axioms (NUM_EDGES==0) and
  1133. # inference rules (NUM_EDGES==1), we can use an agenda-based algorithm:
  1134. self._use_agenda = use_agenda
  1135. self._chart_class = chart_class
  1136. self._axioms = []
  1137. self._inference_rules = []
  1138. for rule in strategy:
  1139. if rule.NUM_EDGES == 0:
  1140. self._axioms.append(rule)
  1141. elif rule.NUM_EDGES == 1:
  1142. self._inference_rules.append(rule)
  1143. else:
  1144. self._use_agenda = False
  1145. def grammar(self):
  1146. return self._grammar
  1147. def _trace_new_edges(self, chart, rule, new_edges, trace, edge_width):
  1148. if not trace:
  1149. return
  1150. print_rule_header = trace > 1
  1151. for edge in new_edges:
  1152. if print_rule_header:
  1153. print("%s:" % rule)
  1154. print_rule_header = False
  1155. print(chart.pretty_format_edge(edge, edge_width))
  1156. def chart_parse(self, tokens, trace=None):
  1157. """
  1158. Return the final parse ``Chart`` from which all possible
  1159. parse trees can be extracted.
  1160. :param tokens: The sentence to be parsed
  1161. :type tokens: list(str)
  1162. :rtype: Chart
  1163. """
  1164. if trace is None:
  1165. trace = self._trace
  1166. trace_new_edges = self._trace_new_edges
  1167. tokens = list(tokens)
  1168. self._grammar.check_coverage(tokens)
  1169. chart = self._chart_class(tokens)
  1170. grammar = self._grammar
  1171. # Width, for printing trace edges.
  1172. trace_edge_width = self._trace_chart_width // (chart.num_leaves() + 1)
  1173. if trace:
  1174. print(chart.pretty_format_leaves(trace_edge_width))
  1175. if self._use_agenda:
  1176. # Use an agenda-based algorithm.
  1177. for axiom in self._axioms:
  1178. new_edges = list(axiom.apply(chart, grammar))
  1179. trace_new_edges(chart, axiom, new_edges, trace, trace_edge_width)
  1180. inference_rules = self._inference_rules
  1181. agenda = chart.edges()
  1182. # We reverse the initial agenda, since it is a stack
  1183. # but chart.edges() functions as a queue.
  1184. agenda.reverse()
  1185. while agenda:
  1186. edge = agenda.pop()
  1187. for rule in inference_rules:
  1188. new_edges = list(rule.apply(chart, grammar, edge))
  1189. if trace:
  1190. trace_new_edges(chart, rule, new_edges, trace, trace_edge_width)
  1191. agenda += new_edges
  1192. else:
  1193. # Do not use an agenda-based algorithm.
  1194. edges_added = True
  1195. while edges_added:
  1196. edges_added = False
  1197. for rule in self._strategy:
  1198. new_edges = list(rule.apply_everywhere(chart, grammar))
  1199. edges_added = len(new_edges)
  1200. trace_new_edges(chart, rule, new_edges, trace, trace_edge_width)
  1201. # Return the final chart.
  1202. return chart
  1203. def parse(self, tokens, tree_class=Tree):
  1204. chart = self.chart_parse(tokens)
  1205. return iter(chart.parses(self._grammar.start(), tree_class=tree_class))
  1206. class TopDownChartParser(ChartParser):
  1207. """
  1208. A ``ChartParser`` using a top-down parsing strategy.
  1209. See ``ChartParser`` for more information.
  1210. """
  1211. def __init__(self, grammar, **parser_args):
  1212. ChartParser.__init__(self, grammar, TD_STRATEGY, **parser_args)
  1213. class BottomUpChartParser(ChartParser):
  1214. """
  1215. A ``ChartParser`` using a bottom-up parsing strategy.
  1216. See ``ChartParser`` for more information.
  1217. """
  1218. def __init__(self, grammar, **parser_args):
  1219. if isinstance(grammar, PCFG):
  1220. warnings.warn(
  1221. "BottomUpChartParser only works for CFG, "
  1222. "use BottomUpProbabilisticChartParser instead",
  1223. category=DeprecationWarning,
  1224. )
  1225. ChartParser.__init__(self, grammar, BU_STRATEGY, **parser_args)
  1226. class BottomUpLeftCornerChartParser(ChartParser):
  1227. """
  1228. A ``ChartParser`` using a bottom-up left-corner parsing strategy.
  1229. This strategy is often more efficient than standard bottom-up.
  1230. See ``ChartParser`` for more information.
  1231. """
  1232. def __init__(self, grammar, **parser_args):
  1233. ChartParser.__init__(self, grammar, BU_LC_STRATEGY, **parser_args)
  1234. class LeftCornerChartParser(ChartParser):
  1235. def __init__(self, grammar, **parser_args):
  1236. if not grammar.is_nonempty():
  1237. raise ValueError(
  1238. "LeftCornerParser only works for grammars " "without empty productions."
  1239. )
  1240. ChartParser.__init__(self, grammar, LC_STRATEGY, **parser_args)
  1241. ########################################################################
  1242. ## Stepping Chart Parser
  1243. ########################################################################
  1244. class SteppingChartParser(ChartParser):
  1245. """
  1246. A ``ChartParser`` that allows you to step through the parsing
  1247. process, adding a single edge at a time. It also allows you to
  1248. change the parser's strategy or grammar midway through parsing a
  1249. text.
  1250. The ``initialize`` method is used to start parsing a text. ``step``
  1251. adds a single edge to the chart. ``set_strategy`` changes the
  1252. strategy used by the chart parser. ``parses`` returns the set of
  1253. parses that has been found by the chart parser.
  1254. :ivar _restart: Records whether the parser's strategy, grammar,
  1255. or chart has been changed. If so, then ``step`` must restart
  1256. the parsing algorithm.
  1257. """
  1258. def __init__(self, grammar, strategy=[], trace=0):
  1259. self._chart = None
  1260. self._current_chartrule = None
  1261. self._restart = False
  1262. ChartParser.__init__(self, grammar, strategy, trace)
  1263. # ////////////////////////////////////////////////////////////
  1264. # Initialization
  1265. # ////////////////////////////////////////////////////////////
  1266. def initialize(self, tokens):
  1267. "Begin parsing the given tokens."
  1268. self._chart = Chart(list(tokens))
  1269. self._restart = True
  1270. # ////////////////////////////////////////////////////////////
  1271. # Stepping
  1272. # ////////////////////////////////////////////////////////////
  1273. def step(self):
  1274. """
  1275. Return a generator that adds edges to the chart, one at a
  1276. time. Each time the generator is resumed, it adds a single
  1277. edge and yields that edge. If no more edges can be added,
  1278. then it yields None.
  1279. If the parser's strategy, grammar, or chart is changed, then
  1280. the generator will continue adding edges using the new
  1281. strategy, grammar, or chart.
  1282. Note that this generator never terminates, since the grammar
  1283. or strategy might be changed to values that would add new
  1284. edges. Instead, it yields None when no more edges can be
  1285. added with the current strategy and grammar.
  1286. """
  1287. if self._chart is None:
  1288. raise ValueError("Parser must be initialized first")
  1289. while True:
  1290. self._restart = False
  1291. w = 50 // (self._chart.num_leaves() + 1)
  1292. for e in self._parse():
  1293. if self._trace > 1:
  1294. print(self._current_chartrule)
  1295. if self._trace > 0:
  1296. print(self._chart.pretty_format_edge(e, w))
  1297. yield e
  1298. if self._restart:
  1299. break
  1300. else:
  1301. yield None # No more edges.
  1302. def _parse(self):
  1303. """
  1304. A generator that implements the actual parsing algorithm.
  1305. ``step`` iterates through this generator, and restarts it
  1306. whenever the parser's strategy, grammar, or chart is modified.
  1307. """
  1308. chart = self._chart
  1309. grammar = self._grammar
  1310. edges_added = 1
  1311. while edges_added > 0:
  1312. edges_added = 0
  1313. for rule in self._strategy:
  1314. self._current_chartrule = rule
  1315. for e in rule.apply_everywhere(chart, grammar):
  1316. edges_added += 1
  1317. yield e
  1318. # ////////////////////////////////////////////////////////////
  1319. # Accessors
  1320. # ////////////////////////////////////////////////////////////
  1321. def strategy(self):
  1322. "Return the strategy used by this parser."
  1323. return self._strategy
  1324. def grammar(self):
  1325. "Return the grammar used by this parser."
  1326. return self._grammar
  1327. def chart(self):
  1328. "Return the chart that is used by this parser."
  1329. return self._chart
  1330. def current_chartrule(self):
  1331. "Return the chart rule used to generate the most recent edge."
  1332. return self._current_chartrule
  1333. def parses(self, tree_class=Tree):
  1334. "Return the parse trees currently contained in the chart."
  1335. return self._chart.parses(self._grammar.start(), tree_class)
  1336. # ////////////////////////////////////////////////////////////
  1337. # Parser modification
  1338. # ////////////////////////////////////////////////////////////
  1339. def set_strategy(self, strategy):
  1340. """
  1341. Change the strategy that the parser uses to decide which edges
  1342. to add to the chart.
  1343. :type strategy: list(ChartRuleI)
  1344. :param strategy: A list of rules that should be used to decide
  1345. what edges to add to the chart.
  1346. """
  1347. if strategy == self._strategy:
  1348. return
  1349. self._strategy = strategy[:] # Make a copy.
  1350. self._restart = True
  1351. def set_grammar(self, grammar):
  1352. "Change the grammar used by the parser."
  1353. if grammar is self._grammar:
  1354. return
  1355. self._grammar = grammar
  1356. self._restart = True
  1357. def set_chart(self, chart):
  1358. "Load a given chart into the chart parser."
  1359. if chart is self._chart:
  1360. return
  1361. self._chart = chart
  1362. self._restart = True
  1363. # ////////////////////////////////////////////////////////////
  1364. # Standard parser methods
  1365. # ////////////////////////////////////////////////////////////
  1366. def parse(self, tokens, tree_class=Tree):
  1367. tokens = list(tokens)
  1368. self._grammar.check_coverage(tokens)
  1369. # Initialize ourselves.
  1370. self.initialize(tokens)
  1371. # Step until no more edges are generated.
  1372. for e in self.step():
  1373. if e is None:
  1374. break
  1375. # Return an iterator of complete parses.
  1376. return self.parses(tree_class=tree_class)
  1377. ########################################################################
  1378. ## Demo Code
  1379. ########################################################################
  1380. def demo_grammar():
  1381. from nltk.grammar import CFG
  1382. return CFG.fromstring(
  1383. """
  1384. S -> NP VP
  1385. PP -> "with" NP
  1386. NP -> NP PP
  1387. VP -> VP PP
  1388. VP -> Verb NP
  1389. VP -> Verb
  1390. NP -> Det Noun
  1391. NP -> "John"
  1392. NP -> "I"
  1393. Det -> "the"
  1394. Det -> "my"
  1395. Det -> "a"
  1396. Noun -> "dog"
  1397. Noun -> "cookie"
  1398. Verb -> "ate"
  1399. Verb -> "saw"
  1400. Prep -> "with"
  1401. Prep -> "under"
  1402. """
  1403. )
  1404. def demo(
  1405. choice=None,
  1406. print_times=True,
  1407. print_grammar=False,
  1408. print_trees=True,
  1409. trace=2,
  1410. sent="I saw John with a dog with my cookie",
  1411. numparses=5,
  1412. ):
  1413. """
  1414. A demonstration of the chart parsers.
  1415. """
  1416. import sys, time
  1417. from nltk import nonterminals, Production, CFG
  1418. # The grammar for ChartParser and SteppingChartParser:
  1419. grammar = demo_grammar()
  1420. if print_grammar:
  1421. print("* Grammar")
  1422. print(grammar)
  1423. # Tokenize the sample sentence.
  1424. print("* Sentence:")
  1425. print(sent)
  1426. tokens = sent.split()
  1427. print(tokens)
  1428. print()
  1429. # Ask the user which parser to test,
  1430. # if the parser wasn't provided as an argument
  1431. if choice is None:
  1432. print(" 1: Top-down chart parser")
  1433. print(" 2: Bottom-up chart parser")
  1434. print(" 3: Bottom-up left-corner chart parser")
  1435. print(" 4: Left-corner chart parser with bottom-up filter")
  1436. print(" 5: Stepping chart parser (alternating top-down & bottom-up)")
  1437. print(" 6: All parsers")
  1438. print("\nWhich parser (1-6)? ", end=" ")
  1439. choice = sys.stdin.readline().strip()
  1440. print()
  1441. choice = str(choice)
  1442. if choice not in "123456":
  1443. print("Bad parser number")
  1444. return
  1445. # Keep track of how long each parser takes.
  1446. times = {}
  1447. strategies = {
  1448. "1": ("Top-down", TD_STRATEGY),
  1449. "2": ("Bottom-up", BU_STRATEGY),
  1450. "3": ("Bottom-up left-corner", BU_LC_STRATEGY),
  1451. "4": ("Filtered left-corner", LC_STRATEGY),
  1452. }
  1453. choices = []
  1454. if choice in strategies:
  1455. choices = [choice]
  1456. if choice == "6":
  1457. choices = "1234"
  1458. # Run the requested chart parser(s), except the stepping parser.
  1459. for strategy in choices:
  1460. print("* Strategy: " + strategies[strategy][0])
  1461. print()
  1462. cp = ChartParser(grammar, strategies[strategy][1], trace=trace)
  1463. t = time.time()
  1464. chart = cp.chart_parse(tokens)
  1465. parses = list(chart.parses(grammar.start()))
  1466. times[strategies[strategy][0]] = time.time() - t
  1467. print("Nr edges in chart:", len(chart.edges()))
  1468. if numparses:
  1469. assert len(parses) == numparses, "Not all parses found"
  1470. if print_trees:
  1471. for tree in parses:
  1472. print(tree)
  1473. else:
  1474. print("Nr trees:", len(parses))
  1475. print()
  1476. # Run the stepping parser, if requested.
  1477. if choice in "56":
  1478. print("* Strategy: Stepping (top-down vs bottom-up)")
  1479. print()
  1480. t = time.time()
  1481. cp = SteppingChartParser(grammar, trace=trace)
  1482. cp.initialize(tokens)
  1483. for i in range(5):
  1484. print("*** SWITCH TO TOP DOWN")
  1485. cp.set_strategy(TD_STRATEGY)
  1486. for j, e in enumerate(cp.step()):
  1487. if j > 20 or e is None:
  1488. break
  1489. print("*** SWITCH TO BOTTOM UP")
  1490. cp.set_strategy(BU_STRATEGY)
  1491. for j, e in enumerate(cp.step()):
  1492. if j > 20 or e is None:
  1493. break
  1494. times["Stepping"] = time.time() - t
  1495. print("Nr edges in chart:", len(cp.chart().edges()))
  1496. if numparses:
  1497. assert len(list(cp.parses())) == numparses, "Not all parses found"
  1498. if print_trees:
  1499. for tree in cp.parses():
  1500. print(tree)
  1501. else:
  1502. print("Nr trees:", len(list(cp.parses())))
  1503. print()
  1504. # Print the times of all parsers:
  1505. if not (print_times and times):
  1506. return
  1507. print("* Parsing times")
  1508. print()
  1509. maxlen = max(len(key) for key in times)
  1510. format = "%" + repr(maxlen) + "s parser: %6.3fsec"
  1511. times_items = times.items()
  1512. for (parser, t) in sorted(times_items, key=lambda a: a[1]):
  1513. print(format % (parser, t))
  1514. if __name__ == "__main__":
  1515. demo()