grammar.py 56 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761277127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556155715581559156015611562156315641565156615671568156915701571157215731574157515761577157815791580158115821583158415851586158715881589159015911592159315941595159615971598159916001601160216031604160516061607160816091610161116121613161416151616161716181619162016211622162316241625162616271628162916301631163216331634163516361637163816391640164116421643164416451646164716481649165016511652165316541655165616571658165916601661166216631664166516661667166816691670167116721673167416751676167716781679168016811682168316841685168616871688168916901691169216931694169516961697169816991700170117021703170417051706170717081709171017111712171317141715171617171718171917201721172217231724
  1. # -*- coding: utf-8 -*-
  2. # Natural Language Toolkit: Context Free Grammars
  3. #
  4. # Copyright (C) 2001-2020 NLTK Project
  5. # Author: Steven Bird <stevenbird1@gmail.com>
  6. # Edward Loper <edloper@gmail.com>
  7. # Jason Narad <jason.narad@gmail.com>
  8. # Peter Ljunglöf <peter.ljunglof@heatherleaf.se>
  9. # URL: <http://nltk.org/>
  10. # For license information, see LICENSE.TXT
  11. #
  12. """
  13. Basic data classes for representing context free grammars. A
  14. "grammar" specifies which trees can represent the structure of a
  15. given text. Each of these trees is called a "parse tree" for the
  16. text (or simply a "parse"). In a "context free" grammar, the set of
  17. parse trees for any piece of a text can depend only on that piece, and
  18. not on the rest of the text (i.e., the piece's context). Context free
  19. grammars are often used to find possible syntactic structures for
  20. sentences. In this context, the leaves of a parse tree are word
  21. tokens; and the node values are phrasal categories, such as ``NP``
  22. and ``VP``.
  23. The ``CFG`` class is used to encode context free grammars. Each
  24. ``CFG`` consists of a start symbol and a set of productions.
  25. The "start symbol" specifies the root node value for parse trees. For example,
  26. the start symbol for syntactic parsing is usually ``S``. Start
  27. symbols are encoded using the ``Nonterminal`` class, which is discussed
  28. below.
  29. A Grammar's "productions" specify what parent-child relationships a parse
  30. tree can contain. Each production specifies that a particular
  31. node can be the parent of a particular set of children. For example,
  32. the production ``<S> -> <NP> <VP>`` specifies that an ``S`` node can
  33. be the parent of an ``NP`` node and a ``VP`` node.
  34. Grammar productions are implemented by the ``Production`` class.
  35. Each ``Production`` consists of a left hand side and a right hand
  36. side. The "left hand side" is a ``Nonterminal`` that specifies the
  37. node type for a potential parent; and the "right hand side" is a list
  38. that specifies allowable children for that parent. This lists
  39. consists of ``Nonterminals`` and text types: each ``Nonterminal``
  40. indicates that the corresponding child may be a ``TreeToken`` with the
  41. specified node type; and each text type indicates that the
  42. corresponding child may be a ``Token`` with the with that type.
  43. The ``Nonterminal`` class is used to distinguish node values from leaf
  44. values. This prevents the grammar from accidentally using a leaf
  45. value (such as the English word "A") as the node of a subtree. Within
  46. a ``CFG``, all node values are wrapped in the ``Nonterminal``
  47. class. Note, however, that the trees that are specified by the grammar do
  48. *not* include these ``Nonterminal`` wrappers.
  49. Grammars can also be given a more procedural interpretation. According to
  50. this interpretation, a Grammar specifies any tree structure *tree* that
  51. can be produced by the following procedure:
  52. | Set tree to the start symbol
  53. | Repeat until tree contains no more nonterminal leaves:
  54. | Choose a production prod with whose left hand side
  55. | lhs is a nonterminal leaf of tree.
  56. | Replace the nonterminal leaf with a subtree, whose node
  57. | value is the value wrapped by the nonterminal lhs, and
  58. | whose children are the right hand side of prod.
  59. The operation of replacing the left hand side (*lhs*) of a production
  60. with the right hand side (*rhs*) in a tree (*tree*) is known as
  61. "expanding" *lhs* to *rhs* in *tree*.
  62. """
  63. import re
  64. from functools import total_ordering
  65. from nltk.util import transitive_closure, invert_graph
  66. from nltk.internals import raise_unorderable_types
  67. from nltk.probability import ImmutableProbabilisticMixIn
  68. from nltk.featstruct import FeatStruct, FeatDict, FeatStructReader, SLASH, TYPE
  69. #################################################################
  70. # Nonterminal
  71. #################################################################
  72. @total_ordering
  73. class Nonterminal(object):
  74. """
  75. A non-terminal symbol for a context free grammar. ``Nonterminal``
  76. is a wrapper class for node values; it is used by ``Production``
  77. objects to distinguish node values from leaf values.
  78. The node value that is wrapped by a ``Nonterminal`` is known as its
  79. "symbol". Symbols are typically strings representing phrasal
  80. categories (such as ``"NP"`` or ``"VP"``). However, more complex
  81. symbol types are sometimes used (e.g., for lexicalized grammars).
  82. Since symbols are node values, they must be immutable and
  83. hashable. Two ``Nonterminals`` are considered equal if their
  84. symbols are equal.
  85. :see: ``CFG``, ``Production``
  86. :type _symbol: any
  87. :ivar _symbol: The node value corresponding to this
  88. ``Nonterminal``. This value must be immutable and hashable.
  89. """
  90. def __init__(self, symbol):
  91. """
  92. Construct a new non-terminal from the given symbol.
  93. :type symbol: any
  94. :param symbol: The node value corresponding to this
  95. ``Nonterminal``. This value must be immutable and
  96. hashable.
  97. """
  98. self._symbol = symbol
  99. self._hash = hash(symbol)
  100. def symbol(self):
  101. """
  102. Return the node value corresponding to this ``Nonterminal``.
  103. :rtype: (any)
  104. """
  105. return self._symbol
  106. def __eq__(self, other):
  107. """
  108. Return True if this non-terminal is equal to ``other``. In
  109. particular, return True if ``other`` is a ``Nonterminal``
  110. and this non-terminal's symbol is equal to ``other`` 's symbol.
  111. :rtype: bool
  112. """
  113. return type(self) == type(other) and self._symbol == other._symbol
  114. def __ne__(self, other):
  115. return not self == other
  116. def __lt__(self, other):
  117. if not isinstance(other, Nonterminal):
  118. raise_unorderable_types("<", self, other)
  119. return self._symbol < other._symbol
  120. def __hash__(self):
  121. return self._hash
  122. def __repr__(self):
  123. """
  124. Return a string representation for this ``Nonterminal``.
  125. :rtype: str
  126. """
  127. if isinstance(self._symbol, str):
  128. return "%s" % self._symbol
  129. else:
  130. return "%s" % repr(self._symbol)
  131. def __str__(self):
  132. """
  133. Return a string representation for this ``Nonterminal``.
  134. :rtype: str
  135. """
  136. if isinstance(self._symbol, str):
  137. return "%s" % self._symbol
  138. else:
  139. return "%s" % repr(self._symbol)
  140. def __div__(self, rhs):
  141. """
  142. Return a new nonterminal whose symbol is ``A/B``, where ``A`` is
  143. the symbol for this nonterminal, and ``B`` is the symbol for rhs.
  144. :param rhs: The nonterminal used to form the right hand side
  145. of the new nonterminal.
  146. :type rhs: Nonterminal
  147. :rtype: Nonterminal
  148. """
  149. return Nonterminal("%s/%s" % (self._symbol, rhs._symbol))
  150. def __truediv__(self, rhs):
  151. """
  152. Return a new nonterminal whose symbol is ``A/B``, where ``A`` is
  153. the symbol for this nonterminal, and ``B`` is the symbol for rhs.
  154. This function allows use of the slash ``/`` operator with
  155. the future import of division.
  156. :param rhs: The nonterminal used to form the right hand side
  157. of the new nonterminal.
  158. :type rhs: Nonterminal
  159. :rtype: Nonterminal
  160. """
  161. return self.__div__(rhs)
  162. def nonterminals(symbols):
  163. """
  164. Given a string containing a list of symbol names, return a list of
  165. ``Nonterminals`` constructed from those symbols.
  166. :param symbols: The symbol name string. This string can be
  167. delimited by either spaces or commas.
  168. :type symbols: str
  169. :return: A list of ``Nonterminals`` constructed from the symbol
  170. names given in ``symbols``. The ``Nonterminals`` are sorted
  171. in the same order as the symbols names.
  172. :rtype: list(Nonterminal)
  173. """
  174. if "," in symbols:
  175. symbol_list = symbols.split(",")
  176. else:
  177. symbol_list = symbols.split()
  178. return [Nonterminal(s.strip()) for s in symbol_list]
  179. class FeatStructNonterminal(FeatDict, Nonterminal):
  180. """A feature structure that's also a nonterminal. It acts as its
  181. own symbol, and automatically freezes itself when hashed."""
  182. def __hash__(self):
  183. self.freeze()
  184. return FeatStruct.__hash__(self)
  185. def symbol(self):
  186. return self
  187. def is_nonterminal(item):
  188. """
  189. :return: True if the item is a ``Nonterminal``.
  190. :rtype: bool
  191. """
  192. return isinstance(item, Nonterminal)
  193. #################################################################
  194. # Terminals
  195. #################################################################
  196. def is_terminal(item):
  197. """
  198. Return True if the item is a terminal, which currently is
  199. if it is hashable and not a ``Nonterminal``.
  200. :rtype: bool
  201. """
  202. return hasattr(item, "__hash__") and not isinstance(item, Nonterminal)
  203. #################################################################
  204. # Productions
  205. #################################################################
  206. @total_ordering
  207. class Production(object):
  208. """
  209. A grammar production. Each production maps a single symbol
  210. on the "left-hand side" to a sequence of symbols on the
  211. "right-hand side". (In the case of context-free productions,
  212. the left-hand side must be a ``Nonterminal``, and the right-hand
  213. side is a sequence of terminals and ``Nonterminals``.)
  214. "terminals" can be any immutable hashable object that is
  215. not a ``Nonterminal``. Typically, terminals are strings
  216. representing words, such as ``"dog"`` or ``"under"``.
  217. :see: ``CFG``
  218. :see: ``DependencyGrammar``
  219. :see: ``Nonterminal``
  220. :type _lhs: Nonterminal
  221. :ivar _lhs: The left-hand side of the production.
  222. :type _rhs: tuple(Nonterminal, terminal)
  223. :ivar _rhs: The right-hand side of the production.
  224. """
  225. def __init__(self, lhs, rhs):
  226. """
  227. Construct a new ``Production``.
  228. :param lhs: The left-hand side of the new ``Production``.
  229. :type lhs: Nonterminal
  230. :param rhs: The right-hand side of the new ``Production``.
  231. :type rhs: sequence(Nonterminal and terminal)
  232. """
  233. if isinstance(rhs, str):
  234. raise TypeError(
  235. "production right hand side should be a list, " "not a string"
  236. )
  237. self._lhs = lhs
  238. self._rhs = tuple(rhs)
  239. self._hash = hash((self._lhs, self._rhs))
  240. def lhs(self):
  241. """
  242. Return the left-hand side of this ``Production``.
  243. :rtype: Nonterminal
  244. """
  245. return self._lhs
  246. def rhs(self):
  247. """
  248. Return the right-hand side of this ``Production``.
  249. :rtype: sequence(Nonterminal and terminal)
  250. """
  251. return self._rhs
  252. def __len__(self):
  253. """
  254. Return the length of the right-hand side.
  255. :rtype: int
  256. """
  257. return len(self._rhs)
  258. def is_nonlexical(self):
  259. """
  260. Return True if the right-hand side only contains ``Nonterminals``
  261. :rtype: bool
  262. """
  263. return all(is_nonterminal(n) for n in self._rhs)
  264. def is_lexical(self):
  265. """
  266. Return True if the right-hand contain at least one terminal token.
  267. :rtype: bool
  268. """
  269. return not self.is_nonlexical()
  270. def __str__(self):
  271. """
  272. Return a verbose string representation of the ``Production``.
  273. :rtype: str
  274. """
  275. result = "%s -> " % repr(self._lhs)
  276. result += " ".join(repr(el) for el in self._rhs)
  277. return result
  278. def __repr__(self):
  279. """
  280. Return a concise string representation of the ``Production``.
  281. :rtype: str
  282. """
  283. return "%s" % self
  284. def __eq__(self, other):
  285. """
  286. Return True if this ``Production`` is equal to ``other``.
  287. :rtype: bool
  288. """
  289. return (
  290. type(self) == type(other)
  291. and self._lhs == other._lhs
  292. and self._rhs == other._rhs
  293. )
  294. def __ne__(self, other):
  295. return not self == other
  296. def __lt__(self, other):
  297. if not isinstance(other, Production):
  298. raise_unorderable_types("<", self, other)
  299. return (self._lhs, self._rhs) < (other._lhs, other._rhs)
  300. def __hash__(self):
  301. """
  302. Return a hash value for the ``Production``.
  303. :rtype: int
  304. """
  305. return self._hash
  306. class DependencyProduction(Production):
  307. """
  308. A dependency grammar production. Each production maps a single
  309. head word to an unordered list of one or more modifier words.
  310. """
  311. def __str__(self):
  312. """
  313. Return a verbose string representation of the ``DependencyProduction``.
  314. :rtype: str
  315. """
  316. result = "'%s' ->" % (self._lhs,)
  317. for elt in self._rhs:
  318. result += " '%s'" % (elt,)
  319. return result
  320. class ProbabilisticProduction(Production, ImmutableProbabilisticMixIn):
  321. """
  322. A probabilistic context free grammar production.
  323. A PCFG ``ProbabilisticProduction`` is essentially just a ``Production`` that
  324. has an associated probability, which represents how likely it is that
  325. this production will be used. In particular, the probability of a
  326. ``ProbabilisticProduction`` records the likelihood that its right-hand side is
  327. the correct instantiation for any given occurrence of its left-hand side.
  328. :see: ``Production``
  329. """
  330. def __init__(self, lhs, rhs, **prob):
  331. """
  332. Construct a new ``ProbabilisticProduction``.
  333. :param lhs: The left-hand side of the new ``ProbabilisticProduction``.
  334. :type lhs: Nonterminal
  335. :param rhs: The right-hand side of the new ``ProbabilisticProduction``.
  336. :type rhs: sequence(Nonterminal and terminal)
  337. :param prob: Probability parameters of the new ``ProbabilisticProduction``.
  338. """
  339. ImmutableProbabilisticMixIn.__init__(self, **prob)
  340. Production.__init__(self, lhs, rhs)
  341. def __str__(self):
  342. return super().__str__() + (
  343. " [1.0]" if (self.prob() == 1.0) else " [%g]" % self.prob()
  344. )
  345. def __eq__(self, other):
  346. return (
  347. type(self) == type(other)
  348. and self._lhs == other._lhs
  349. and self._rhs == other._rhs
  350. and self.prob() == other.prob()
  351. )
  352. def __ne__(self, other):
  353. return not self == other
  354. def __hash__(self):
  355. return hash((self._lhs, self._rhs, self.prob()))
  356. #################################################################
  357. # Grammars
  358. #################################################################
  359. class CFG(object):
  360. """
  361. A context-free grammar. A grammar consists of a start state and
  362. a set of productions. The set of terminals and nonterminals is
  363. implicitly specified by the productions.
  364. If you need efficient key-based access to productions, you
  365. can use a subclass to implement it.
  366. """
  367. def __init__(self, start, productions, calculate_leftcorners=True):
  368. """
  369. Create a new context-free grammar, from the given start state
  370. and set of ``Production``s.
  371. :param start: The start symbol
  372. :type start: Nonterminal
  373. :param productions: The list of productions that defines the grammar
  374. :type productions: list(Production)
  375. :param calculate_leftcorners: False if we don't want to calculate the
  376. leftcorner relation. In that case, some optimized chart parsers won't work.
  377. :type calculate_leftcorners: bool
  378. """
  379. if not is_nonterminal(start):
  380. raise TypeError(
  381. "start should be a Nonterminal object,"
  382. " not a %s" % type(start).__name__
  383. )
  384. self._start = start
  385. self._productions = productions
  386. self._categories = set(prod.lhs() for prod in productions)
  387. self._calculate_indexes()
  388. self._calculate_grammar_forms()
  389. if calculate_leftcorners:
  390. self._calculate_leftcorners()
  391. def _calculate_indexes(self):
  392. self._lhs_index = {}
  393. self._rhs_index = {}
  394. self._empty_index = {}
  395. self._lexical_index = {}
  396. for prod in self._productions:
  397. # Left hand side.
  398. lhs = prod._lhs
  399. if lhs not in self._lhs_index:
  400. self._lhs_index[lhs] = []
  401. self._lhs_index[lhs].append(prod)
  402. if prod._rhs:
  403. # First item in right hand side.
  404. rhs0 = prod._rhs[0]
  405. if rhs0 not in self._rhs_index:
  406. self._rhs_index[rhs0] = []
  407. self._rhs_index[rhs0].append(prod)
  408. else:
  409. # The right hand side is empty.
  410. self._empty_index[prod.lhs()] = prod
  411. # Lexical tokens in the right hand side.
  412. for token in prod._rhs:
  413. if is_terminal(token):
  414. self._lexical_index.setdefault(token, set()).add(prod)
  415. def _calculate_leftcorners(self):
  416. # Calculate leftcorner relations, for use in optimized parsing.
  417. self._immediate_leftcorner_categories = dict(
  418. (cat, set([cat])) for cat in self._categories
  419. )
  420. self._immediate_leftcorner_words = dict(
  421. (cat, set()) for cat in self._categories
  422. )
  423. for prod in self.productions():
  424. if len(prod) > 0:
  425. cat, left = prod.lhs(), prod.rhs()[0]
  426. if is_nonterminal(left):
  427. self._immediate_leftcorner_categories[cat].add(left)
  428. else:
  429. self._immediate_leftcorner_words[cat].add(left)
  430. lc = transitive_closure(self._immediate_leftcorner_categories, reflexive=True)
  431. self._leftcorners = lc
  432. self._leftcorner_parents = invert_graph(lc)
  433. nr_leftcorner_categories = sum(
  434. map(len, self._immediate_leftcorner_categories.values())
  435. )
  436. nr_leftcorner_words = sum(map(len, self._immediate_leftcorner_words.values()))
  437. if nr_leftcorner_words > nr_leftcorner_categories > 10000:
  438. # If the grammar is big, the leftcorner-word dictionary will be too large.
  439. # In that case it is better to calculate the relation on demand.
  440. self._leftcorner_words = None
  441. return
  442. self._leftcorner_words = {}
  443. for cat in self._leftcorners:
  444. lefts = self._leftcorners[cat]
  445. lc = self._leftcorner_words[cat] = set()
  446. for left in lefts:
  447. lc.update(self._immediate_leftcorner_words.get(left, set()))
  448. @classmethod
  449. def fromstring(cls, input, encoding=None):
  450. """
  451. Return the grammar instance corresponding to the input string(s).
  452. :param input: a grammar, either in the form of a string or as a list of strings.
  453. """
  454. start, productions = read_grammar(
  455. input, standard_nonterm_parser, encoding=encoding
  456. )
  457. return cls(start, productions)
  458. def start(self):
  459. """
  460. Return the start symbol of the grammar
  461. :rtype: Nonterminal
  462. """
  463. return self._start
  464. # tricky to balance readability and efficiency here!
  465. # can't use set operations as they don't preserve ordering
  466. def productions(self, lhs=None, rhs=None, empty=False):
  467. """
  468. Return the grammar productions, filtered by the left-hand side
  469. or the first item in the right-hand side.
  470. :param lhs: Only return productions with the given left-hand side.
  471. :param rhs: Only return productions with the given first item
  472. in the right-hand side.
  473. :param empty: Only return productions with an empty right-hand side.
  474. :return: A list of productions matching the given constraints.
  475. :rtype: list(Production)
  476. """
  477. if rhs and empty:
  478. raise ValueError(
  479. "You cannot select empty and non-empty " "productions at the same time."
  480. )
  481. # no constraints so return everything
  482. if not lhs and not rhs:
  483. if not empty:
  484. return self._productions
  485. else:
  486. return self._empty_index.values()
  487. # only lhs specified so look up its index
  488. elif lhs and not rhs:
  489. if not empty:
  490. return self._lhs_index.get(lhs, [])
  491. elif lhs in self._empty_index:
  492. return [self._empty_index[lhs]]
  493. else:
  494. return []
  495. # only rhs specified so look up its index
  496. elif rhs and not lhs:
  497. return self._rhs_index.get(rhs, [])
  498. # intersect
  499. else:
  500. return [
  501. prod
  502. for prod in self._lhs_index.get(lhs, [])
  503. if prod in self._rhs_index.get(rhs, [])
  504. ]
  505. def leftcorners(self, cat):
  506. """
  507. Return the set of all nonterminals that the given nonterminal
  508. can start with, including itself.
  509. This is the reflexive, transitive closure of the immediate
  510. leftcorner relation: (A > B) iff (A -> B beta)
  511. :param cat: the parent of the leftcorners
  512. :type cat: Nonterminal
  513. :return: the set of all leftcorners
  514. :rtype: set(Nonterminal)
  515. """
  516. return self._leftcorners.get(cat, set([cat]))
  517. def is_leftcorner(self, cat, left):
  518. """
  519. True if left is a leftcorner of cat, where left can be a
  520. terminal or a nonterminal.
  521. :param cat: the parent of the leftcorner
  522. :type cat: Nonterminal
  523. :param left: the suggested leftcorner
  524. :type left: Terminal or Nonterminal
  525. :rtype: bool
  526. """
  527. if is_nonterminal(left):
  528. return left in self.leftcorners(cat)
  529. elif self._leftcorner_words:
  530. return left in self._leftcorner_words.get(cat, set())
  531. else:
  532. return any(
  533. left in self._immediate_leftcorner_words.get(parent, set())
  534. for parent in self.leftcorners(cat)
  535. )
  536. def leftcorner_parents(self, cat):
  537. """
  538. Return the set of all nonterminals for which the given category
  539. is a left corner. This is the inverse of the leftcorner relation.
  540. :param cat: the suggested leftcorner
  541. :type cat: Nonterminal
  542. :return: the set of all parents to the leftcorner
  543. :rtype: set(Nonterminal)
  544. """
  545. return self._leftcorner_parents.get(cat, set([cat]))
  546. def check_coverage(self, tokens):
  547. """
  548. Check whether the grammar rules cover the given list of tokens.
  549. If not, then raise an exception.
  550. :type tokens: list(str)
  551. """
  552. missing = [tok for tok in tokens if not self._lexical_index.get(tok)]
  553. if missing:
  554. missing = ", ".join("%r" % (w,) for w in missing)
  555. raise ValueError(
  556. "Grammar does not cover some of the " "input words: %r." % missing
  557. )
  558. def _calculate_grammar_forms(self):
  559. """
  560. Pre-calculate of which form(s) the grammar is.
  561. """
  562. prods = self._productions
  563. self._is_lexical = all(p.is_lexical() for p in prods)
  564. self._is_nonlexical = all(p.is_nonlexical() for p in prods if len(p) != 1)
  565. self._min_len = min(len(p) for p in prods)
  566. self._max_len = max(len(p) for p in prods)
  567. self._all_unary_are_lexical = all(p.is_lexical() for p in prods if len(p) == 1)
  568. def is_lexical(self):
  569. """
  570. Return True if all productions are lexicalised.
  571. """
  572. return self._is_lexical
  573. def is_nonlexical(self):
  574. """
  575. Return True if all lexical rules are "preterminals", that is,
  576. unary rules which can be separated in a preprocessing step.
  577. This means that all productions are of the forms
  578. A -> B1 ... Bn (n>=0), or A -> "s".
  579. Note: is_lexical() and is_nonlexical() are not opposites.
  580. There are grammars which are neither, and grammars which are both.
  581. """
  582. return self._is_nonlexical
  583. def min_len(self):
  584. """
  585. Return the right-hand side length of the shortest grammar production.
  586. """
  587. return self._min_len
  588. def max_len(self):
  589. """
  590. Return the right-hand side length of the longest grammar production.
  591. """
  592. return self._max_len
  593. def is_nonempty(self):
  594. """
  595. Return True if there are no empty productions.
  596. """
  597. return self._min_len > 0
  598. def is_binarised(self):
  599. """
  600. Return True if all productions are at most binary.
  601. Note that there can still be empty and unary productions.
  602. """
  603. return self._max_len <= 2
  604. def is_flexible_chomsky_normal_form(self):
  605. """
  606. Return True if all productions are of the forms
  607. A -> B C, A -> B, or A -> "s".
  608. """
  609. return self.is_nonempty() and self.is_nonlexical() and self.is_binarised()
  610. def is_chomsky_normal_form(self):
  611. """
  612. Return True if the grammar is of Chomsky Normal Form, i.e. all productions
  613. are of the form A -> B C, or A -> "s".
  614. """
  615. return self.is_flexible_chomsky_normal_form() and self._all_unary_are_lexical
  616. def chomsky_normal_form(self, new_token_padding="@$@", flexible=False):
  617. """
  618. Returns a new Grammer that is in chomsky normal
  619. :param: new_token_padding
  620. Customise new rule formation during binarisation
  621. """
  622. if self.is_chomsky_normal_form():
  623. return self
  624. if self.productions(empty=True):
  625. raise ValueError(
  626. ("Grammar has Empty rules. " "Cannot deal with them at the moment")
  627. )
  628. # check for mixed rules
  629. for rule in self.productions():
  630. if rule.is_lexical() and len(rule.rhs()) > 1:
  631. raise ValueError(
  632. "Cannot handled mixed rule {} => {}".format(rule.lhs(), rule.rhs())
  633. )
  634. step1 = CFG.eliminate_start(self)
  635. step2 = CFG.binarize(step1, new_token_padding)
  636. if flexible:
  637. return step2
  638. step3 = CFG.remove_unitary_rules(step2)
  639. return step3
  640. @classmethod
  641. def remove_unitary_rules(cls, grammar):
  642. """
  643. Remove nonlexical unitary rules and convert them to
  644. lexical
  645. """
  646. result = []
  647. unitary = []
  648. for rule in grammar.productions():
  649. if len(rule) == 1 and rule.is_nonlexical():
  650. unitary.append(rule)
  651. else:
  652. result.append(rule)
  653. while unitary:
  654. rule = unitary.pop(0)
  655. for item in grammar.productions(lhs=rule.rhs()[0]):
  656. new_rule = Production(rule.lhs(), item.rhs())
  657. if len(new_rule) != 1 or new_rule.is_lexical():
  658. result.append(new_rule)
  659. else:
  660. unitary.append(new_rule)
  661. n_grammar = CFG(grammar.start(), result)
  662. return n_grammar
  663. @classmethod
  664. def binarize(cls, grammar, padding="@$@"):
  665. """
  666. Convert all non-binary rules into binary by introducing
  667. new tokens.
  668. Example::
  669. Original:
  670. A => B C D
  671. After Conversion:
  672. A => B A@$@B
  673. A@$@B => C D
  674. """
  675. result = []
  676. for rule in grammar.productions():
  677. if len(rule.rhs()) > 2:
  678. # this rule needs to be broken down
  679. left_side = rule.lhs()
  680. for k in range(0, len(rule.rhs()) - 2):
  681. tsym = rule.rhs()[k]
  682. new_sym = Nonterminal(left_side.symbol() + padding + tsym.symbol())
  683. new_production = Production(left_side, (tsym, new_sym))
  684. left_side = new_sym
  685. result.append(new_production)
  686. last_prd = Production(left_side, rule.rhs()[-2:])
  687. result.append(last_prd)
  688. else:
  689. result.append(rule)
  690. n_grammar = CFG(grammar.start(), result)
  691. return n_grammar
  692. @classmethod
  693. def eliminate_start(cls, grammar):
  694. """
  695. Eliminate start rule in case it appears on RHS
  696. Example: S -> S0 S1 and S0 -> S1 S
  697. Then another rule S0_Sigma -> S is added
  698. """
  699. start = grammar.start()
  700. result = []
  701. need_to_add = None
  702. for rule in grammar.productions():
  703. if start in rule.rhs():
  704. need_to_add = True
  705. result.append(rule)
  706. if need_to_add:
  707. start = Nonterminal("S0_SIGMA")
  708. result.append(Production(start, [grammar.start()]))
  709. n_grammar = CFG(start, result)
  710. return n_grammar
  711. return grammar
  712. def __repr__(self):
  713. return "<Grammar with %d productions>" % len(self._productions)
  714. def __str__(self):
  715. result = "Grammar with %d productions" % len(self._productions)
  716. result += " (start state = %r)" % self._start
  717. for production in self._productions:
  718. result += "\n %s" % production
  719. return result
  720. class FeatureGrammar(CFG):
  721. """
  722. A feature-based grammar. This is equivalent to a
  723. ``CFG`` whose nonterminals are all
  724. ``FeatStructNonterminal``.
  725. A grammar consists of a start state and a set of
  726. productions. The set of terminals and nonterminals
  727. is implicitly specified by the productions.
  728. """
  729. def __init__(self, start, productions):
  730. """
  731. Create a new feature-based grammar, from the given start
  732. state and set of ``Productions``.
  733. :param start: The start symbol
  734. :type start: FeatStructNonterminal
  735. :param productions: The list of productions that defines the grammar
  736. :type productions: list(Production)
  737. """
  738. CFG.__init__(self, start, productions)
  739. # The difference with CFG is that the productions are
  740. # indexed on the TYPE feature of the nonterminals.
  741. # This is calculated by the method _get_type_if_possible().
  742. def _calculate_indexes(self):
  743. self._lhs_index = {}
  744. self._rhs_index = {}
  745. self._empty_index = {}
  746. self._empty_productions = []
  747. self._lexical_index = {}
  748. for prod in self._productions:
  749. # Left hand side.
  750. lhs = self._get_type_if_possible(prod._lhs)
  751. if lhs not in self._lhs_index:
  752. self._lhs_index[lhs] = []
  753. self._lhs_index[lhs].append(prod)
  754. if prod._rhs:
  755. # First item in right hand side.
  756. rhs0 = self._get_type_if_possible(prod._rhs[0])
  757. if rhs0 not in self._rhs_index:
  758. self._rhs_index[rhs0] = []
  759. self._rhs_index[rhs0].append(prod)
  760. else:
  761. # The right hand side is empty.
  762. if lhs not in self._empty_index:
  763. self._empty_index[lhs] = []
  764. self._empty_index[lhs].append(prod)
  765. self._empty_productions.append(prod)
  766. # Lexical tokens in the right hand side.
  767. for token in prod._rhs:
  768. if is_terminal(token):
  769. self._lexical_index.setdefault(token, set()).add(prod)
  770. @classmethod
  771. def fromstring(
  772. cls, input, features=None, logic_parser=None, fstruct_reader=None, encoding=None
  773. ):
  774. """
  775. Return a feature structure based grammar.
  776. :param input: a grammar, either in the form of a string or else
  777. as a list of strings.
  778. :param features: a tuple of features (default: SLASH, TYPE)
  779. :param logic_parser: a parser for lambda-expressions,
  780. by default, ``LogicParser()``
  781. :param fstruct_reader: a feature structure parser
  782. (only if features and logic_parser is None)
  783. """
  784. if features is None:
  785. features = (SLASH, TYPE)
  786. if fstruct_reader is None:
  787. fstruct_reader = FeatStructReader(
  788. features, FeatStructNonterminal, logic_parser=logic_parser
  789. )
  790. elif logic_parser is not None:
  791. raise Exception(
  792. "'logic_parser' and 'fstruct_reader' must " "not both be set"
  793. )
  794. start, productions = read_grammar(
  795. input, fstruct_reader.read_partial, encoding=encoding
  796. )
  797. return cls(start, productions)
  798. def productions(self, lhs=None, rhs=None, empty=False):
  799. """
  800. Return the grammar productions, filtered by the left-hand side
  801. or the first item in the right-hand side.
  802. :param lhs: Only return productions with the given left-hand side.
  803. :param rhs: Only return productions with the given first item
  804. in the right-hand side.
  805. :param empty: Only return productions with an empty right-hand side.
  806. :rtype: list(Production)
  807. """
  808. if rhs and empty:
  809. raise ValueError(
  810. "You cannot select empty and non-empty " "productions at the same time."
  811. )
  812. # no constraints so return everything
  813. if not lhs and not rhs:
  814. if empty:
  815. return self._empty_productions
  816. else:
  817. return self._productions
  818. # only lhs specified so look up its index
  819. elif lhs and not rhs:
  820. if empty:
  821. return self._empty_index.get(self._get_type_if_possible(lhs), [])
  822. else:
  823. return self._lhs_index.get(self._get_type_if_possible(lhs), [])
  824. # only rhs specified so look up its index
  825. elif rhs and not lhs:
  826. return self._rhs_index.get(self._get_type_if_possible(rhs), [])
  827. # intersect
  828. else:
  829. return [
  830. prod
  831. for prod in self._lhs_index.get(self._get_type_if_possible(lhs), [])
  832. if prod in self._rhs_index.get(self._get_type_if_possible(rhs), [])
  833. ]
  834. def leftcorners(self, cat):
  835. """
  836. Return the set of all words that the given category can start with.
  837. Also called the "first set" in compiler construction.
  838. """
  839. raise NotImplementedError("Not implemented yet")
  840. def leftcorner_parents(self, cat):
  841. """
  842. Return the set of all categories for which the given category
  843. is a left corner.
  844. """
  845. raise NotImplementedError("Not implemented yet")
  846. def _get_type_if_possible(self, item):
  847. """
  848. Helper function which returns the ``TYPE`` feature of the ``item``,
  849. if it exists, otherwise it returns the ``item`` itself
  850. """
  851. if isinstance(item, dict) and TYPE in item:
  852. return FeatureValueType(item[TYPE])
  853. else:
  854. return item
  855. @total_ordering
  856. class FeatureValueType(object):
  857. """
  858. A helper class for ``FeatureGrammars``, designed to be different
  859. from ordinary strings. This is to stop the ``FeatStruct``
  860. ``FOO[]`` from being compare equal to the terminal "FOO".
  861. """
  862. def __init__(self, value):
  863. self._value = value
  864. self._hash = hash(value)
  865. def __repr__(self):
  866. return "<%s>" % self._value
  867. def __eq__(self, other):
  868. return type(self) == type(other) and self._value == other._value
  869. def __ne__(self, other):
  870. return not self == other
  871. def __lt__(self, other):
  872. if not isinstance(other, FeatureValueType):
  873. raise_unorderable_types("<", self, other)
  874. return self._value < other._value
  875. def __hash__(self):
  876. return self._hash
  877. class DependencyGrammar(object):
  878. """
  879. A dependency grammar. A DependencyGrammar consists of a set of
  880. productions. Each production specifies a head/modifier relationship
  881. between a pair of words.
  882. """
  883. def __init__(self, productions):
  884. """
  885. Create a new dependency grammar, from the set of ``Productions``.
  886. :param productions: The list of productions that defines the grammar
  887. :type productions: list(Production)
  888. """
  889. self._productions = productions
  890. @classmethod
  891. def fromstring(cls, input):
  892. productions = []
  893. for linenum, line in enumerate(input.split("\n")):
  894. line = line.strip()
  895. if line.startswith("#") or line == "":
  896. continue
  897. try:
  898. productions += _read_dependency_production(line)
  899. except ValueError:
  900. raise ValueError("Unable to parse line %s: %s" % (linenum, line))
  901. if len(productions) == 0:
  902. raise ValueError("No productions found!")
  903. return cls(productions)
  904. def contains(self, head, mod):
  905. """
  906. :param head: A head word.
  907. :type head: str
  908. :param mod: A mod word, to test as a modifier of 'head'.
  909. :type mod: str
  910. :return: true if this ``DependencyGrammar`` contains a
  911. ``DependencyProduction`` mapping 'head' to 'mod'.
  912. :rtype: bool
  913. """
  914. for production in self._productions:
  915. for possibleMod in production._rhs:
  916. if production._lhs == head and possibleMod == mod:
  917. return True
  918. return False
  919. def __contains__(self, head, mod):
  920. """
  921. Return True if this ``DependencyGrammar`` contains a
  922. ``DependencyProduction`` mapping 'head' to 'mod'.
  923. :param head: A head word.
  924. :type head: str
  925. :param mod: A mod word, to test as a modifier of 'head'.
  926. :type mod: str
  927. :rtype: bool
  928. """
  929. for production in self._productions:
  930. for possibleMod in production._rhs:
  931. if production._lhs == head and possibleMod == mod:
  932. return True
  933. return False
  934. # # should be rewritten, the set comp won't work in all comparisons
  935. # def contains_exactly(self, head, modlist):
  936. # for production in self._productions:
  937. # if(len(production._rhs) == len(modlist)):
  938. # if(production._lhs == head):
  939. # set1 = Set(production._rhs)
  940. # set2 = Set(modlist)
  941. # if(set1 == set2):
  942. # return True
  943. # return False
  944. def __str__(self):
  945. """
  946. Return a verbose string representation of the ``DependencyGrammar``
  947. :rtype: str
  948. """
  949. str = "Dependency grammar with %d productions" % len(self._productions)
  950. for production in self._productions:
  951. str += "\n %s" % production
  952. return str
  953. def __repr__(self):
  954. """
  955. Return a concise string representation of the ``DependencyGrammar``
  956. """
  957. return "Dependency grammar with %d productions" % len(self._productions)
  958. class ProbabilisticDependencyGrammar(object):
  959. """
  960. """
  961. def __init__(self, productions, events, tags):
  962. self._productions = productions
  963. self._events = events
  964. self._tags = tags
  965. def contains(self, head, mod):
  966. """
  967. Return True if this ``DependencyGrammar`` contains a
  968. ``DependencyProduction`` mapping 'head' to 'mod'.
  969. :param head: A head word.
  970. :type head: str
  971. :param mod: A mod word, to test as a modifier of 'head'.
  972. :type mod: str
  973. :rtype: bool
  974. """
  975. for production in self._productions:
  976. for possibleMod in production._rhs:
  977. if production._lhs == head and possibleMod == mod:
  978. return True
  979. return False
  980. def __str__(self):
  981. """
  982. Return a verbose string representation of the ``ProbabilisticDependencyGrammar``
  983. :rtype: str
  984. """
  985. str = "Statistical dependency grammar with %d productions" % len(
  986. self._productions
  987. )
  988. for production in self._productions:
  989. str += "\n %s" % production
  990. str += "\nEvents:"
  991. for event in self._events:
  992. str += "\n %d:%s" % (self._events[event], event)
  993. str += "\nTags:"
  994. for tag_word in self._tags:
  995. str += "\n %s:\t(%s)" % (tag_word, self._tags[tag_word])
  996. return str
  997. def __repr__(self):
  998. """
  999. Return a concise string representation of the ``ProbabilisticDependencyGrammar``
  1000. """
  1001. return "Statistical Dependency grammar with %d productions" % len(
  1002. self._productions
  1003. )
  1004. class PCFG(CFG):
  1005. """
  1006. A probabilistic context-free grammar. A PCFG consists of a
  1007. start state and a set of productions with probabilities. The set of
  1008. terminals and nonterminals is implicitly specified by the productions.
  1009. PCFG productions use the ``ProbabilisticProduction`` class.
  1010. ``PCFGs`` impose the constraint that the set of productions with
  1011. any given left-hand-side must have probabilities that sum to 1
  1012. (allowing for a small margin of error).
  1013. If you need efficient key-based access to productions, you can use
  1014. a subclass to implement it.
  1015. :type EPSILON: float
  1016. :cvar EPSILON: The acceptable margin of error for checking that
  1017. productions with a given left-hand side have probabilities
  1018. that sum to 1.
  1019. """
  1020. EPSILON = 0.01
  1021. def __init__(self, start, productions, calculate_leftcorners=True):
  1022. """
  1023. Create a new context-free grammar, from the given start state
  1024. and set of ``ProbabilisticProductions``.
  1025. :param start: The start symbol
  1026. :type start: Nonterminal
  1027. :param productions: The list of productions that defines the grammar
  1028. :type productions: list(Production)
  1029. :raise ValueError: if the set of productions with any left-hand-side
  1030. do not have probabilities that sum to a value within
  1031. EPSILON of 1.
  1032. :param calculate_leftcorners: False if we don't want to calculate the
  1033. leftcorner relation. In that case, some optimized chart parsers won't work.
  1034. :type calculate_leftcorners: bool
  1035. """
  1036. CFG.__init__(self, start, productions, calculate_leftcorners)
  1037. # Make sure that the probabilities sum to one.
  1038. probs = {}
  1039. for production in productions:
  1040. probs[production.lhs()] = probs.get(production.lhs(), 0) + production.prob()
  1041. for (lhs, p) in probs.items():
  1042. if not ((1 - PCFG.EPSILON) < p < (1 + PCFG.EPSILON)):
  1043. raise ValueError("Productions for %r do not sum to 1" % lhs)
  1044. @classmethod
  1045. def fromstring(cls, input, encoding=None):
  1046. """
  1047. Return a probabilistic context-free grammar corresponding to the
  1048. input string(s).
  1049. :param input: a grammar, either in the form of a string or else
  1050. as a list of strings.
  1051. """
  1052. start, productions = read_grammar(
  1053. input, standard_nonterm_parser, probabilistic=True, encoding=encoding
  1054. )
  1055. return cls(start, productions)
  1056. #################################################################
  1057. # Inducing Grammars
  1058. #################################################################
  1059. # Contributed by Nathan Bodenstab <bodenstab@cslu.ogi.edu>
  1060. def induce_pcfg(start, productions):
  1061. """
  1062. Induce a PCFG grammar from a list of productions.
  1063. The probability of a production A -> B C in a PCFG is:
  1064. | count(A -> B C)
  1065. | P(B, C | A) = --------------- where \* is any right hand side
  1066. | count(A -> \*)
  1067. :param start: The start symbol
  1068. :type start: Nonterminal
  1069. :param productions: The list of productions that defines the grammar
  1070. :type productions: list(Production)
  1071. """
  1072. # Production count: the number of times a given production occurs
  1073. pcount = {}
  1074. # LHS-count: counts the number of times a given lhs occurs
  1075. lcount = {}
  1076. for prod in productions:
  1077. lcount[prod.lhs()] = lcount.get(prod.lhs(), 0) + 1
  1078. pcount[prod] = pcount.get(prod, 0) + 1
  1079. prods = [
  1080. ProbabilisticProduction(p.lhs(), p.rhs(), prob=pcount[p] / lcount[p.lhs()])
  1081. for p in pcount
  1082. ]
  1083. return PCFG(start, prods)
  1084. #################################################################
  1085. # Helper functions for reading productions
  1086. #################################################################
  1087. def _read_cfg_production(input):
  1088. """
  1089. Return a list of context-free ``Productions``.
  1090. """
  1091. return _read_production(input, standard_nonterm_parser)
  1092. def _read_pcfg_production(input):
  1093. """
  1094. Return a list of PCFG ``ProbabilisticProductions``.
  1095. """
  1096. return _read_production(input, standard_nonterm_parser, probabilistic=True)
  1097. def _read_fcfg_production(input, fstruct_reader):
  1098. """
  1099. Return a list of feature-based ``Productions``.
  1100. """
  1101. return _read_production(input, fstruct_reader)
  1102. # Parsing generic grammars
  1103. _ARROW_RE = re.compile(r"\s* -> \s*", re.VERBOSE)
  1104. _PROBABILITY_RE = re.compile(r"( \[ [\d\.]+ \] ) \s*", re.VERBOSE)
  1105. _TERMINAL_RE = re.compile(r'( "[^"]+" | \'[^\']+\' ) \s*', re.VERBOSE)
  1106. _DISJUNCTION_RE = re.compile(r"\| \s*", re.VERBOSE)
  1107. def _read_production(line, nonterm_parser, probabilistic=False):
  1108. """
  1109. Parse a grammar rule, given as a string, and return
  1110. a list of productions.
  1111. """
  1112. pos = 0
  1113. # Parse the left-hand side.
  1114. lhs, pos = nonterm_parser(line, pos)
  1115. # Skip over the arrow.
  1116. m = _ARROW_RE.match(line, pos)
  1117. if not m:
  1118. raise ValueError("Expected an arrow")
  1119. pos = m.end()
  1120. # Parse the right hand side.
  1121. probabilities = [0.0]
  1122. rhsides = [[]]
  1123. while pos < len(line):
  1124. # Probability.
  1125. m = _PROBABILITY_RE.match(line, pos)
  1126. if probabilistic and m:
  1127. pos = m.end()
  1128. probabilities[-1] = float(m.group(1)[1:-1])
  1129. if probabilities[-1] > 1.0:
  1130. raise ValueError(
  1131. "Production probability %f, "
  1132. "should not be greater than 1.0" % (probabilities[-1],)
  1133. )
  1134. # String -- add terminal.
  1135. elif line[pos] in "'\"":
  1136. m = _TERMINAL_RE.match(line, pos)
  1137. if not m:
  1138. raise ValueError("Unterminated string")
  1139. rhsides[-1].append(m.group(1)[1:-1])
  1140. pos = m.end()
  1141. # Vertical bar -- start new rhside.
  1142. elif line[pos] == "|":
  1143. m = _DISJUNCTION_RE.match(line, pos)
  1144. probabilities.append(0.0)
  1145. rhsides.append([])
  1146. pos = m.end()
  1147. # Anything else -- nonterminal.
  1148. else:
  1149. nonterm, pos = nonterm_parser(line, pos)
  1150. rhsides[-1].append(nonterm)
  1151. if probabilistic:
  1152. return [
  1153. ProbabilisticProduction(lhs, rhs, prob=probability)
  1154. for (rhs, probability) in zip(rhsides, probabilities)
  1155. ]
  1156. else:
  1157. return [Production(lhs, rhs) for rhs in rhsides]
  1158. #################################################################
  1159. # Reading Phrase Structure Grammars
  1160. #################################################################
  1161. def read_grammar(input, nonterm_parser, probabilistic=False, encoding=None):
  1162. """
  1163. Return a pair consisting of a starting category and a list of
  1164. ``Productions``.
  1165. :param input: a grammar, either in the form of a string or else
  1166. as a list of strings.
  1167. :param nonterm_parser: a function for parsing nonterminals.
  1168. It should take a ``(string, position)`` as argument and
  1169. return a ``(nonterminal, position)`` as result.
  1170. :param probabilistic: are the grammar rules probabilistic?
  1171. :type probabilistic: bool
  1172. :param encoding: the encoding of the grammar, if it is a binary string
  1173. :type encoding: str
  1174. """
  1175. if encoding is not None:
  1176. input = input.decode(encoding)
  1177. if isinstance(input, str):
  1178. lines = input.split("\n")
  1179. else:
  1180. lines = input
  1181. start = None
  1182. productions = []
  1183. continue_line = ""
  1184. for linenum, line in enumerate(lines):
  1185. line = continue_line + line.strip()
  1186. if line.startswith("#") or line == "":
  1187. continue
  1188. if line.endswith("\\"):
  1189. continue_line = line[:-1].rstrip() + " "
  1190. continue
  1191. continue_line = ""
  1192. try:
  1193. if line[0] == "%":
  1194. directive, args = line[1:].split(None, 1)
  1195. if directive == "start":
  1196. start, pos = nonterm_parser(args, 0)
  1197. if pos != len(args):
  1198. raise ValueError("Bad argument to start directive")
  1199. else:
  1200. raise ValueError("Bad directive")
  1201. else:
  1202. # expand out the disjunctions on the RHS
  1203. productions += _read_production(line, nonterm_parser, probabilistic)
  1204. except ValueError as e:
  1205. raise ValueError("Unable to parse line %s: %s\n%s" % (linenum + 1, line, e))
  1206. if not productions:
  1207. raise ValueError("No productions found!")
  1208. if not start:
  1209. start = productions[0].lhs()
  1210. return (start, productions)
  1211. _STANDARD_NONTERM_RE = re.compile("( [\w/][\w/^<>-]* ) \s*", re.VERBOSE)
  1212. def standard_nonterm_parser(string, pos):
  1213. m = _STANDARD_NONTERM_RE.match(string, pos)
  1214. if not m:
  1215. raise ValueError("Expected a nonterminal, found: " + string[pos:])
  1216. return (Nonterminal(m.group(1)), m.end())
  1217. #################################################################
  1218. # Reading Dependency Grammars
  1219. #################################################################
  1220. _READ_DG_RE = re.compile(
  1221. r"""^\s* # leading whitespace
  1222. ('[^']+')\s* # single-quoted lhs
  1223. (?:[-=]+>)\s* # arrow
  1224. (?:( # rhs:
  1225. "[^"]+" # doubled-quoted terminal
  1226. | '[^']+' # single-quoted terminal
  1227. | \| # disjunction
  1228. )
  1229. \s*) # trailing space
  1230. *$""", # zero or more copies
  1231. re.VERBOSE,
  1232. )
  1233. _SPLIT_DG_RE = re.compile(r"""('[^']'|[-=]+>|"[^"]+"|'[^']+'|\|)""")
  1234. def _read_dependency_production(s):
  1235. if not _READ_DG_RE.match(s):
  1236. raise ValueError("Bad production string")
  1237. pieces = _SPLIT_DG_RE.split(s)
  1238. pieces = [p for i, p in enumerate(pieces) if i % 2 == 1]
  1239. lhside = pieces[0].strip("'\"")
  1240. rhsides = [[]]
  1241. for piece in pieces[2:]:
  1242. if piece == "|":
  1243. rhsides.append([])
  1244. else:
  1245. rhsides[-1].append(piece.strip("'\""))
  1246. return [DependencyProduction(lhside, rhside) for rhside in rhsides]
  1247. #################################################################
  1248. # Demonstration
  1249. #################################################################
  1250. def cfg_demo():
  1251. """
  1252. A demonstration showing how ``CFGs`` can be created and used.
  1253. """
  1254. from nltk import nonterminals, Production, CFG
  1255. # Create some nonterminals
  1256. S, NP, VP, PP = nonterminals("S, NP, VP, PP")
  1257. N, V, P, Det = nonterminals("N, V, P, Det")
  1258. VP_slash_NP = VP / NP
  1259. print("Some nonterminals:", [S, NP, VP, PP, N, V, P, Det, VP / NP])
  1260. print(" S.symbol() =>", repr(S.symbol()))
  1261. print()
  1262. print(Production(S, [NP]))
  1263. # Create some Grammar Productions
  1264. grammar = CFG.fromstring(
  1265. """
  1266. S -> NP VP
  1267. PP -> P NP
  1268. NP -> Det N | NP PP
  1269. VP -> V NP | VP PP
  1270. Det -> 'a' | 'the'
  1271. N -> 'dog' | 'cat'
  1272. V -> 'chased' | 'sat'
  1273. P -> 'on' | 'in'
  1274. """
  1275. )
  1276. print("A Grammar:", repr(grammar))
  1277. print(" grammar.start() =>", repr(grammar.start()))
  1278. print(" grammar.productions() =>", end=" ")
  1279. # Use string.replace(...) is to line-wrap the output.
  1280. print(repr(grammar.productions()).replace(",", ",\n" + " " * 25))
  1281. print()
  1282. toy_pcfg1 = PCFG.fromstring(
  1283. """
  1284. S -> NP VP [1.0]
  1285. NP -> Det N [0.5] | NP PP [0.25] | 'John' [0.1] | 'I' [0.15]
  1286. Det -> 'the' [0.8] | 'my' [0.2]
  1287. N -> 'man' [0.5] | 'telescope' [0.5]
  1288. VP -> VP PP [0.1] | V NP [0.7] | V [0.2]
  1289. V -> 'ate' [0.35] | 'saw' [0.65]
  1290. PP -> P NP [1.0]
  1291. P -> 'with' [0.61] | 'under' [0.39]
  1292. """
  1293. )
  1294. toy_pcfg2 = PCFG.fromstring(
  1295. """
  1296. S -> NP VP [1.0]
  1297. VP -> V NP [.59]
  1298. VP -> V [.40]
  1299. VP -> VP PP [.01]
  1300. NP -> Det N [.41]
  1301. NP -> Name [.28]
  1302. NP -> NP PP [.31]
  1303. PP -> P NP [1.0]
  1304. V -> 'saw' [.21]
  1305. V -> 'ate' [.51]
  1306. V -> 'ran' [.28]
  1307. N -> 'boy' [.11]
  1308. N -> 'cookie' [.12]
  1309. N -> 'table' [.13]
  1310. N -> 'telescope' [.14]
  1311. N -> 'hill' [.5]
  1312. Name -> 'Jack' [.52]
  1313. Name -> 'Bob' [.48]
  1314. P -> 'with' [.61]
  1315. P -> 'under' [.39]
  1316. Det -> 'the' [.41]
  1317. Det -> 'a' [.31]
  1318. Det -> 'my' [.28]
  1319. """
  1320. )
  1321. def pcfg_demo():
  1322. """
  1323. A demonstration showing how a ``PCFG`` can be created and used.
  1324. """
  1325. from nltk.corpus import treebank
  1326. from nltk import treetransforms
  1327. from nltk import induce_pcfg
  1328. from nltk.parse import pchart
  1329. pcfg_prods = toy_pcfg1.productions()
  1330. pcfg_prod = pcfg_prods[2]
  1331. print("A PCFG production:", repr(pcfg_prod))
  1332. print(" pcfg_prod.lhs() =>", repr(pcfg_prod.lhs()))
  1333. print(" pcfg_prod.rhs() =>", repr(pcfg_prod.rhs()))
  1334. print(" pcfg_prod.prob() =>", repr(pcfg_prod.prob()))
  1335. print()
  1336. grammar = toy_pcfg2
  1337. print("A PCFG grammar:", repr(grammar))
  1338. print(" grammar.start() =>", repr(grammar.start()))
  1339. print(" grammar.productions() =>", end=" ")
  1340. # Use .replace(...) is to line-wrap the output.
  1341. print(repr(grammar.productions()).replace(",", ",\n" + " " * 26))
  1342. print()
  1343. # extract productions from three trees and induce the PCFG
  1344. print("Induce PCFG grammar from treebank data:")
  1345. productions = []
  1346. item = treebank._fileids[0]
  1347. for tree in treebank.parsed_sents(item)[:3]:
  1348. # perform optional tree transformations, e.g.:
  1349. tree.collapse_unary(collapsePOS=False)
  1350. tree.chomsky_normal_form(horzMarkov=2)
  1351. productions += tree.productions()
  1352. S = Nonterminal("S")
  1353. grammar = induce_pcfg(S, productions)
  1354. print(grammar)
  1355. print()
  1356. print("Parse sentence using induced grammar:")
  1357. parser = pchart.InsideChartParser(grammar)
  1358. parser.trace(3)
  1359. # doesn't work as tokens are different:
  1360. # sent = treebank.tokenized('wsj_0001.mrg')[0]
  1361. sent = treebank.parsed_sents(item)[0].leaves()
  1362. print(sent)
  1363. for parse in parser.parse(sent):
  1364. print(parse)
  1365. def fcfg_demo():
  1366. import nltk.data
  1367. g = nltk.data.load("grammars/book_grammars/feat0.fcfg")
  1368. print(g)
  1369. print()
  1370. def dg_demo():
  1371. """
  1372. A demonstration showing the creation and inspection of a
  1373. ``DependencyGrammar``.
  1374. """
  1375. grammar = DependencyGrammar.fromstring(
  1376. """
  1377. 'scratch' -> 'cats' | 'walls'
  1378. 'walls' -> 'the'
  1379. 'cats' -> 'the'
  1380. """
  1381. )
  1382. print(grammar)
  1383. def sdg_demo():
  1384. """
  1385. A demonstration of how to read a string representation of
  1386. a CoNLL format dependency tree.
  1387. """
  1388. from nltk.parse import DependencyGraph
  1389. dg = DependencyGraph(
  1390. """
  1391. 1 Ze ze Pron Pron per|3|evofmv|nom 2 su _ _
  1392. 2 had heb V V trans|ovt|1of2of3|ev 0 ROOT _ _
  1393. 3 met met Prep Prep voor 8 mod _ _
  1394. 4 haar haar Pron Pron bez|3|ev|neut|attr 5 det _ _
  1395. 5 moeder moeder N N soort|ev|neut 3 obj1 _ _
  1396. 6 kunnen kan V V hulp|ott|1of2of3|mv 2 vc _ _
  1397. 7 gaan ga V V hulp|inf 6 vc _ _
  1398. 8 winkelen winkel V V intrans|inf 11 cnj _ _
  1399. 9 , , Punc Punc komma 8 punct _ _
  1400. 10 zwemmen zwem V V intrans|inf 11 cnj _ _
  1401. 11 of of Conj Conj neven 7 vc _ _
  1402. 12 terrassen terras N N soort|mv|neut 11 cnj _ _
  1403. 13 . . Punc Punc punt 12 punct _ _
  1404. """
  1405. )
  1406. tree = dg.tree()
  1407. print(tree.pprint())
  1408. def demo():
  1409. cfg_demo()
  1410. pcfg_demo()
  1411. fcfg_demo()
  1412. dg_demo()
  1413. sdg_demo()
  1414. if __name__ == "__main__":
  1415. demo()
  1416. __all__ = [
  1417. "Nonterminal",
  1418. "nonterminals",
  1419. "CFG",
  1420. "Production",
  1421. "PCFG",
  1422. "ProbabilisticProduction",
  1423. "DependencyGrammar",
  1424. "DependencyProduction",
  1425. "ProbabilisticDependencyGrammar",
  1426. "induce_pcfg",
  1427. "read_grammar",
  1428. ]