chart.py 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481
  1. # Natural Language Toolkit: Combinatory Categorial Grammar
  2. #
  3. # Copyright (C) 2001-2020 NLTK Project
  4. # Author: Graeme Gange <ggange@csse.unimelb.edu.au>
  5. # URL: <http://nltk.org/>
  6. # For license information, see LICENSE.TXT
  7. """
  8. The lexicon is constructed by calling
  9. ``lexicon.fromstring(<lexicon string>)``.
  10. In order to construct a parser, you also need a rule set.
  11. The standard English rules are provided in chart as
  12. ``chart.DefaultRuleSet``.
  13. The parser can then be constructed by calling, for example:
  14. ``parser = chart.CCGChartParser(<lexicon>, <ruleset>)``
  15. Parsing is then performed by running
  16. ``parser.parse(<sentence>.split())``.
  17. While this returns a list of trees, the default representation
  18. of the produced trees is not very enlightening, particularly
  19. given that it uses the same tree class as the CFG parsers.
  20. It is probably better to call:
  21. ``chart.printCCGDerivation(<parse tree extracted from list>)``
  22. which should print a nice representation of the derivation.
  23. This entire process is shown far more clearly in the demonstration:
  24. python chart.py
  25. """
  26. import itertools
  27. from nltk.parse import ParserI
  28. from nltk.parse.chart import AbstractChartRule, EdgeI, Chart
  29. from nltk.tree import Tree
  30. from nltk.ccg.lexicon import fromstring, Token
  31. from nltk.ccg.combinator import (
  32. ForwardT,
  33. BackwardT,
  34. ForwardApplication,
  35. BackwardApplication,
  36. ForwardComposition,
  37. BackwardComposition,
  38. ForwardSubstitution,
  39. BackwardBx,
  40. BackwardSx,
  41. )
  42. from nltk.ccg.combinator import *
  43. from nltk.ccg.logic import *
  44. from nltk.sem.logic import *
  45. # Based on the EdgeI class from NLTK.
  46. # A number of the properties of the EdgeI interface don't
  47. # transfer well to CCGs, however.
  48. class CCGEdge(EdgeI):
  49. def __init__(self, span, categ, rule):
  50. self._span = span
  51. self._categ = categ
  52. self._rule = rule
  53. self._comparison_key = (span, categ, rule)
  54. # Accessors
  55. def lhs(self):
  56. return self._categ
  57. def span(self):
  58. return self._span
  59. def start(self):
  60. return self._span[0]
  61. def end(self):
  62. return self._span[1]
  63. def length(self):
  64. return self._span[1] - self.span[0]
  65. def rhs(self):
  66. return ()
  67. def dot(self):
  68. return 0
  69. def is_complete(self):
  70. return True
  71. def is_incomplete(self):
  72. return False
  73. def nextsym(self):
  74. return None
  75. def categ(self):
  76. return self._categ
  77. def rule(self):
  78. return self._rule
  79. class CCGLeafEdge(EdgeI):
  80. """
  81. Class representing leaf edges in a CCG derivation.
  82. """
  83. def __init__(self, pos, token, leaf):
  84. self._pos = pos
  85. self._token = token
  86. self._leaf = leaf
  87. self._comparison_key = (pos, token.categ(), leaf)
  88. # Accessors
  89. def lhs(self):
  90. return self._token.categ()
  91. def span(self):
  92. return (self._pos, self._pos + 1)
  93. def start(self):
  94. return self._pos
  95. def end(self):
  96. return self._pos + 1
  97. def length(self):
  98. return 1
  99. def rhs(self):
  100. return self._leaf
  101. def dot(self):
  102. return 0
  103. def is_complete(self):
  104. return True
  105. def is_incomplete(self):
  106. return False
  107. def nextsym(self):
  108. return None
  109. def token(self):
  110. return self._token
  111. def categ(self):
  112. return self._token.categ()
  113. def leaf(self):
  114. return self._leaf
  115. class BinaryCombinatorRule(AbstractChartRule):
  116. """
  117. Class implementing application of a binary combinator to a chart.
  118. Takes the directed combinator to apply.
  119. """
  120. NUMEDGES = 2
  121. def __init__(self, combinator):
  122. self._combinator = combinator
  123. # Apply a combinator
  124. def apply(self, chart, grammar, left_edge, right_edge):
  125. # The left & right edges must be touching.
  126. if not (left_edge.end() == right_edge.start()):
  127. return
  128. # Check if the two edges are permitted to combine.
  129. # If so, generate the corresponding edge.
  130. if self._combinator.can_combine(left_edge.categ(), right_edge.categ()):
  131. for res in self._combinator.combine(left_edge.categ(), right_edge.categ()):
  132. new_edge = CCGEdge(
  133. span=(left_edge.start(), right_edge.end()),
  134. categ=res,
  135. rule=self._combinator,
  136. )
  137. if chart.insert(new_edge, (left_edge, right_edge)):
  138. yield new_edge
  139. # The representation of the combinator (for printing derivations)
  140. def __str__(self):
  141. return "%s" % self._combinator
  142. # Type-raising must be handled slightly differently to the other rules, as the
  143. # resulting rules only span a single edge, rather than both edges.
  144. class ForwardTypeRaiseRule(AbstractChartRule):
  145. """
  146. Class for applying forward type raising
  147. """
  148. NUMEDGES = 2
  149. def __init__(self):
  150. self._combinator = ForwardT
  151. def apply(self, chart, grammar, left_edge, right_edge):
  152. if not (left_edge.end() == right_edge.start()):
  153. return
  154. for res in self._combinator.combine(left_edge.categ(), right_edge.categ()):
  155. new_edge = CCGEdge(span=left_edge.span(), categ=res, rule=self._combinator)
  156. if chart.insert(new_edge, (left_edge,)):
  157. yield new_edge
  158. def __str__(self):
  159. return "%s" % self._combinator
  160. class BackwardTypeRaiseRule(AbstractChartRule):
  161. """
  162. Class for applying backward type raising.
  163. """
  164. NUMEDGES = 2
  165. def __init__(self):
  166. self._combinator = BackwardT
  167. def apply(self, chart, grammar, left_edge, right_edge):
  168. if not (left_edge.end() == right_edge.start()):
  169. return
  170. for res in self._combinator.combine(left_edge.categ(), right_edge.categ()):
  171. new_edge = CCGEdge(span=right_edge.span(), categ=res, rule=self._combinator)
  172. if chart.insert(new_edge, (right_edge,)):
  173. yield new_edge
  174. def __str__(self):
  175. return "%s" % self._combinator
  176. # Common sets of combinators used for English derivations.
  177. ApplicationRuleSet = [
  178. BinaryCombinatorRule(ForwardApplication),
  179. BinaryCombinatorRule(BackwardApplication),
  180. ]
  181. CompositionRuleSet = [
  182. BinaryCombinatorRule(ForwardComposition),
  183. BinaryCombinatorRule(BackwardComposition),
  184. BinaryCombinatorRule(BackwardBx),
  185. ]
  186. SubstitutionRuleSet = [
  187. BinaryCombinatorRule(ForwardSubstitution),
  188. BinaryCombinatorRule(BackwardSx),
  189. ]
  190. TypeRaiseRuleSet = [ForwardTypeRaiseRule(), BackwardTypeRaiseRule()]
  191. # The standard English rule set.
  192. DefaultRuleSet = (
  193. ApplicationRuleSet + CompositionRuleSet + SubstitutionRuleSet + TypeRaiseRuleSet
  194. )
  195. class CCGChartParser(ParserI):
  196. """
  197. Chart parser for CCGs.
  198. Based largely on the ChartParser class from NLTK.
  199. """
  200. def __init__(self, lexicon, rules, trace=0):
  201. self._lexicon = lexicon
  202. self._rules = rules
  203. self._trace = trace
  204. def lexicon(self):
  205. return self._lexicon
  206. # Implements the CYK algorithm
  207. def parse(self, tokens):
  208. tokens = list(tokens)
  209. chart = CCGChart(list(tokens))
  210. lex = self._lexicon
  211. # Initialize leaf edges.
  212. for index in range(chart.num_leaves()):
  213. for token in lex.categories(chart.leaf(index)):
  214. new_edge = CCGLeafEdge(index, token, chart.leaf(index))
  215. chart.insert(new_edge, ())
  216. # Select a span for the new edges
  217. for span in range(2, chart.num_leaves() + 1):
  218. for start in range(0, chart.num_leaves() - span + 1):
  219. # Try all possible pairs of edges that could generate
  220. # an edge for that span
  221. for part in range(1, span):
  222. lstart = start
  223. mid = start + part
  224. rend = start + span
  225. for left in chart.select(span=(lstart, mid)):
  226. for right in chart.select(span=(mid, rend)):
  227. # Generate all possible combinations of the two edges
  228. for rule in self._rules:
  229. edges_added_by_rule = 0
  230. for newedge in rule.apply(chart, lex, left, right):
  231. edges_added_by_rule += 1
  232. # Output the resulting parses
  233. return chart.parses(lex.start())
  234. class CCGChart(Chart):
  235. def __init__(self, tokens):
  236. Chart.__init__(self, tokens)
  237. # Constructs the trees for a given parse. Unfortnunately, the parse trees need to be
  238. # constructed slightly differently to those in the default Chart class, so it has to
  239. # be reimplemented
  240. def _trees(self, edge, complete, memo, tree_class):
  241. assert complete, "CCGChart cannot build incomplete trees"
  242. if edge in memo:
  243. return memo[edge]
  244. if isinstance(edge, CCGLeafEdge):
  245. word = tree_class(edge.token(), [self._tokens[edge.start()]])
  246. leaf = tree_class((edge.token(), "Leaf"), [word])
  247. memo[edge] = [leaf]
  248. return [leaf]
  249. memo[edge] = []
  250. trees = []
  251. for cpl in self.child_pointer_lists(edge):
  252. child_choices = [self._trees(cp, complete, memo, tree_class) for cp in cpl]
  253. for children in itertools.product(*child_choices):
  254. lhs = (
  255. Token(
  256. self._tokens[edge.start() : edge.end()],
  257. edge.lhs(),
  258. compute_semantics(children, edge),
  259. ),
  260. str(edge.rule()),
  261. )
  262. trees.append(tree_class(lhs, children))
  263. memo[edge] = trees
  264. return trees
  265. def compute_semantics(children, edge):
  266. if children[0].label()[0].semantics() is None:
  267. return None
  268. if len(children) == 2:
  269. if isinstance(edge.rule(), BackwardCombinator):
  270. children = [children[1], children[0]]
  271. combinator = edge.rule()._combinator
  272. function = children[0].label()[0].semantics()
  273. argument = children[1].label()[0].semantics()
  274. if isinstance(combinator, UndirectedFunctionApplication):
  275. return compute_function_semantics(function, argument)
  276. elif isinstance(combinator, UndirectedComposition):
  277. return compute_composition_semantics(function, argument)
  278. elif isinstance(combinator, UndirectedSubstitution):
  279. return compute_substitution_semantics(function, argument)
  280. else:
  281. raise AssertionError("Unsupported combinator '" + combinator + "'")
  282. else:
  283. return compute_type_raised_semantics(children[0].label()[0].semantics())
  284. # --------
  285. # Displaying derivations
  286. # --------
  287. def printCCGDerivation(tree):
  288. # Get the leaves and initial categories
  289. leafcats = tree.pos()
  290. leafstr = ""
  291. catstr = ""
  292. # Construct a string with both the leaf word and corresponding
  293. # category aligned.
  294. for (leaf, cat) in leafcats:
  295. str_cat = "%s" % cat
  296. nextlen = 2 + max(len(leaf), len(str_cat))
  297. lcatlen = (nextlen - len(str_cat)) // 2
  298. rcatlen = lcatlen + (nextlen - len(str_cat)) % 2
  299. catstr += " " * lcatlen + str_cat + " " * rcatlen
  300. lleaflen = (nextlen - len(leaf)) // 2
  301. rleaflen = lleaflen + (nextlen - len(leaf)) % 2
  302. leafstr += " " * lleaflen + leaf + " " * rleaflen
  303. print(leafstr.rstrip())
  304. print(catstr.rstrip())
  305. # Display the derivation steps
  306. printCCGTree(0, tree)
  307. # Prints the sequence of derivation steps.
  308. def printCCGTree(lwidth, tree):
  309. rwidth = lwidth
  310. # Is a leaf (word).
  311. # Increment the span by the space occupied by the leaf.
  312. if not isinstance(tree, Tree):
  313. return 2 + lwidth + len(tree)
  314. # Find the width of the current derivation step
  315. for child in tree:
  316. rwidth = max(rwidth, printCCGTree(rwidth, child))
  317. # Is a leaf node.
  318. # Don't print anything, but account for the space occupied.
  319. if not isinstance(tree.label(), tuple):
  320. return max(
  321. rwidth, 2 + lwidth + len("%s" % tree.label()), 2 + lwidth + len(tree[0])
  322. )
  323. (token, op) = tree.label()
  324. if op == "Leaf":
  325. return rwidth
  326. # Pad to the left with spaces, followed by a sequence of '-'
  327. # and the derivation rule.
  328. print(lwidth * " " + (rwidth - lwidth) * "-" + "%s" % op)
  329. # Print the resulting category on a new line.
  330. str_res = "%s" % (token.categ())
  331. if token.semantics() is not None:
  332. str_res += " {" + str(token.semantics()) + "}"
  333. respadlen = (rwidth - lwidth - len(str_res)) // 2 + lwidth
  334. print(respadlen * " " + str_res)
  335. return rwidth
  336. ### Demonstration code
  337. # Construct the lexicon
  338. lex = fromstring(
  339. """
  340. :- S, NP, N, VP # Primitive categories, S is the target primitive
  341. Det :: NP/N # Family of words
  342. Pro :: NP
  343. TV :: VP/NP
  344. Modal :: (S\\NP)/VP # Backslashes need to be escaped
  345. I => Pro # Word -> Category mapping
  346. you => Pro
  347. the => Det
  348. # Variables have the special keyword 'var'
  349. # '.' prevents permutation
  350. # ',' prevents composition
  351. and => var\\.,var/.,var
  352. which => (N\\N)/(S/NP)
  353. will => Modal # Categories can be either explicit, or families.
  354. might => Modal
  355. cook => TV
  356. eat => TV
  357. mushrooms => N
  358. parsnips => N
  359. bacon => N
  360. """
  361. )
  362. def demo():
  363. parser = CCGChartParser(lex, DefaultRuleSet)
  364. for parse in parser.parse("I might cook and eat the bacon".split()):
  365. printCCGDerivation(parse)
  366. if __name__ == "__main__":
  367. demo()