glue.py 29 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832
  1. # Natural Language Toolkit: Glue Semantics
  2. #
  3. # Author: Dan Garrette <dhgarrette@gmail.com>
  4. #
  5. # Copyright (C) 2001-2020 NLTK Project
  6. # URL: <http://nltk.org/>
  7. # For license information, see LICENSE.TXT
  8. import os
  9. from itertools import chain
  10. import nltk
  11. from nltk.internals import Counter
  12. from nltk.tag import UnigramTagger, BigramTagger, TrigramTagger, RegexpTagger
  13. from nltk.sem.logic import (
  14. Expression,
  15. Variable,
  16. VariableExpression,
  17. LambdaExpression,
  18. AbstractVariableExpression,
  19. )
  20. from nltk.sem import drt
  21. from nltk.sem import linearlogic
  22. SPEC_SEMTYPES = {
  23. "a": "ex_quant",
  24. "an": "ex_quant",
  25. "every": "univ_quant",
  26. "the": "def_art",
  27. "no": "no_quant",
  28. "default": "ex_quant",
  29. }
  30. OPTIONAL_RELATIONSHIPS = ["nmod", "vmod", "punct"]
  31. class GlueFormula(object):
  32. def __init__(self, meaning, glue, indices=None):
  33. if not indices:
  34. indices = set()
  35. if isinstance(meaning, str):
  36. self.meaning = Expression.fromstring(meaning)
  37. elif isinstance(meaning, Expression):
  38. self.meaning = meaning
  39. else:
  40. raise RuntimeError(
  41. "Meaning term neither string or expression: %s, %s"
  42. % (meaning, meaning.__class__)
  43. )
  44. if isinstance(glue, str):
  45. self.glue = linearlogic.LinearLogicParser().parse(glue)
  46. elif isinstance(glue, linearlogic.Expression):
  47. self.glue = glue
  48. else:
  49. raise RuntimeError(
  50. "Glue term neither string or expression: %s, %s"
  51. % (glue, glue.__class__)
  52. )
  53. self.indices = indices
  54. def applyto(self, arg):
  55. """ self = (\\x.(walk x), (subj -o f))
  56. arg = (john , subj)
  57. returns ((walk john), f)
  58. """
  59. if self.indices & arg.indices: # if the sets are NOT disjoint
  60. raise linearlogic.LinearLogicApplicationException(
  61. "'%s' applied to '%s'. Indices are not disjoint." % (self, arg)
  62. )
  63. else: # if the sets ARE disjoint
  64. return_indices = self.indices | arg.indices
  65. try:
  66. return_glue = linearlogic.ApplicationExpression(
  67. self.glue, arg.glue, arg.indices
  68. )
  69. except linearlogic.LinearLogicApplicationException:
  70. raise linearlogic.LinearLogicApplicationException(
  71. "'%s' applied to '%s'" % (self.simplify(), arg.simplify())
  72. )
  73. arg_meaning_abstracted = arg.meaning
  74. if return_indices:
  75. for dep in self.glue.simplify().antecedent.dependencies[
  76. ::-1
  77. ]: # if self.glue is (A -o B), dep is in A.dependencies
  78. arg_meaning_abstracted = self.make_LambdaExpression(
  79. Variable("v%s" % dep), arg_meaning_abstracted
  80. )
  81. return_meaning = self.meaning.applyto(arg_meaning_abstracted)
  82. return self.__class__(return_meaning, return_glue, return_indices)
  83. def make_VariableExpression(self, name):
  84. return VariableExpression(name)
  85. def make_LambdaExpression(self, variable, term):
  86. return LambdaExpression(variable, term)
  87. def lambda_abstract(self, other):
  88. assert isinstance(other, GlueFormula)
  89. assert isinstance(other.meaning, AbstractVariableExpression)
  90. return self.__class__(
  91. self.make_LambdaExpression(other.meaning.variable, self.meaning),
  92. linearlogic.ImpExpression(other.glue, self.glue),
  93. )
  94. def compile(self, counter=None):
  95. """From Iddo Lev's PhD Dissertation p108-109"""
  96. if not counter:
  97. counter = Counter()
  98. (compiled_glue, new_forms) = self.glue.simplify().compile_pos(
  99. counter, self.__class__
  100. )
  101. return new_forms + [
  102. self.__class__(self.meaning, compiled_glue, set([counter.get()]))
  103. ]
  104. def simplify(self):
  105. return self.__class__(
  106. self.meaning.simplify(), self.glue.simplify(), self.indices
  107. )
  108. def __eq__(self, other):
  109. return (
  110. self.__class__ == other.__class__
  111. and self.meaning == other.meaning
  112. and self.glue == other.glue
  113. )
  114. def __ne__(self, other):
  115. return not self == other
  116. # sorting for use in doctests which must be deterministic
  117. def __lt__(self, other):
  118. return str(self) < str(other)
  119. def __str__(self):
  120. assert isinstance(self.indices, set)
  121. accum = "%s : %s" % (self.meaning, self.glue)
  122. if self.indices:
  123. accum += " : {" + ", ".join(str(index) for index in self.indices) + "}"
  124. return accum
  125. def __repr__(self):
  126. return "%s" % self
  127. class GlueDict(dict):
  128. def __init__(self, filename, encoding=None):
  129. self.filename = filename
  130. self.file_encoding = encoding
  131. self.read_file()
  132. def read_file(self, empty_first=True):
  133. if empty_first:
  134. self.clear()
  135. try:
  136. contents = nltk.data.load(
  137. self.filename, format="text", encoding=self.file_encoding
  138. )
  139. # TODO: the above can't handle zip files, but this should anyway be fixed in nltk.data.load()
  140. except LookupError as e:
  141. try:
  142. contents = nltk.data.load(
  143. "file:" + self.filename, format="text", encoding=self.file_encoding
  144. )
  145. except LookupError:
  146. raise e
  147. lines = contents.splitlines()
  148. for line in lines: # example: 'n : (\\x.(<word> x), (v-or))'
  149. # lambdacalc -^ linear logic -^
  150. line = line.strip() # remove trailing newline
  151. if not len(line):
  152. continue # skip empty lines
  153. if line[0] == "#":
  154. continue # skip commented out lines
  155. parts = line.split(
  156. " : ", 2
  157. ) # ['verb', '(\\x.(<word> x), ( subj -o f ))', '[subj]']
  158. glue_formulas = []
  159. paren_count = 0
  160. tuple_start = 0
  161. tuple_comma = 0
  162. relationships = None
  163. if len(parts) > 1:
  164. for (i, c) in enumerate(parts[1]):
  165. if c == "(":
  166. if paren_count == 0: # if it's the first '(' of a tuple
  167. tuple_start = i + 1 # then save the index
  168. paren_count += 1
  169. elif c == ")":
  170. paren_count -= 1
  171. if paren_count == 0: # if it's the last ')' of a tuple
  172. meaning_term = parts[1][
  173. tuple_start:tuple_comma
  174. ] # '\\x.(<word> x)'
  175. glue_term = parts[1][tuple_comma + 1 : i] # '(v-r)'
  176. glue_formulas.append(
  177. [meaning_term, glue_term]
  178. ) # add the GlueFormula to the list
  179. elif c == ",":
  180. if (
  181. paren_count == 1
  182. ): # if it's a comma separating the parts of the tuple
  183. tuple_comma = i # then save the index
  184. elif c == "#": # skip comments at the ends of lines
  185. if (
  186. paren_count != 0
  187. ): # if the line hasn't parsed correctly so far
  188. raise RuntimeError(
  189. "Formula syntax is incorrect for entry " + line
  190. )
  191. break # break to the next line
  192. if len(parts) > 2: # if there is a relationship entry at the end
  193. rel_start = parts[2].index("[") + 1
  194. rel_end = parts[2].index("]")
  195. if rel_start == rel_end:
  196. relationships = frozenset()
  197. else:
  198. relationships = frozenset(
  199. r.strip() for r in parts[2][rel_start:rel_end].split(",")
  200. )
  201. try:
  202. start_inheritance = parts[0].index("(")
  203. end_inheritance = parts[0].index(")")
  204. sem = parts[0][:start_inheritance].strip()
  205. supertype = parts[0][start_inheritance + 1 : end_inheritance]
  206. except:
  207. sem = parts[0].strip()
  208. supertype = None
  209. if sem not in self:
  210. self[sem] = {}
  211. if (
  212. relationships is None
  213. ): # if not specified for a specific relationship set
  214. # add all relationship entries for parents
  215. if supertype:
  216. for rels in self[supertype]:
  217. if rels not in self[sem]:
  218. self[sem][rels] = []
  219. glue = self[supertype][rels]
  220. self[sem][rels].extend(glue)
  221. self[sem][rels].extend(
  222. glue_formulas
  223. ) # add the glue formulas to every rel entry
  224. else:
  225. if None not in self[sem]:
  226. self[sem][None] = []
  227. self[sem][None].extend(
  228. glue_formulas
  229. ) # add the glue formulas to every rel entry
  230. else:
  231. if relationships not in self[sem]:
  232. self[sem][relationships] = []
  233. if supertype:
  234. self[sem][relationships].extend(self[supertype][relationships])
  235. self[sem][relationships].extend(
  236. glue_formulas
  237. ) # add the glue entry to the dictionary
  238. def __str__(self):
  239. accum = ""
  240. for pos in self:
  241. str_pos = "%s" % pos
  242. for relset in self[pos]:
  243. i = 1
  244. for gf in self[pos][relset]:
  245. if i == 1:
  246. accum += str_pos + ": "
  247. else:
  248. accum += " " * (len(str_pos) + 2)
  249. accum += "%s" % gf
  250. if relset and i == len(self[pos][relset]):
  251. accum += " : %s" % relset
  252. accum += "\n"
  253. i += 1
  254. return accum
  255. def to_glueformula_list(self, depgraph, node=None, counter=None, verbose=False):
  256. if node is None:
  257. # TODO: should it be depgraph.root? Is this code tested?
  258. top = depgraph.nodes[0]
  259. depList = list(chain(*top["deps"].values()))
  260. root = depgraph.nodes[depList[0]]
  261. return self.to_glueformula_list(depgraph, root, Counter(), verbose)
  262. glueformulas = self.lookup(node, depgraph, counter)
  263. for dep_idx in chain(*node["deps"].values()):
  264. dep = depgraph.nodes[dep_idx]
  265. glueformulas.extend(
  266. self.to_glueformula_list(depgraph, dep, counter, verbose)
  267. )
  268. return glueformulas
  269. def lookup(self, node, depgraph, counter):
  270. semtype_names = self.get_semtypes(node)
  271. semtype = None
  272. for name in semtype_names:
  273. if name in self:
  274. semtype = self[name]
  275. break
  276. if semtype is None:
  277. # raise KeyError, "There is no GlueDict entry for sem type '%s' (for '%s')" % (sem, word)
  278. return []
  279. self.add_missing_dependencies(node, depgraph)
  280. lookup = self._lookup_semtype_option(semtype, node, depgraph)
  281. if not len(lookup):
  282. raise KeyError(
  283. "There is no GlueDict entry for sem type of '%s' "
  284. "with tag '%s', and rel '%s'" % (node["word"], node["tag"], node["rel"])
  285. )
  286. return self.get_glueformulas_from_semtype_entry(
  287. lookup, node["word"], node, depgraph, counter
  288. )
  289. def add_missing_dependencies(self, node, depgraph):
  290. rel = node["rel"].lower()
  291. if rel == "main":
  292. headnode = depgraph.nodes[node["head"]]
  293. subj = self.lookup_unique("subj", headnode, depgraph)
  294. relation = subj["rel"]
  295. node["deps"].setdefault(relation, [])
  296. node["deps"][relation].append(subj["address"])
  297. # node['deps'].append(subj['address'])
  298. def _lookup_semtype_option(self, semtype, node, depgraph):
  299. relationships = frozenset(
  300. depgraph.nodes[dep]["rel"].lower()
  301. for dep in chain(*node["deps"].values())
  302. if depgraph.nodes[dep]["rel"].lower() not in OPTIONAL_RELATIONSHIPS
  303. )
  304. try:
  305. lookup = semtype[relationships]
  306. except KeyError:
  307. # An exact match is not found, so find the best match where
  308. # 'best' is defined as the glue entry whose relationship set has the
  309. # most relations of any possible relationship set that is a subset
  310. # of the actual depgraph
  311. best_match = frozenset()
  312. for relset_option in set(semtype) - set([None]):
  313. if (
  314. len(relset_option) > len(best_match)
  315. and relset_option < relationships
  316. ):
  317. best_match = relset_option
  318. if not best_match:
  319. if None in semtype:
  320. best_match = None
  321. else:
  322. return None
  323. lookup = semtype[best_match]
  324. return lookup
  325. def get_semtypes(self, node):
  326. """
  327. Based on the node, return a list of plausible semtypes in order of
  328. plausibility.
  329. """
  330. rel = node["rel"].lower()
  331. word = node["word"].lower()
  332. if rel == "spec":
  333. if word in SPEC_SEMTYPES:
  334. return [SPEC_SEMTYPES[word]]
  335. else:
  336. return [SPEC_SEMTYPES["default"]]
  337. elif rel in ["nmod", "vmod"]:
  338. return [node["tag"], rel]
  339. else:
  340. return [node["tag"]]
  341. def get_glueformulas_from_semtype_entry(
  342. self, lookup, word, node, depgraph, counter
  343. ):
  344. glueformulas = []
  345. glueFormulaFactory = self.get_GlueFormula_factory()
  346. for meaning, glue in lookup:
  347. gf = glueFormulaFactory(self.get_meaning_formula(meaning, word), glue)
  348. if not len(glueformulas):
  349. gf.word = word
  350. else:
  351. gf.word = "%s%s" % (word, len(glueformulas) + 1)
  352. gf.glue = self.initialize_labels(gf.glue, node, depgraph, counter.get())
  353. glueformulas.append(gf)
  354. return glueformulas
  355. def get_meaning_formula(self, generic, word):
  356. """
  357. :param generic: A meaning formula string containing the
  358. parameter "<word>"
  359. :param word: The actual word to be replace "<word>"
  360. """
  361. word = word.replace(".", "")
  362. return generic.replace("<word>", word)
  363. def initialize_labels(self, expr, node, depgraph, unique_index):
  364. if isinstance(expr, linearlogic.AtomicExpression):
  365. name = self.find_label_name(expr.name.lower(), node, depgraph, unique_index)
  366. if name[0].isupper():
  367. return linearlogic.VariableExpression(name)
  368. else:
  369. return linearlogic.ConstantExpression(name)
  370. else:
  371. return linearlogic.ImpExpression(
  372. self.initialize_labels(expr.antecedent, node, depgraph, unique_index),
  373. self.initialize_labels(expr.consequent, node, depgraph, unique_index),
  374. )
  375. def find_label_name(self, name, node, depgraph, unique_index):
  376. try:
  377. dot = name.index(".")
  378. before_dot = name[:dot]
  379. after_dot = name[dot + 1 :]
  380. if before_dot == "super":
  381. return self.find_label_name(
  382. after_dot, depgraph.nodes[node["head"]], depgraph, unique_index
  383. )
  384. else:
  385. return self.find_label_name(
  386. after_dot,
  387. self.lookup_unique(before_dot, node, depgraph),
  388. depgraph,
  389. unique_index,
  390. )
  391. except ValueError:
  392. lbl = self.get_label(node)
  393. if name == "f":
  394. return lbl
  395. elif name == "v":
  396. return "%sv" % lbl
  397. elif name == "r":
  398. return "%sr" % lbl
  399. elif name == "super":
  400. return self.get_label(depgraph.nodes[node["head"]])
  401. elif name == "var":
  402. return "%s%s" % (lbl.upper(), unique_index)
  403. elif name == "a":
  404. return self.get_label(self.lookup_unique("conja", node, depgraph))
  405. elif name == "b":
  406. return self.get_label(self.lookup_unique("conjb", node, depgraph))
  407. else:
  408. return self.get_label(self.lookup_unique(name, node, depgraph))
  409. def get_label(self, node):
  410. """
  411. Pick an alphabetic character as identifier for an entity in the model.
  412. :param value: where to index into the list of characters
  413. :type value: int
  414. """
  415. value = node["address"]
  416. letter = [
  417. "f",
  418. "g",
  419. "h",
  420. "i",
  421. "j",
  422. "k",
  423. "l",
  424. "m",
  425. "n",
  426. "o",
  427. "p",
  428. "q",
  429. "r",
  430. "s",
  431. "t",
  432. "u",
  433. "v",
  434. "w",
  435. "x",
  436. "y",
  437. "z",
  438. "a",
  439. "b",
  440. "c",
  441. "d",
  442. "e",
  443. ][value - 1]
  444. num = int(value) // 26
  445. if num > 0:
  446. return letter + str(num)
  447. else:
  448. return letter
  449. def lookup_unique(self, rel, node, depgraph):
  450. """
  451. Lookup 'key'. There should be exactly one item in the associated relation.
  452. """
  453. deps = [
  454. depgraph.nodes[dep]
  455. for dep in chain(*node["deps"].values())
  456. if depgraph.nodes[dep]["rel"].lower() == rel.lower()
  457. ]
  458. if len(deps) == 0:
  459. raise KeyError("'%s' doesn't contain a feature '%s'" % (node["word"], rel))
  460. elif len(deps) > 1:
  461. raise KeyError(
  462. "'%s' should only have one feature '%s'" % (node["word"], rel)
  463. )
  464. else:
  465. return deps[0]
  466. def get_GlueFormula_factory(self):
  467. return GlueFormula
  468. class Glue(object):
  469. def __init__(
  470. self, semtype_file=None, remove_duplicates=False, depparser=None, verbose=False
  471. ):
  472. self.verbose = verbose
  473. self.remove_duplicates = remove_duplicates
  474. self.depparser = depparser
  475. from nltk import Prover9
  476. self.prover = Prover9()
  477. if semtype_file:
  478. self.semtype_file = semtype_file
  479. else:
  480. self.semtype_file = os.path.join(
  481. "grammars", "sample_grammars", "glue.semtype"
  482. )
  483. def train_depparser(self, depgraphs=None):
  484. if depgraphs:
  485. self.depparser.train(depgraphs)
  486. else:
  487. self.depparser.train_from_file(
  488. nltk.data.find(
  489. os.path.join("grammars", "sample_grammars", "glue_train.conll")
  490. )
  491. )
  492. def parse_to_meaning(self, sentence):
  493. readings = []
  494. for agenda in self.parse_to_compiled(sentence):
  495. readings.extend(self.get_readings(agenda))
  496. return readings
  497. def get_readings(self, agenda):
  498. readings = []
  499. agenda_length = len(agenda)
  500. atomics = dict()
  501. nonatomics = dict()
  502. while agenda: # is not empty
  503. cur = agenda.pop()
  504. glue_simp = cur.glue.simplify()
  505. if isinstance(
  506. glue_simp, linearlogic.ImpExpression
  507. ): # if cur.glue is non-atomic
  508. for key in atomics:
  509. try:
  510. if isinstance(cur.glue, linearlogic.ApplicationExpression):
  511. bindings = cur.glue.bindings
  512. else:
  513. bindings = linearlogic.BindingDict()
  514. glue_simp.antecedent.unify(key, bindings)
  515. for atomic in atomics[key]:
  516. if not (
  517. cur.indices & atomic.indices
  518. ): # if the sets of indices are disjoint
  519. try:
  520. agenda.append(cur.applyto(atomic))
  521. except linearlogic.LinearLogicApplicationException:
  522. pass
  523. except linearlogic.UnificationException:
  524. pass
  525. try:
  526. nonatomics[glue_simp.antecedent].append(cur)
  527. except KeyError:
  528. nonatomics[glue_simp.antecedent] = [cur]
  529. else: # else cur.glue is atomic
  530. for key in nonatomics:
  531. for nonatomic in nonatomics[key]:
  532. try:
  533. if isinstance(
  534. nonatomic.glue, linearlogic.ApplicationExpression
  535. ):
  536. bindings = nonatomic.glue.bindings
  537. else:
  538. bindings = linearlogic.BindingDict()
  539. glue_simp.unify(key, bindings)
  540. if not (
  541. cur.indices & nonatomic.indices
  542. ): # if the sets of indices are disjoint
  543. try:
  544. agenda.append(nonatomic.applyto(cur))
  545. except linearlogic.LinearLogicApplicationException:
  546. pass
  547. except linearlogic.UnificationException:
  548. pass
  549. try:
  550. atomics[glue_simp].append(cur)
  551. except KeyError:
  552. atomics[glue_simp] = [cur]
  553. for entry in atomics:
  554. for gf in atomics[entry]:
  555. if len(gf.indices) == agenda_length:
  556. self._add_to_reading_list(gf, readings)
  557. for entry in nonatomics:
  558. for gf in nonatomics[entry]:
  559. if len(gf.indices) == agenda_length:
  560. self._add_to_reading_list(gf, readings)
  561. return readings
  562. def _add_to_reading_list(self, glueformula, reading_list):
  563. add_reading = True
  564. if self.remove_duplicates:
  565. for reading in reading_list:
  566. try:
  567. if reading.equiv(glueformula.meaning, self.prover):
  568. add_reading = False
  569. break
  570. except Exception as e:
  571. # if there is an exception, the syntax of the formula
  572. # may not be understandable by the prover, so don't
  573. # throw out the reading.
  574. print("Error when checking logical equality of statements", e)
  575. if add_reading:
  576. reading_list.append(glueformula.meaning)
  577. def parse_to_compiled(self, sentence):
  578. gfls = [self.depgraph_to_glue(dg) for dg in self.dep_parse(sentence)]
  579. return [self.gfl_to_compiled(gfl) for gfl in gfls]
  580. def dep_parse(self, sentence):
  581. """
  582. Return a dependency graph for the sentence.
  583. :param sentence: the sentence to be parsed
  584. :type sentence: list(str)
  585. :rtype: DependencyGraph
  586. """
  587. # Lazy-initialize the depparser
  588. if self.depparser is None:
  589. from nltk.parse import MaltParser
  590. self.depparser = MaltParser(tagger=self.get_pos_tagger())
  591. if not self.depparser._trained:
  592. self.train_depparser()
  593. return self.depparser.parse(sentence, verbose=self.verbose)
  594. def depgraph_to_glue(self, depgraph):
  595. return self.get_glue_dict().to_glueformula_list(depgraph)
  596. def get_glue_dict(self):
  597. return GlueDict(self.semtype_file)
  598. def gfl_to_compiled(self, gfl):
  599. index_counter = Counter()
  600. return_list = []
  601. for gf in gfl:
  602. return_list.extend(gf.compile(index_counter))
  603. if self.verbose:
  604. print("Compiled Glue Premises:")
  605. for cgf in return_list:
  606. print(cgf)
  607. return return_list
  608. def get_pos_tagger(self):
  609. from nltk.corpus import brown
  610. regexp_tagger = RegexpTagger(
  611. [
  612. (r"^-?[0-9]+(.[0-9]+)?$", "CD"), # cardinal numbers
  613. (r"(The|the|A|a|An|an)$", "AT"), # articles
  614. (r".*able$", "JJ"), # adjectives
  615. (r".*ness$", "NN"), # nouns formed from adjectives
  616. (r".*ly$", "RB"), # adverbs
  617. (r".*s$", "NNS"), # plural nouns
  618. (r".*ing$", "VBG"), # gerunds
  619. (r".*ed$", "VBD"), # past tense verbs
  620. (r".*", "NN"), # nouns (default)
  621. ]
  622. )
  623. brown_train = brown.tagged_sents(categories="news")
  624. unigram_tagger = UnigramTagger(brown_train, backoff=regexp_tagger)
  625. bigram_tagger = BigramTagger(brown_train, backoff=unigram_tagger)
  626. trigram_tagger = TrigramTagger(brown_train, backoff=bigram_tagger)
  627. # Override particular words
  628. main_tagger = RegexpTagger(
  629. [(r"(A|a|An|an)$", "ex_quant"), (r"(Every|every|All|all)$", "univ_quant")],
  630. backoff=trigram_tagger,
  631. )
  632. return main_tagger
  633. class DrtGlueFormula(GlueFormula):
  634. def __init__(self, meaning, glue, indices=None):
  635. if not indices:
  636. indices = set()
  637. if isinstance(meaning, str):
  638. self.meaning = drt.DrtExpression.fromstring(meaning)
  639. elif isinstance(meaning, drt.DrtExpression):
  640. self.meaning = meaning
  641. else:
  642. raise RuntimeError(
  643. "Meaning term neither string or expression: %s, %s"
  644. % (meaning, meaning.__class__)
  645. )
  646. if isinstance(glue, str):
  647. self.glue = linearlogic.LinearLogicParser().parse(glue)
  648. elif isinstance(glue, linearlogic.Expression):
  649. self.glue = glue
  650. else:
  651. raise RuntimeError(
  652. "Glue term neither string or expression: %s, %s"
  653. % (glue, glue.__class__)
  654. )
  655. self.indices = indices
  656. def make_VariableExpression(self, name):
  657. return drt.DrtVariableExpression(name)
  658. def make_LambdaExpression(self, variable, term):
  659. return drt.DrtLambdaExpression(variable, term)
  660. class DrtGlueDict(GlueDict):
  661. def get_GlueFormula_factory(self):
  662. return DrtGlueFormula
  663. class DrtGlue(Glue):
  664. def __init__(
  665. self, semtype_file=None, remove_duplicates=False, depparser=None, verbose=False
  666. ):
  667. if not semtype_file:
  668. semtype_file = os.path.join(
  669. "grammars", "sample_grammars", "drt_glue.semtype"
  670. )
  671. Glue.__init__(self, semtype_file, remove_duplicates, depparser, verbose)
  672. def get_glue_dict(self):
  673. return DrtGlueDict(self.semtype_file)
  674. def demo(show_example=-1):
  675. from nltk.parse import MaltParser
  676. examples = [
  677. "David sees Mary",
  678. "David eats a sandwich",
  679. "every man chases a dog",
  680. "every man believes a dog sleeps",
  681. "John gives David a sandwich",
  682. "John chases himself",
  683. ]
  684. # 'John persuades David to order a pizza',
  685. # 'John tries to go',
  686. # 'John tries to find a unicorn',
  687. # 'John seems to vanish',
  688. # 'a unicorn seems to approach',
  689. # 'every big cat leaves',
  690. # 'every gray cat leaves',
  691. # 'every big gray cat leaves',
  692. # 'a former senator leaves',
  693. print("============== DEMO ==============")
  694. tagger = RegexpTagger(
  695. [
  696. ("^(David|Mary|John)$", "NNP"),
  697. (
  698. "^(sees|eats|chases|believes|gives|sleeps|chases|persuades|tries|seems|leaves)$",
  699. "VB",
  700. ),
  701. ("^(go|order|vanish|find|approach)$", "VB"),
  702. ("^(a)$", "ex_quant"),
  703. ("^(every)$", "univ_quant"),
  704. ("^(sandwich|man|dog|pizza|unicorn|cat|senator)$", "NN"),
  705. ("^(big|gray|former)$", "JJ"),
  706. ("^(him|himself)$", "PRP"),
  707. ]
  708. )
  709. depparser = MaltParser(tagger=tagger)
  710. glue = Glue(depparser=depparser, verbose=False)
  711. for (i, sentence) in enumerate(examples):
  712. if i == show_example or show_example == -1:
  713. print("[[[Example %s]]] %s" % (i, sentence))
  714. for reading in glue.parse_to_meaning(sentence.split()):
  715. print(reading.simplify())
  716. print("")
  717. if __name__ == "__main__":
  718. demo()