boxer.py 53 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556155715581559156015611562156315641565156615671568156915701571157215731574157515761577157815791580158115821583158415851586158715881589159015911592159315941595159615971598159916001601160216031604160516061607
  1. # Natural Language Toolkit: Interface to Boxer
  2. # <http://svn.ask.it.usyd.edu.au/trac/candc/wiki/boxer>
  3. #
  4. # Author: Dan Garrette <dhgarrette@gmail.com>
  5. #
  6. # Copyright (C) 2001-2020 NLTK Project
  7. # URL: <http://nltk.org/>
  8. # For license information, see LICENSE.TXT
  9. """
  10. An interface to Boxer.
  11. This interface relies on the latest version of the development (subversion) version of
  12. C&C and Boxer.
  13. Usage:
  14. Set the environment variable CANDC to the bin directory of your CandC installation.
  15. The models directory should be in the CandC root directory.
  16. For example:
  17. /path/to/candc/
  18. bin/
  19. candc
  20. boxer
  21. models/
  22. boxer/
  23. """
  24. import os
  25. import re
  26. import operator
  27. import subprocess
  28. from optparse import OptionParser
  29. import tempfile
  30. from functools import reduce
  31. from nltk.internals import find_binary
  32. from nltk.sem.logic import (
  33. ExpectedMoreTokensException,
  34. LogicalExpressionException,
  35. UnexpectedTokenException,
  36. Variable,
  37. )
  38. from nltk.sem.drt import (
  39. DRS,
  40. DrtApplicationExpression,
  41. DrtEqualityExpression,
  42. DrtNegatedExpression,
  43. DrtOrExpression,
  44. DrtParser,
  45. DrtProposition,
  46. DrtTokens,
  47. DrtVariableExpression,
  48. )
  49. class Boxer(object):
  50. """
  51. This class is an interface to Johan Bos's program Boxer, a wide-coverage
  52. semantic parser that produces Discourse Representation Structures (DRSs).
  53. """
  54. def __init__(
  55. self,
  56. boxer_drs_interpreter=None,
  57. elimeq=False,
  58. bin_dir=None,
  59. verbose=False,
  60. resolve=True,
  61. ):
  62. """
  63. :param boxer_drs_interpreter: A class that converts from the
  64. ``AbstractBoxerDrs`` object hierarchy to a different object. The
  65. default is ``NltkDrtBoxerDrsInterpreter``, which converts to the NLTK
  66. DRT hierarchy.
  67. :param elimeq: When set to true, Boxer removes all equalities from the
  68. DRSs and discourse referents standing in the equality relation are
  69. unified, but only if this can be done in a meaning-preserving manner.
  70. :param resolve: When set to true, Boxer will resolve all anaphoric DRSs and perform merge-reduction.
  71. Resolution follows Van der Sandt's theory of binding and accommodation.
  72. """
  73. if boxer_drs_interpreter is None:
  74. boxer_drs_interpreter = NltkDrtBoxerDrsInterpreter()
  75. self._boxer_drs_interpreter = boxer_drs_interpreter
  76. self._resolve = resolve
  77. self._elimeq = elimeq
  78. self.set_bin_dir(bin_dir, verbose)
  79. def set_bin_dir(self, bin_dir, verbose=False):
  80. self._candc_bin = self._find_binary("candc", bin_dir, verbose)
  81. self._candc_models_path = os.path.normpath(
  82. os.path.join(self._candc_bin[:-5], "../models")
  83. )
  84. self._boxer_bin = self._find_binary("boxer", bin_dir, verbose)
  85. def interpret(self, input, discourse_id=None, question=False, verbose=False):
  86. """
  87. Use Boxer to give a first order representation.
  88. :param input: str Input sentence to parse
  89. :param occur_index: bool Should predicates be occurrence indexed?
  90. :param discourse_id: str An identifier to be inserted to each occurrence-indexed predicate.
  91. :return: ``drt.DrtExpression``
  92. """
  93. discourse_ids = [discourse_id] if discourse_id is not None else None
  94. d, = self.interpret_multi_sents([[input]], discourse_ids, question, verbose)
  95. if not d:
  96. raise Exception('Unable to interpret: "{0}"'.format(input))
  97. return d
  98. def interpret_multi(self, input, discourse_id=None, question=False, verbose=False):
  99. """
  100. Use Boxer to give a first order representation.
  101. :param input: list of str Input sentences to parse as a single discourse
  102. :param occur_index: bool Should predicates be occurrence indexed?
  103. :param discourse_id: str An identifier to be inserted to each occurrence-indexed predicate.
  104. :return: ``drt.DrtExpression``
  105. """
  106. discourse_ids = [discourse_id] if discourse_id is not None else None
  107. d, = self.interpret_multi_sents([input], discourse_ids, question, verbose)
  108. if not d:
  109. raise Exception('Unable to interpret: "{0}"'.format(input))
  110. return d
  111. def interpret_sents(
  112. self, inputs, discourse_ids=None, question=False, verbose=False
  113. ):
  114. """
  115. Use Boxer to give a first order representation.
  116. :param inputs: list of str Input sentences to parse as individual discourses
  117. :param occur_index: bool Should predicates be occurrence indexed?
  118. :param discourse_ids: list of str Identifiers to be inserted to each occurrence-indexed predicate.
  119. :return: list of ``drt.DrtExpression``
  120. """
  121. return self.interpret_multi_sents(
  122. [[input] for input in inputs], discourse_ids, question, verbose
  123. )
  124. def interpret_multi_sents(
  125. self, inputs, discourse_ids=None, question=False, verbose=False
  126. ):
  127. """
  128. Use Boxer to give a first order representation.
  129. :param inputs: list of list of str Input discourses to parse
  130. :param occur_index: bool Should predicates be occurrence indexed?
  131. :param discourse_ids: list of str Identifiers to be inserted to each occurrence-indexed predicate.
  132. :return: ``drt.DrtExpression``
  133. """
  134. if discourse_ids is not None:
  135. assert len(inputs) == len(discourse_ids)
  136. assert reduce(operator.and_, (id is not None for id in discourse_ids))
  137. use_disc_id = True
  138. else:
  139. discourse_ids = list(map(str, range(len(inputs))))
  140. use_disc_id = False
  141. candc_out = self._call_candc(inputs, discourse_ids, question, verbose=verbose)
  142. boxer_out = self._call_boxer(candc_out, verbose=verbose)
  143. # if 'ERROR: input file contains no ccg/2 terms.' in boxer_out:
  144. # raise UnparseableInputException('Could not parse with candc: "%s"' % input_str)
  145. drs_dict = self._parse_to_drs_dict(boxer_out, use_disc_id)
  146. return [drs_dict.get(id, None) for id in discourse_ids]
  147. def _call_candc(self, inputs, discourse_ids, question, verbose=False):
  148. """
  149. Call the ``candc`` binary with the given input.
  150. :param inputs: list of list of str Input discourses to parse
  151. :param discourse_ids: list of str Identifiers to be inserted to each occurrence-indexed predicate.
  152. :param filename: str A filename for the output file
  153. :return: stdout
  154. """
  155. args = [
  156. "--models",
  157. os.path.join(self._candc_models_path, ["boxer", "questions"][question]),
  158. "--candc-printer",
  159. "boxer",
  160. ]
  161. return self._call(
  162. "\n".join(
  163. sum(
  164. (
  165. ["<META>'{0}'".format(id)] + d
  166. for d, id in zip(inputs, discourse_ids)
  167. ),
  168. [],
  169. )
  170. ),
  171. self._candc_bin,
  172. args,
  173. verbose,
  174. )
  175. def _call_boxer(self, candc_out, verbose=False):
  176. """
  177. Call the ``boxer`` binary with the given input.
  178. :param candc_out: str output from C&C parser
  179. :return: stdout
  180. """
  181. f = None
  182. try:
  183. fd, temp_filename = tempfile.mkstemp(
  184. prefix="boxer-", suffix=".in", text=True
  185. )
  186. f = os.fdopen(fd, "w")
  187. f.write(candc_out)
  188. finally:
  189. if f:
  190. f.close()
  191. args = [
  192. "--box",
  193. "false",
  194. "--semantics",
  195. "drs",
  196. #'--flat', 'false', # removed from boxer
  197. "--resolve",
  198. ["false", "true"][self._resolve],
  199. "--elimeq",
  200. ["false", "true"][self._elimeq],
  201. "--format",
  202. "prolog",
  203. "--instantiate",
  204. "true",
  205. "--input",
  206. temp_filename,
  207. ]
  208. stdout = self._call(None, self._boxer_bin, args, verbose)
  209. os.remove(temp_filename)
  210. return stdout
  211. def _find_binary(self, name, bin_dir, verbose=False):
  212. return find_binary(
  213. name,
  214. path_to_bin=bin_dir,
  215. env_vars=["CANDC"],
  216. url="http://svn.ask.it.usyd.edu.au/trac/candc/",
  217. binary_names=[name, name + ".exe"],
  218. verbose=verbose,
  219. )
  220. def _call(self, input_str, binary, args=[], verbose=False):
  221. """
  222. Call the binary with the given input.
  223. :param input_str: A string whose contents are used as stdin.
  224. :param binary: The location of the binary to call
  225. :param args: A list of command-line arguments.
  226. :return: stdout
  227. """
  228. if verbose:
  229. print("Calling:", binary)
  230. print("Args:", args)
  231. print("Input:", input_str)
  232. print("Command:", binary + " " + " ".join(args))
  233. # Call via a subprocess
  234. if input_str is None:
  235. cmd = [binary] + args
  236. p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
  237. else:
  238. cmd = 'echo "{0}" | {1} {2}'.format(input_str, binary, " ".join(args))
  239. p = subprocess.Popen(
  240. cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True
  241. )
  242. stdout, stderr = p.communicate()
  243. if verbose:
  244. print("Return code:", p.returncode)
  245. if stdout:
  246. print("stdout:\n", stdout, "\n")
  247. if stderr:
  248. print("stderr:\n", stderr, "\n")
  249. if p.returncode != 0:
  250. raise Exception(
  251. "ERROR CALLING: {0} {1}\nReturncode: {2}\n{3}".format(
  252. binary, " ".join(args), p.returncode, stderr
  253. )
  254. )
  255. return stdout
  256. def _parse_to_drs_dict(self, boxer_out, use_disc_id):
  257. lines = boxer_out.split("\n")
  258. drs_dict = {}
  259. i = 0
  260. while i < len(lines):
  261. line = lines[i]
  262. if line.startswith("id("):
  263. comma_idx = line.index(",")
  264. discourse_id = line[3:comma_idx]
  265. if discourse_id[0] == "'" and discourse_id[-1] == "'":
  266. discourse_id = discourse_id[1:-1]
  267. drs_id = line[comma_idx + 1 : line.index(")")]
  268. i += 1
  269. line = lines[i]
  270. assert line.startswith("sem({0},".format(drs_id))
  271. if line[-4:] == "').'":
  272. line = line[:-4] + ")."
  273. assert line.endswith(")."), "can't parse line: {0}".format(line)
  274. search_start = len("sem({0},[".format(drs_id))
  275. brace_count = 1
  276. drs_start = -1
  277. for j, c in enumerate(line[search_start:]):
  278. if c == "[":
  279. brace_count += 1
  280. if c == "]":
  281. brace_count -= 1
  282. if brace_count == 0:
  283. drs_start = search_start + j + 1
  284. if line[drs_start : drs_start + 3] == "','":
  285. drs_start = drs_start + 3
  286. else:
  287. drs_start = drs_start + 1
  288. break
  289. assert drs_start > -1
  290. drs_input = line[drs_start:-2].strip()
  291. parsed = self._parse_drs(drs_input, discourse_id, use_disc_id)
  292. drs_dict[discourse_id] = self._boxer_drs_interpreter.interpret(parsed)
  293. i += 1
  294. return drs_dict
  295. def _parse_drs(self, drs_string, discourse_id, use_disc_id):
  296. return BoxerOutputDrsParser([None, discourse_id][use_disc_id]).parse(drs_string)
  297. class BoxerOutputDrsParser(DrtParser):
  298. def __init__(self, discourse_id=None):
  299. """
  300. This class is used to parse the Prolog DRS output from Boxer into a
  301. hierarchy of python objects.
  302. """
  303. DrtParser.__init__(self)
  304. self.discourse_id = discourse_id
  305. self.sentence_id_offset = None
  306. self.quote_chars = [("'", "'", "\\", False)]
  307. def parse(self, data, signature=None):
  308. return DrtParser.parse(self, data, signature)
  309. def get_all_symbols(self):
  310. return ["(", ")", ",", "[", "]", ":"]
  311. def handle(self, tok, context):
  312. return self.handle_drs(tok)
  313. def attempt_adjuncts(self, expression, context):
  314. return expression
  315. def parse_condition(self, indices):
  316. """
  317. Parse a DRS condition
  318. :return: list of ``DrtExpression``
  319. """
  320. tok = self.token()
  321. accum = self.handle_condition(tok, indices)
  322. if accum is None:
  323. raise UnexpectedTokenException(tok)
  324. return accum
  325. def handle_drs(self, tok):
  326. if tok == "drs":
  327. return self.parse_drs()
  328. elif tok in ["merge", "smerge"]:
  329. return self._handle_binary_expression(self._make_merge_expression)(None, [])
  330. elif tok in ["alfa"]:
  331. return self._handle_alfa(self._make_merge_expression)(None, [])
  332. def handle_condition(self, tok, indices):
  333. """
  334. Handle a DRS condition
  335. :param indices: list of int
  336. :return: list of ``DrtExpression``
  337. """
  338. if tok == "not":
  339. return [self._handle_not()]
  340. if tok == "or":
  341. conds = [self._handle_binary_expression(self._make_or_expression)]
  342. elif tok == "imp":
  343. conds = [self._handle_binary_expression(self._make_imp_expression)]
  344. elif tok == "eq":
  345. conds = [self._handle_eq()]
  346. elif tok == "prop":
  347. conds = [self._handle_prop()]
  348. elif tok == "pred":
  349. conds = [self._handle_pred()]
  350. elif tok == "named":
  351. conds = [self._handle_named()]
  352. elif tok == "rel":
  353. conds = [self._handle_rel()]
  354. elif tok == "timex":
  355. conds = self._handle_timex()
  356. elif tok == "card":
  357. conds = [self._handle_card()]
  358. elif tok == "whq":
  359. conds = [self._handle_whq()]
  360. elif tok == "duplex":
  361. conds = [self._handle_duplex()]
  362. else:
  363. conds = []
  364. return sum(
  365. [
  366. [cond(sent_index, word_indices) for cond in conds]
  367. for sent_index, word_indices in self._sent_and_word_indices(indices)
  368. ],
  369. [],
  370. )
  371. def _handle_not(self):
  372. self.assertToken(self.token(), "(")
  373. drs = self.process_next_expression(None)
  374. self.assertToken(self.token(), ")")
  375. return BoxerNot(drs)
  376. def _handle_pred(self):
  377. # pred(_G3943, dog, n, 0)
  378. self.assertToken(self.token(), "(")
  379. variable = self.parse_variable()
  380. self.assertToken(self.token(), ",")
  381. name = self.token()
  382. self.assertToken(self.token(), ",")
  383. pos = self.token()
  384. self.assertToken(self.token(), ",")
  385. sense = int(self.token())
  386. self.assertToken(self.token(), ")")
  387. def _handle_pred_f(sent_index, word_indices):
  388. return BoxerPred(
  389. self.discourse_id, sent_index, word_indices, variable, name, pos, sense
  390. )
  391. return _handle_pred_f
  392. def _handle_duplex(self):
  393. # duplex(whq, drs(...), var, drs(...))
  394. self.assertToken(self.token(), "(")
  395. # self.assertToken(self.token(), '[')
  396. ans_types = []
  397. # while self.token(0) != ']':
  398. # cat = self.token()
  399. # self.assertToken(self.token(), ':')
  400. # if cat == 'des':
  401. # ans_types.append(self.token())
  402. # elif cat == 'num':
  403. # ans_types.append('number')
  404. # typ = self.token()
  405. # if typ == 'cou':
  406. # ans_types.append('count')
  407. # else:
  408. # ans_types.append(typ)
  409. # else:
  410. # ans_types.append(self.token())
  411. # self.token() #swallow the ']'
  412. self.assertToken(self.token(), "whq")
  413. self.assertToken(self.token(), ",")
  414. d1 = self.process_next_expression(None)
  415. self.assertToken(self.token(), ",")
  416. ref = self.parse_variable()
  417. self.assertToken(self.token(), ",")
  418. d2 = self.process_next_expression(None)
  419. self.assertToken(self.token(), ")")
  420. return lambda sent_index, word_indices: BoxerWhq(
  421. self.discourse_id, sent_index, word_indices, ans_types, d1, ref, d2
  422. )
  423. def _handle_named(self):
  424. # named(x0, john, per, 0)
  425. self.assertToken(self.token(), "(")
  426. variable = self.parse_variable()
  427. self.assertToken(self.token(), ",")
  428. name = self.token()
  429. self.assertToken(self.token(), ",")
  430. type = self.token()
  431. self.assertToken(self.token(), ",")
  432. sense = self.token() # as per boxer rev 2554
  433. self.assertToken(self.token(), ")")
  434. return lambda sent_index, word_indices: BoxerNamed(
  435. self.discourse_id, sent_index, word_indices, variable, name, type, sense
  436. )
  437. def _handle_rel(self):
  438. # rel(_G3993, _G3943, agent, 0)
  439. self.assertToken(self.token(), "(")
  440. var1 = self.parse_variable()
  441. self.assertToken(self.token(), ",")
  442. var2 = self.parse_variable()
  443. self.assertToken(self.token(), ",")
  444. rel = self.token()
  445. self.assertToken(self.token(), ",")
  446. sense = int(self.token())
  447. self.assertToken(self.token(), ")")
  448. return lambda sent_index, word_indices: BoxerRel(
  449. self.discourse_id, sent_index, word_indices, var1, var2, rel, sense
  450. )
  451. def _handle_timex(self):
  452. # timex(_G18322, date([]: (+), []:'XXXX', [1004]:'04', []:'XX'))
  453. self.assertToken(self.token(), "(")
  454. arg = self.parse_variable()
  455. self.assertToken(self.token(), ",")
  456. new_conds = self._handle_time_expression(arg)
  457. self.assertToken(self.token(), ")")
  458. return new_conds
  459. def _handle_time_expression(self, arg):
  460. # date([]: (+), []:'XXXX', [1004]:'04', []:'XX')
  461. tok = self.token()
  462. self.assertToken(self.token(), "(")
  463. if tok == "date":
  464. conds = self._handle_date(arg)
  465. elif tok == "time":
  466. conds = self._handle_time(arg)
  467. else:
  468. return None
  469. self.assertToken(self.token(), ")")
  470. return [
  471. lambda sent_index, word_indices: BoxerPred(
  472. self.discourse_id, sent_index, word_indices, arg, tok, "n", 0
  473. )
  474. ] + [lambda sent_index, word_indices: cond for cond in conds]
  475. def _handle_date(self, arg):
  476. # []: (+), []:'XXXX', [1004]:'04', []:'XX'
  477. conds = []
  478. (sent_index, word_indices), = self._sent_and_word_indices(
  479. self._parse_index_list()
  480. )
  481. self.assertToken(self.token(), "(")
  482. pol = self.token()
  483. self.assertToken(self.token(), ")")
  484. conds.append(
  485. BoxerPred(
  486. self.discourse_id,
  487. sent_index,
  488. word_indices,
  489. arg,
  490. "date_pol_{0}".format(pol),
  491. "a",
  492. 0,
  493. )
  494. )
  495. self.assertToken(self.token(), ",")
  496. (sent_index, word_indices), = self._sent_and_word_indices(
  497. self._parse_index_list()
  498. )
  499. year = self.token()
  500. if year != "XXXX":
  501. year = year.replace(":", "_")
  502. conds.append(
  503. BoxerPred(
  504. self.discourse_id,
  505. sent_index,
  506. word_indices,
  507. arg,
  508. "date_year_{0}".format(year),
  509. "a",
  510. 0,
  511. )
  512. )
  513. self.assertToken(self.token(), ",")
  514. (sent_index, word_indices), = self._sent_and_word_indices(
  515. self._parse_index_list()
  516. )
  517. month = self.token()
  518. if month != "XX":
  519. conds.append(
  520. BoxerPred(
  521. self.discourse_id,
  522. sent_index,
  523. word_indices,
  524. arg,
  525. "date_month_{0}".format(month),
  526. "a",
  527. 0,
  528. )
  529. )
  530. self.assertToken(self.token(), ",")
  531. (sent_index, word_indices), = self._sent_and_word_indices(
  532. self._parse_index_list()
  533. )
  534. day = self.token()
  535. if day != "XX":
  536. conds.append(
  537. BoxerPred(
  538. self.discourse_id,
  539. sent_index,
  540. word_indices,
  541. arg,
  542. "date_day_{0}".format(day),
  543. "a",
  544. 0,
  545. )
  546. )
  547. return conds
  548. def _handle_time(self, arg):
  549. # time([1018]:'18', []:'XX', []:'XX')
  550. conds = []
  551. self._parse_index_list()
  552. hour = self.token()
  553. if hour != "XX":
  554. conds.append(self._make_atom("r_hour_2", arg, hour))
  555. self.assertToken(self.token(), ",")
  556. self._parse_index_list()
  557. min = self.token()
  558. if min != "XX":
  559. conds.append(self._make_atom("r_min_2", arg, min))
  560. self.assertToken(self.token(), ",")
  561. self._parse_index_list()
  562. sec = self.token()
  563. if sec != "XX":
  564. conds.append(self._make_atom("r_sec_2", arg, sec))
  565. return conds
  566. def _handle_card(self):
  567. # card(_G18535, 28, ge)
  568. self.assertToken(self.token(), "(")
  569. variable = self.parse_variable()
  570. self.assertToken(self.token(), ",")
  571. value = self.token()
  572. self.assertToken(self.token(), ",")
  573. type = self.token()
  574. self.assertToken(self.token(), ")")
  575. return lambda sent_index, word_indices: BoxerCard(
  576. self.discourse_id, sent_index, word_indices, variable, value, type
  577. )
  578. def _handle_prop(self):
  579. # prop(_G15949, drs(...))
  580. self.assertToken(self.token(), "(")
  581. variable = self.parse_variable()
  582. self.assertToken(self.token(), ",")
  583. drs = self.process_next_expression(None)
  584. self.assertToken(self.token(), ")")
  585. return lambda sent_index, word_indices: BoxerProp(
  586. self.discourse_id, sent_index, word_indices, variable, drs
  587. )
  588. def _parse_index_list(self):
  589. # [1001,1002]:
  590. indices = []
  591. self.assertToken(self.token(), "[")
  592. while self.token(0) != "]":
  593. indices.append(self.parse_index())
  594. if self.token(0) == ",":
  595. self.token() # swallow ','
  596. self.token() # swallow ']'
  597. self.assertToken(self.token(), ":")
  598. return indices
  599. def parse_drs(self):
  600. # drs([[1001]:_G3943],
  601. # [[1002]:pred(_G3943, dog, n, 0)]
  602. # )
  603. self.assertToken(self.token(), "(")
  604. self.assertToken(self.token(), "[")
  605. refs = set()
  606. while self.token(0) != "]":
  607. indices = self._parse_index_list()
  608. refs.add(self.parse_variable())
  609. if self.token(0) == ",":
  610. self.token() # swallow ','
  611. self.token() # swallow ']'
  612. self.assertToken(self.token(), ",")
  613. self.assertToken(self.token(), "[")
  614. conds = []
  615. while self.token(0) != "]":
  616. indices = self._parse_index_list()
  617. conds.extend(self.parse_condition(indices))
  618. if self.token(0) == ",":
  619. self.token() # swallow ','
  620. self.token() # swallow ']'
  621. self.assertToken(self.token(), ")")
  622. return BoxerDrs(list(refs), conds)
  623. def _handle_binary_expression(self, make_callback):
  624. self.assertToken(self.token(), "(")
  625. drs1 = self.process_next_expression(None)
  626. self.assertToken(self.token(), ",")
  627. drs2 = self.process_next_expression(None)
  628. self.assertToken(self.token(), ")")
  629. return lambda sent_index, word_indices: make_callback(
  630. sent_index, word_indices, drs1, drs2
  631. )
  632. def _handle_alfa(self, make_callback):
  633. self.assertToken(self.token(), "(")
  634. type = self.token()
  635. self.assertToken(self.token(), ",")
  636. drs1 = self.process_next_expression(None)
  637. self.assertToken(self.token(), ",")
  638. drs2 = self.process_next_expression(None)
  639. self.assertToken(self.token(), ")")
  640. return lambda sent_index, word_indices: make_callback(
  641. sent_index, word_indices, drs1, drs2
  642. )
  643. def _handle_eq(self):
  644. self.assertToken(self.token(), "(")
  645. var1 = self.parse_variable()
  646. self.assertToken(self.token(), ",")
  647. var2 = self.parse_variable()
  648. self.assertToken(self.token(), ")")
  649. return lambda sent_index, word_indices: BoxerEq(
  650. self.discourse_id, sent_index, word_indices, var1, var2
  651. )
  652. def _handle_whq(self):
  653. self.assertToken(self.token(), "(")
  654. self.assertToken(self.token(), "[")
  655. ans_types = []
  656. while self.token(0) != "]":
  657. cat = self.token()
  658. self.assertToken(self.token(), ":")
  659. if cat == "des":
  660. ans_types.append(self.token())
  661. elif cat == "num":
  662. ans_types.append("number")
  663. typ = self.token()
  664. if typ == "cou":
  665. ans_types.append("count")
  666. else:
  667. ans_types.append(typ)
  668. else:
  669. ans_types.append(self.token())
  670. self.token() # swallow the ']'
  671. self.assertToken(self.token(), ",")
  672. d1 = self.process_next_expression(None)
  673. self.assertToken(self.token(), ",")
  674. ref = self.parse_variable()
  675. self.assertToken(self.token(), ",")
  676. d2 = self.process_next_expression(None)
  677. self.assertToken(self.token(), ")")
  678. return lambda sent_index, word_indices: BoxerWhq(
  679. self.discourse_id, sent_index, word_indices, ans_types, d1, ref, d2
  680. )
  681. def _make_merge_expression(self, sent_index, word_indices, drs1, drs2):
  682. return BoxerDrs(drs1.refs + drs2.refs, drs1.conds + drs2.conds)
  683. def _make_or_expression(self, sent_index, word_indices, drs1, drs2):
  684. return BoxerOr(self.discourse_id, sent_index, word_indices, drs1, drs2)
  685. def _make_imp_expression(self, sent_index, word_indices, drs1, drs2):
  686. return BoxerDrs(drs1.refs, drs1.conds, drs2)
  687. def parse_variable(self):
  688. var = self.token()
  689. assert re.match("^[exps]\d+$", var), var
  690. return var
  691. def parse_index(self):
  692. return int(self.token())
  693. def _sent_and_word_indices(self, indices):
  694. """
  695. :return: list of (sent_index, word_indices) tuples
  696. """
  697. sent_indices = set((i / 1000) - 1 for i in indices if i >= 0)
  698. if sent_indices:
  699. pairs = []
  700. for sent_index in sent_indices:
  701. word_indices = [
  702. (i % 1000) - 1 for i in indices if sent_index == (i / 1000) - 1
  703. ]
  704. pairs.append((sent_index, word_indices))
  705. return pairs
  706. else:
  707. word_indices = [(i % 1000) - 1 for i in indices]
  708. return [(None, word_indices)]
  class BoxerDrsParser(DrtParser):
      """
      Reparse the str form of subclasses of ``AbstractBoxerDrs``
      """

      def __init__(self, discourse_id=None):
          """
          :param discourse_id: when not None, used as the discourse id of
              every indexed condition instead of reading one from the input
          """
          DrtParser.__init__(self)
          self.discourse_id = discourse_id

      def get_all_symbols(self):
          """:return: the punctuation symbols listed for this parser"""
          return [
              DrtTokens.OPEN,
              DrtTokens.CLOSE,
              DrtTokens.COMMA,
              DrtTokens.OPEN_BRACKET,
              DrtTokens.CLOSE_BRACKET,
          ]

      def attempt_adjuncts(self, expression, context):
          """Return ``expression`` unchanged."""
          return expression

      def _indexed_prefix(self):
          """
          Parse ``( disc_id , sent_id , [word_ids] ,``, the prefix shared by
          every indexed condition.

          :return: (disc_id, sent_id, word_ids)
          """
          self.assertNextToken(DrtTokens.OPEN)
          # NOTE(review): when ``self.discourse_id`` is set, no token is
          # consumed for the id, so the input is expected to omit it.
          # Confirm this is intended.
          disc_id = self.discourse_id if self.discourse_id is not None else self.token()
          self.assertNextToken(DrtTokens.COMMA)
          # Every condition prints its sentence index with "%s", so "None"
          # must be accepted uniformly.
          sent_id = self.nullableIntToken()
          self.assertNextToken(DrtTokens.COMMA)
          # Materialise the indices: a lazy map() can be consumed only once
          # and never compares equal to another sequence.
          word_ids = list(map(int, self.handle_refs()))
          self.assertNextToken(DrtTokens.COMMA)
          return disc_id, sent_id, word_ids

      def _int_before(self, symbol):
          """Read an int token, then require ``symbol`` as the next token."""
          value = int(self.token())
          self.assertNextToken(symbol)
          return value

      def _token_before(self, symbol):
          """Read a raw token, then require ``symbol`` as the next token."""
          value = self.token()
          self.assertNextToken(symbol)
          return value

      def _drs_before(self, symbol):
          """Parse a nested expression, then require ``symbol``."""
          drs = self.process_next_expression(None)
          self.assertNextToken(symbol)
          return drs

      def handle(self, tok, context):
          """
          Parse the condition whose functor is ``tok``.

          :param tok: one of ``pred``, ``named``, ``rel``, ``prop``, ``not``,
              ``imp``, ``or``, ``eq``, ``card``, ``whq``
          :param context: not used
          :return: the parsed Boxer object
          :raise LogicalExpressionException: if parsing a known functor fails
          :raise AssertionError: if ``tok`` is not a known functor
          """
          comma, close = DrtTokens.COMMA, DrtTokens.CLOSE
          try:
              # The "drs(...)" form is not handled here.
              if tok == "pred":
                  disc_id, sent_id, word_ids = self._indexed_prefix()
                  variable = self._int_before(comma)
                  name = self._token_before(comma)
                  pos = self._token_before(comma)
                  sense = self._int_before(close)
                  return BoxerPred(disc_id, sent_id, word_ids, variable, name, pos, sense)
              elif tok == "named":
                  disc_id, sent_id, word_ids = self._indexed_prefix()
                  variable = self._int_before(comma)
                  name = self._token_before(comma)
                  type = self._token_before(comma)
                  sense = self._int_before(close)
                  return BoxerNamed(
                      disc_id, sent_id, word_ids, variable, name, type, sense
                  )
              elif tok == "rel":
                  disc_id, sent_id, word_ids = self._indexed_prefix()
                  var1 = self._int_before(comma)
                  var2 = self._int_before(comma)
                  rel = self._token_before(comma)
                  sense = self._int_before(close)
                  return BoxerRel(disc_id, sent_id, word_ids, var1, var2, rel, sense)
              elif tok == "prop":
                  disc_id, sent_id, word_ids = self._indexed_prefix()
                  variable = self._int_before(comma)
                  drs = self._drs_before(close)
                  return BoxerProp(disc_id, sent_id, word_ids, variable, drs)
              elif tok == "not":
                  self.assertNextToken(DrtTokens.OPEN)
                  drs = self._drs_before(close)
                  return BoxerNot(drs)
              elif tok == "imp":
                  self.assertNextToken(DrtTokens.OPEN)
                  drs1 = self._drs_before(comma)
                  drs2 = self._drs_before(close)
                  return BoxerDrs(drs1.refs, drs1.conds, drs2)
              elif tok == "or":
                  disc_id, sent_id, word_ids = self._indexed_prefix()
                  drs1 = self._drs_before(comma)
                  drs2 = self._drs_before(close)
                  return BoxerOr(disc_id, sent_id, word_ids, drs1, drs2)
              elif tok == "eq":
                  disc_id, sent_id, word_ids = self._indexed_prefix()
                  var1 = self._int_before(comma)
                  var2 = self._int_before(close)
                  return BoxerEq(disc_id, sent_id, word_ids, var1, var2)
              elif tok == "card":
                  disc_id, sent_id, word_ids = self._indexed_prefix()
                  var = self._int_before(comma)
                  value = self._token_before(comma)
                  type = self._token_before(close)
                  return BoxerCard(disc_id, sent_id, word_ids, var, value, type)
              elif tok == "whq":
                  disc_id, sent_id, word_ids = self._indexed_prefix()
                  ans_types = self.handle_refs()
                  self.assertNextToken(comma)
                  drs1 = self._drs_before(comma)
                  var = self._int_before(comma)
                  drs2 = self._drs_before(close)
                  return BoxerWhq(disc_id, sent_id, word_ids, ans_types, drs1, var, drs2)
          except Exception as e:
              raise LogicalExpressionException(self._currentIndex, str(e))
          assert False, repr(tok)

      def nullableIntToken(self):
          """
          Consume the next token.

          :return: None if the token is the string ``None``, else its int value
          """
          t = self.token()
          return int(t) if t != "None" else None

      def get_next_token_variable(self, description):
          """
          Return the next token as-is.

          :param description: not used
          :raise ExpectedMoreTokensException: if the input is exhausted
          """
          try:
              return self.token()
          except ExpectedMoreTokensException as e:
              raise ExpectedMoreTokensException(e.index, "Variable expected.")
  class AbstractBoxerDrs(object):
      """
      Base class for Boxer DRS objects.  Every method here is a neutral
      default: no variables, no atoms, and ``clean`` /
      ``renumber_sentences`` return the object itself.
      """

      def variables(self):
          """
          :return: (set<variables>, set<events>, set<propositions>)
          """
          everything, event_vars, prop_vars = self._variables()
          # Events take precedence over propositions; both over plain variables.
          plain_vars = everything - (event_vars | prop_vars)
          return (plain_vars, event_vars, prop_vars - event_vars)

      def variable_types(self):
          """
          :return: dict mapping each variable to "z", "e" or "p", following
              the three sets returned by ``variables()``
          """
          vartypes = {}
          for tag, group in zip(("z", "e", "p"), self.variables()):
              vartypes.update(dict.fromkeys(group, tag))
          return vartypes

      def _variables(self):
          """
          :return: (set<variables>, set<events>, set<propositions>)
          """
          return (set(), set(), set())

      def atoms(self):
          """:return: an empty set"""
          return set()

      def clean(self):
          """:return: this object, unchanged"""
          return self

      def _clean_name(self, name):
          """Return ``name`` with every hyphen and apostrophe replaced by "_"."""
          for unwanted in ("-", "'"):
              name = name.replace(unwanted, "_")
          return name

      def renumber_sentences(self, f):
          """:return: this object, unchanged; ``f`` is not called"""
          return self

      def __hash__(self):
          """Hash the string form of the object."""
          rendered = "{0}".format(self)
          return hash(rendered)
  class BoxerDrs(AbstractBoxerDrs):
      """
      A DRS box: referents plus conditions, with an optional consequent.
      When a consequent is present the box prints as ``imp(drs, consequent)``.
      """

      def __init__(self, refs, conds, consequent=None):
          """
          :param refs: the discourse referents
          :param conds: list of condition objects
          :param consequent: optional consequent DRS
          """
          AbstractBoxerDrs.__init__(self)
          self.refs = refs
          self.conds = conds
          self.consequent = consequent

      def _variables(self):
          """
          :return: (set<variables>, set<events>, set<propositions>) merged
              over all conditions and the consequent
          """
          variables = (set(), set(), set())
          for cond in self.conds:
              for s, v in zip(variables, cond._variables()):
                  s.update(v)
          if self.consequent is not None:
              for s, v in zip(variables, self.consequent._variables()):
                  s.update(v)
          return variables

      def atoms(self):
          """:return: the union of the atoms of all conditions and the consequent"""
          atoms = set()
          for cond in self.conds:
              atoms |= cond.atoms()
          if self.consequent is not None:
              atoms.update(self.consequent.atoms())
          return atoms

      def clean(self):
          """:return: a new ``BoxerDrs`` built from cleaned conditions and consequent"""
          consequent = self.consequent.clean() if self.consequent is not None else None
          return BoxerDrs(self.refs, [c.clean() for c in self.conds], consequent)

      def renumber_sentences(self, f):
          """
          :param f: function mapping an old sentence index to a new one
          :return: a new ``BoxerDrs`` with ``f`` applied throughout
          """
          consequent = (
              self.consequent.renumber_sentences(f)
              if self.consequent is not None
              else None
          )
          return BoxerDrs(
              self.refs, [c.renumber_sentences(f) for c in self.conds], consequent
          )

      def __repr__(self):
          s = "drs([%s], [%s])" % (
              ", ".join("%s" % r for r in self.refs),
              ", ".join("%s" % c for c in self.conds),
          )
          if self.consequent is not None:
              s = "imp(%s, %s)" % (s, self.consequent)
          return s

      def __eq__(self, other):
          return (
              self.__class__ == other.__class__
              and self.refs == other.refs
              and len(self.conds) == len(other.conds)
              # all() is True for two empty condition lists; reduce() with no
              # initial value raised TypeError in that case.
              and all(c1 == c2 for c1, c2 in zip(self.conds, other.conds))
              and self.consequent == other.consequent
          )

      def __ne__(self, other):
          return not self == other

      # Defining __eq__ would otherwise drop the inherited hash.
      __hash__ = AbstractBoxerDrs.__hash__
  class BoxerNot(AbstractBoxerDrs):
      """Negation of a DRS; printed as ``not(<drs>)``."""

      def __init__(self, drs):
          AbstractBoxerDrs.__init__(self)
          self.drs = drs

      def _variables(self):
          """:return: the variable sets of the negated DRS"""
          negated = self.drs
          return negated._variables()

      def atoms(self):
          """:return: the atoms of the negated DRS"""
          negated = self.drs
          return negated.atoms()

      def clean(self):
          """:return: a new ``BoxerNot`` around the cleaned DRS"""
          cleaned = self.drs.clean()
          return BoxerNot(cleaned)

      def renumber_sentences(self, f):
          """:return: a new ``BoxerNot`` around the renumbered DRS"""
          renumbered = self.drs.renumber_sentences(f)
          return BoxerNot(renumbered)

      def __repr__(self):
          return "not(%s)" % (self.drs)

      def __eq__(self, other):
          if self.__class__ != other.__class__:
              return False
          return self.drs == other.drs

      def __ne__(self, other):
          return not self == other

      __hash__ = AbstractBoxerDrs.__hash__
  class BoxerIndexed(AbstractBoxerDrs):
      """
      Base class for conditions carrying a discourse id, a sentence index and
      a list of word indices.  ``__eq__`` and ``__repr__`` rely on the
      subclass being iterable (its own fields, in print order) and on its
      ``_pred()`` method (the functor name).
      """

      def __init__(self, discourse_id, sent_index, word_indices):
          AbstractBoxerDrs.__init__(self)
          self.discourse_id = discourse_id
          self.sent_index = sent_index
          self.word_indices = word_indices

      def atoms(self):
          """:return: a set containing only this condition"""
          return {self}

      def __eq__(self, other):
          return (
              self.__class__ == other.__class__
              and self.discourse_id == other.discourse_id
              and self.sent_index == other.sent_index
              and self.word_indices == other.word_indices
              # all() short-circuits and accepts an empty field sequence;
              # reduce() with no initial value raises TypeError on one.
              and all(s == o for s, o in zip(self, other))
          )

      def __ne__(self, other):
          return not self == other

      # Defining __eq__ would otherwise drop the inherited hash.
      __hash__ = AbstractBoxerDrs.__hash__

      def __repr__(self):
          head = "%s(%s, %s, [%s]" % (
              self._pred(),
              self.discourse_id,
              self.sent_index,
              ", ".join("%s" % wi for wi in self.word_indices),
          )
          # Append the subclass-specific fields in iteration order.
          return head + "".join(", %s" % v for v in self) + ")"
  class BoxerPred(BoxerIndexed):
      """Indexed condition printed as ``pred``: a variable, name, pos and sense."""

      def __init__(self, discourse_id, sent_index, word_indices, var, name, pos, sense):
          BoxerIndexed.__init__(self, discourse_id, sent_index, word_indices)
          self.var, self.name = var, name
          self.pos, self.sense = pos, sense

      def _variables(self):
          """:return: ``var`` as a plain variable; no events or propositions"""
          return ({self.var}, set(), set())

      def _copy(self, **overrides):
          """Return a new ``BoxerPred`` equal to this one except for ``overrides``."""
          fields = dict(
              discourse_id=self.discourse_id,
              sent_index=self.sent_index,
              word_indices=self.word_indices,
              var=self.var,
              name=self.name,
              pos=self.pos,
              sense=self.sense,
          )
          fields.update(overrides)
          return BoxerPred(**fields)

      def change_var(self, var):
          """:return: a copy whose variable is ``var``"""
          return self._copy(var=var)

      def clean(self):
          """:return: a copy whose name has been passed through ``_clean_name``"""
          return self._copy(name=self._clean_name(self.name))

      def renumber_sentences(self, f):
          """:return: a copy whose sentence index is ``f(self.sent_index)``"""
          new_sent_index = f(self.sent_index)
          return self._copy(sent_index=new_sent_index)

      def __iter__(self):
          own_fields = (self.var, self.name, self.pos, self.sense)
          return iter(own_fields)

      def _pred(self):
          return "pred"
  class BoxerNamed(BoxerIndexed):
      """Indexed condition printed as ``named``: a variable, name, type and sense."""

      def __init__(self, discourse_id, sent_index, word_indices, var, name, type, sense):
          BoxerIndexed.__init__(self, discourse_id, sent_index, word_indices)
          self.var, self.name = var, name
          self.type, self.sense = type, sense

      def _variables(self):
          """:return: ``var`` as a plain variable; no events or propositions"""
          return ({self.var}, set(), set())

      def _copy(self, **overrides):
          """Return a new ``BoxerNamed`` equal to this one except for ``overrides``."""
          fields = dict(
              discourse_id=self.discourse_id,
              sent_index=self.sent_index,
              word_indices=self.word_indices,
              var=self.var,
              name=self.name,
              type=self.type,
              sense=self.sense,
          )
          fields.update(overrides)
          return BoxerNamed(**fields)

      def change_var(self, var):
          """:return: a copy whose variable is ``var``"""
          return self._copy(var=var)

      def clean(self):
          """:return: a copy whose name has been passed through ``_clean_name``"""
          return self._copy(name=self._clean_name(self.name))

      def renumber_sentences(self, f):
          """:return: a copy whose sentence index is ``f(self.sent_index)``"""
          return self._copy(sent_index=f(self.sent_index))

      def __iter__(self):
          own_fields = (self.var, self.name, self.type, self.sense)
          return iter(own_fields)

      def _pred(self):
          return "named"
  class BoxerRel(BoxerIndexed):
      """Indexed condition printed as ``rel``: two variables, a relation and a sense."""

      def __init__(self, discourse_id, sent_index, word_indices, var1, var2, rel, sense):
          BoxerIndexed.__init__(self, discourse_id, sent_index, word_indices)
          self.var1, self.var2 = var1, var2
          self.rel, self.sense = rel, sense

      def _variables(self):
          """:return: both variables as plain variables; no events or propositions"""
          return ({self.var1, self.var2}, set(), set())

      def _copy(self, **overrides):
          """Return a new ``BoxerRel`` equal to this one except for ``overrides``."""
          fields = dict(
              discourse_id=self.discourse_id,
              sent_index=self.sent_index,
              word_indices=self.word_indices,
              var1=self.var1,
              var2=self.var2,
              rel=self.rel,
              sense=self.sense,
          )
          fields.update(overrides)
          return BoxerRel(**fields)

      def clean(self):
          """:return: a copy whose relation has been passed through ``_clean_name``"""
          return self._copy(rel=self._clean_name(self.rel))

      def renumber_sentences(self, f):
          """:return: a copy whose sentence index is ``f(self.sent_index)``"""
          return self._copy(sent_index=f(self.sent_index))

      def __iter__(self):
          own_fields = (self.var1, self.var2, self.rel, self.sense)
          return iter(own_fields)

      def _pred(self):
          return "rel"
  class BoxerProp(BoxerIndexed):
      """Indexed condition printed as ``prop``: a variable paired with a DRS."""

      def __init__(self, discourse_id, sent_index, word_indices, var, drs):
          BoxerIndexed.__init__(self, discourse_id, sent_index, word_indices)
          self.var, self.drs = var, drs

      def _variables(self):
          """
          :return: the variable sets of the embedded DRS, with ``var`` added
              to the propositions set
          """
          own = (set(), set(), {self.var})
          return tuple(mine | nested for mine, nested in zip(own, self.drs._variables()))

      def referenced_labels(self):
          """:return: a set containing the embedded DRS"""
          return {self.drs}

      def atoms(self):
          """:return: the atoms of the embedded DRS"""
          return self.drs.atoms()

      def clean(self):
          """:return: a copy with the embedded DRS cleaned"""
          cleaned = self.drs.clean()
          return BoxerProp(
              self.discourse_id, self.sent_index, self.word_indices, self.var, cleaned
          )

      def renumber_sentences(self, f):
          """:return: a copy with ``f`` applied here and inside the embedded DRS"""
          new_sent_index = f(self.sent_index)
          renumbered = self.drs.renumber_sentences(f)
          return BoxerProp(
              self.discourse_id, new_sent_index, self.word_indices, self.var, renumbered
          )

      def __iter__(self):
          own_fields = (self.var, self.drs)
          return iter(own_fields)

      def _pred(self):
          return "prop"
  class BoxerEq(BoxerIndexed):
      """Indexed condition printed as ``eq``: an equality between two variables."""

      def __init__(self, discourse_id, sent_index, word_indices, var1, var2):
          BoxerIndexed.__init__(self, discourse_id, sent_index, word_indices)
          self.var1, self.var2 = var1, var2

      def _variables(self):
          """:return: both variables as plain variables; no events or propositions"""
          return ({self.var1, self.var2}, set(), set())

      def atoms(self):
          """:return: an empty set"""
          return set()

      def renumber_sentences(self, f):
          """:return: a copy whose sentence index is ``f(self.sent_index)``"""
          new_sent_index = f(self.sent_index)
          return BoxerEq(
              self.discourse_id, new_sent_index, self.word_indices, self.var1, self.var2
          )

      def __iter__(self):
          own_fields = (self.var1, self.var2)
          return iter(own_fields)

      def _pred(self):
          return "eq"
  class BoxerCard(BoxerIndexed):
      """Indexed condition printed as ``card``: a variable, a value and a type."""

      def __init__(self, discourse_id, sent_index, word_indices, var, value, type):
          BoxerIndexed.__init__(self, discourse_id, sent_index, word_indices)
          self.var, self.value, self.type = var, value, type

      def _variables(self):
          """:return: ``var`` as a plain variable; no events or propositions"""
          return ({self.var}, set(), set())

      def renumber_sentences(self, f):
          """:return: a copy whose sentence index is ``f(self.sent_index)``"""
          new_sent_index = f(self.sent_index)
          return BoxerCard(
              self.discourse_id,
              new_sent_index,
              self.word_indices,
              self.var,
              self.value,
              self.type,
          )

      def __iter__(self):
          own_fields = (self.var, self.value, self.type)
          return iter(own_fields)

      def _pred(self):
          return "card"
  class BoxerOr(BoxerIndexed):
      """Indexed condition printed as ``or``: a disjunction of two DRSs."""

      def __init__(self, discourse_id, sent_index, word_indices, drs1, drs2):
          BoxerIndexed.__init__(self, discourse_id, sent_index, word_indices)
          self.drs1 = drs1
          self.drs2 = drs2

      def _variables(self):
          """:return: the element-wise union of both disjuncts' variable sets"""
          return tuple(map(operator.or_, self.drs1._variables(), self.drs2._variables()))

      def atoms(self):
          """:return: the union of both disjuncts' atoms"""
          return self.drs1.atoms() | self.drs2.atoms()

      def clean(self):
          """:return: a copy with both disjuncts cleaned"""
          return BoxerOr(
              self.discourse_id,
              self.sent_index,
              self.word_indices,
              self.drs1.clean(),
              self.drs2.clean(),
          )

      def renumber_sentences(self, f):
          """
          :param f: function mapping an old sentence index to a new one
          :return: a copy with ``f`` applied here and inside both disjuncts
          """
          # Recurse like BoxerDrs, BoxerNot and BoxerProp do; otherwise the
          # conditions nested in the disjuncts keep their old sentence indices.
          return BoxerOr(
              self.discourse_id,
              f(self.sent_index),
              self.word_indices,
              self.drs1.renumber_sentences(f),
              self.drs2.renumber_sentences(f),
          )

      def __iter__(self):
          return iter((self.drs1, self.drs2))

      def _pred(self):
          return "or"
  class BoxerWhq(BoxerIndexed):
      """
      Indexed condition printed as ``whq``: answer types, a first DRS, a
      variable and a second DRS.
      """

      def __init__(
          self, discourse_id, sent_index, word_indices, ans_types, drs1, variable, drs2
      ):
          BoxerIndexed.__init__(self, discourse_id, sent_index, word_indices)
          self.ans_types = ans_types
          self.drs1 = drs1
          self.variable = variable
          self.drs2 = drs2

      def _variables(self):
          """
          :return: the element-wise union of both DRSs' variable sets, with
              ``variable`` added to the plain variables
          """
          return tuple(
              map(
                  operator.or_,
                  ({self.variable}, set(), set()),
                  self.drs1._variables(),
                  self.drs2._variables(),
              )
          )

      def atoms(self):
          """:return: the union of both DRSs' atoms"""
          return self.drs1.atoms() | self.drs2.atoms()

      def clean(self):
          """:return: a copy with both DRSs cleaned"""
          return BoxerWhq(
              self.discourse_id,
              self.sent_index,
              self.word_indices,
              self.ans_types,
              self.drs1.clean(),
              self.variable,
              self.drs2.clean(),
          )

      def renumber_sentences(self, f):
          """
          :param f: function mapping an old sentence index to a new one
          :return: a copy with ``f`` applied here and inside both DRSs
          """
          # Recurse like BoxerDrs, BoxerNot and BoxerProp do; otherwise the
          # conditions nested in the two DRSs keep their old sentence indices.
          return BoxerWhq(
              self.discourse_id,
              f(self.sent_index),
              self.word_indices,
              self.ans_types,
              self.drs1.renumber_sentences(f),
              self.variable,
              self.drs2.renumber_sentences(f),
          )

      def __iter__(self):
          # The answer types are rendered as one bracketed, comma-joined field.
          return iter(
              ("[" + ",".join(self.ans_types) + "]", self.drs1, self.variable, self.drs2)
          )

      def _pred(self):
          return "whq"
  class PassthroughBoxerDrsInterpreter(object):
      """Interpreter that performs no conversion."""

      def interpret(self, ex):
          """
          :param ex: any object
          :return: ``ex`` itself, unchanged
          """
          return ex
  class NltkDrtBoxerDrsInterpreter(object):
      """Convert Boxer DRS objects into DRT expression objects."""

      def __init__(self, occur_index=False):
          """
          :param occur_index: if true, predicate names are suffixed with
              discourse, sentence and word position
          """
          self._occur_index = occur_index

      def interpret(self, ex):
          """
          :param ex: ``AbstractBoxerDrs``
          :return: ``DrtExpression``
          """
          if isinstance(ex, BoxerDrs):
              referents = [Variable(r) for r in ex.refs]
              conditions = [self.interpret(cond) for cond in ex.conds]
              box = DRS(referents, conditions)
              if ex.consequent is not None:
                  box.consequent = self.interpret(ex.consequent)
              return box
          if isinstance(ex, BoxerNot):
              return DrtNegatedExpression(self.interpret(ex.drs))
          if isinstance(ex, BoxerPred):
              functor = self._add_occur_indexing("%s_%s" % (ex.pos, ex.name), ex)
              return self._make_atom(functor, ex.var)
          if isinstance(ex, BoxerNamed):
              functor = self._add_occur_indexing("ne_%s_%s" % (ex.type, ex.name), ex)
              return self._make_atom(functor, ex.var)
          if isinstance(ex, BoxerRel):
              functor = self._add_occur_indexing("%s" % (ex.rel), ex)
              return self._make_atom(functor, ex.var1, ex.var2)
          if isinstance(ex, BoxerProp):
              return DrtProposition(Variable(ex.var), self.interpret(ex.drs))
          if isinstance(ex, BoxerEq):
              left = DrtVariableExpression(Variable(ex.var1))
              right = DrtVariableExpression(Variable(ex.var2))
              return DrtEqualityExpression(left, right)
          if isinstance(ex, BoxerCard):
              functor = self._add_occur_indexing("card_%s_%s" % (ex.type, ex.value), ex)
              return self._make_atom(functor, ex.var)
          if isinstance(ex, BoxerOr):
              return DrtOrExpression(self.interpret(ex.drs1), self.interpret(ex.drs2))
          if isinstance(ex, BoxerWhq):
              # Both DRSs are merged into one box; the question variable and
              # answer types are not used.
              first = self.interpret(ex.drs1)
              second = self.interpret(ex.drs2)
              return DRS(first.refs + second.refs, first.conds + second.conds)
          assert False, "%s: %s" % (ex.__class__.__name__, ex)

      def _make_atom(self, pred, *args):
          """Apply the variable expression for ``pred`` to each argument in turn."""
          expression = DrtVariableExpression(Variable(pred))
          for argument in args:
              operand = DrtVariableExpression(Variable(argument))
              expression = DrtApplicationExpression(expression, operand)
          return expression

      def _add_occur_indexing(self, base, ex):
          """
          :return: ``base`` unchanged unless occurrence indexing is enabled
              and ``ex`` has a sentence index; otherwise ``base`` followed by
              the discourse id (if truthy), the sentence index and the
              smallest word index
          """
          if not self._occur_index or ex.sent_index is None:
              return base
          suffixes = []
          if ex.discourse_id:
              suffixes.append("_%s" % ex.discourse_id)
          suffixes.append("_s%s" % ex.sent_index)
          suffixes.append("_w%s" % sorted(ex.word_indices)[0])
          return base + "".join(suffixes)
  class UnparseableInputException(Exception):
      """Exception type for input that cannot be parsed; adds no behaviour."""
  if __name__ == "__main__":
      # Command-line entry point: interpret TEXT and print the resulting DRS.
      opts = OptionParser("usage: %prog TEXT [options]")
      # All options are boolean flags that default to False.
      flag_specs = (
          ("--verbose", "-v", "display verbose logs", "verbose"),
          ("--fol", "-f", "output FOL", "fol"),
          ("--question", "-q", "input is a question", "question"),
          ("--occur", "-o", "occurrence index", "occur_index"),
      )
      for long_flag, short_flag, help_text, dest in flag_specs:
          opts.add_option(
              long_flag,
              short_flag,
              help=help_text,
              action="store_true",
              default=False,
              dest=dest,
          )
      (options, args) = opts.parse_args()
      if len(args) != 1:
          opts.error("incorrect number of arguments")

      # Sentences are separated by a literal backslash-n in the TEXT argument.
      sentences = args[0].split(r"\n")
      interpreter = NltkDrtBoxerDrsInterpreter(occur_index=options.occur_index)
      boxer = Boxer(interpreter)
      drs = boxer.interpret_multi(
          sentences, question=options.question, verbose=options.verbose
      )
      if drs is None:
          print(None)
      else:
          drs = drs.simplify().eliminate_equality()
          if options.fol:
              print(drs.fol().normalize())
          else:
              drs.pretty_print()