nombank.py 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475
  1. # Natural Language Toolkit: NomBank Corpus Reader
  2. #
  3. # Copyright (C) 2001-2020 NLTK Project
  4. # Authors: Paul Bedaride <paul.bedaride@gmail.com>
  5. # Edward Loper <edloper@gmail.com>
  6. # URL: <http://nltk.org/>
  7. # For license information, see LICENSE.TXT
  8. from xml.etree import ElementTree
  9. from functools import total_ordering
  10. from nltk.tree import Tree
  11. from nltk.internals import raise_unorderable_types
  12. from nltk.corpus.reader.util import *
  13. from nltk.corpus.reader.api import *
  14. class NombankCorpusReader(CorpusReader):
  15. """
  16. Corpus reader for the nombank corpus, which augments the Penn
  17. Treebank with information about the predicate argument structure
  18. of every noun instance. The corpus consists of two parts: the
  19. predicate-argument annotations themselves, and a set of "frameset
  20. files" which define the argument labels used by the annotations,
  21. on a per-noun basis. Each "frameset file" contains one or more
  22. predicates, such as ``'turn'`` or ``'turn_on'``, each of which is
  23. divided into coarse-grained word senses called "rolesets". For
  24. each "roleset", the frameset file provides descriptions of the
  25. argument roles, along with examples.
  26. """
  27. def __init__(
  28. self,
  29. root,
  30. nomfile,
  31. framefiles="",
  32. nounsfile=None,
  33. parse_fileid_xform=None,
  34. parse_corpus=None,
  35. encoding="utf8",
  36. ):
  37. """
  38. :param root: The root directory for this corpus.
  39. :param nomfile: The name of the file containing the predicate-
  40. argument annotations (relative to ``root``).
  41. :param framefiles: A list or regexp specifying the frameset
  42. fileids for this corpus.
  43. :param parse_fileid_xform: A transform that should be applied
  44. to the fileids in this corpus. This should be a function
  45. of one argument (a fileid) that returns a string (the new
  46. fileid).
  47. :param parse_corpus: The corpus containing the parse trees
  48. corresponding to this corpus. These parse trees are
  49. necessary to resolve the tree pointers used by nombank.
  50. """
  51. # If framefiles is specified as a regexp, expand it.
  52. if isinstance(framefiles, str):
  53. self._fileids = find_corpus_fileids(root, framefiles)
  54. self._fileids = list(framefiles)
  55. # Initialze the corpus reader.
  56. CorpusReader.__init__(self, root, framefiles, encoding)
  57. # Record our nom file & nouns file.
  58. self._nomfile = nomfile
  59. self._nounsfile = nounsfile
  60. self._parse_fileid_xform = parse_fileid_xform
  61. self._parse_corpus = parse_corpus
  62. def raw(self, fileids=None):
  63. """
  64. :return: the text contents of the given fileids, as a single string.
  65. """
  66. if fileids is None:
  67. fileids = self._fileids
  68. elif isinstance(fileids, str):
  69. fileids = [fileids]
  70. return concat([self.open(f).read() for f in fileids])
  71. def instances(self, baseform=None):
  72. """
  73. :return: a corpus view that acts as a list of
  74. ``NombankInstance`` objects, one for each noun in the corpus.
  75. """
  76. kwargs = {}
  77. if baseform is not None:
  78. kwargs["instance_filter"] = lambda inst: inst.baseform == baseform
  79. return StreamBackedCorpusView(
  80. self.abspath(self._nomfile),
  81. lambda stream: self._read_instance_block(stream, **kwargs),
  82. encoding=self.encoding(self._nomfile),
  83. )
  84. def lines(self):
  85. """
  86. :return: a corpus view that acts as a list of strings, one for
  87. each line in the predicate-argument annotation file.
  88. """
  89. return StreamBackedCorpusView(
  90. self.abspath(self._nomfile),
  91. read_line_block,
  92. encoding=self.encoding(self._nomfile),
  93. )
  94. def roleset(self, roleset_id):
  95. """
  96. :return: the xml description for the given roleset.
  97. """
  98. baseform = roleset_id.split(".")[0]
  99. baseform = baseform.replace("perc-sign", "%")
  100. baseform = baseform.replace("oneslashonezero", "1/10").replace(
  101. "1/10", "1-slash-10"
  102. )
  103. framefile = "frames/%s.xml" % baseform
  104. if framefile not in self.fileids():
  105. raise ValueError("Frameset file for %s not found" % roleset_id)
  106. # n.b.: The encoding for XML fileids is specified by the file
  107. # itself; so we ignore self._encoding here.
  108. etree = ElementTree.parse(self.abspath(framefile).open()).getroot()
  109. for roleset in etree.findall("predicate/roleset"):
  110. if roleset.attrib["id"] == roleset_id:
  111. return roleset
  112. raise ValueError("Roleset %s not found in %s" % (roleset_id, framefile))
  113. def rolesets(self, baseform=None):
  114. """
  115. :return: list of xml descriptions for rolesets.
  116. """
  117. if baseform is not None:
  118. framefile = "frames/%s.xml" % baseform
  119. if framefile not in self.fileids():
  120. raise ValueError("Frameset file for %s not found" % baseform)
  121. framefiles = [framefile]
  122. else:
  123. framefiles = self.fileids()
  124. rsets = []
  125. for framefile in framefiles:
  126. # n.b.: The encoding for XML fileids is specified by the file
  127. # itself; so we ignore self._encoding here.
  128. etree = ElementTree.parse(self.abspath(framefile).open()).getroot()
  129. rsets.append(etree.findall("predicate/roleset"))
  130. return LazyConcatenation(rsets)
  131. def nouns(self):
  132. """
  133. :return: a corpus view that acts as a list of all noun lemmas
  134. in this corpus (from the nombank.1.0.words file).
  135. """
  136. return StreamBackedCorpusView(
  137. self.abspath(self._nounsfile),
  138. read_line_block,
  139. encoding=self.encoding(self._nounsfile),
  140. )
  141. def _read_instance_block(self, stream, instance_filter=lambda inst: True):
  142. block = []
  143. # Read 100 at a time.
  144. for i in range(100):
  145. line = stream.readline().strip()
  146. if line:
  147. inst = NombankInstance.parse(
  148. line, self._parse_fileid_xform, self._parse_corpus
  149. )
  150. if instance_filter(inst):
  151. block.append(inst)
  152. return block
  153. ######################################################################
  154. # { Nombank Instance & related datatypes
  155. ######################################################################
  156. class NombankInstance(object):
  157. def __init__(
  158. self,
  159. fileid,
  160. sentnum,
  161. wordnum,
  162. baseform,
  163. sensenumber,
  164. predicate,
  165. predid,
  166. arguments,
  167. parse_corpus=None,
  168. ):
  169. self.fileid = fileid
  170. """The name of the file containing the parse tree for this
  171. instance's sentence."""
  172. self.sentnum = sentnum
  173. """The sentence number of this sentence within ``fileid``.
  174. Indexing starts from zero."""
  175. self.wordnum = wordnum
  176. """The word number of this instance's predicate within its
  177. containing sentence. Word numbers are indexed starting from
  178. zero, and include traces and other empty parse elements."""
  179. self.baseform = baseform
  180. """The baseform of the predicate."""
  181. self.sensenumber = sensenumber
  182. """The sense number of the predicate."""
  183. self.predicate = predicate
  184. """A ``NombankTreePointer`` indicating the position of this
  185. instance's predicate within its containing sentence."""
  186. self.predid = predid
  187. """Identifier of the predicate."""
  188. self.arguments = tuple(arguments)
  189. """A list of tuples (argloc, argid), specifying the location
  190. and identifier for each of the predicate's argument in the
  191. containing sentence. Argument identifiers are strings such as
  192. ``'ARG0'`` or ``'ARGM-TMP'``. This list does *not* contain
  193. the predicate."""
  194. self.parse_corpus = parse_corpus
  195. """A corpus reader for the parse trees corresponding to the
  196. instances in this nombank corpus."""
  197. @property
  198. def roleset(self):
  199. """The name of the roleset used by this instance's predicate.
  200. Use ``nombank.roleset() <NombankCorpusReader.roleset>`` to
  201. look up information about the roleset."""
  202. r = self.baseform.replace("%", "perc-sign")
  203. r = r.replace("1/10", "1-slash-10").replace("1-slash-10", "oneslashonezero")
  204. return "%s.%s" % (r, self.sensenumber)
  205. def __repr__(self):
  206. return "<NombankInstance: %s, sent %s, word %s>" % (
  207. self.fileid,
  208. self.sentnum,
  209. self.wordnum,
  210. )
  211. def __str__(self):
  212. s = "%s %s %s %s %s" % (
  213. self.fileid,
  214. self.sentnum,
  215. self.wordnum,
  216. self.baseform,
  217. self.sensenumber,
  218. )
  219. items = self.arguments + ((self.predicate, "rel"),)
  220. for (argloc, argid) in sorted(items):
  221. s += " %s-%s" % (argloc, argid)
  222. return s
  223. def _get_tree(self):
  224. if self.parse_corpus is None:
  225. return None
  226. if self.fileid not in self.parse_corpus.fileids():
  227. return None
  228. return self.parse_corpus.parsed_sents(self.fileid)[self.sentnum]
  229. tree = property(
  230. _get_tree,
  231. doc="""
  232. The parse tree corresponding to this instance, or None if
  233. the corresponding tree is not available.""",
  234. )
  235. @staticmethod
  236. def parse(s, parse_fileid_xform=None, parse_corpus=None):
  237. pieces = s.split()
  238. if len(pieces) < 6:
  239. raise ValueError("Badly formatted nombank line: %r" % s)
  240. # Divide the line into its basic pieces.
  241. (fileid, sentnum, wordnum, baseform, sensenumber) = pieces[:5]
  242. args = pieces[5:]
  243. rel = [args.pop(i) for i, p in enumerate(args) if "-rel" in p]
  244. if len(rel) != 1:
  245. raise ValueError("Badly formatted nombank line: %r" % s)
  246. # Apply the fileid selector, if any.
  247. if parse_fileid_xform is not None:
  248. fileid = parse_fileid_xform(fileid)
  249. # Convert sentence & word numbers to ints.
  250. sentnum = int(sentnum)
  251. wordnum = int(wordnum)
  252. # Parse the predicate location.
  253. predloc, predid = rel[0].split("-", 1)
  254. predicate = NombankTreePointer.parse(predloc)
  255. # Parse the arguments.
  256. arguments = []
  257. for arg in args:
  258. argloc, argid = arg.split("-", 1)
  259. arguments.append((NombankTreePointer.parse(argloc), argid))
  260. # Put it all together.
  261. return NombankInstance(
  262. fileid,
  263. sentnum,
  264. wordnum,
  265. baseform,
  266. sensenumber,
  267. predicate,
  268. predid,
  269. arguments,
  270. parse_corpus,
  271. )
  272. class NombankPointer(object):
  273. """
  274. A pointer used by nombank to identify one or more constituents in
  275. a parse tree. ``NombankPointer`` is an abstract base class with
  276. three concrete subclasses:
  277. - ``NombankTreePointer`` is used to point to single constituents.
  278. - ``NombankSplitTreePointer`` is used to point to 'split'
  279. constituents, which consist of a sequence of two or more
  280. ``NombankTreePointer`` pointers.
  281. - ``NombankChainTreePointer`` is used to point to entire trace
  282. chains in a tree. It consists of a sequence of pieces, which
  283. can be ``NombankTreePointer`` or ``NombankSplitTreePointer`` pointers.
  284. """
  285. def __init__(self):
  286. if self.__class__ == NombankPointer:
  287. raise NotImplementedError()
  288. class NombankChainTreePointer(NombankPointer):
  289. def __init__(self, pieces):
  290. self.pieces = pieces
  291. """A list of the pieces that make up this chain. Elements may
  292. be either ``NombankSplitTreePointer`` or
  293. ``NombankTreePointer`` pointers."""
  294. def __str__(self):
  295. return "*".join("%s" % p for p in self.pieces)
  296. def __repr__(self):
  297. return "<NombankChainTreePointer: %s>" % self
  298. def select(self, tree):
  299. if tree is None:
  300. raise ValueError("Parse tree not avaialable")
  301. return Tree("*CHAIN*", [p.select(tree) for p in self.pieces])
  302. class NombankSplitTreePointer(NombankPointer):
  303. def __init__(self, pieces):
  304. self.pieces = pieces
  305. """A list of the pieces that make up this chain. Elements are
  306. all ``NombankTreePointer`` pointers."""
  307. def __str__(self):
  308. return ",".join("%s" % p for p in self.pieces)
  309. def __repr__(self):
  310. return "<NombankSplitTreePointer: %s>" % self
  311. def select(self, tree):
  312. if tree is None:
  313. raise ValueError("Parse tree not avaialable")
  314. return Tree("*SPLIT*", [p.select(tree) for p in self.pieces])
  315. @total_ordering
  316. class NombankTreePointer(NombankPointer):
  317. """
  318. wordnum:height*wordnum:height*...
  319. wordnum:height,
  320. """
  321. def __init__(self, wordnum, height):
  322. self.wordnum = wordnum
  323. self.height = height
  324. @staticmethod
  325. def parse(s):
  326. # Deal with chains (xx*yy*zz)
  327. pieces = s.split("*")
  328. if len(pieces) > 1:
  329. return NombankChainTreePointer(
  330. [NombankTreePointer.parse(elt) for elt in pieces]
  331. )
  332. # Deal with split args (xx,yy,zz)
  333. pieces = s.split(",")
  334. if len(pieces) > 1:
  335. return NombankSplitTreePointer(
  336. [NombankTreePointer.parse(elt) for elt in pieces]
  337. )
  338. # Deal with normal pointers.
  339. pieces = s.split(":")
  340. if len(pieces) != 2:
  341. raise ValueError("bad nombank pointer %r" % s)
  342. return NombankTreePointer(int(pieces[0]), int(pieces[1]))
  343. def __str__(self):
  344. return "%s:%s" % (self.wordnum, self.height)
  345. def __repr__(self):
  346. return "NombankTreePointer(%d, %d)" % (self.wordnum, self.height)
  347. def __eq__(self, other):
  348. while isinstance(other, (NombankChainTreePointer, NombankSplitTreePointer)):
  349. other = other.pieces[0]
  350. if not isinstance(other, NombankTreePointer):
  351. return self is other
  352. return self.wordnum == other.wordnum and self.height == other.height
  353. def __ne__(self, other):
  354. return not self == other
  355. def __lt__(self, other):
  356. while isinstance(other, (NombankChainTreePointer, NombankSplitTreePointer)):
  357. other = other.pieces[0]
  358. if not isinstance(other, NombankTreePointer):
  359. return id(self) < id(other)
  360. return (self.wordnum, -self.height) < (other.wordnum, -other.height)
  361. def select(self, tree):
  362. if tree is None:
  363. raise ValueError("Parse tree not avaialable")
  364. return tree[self.treepos(tree)]
  365. def treepos(self, tree):
  366. """
  367. Convert this pointer to a standard 'tree position' pointer,
  368. given that it points to the given tree.
  369. """
  370. if tree is None:
  371. raise ValueError("Parse tree not avaialable")
  372. stack = [tree]
  373. treepos = []
  374. wordnum = 0
  375. while True:
  376. # tree node:
  377. if isinstance(stack[-1], Tree):
  378. # Select the next child.
  379. if len(treepos) < len(stack):
  380. treepos.append(0)
  381. else:
  382. treepos[-1] += 1
  383. # Update the stack.
  384. if treepos[-1] < len(stack[-1]):
  385. stack.append(stack[-1][treepos[-1]])
  386. else:
  387. # End of node's child list: pop up a level.
  388. stack.pop()
  389. treepos.pop()
  390. # word node:
  391. else:
  392. if wordnum == self.wordnum:
  393. return tuple(treepos[: len(treepos) - self.height - 1])
  394. else:
  395. wordnum += 1
  396. stack.pop()