propbank.py 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534
  1. # Natural Language Toolkit: PropBank Corpus Reader
  2. #
  3. # Copyright (C) 2001-2020 NLTK Project
  4. # Author: Edward Loper <edloper@gmail.com>
  5. # URL: <http://nltk.org/>
  6. # For license information, see LICENSE.TXT
  7. import re
  8. from functools import total_ordering
  9. from xml.etree import ElementTree
  10. from nltk.tree import Tree
  11. from nltk.internals import raise_unorderable_types
  12. from nltk.corpus.reader.util import *
  13. from nltk.corpus.reader.api import *
  14. class PropbankCorpusReader(CorpusReader):
  15. """
  16. Corpus reader for the propbank corpus, which augments the Penn
  17. Treebank with information about the predicate argument structure
  18. of every verb instance. The corpus consists of two parts: the
  19. predicate-argument annotations themselves, and a set of "frameset
  20. files" which define the argument labels used by the annotations,
  21. on a per-verb basis. Each "frameset file" contains one or more
  22. predicates, such as ``'turn'`` or ``'turn_on'``, each of which is
  23. divided into coarse-grained word senses called "rolesets". For
  24. each "roleset", the frameset file provides descriptions of the
  25. argument roles, along with examples.
  26. """
  27. def __init__(
  28. self,
  29. root,
  30. propfile,
  31. framefiles="",
  32. verbsfile=None,
  33. parse_fileid_xform=None,
  34. parse_corpus=None,
  35. encoding="utf8",
  36. ):
  37. """
  38. :param root: The root directory for this corpus.
  39. :param propfile: The name of the file containing the predicate-
  40. argument annotations (relative to ``root``).
  41. :param framefiles: A list or regexp specifying the frameset
  42. fileids for this corpus.
  43. :param parse_fileid_xform: A transform that should be applied
  44. to the fileids in this corpus. This should be a function
  45. of one argument (a fileid) that returns a string (the new
  46. fileid).
  47. :param parse_corpus: The corpus containing the parse trees
  48. corresponding to this corpus. These parse trees are
  49. necessary to resolve the tree pointers used by propbank.
  50. """
  51. # If framefiles is specified as a regexp, expand it.
  52. if isinstance(framefiles, str):
  53. framefiles = find_corpus_fileids(root, framefiles)
  54. framefiles = list(framefiles)
  55. # Initialze the corpus reader.
  56. CorpusReader.__init__(self, root, [propfile, verbsfile] + framefiles, encoding)
  57. # Record our frame fileids & prop file.
  58. self._propfile = propfile
  59. self._framefiles = framefiles
  60. self._verbsfile = verbsfile
  61. self._parse_fileid_xform = parse_fileid_xform
  62. self._parse_corpus = parse_corpus
  63. def raw(self, fileids=None):
  64. """
  65. :return: the text contents of the given fileids, as a single string.
  66. """
  67. if fileids is None:
  68. fileids = self._fileids
  69. elif isinstance(fileids):
  70. fileids = [fileids]
  71. return concat([self.open(f).read() for f in fileids])
  72. def instances(self, baseform=None):
  73. """
  74. :return: a corpus view that acts as a list of
  75. ``PropBankInstance`` objects, one for each noun in the corpus.
  76. """
  77. kwargs = {}
  78. if baseform is not None:
  79. kwargs["instance_filter"] = lambda inst: inst.baseform == baseform
  80. return StreamBackedCorpusView(
  81. self.abspath(self._propfile),
  82. lambda stream: self._read_instance_block(stream, **kwargs),
  83. encoding=self.encoding(self._propfile),
  84. )
  85. def lines(self):
  86. """
  87. :return: a corpus view that acts as a list of strings, one for
  88. each line in the predicate-argument annotation file.
  89. """
  90. return StreamBackedCorpusView(
  91. self.abspath(self._propfile),
  92. read_line_block,
  93. encoding=self.encoding(self._propfile),
  94. )
  95. def roleset(self, roleset_id):
  96. """
  97. :return: the xml description for the given roleset.
  98. """
  99. baseform = roleset_id.split(".")[0]
  100. framefile = "frames/%s.xml" % baseform
  101. if framefile not in self._framefiles:
  102. raise ValueError("Frameset file for %s not found" % roleset_id)
  103. # n.b.: The encoding for XML fileids is specified by the file
  104. # itself; so we ignore self._encoding here.
  105. etree = ElementTree.parse(self.abspath(framefile).open()).getroot()
  106. for roleset in etree.findall("predicate/roleset"):
  107. if roleset.attrib["id"] == roleset_id:
  108. return roleset
  109. raise ValueError("Roleset %s not found in %s" % (roleset_id, framefile))
  110. def rolesets(self, baseform=None):
  111. """
  112. :return: list of xml descriptions for rolesets.
  113. """
  114. if baseform is not None:
  115. framefile = "frames/%s.xml" % baseform
  116. if framefile not in self._framefiles:
  117. raise ValueError("Frameset file for %s not found" % baseform)
  118. framefiles = [framefile]
  119. else:
  120. framefiles = self._framefiles
  121. rsets = []
  122. for framefile in framefiles:
  123. # n.b.: The encoding for XML fileids is specified by the file
  124. # itself; so we ignore self._encoding here.
  125. etree = ElementTree.parse(self.abspath(framefile).open()).getroot()
  126. rsets.append(etree.findall("predicate/roleset"))
  127. return LazyConcatenation(rsets)
  128. def verbs(self):
  129. """
  130. :return: a corpus view that acts as a list of all verb lemmas
  131. in this corpus (from the verbs.txt file).
  132. """
  133. return StreamBackedCorpusView(
  134. self.abspath(self._verbsfile),
  135. read_line_block,
  136. encoding=self.encoding(self._verbsfile),
  137. )
  138. def _read_instance_block(self, stream, instance_filter=lambda inst: True):
  139. block = []
  140. # Read 100 at a time.
  141. for i in range(100):
  142. line = stream.readline().strip()
  143. if line:
  144. inst = PropbankInstance.parse(
  145. line, self._parse_fileid_xform, self._parse_corpus
  146. )
  147. if instance_filter(inst):
  148. block.append(inst)
  149. return block
  150. ######################################################################
  151. # { Propbank Instance & related datatypes
  152. ######################################################################
  153. class PropbankInstance(object):
  154. def __init__(
  155. self,
  156. fileid,
  157. sentnum,
  158. wordnum,
  159. tagger,
  160. roleset,
  161. inflection,
  162. predicate,
  163. arguments,
  164. parse_corpus=None,
  165. ):
  166. self.fileid = fileid
  167. """The name of the file containing the parse tree for this
  168. instance's sentence."""
  169. self.sentnum = sentnum
  170. """The sentence number of this sentence within ``fileid``.
  171. Indexing starts from zero."""
  172. self.wordnum = wordnum
  173. """The word number of this instance's predicate within its
  174. containing sentence. Word numbers are indexed starting from
  175. zero, and include traces and other empty parse elements."""
  176. self.tagger = tagger
  177. """An identifier for the tagger who tagged this instance; or
  178. ``'gold'`` if this is an adjuticated instance."""
  179. self.roleset = roleset
  180. """The name of the roleset used by this instance's predicate.
  181. Use ``propbank.roleset() <PropbankCorpusReader.roleset>`` to
  182. look up information about the roleset."""
  183. self.inflection = inflection
  184. """A ``PropbankInflection`` object describing the inflection of
  185. this instance's predicate."""
  186. self.predicate = predicate
  187. """A ``PropbankTreePointer`` indicating the position of this
  188. instance's predicate within its containing sentence."""
  189. self.arguments = tuple(arguments)
  190. """A list of tuples (argloc, argid), specifying the location
  191. and identifier for each of the predicate's argument in the
  192. containing sentence. Argument identifiers are strings such as
  193. ``'ARG0'`` or ``'ARGM-TMP'``. This list does *not* contain
  194. the predicate."""
  195. self.parse_corpus = parse_corpus
  196. """A corpus reader for the parse trees corresponding to the
  197. instances in this propbank corpus."""
  198. @property
  199. def baseform(self):
  200. """The baseform of the predicate."""
  201. return self.roleset.split(".")[0]
  202. @property
  203. def sensenumber(self):
  204. """The sense number of the predicate."""
  205. return self.roleset.split(".")[1]
  206. @property
  207. def predid(self):
  208. """Identifier of the predicate."""
  209. return "rel"
  210. def __repr__(self):
  211. return "<PropbankInstance: %s, sent %s, word %s>" % (
  212. self.fileid,
  213. self.sentnum,
  214. self.wordnum,
  215. )
  216. def __str__(self):
  217. s = "%s %s %s %s %s %s" % (
  218. self.fileid,
  219. self.sentnum,
  220. self.wordnum,
  221. self.tagger,
  222. self.roleset,
  223. self.inflection,
  224. )
  225. items = self.arguments + ((self.predicate, "rel"),)
  226. for (argloc, argid) in sorted(items):
  227. s += " %s-%s" % (argloc, argid)
  228. return s
  229. def _get_tree(self):
  230. if self.parse_corpus is None:
  231. return None
  232. if self.fileid not in self.parse_corpus.fileids():
  233. return None
  234. return self.parse_corpus.parsed_sents(self.fileid)[self.sentnum]
  235. tree = property(
  236. _get_tree,
  237. doc="""
  238. The parse tree corresponding to this instance, or None if
  239. the corresponding tree is not available.""",
  240. )
  241. @staticmethod
  242. def parse(s, parse_fileid_xform=None, parse_corpus=None):
  243. pieces = s.split()
  244. if len(pieces) < 7:
  245. raise ValueError("Badly formatted propbank line: %r" % s)
  246. # Divide the line into its basic pieces.
  247. (fileid, sentnum, wordnum, tagger, roleset, inflection) = pieces[:6]
  248. rel = [p for p in pieces[6:] if p.endswith("-rel")]
  249. args = [p for p in pieces[6:] if not p.endswith("-rel")]
  250. if len(rel) != 1:
  251. raise ValueError("Badly formatted propbank line: %r" % s)
  252. # Apply the fileid selector, if any.
  253. if parse_fileid_xform is not None:
  254. fileid = parse_fileid_xform(fileid)
  255. # Convert sentence & word numbers to ints.
  256. sentnum = int(sentnum)
  257. wordnum = int(wordnum)
  258. # Parse the inflection
  259. inflection = PropbankInflection.parse(inflection)
  260. # Parse the predicate location.
  261. predicate = PropbankTreePointer.parse(rel[0][:-4])
  262. # Parse the arguments.
  263. arguments = []
  264. for arg in args:
  265. argloc, argid = arg.split("-", 1)
  266. arguments.append((PropbankTreePointer.parse(argloc), argid))
  267. # Put it all together.
  268. return PropbankInstance(
  269. fileid,
  270. sentnum,
  271. wordnum,
  272. tagger,
  273. roleset,
  274. inflection,
  275. predicate,
  276. arguments,
  277. parse_corpus,
  278. )
  279. class PropbankPointer(object):
  280. """
  281. A pointer used by propbank to identify one or more constituents in
  282. a parse tree. ``PropbankPointer`` is an abstract base class with
  283. three concrete subclasses:
  284. - ``PropbankTreePointer`` is used to point to single constituents.
  285. - ``PropbankSplitTreePointer`` is used to point to 'split'
  286. constituents, which consist of a sequence of two or more
  287. ``PropbankTreePointer`` pointers.
  288. - ``PropbankChainTreePointer`` is used to point to entire trace
  289. chains in a tree. It consists of a sequence of pieces, which
  290. can be ``PropbankTreePointer`` or ``PropbankSplitTreePointer`` pointers.
  291. """
  292. def __init__(self):
  293. if self.__class__ == PropbankPointer:
  294. raise NotImplementedError()
  295. class PropbankChainTreePointer(PropbankPointer):
  296. def __init__(self, pieces):
  297. self.pieces = pieces
  298. """A list of the pieces that make up this chain. Elements may
  299. be either ``PropbankSplitTreePointer`` or
  300. ``PropbankTreePointer`` pointers."""
  301. def __str__(self):
  302. return "*".join("%s" % p for p in self.pieces)
  303. def __repr__(self):
  304. return "<PropbankChainTreePointer: %s>" % self
  305. def select(self, tree):
  306. if tree is None:
  307. raise ValueError("Parse tree not avaialable")
  308. return Tree("*CHAIN*", [p.select(tree) for p in self.pieces])
  309. class PropbankSplitTreePointer(PropbankPointer):
  310. def __init__(self, pieces):
  311. self.pieces = pieces
  312. """A list of the pieces that make up this chain. Elements are
  313. all ``PropbankTreePointer`` pointers."""
  314. def __str__(self):
  315. return ",".join("%s" % p for p in self.pieces)
  316. def __repr__(self):
  317. return "<PropbankSplitTreePointer: %s>" % self
  318. def select(self, tree):
  319. if tree is None:
  320. raise ValueError("Parse tree not avaialable")
  321. return Tree("*SPLIT*", [p.select(tree) for p in self.pieces])
  322. @total_ordering
  323. class PropbankTreePointer(PropbankPointer):
  324. """
  325. wordnum:height*wordnum:height*...
  326. wordnum:height,
  327. """
  328. def __init__(self, wordnum, height):
  329. self.wordnum = wordnum
  330. self.height = height
  331. @staticmethod
  332. def parse(s):
  333. # Deal with chains (xx*yy*zz)
  334. pieces = s.split("*")
  335. if len(pieces) > 1:
  336. return PropbankChainTreePointer(
  337. [PropbankTreePointer.parse(elt) for elt in pieces]
  338. )
  339. # Deal with split args (xx,yy,zz)
  340. pieces = s.split(",")
  341. if len(pieces) > 1:
  342. return PropbankSplitTreePointer(
  343. [PropbankTreePointer.parse(elt) for elt in pieces]
  344. )
  345. # Deal with normal pointers.
  346. pieces = s.split(":")
  347. if len(pieces) != 2:
  348. raise ValueError("bad propbank pointer %r" % s)
  349. return PropbankTreePointer(int(pieces[0]), int(pieces[1]))
  350. def __str__(self):
  351. return "%s:%s" % (self.wordnum, self.height)
  352. def __repr__(self):
  353. return "PropbankTreePointer(%d, %d)" % (self.wordnum, self.height)
  354. def __eq__(self, other):
  355. while isinstance(other, (PropbankChainTreePointer, PropbankSplitTreePointer)):
  356. other = other.pieces[0]
  357. if not isinstance(other, PropbankTreePointer):
  358. return self is other
  359. return self.wordnum == other.wordnum and self.height == other.height
  360. def __ne__(self, other):
  361. return not self == other
  362. def __lt__(self, other):
  363. while isinstance(other, (PropbankChainTreePointer, PropbankSplitTreePointer)):
  364. other = other.pieces[0]
  365. if not isinstance(other, PropbankTreePointer):
  366. return id(self) < id(other)
  367. return (self.wordnum, -self.height) < (other.wordnum, -other.height)
  368. def select(self, tree):
  369. if tree is None:
  370. raise ValueError("Parse tree not avaialable")
  371. return tree[self.treepos(tree)]
  372. def treepos(self, tree):
  373. """
  374. Convert this pointer to a standard 'tree position' pointer,
  375. given that it points to the given tree.
  376. """
  377. if tree is None:
  378. raise ValueError("Parse tree not avaialable")
  379. stack = [tree]
  380. treepos = []
  381. wordnum = 0
  382. while True:
  383. # tree node:
  384. if isinstance(stack[-1], Tree):
  385. # Select the next child.
  386. if len(treepos) < len(stack):
  387. treepos.append(0)
  388. else:
  389. treepos[-1] += 1
  390. # Update the stack.
  391. if treepos[-1] < len(stack[-1]):
  392. stack.append(stack[-1][treepos[-1]])
  393. else:
  394. # End of node's child list: pop up a level.
  395. stack.pop()
  396. treepos.pop()
  397. # word node:
  398. else:
  399. if wordnum == self.wordnum:
  400. return tuple(treepos[: len(treepos) - self.height - 1])
  401. else:
  402. wordnum += 1
  403. stack.pop()
  404. class PropbankInflection(object):
  405. # { Inflection Form
  406. INFINITIVE = "i"
  407. GERUND = "g"
  408. PARTICIPLE = "p"
  409. FINITE = "v"
  410. # { Inflection Tense
  411. FUTURE = "f"
  412. PAST = "p"
  413. PRESENT = "n"
  414. # { Inflection Aspect
  415. PERFECT = "p"
  416. PROGRESSIVE = "o"
  417. PERFECT_AND_PROGRESSIVE = "b"
  418. # { Inflection Person
  419. THIRD_PERSON = "3"
  420. # { Inflection Voice
  421. ACTIVE = "a"
  422. PASSIVE = "p"
  423. # { Inflection
  424. NONE = "-"
  425. # }
  426. def __init__(self, form="-", tense="-", aspect="-", person="-", voice="-"):
  427. self.form = form
  428. self.tense = tense
  429. self.aspect = aspect
  430. self.person = person
  431. self.voice = voice
  432. def __str__(self):
  433. return self.form + self.tense + self.aspect + self.person + self.voice
  434. def __repr__(self):
  435. return "<PropbankInflection: %s>" % self
  436. _VALIDATE = re.compile(r"[igpv\-][fpn\-][pob\-][3\-][ap\-]$")
  437. @staticmethod
  438. def parse(s):
  439. if not isinstance(s, str):
  440. raise TypeError("expected a string")
  441. if len(s) != 5 or not PropbankInflection._VALIDATE.match(s):
  442. raise ValueError("Bad propbank inflection string %r" % s)
  443. return PropbankInflection(*s)