| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534 |
- # Natural Language Toolkit: PropBank Corpus Reader
- #
- # Copyright (C) 2001-2020 NLTK Project
- # Author: Edward Loper <edloper@gmail.com>
- # URL: <http://nltk.org/>
- # For license information, see LICENSE.TXT
- import re
- from functools import total_ordering
- from xml.etree import ElementTree
- from nltk.tree import Tree
- from nltk.internals import raise_unorderable_types
- from nltk.corpus.reader.util import *
- from nltk.corpus.reader.api import *
class PropbankCorpusReader(CorpusReader):
    """
    Corpus reader for the propbank corpus, which augments the Penn
    Treebank with information about the predicate argument structure
    of every verb instance.  The corpus consists of two parts: the
    predicate-argument annotations themselves, and a set of "frameset
    files" which define the argument labels used by the annotations,
    on a per-verb basis.  Each "frameset file" contains one or more
    predicates, such as ``'turn'`` or ``'turn_on'``, each of which is
    divided into coarse-grained word senses called "rolesets".  For
    each "roleset", the frameset file provides descriptions of the
    argument roles, along with examples.
    """

    def __init__(
        self,
        root,
        propfile,
        framefiles="",
        verbsfile=None,
        parse_fileid_xform=None,
        parse_corpus=None,
        encoding="utf8",
    ):
        """
        :param root: The root directory for this corpus.
        :param propfile: The name of the file containing the predicate-
            argument annotations (relative to ``root``).
        :param framefiles: A list or regexp specifying the frameset
            fileids for this corpus.
        :param verbsfile: The name of the file whose lines are exposed
            by ``verbs()``.
        :param parse_fileid_xform: A transform that should be applied
            to the fileids in this corpus.  This should be a function
            of one argument (a fileid) that returns a string (the new
            fileid).
        :param parse_corpus: The corpus containing the parse trees
            corresponding to this corpus.  These parse trees are
            necessary to resolve the tree pointers used by propbank.
        :param encoding: The encoding handed on to ``CorpusReader``.
        """
        # If framefiles is specified as a regexp, expand it.
        if isinstance(framefiles, str):
            framefiles = find_corpus_fileids(root, framefiles)
        framefiles = list(framefiles)
        # Initialize the corpus reader.
        CorpusReader.__init__(self, root, [propfile, verbsfile] + framefiles, encoding)

        # Record our frame fileids & prop file.
        self._propfile = propfile
        self._framefiles = framefiles
        self._verbsfile = verbsfile
        self._parse_fileid_xform = parse_fileid_xform
        self._parse_corpus = parse_corpus

    def raw(self, fileids=None):
        """
        :param fileids: A single fileid, a list of fileids, or None to
            use every fileid of this reader.
        :return: the text contents of the given fileids, as a single string.
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, str):
            # A lone fileid is treated as a one-element list.
            fileids = [fileids]
        contents = []
        for f in fileids:
            # Close each stream as soon as it has been read.
            with self.open(f) as fp:
                contents.append(fp.read())
        return concat(contents)

    def instances(self, baseform=None):
        """
        :param baseform: If given, only instances whose ``baseform``
            equals this value are included.
        :return: a corpus view that acts as a list of
            ``PropbankInstance`` objects, one for each annotated
            predicate instance in the corpus.
        """
        kwargs = {}
        if baseform is not None:
            kwargs["instance_filter"] = lambda inst: inst.baseform == baseform
        return StreamBackedCorpusView(
            self.abspath(self._propfile),
            lambda stream: self._read_instance_block(stream, **kwargs),
            encoding=self.encoding(self._propfile),
        )

    def lines(self):
        """
        :return: a corpus view that acts as a list of strings, one for
            each line in the predicate-argument annotation file.
        """
        return StreamBackedCorpusView(
            self.abspath(self._propfile),
            read_line_block,
            encoding=self.encoding(self._propfile),
        )

    def _frameset_root(self, framefile):
        """
        Parse the given frameset file and return its root XML element.
        The underlying stream is closed once parsing has finished.
        """
        # n.b.: The encoding for XML fileids is specified by the file
        # itself; so we ignore self._encoding here.
        with self.abspath(framefile).open() as fp:
            return ElementTree.parse(fp).getroot()

    def roleset(self, roleset_id):
        """
        :param roleset_id: A roleset identifier; the part before the
            first ``'.'`` selects the frameset file.
        :return: the xml description for the given roleset.
        :raise ValueError: if the frameset file is not one of this
            corpus's frameset fileids, or if it contains no roleset
            with the given id.
        """
        baseform = roleset_id.split(".")[0]
        framefile = "frames/%s.xml" % baseform
        if framefile not in self._framefiles:
            raise ValueError("Frameset file for %s not found" % roleset_id)

        etree = self._frameset_root(framefile)
        for roleset in etree.findall("predicate/roleset"):
            if roleset.attrib["id"] == roleset_id:
                return roleset
        raise ValueError("Roleset %s not found in %s" % (roleset_id, framefile))

    def rolesets(self, baseform=None):
        """
        :param baseform: If given, only the frameset file for this
            baseform is consulted; otherwise all frameset files are.
        :return: list of xml descriptions for rolesets.
        :raise ValueError: if ``baseform`` is given but its frameset
            file is not one of this corpus's frameset fileids.
        """
        if baseform is not None:
            framefile = "frames/%s.xml" % baseform
            if framefile not in self._framefiles:
                raise ValueError("Frameset file for %s not found" % baseform)
            framefiles = [framefile]
        else:
            framefiles = self._framefiles

        rsets = []
        for framefile in framefiles:
            etree = self._frameset_root(framefile)
            rsets.append(etree.findall("predicate/roleset"))
        return LazyConcatenation(rsets)

    def verbs(self):
        """
        :return: a corpus view that acts as a list of all verb lemmas
            in this corpus (from the verbs.txt file).
        """
        return StreamBackedCorpusView(
            self.abspath(self._verbsfile),
            read_line_block,
            encoding=self.encoding(self._verbsfile),
        )

    def _read_instance_block(self, stream, instance_filter=lambda inst: True):
        """
        Read up to 100 lines from ``stream`` and return the list of
        ``PropbankInstance`` objects parsed from the non-blank ones
        that satisfy ``instance_filter``.
        """
        block = []

        # Read 100 at a time.
        for _ in range(100):
            line = stream.readline().strip()
            if line:
                inst = PropbankInstance.parse(
                    line, self._parse_fileid_xform, self._parse_corpus
                )
                if instance_filter(inst):
                    block.append(inst)

        return block
- ######################################################################
- # { Propbank Instance & related datatypes
- ######################################################################
class PropbankInstance(object):
    """
    One line of the propbank annotation file: a predicate occurrence
    together with the location and label of each of its arguments.
    """

    def __init__(
        self,
        fileid,
        sentnum,
        wordnum,
        tagger,
        roleset,
        inflection,
        predicate,
        arguments,
        parse_corpus=None,
    ):
        self.fileid = fileid
        """Name of the file holding the parse tree for this instance's
        sentence."""

        self.sentnum = sentnum
        """Zero-based number of this sentence within ``fileid``."""

        self.wordnum = wordnum
        """Zero-based word number of the predicate within its sentence;
        traces and other empty parse elements are counted."""

        self.tagger = tagger
        """Identifier of the tagger who tagged this instance, or
        ``'gold'`` for an adjudicated instance."""

        self.roleset = roleset
        """Name of the roleset used by this instance's predicate.  Use
        ``propbank.roleset() <PropbankCorpusReader.roleset>`` to look
        up information about the roleset."""

        self.inflection = inflection
        """A ``PropbankInflection`` describing the inflection of this
        instance's predicate."""

        self.predicate = predicate
        """A ``PropbankTreePointer`` giving the position of the
        predicate within its containing sentence."""

        self.arguments = tuple(arguments)
        """A tuple of ``(argloc, argid)`` pairs giving the location and
        identifier of each argument of the predicate.  Argument
        identifiers are strings such as ``'ARG0'`` or ``'ARGM-TMP'``.
        The predicate itself is *not* included."""

        self.parse_corpus = parse_corpus
        """A corpus reader for the parse trees corresponding to the
        instances in this propbank corpus."""

    @property
    def baseform(self):
        """The baseform of the predicate (roleset text before the first dot)."""
        return self.roleset.split(".")[0]

    @property
    def sensenumber(self):
        """The sense number of the predicate (second dot-separated field)."""
        return self.roleset.split(".")[1]

    @property
    def predid(self):
        """Identifier of the predicate."""
        return "rel"

    def __repr__(self):
        location = (self.fileid, self.sentnum, self.wordnum)
        return "<PropbankInstance: %s, sent %s, word %s>" % location

    def __str__(self):
        # Fixed leading fields, in annotation-file order.
        header = "%s %s %s %s %s %s" % (
            self.fileid,
            self.sentnum,
            self.wordnum,
            self.tagger,
            self.roleset,
            self.inflection,
        )
        # The predicate is listed among the arguments under the label "rel".
        labelled = sorted(self.arguments + ((self.predicate, "rel"),))
        trailer = "".join(" %s-%s" % (argloc, argid) for (argloc, argid) in labelled)
        return header + trailer

    def _get_tree(self):
        corpus = self.parse_corpus
        if corpus is None or self.fileid not in corpus.fileids():
            return None
        return corpus.parsed_sents(self.fileid)[self.sentnum]

    tree = property(
        _get_tree,
        doc="""
        The parse tree corresponding to this instance, or None if
        the corresponding tree is not available.""",
    )

    @staticmethod
    def parse(s, parse_fileid_xform=None, parse_corpus=None):
        """
        Build a ``PropbankInstance`` from one whitespace-separated
        annotation line.

        :param s: The annotation line.
        :param parse_fileid_xform: Optional one-argument function
            applied to the fileid field.
        :param parse_corpus: Stored on the resulting instance.
        :raise ValueError: if the line has fewer than seven fields or
            does not contain exactly one ``-rel`` field.
        """
        fields = s.split()
        if len(fields) < 7:
            raise ValueError("Badly formatted propbank line: %r" % s)

        # The first six fields are fixed; everything after is a pointer-label pair.
        fileid, sentnum, wordnum, tagger, roleset, inflection = fields[:6]
        rel_fields = []
        arg_fields = []
        for field in fields[6:]:
            if field.endswith("-rel"):
                rel_fields.append(field)
            else:
                arg_fields.append(field)
        if len(rel_fields) != 1:
            raise ValueError("Badly formatted propbank line: %r" % s)

        # Apply the fileid selector, if any.
        if parse_fileid_xform is not None:
            fileid = parse_fileid_xform(fileid)

        # Sentence & word numbers are integers.
        sentnum = int(sentnum)
        wordnum = int(wordnum)

        inflection = PropbankInflection.parse(inflection)

        # Drop the trailing "-rel" to obtain the predicate's pointer text.
        predicate = PropbankTreePointer.parse(rel_fields[0][:-4])

        # Each argument is "<pointer>-<label>"; split on the first dash only.
        arguments = [
            (PropbankTreePointer.parse(argloc), argid)
            for argloc, argid in (field.split("-", 1) for field in arg_fields)
        ]

        return PropbankInstance(
            fileid,
            sentnum,
            wordnum,
            tagger,
            roleset,
            inflection,
            predicate,
            arguments,
            parse_corpus,
        )
class PropbankPointer(object):
    """
    Abstract base class for the pointers propbank uses to identify one
    or more constituents of a parse tree.  It has three concrete
    subclasses:

      - ``PropbankTreePointer`` points to a single constituent.

      - ``PropbankSplitTreePointer`` points to a 'split' constituent,
        made up of a sequence of two or more ``PropbankTreePointer``
        pointers.

      - ``PropbankChainTreePointer`` points to an entire trace chain
        in a tree.  It is made up of a sequence of pieces, each a
        ``PropbankTreePointer`` or a ``PropbankSplitTreePointer``.
    """

    def __init__(self):
        # Only the base class itself refuses to be instantiated.
        if type(self) is PropbankPointer:
            raise NotImplementedError()
class PropbankChainTreePointer(PropbankPointer):
    """A pointer to a trace chain, written as pieces joined by ``*``."""

    def __init__(self, pieces):
        self.pieces = pieces
        """The pieces that make up this chain, in order.  Each element
        is either a ``PropbankSplitTreePointer`` or a
        ``PropbankTreePointer``."""

    def __str__(self):
        return "*".join(str(piece) for piece in self.pieces)

    def __repr__(self):
        return "<PropbankChainTreePointer: %s>" % self

    def select(self, tree):
        """
        Return a ``*CHAIN*`` tree whose children are the constituents
        each piece selects from ``tree``.

        :raise ValueError: if ``tree`` is None.
        """
        if tree is None:
            raise ValueError("Parse tree not avaialable")
        selected = []
        for piece in self.pieces:
            selected.append(piece.select(tree))
        return Tree("*CHAIN*", selected)
class PropbankSplitTreePointer(PropbankPointer):
    """A pointer to a split constituent, written as pieces joined by ``,``."""

    def __init__(self, pieces):
        self.pieces = pieces
        """The pieces that make up this split constituent, in order.
        Every element is a ``PropbankTreePointer``."""

    def __str__(self):
        return ",".join(str(piece) for piece in self.pieces)

    def __repr__(self):
        return "<PropbankSplitTreePointer: %s>" % self

    def select(self, tree):
        """
        Return a ``*SPLIT*`` tree whose children are the constituents
        each piece selects from ``tree``.

        :raise ValueError: if ``tree`` is None.
        """
        if tree is None:
            raise ValueError("Parse tree not avaialable")
        selected = []
        for piece in self.pieces:
            selected.append(piece.select(tree))
        return Tree("*SPLIT*", selected)
@total_ordering
class PropbankTreePointer(PropbankPointer):
    """
    A pointer to a single constituent, written ``wordnum:height``.

    The ``parse`` method also accepts the composite forms
    ``wordnum:height*wordnum:height*...`` (a chain) and
    ``wordnum:height,wordnum:height,...`` (a split constituent), for
    which it returns the corresponding composite pointer instead.
    """

    def __init__(self, wordnum, height):
        self.wordnum = wordnum
        self.height = height

    @staticmethod
    def parse(s):
        """
        Parse pointer text into a pointer object.

        :return: a ``PropbankChainTreePointer`` if ``s`` contains
            ``*``, else a ``PropbankSplitTreePointer`` if it contains
            ``,``, else a ``PropbankTreePointer``.
        :raise ValueError: if a simple pointer is not of the form
            ``wordnum:height``.
        """
        # Composite forms: chains (xx*yy*zz) take precedence over
        # split args (xx,yy,zz).
        composites = (
            ("*", PropbankChainTreePointer),
            (",", PropbankSplitTreePointer),
        )
        for separator, pointer_class in composites:
            parts = s.split(separator)
            if len(parts) > 1:
                return pointer_class([PropbankTreePointer.parse(part) for part in parts])

        # Normal pointers.
        fields = s.split(":")
        if len(fields) != 2:
            raise ValueError("bad propbank pointer %r" % s)
        wordnum, height = fields
        return PropbankTreePointer(int(wordnum), int(height))

    def __str__(self):
        return "%s:%s" % (self.wordnum, self.height)

    def __repr__(self):
        return "PropbankTreePointer(%d, %d)" % (self.wordnum, self.height)

    @staticmethod
    def _leading_piece(pointer):
        """Descend through chain/split pointers to their first piece."""
        while isinstance(pointer, (PropbankChainTreePointer, PropbankSplitTreePointer)):
            pointer = pointer.pieces[0]
        return pointer

    def __eq__(self, other):
        # Composite pointers are compared through their first piece.
        other = self._leading_piece(other)
        if isinstance(other, PropbankTreePointer):
            return self.wordnum == other.wordnum and self.height == other.height
        return self is other

    def __ne__(self, other):
        return not (self == other)

    def __lt__(self, other):
        # Composite pointers are compared through their first piece.
        other = self._leading_piece(other)
        if isinstance(other, PropbankTreePointer):
            # Earlier words sort first; at the same word, greater height sorts first.
            return (self.wordnum, -self.height) < (other.wordnum, -other.height)
        return id(self) < id(other)

    def select(self, tree):
        """
        Return the constituent of ``tree`` that this pointer designates.

        :raise ValueError: if ``tree`` is None.
        """
        if tree is None:
            raise ValueError("Parse tree not avaialable")
        position = self.treepos(tree)
        return tree[position]

    def treepos(self, tree):
        """
        Convert this pointer to a standard 'tree position' pointer,
        given that it points to the given tree.

        :raise ValueError: if ``tree`` is None.
        """
        if tree is None:
            raise ValueError("Parse tree not avaialable")

        # Depth-first walk: ``node_stack`` holds the nodes on the
        # current path, ``path`` the child index taken at each level.
        node_stack = [tree]
        path = []
        leaves_seen = 0

        while True:
            current = node_stack[-1]
            if isinstance(current, Tree):
                # Tree node: select the next child.
                if len(path) < len(node_stack):
                    path.append(0)
                else:
                    path[-1] += 1
                if path[-1] < len(current):
                    # Descend into that child.
                    node_stack.append(current[path[-1]])
                else:
                    # End of node's child list: pop up a level.
                    node_stack.pop()
                    path.pop()
            else:
                # Word node.
                if leaves_seen == self.wordnum:
                    # Drop the leaf's own index plus ``height`` further levels.
                    return tuple(path[: len(path) - self.height - 1])
                leaves_seen += 1
                node_stack.pop()
class PropbankInflection(object):
    """
    The five-character inflection field of a propbank instance: one
    character each for form, tense, aspect, person and voice, with
    ``'-'`` standing for "none".
    """

    # { Inflection Form
    INFINITIVE = "i"
    GERUND = "g"
    PARTICIPLE = "p"
    FINITE = "v"
    # { Inflection Tense
    FUTURE = "f"
    PAST = "p"
    PRESENT = "n"
    # { Inflection Aspect
    PERFECT = "p"
    PROGRESSIVE = "o"
    PERFECT_AND_PROGRESSIVE = "b"
    # { Inflection Person
    THIRD_PERSON = "3"
    # { Inflection Voice
    ACTIVE = "a"
    PASSIVE = "p"
    # { Inflection
    NONE = "-"
    # }

    def __init__(self, form="-", tense="-", aspect="-", person="-", voice="-"):
        self.form = form
        self.tense = tense
        self.aspect = aspect
        self.person = person
        self.voice = voice

    def __str__(self):
        # Fields are concatenated in the same order that parse() reads them.
        text = self.form + self.tense
        text = text + self.aspect
        text = text + self.person
        return text + self.voice

    def __repr__(self):
        return "<PropbankInflection: %s>" % self

    # One character class per field: form, tense, aspect, person, voice.
    _VALIDATE = re.compile(r"[igpv\-][fpn\-][pob\-][3\-][ap\-]$")

    @staticmethod
    def parse(s):
        """
        Build a ``PropbankInflection`` from its five-character text.

        :raise TypeError: if ``s`` is not a string.
        :raise ValueError: if ``s`` is not exactly five characters
            long or contains a character not allowed in its position.
        """
        if not isinstance(s, str):
            raise TypeError("expected a string")
        if len(s) == 5 and PropbankInflection._VALIDATE.match(s):
            form, tense, aspect, person, voice = s
            return PropbankInflection(form, tense, aspect, person, voice)
        raise ValueError("Bad propbank inflection string %r" % s)
|