| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102 |
- # Natural Language Toolkit: PP Attachment Corpus Reader
- #
- # Copyright (C) 2001-2020 NLTK Project
- # Author: Steven Bird <stevenbird1@gmail.com>
- # Edward Loper <edloper@gmail.com>
- # URL: <http://nltk.org/>
- # For license information, see LICENSE.TXT
- """
- Read lines from the Prepositional Phrase Attachment Corpus.
- The PP Attachment Corpus contains several files having the format:
- sentence_id verb noun1 preposition noun2 attachment
- For example:
- 42960 gives authority to administration V
- 46742 gives inventors of microchip N
- The PP attachment is to the verb phrase (V) or noun phrase (N), i.e.:
- (VP gives (NP authority) (PP to administration))
- (VP gives (NP inventors (PP of microchip)))
- The corpus contains the following files:
- training: training set
- devset: development test set, used for algorithm development.
- test: test set, used to report results
- bitstrings: word classes derived from Mutual Information Clustering for the Wall Street Journal.
- Ratnaparkhi, Adwait (1994). A Maximum Entropy Model for Prepositional
- Phrase Attachment. Proceedings of the ARPA Human Language Technology
- Conference. [http://www.cis.upenn.edu/~adwait/papers/hlt94.ps]
- The PP Attachment Corpus is distributed with NLTK with the permission
- of the author.
- """
- from nltk.corpus.reader.util import *
- from nltk.corpus.reader.api import *
class PPAttachment(object):
    """
    One entry of the PP Attachment Corpus.

    The six attributes mirror the six columns of a corpus line
    (``sentence_id verb noun1 preposition noun2 attachment``) and are
    stored exactly as they are passed in; no conversion or validation
    is performed.

    :ivar sent: the sentence identifier
    :ivar verb: the verb
    :ivar noun1: the first noun (head of the object noun phrase)
    :ivar prep: the preposition
    :ivar noun2: the second noun (inside the prepositional phrase)
    :ivar attachment: the attachment label; in the corpus this is
        ``V`` (verb phrase) or ``N`` (noun phrase)
    """

    def __init__(self, sent, verb, noun1, prep, noun2, attachment):
        self.sent, self.verb, self.noun1 = sent, verb, noun1
        self.prep, self.noun2, self.attachment = prep, noun2, attachment

    def __repr__(self):
        """Return a constructor-like string showing the ``repr`` of every field."""
        template = (
            "PPAttachment(sent=%r, verb=%r, noun1=%r, prep=%r, "
            "noun2=%r, attachment=%r)"
        )
        values = (
            self.sent,
            self.verb,
            self.noun1,
            self.prep,
            self.noun2,
            self.attachment,
        )
        return template % values
class PPAttachmentCorpusReader(CorpusReader):
    """
    Corpus reader for the Prepositional Phrase Attachment Corpus.

    Every line of a corpus file is split on whitespace and is expected
    to hold six fields::

        sentence_id verb noun1 preposition noun2 attachment
    """

    def attachments(self, fileids):
        """
        Return the entries of the given file(s) as ``PPAttachment`` objects.

        :param fileids: the file identifier(s) to read; passed unchanged
            to ``self.abspaths``.
        :return: the ``concat`` of one ``StreamBackedCorpusView`` per
            file, each of which reads its file with ``_read_obj_block``.
        """
        return concat(
            [
                StreamBackedCorpusView(fileid, self._read_obj_block, encoding=enc)
                for (fileid, enc) in self.abspaths(fileids, True)
            ]
        )

    def tuples(self, fileids):
        """
        Return the entries of the given file(s) as tuples of strings.

        :param fileids: the file identifier(s) to read; passed unchanged
            to ``self.abspaths``.
        :return: the ``concat`` of one ``StreamBackedCorpusView`` per
            file, each of which reads its file with ``_read_tuple_block``.
        """
        return concat(
            [
                StreamBackedCorpusView(fileid, self._read_tuple_block, encoding=enc)
                for (fileid, enc) in self.abspaths(fileids, True)
            ]
        )

    def raw(self, fileids=None):
        """
        Return the unparsed contents of the given file(s).

        :param fileids: a single file identifier, an iterable of file
            identifiers, or None to read every file in ``self._fileids``.
        :return: the ``concat`` of the contents of each file, in the
            order the identifiers were given.
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, str):
            fileids = [fileids]

        contents = []
        for f in fileids:
            stream = self.open(f)
            # Close every stream explicitly, even if the read fails, rather
            # than leaving the open file handle to the garbage collector.
            try:
                contents.append(stream.read())
            finally:
                stream.close()
        return concat(contents)

    def _read_tuple_block(self, stream):
        """
        Read a single line from ``stream`` as one block.

        :return: a one-element list holding the tuple of the line's
            whitespace-separated fields, or an empty list when
            ``readline`` returns nothing (end of stream).
        """
        line = stream.readline()
        if not line:
            return []
        return [tuple(line.split())]

    def _read_obj_block(self, stream):
        """
        Read a single line from ``stream`` as one block.

        The whitespace-separated fields of the line are passed
        positionally to ``PPAttachment``, so the line must hold exactly
        six fields; any other count (including a blank line) makes the
        constructor raise ``TypeError``.

        :return: a one-element list holding the ``PPAttachment`` built
            from the line, or an empty list when ``readline`` returns
            nothing (end of stream).
        """
        line = stream.readline()
        if not line:
            return []
        return [PPAttachment(*line.split())]
|