ppattach.py 3.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102
  1. # Natural Language Toolkit: PP Attachment Corpus Reader
  2. #
  3. # Copyright (C) 2001-2020 NLTK Project
  4. # Author: Steven Bird <stevenbird1@gmail.com>
  5. # Edward Loper <edloper@gmail.com>
  6. # URL: <http://nltk.org/>
  7. # For license information, see LICENSE.TXT
  8. """
Read lines from the Prepositional Phrase Attachment Corpus.

The PP Attachment Corpus contains several files having the format:

    sentence_id verb noun1 preposition noun2 attachment

For example:

    42960 gives authority to administration V
    46742 gives inventors of microchip N

The PP attachment is to the verb phrase (V) or noun phrase (N), i.e.:

    (VP gives (NP authority) (PP to administration))
    (VP gives (NP inventors (PP of microchip)))

The corpus contains the following files:

    training:   training set
    devset:     development test set, used for algorithm development.
    test:       test set, used to report results
    bitstrings: word classes derived from Mutual Information Clustering
                for the Wall Street Journal.

Ratnaparkhi, Adwait (1994).  A Maximum Entropy Model for Prepositional
Phrase Attachment.  Proceedings of the ARPA Human Language Technology
Conference.  [http://www.cis.upenn.edu/~adwait/papers/hlt94.ps]

The PP Attachment Corpus is distributed with NLTK with the permission
of the author.
  28. """
  29. from nltk.corpus.reader.util import *
  30. from nltk.corpus.reader.api import *
  31. class PPAttachment(object):
  32. def __init__(self, sent, verb, noun1, prep, noun2, attachment):
  33. self.sent = sent
  34. self.verb = verb
  35. self.noun1 = noun1
  36. self.prep = prep
  37. self.noun2 = noun2
  38. self.attachment = attachment
  39. def __repr__(self):
  40. return (
  41. "PPAttachment(sent=%r, verb=%r, noun1=%r, prep=%r, "
  42. "noun2=%r, attachment=%r)"
  43. % (self.sent, self.verb, self.noun1, self.prep, self.noun2, self.attachment)
  44. )
  45. class PPAttachmentCorpusReader(CorpusReader):
  46. """
  47. sentence_id verb noun1 preposition noun2 attachment
  48. """
  49. def attachments(self, fileids):
  50. return concat(
  51. [
  52. StreamBackedCorpusView(fileid, self._read_obj_block, encoding=enc)
  53. for (fileid, enc) in self.abspaths(fileids, True)
  54. ]
  55. )
  56. def tuples(self, fileids):
  57. return concat(
  58. [
  59. StreamBackedCorpusView(fileid, self._read_tuple_block, encoding=enc)
  60. for (fileid, enc) in self.abspaths(fileids, True)
  61. ]
  62. )
  63. def raw(self, fileids=None):
  64. if fileids is None:
  65. fileids = self._fileids
  66. elif isinstance(fileids, str):
  67. fileids = [fileids]
  68. return concat([self.open(f).read() for f in fileids])
  69. def _read_tuple_block(self, stream):
  70. line = stream.readline()
  71. if line:
  72. return [tuple(line.split())]
  73. else:
  74. return []
  75. def _read_obj_block(self, stream):
  76. line = stream.readline()
  77. if line:
  78. return [PPAttachment(*line.split())]
  79. else:
  80. return []