aligned.py

# Natural Language Toolkit: Aligned Corpus Reader
#
# Copyright (C) 2001-2020 NLTK Project
# URL: <http://nltk.org/>
# Author: Steven Bird <stevenbird1@gmail.com>
# For license information, see LICENSE.TXT

from nltk.tokenize import WhitespaceTokenizer, RegexpTokenizer
from nltk.translate import AlignedSent, Alignment
from nltk.corpus.reader.api import CorpusReader
from nltk.corpus.reader.util import (
    StreamBackedCorpusView,
    concat,
    read_alignedsent_block,
)


class AlignedCorpusReader(CorpusReader):
    """
    Reader for corpora of word-aligned sentences. Tokens are assumed
    to be separated by whitespace. Sentences begin on separate lines.
    """

    def __init__(
        self,
        root,
        fileids,
        sep="/",
        word_tokenizer=WhitespaceTokenizer(),
        sent_tokenizer=RegexpTokenizer("\n", gaps=True),
        alignedsent_block_reader=read_alignedsent_block,
        encoding="latin1",
    ):
        """
        Construct a new Aligned Corpus reader for a set of documents
        located at the given root directory. Example usage:

            >>> root = '/...path to corpus.../'
            >>> reader = AlignedCorpusReader(root, r'.*\.txt') # doctest: +SKIP

        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        """
        CorpusReader.__init__(self, root, fileids, encoding)
        self._sep = sep
        self._word_tokenizer = word_tokenizer
        self._sent_tokenizer = sent_tokenizer
        self._alignedsent_block_reader = alignedsent_block_reader

    def raw(self, fileids=None):
        """
        :return: the given file(s) as a single string.
        :rtype: str
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, str):
            fileids = [fileids]
        return concat([self.open(f).read() for f in fileids])

    def words(self, fileids=None):
        """
        :return: the given file(s) as a list of words
            and punctuation symbols.
        :rtype: list(str)
        """
        return concat(
            [
                AlignedSentCorpusView(
                    fileid,
                    enc,
                    False,
                    False,
                    self._word_tokenizer,
                    self._sent_tokenizer,
                    self._alignedsent_block_reader,
                )
                for (fileid, enc) in self.abspaths(fileids, True)
            ]
        )

    def sents(self, fileids=None):
        """
        :return: the given file(s) as a list of
            sentences or utterances, each encoded as a list of word
            strings.
        :rtype: list(list(str))
        """
        return concat(
            [
                AlignedSentCorpusView(
                    fileid,
                    enc,
                    False,
                    True,
                    self._word_tokenizer,
                    self._sent_tokenizer,
                    self._alignedsent_block_reader,
                )
                for (fileid, enc) in self.abspaths(fileids, True)
            ]
        )

    def aligned_sents(self, fileids=None):
        """
        :return: the given file(s) as a list of AlignedSent objects.
        :rtype: list(AlignedSent)
        """
        return concat(
            [
                AlignedSentCorpusView(
                    fileid,
                    enc,
                    True,
                    True,
                    self._word_tokenizer,
                    self._sent_tokenizer,
                    self._alignedsent_block_reader,
                )
                for (fileid, enc) in self.abspaths(fileids, True)
            ]
        )
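
# Note on the expected file layout (an illustration inferred from
# read_alignedsent_block and the default tokenizers, not something stated in
# the original module): each aligned sentence pair is typically stored as
# three newline-separated lines with whitespace-separated tokens -- the
# source sentence, the target sentence, and the word-alignment pairs, e.g.
#
#     the house
#     das Haus
#     0-0 1-1
#
# Given read_block below, words() and sents() expose only the first line of
# each block (the source-language sentence), while aligned_sents() combines
# all three lines into one AlignedSent.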


class AlignedSentCorpusView(StreamBackedCorpusView):
    """
    A specialized corpus view for aligned sentences.
    ``AlignedSentCorpusView`` objects are typically created by
    ``AlignedCorpusReader`` (not directly by nltk users).
    """

    def __init__(
        self,
        corpus_file,
        encoding,
        aligned,
        group_by_sent,
        word_tokenizer,
        sent_tokenizer,
        alignedsent_block_reader,
    ):
        self._aligned = aligned  # True: return AlignedSent objects instead of token lists
        self._group_by_sent = group_by_sent  # True: keep one token list per sentence
        self._word_tokenizer = word_tokenizer
        self._sent_tokenizer = sent_tokenizer
        self._alignedsent_block_reader = alignedsent_block_reader
        StreamBackedCorpusView.__init__(self, corpus_file, encoding=encoding)

    def read_block(self, stream):
        # Split each block returned by the block reader into sentence strings
        # (one per line), then split each sentence string into tokens.
        block = [
            self._word_tokenizer.tokenize(sent_str)
            for alignedsent_str in self._alignedsent_block_reader(stream)
            for sent_str in self._sent_tokenizer.tokenize(alignedsent_str)
        ]
        if self._aligned:
            # block[0] is the source sentence, block[1] the target sentence,
            # and block[2] the alignment pairs.
            block[2] = Alignment.fromstring(
                " ".join(block[2])
            )  # kludge; we shouldn't have tokenized the alignment string
            block = [AlignedSent(*block)]
        elif self._group_by_sent:
            block = [block[0]]
        else:
            block = block[0]
        return block
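

if __name__ == "__main__":
    # Minimal usage sketch, not part of the original module. It writes one
    # aligned sentence pair in the three-line layout expected by
    # read_alignedsent_block (source sentence, target sentence, alignment
    # pairs) and reads it back with the classes defined above. The directory
    # and file names are illustrative only.
    import os
    import tempfile

    with tempfile.TemporaryDirectory() as root:
        with open(os.path.join(root, "demo.txt"), "w", encoding="latin1") as f:
            f.write("the house\n")  # source-language sentence
            f.write("das Haus\n")   # target-language sentence
            f.write("0-0 1-1\n")    # word-alignment pairs

        reader = AlignedCorpusReader(root, r".*\.txt")
        print(reader.words())   # source-side tokens only
        print(reader.sents())   # source-side sentences only
        for asent in reader.aligned_sents():
            print(asent.words, asent.mots, asent.alignment)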