# Natural Language Toolkit: Plaintext Corpus Reader
#
# Copyright (C) 2001-2020 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
#         Edward Loper <edloper@gmail.com>
#         Nitin Madnani <nmadnani@umiacs.umd.edu>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT

"""
A reader for corpora that consist of plaintext documents.
"""

import nltk.data
from nltk.tokenize import *

from nltk.corpus.reader.util import *
from nltk.corpus.reader.api import *


class PlaintextCorpusReader(CorpusReader):
    """
    Reader for corpora that consist of plaintext documents. Paragraphs
    are assumed to be split using blank lines. Sentences and words can
    be tokenized using the default tokenizers, or by custom tokenizers
    specified as parameters to the constructor.

    This corpus reader can be customized (e.g., to skip preface
    sections of specific document formats) by creating a subclass and
    overriding the ``CorpusView`` class variable.
    """

    CorpusView = StreamBackedCorpusView
    """The corpus view class used by this reader. Subclasses of
       ``PlaintextCorpusReader`` may specify alternative corpus view
       classes (e.g., to skip the preface sections of documents.)"""

    def __init__(
        self,
        root,
        fileids,
        word_tokenizer=WordPunctTokenizer(),
        sent_tokenizer=nltk.data.LazyLoader("tokenizers/punkt/english.pickle"),
        para_block_reader=read_blankline_block,
        encoding="utf8",
    ):
        """
        Construct a new plaintext corpus reader for a set of documents
        located at the given root directory. Example usage:

            >>> root = '/usr/local/share/nltk_data/corpora/webtext/'
            >>> reader = PlaintextCorpusReader(root, '.*\.txt') # doctest: +SKIP

        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        :param word_tokenizer: Tokenizer for breaking sentences or
            paragraphs into words.
        :param sent_tokenizer: Tokenizer for breaking paragraphs
            into sentences.
        :param para_block_reader: The block reader used to divide the
            corpus into paragraph blocks.
        """
        CorpusReader.__init__(self, root, fileids, encoding)
        self._word_tokenizer = word_tokenizer
        self._sent_tokenizer = sent_tokenizer
        self._para_block_reader = para_block_reader

    def raw(self, fileids=None):
        """
        :return: the given file(s) as a single string.
        :rtype: str
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, str):
            fileids = [fileids]
        raw_texts = []
        for f in fileids:
            _fin = self.open(f)
            raw_texts.append(_fin.read())
            _fin.close()
        return concat(raw_texts)

    def words(self, fileids=None):
        """
        :return: the given file(s) as a list of words
            and punctuation symbols.
        :rtype: list(str)
        """
        return concat(
            [
                self.CorpusView(path, self._read_word_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def sents(self, fileids=None):
        """
        :return: the given file(s) as a list of
            sentences or utterances, each encoded as a list of word
            strings.
        :rtype: list(list(str))
        """
        if self._sent_tokenizer is None:
            raise ValueError("No sentence tokenizer for this corpus")
        return concat(
            [
                self.CorpusView(path, self._read_sent_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def paras(self, fileids=None):
        """
        :return: the given file(s) as a list of
            paragraphs, each encoded as a list of sentences, which are
            in turn encoded as lists of word strings.
        :rtype: list(list(list(str)))
        """
        if self._sent_tokenizer is None:
            raise ValueError("No sentence tokenizer for this corpus")
        return concat(
            [
                self.CorpusView(path, self._read_para_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def _read_word_block(self, stream):
        words = []
        for i in range(20):  # Read 20 lines at a time.
            words.extend(self._word_tokenizer.tokenize(stream.readline()))
        return words

    def _read_sent_block(self, stream):
        sents = []
        for para in self._para_block_reader(stream):
            sents.extend(
                [
                    self._word_tokenizer.tokenize(sent)
                    for sent in self._sent_tokenizer.tokenize(para)
                ]
            )
        return sents

    def _read_para_block(self, stream):
        paras = []
        for para in self._para_block_reader(stream):
            paras.append(
                [
                    self._word_tokenizer.tokenize(sent)
                    for sent in self._sent_tokenizer.tokenize(para)
                ]
            )
        return paras
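

# A minimal, self-contained usage sketch (not part of the reader itself):
# it builds a throwaway corpus in a temporary directory. The file name
# "demo.txt" is hypothetical, and the sents()/paras() calls assume the
# "punkt" tokenizer data has been installed via nltk.download("punkt").
if __name__ == "__main__":
    import os
    import tempfile

    with tempfile.TemporaryDirectory() as demo_root:
        with open(os.path.join(demo_root, "demo.txt"), "w", encoding="utf8") as f:
            f.write("Hello world. This is a test.\n\nA second paragraph.\n")
        demo_reader = PlaintextCorpusReader(demo_root, r".*\.txt")
        print(demo_reader.words("demo.txt")[:5])   # ['Hello', 'world', '.', 'This', 'is']
        print(len(demo_reader.sents("demo.txt")))  # 3 sentences across both paragraphs
        print(len(demo_reader.paras("demo.txt")))  # 2 blank-line-separated paragraphs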


class CategorizedPlaintextCorpusReader(CategorizedCorpusReader, PlaintextCorpusReader):
    """
    A reader for plaintext corpora whose documents are divided into
    categories based on their file identifiers.
    """

    def __init__(self, *args, **kwargs):
        """
        Initialize the corpus reader. Categorization arguments
        (``cat_pattern``, ``cat_map``, and ``cat_file``) are passed to
        the ``CategorizedCorpusReader`` constructor. The remaining arguments
        are passed to the ``PlaintextCorpusReader`` constructor.
        """
        CategorizedCorpusReader.__init__(self, kwargs)
        PlaintextCorpusReader.__init__(self, *args, **kwargs)

    def _resolve(self, fileids, categories):
        if fileids is not None and categories is not None:
            raise ValueError("Specify fileids or categories, not both")
        if categories is not None:
            return self.fileids(categories)
        else:
            return fileids

    def raw(self, fileids=None, categories=None):
        return PlaintextCorpusReader.raw(self, self._resolve(fileids, categories))

    def words(self, fileids=None, categories=None):
        return PlaintextCorpusReader.words(self, self._resolve(fileids, categories))

    def sents(self, fileids=None, categories=None):
        return PlaintextCorpusReader.sents(self, self._resolve(fileids, categories))

    def paras(self, fileids=None, categories=None):
        return PlaintextCorpusReader.paras(self, self._resolve(fileids, categories))
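

# A minimal sketch of category-based access (again, not part of the reader):
# the "pos"/"neg" directory layout and the cat_pattern below are hypothetical,
# chosen only to show how file identifiers map to categories.
if __name__ == "__main__":
    import os
    import tempfile

    with tempfile.TemporaryDirectory() as cat_root:
        for cat in ("pos", "neg"):
            os.makedirs(os.path.join(cat_root, cat))
            with open(os.path.join(cat_root, cat, "001.txt"), "w", encoding="utf8") as f:
                f.write("A short %s review.\n" % cat)
        cat_reader = CategorizedPlaintextCorpusReader(
            cat_root, r"(?:pos|neg)/.*\.txt", cat_pattern=r"(pos|neg)/.*"
        )
        print(cat_reader.categories())             # ['neg', 'pos']
        print(cat_reader.words(categories="pos"))  # ['A', 'short', 'pos', 'review', '.']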


# FIXME: Is there a better way? How to avoid hardcoding this?
# Possibly, add a language kwarg to CategorizedPlaintextCorpusReader to
# override the `sent_tokenizer`.
class PortugueseCategorizedPlaintextCorpusReader(CategorizedPlaintextCorpusReader):
    def __init__(self, *args, **kwargs):
        CategorizedCorpusReader.__init__(self, kwargs)
        kwargs["sent_tokenizer"] = nltk.data.LazyLoader(
            "tokenizers/punkt/portuguese.pickle"
        )
        PlaintextCorpusReader.__init__(self, *args, **kwargs)


class EuroparlCorpusReader(PlaintextCorpusReader):
    """
    Reader for Europarl corpora that consist of plaintext documents.
    Documents are divided into chapters instead of the paragraphs used
    for regular plaintext documents. Chapters are separated using blank
    lines. Everything is inherited from ``PlaintextCorpusReader`` except
    that:

    - Since the corpus is pre-processed and pre-tokenized, the word
      tokenizer should just split each line at whitespace.
    - For the same reason, the sentence tokenizer should just split
      each paragraph at line breaks.
    - There is a new ``chapters()`` method that returns chapters
      instead of paragraphs.
    - The ``paras()`` method inherited from ``PlaintextCorpusReader``
      is made non-functional to remove any confusion between chapters
      and paragraphs for Europarl.
    """
    def _read_word_block(self, stream):
        words = []
        for i in range(20):  # Read 20 lines at a time.
            words.extend(stream.readline().split())
        return words

    def _read_sent_block(self, stream):
        sents = []
        for para in self._para_block_reader(stream):
            sents.extend([sent.split() for sent in para.splitlines()])
        return sents

    def _read_para_block(self, stream):
        paras = []
        for para in self._para_block_reader(stream):
            paras.append([sent.split() for sent in para.splitlines()])
        return paras

    def chapters(self, fileids=None):
        """
        :return: the given file(s) as a list of
            chapters, each encoded as a list of sentences, which are
            in turn encoded as lists of word strings.
        :rtype: list(list(list(str)))
        """
        return concat(
            [
                self.CorpusView(fileid, self._read_para_block, encoding=enc)
                for (fileid, enc) in self.abspaths(fileids, True)
            ]
        )

    def paras(self, fileids=None):
        raise NotImplementedError(
            "The Europarl corpus reader does not support paragraphs. Please use chapters() instead."
        )
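

# A minimal sketch of the Europarl reader's behavior (hypothetical file
# contents): because the input is pre-tokenized, words are recovered by
# splitting lines at whitespace and sentences by splitting blocks at line
# breaks, with blank lines delimiting chapters rather than paragraphs.
if __name__ == "__main__":
    import os
    import tempfile

    with tempfile.TemporaryDirectory() as ep_root:
        with open(os.path.join(ep_root, "ep-demo.txt"), "w", encoding="utf8") as f:
            f.write("Resumption of the session .\nI declare the session resumed .\n\nNext chapter .\n")
        ep_reader = EuroparlCorpusReader(ep_root, r".*\.txt")
        print(len(ep_reader.chapters("ep-demo.txt")))  # 2 blank-line-separated chapters
        print(ep_reader.sents("ep-demo.txt")[0])       # ['Resumption', 'of', 'the', 'session', '.']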