chunked.py
  1. # Natural Language Toolkit: Chunked Corpus Reader
  2. #
  3. # Copyright (C) 2001-2020 NLTK Project
  4. # Author: Steven Bird <stevenbird1@gmail.com>
  5. # Edward Loper <edloper@gmail.com>
  6. # URL: <http://nltk.org/>
  7. # For license information, see LICENSE.TXT
  8. """
  9. A reader for corpora that contain chunked (and optionally tagged)
  10. documents.
  11. """
  12. import os.path, codecs
  13. import nltk
  14. from nltk.corpus.reader.bracket_parse import BracketParseCorpusReader
  15. from nltk.tree import Tree
  16. from nltk.tokenize import *
  17. from nltk.chunk import tagstr2tree
  18. from nltk.corpus.reader.util import *
  19. from nltk.corpus.reader.api import *
  20. class ChunkedCorpusReader(CorpusReader):
  21. """
  22. Reader for chunked (and optionally tagged) corpora. Paragraphs
  23. are split using a block reader. They are then tokenized into
  24. sentences using a sentence tokenizer. Finally, these sentences
  25. are parsed into chunk trees using a string-to-chunktree conversion
  26. function. Each of these steps can be performed using a default
  27. function or a custom function. By default, paragraphs are split
  28. on blank lines; sentences are listed one per line; and sentences
  29. are parsed into chunk trees using ``nltk.chunk.tagstr2tree``.
  30. """
  31. def __init__(
  32. self,
  33. root,
  34. fileids,
  35. extension="",
  36. str2chunktree=tagstr2tree,
  37. sent_tokenizer=RegexpTokenizer("\n", gaps=True),
  38. para_block_reader=read_blankline_block,
  39. encoding="utf8",
  40. tagset=None,
  41. ):
  42. """
  43. :param root: The root directory for this corpus.
  44. :param fileids: A list or regexp specifying the fileids in this corpus.
  45. """
  46. CorpusReader.__init__(self, root, fileids, encoding)
  47. self._cv_args = (str2chunktree, sent_tokenizer, para_block_reader, tagset)
  48. """Arguments for corpus views generated by this corpus: a tuple
  49. (str2chunktree, sent_tokenizer, para_block_tokenizer)"""
  50. def raw(self, fileids=None):
  51. """
  52. :return: the given file(s) as a single string.
  53. :rtype: str
  54. """
  55. if fileids is None:
  56. fileids = self._fileids
  57. elif isinstance(fileids, str):
  58. fileids = [fileids]
  59. return concat([self.open(f).read() for f in fileids])
  60. def words(self, fileids=None):
  61. """
  62. :return: the given file(s) as a list of words
  63. and punctuation symbols.
  64. :rtype: list(str)
  65. """
  66. return concat(
  67. [
  68. ChunkedCorpusView(f, enc, 0, 0, 0, 0, *self._cv_args)
  69. for (f, enc) in self.abspaths(fileids, True)
  70. ]
  71. )
  72. def sents(self, fileids=None):
  73. """
  74. :return: the given file(s) as a list of
  75. sentences or utterances, each encoded as a list of word
  76. strings.
  77. :rtype: list(list(str))
  78. """
  79. return concat(
  80. [
  81. ChunkedCorpusView(f, enc, 0, 1, 0, 0, *self._cv_args)
  82. for (f, enc) in self.abspaths(fileids, True)
  83. ]
  84. )
  85. def paras(self, fileids=None):
  86. """
  87. :return: the given file(s) as a list of
  88. paragraphs, each encoded as a list of sentences, which are
  89. in turn encoded as lists of word strings.
  90. :rtype: list(list(list(str)))
  91. """
  92. return concat(
  93. [
  94. ChunkedCorpusView(f, enc, 0, 1, 1, 0, *self._cv_args)
  95. for (f, enc) in self.abspaths(fileids, True)
  96. ]
  97. )
  98. def tagged_words(self, fileids=None, tagset=None):
  99. """
  100. :return: the given file(s) as a list of tagged
  101. words and punctuation symbols, encoded as tuples
  102. ``(word,tag)``.
  103. :rtype: list(tuple(str,str))
  104. """
  105. return concat(
  106. [
  107. ChunkedCorpusView(
  108. f, enc, 1, 0, 0, 0, *self._cv_args, target_tagset=tagset
  109. )
  110. for (f, enc) in self.abspaths(fileids, True)
  111. ]
  112. )
  113. def tagged_sents(self, fileids=None, tagset=None):
  114. """
  115. :return: the given file(s) as a list of
  116. sentences, each encoded as a list of ``(word,tag)`` tuples.
  117. :rtype: list(list(tuple(str,str)))
  118. """
  119. return concat(
  120. [
  121. ChunkedCorpusView(
  122. f, enc, 1, 1, 0, 0, *self._cv_args, target_tagset=tagset
  123. )
  124. for (f, enc) in self.abspaths(fileids, True)
  125. ]
  126. )
  127. def tagged_paras(self, fileids=None, tagset=None):
  128. """
  129. :return: the given file(s) as a list of
  130. paragraphs, each encoded as a list of sentences, which are
  131. in turn encoded as lists of ``(word,tag)`` tuples.
  132. :rtype: list(list(list(tuple(str,str))))
  133. """
  134. return concat(
  135. [
  136. ChunkedCorpusView(
  137. f, enc, 1, 1, 1, 0, *self._cv_args, target_tagset=tagset
  138. )
  139. for (f, enc) in self.abspaths(fileids, True)
  140. ]
  141. )
  142. def chunked_words(self, fileids=None, tagset=None):
  143. """
  144. :return: the given file(s) as a list of tagged
  145. words and chunks. Words are encoded as ``(word, tag)``
  146. tuples (if the corpus has tags) or word strings (if the
  147. corpus has no tags). Chunks are encoded as depth-one
  148. trees over ``(word,tag)`` tuples or word strings.
  149. :rtype: list(tuple(str,str) and Tree)
  150. """
  151. return concat(
  152. [
  153. ChunkedCorpusView(
  154. f, enc, 1, 0, 0, 1, *self._cv_args, target_tagset=tagset
  155. )
  156. for (f, enc) in self.abspaths(fileids, True)
  157. ]
  158. )
  159. def chunked_sents(self, fileids=None, tagset=None):
  160. """
  161. :return: the given file(s) as a list of
  162. sentences, each encoded as a shallow Tree. The leaves
  163. of these trees are encoded as ``(word, tag)`` tuples (if
  164. the corpus has tags) or word strings (if the corpus has no
  165. tags).
  166. :rtype: list(Tree)
  167. """
  168. return concat(
  169. [
  170. ChunkedCorpusView(
  171. f, enc, 1, 1, 0, 1, *self._cv_args, target_tagset=tagset
  172. )
  173. for (f, enc) in self.abspaths(fileids, True)
  174. ]
  175. )
  176. def chunked_paras(self, fileids=None, tagset=None):
  177. """
  178. :return: the given file(s) as a list of
  179. paragraphs, each encoded as a list of sentences, which are
  180. in turn encoded as a shallow Tree. The leaves of these
  181. trees are encoded as ``(word, tag)`` tuples (if the corpus
  182. has tags) or word strings (if the corpus has no tags).
  183. :rtype: list(list(Tree))
  184. """
  185. return concat(
  186. [
  187. ChunkedCorpusView(
  188. f, enc, 1, 1, 1, 1, *self._cv_args, target_tagset=tagset
  189. )
  190. for (f, enc) in self.abspaths(fileids, True)
  191. ]
  192. )
  193. def _read_block(self, stream):
  194. return [tagstr2tree(t) for t in read_blankline_block(stream)]
  195. class ChunkedCorpusView(StreamBackedCorpusView):
  196. def __init__(
  197. self,
  198. fileid,
  199. encoding,
  200. tagged,
  201. group_by_sent,
  202. group_by_para,
  203. chunked,
  204. str2chunktree,
  205. sent_tokenizer,
  206. para_block_reader,
  207. source_tagset=None,
  208. target_tagset=None,
  209. ):
  210. StreamBackedCorpusView.__init__(self, fileid, encoding=encoding)
  211. self._tagged = tagged
  212. self._group_by_sent = group_by_sent
  213. self._group_by_para = group_by_para
  214. self._chunked = chunked
  215. self._str2chunktree = str2chunktree
  216. self._sent_tokenizer = sent_tokenizer
  217. self._para_block_reader = para_block_reader
  218. self._source_tagset = source_tagset
  219. self._target_tagset = target_tagset
  220. def read_block(self, stream):
  221. block = []
  222. for para_str in self._para_block_reader(stream):
  223. para = []
  224. for sent_str in self._sent_tokenizer.tokenize(para_str):
  225. sent = self._str2chunktree(
  226. sent_str,
  227. source_tagset=self._source_tagset,
  228. target_tagset=self._target_tagset,
  229. )
  230. # If requested, throw away the tags.
  231. if not self._tagged:
  232. sent = self._untag(sent)
  233. # If requested, throw away the chunks.
  234. if not self._chunked:
  235. sent = sent.leaves()
  236. # Add the sentence to `para`.
  237. if self._group_by_sent:
  238. para.append(sent)
  239. else:
  240. para.extend(sent)
  241. # Add the paragraph to `block`.
  242. if self._group_by_para:
  243. block.append(para)
  244. else:
  245. block.extend(para)
  246. # Return the block
  247. return block
  248. def _untag(self, tree):
  249. for i, child in enumerate(tree):
  250. if isinstance(child, Tree):
  251. self._untag(child)
  252. elif isinstance(child, tuple):
  253. tree[i] = child[0]
  254. else:
  255. raise ValueError("expected child to be Tree or tuple")
  256. return tree