# Natural Language Toolkit: Chunked Corpus Reader
#
# Copyright (C) 2001-2020 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
#         Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""
A reader for corpora that contain chunked (and optionally tagged)
documents.
"""
- import os.path, codecs
- import nltk
- from nltk.corpus.reader.bracket_parse import BracketParseCorpusReader
- from nltk.tree import Tree
- from nltk.tokenize import *
- from nltk.chunk import tagstr2tree
- from nltk.corpus.reader.util import *
- from nltk.corpus.reader.api import *
class ChunkedCorpusReader(CorpusReader):
    """
    Reader for chunked (and optionally tagged) corpora.  Paragraphs
    are split using a block reader.  They are then tokenized into
    sentences using a sentence tokenizer.  Finally, these sentences
    are parsed into chunk trees using a string-to-chunktree conversion
    function.  Each of these steps can be performed using a default
    function or a custom function.  By default, paragraphs are split
    on blank lines; sentences are listed one per line; and sentences
    are parsed into chunk trees using ``nltk.chunk.tagstr2tree``.
    """

    def __init__(
        self,
        root,
        fileids,
        extension="",
        str2chunktree=tagstr2tree,
        sent_tokenizer=RegexpTokenizer("\n", gaps=True),
        para_block_reader=read_blankline_block,
        encoding="utf8",
        tagset=None,
    ):
        """
        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        :param extension: Unused; retained for backward compatibility.
        :param str2chunktree: Function mapping one sentence string to a
            chunk ``Tree``; called with ``source_tagset``/``target_tagset``
            keyword arguments by the corpus views.
        :param sent_tokenizer: Tokenizer that splits a paragraph block
            into sentence strings (default: one sentence per line).
        :param para_block_reader: Function that reads one paragraph block
            from a stream (default: blank-line separated).
        :param encoding: Encoding of the corpus files.
        :param tagset: The tagset in which the corpus' tags are expressed.
        """
        CorpusReader.__init__(self, root, fileids, encoding)
        # Arguments for corpus views generated by this corpus: a tuple
        # (str2chunktree, sent_tokenizer, para_block_reader, tagset).
        self._cv_args = (str2chunktree, sent_tokenizer, para_block_reader, tagset)

    def _views(self, fileids, tagged, group_by_sent, group_by_para, chunked, tagset=None):
        """Build one ChunkedCorpusView per fileid with the given flags
        and concatenate them.  Shared by all the reading methods below."""
        return concat(
            [
                ChunkedCorpusView(
                    f,
                    enc,
                    tagged,
                    group_by_sent,
                    group_by_para,
                    chunked,
                    *self._cv_args,
                    target_tagset=tagset,
                )
                for (f, enc) in self.abspaths(fileids, True)
            ]
        )

    def raw(self, fileids=None):
        """
        :return: the given file(s) as a single string.
        :rtype: str
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, str):
            fileids = [fileids]
        return concat([self.open(f).read() for f in fileids])

    def words(self, fileids=None):
        """
        :return: the given file(s) as a list of words
            and punctuation symbols.
        :rtype: list(str)
        """
        return self._views(fileids, 0, 0, 0, 0)

    def sents(self, fileids=None):
        """
        :return: the given file(s) as a list of
            sentences or utterances, each encoded as a list of word
            strings.
        :rtype: list(list(str))
        """
        return self._views(fileids, 0, 1, 0, 0)

    def paras(self, fileids=None):
        """
        :return: the given file(s) as a list of
            paragraphs, each encoded as a list of sentences, which are
            in turn encoded as lists of word strings.
        :rtype: list(list(list(str)))
        """
        return self._views(fileids, 0, 1, 1, 0)

    def tagged_words(self, fileids=None, tagset=None):
        """
        :return: the given file(s) as a list of tagged
            words and punctuation symbols, encoded as tuples
            ``(word,tag)``.
        :rtype: list(tuple(str,str))
        """
        return self._views(fileids, 1, 0, 0, 0, tagset)

    def tagged_sents(self, fileids=None, tagset=None):
        """
        :return: the given file(s) as a list of
            sentences, each encoded as a list of ``(word,tag)`` tuples.
        :rtype: list(list(tuple(str,str)))
        """
        return self._views(fileids, 1, 1, 0, 0, tagset)

    def tagged_paras(self, fileids=None, tagset=None):
        """
        :return: the given file(s) as a list of
            paragraphs, each encoded as a list of sentences, which are
            in turn encoded as lists of ``(word,tag)`` tuples.
        :rtype: list(list(list(tuple(str,str))))
        """
        return self._views(fileids, 1, 1, 1, 0, tagset)

    def chunked_words(self, fileids=None, tagset=None):
        """
        :return: the given file(s) as a list of tagged
            words and chunks.  Words are encoded as ``(word, tag)``
            tuples (if the corpus has tags) or word strings (if the
            corpus has no tags).  Chunks are encoded as depth-one
            trees over ``(word,tag)`` tuples or word strings.
        :rtype: list(tuple(str,str) and Tree)
        """
        return self._views(fileids, 1, 0, 0, 1, tagset)

    def chunked_sents(self, fileids=None, tagset=None):
        """
        :return: the given file(s) as a list of
            sentences, each encoded as a shallow Tree.  The leaves
            of these trees are encoded as ``(word, tag)`` tuples (if
            the corpus has tags) or word strings (if the corpus has no
            tags).
        :rtype: list(Tree)
        """
        return self._views(fileids, 1, 1, 0, 1, tagset)

    def chunked_paras(self, fileids=None, tagset=None):
        """
        :return: the given file(s) as a list of
            paragraphs, each encoded as a list of sentences, which are
            in turn encoded as a shallow Tree.  The leaves of these
            trees are encoded as ``(word, tag)`` tuples (if the corpus
            has tags) or word strings (if the corpus has no tags).
        :rtype: list(list(Tree))
        """
        return self._views(fileids, 1, 1, 1, 1, tagset)

    def _read_block(self, stream):
        # Fallback block reader: one blank-line-separated block of
        # tagged strings, parsed into chunk trees with the default parser.
        return [tagstr2tree(t) for t in read_blankline_block(stream)]
class ChunkedCorpusView(StreamBackedCorpusView):
    """
    A stream-backed corpus view for chunked (and optionally tagged)
    corpora.  The ``tagged``, ``group_by_sent``, ``group_by_para`` and
    ``chunked`` flags select how much structure each block retains:
    tags and chunk trees can be stripped, and tokens can be grouped by
    sentence and/or paragraph.
    """

    def __init__(
        self,
        fileid,
        encoding,
        tagged,
        group_by_sent,
        group_by_para,
        chunked,
        str2chunktree,
        sent_tokenizer,
        para_block_reader,
        source_tagset=None,
        target_tagset=None,
    ):
        StreamBackedCorpusView.__init__(self, fileid, encoding=encoding)
        # Output-shape flags.
        self._tagged = tagged
        self._group_by_sent = group_by_sent
        self._group_by_para = group_by_para
        self._chunked = chunked
        # Parsing machinery.
        self._str2chunktree = str2chunktree
        self._sent_tokenizer = sent_tokenizer
        self._para_block_reader = para_block_reader
        # Tagset mapping applied by the sentence parser.
        self._source_tagset = source_tagset
        self._target_tagset = target_tagset

    def read_block(self, stream):
        """Read one block of paragraphs from *stream* and convert it
        according to the view's flags."""
        result = []
        for para_text in self._para_block_reader(stream):
            sentences = [
                self._parse_sent(sent_text)
                for sent_text in self._sent_tokenizer.tokenize(para_text)
            ]
            # Keep sentence boundaries, or flatten to a token stream.
            if self._group_by_sent:
                para = sentences
            else:
                para = [item for sent in sentences for item in sent]
            # Keep paragraph boundaries, or splice into the block.
            if self._group_by_para:
                result.append(para)
            else:
                result.extend(para)
        return result

    def _parse_sent(self, sent_str):
        """Parse one sentence string into the requested representation:
        a chunk tree, optionally untagged and/or reduced to its leaves."""
        sent = self._str2chunktree(
            sent_str,
            source_tagset=self._source_tagset,
            target_tagset=self._target_tagset,
        )
        if not self._tagged:
            sent = self._untag(sent)
        if not self._chunked:
            sent = sent.leaves()
        return sent

    def _untag(self, tree):
        """Strip tags in place: replace each ``(word, tag)`` leaf of
        *tree* with just ``word``, recursing into subtrees."""
        for position, child in enumerate(tree):
            if isinstance(child, Tree):
                self._untag(child)
                continue
            if not isinstance(child, tuple):
                raise ValueError("expected child to be Tree or tuple")
            tree[position] = child[0]
        return tree