# Natural Language Toolkit: API for Corpus Readers
#
# Copyright (C) 2001-2020 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
#         Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT

"""
API for corpus readers.
"""

import os
import re
from collections import defaultdict
from itertools import chain

from nltk.data import PathPointer, FileSystemPathPointer, ZipFilePathPointer
from nltk.corpus.reader.util import *


class CorpusReader(object):
    """
    A base class for "corpus reader" classes, each of which can be
    used to read a specific corpus format. Each individual corpus
    reader instance is used to read a specific corpus, consisting of
    one or more files under a common root directory. Each file is
    identified by its ``file identifier``, which is the relative path
    to the file from the root directory.

    A separate subclass is defined for each corpus format. These
    subclasses define one or more methods that provide 'views' on the
    corpus contents, such as ``words()`` (for a list of words) and
    ``parsed_sents()`` (for a list of parsed sentences). Called with
    no arguments, these methods will return the contents of the entire
    corpus. For most corpora, these methods define one or more
    selection arguments, such as ``fileids`` or ``categories``, which can
    be used to select which portion of the corpus should be returned.
    """

    def __init__(self, root, fileids, encoding="utf8", tagset=None):
        """
        :type root: PathPointer or str
        :param root: A path pointer identifying the root directory for
            this corpus. If a string is specified, then it will be
            converted to a ``PathPointer`` automatically.
        :param fileids: A list of the files that make up this corpus.
            This list can either be specified explicitly, as a list of
            strings; or implicitly, as a regular expression over file
            paths. The absolute path for each file will be constructed
            by joining the reader's root to each file name.
        :param encoding: The default unicode encoding for the files
            that make up the corpus. The value of ``encoding`` can be any
            of the following:

            - A string: ``encoding`` is the encoding name for all files.
            - A dictionary: ``encoding[file_id]`` is the encoding
              name for the file whose identifier is ``file_id``. If
              ``file_id`` is not in ``encoding``, then the file
              contents will be processed using non-unicode byte strings.
            - A list: ``encoding`` should be a list of ``(regexp, encoding)``
              tuples. The encoding for a file whose identifier is ``file_id``
              will be the ``encoding`` value for the first tuple whose
              ``regexp`` matches the ``file_id``. If no tuple's ``regexp``
              matches the ``file_id``, the file contents will be processed
              using non-unicode byte strings.
            - None: the file contents of all files will be
              processed using non-unicode byte strings.
        :param tagset: The name of the tagset used by this corpus, to be used
            for normalizing or converting the POS tags returned by the
            ``tagged_...()`` methods.
        """

        # Convert the root to a path pointer, if necessary.
        if isinstance(root, str) and not isinstance(root, PathPointer):
            m = re.match(r"(.*\.zip)/?(.*)$|", root)
            zipfile, zipentry = m.groups()
            if zipfile:
                root = ZipFilePathPointer(zipfile, zipentry)
            else:
                root = FileSystemPathPointer(root)
        elif not isinstance(root, PathPointer):
            raise TypeError("CorpusReader: expected a string or a PathPointer")

        # If `fileids` is a regexp, then expand it.
        if isinstance(fileids, str):
            fileids = find_corpus_fileids(root, fileids)

        self._fileids = fileids
        """A list of the relative paths for the fileids that make up
        this corpus."""

        self._root = root
        """The root directory for this corpus."""

        # If encoding was specified as a list of regexps, then convert
        # it to a dictionary.
        if isinstance(encoding, list):
            encoding_dict = {}
            for fileid in self._fileids:
                for x in encoding:
                    (regexp, enc) = x
                    if re.match(regexp, fileid):
                        encoding_dict[fileid] = enc
                        break
            encoding = encoding_dict

        self._encoding = encoding
        """The default unicode encoding for the fileids that make up
        this corpus. If ``encoding`` is None, then the file
        contents are processed using byte strings."""

        self._tagset = tagset

    def __repr__(self):
        if isinstance(self._root, ZipFilePathPointer):
            path = "%s/%s" % (self._root.zipfile.filename, self._root.entry)
        else:
            path = "%s" % self._root.path
        return "<%s in %r>" % (self.__class__.__name__, path)

    def ensure_loaded(self):
        """
        Load this corpus (if it has not already been loaded). This is
        used by LazyCorpusLoader as a simple method that can be used to
        make sure a corpus is loaded -- e.g., in case a user wants to
        do help(some_corpus).
        """
        pass  # no need to actually do anything.

    def readme(self):
        """
        Return the contents of the corpus README file, if it exists.
        """
        return self.open("README").read()

    def license(self):
        """
        Return the contents of the corpus LICENSE file, if it exists.
        """
        return self.open("LICENSE").read()

    def citation(self):
        """
        Return the contents of the corpus citation.bib file, if it exists.
        """
        return self.open("citation.bib").read()

    def fileids(self):
        """
        Return a list of file identifiers for the fileids that make up
        this corpus.
        """
        return self._fileids

    def abspath(self, fileid):
        """
        Return the absolute path for the given file.

        :type fileid: str
        :param fileid: The file identifier for the file whose path
            should be returned.
        :rtype: PathPointer
        """
        return self._root.join(fileid)

    def abspaths(self, fileids=None, include_encoding=False, include_fileid=False):
        """
        Return a list of the absolute paths for all fileids in this corpus;
        or for the given list of fileids, if specified.

        :type fileids: None or str or list
        :param fileids: Specifies the set of fileids for which paths should
            be returned. Can be None, for all fileids; a list of
            file identifiers, for a specified set of fileids; or a single
            file identifier, for a single file. Note that the return
            value is always a list of paths, even if ``fileids`` is a
            single file identifier.
        :param include_encoding: If true, then return a list of
            ``(path_pointer, encoding)`` tuples.
        :param include_fileid: If true, then also include the file
            identifier in each returned tuple.
        :rtype: list(PathPointer)
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, str):
            fileids = [fileids]

        paths = [self._root.join(f) for f in fileids]

        if include_encoding and include_fileid:
            return list(zip(paths, [self.encoding(f) for f in fileids], fileids))
        elif include_fileid:
            return list(zip(paths, fileids))
        elif include_encoding:
            return list(zip(paths, [self.encoding(f) for f in fileids]))
        else:
            return paths
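
    # Illustrative sketch of the three return shapes (not in the original
    # source; the file identifier, path, and encoding below are assumptions):
    #
    #     >>> reader.abspaths(['a.txt'])
    #     [FileSystemPathPointer('.../a.txt')]
    #     >>> reader.abspaths(['a.txt'], include_encoding=True)
    #     [(FileSystemPathPointer('.../a.txt'), 'utf8')]
    #     >>> reader.abspaths(['a.txt'], include_encoding=True, include_fileid=True)
    #     [(FileSystemPathPointer('.../a.txt'), 'utf8', 'a.txt')]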

    def open(self, file):
        """
        Return an open stream that can be used to read the given file.
        If the file's encoding is not None, then the stream will
        automatically decode the file's contents into unicode.

        :param file: The file identifier of the file to read.
        """
        encoding = self.encoding(file)
        stream = self._root.join(file).open(encoding)
        return stream

    def encoding(self, file):
        """
        Return the unicode encoding for the given corpus file, if known.
        If the encoding is unknown, or if the given file should be
        processed using byte strings (bytes), then return None.
        """
        if isinstance(self._encoding, dict):
            return self._encoding.get(file)
        else:
            return self._encoding
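
    # Illustrative sketch (not in the original source; the file identifier is
    # an assumption): ``open()`` looks up the per-file encoding via
    # ``encoding()``, so the returned stream yields decoded text when an
    # encoding is known, and raw byte strings when ``encoding()`` returns None.
    #
    #     >>> reader.encoding('a.txt')       # e.g. 'utf8', or None
    #     >>> reader.open('a.txt').read()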

    def _get_root(self):
        return self._root

    root = property(
        _get_root,
        doc="""
        The directory where this corpus is stored.

        :type: PathPointer""",
    )


######################################################################
# { Corpora containing categorized items
######################################################################


class CategorizedCorpusReader(object):
    """
    A mixin class used to aid in the implementation of corpus readers
    for categorized corpora. This class defines the method
    ``categories()``, which returns a list of the categories for the
    corpus or for a specified set of fileids; and overrides ``fileids()``
    to take a ``categories`` argument, restricting the set of fileids to
    be returned.

    Subclasses are expected to:

      - Call ``__init__()`` to set up the mapping.
      - Override all view methods to accept a ``categories`` parameter,
        which can be used *instead* of the ``fileids`` parameter, to
        select which fileids should be included in the returned view.
    """

    def __init__(self, kwargs):
        """
        Initialize this mapping based on keyword arguments, as
        follows:

          - cat_pattern: A regular expression pattern used to find the
            category for each file identifier. The pattern will be
            applied to each file identifier, and the first matching
            group will be used as the category label for that file.
          - cat_map: A dictionary, mapping from file identifiers to
            category labels.
          - cat_file: The name of a file that contains the mapping
            from file identifiers to categories. The argument
            ``cat_delimiter`` can be used to specify a delimiter.

        The corresponding argument will be deleted from ``kwargs``. If
        more than one argument is specified, an exception will be
        raised.
        """
        self._f2c = None  #: file-to-category mapping
        self._c2f = None  #: category-to-file mapping

        self._pattern = None  #: regexp specifying the mapping
        self._map = None  #: dict specifying the mapping
        self._file = None  #: fileid of file containing the mapping
        self._delimiter = None  #: delimiter for ``self._file``

        if "cat_pattern" in kwargs:
            self._pattern = kwargs["cat_pattern"]
            del kwargs["cat_pattern"]
        elif "cat_map" in kwargs:
            self._map = kwargs["cat_map"]
            del kwargs["cat_map"]
        elif "cat_file" in kwargs:
            self._file = kwargs["cat_file"]
            del kwargs["cat_file"]
            if "cat_delimiter" in kwargs:
                self._delimiter = kwargs["cat_delimiter"]
                del kwargs["cat_delimiter"]
        else:
            raise ValueError(
                "Expected keyword argument cat_pattern or cat_map or cat_file."
            )

        if "cat_pattern" in kwargs or "cat_map" in kwargs or "cat_file" in kwargs:
            raise ValueError(
                "Specify exactly one of: cat_pattern, cat_map, cat_file."
            )

    def _init(self):
        self._f2c = defaultdict(set)
        self._c2f = defaultdict(set)

        if self._pattern is not None:
            for file_id in self._fileids:
                category = re.match(self._pattern, file_id).group(1)
                self._add(file_id, category)

        elif self._map is not None:
            for (file_id, categories) in self._map.items():
                for category in categories:
                    self._add(file_id, category)

        elif self._file is not None:
            for line in self.open(self._file).readlines():
                line = line.strip()
                file_id, categories = line.split(self._delimiter, 1)
                if file_id not in self.fileids():
                    raise ValueError(
                        "In category mapping file %s: %s not found"
                        % (self._file, file_id)
                    )
                for category in categories.split(self._delimiter):
                    self._add(file_id, category)

    def _add(self, file_id, category):
        self._f2c[file_id].add(category)
        self._c2f[category].add(file_id)

    def categories(self, fileids=None):
        """
        Return a list of the categories that are defined for this corpus,
        or for the given file(s) if specified.
        """
        if self._f2c is None:
            self._init()
        if fileids is None:
            return sorted(self._c2f)
        if isinstance(fileids, str):
            fileids = [fileids]
        return sorted(set.union(*[self._f2c[d] for d in fileids]))

    def fileids(self, categories=None):
        """
        Return a list of file identifiers for the files that make up
        this corpus, or that make up the given category or categories,
        if specified.
        """
        if categories is None:
            return super(CategorizedCorpusReader, self).fileids()
        elif isinstance(categories, str):
            if self._f2c is None:
                self._init()
            if categories in self._c2f:
                return sorted(self._c2f[categories])
            else:
                raise ValueError("Category %s not found" % categories)
        else:
            if self._f2c is None:
                self._init()
            return sorted(set.union(*[self._c2f[c] for c in categories]))
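
    # Illustrative usage (not in the original source; the category and file
    # identifiers are assumptions):
    #
    #     >>> reader.categories()                # all categories
    #     >>> reader.categories(['a.txt'])       # categories of one file
    #     >>> reader.fileids(categories='news')  # files in one category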


######################################################################
# { Treebank readers
######################################################################

# [xx] is it worth it to factor this out?


class SyntaxCorpusReader(CorpusReader):
    """
    An abstract base class for reading corpora consisting of
    syntactically parsed text. Subclasses should define:

      - ``__init__``, which specifies the location of the corpus
        and a method for detecting the sentence blocks in corpus files.
      - ``_read_block``, which reads a block from the input stream.
      - ``_word``, which takes a block and returns a list of lists of words.
      - ``_tag``, which takes a block and returns a list of lists of tagged
        words.
      - ``_parse``, which takes a block and returns a list of parsed
        sentences.

    A sketch of a minimal subclass appears in the comments after the
    abstract stubs below.
    """

    def _parse(self, s):
        raise NotImplementedError()

    def _word(self, s):
        raise NotImplementedError()

    def _tag(self, s):
        raise NotImplementedError()

    def _read_block(self, stream):
        raise NotImplementedError()
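
    # Illustrative sketch of a minimal subclass (not part of the original
    # source).  It assumes one bracketed parse tree per line, in the style of
    # a bracket-parse treebank reader, and uses nltk.tree.Tree only to show
    # how _read_block, _word, _tag, and _parse fit together:
    #
    #     from nltk.tree import Tree
    #
    #     class SimpleTreeCorpusReader(SyntaxCorpusReader):
    #         def _read_block(self, stream):
    #             return read_line_block(stream)      # one tree per line
    #
    #         def _parse(self, s):
    #             return Tree.fromstring(s)
    #
    #         def _word(self, s):
    #             return self._parse(s).leaves()      # one sentence of words
    #
    #         def _tag(self, s, tagset=None):
    #             return self._parse(s).pos()         # (word, tag) pairs;
    #                                                 # tagset mapping omitted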

    def raw(self, fileids=None):
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, str):
            fileids = [fileids]
        return concat([self.open(f).read() for f in fileids])

    def parsed_sents(self, fileids=None):
        reader = self._read_parsed_sent_block
        return concat(
            [
                StreamBackedCorpusView(fileid, reader, encoding=enc)
                for fileid, enc in self.abspaths(fileids, True)
            ]
        )

    def tagged_sents(self, fileids=None, tagset=None):
        def reader(stream):
            return self._read_tagged_sent_block(stream, tagset)

        return concat(
            [
                StreamBackedCorpusView(fileid, reader, encoding=enc)
                for fileid, enc in self.abspaths(fileids, True)
            ]
        )

    def sents(self, fileids=None):
        reader = self._read_sent_block
        return concat(
            [
                StreamBackedCorpusView(fileid, reader, encoding=enc)
                for fileid, enc in self.abspaths(fileids, True)
            ]
        )

    def tagged_words(self, fileids=None, tagset=None):
        def reader(stream):
            return self._read_tagged_word_block(stream, tagset)

        return concat(
            [
                StreamBackedCorpusView(fileid, reader, encoding=enc)
                for fileid, enc in self.abspaths(fileids, True)
            ]
        )

    def words(self, fileids=None):
        return concat(
            [
                StreamBackedCorpusView(fileid, self._read_word_block, encoding=enc)
                for fileid, enc in self.abspaths(fileids, True)
            ]
        )
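
    # Each public view above concatenates one StreamBackedCorpusView per file,
    # so corpus files are read from disk only as the view is accessed.
    # Illustrative usage (not in the original source; the file identifier is
    # an assumption):
    #
    #     >>> reader.sents('a.txt')[0]        # first sentence of one file
    #     >>> reader.tagged_words()[:10]      # first ten (word, tag) pairs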

    # ------------------------------------------------------------
    # { Block Readers

    def _read_word_block(self, stream):
        return list(chain(*self._read_sent_block(stream)))

    def _read_tagged_word_block(self, stream, tagset=None):
        return list(chain(*self._read_tagged_sent_block(stream, tagset)))

    def _read_sent_block(self, stream):
        return list(filter(None, [self._word(t) for t in self._read_block(stream)]))

    def _read_tagged_sent_block(self, stream, tagset=None):
        return list(
            filter(None, [self._tag(t, tagset) for t in self._read_block(stream)])
        )

    def _read_parsed_sent_block(self, stream):
        return list(filter(None, [self._parse(t) for t in self._read_block(stream)]))

    # } End of Block Readers
    # ------------------------------------------------------------