# Natural Language Toolkit: Chasen Corpus Reader
- #
- # Copyright (C) 2001-2020 NLTK Project
- # Author: Masato Hagiwara <hagisan@gmail.com>
- # URL: <http://nltk.org/>
- # For license information, see LICENSE.TXT
- # For more information, see http://lilyx.net/pages/nltkjapanesecorpus.html
- import sys
- from nltk.corpus.reader import util
- from nltk.corpus.reader.util import *
- from nltk.corpus.reader.api import *
class ChasenCorpusReader(CorpusReader):
    """
    Corpus reader for ChaSen-format files: morphologically analyzed
    Japanese text where each line is ``surface\\tTAB-separated analysis``
    and sentences are terminated by ``EOS`` lines (e.g. the JEITA corpus).
    """

    def __init__(self, root, fileids, encoding="utf8", sent_splitter=None):
        """
        :param root: path to the root directory of the corpus
        :param fileids: file or list of files making up the corpus
        :param encoding: character encoding of the corpus files
        :param sent_splitter: optional predicate on ``(word, tag)`` tuples
            marking additional sentence boundaries besides ``EOS`` lines
        """
        self._sent_splitter = sent_splitter
        CorpusReader.__init__(self, root, fileids, encoding)

    def raw(self, fileids=None):
        """Return the raw text of the given files as a single string."""
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, str):
            fileids = [fileids]
        return concat([self.open(f).read() for f in fileids])

    def _views(self, fileids, tagged, group_by_sent, group_by_para):
        # Shared implementation for all six public accessors below:
        # one ChasenCorpusView per file, concatenated into one sequence.
        return concat(
            [
                ChasenCorpusView(
                    fileid,
                    enc,
                    tagged,
                    group_by_sent,
                    group_by_para,
                    self._sent_splitter,
                )
                for (fileid, enc) in self.abspaths(fileids, True)
            ]
        )

    def words(self, fileids=None):
        """Return a flat list of words (surface forms only)."""
        return self._views(fileids, False, False, False)

    def tagged_words(self, fileids=None):
        """Return a flat list of ``(word, tag)`` tuples."""
        return self._views(fileids, True, False, False)

    def sents(self, fileids=None):
        """Return a list of sentences, each a list of words."""
        return self._views(fileids, False, True, False)

    def tagged_sents(self, fileids=None):
        """Return a list of sentences, each a list of ``(word, tag)`` tuples."""
        return self._views(fileids, True, True, False)

    def paras(self, fileids=None):
        """Return a list of paragraphs, each a list of sentences of words."""
        return self._views(fileids, False, True, True)

    def tagged_paras(self, fileids=None):
        """Return a list of paragraphs of sentences of ``(word, tag)`` tuples."""
        return self._views(fileids, True, True, True)
class ChasenCorpusView(StreamBackedCorpusView):
    """
    A specialized corpus view for ChasenReader. Similar to ``TaggedCorpusView``,
    but this'll use fixed sets of word and sentence tokenizer.
    """

    def __init__(
        self,
        corpus_file,
        encoding,
        tagged,
        group_by_sent,
        group_by_para,
        sent_splitter=None,
    ):
        # Grouping/untagging behavior is decided once here and consumed
        # by read_block() on every paragraph.
        self._tagged = tagged
        self._group_by_sent = group_by_sent
        self._group_by_para = group_by_para
        self._sent_splitter = sent_splitter
        StreamBackedCorpusView.__init__(self, corpus_file, encoding=encoding)

    def read_block(self, stream):
        """Reads one paragraph at a time."""
        block = []
        for para_str in read_regexp_block(stream, r".", r"^EOS\n"):
            para = []
            sent = []

            def flush(tokens):
                # Finish a sentence: drop tags if untagged output was
                # requested, then group or splice into the paragraph.
                if not self._tagged:
                    tokens = [surface for (surface, tag) in tokens]
                if self._group_by_sent:
                    para.append(tokens)
                else:
                    para.extend(tokens)

            for line in para_str.splitlines():
                at_eos = line.strip() == "EOS"
                cells = line.split("\t")
                token = (cells[0], "\t".join(cells[1:]))
                if not at_eos:
                    sent.append(token)
                # A sentence ends at an EOS marker, or wherever the
                # user-supplied splitter says so (splitter is never
                # consulted for the EOS line itself, by short-circuit).
                if at_eos or (self._sent_splitter and self._sent_splitter(token)):
                    flush(sent)
                    sent = []
            if sent:
                # Trailing material with no closing EOS/splitter hit.
                flush(sent)
            if self._group_by_para:
                block.append(para)
            else:
                block.extend(para)
        return block
def demo():
    """Print a sample of words and tagged sentences from the JEITA corpus.

    Requires the ``jeita`` corpus to be installed via the NLTK data
    facility.
    """
    # NOTE: the original body also did `import nltk`, which was never
    # used; only LazyCorpusLoader is needed here.
    from nltk.corpus.util import LazyCorpusLoader

    jeita = LazyCorpusLoader("jeita", ChasenCorpusReader, r".*chasen", encoding="utf-8")
    print("/".join(jeita.words()[22100:22140]))

    print(
        "\nEOS\n".join(
            "\n".join("%s/%s" % (w[0], w[1].split("\t")[2]) for w in sent)
            for sent in jeita.tagged_sents()[2170:2173]
        )
    )
def test():
    """Smoke test: tags read from the JEITA corpus come back as strings."""
    from nltk.corpus.util import LazyCorpusLoader

    jeita = LazyCorpusLoader("jeita", ChasenCorpusReader, r".*chasen", encoding="utf-8")
    # First tagged word is a (surface, tag) pair; the tag must be text.
    first_tag = jeita.tagged_words()[0][1]
    assert isinstance(first_tag, str)
- if __name__ == "__main__":
- demo()
- test()
|