| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192 |
- #! /usr/bin/env python
- # KNB Corpus reader
- # Copyright (C) 2001-2020 NLTK Project
- # Author: Masato Hagiwara <hagisan@gmail.com>
- # URL: <http://nltk.org/>
- # For license information, see LICENSE.TXT
- # For more information, see http://lilyx.net/pages/nltkjapanesecorpus.html
- import re
- from nltk.parse import DependencyGraph
- from nltk.corpus.reader.util import (
- FileSystemPathPointer,
- find_corpus_fileids,
- read_blankline_block,
- )
- from nltk.corpus.reader.api import SyntaxCorpusReader, CorpusReader
- # default function to convert morphlist to str for tree representation
- _morphs2str_default = lambda morphs: "/".join(m[0] for m in morphs if m[0] != "EOS")
class KNBCorpusReader(SyntaxCorpusReader):
    """
    Reader for the KNB annotated blog corpus.

    This class implements:
      - ``__init__``, which specifies the location of the corpus
        and a method for detecting the sentence blocks in corpus files.
      - ``_read_block``, which reads a block from the input stream.
      - ``_word``, which takes a block and returns a list of words.
      - ``_tag``, which takes a block and returns a list of tagged words.
      - ``_parse``, which takes a block and returns a parsed tree.

    The structure of tagged words:
        tagged_word = (word(str), tags(str))
        where tags is the space-joined annotation fields
        (reading, lemma, pos1, posid1, pos2, posid2, pos3, posid3, ...).

    Usage example
    -------------
    >>> from nltk.corpus.util import LazyCorpusLoader
    >>> knbc = LazyCorpusLoader(
    ...     'knbc/corpus1',
    ...     KNBCorpusReader,
    ...     r'.*/KN.*',
    ...     encoding='euc-jp',
    ... )
    >>> len(knbc.sents()[0])
    9
    """

    def __init__(self, root, fileids, encoding="utf8", morphs2str=_morphs2str_default):
        """
        Initialize KNBCorpusReader.

        :param root: corpus root directory
        :param fileids: list of fileids, or a regexp selecting them
        :param encoding: file encoding (the KNB corpus itself is EUC-JP)
        :param morphs2str: function converting a morph list to a string;
            used by ``_parse()`` when building the tree representation
        """
        # FIXME: Why is it inheriting from SyntaxCorpusReader but initializing
        # from CorpusReader?
        CorpusReader.__init__(self, root, fileids, encoding)
        self.morphs2str = morphs2str

    def _read_block(self, stream):
        # Blocks are separated by blank lines (or EOF) - default behavior.
        return read_blankline_block(stream)

    def _word(self, t):
        """Return the surface forms in block *t*, skipping header lines."""
        # Header lines begin with EOS, '*', '#' or '+' (bunsetsu/tag markers);
        # a morph line's first space-separated cell is the surface form.
        return [
            line.strip().split(" ")[0]
            for line in t.splitlines()
            if not re.match(r"EOS|\*|\#|\+", line)
        ]

    # Ignores the tagset argument (KNB uses its own annotation scheme).
    def _tag(self, t, tagset=None):
        """Return (surface, annotation) pairs for the morphs in block *t*."""
        res = []
        for line in t.splitlines():
            # Ignore the bunsetsu/tag header lines.
            if not re.match(r"EOS|\*|\#|\+", line):
                cells = line.strip().split(" ")
                # First cell is the surface form; the rest is the space-joined
                # annotation string (reading, lemma, POS fields, ...).
                res.append((cells[0], " ".join(cells[1:])))
        return res

    def _parse(self, t):
        """Build a dependency tree from block *t* and return it."""
        dg = DependencyGraph()
        i = 0
        for line in t.splitlines():
            if not line:
                # Robustness: an empty line would make line[0] below raise
                # IndexError; skip it.
                continue
            if line[0] in "*+":
                # Start of a bunsetsu ('*') or tag ('+') header, whose second
                # cell encodes "<parent-index><relation>", e.g. "1D" or "-1D".
                cells = line.strip().split(" ", 3)
                m = re.match(r"([\-0-9]*)([ADIP])", cells[1])
                if m is None:
                    # Raise instead of assert: asserts are stripped under
                    # "python -O", silently hiding malformed input.
                    raise ValueError("malformed dependency field: %r" % cells[1])

                node = dg.nodes[i]
                node.update({"address": i, "rel": m.group(2), "word": []})

                dep_parent = int(m.group(1))
                if dep_parent == -1:
                    # Parent index -1 marks the root of the dependency tree.
                    dg.root = node
                else:
                    dg.nodes[dep_parent]["deps"].append(i)
                i += 1
            elif line[0] != "#":
                # Normal morph line: attach it to the current bunsetsu/tag.
                cells = line.strip().split(" ")
                # Convert the cells to a (surface, annotation) morph tuple.
                morph = cells[0], " ".join(cells[1:])
                dg.nodes[i - 1]["word"].append(morph)

        if self.morphs2str:
            # Collapse each node's morph list into a single display string.
            for node in dg.nodes.values():
                node["word"] = self.morphs2str(node["word"])

        return dg.tree()
- ######################################################################
- # Demo
- ######################################################################
def demo():
    """Demonstrate the KNBCorpusReader on the installed KNB corpus data."""
    import nltk
    from nltk.corpus.util import LazyCorpusLoader

    root = nltk.data.find("corpora/knbc/corpus1")
    # Keep only the numbered article files (names like "...1-1-01-01").
    fileids = [
        f
        for f in find_corpus_fileids(FileSystemPathPointer(root), ".*")
        if re.search(r"\d\-\d\-[\d]+\-[\d]+", f)
    ]

    def _knbc_fileids_sort(x):
        # Sort key: leading article id as a string, then the three numeric
        # components as integers so "10" sorts after "9".
        cells = x.split("-")
        return (cells[0], int(cells[1]), int(cells[2]), int(cells[3]))

    knbc = LazyCorpusLoader(
        "knbc/corpus1",
        KNBCorpusReader,
        sorted(fileids, key=_knbc_fileids_sort),
        encoding="euc-jp",
    )

    print(knbc.fileids()[:10])
    print("".join(knbc.words()[:100]))
    print("\n\n".join(str(tree) for tree in knbc.parsed_sents()[:2]))

    # Show each morph together with its POS field.  NOTE: the original code
    # appended .encode("utf-8") here — a Python 2 leftover that makes the
    # node labels bytes under Python 3 and corrupts the tree display; the
    # function must return str.
    knbc.morphs2str = lambda morphs: "/".join(
        "%s(%s)" % (m[0], m[1].split(" ")[2]) for m in morphs if m[0] != "EOS"
    )

    print("\n\n".join("%s" % tree for tree in knbc.parsed_sents()[:2]))

    print(
        "\n".join(
            " ".join("%s/%s" % (w[0], w[1].split(" ")[2]) for w in sent)
            for sent in knbc.tagged_sents()[0:2]
        )
    )
def test():
    """Smoke-test: the reader yields str words and tuple tagged words."""
    from nltk.corpus.util import LazyCorpusLoader

    knbc = LazyCorpusLoader(
        "knbc/corpus1", KNBCorpusReader, r".*/KN.*", encoding="euc-jp"
    )

    # Each (value, type) pair must hold for the reader to be usable.
    checks = (
        (knbc.words()[0], str),
        (knbc.sents()[0][0], str),
        (knbc.tagged_words()[0], tuple),
        (knbc.tagged_sents()[0][0], tuple),
    )
    for value, expected_type in checks:
        assert isinstance(value, expected_type)
# Run the interactive demo when executed as a script (not on import).
if __name__ == "__main__":
    demo()
|