knbc.py 5.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192
  1. #! /usr/bin/env python
  2. # KNB Corpus reader
  3. # Copyright (C) 2001-2020 NLTK Project
  4. # Author: Masato Hagiwara <hagisan@gmail.com>
  5. # URL: <http://nltk.org/>
  6. # For license information, see LICENSE.TXT
  7. # For more information, see http://lilyx.net/pages/nltkjapanesecorpus.html
  8. import re
  9. from nltk.parse import DependencyGraph
  10. from nltk.corpus.reader.util import (
  11. FileSystemPathPointer,
  12. find_corpus_fileids,
  13. read_blankline_block,
  14. )
  15. from nltk.corpus.reader.api import SyntaxCorpusReader, CorpusReader
  16. # default function to convert morphlist to str for tree representation
  17. _morphs2str_default = lambda morphs: "/".join(m[0] for m in morphs if m[0] != "EOS")
class KNBCorpusReader(SyntaxCorpusReader):
    """
    Reader for the KNB corpus (blank-line-separated blocks of morph lines
    with bunsetsu/tag header lines).

    This class implements:
      - ``__init__``, which specifies the location of the corpus
        and a method for detecting the sentence blocks in corpus files.
      - ``_read_block``, which reads a block from the input stream.
      - ``_word``, which takes a block and returns a list of list of words.
      - ``_tag``, which takes a block and returns a list of list of tagged
        words.
      - ``_parse``, which takes a block and returns a list of parsed
        sentences.

    The structure of tagged words:
        tagged_word = (word(str), tags(tuple))
        tags = (surface, reading, lemma, pos1, posid1, pos2, posid2, pos3, posid3, others ...)

    Usage example
    -------------

    >>> from nltk.corpus.util import LazyCorpusLoader
    >>> knbc = LazyCorpusLoader(
    ...     'knbc/corpus1',
    ...     KNBCorpusReader,
    ...     r'.*/KN.*',
    ...     encoding='euc-jp',
    ... )
    >>> len(knbc.sents()[0])
    9

    """

    def __init__(self, root, fileids, encoding="utf8", morphs2str=_morphs2str_default):
        """
        Initialize KNBCorpusReader.

        :param root: root directory of the corpus.
        :param fileids: list of file ids (or a regexp) selecting corpus files.
        :param encoding: file encoding (the distributed KNB corpus is euc-jp).
        :param morphs2str: function converting a morph list to a string for
            the tree representation built by ``_parse()``.
        """
        # FIXME: Why is it inheritting from SyntaxCorpusReader but initializing
        # from CorpusReader?
        CorpusReader.__init__(self, root, fileids, encoding)
        self.morphs2str = morphs2str

    def _read_block(self, stream):
        """Return the next sentence block from *stream*."""
        # blocks are split by blankline (or EOF) - default
        return read_blankline_block(stream)

    def _word(self, t):
        """Return the list of word surface forms found in block *t*."""
        res = []
        for line in t.splitlines():
            # ignore the bunsetsu headers: lines starting with "EOS",
            # "*", "#" or "+" are sentence ends, headers or comments,
            # not morph lines
            if not re.match(r"EOS|\*|\#|\+", line):
                cells = line.strip().split(" ")
                # the first space-separated cell is the surface form
                res.append(cells[0])
        return res

    # ignores tagset argument
    def _tag(self, t, tagset=None):
        """Return ``(word, tags)`` pairs for block *t*.

        *tagset* is accepted for API compatibility but ignored.
        The tags element is the rest of the morph line re-joined with
        spaces (reading, lemma, POS fields, ...).
        """
        res = []
        for line in t.splitlines():
            # ignore the bunsetsu headers (see _word)
            if not re.match(r"EOS|\*|\#|\+", line):
                cells = line.strip().split(" ")
                # convert cells to morph tuples: (surface, joined-tags)
                res.append((cells[0], " ".join(cells[1:])))
        return res

    def _parse(self, t):
        """Build a dependency tree from block *t* and return it.

        Header lines ("*" bunsetsu / "+" tag lines) open a new node whose
        second cell encodes ``<parent-index><relation>`` (relation is one
        of A/D/I/P); subsequent morph lines are appended to the most
        recently opened node.  NOTE(review): this mutates
        ``DependencyGraph`` internals (``nodes``, ``root``, per-node
        ``deps``) directly and is order-dependent — confirm against the
        DependencyGraph API before refactoring.
        """
        dg = DependencyGraph()
        i = 0
        for line in t.splitlines():
            if line[0] in "*+":
                # start of bunsetsu or tag
                cells = line.strip().split(" ", 3)
                m = re.match(r"([\-0-9]*)([ADIP])", cells[1])
                assert m is not None
                node = dg.nodes[i]
                node.update({"address": i, "rel": m.group(2), "word": []})
                dep_parent = int(m.group(1))
                # parent index -1 marks the root node
                if dep_parent == -1:
                    dg.root = node
                else:
                    dg.nodes[dep_parent]["deps"].append(i)
                i += 1
            elif line[0] != "#":
                # normal morph line: attach to the node opened by the
                # last header (i was already incremented, hence i - 1)
                cells = line.strip().split(" ")
                # convert cells to morph tuples
                morph = cells[0], " ".join(cells[1:])
                dg.nodes[i - 1]["word"].append(morph)
        # collapse each node's morph list to a display string, if a
        # converter was configured
        if self.morphs2str:
            for node in dg.nodes.values():
                node["word"] = self.morphs2str(node["word"])
        return dg.tree()
  102. ######################################################################
  103. # Demo
  104. ######################################################################
  105. def demo():
  106. import nltk
  107. from nltk.corpus.util import LazyCorpusLoader
  108. root = nltk.data.find("corpora/knbc/corpus1")
  109. fileids = [
  110. f
  111. for f in find_corpus_fileids(FileSystemPathPointer(root), ".*")
  112. if re.search(r"\d\-\d\-[\d]+\-[\d]+", f)
  113. ]
  114. def _knbc_fileids_sort(x):
  115. cells = x.split("-")
  116. return (cells[0], int(cells[1]), int(cells[2]), int(cells[3]))
  117. knbc = LazyCorpusLoader(
  118. "knbc/corpus1",
  119. KNBCorpusReader,
  120. sorted(fileids, key=_knbc_fileids_sort),
  121. encoding="euc-jp",
  122. )
  123. print(knbc.fileids()[:10])
  124. print("".join(knbc.words()[:100]))
  125. print("\n\n".join(str(tree) for tree in knbc.parsed_sents()[:2]))
  126. knbc.morphs2str = lambda morphs: "/".join(
  127. "%s(%s)" % (m[0], m[1].split(" ")[2]) for m in morphs if m[0] != "EOS"
  128. ).encode("utf-8")
  129. print("\n\n".join("%s" % tree for tree in knbc.parsed_sents()[:2]))
  130. print(
  131. "\n".join(
  132. " ".join("%s/%s" % (w[0], w[1].split(" ")[2]) for w in sent)
  133. for sent in knbc.tagged_sents()[0:2]
  134. )
  135. )
  136. def test():
  137. from nltk.corpus.util import LazyCorpusLoader
  138. knbc = LazyCorpusLoader(
  139. "knbc/corpus1", KNBCorpusReader, r".*/KN.*", encoding="euc-jp"
  140. )
  141. assert isinstance(knbc.words()[0], str)
  142. assert isinstance(knbc.sents()[0][0], str)
  143. assert isinstance(knbc.tagged_words()[0], tuple)
  144. assert isinstance(knbc.tagged_sents()[0][0], tuple)
if __name__ == "__main__":
    # Run the demo when executed as a script (requires the KNB corpus
    # to be installed as NLTK data).
    demo()