chasen.py 4.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168
  1. #
  2. # Copyright (C) 2001-2020 NLTK Project
  3. # Author: Masato Hagiwara <hagisan@gmail.com>
  4. # URL: <http://nltk.org/>
  5. # For license information, see LICENSE.TXT
  6. # For more information, see http://lilyx.net/pages/nltkjapanesecorpus.html
  7. import sys
  8. from nltk.corpus.reader import util
  9. from nltk.corpus.reader.util import *
  10. from nltk.corpus.reader.api import *
  11. class ChasenCorpusReader(CorpusReader):
  12. def __init__(self, root, fileids, encoding="utf8", sent_splitter=None):
  13. self._sent_splitter = sent_splitter
  14. CorpusReader.__init__(self, root, fileids, encoding)
  15. def raw(self, fileids=None):
  16. if fileids is None:
  17. fileids = self._fileids
  18. elif isinstance(fileids, str):
  19. fileids = [fileids]
  20. return concat([self.open(f).read() for f in fileids])
  21. def words(self, fileids=None):
  22. return concat(
  23. [
  24. ChasenCorpusView(fileid, enc, False, False, False, self._sent_splitter)
  25. for (fileid, enc) in self.abspaths(fileids, True)
  26. ]
  27. )
  28. def tagged_words(self, fileids=None):
  29. return concat(
  30. [
  31. ChasenCorpusView(fileid, enc, True, False, False, self._sent_splitter)
  32. for (fileid, enc) in self.abspaths(fileids, True)
  33. ]
  34. )
  35. def sents(self, fileids=None):
  36. return concat(
  37. [
  38. ChasenCorpusView(fileid, enc, False, True, False, self._sent_splitter)
  39. for (fileid, enc) in self.abspaths(fileids, True)
  40. ]
  41. )
  42. def tagged_sents(self, fileids=None):
  43. return concat(
  44. [
  45. ChasenCorpusView(fileid, enc, True, True, False, self._sent_splitter)
  46. for (fileid, enc) in self.abspaths(fileids, True)
  47. ]
  48. )
  49. def paras(self, fileids=None):
  50. return concat(
  51. [
  52. ChasenCorpusView(fileid, enc, False, True, True, self._sent_splitter)
  53. for (fileid, enc) in self.abspaths(fileids, True)
  54. ]
  55. )
  56. def tagged_paras(self, fileids=None):
  57. return concat(
  58. [
  59. ChasenCorpusView(fileid, enc, True, True, True, self._sent_splitter)
  60. for (fileid, enc) in self.abspaths(fileids, True)
  61. ]
  62. )
  63. class ChasenCorpusView(StreamBackedCorpusView):
  64. """
  65. A specialized corpus view for ChasenReader. Similar to ``TaggedCorpusView``,
  66. but this'll use fixed sets of word and sentence tokenizer.
  67. """
  68. def __init__(
  69. self,
  70. corpus_file,
  71. encoding,
  72. tagged,
  73. group_by_sent,
  74. group_by_para,
  75. sent_splitter=None,
  76. ):
  77. self._tagged = tagged
  78. self._group_by_sent = group_by_sent
  79. self._group_by_para = group_by_para
  80. self._sent_splitter = sent_splitter
  81. StreamBackedCorpusView.__init__(self, corpus_file, encoding=encoding)
  82. def read_block(self, stream):
  83. """Reads one paragraph at a time."""
  84. block = []
  85. for para_str in read_regexp_block(stream, r".", r"^EOS\n"):
  86. para = []
  87. sent = []
  88. for line in para_str.splitlines():
  89. _eos = line.strip() == "EOS"
  90. _cells = line.split("\t")
  91. w = (_cells[0], "\t".join(_cells[1:]))
  92. if not _eos:
  93. sent.append(w)
  94. if _eos or (self._sent_splitter and self._sent_splitter(w)):
  95. if not self._tagged:
  96. sent = [w for (w, t) in sent]
  97. if self._group_by_sent:
  98. para.append(sent)
  99. else:
  100. para.extend(sent)
  101. sent = []
  102. if len(sent) > 0:
  103. if not self._tagged:
  104. sent = [w for (w, t) in sent]
  105. if self._group_by_sent:
  106. para.append(sent)
  107. else:
  108. para.extend(sent)
  109. if self._group_by_para:
  110. block.append(para)
  111. else:
  112. block.extend(para)
  113. return block
  114. def demo():
  115. import nltk
  116. from nltk.corpus.util import LazyCorpusLoader
  117. jeita = LazyCorpusLoader("jeita", ChasenCorpusReader, r".*chasen", encoding="utf-8")
  118. print("/".join(jeita.words()[22100:22140]))
  119. print(
  120. "\nEOS\n".join(
  121. "\n".join("%s/%s" % (w[0], w[1].split("\t")[2]) for w in sent)
  122. for sent in jeita.tagged_sents()[2170:2173]
  123. )
  124. )
  125. def test():
  126. from nltk.corpus.util import LazyCorpusLoader
  127. jeita = LazyCorpusLoader("jeita", ChasenCorpusReader, r".*chasen", encoding="utf-8")
  128. assert isinstance(jeita.tagged_words()[0][1], str)
  129. if __name__ == "__main__":
  130. demo()
  131. test()