# Natural Language Toolkit: Dependency Corpus Reader
#
# Copyright (C) 2001-2020 NLTK Project
# Author: Kepa Sarasola <kepa.sarasola@ehu.es>
#         Iker Manterola <returntothehangar@hotmail.com>
#
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT

import codecs

from nltk.parse import DependencyGraph
from nltk.tokenize import *
from nltk.corpus.reader.util import *
from nltk.corpus.reader.api import *
  14. class DependencyCorpusReader(SyntaxCorpusReader):
  15. def __init__(
  16. self,
  17. root,
  18. fileids,
  19. encoding="utf8",
  20. word_tokenizer=TabTokenizer(),
  21. sent_tokenizer=RegexpTokenizer("\n", gaps=True),
  22. para_block_reader=read_blankline_block,
  23. ):
  24. # FIXME: Why is it inheritting from SyntaxCorpusReader but initializing
  25. # from CorpusReader?
  26. CorpusReader.__init__(self, root, fileids, encoding)
  27. #########################################################
  28. def raw(self, fileids=None):
  29. """
  30. :return: the given file(s) as a single string.
  31. :rtype: str
  32. """
  33. result = []
  34. for fileid, encoding in self.abspaths(fileids, include_encoding=True):
  35. if isinstance(fileid, PathPointer):
  36. result.append(fileid.open(encoding=encoding).read())
  37. else:
  38. with codecs.open(fileid, "r", encoding) as fp:
  39. result.append(fp.read())
  40. return concat(result)
  41. def words(self, fileids=None):
  42. return concat(
  43. [
  44. DependencyCorpusView(fileid, False, False, False, encoding=enc)
  45. for fileid, enc in self.abspaths(fileids, include_encoding=True)
  46. ]
  47. )
  48. def tagged_words(self, fileids=None):
  49. return concat(
  50. [
  51. DependencyCorpusView(fileid, True, False, False, encoding=enc)
  52. for fileid, enc in self.abspaths(fileids, include_encoding=True)
  53. ]
  54. )
  55. def sents(self, fileids=None):
  56. return concat(
  57. [
  58. DependencyCorpusView(fileid, False, True, False, encoding=enc)
  59. for fileid, enc in self.abspaths(fileids, include_encoding=True)
  60. ]
  61. )
  62. def tagged_sents(self, fileids=None):
  63. return concat(
  64. [
  65. DependencyCorpusView(fileid, True, True, False, encoding=enc)
  66. for fileid, enc in self.abspaths(fileids, include_encoding=True)
  67. ]
  68. )
  69. def parsed_sents(self, fileids=None):
  70. sents = concat(
  71. [
  72. DependencyCorpusView(fileid, False, True, True, encoding=enc)
  73. for fileid, enc in self.abspaths(fileids, include_encoding=True)
  74. ]
  75. )
  76. return [DependencyGraph(sent) for sent in sents]
  77. class DependencyCorpusView(StreamBackedCorpusView):
  78. _DOCSTART = "-DOCSTART- -DOCSTART- O\n" # dokumentu hasiera definitzen da
  79. def __init__(
  80. self,
  81. corpus_file,
  82. tagged,
  83. group_by_sent,
  84. dependencies,
  85. chunk_types=None,
  86. encoding="utf8",
  87. ):
  88. self._tagged = tagged
  89. self._dependencies = dependencies
  90. self._group_by_sent = group_by_sent
  91. self._chunk_types = chunk_types
  92. StreamBackedCorpusView.__init__(self, corpus_file, encoding=encoding)
  93. def read_block(self, stream):
  94. # Read the next sentence.
  95. sent = read_blankline_block(stream)[0].strip()
  96. # Strip off the docstart marker, if present.
  97. if sent.startswith(self._DOCSTART):
  98. sent = sent[len(self._DOCSTART) :].lstrip()
  99. # extract word and tag from any of the formats
  100. if not self._dependencies:
  101. lines = [line.split("\t") for line in sent.split("\n")]
  102. if len(lines[0]) == 3 or len(lines[0]) == 4:
  103. sent = [(line[0], line[1]) for line in lines]
  104. elif len(lines[0]) == 10:
  105. sent = [(line[1], line[4]) for line in lines]
  106. else:
  107. raise ValueError("Unexpected number of fields in dependency tree file")
  108. # discard tags if they weren't requested
  109. if not self._tagged:
  110. sent = [word for (word, tag) in sent]
  111. # Return the result.
  112. if self._group_by_sent:
  113. return [sent]
  114. else:
  115. return list(sent)