toolbox.py 2.2 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283
  1. # Natural Language Toolkit: Toolbox Reader
  2. #
  3. # Copyright (C) 2001-2020 NLTK Project
  4. # Author: Greg Aumann <greg_aumann@sil.org>
  5. # Stuart Robinson <Stuart.Robinson@mpi.nl>
  6. # Steven Bird <stevenbird1@gmail.com>
  7. # URL: <http://nltk.org/>
  8. # For license information, see LICENSE.TXT
  9. """
  10. Module for reading, writing and manipulating
  11. Toolbox databases and settings files.
  12. """
  13. from nltk.toolbox import ToolboxData
  14. from nltk.corpus.reader.util import *
  15. from nltk.corpus.reader.api import *
  16. class ToolboxCorpusReader(CorpusReader):
  17. def xml(self, fileids, key=None):
  18. return concat(
  19. [
  20. ToolboxData(path, enc).parse(key=key)
  21. for (path, enc) in self.abspaths(fileids, True)
  22. ]
  23. )
  24. def fields(
  25. self,
  26. fileids,
  27. strip=True,
  28. unwrap=True,
  29. encoding="utf8",
  30. errors="strict",
  31. unicode_fields=None,
  32. ):
  33. return concat(
  34. [
  35. list(
  36. ToolboxData(fileid, enc).fields(
  37. strip, unwrap, encoding, errors, unicode_fields
  38. )
  39. )
  40. for (fileid, enc) in self.abspaths(fileids, include_encoding=True)
  41. ]
  42. )
  43. # should probably be done lazily:
  44. def entries(self, fileids, **kwargs):
  45. if "key" in kwargs:
  46. key = kwargs["key"]
  47. del kwargs["key"]
  48. else:
  49. key = "lx" # the default key in MDF
  50. entries = []
  51. for marker, contents in self.fields(fileids, **kwargs):
  52. if marker == key:
  53. entries.append((contents, []))
  54. else:
  55. try:
  56. entries[-1][-1].append((marker, contents))
  57. except IndexError:
  58. pass
  59. return entries
  60. def words(self, fileids, key="lx"):
  61. return [contents for marker, contents in self.fields(fileids) if marker == key]
  62. def raw(self, fileids):
  63. if fileids is None:
  64. fileids = self._fileids
  65. elif isinstance(fileids, str):
  66. fileids = [fileids]
  67. return concat([self.open(f).read() for f in fileids])
  68. def demo():
  69. pass
# Script entry point: call demo() only when this module is executed directly,
# not when it is imported.
if __name__ == "__main__":
    demo()