util.py 5.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153
  1. # Natural Language Toolkit: Corpus Reader Utility Functions
  2. #
  3. # Copyright (C) 2001-2020 NLTK Project
  4. # Author: Edward Loper <edloper@gmail.com>
  5. # URL: <http://nltk.org/>
  6. # For license information, see LICENSE.TXT
  7. ######################################################################
  8. # { Lazy Corpus Loader
  9. ######################################################################
  10. import re
  11. import gc
  12. import nltk
# Lookup order used when locating a corpus root: when True, the zipped
# corpus path is tried before the unzipped path; when False (the default),
# the unzipped path is tried first and the zipped path is the fallback.
TRY_ZIPFILE_FIRST = False
  14. class LazyCorpusLoader(object):
  15. """
  16. To see the API documentation for this lazily loaded corpus, first
  17. run corpus.ensure_loaded(), and then run help(this_corpus).
  18. LazyCorpusLoader is a proxy object which is used to stand in for a
  19. corpus object before the corpus is loaded. This allows NLTK to
  20. create an object for each corpus, but defer the costs associated
  21. with loading those corpora until the first time that they're
  22. actually accessed.
  23. The first time this object is accessed in any way, it will load
  24. the corresponding corpus, and transform itself into that corpus
  25. (by modifying its own ``__class__`` and ``__dict__`` attributes).
  26. If the corpus can not be found, then accessing this object will
  27. raise an exception, displaying installation instructions for the
  28. NLTK data package. Once they've properly installed the data
  29. package (or modified ``nltk.data.path`` to point to its location),
  30. they can then use the corpus object without restarting python.
  31. :param name: The name of the corpus
  32. :type name: str
  33. :param reader_cls: The specific CorpusReader class, e.g. PlaintextCorpusReader, WordListCorpusReader
  34. :type reader: nltk.corpus.reader.api.CorpusReader
  35. :param nltk_data_subdir: The subdirectory where the corpus is stored.
  36. :type nltk_data_subdir: str
  37. :param *args: Any other non-keywords arguments that `reader_cls` might need.
  38. :param *kargs: Any other keywords arguments that `reader_cls` might need.
  39. """
  40. def __init__(self, name, reader_cls, *args, **kwargs):
  41. from nltk.corpus.reader.api import CorpusReader
  42. assert issubclass(reader_cls, CorpusReader)
  43. self.__name = self.__name__ = name
  44. self.__reader_cls = reader_cls
  45. # If nltk_data_subdir is set explicitly
  46. if "nltk_data_subdir" in kwargs:
  47. # Use the specified subdirectory path
  48. self.subdir = kwargs["nltk_data_subdir"]
  49. # Pops the `nltk_data_subdir` argument, we don't need it anymore.
  50. kwargs.pop("nltk_data_subdir", None)
  51. else: # Otherwise use 'nltk_data/corpora'
  52. self.subdir = "corpora"
  53. self.__args = args
  54. self.__kwargs = kwargs
  55. def __load(self):
  56. # Find the corpus root directory.
  57. zip_name = re.sub(r"(([^/]+)(/.*)?)", r"\2.zip/\1/", self.__name)
  58. if TRY_ZIPFILE_FIRST:
  59. try:
  60. root = nltk.data.find("{}/{}".format(self.subdir, zip_name))
  61. except LookupError as e:
  62. try:
  63. root = nltk.data.find("{}/{}".format(self.subdir, self.__name))
  64. except LookupError:
  65. raise e
  66. else:
  67. try:
  68. root = nltk.data.find("{}/{}".format(self.subdir, self.__name))
  69. except LookupError as e:
  70. try:
  71. root = nltk.data.find("{}/{}".format(self.subdir, zip_name))
  72. except LookupError:
  73. raise e
  74. # Load the corpus.
  75. corpus = self.__reader_cls(root, *self.__args, **self.__kwargs)
  76. # This is where the magic happens! Transform ourselves into
  77. # the corpus by modifying our own __dict__ and __class__ to
  78. # match that of the corpus.
  79. args, kwargs = self.__args, self.__kwargs
  80. name, reader_cls = self.__name, self.__reader_cls
  81. self.__dict__ = corpus.__dict__
  82. self.__class__ = corpus.__class__
  83. # _unload support: assign __dict__ and __class__ back, then do GC.
  84. # after reassigning __dict__ there shouldn't be any references to
  85. # corpus data so the memory should be deallocated after gc.collect()
  86. def _unload(self):
  87. lazy_reader = LazyCorpusLoader(name, reader_cls, *args, **kwargs)
  88. self.__dict__ = lazy_reader.__dict__
  89. self.__class__ = lazy_reader.__class__
  90. gc.collect()
  91. self._unload = _make_bound_method(_unload, self)
  92. def __getattr__(self, attr):
  93. # Fix for inspect.isclass under Python 2.6
  94. # (see http://bugs.python.org/issue1225107).
  95. # Without this fix tests may take extra 1.5GB RAM
  96. # because all corpora gets loaded during test collection.
  97. if attr == "__bases__":
  98. raise AttributeError("LazyCorpusLoader object has no attribute '__bases__'")
  99. self.__load()
  100. # This looks circular, but its not, since __load() changes our
  101. # __class__ to something new:
  102. return getattr(self, attr)
  103. def __repr__(self):
  104. return "<%s in %r (not loaded yet)>" % (
  105. self.__reader_cls.__name__,
  106. ".../corpora/" + self.__name,
  107. )
  108. def _unload(self):
  109. # If an exception occures during corpus loading then
  110. # '_unload' method may be unattached, so __getattr__ can be called;
  111. # we shouldn't trigger corpus loading again in this case.
  112. pass
  113. def _make_bound_method(func, self):
  114. """
  115. Magic for creating bound methods (used for _unload).
  116. """
  117. class Foo(object):
  118. def meth(self):
  119. pass
  120. f = Foo()
  121. bound_method = type(f.meth)
  122. try:
  123. return bound_method(func, self, self.__class__)
  124. except TypeError: # python3
  125. return bound_method(func, self)