memory.py 38 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003
  1. """
  2. A context object for caching a function's return value each time it
  3. is called with the same input arguments.
  4. """
  5. # Author: Gael Varoquaux <gael dot varoquaux at normalesup dot org>
  6. # Copyright (c) 2009 Gael Varoquaux
  7. # License: BSD Style, 3 clauses.
  8. from __future__ import with_statement
  9. import os
  10. import time
  11. import pathlib
  12. import pydoc
  13. import re
  14. import functools
  15. import traceback
  16. import warnings
  17. import inspect
  18. import sys
  19. import weakref
  20. from tokenize import open as open_py_source
  21. # Local imports
  22. from . import hashing
  23. from .func_inspect import get_func_code, get_func_name, filter_args
  24. from .func_inspect import format_call
  25. from .func_inspect import format_signature
  26. from .logger import Logger, format_time, pformat
  27. from ._store_backends import StoreBackendBase, FileSystemStoreBackend
  28. FIRST_LINE_TEXT = "# first line:"
  29. # TODO: The following object should have a data store object as a sub
  30. # object, and the interface to persist and query should be separated in
  31. # the data store.
  32. #
  33. # This would enable creating 'Memory' objects with a different logic for
  34. # pickling that would simply span a MemorizedFunc with the same
  35. # store (or do we want to copy it to avoid cross-talks?), for instance to
  36. # implement HDF5 pickling.
  37. # TODO: Same remark for the logger, and probably use the Python logging
  38. # mechanism.
  39. def extract_first_line(func_code):
  40. """ Extract the first line information from the function code
  41. text if available.
  42. """
  43. if func_code.startswith(FIRST_LINE_TEXT):
  44. func_code = func_code.split('\n')
  45. first_line = int(func_code[0][len(FIRST_LINE_TEXT):])
  46. func_code = '\n'.join(func_code[1:])
  47. else:
  48. first_line = -1
  49. return func_code, first_line
  50. class JobLibCollisionWarning(UserWarning):
  51. """ Warn that there might be a collision between names of functions.
  52. """
  53. _STORE_BACKENDS = {'local': FileSystemStoreBackend}
  54. def register_store_backend(backend_name, backend):
  55. """Extend available store backends.
  56. The Memory, MemorizeResult and MemorizeFunc objects are designed to be
  57. agnostic to the type of store used behind. By default, the local file
  58. system is used but this function gives the possibility to extend joblib's
  59. memory pattern with other types of storage such as cloud storage (S3, GCS,
  60. OpenStack, HadoopFS, etc) or blob DBs.
  61. Parameters
  62. ----------
  63. backend_name: str
  64. The name identifying the store backend being registered. For example,
  65. 'local' is used with FileSystemStoreBackend.
  66. backend: StoreBackendBase subclass
  67. The name of a class that implements the StoreBackendBase interface.
  68. """
  69. if not isinstance(backend_name, str):
  70. raise ValueError("Store backend name should be a string, "
  71. "'{0}' given.".format(backend_name))
  72. if backend is None or not issubclass(backend, StoreBackendBase):
  73. raise ValueError("Store backend should inherit "
  74. "StoreBackendBase, "
  75. "'{0}' given.".format(backend))
  76. _STORE_BACKENDS[backend_name] = backend
  77. def _store_backend_factory(backend, location, verbose=0, backend_options=None):
  78. """Return the correct store object for the given location."""
  79. if backend_options is None:
  80. backend_options = {}
  81. if isinstance(location, pathlib.Path):
  82. location = str(location)
  83. if isinstance(location, StoreBackendBase):
  84. return location
  85. elif isinstance(location, str):
  86. obj = None
  87. location = os.path.expanduser(location)
  88. # The location is not a local file system, we look in the
  89. # registered backends if there's one matching the given backend
  90. # name.
  91. for backend_key, backend_obj in _STORE_BACKENDS.items():
  92. if backend == backend_key:
  93. obj = backend_obj()
  94. # By default, we assume the FileSystemStoreBackend can be used if no
  95. # matching backend could be found.
  96. if obj is None:
  97. raise TypeError('Unknown location {0} or backend {1}'.format(
  98. location, backend))
  99. # The store backend is configured with the extra named parameters,
  100. # some of them are specific to the underlying store backend.
  101. obj.configure(location, verbose=verbose,
  102. backend_options=backend_options)
  103. return obj
  104. elif location is not None:
  105. warnings.warn(
  106. "Instanciating a backend using a {} as a location is not "
  107. "supported by joblib. Returning None instead.".format(
  108. location.__class__.__name__), UserWarning)
  109. return None
  110. def _get_func_fullname(func):
  111. """Compute the part of part associated with a function."""
  112. modules, funcname = get_func_name(func)
  113. modules.append(funcname)
  114. return os.path.join(*modules)
  115. def _build_func_identifier(func):
  116. """Build a roughly unique identifier for the cached function."""
  117. parts = []
  118. if isinstance(func, str):
  119. parts.append(func)
  120. else:
  121. parts.append(_get_func_fullname(func))
  122. # We reuse historical fs-like way of building a function identifier
  123. return os.path.join(*parts)
  124. def _format_load_msg(func_id, args_id, timestamp=None, metadata=None):
  125. """ Helper function to format the message when loading the results.
  126. """
  127. signature = ""
  128. try:
  129. if metadata is not None:
  130. args = ", ".join(['%s=%s' % (name, value)
  131. for name, value
  132. in metadata['input_args'].items()])
  133. signature = "%s(%s)" % (os.path.basename(func_id), args)
  134. else:
  135. signature = os.path.basename(func_id)
  136. except KeyError:
  137. pass
  138. if timestamp is not None:
  139. ts_string = "{0: <16}".format(format_time(time.time() - timestamp))
  140. else:
  141. ts_string = ""
  142. return '[Memory]{0}: Loading {1}'.format(ts_string, str(signature))
  143. # An in-memory store to avoid looking at the disk-based function
  144. # source code to check if a function definition has changed
  145. _FUNCTION_HASHES = weakref.WeakKeyDictionary()
  146. ###############################################################################
  147. # class `MemorizedResult`
  148. ###############################################################################
  149. class MemorizedResult(Logger):
  150. """Object representing a cached value.
  151. Attributes
  152. ----------
  153. location: str
  154. The location of joblib cache. Depends on the store backend used.
  155. func: function or str
  156. function whose output is cached. The string case is intended only for
  157. instanciation based on the output of repr() on another instance.
  158. (namely eval(repr(memorized_instance)) works).
  159. argument_hash: str
  160. hash of the function arguments.
  161. backend: str
  162. Type of store backend for reading/writing cache files.
  163. Default is 'local'.
  164. mmap_mode: {None, 'r+', 'r', 'w+', 'c'}
  165. The memmapping mode used when loading from cache numpy arrays. See
  166. numpy.load for the meaning of the different values.
  167. verbose: int
  168. verbosity level (0 means no message).
  169. timestamp, metadata: string
  170. for internal use only.
  171. """
  172. def __init__(self, location, func, args_id, backend='local',
  173. mmap_mode=None, verbose=0, timestamp=None, metadata=None):
  174. Logger.__init__(self)
  175. self.func_id = _build_func_identifier(func)
  176. if isinstance(func, str):
  177. self.func = func
  178. else:
  179. self.func = self.func_id
  180. self.args_id = args_id
  181. self.store_backend = _store_backend_factory(backend, location,
  182. verbose=verbose)
  183. self.mmap_mode = mmap_mode
  184. if metadata is not None:
  185. self.metadata = metadata
  186. else:
  187. self.metadata = self.store_backend.get_metadata(
  188. [self.func_id, self.args_id])
  189. self.duration = self.metadata.get('duration', None)
  190. self.verbose = verbose
  191. self.timestamp = timestamp
  192. @property
  193. def argument_hash(self):
  194. warnings.warn(
  195. "The 'argument_hash' attribute has been deprecated in version "
  196. "0.12 and will be removed in version 0.14.\n"
  197. "Use `args_id` attribute instead.",
  198. DeprecationWarning, stacklevel=2)
  199. return self.args_id
  200. def get(self):
  201. """Read value from cache and return it."""
  202. if self.verbose:
  203. msg = _format_load_msg(self.func_id, self.args_id,
  204. timestamp=self.timestamp,
  205. metadata=self.metadata)
  206. else:
  207. msg = None
  208. try:
  209. return self.store_backend.load_item(
  210. [self.func_id, self.args_id], msg=msg, verbose=self.verbose)
  211. except ValueError as exc:
  212. new_exc = KeyError(
  213. "Error while trying to load a MemorizedResult's value. "
  214. "It seems that this folder is corrupted : {}".format(
  215. os.path.join(
  216. self.store_backend.location, self.func_id,
  217. self.args_id)
  218. ))
  219. raise new_exc from exc
  220. def clear(self):
  221. """Clear value from cache"""
  222. self.store_backend.clear_item([self.func_id, self.args_id])
  223. def __repr__(self):
  224. return ('{class_name}(location="{location}", func="{func}", '
  225. 'args_id="{args_id}")'
  226. .format(class_name=self.__class__.__name__,
  227. location=self.store_backend.location,
  228. func=self.func,
  229. args_id=self.args_id
  230. ))
  231. def __getstate__(self):
  232. state = self.__dict__.copy()
  233. state['timestamp'] = None
  234. return state
  235. class NotMemorizedResult(object):
  236. """Class representing an arbitrary value.
  237. This class is a replacement for MemorizedResult when there is no cache.
  238. """
  239. __slots__ = ('value', 'valid')
  240. def __init__(self, value):
  241. self.value = value
  242. self.valid = True
  243. def get(self):
  244. if self.valid:
  245. return self.value
  246. else:
  247. raise KeyError("No value stored.")
  248. def clear(self):
  249. self.valid = False
  250. self.value = None
  251. def __repr__(self):
  252. if self.valid:
  253. return ('{class_name}({value})'
  254. .format(class_name=self.__class__.__name__,
  255. value=pformat(self.value)))
  256. else:
  257. return self.__class__.__name__ + ' with no value'
  258. # __getstate__ and __setstate__ are required because of __slots__
  259. def __getstate__(self):
  260. return {"valid": self.valid, "value": self.value}
  261. def __setstate__(self, state):
  262. self.valid = state["valid"]
  263. self.value = state["value"]
  264. ###############################################################################
  265. # class `NotMemorizedFunc`
  266. ###############################################################################
  267. class NotMemorizedFunc(object):
  268. """No-op object decorating a function.
  269. This class replaces MemorizedFunc when there is no cache. It provides an
  270. identical API but does not write anything on disk.
  271. Attributes
  272. ----------
  273. func: callable
  274. Original undecorated function.
  275. """
  276. # Should be a light as possible (for speed)
  277. def __init__(self, func):
  278. self.func = func
  279. def __call__(self, *args, **kwargs):
  280. return self.func(*args, **kwargs)
  281. def call_and_shelve(self, *args, **kwargs):
  282. return NotMemorizedResult(self.func(*args, **kwargs))
  283. def __repr__(self):
  284. return '{0}(func={1})'.format(self.__class__.__name__, self.func)
  285. def clear(self, warn=True):
  286. # Argument "warn" is for compatibility with MemorizedFunc.clear
  287. pass
  288. ###############################################################################
  289. # class `MemorizedFunc`
  290. ###############################################################################
  291. class MemorizedFunc(Logger):
  292. """Callable object decorating a function for caching its return value
  293. each time it is called.
  294. Methods are provided to inspect the cache or clean it.
  295. Attributes
  296. ----------
  297. func: callable
  298. The original, undecorated, function.
  299. location: string
  300. The location of joblib cache. Depends on the store backend used.
  301. backend: str
  302. Type of store backend for reading/writing cache files.
  303. Default is 'local', in which case the location is the path to a
  304. disk storage.
  305. ignore: list or None
  306. List of variable names to ignore when choosing whether to
  307. recompute.
  308. mmap_mode: {None, 'r+', 'r', 'w+', 'c'}
  309. The memmapping mode used when loading from cache
  310. numpy arrays. See numpy.load for the meaning of the different
  311. values.
  312. compress: boolean, or integer
  313. Whether to zip the stored data on disk. If an integer is
  314. given, it should be between 1 and 9, and sets the amount
  315. of compression. Note that compressed arrays cannot be
  316. read by memmapping.
  317. verbose: int, optional
  318. The verbosity flag, controls messages that are issued as
  319. the function is evaluated.
  320. """
  321. # ------------------------------------------------------------------------
  322. # Public interface
  323. # ------------------------------------------------------------------------
  324. def __init__(self, func, location, backend='local', ignore=None,
  325. mmap_mode=None, compress=False, verbose=1, timestamp=None):
  326. Logger.__init__(self)
  327. self.mmap_mode = mmap_mode
  328. self.compress = compress
  329. self.func = func
  330. if ignore is None:
  331. ignore = []
  332. self.ignore = ignore
  333. self._verbose = verbose
  334. # retrieve store object from backend type and location.
  335. self.store_backend = _store_backend_factory(backend, location,
  336. verbose=verbose,
  337. backend_options=dict(
  338. compress=compress,
  339. mmap_mode=mmap_mode),
  340. )
  341. if self.store_backend is not None:
  342. # Create func directory on demand.
  343. self.store_backend.\
  344. store_cached_func_code([_build_func_identifier(self.func)])
  345. if timestamp is None:
  346. timestamp = time.time()
  347. self.timestamp = timestamp
  348. try:
  349. functools.update_wrapper(self, func)
  350. except:
  351. " Objects like ufunc don't like that "
  352. if inspect.isfunction(func):
  353. doc = pydoc.TextDoc().document(func)
  354. # Remove blank line
  355. doc = doc.replace('\n', '\n\n', 1)
  356. # Strip backspace-overprints for compatibility with autodoc
  357. doc = re.sub('\x08.', '', doc)
  358. else:
  359. # Pydoc does a poor job on other objects
  360. doc = func.__doc__
  361. self.__doc__ = 'Memoized version of %s' % doc
  362. def _cached_call(self, args, kwargs, shelving=False):
  363. """Call wrapped function and cache result, or read cache if available.
  364. This function returns the wrapped function output and some metadata.
  365. Arguments:
  366. ----------
  367. args, kwargs: list and dict
  368. input arguments for wrapped function
  369. shelving: bool
  370. True when called via the call_and_shelve function.
  371. Returns
  372. -------
  373. output: value or tuple or None
  374. Output of the wrapped function.
  375. If shelving is True and the call has been already cached,
  376. output is None.
  377. argument_hash: string
  378. Hash of function arguments.
  379. metadata: dict
  380. Some metadata about wrapped function call (see _persist_input()).
  381. """
  382. func_id, args_id = self._get_output_identifiers(*args, **kwargs)
  383. metadata = None
  384. msg = None
  385. # Wether or not the memorized function must be called
  386. must_call = False
  387. # FIXME: The statements below should be try/excepted
  388. # Compare the function code with the previous to see if the
  389. # function code has changed
  390. if not (self._check_previous_func_code(stacklevel=4) and
  391. self.store_backend.contains_item([func_id, args_id])):
  392. if self._verbose > 10:
  393. _, name = get_func_name(self.func)
  394. self.warn('Computing func {0}, argument hash {1} '
  395. 'in location {2}'
  396. .format(name, args_id,
  397. self.store_backend.
  398. get_cached_func_info([func_id])['location']))
  399. must_call = True
  400. else:
  401. try:
  402. t0 = time.time()
  403. if self._verbose:
  404. msg = _format_load_msg(func_id, args_id,
  405. timestamp=self.timestamp,
  406. metadata=metadata)
  407. if not shelving:
  408. # When shelving, we do not need to load the output
  409. out = self.store_backend.load_item(
  410. [func_id, args_id],
  411. msg=msg,
  412. verbose=self._verbose)
  413. else:
  414. out = None
  415. if self._verbose > 4:
  416. t = time.time() - t0
  417. _, name = get_func_name(self.func)
  418. msg = '%s cache loaded - %s' % (name, format_time(t))
  419. print(max(0, (80 - len(msg))) * '_' + msg)
  420. except Exception:
  421. # XXX: Should use an exception logger
  422. _, signature = format_signature(self.func, *args, **kwargs)
  423. self.warn('Exception while loading results for '
  424. '{}\n {}'.format(signature, traceback.format_exc()))
  425. must_call = True
  426. if must_call:
  427. out, metadata = self.call(*args, **kwargs)
  428. if self.mmap_mode is not None:
  429. # Memmap the output at the first call to be consistent with
  430. # later calls
  431. if self._verbose:
  432. msg = _format_load_msg(func_id, args_id,
  433. timestamp=self.timestamp,
  434. metadata=metadata)
  435. out = self.store_backend.load_item([func_id, args_id], msg=msg,
  436. verbose=self._verbose)
  437. return (out, args_id, metadata)
  438. def call_and_shelve(self, *args, **kwargs):
  439. """Call wrapped function, cache result and return a reference.
  440. This method returns a reference to the cached result instead of the
  441. result itself. The reference object is small and pickeable, allowing
  442. to send or store it easily. Call .get() on reference object to get
  443. result.
  444. Returns
  445. -------
  446. cached_result: MemorizedResult or NotMemorizedResult
  447. reference to the value returned by the wrapped function. The
  448. class "NotMemorizedResult" is used when there is no cache
  449. activated (e.g. location=None in Memory).
  450. """
  451. _, args_id, metadata = self._cached_call(args, kwargs, shelving=True)
  452. return MemorizedResult(self.store_backend, self.func, args_id,
  453. metadata=metadata, verbose=self._verbose - 1,
  454. timestamp=self.timestamp)
  455. def __call__(self, *args, **kwargs):
  456. return self._cached_call(args, kwargs)[0]
  457. def __getstate__(self):
  458. """ We don't store the timestamp when pickling, to avoid the hash
  459. depending from it.
  460. """
  461. state = self.__dict__.copy()
  462. state['timestamp'] = None
  463. return state
  464. # ------------------------------------------------------------------------
  465. # Private interface
  466. # ------------------------------------------------------------------------
  467. def _get_argument_hash(self, *args, **kwargs):
  468. return hashing.hash(filter_args(self.func, self.ignore, args, kwargs),
  469. coerce_mmap=(self.mmap_mode is not None))
  470. def _get_output_identifiers(self, *args, **kwargs):
  471. """Return the func identifier and input parameter hash of a result."""
  472. func_id = _build_func_identifier(self.func)
  473. argument_hash = self._get_argument_hash(*args, **kwargs)
  474. return func_id, argument_hash
  475. def _hash_func(self):
  476. """Hash a function to key the online cache"""
  477. func_code_h = hash(getattr(self.func, '__code__', None))
  478. return id(self.func), hash(self.func), func_code_h
  479. def _write_func_code(self, func_code, first_line):
  480. """ Write the function code and the filename to a file.
  481. """
  482. # We store the first line because the filename and the function
  483. # name is not always enough to identify a function: people
  484. # sometimes have several functions named the same way in a
  485. # file. This is bad practice, but joblib should be robust to bad
  486. # practice.
  487. func_id = _build_func_identifier(self.func)
  488. func_code = u'%s %i\n%s' % (FIRST_LINE_TEXT, first_line, func_code)
  489. self.store_backend.store_cached_func_code([func_id], func_code)
  490. # Also store in the in-memory store of function hashes
  491. is_named_callable = False
  492. is_named_callable = (hasattr(self.func, '__name__') and
  493. self.func.__name__ != '<lambda>')
  494. if is_named_callable:
  495. # Don't do this for lambda functions or strange callable
  496. # objects, as it ends up being too fragile
  497. func_hash = self._hash_func()
  498. try:
  499. _FUNCTION_HASHES[self.func] = func_hash
  500. except TypeError:
  501. # Some callable are not hashable
  502. pass
  503. def _check_previous_func_code(self, stacklevel=2):
  504. """
  505. stacklevel is the depth a which this function is called, to
  506. issue useful warnings to the user.
  507. """
  508. # First check if our function is in the in-memory store.
  509. # Using the in-memory store not only makes things faster, but it
  510. # also renders us robust to variations of the files when the
  511. # in-memory version of the code does not vary
  512. try:
  513. if self.func in _FUNCTION_HASHES:
  514. # We use as an identifier the id of the function and its
  515. # hash. This is more likely to falsely change than have hash
  516. # collisions, thus we are on the safe side.
  517. func_hash = self._hash_func()
  518. if func_hash == _FUNCTION_HASHES[self.func]:
  519. return True
  520. except TypeError:
  521. # Some callables are not hashable
  522. pass
  523. # Here, we go through some effort to be robust to dynamically
  524. # changing code and collision. We cannot inspect.getsource
  525. # because it is not reliable when using IPython's magic "%run".
  526. func_code, source_file, first_line = get_func_code(self.func)
  527. func_id = _build_func_identifier(self.func)
  528. try:
  529. old_func_code, old_first_line =\
  530. extract_first_line(
  531. self.store_backend.get_cached_func_code([func_id]))
  532. except (IOError, OSError): # some backend can also raise OSError
  533. self._write_func_code(func_code, first_line)
  534. return False
  535. if old_func_code == func_code:
  536. return True
  537. # We have differing code, is this because we are referring to
  538. # different functions, or because the function we are referring to has
  539. # changed?
  540. _, func_name = get_func_name(self.func, resolv_alias=False,
  541. win_characters=False)
  542. if old_first_line == first_line == -1 or func_name == '<lambda>':
  543. if not first_line == -1:
  544. func_description = ("{0} ({1}:{2})"
  545. .format(func_name, source_file,
  546. first_line))
  547. else:
  548. func_description = func_name
  549. warnings.warn(JobLibCollisionWarning(
  550. "Cannot detect name collisions for function '{0}'"
  551. .format(func_description)), stacklevel=stacklevel)
  552. # Fetch the code at the old location and compare it. If it is the
  553. # same than the code store, we have a collision: the code in the
  554. # file has not changed, but the name we have is pointing to a new
  555. # code block.
  556. if not old_first_line == first_line and source_file is not None:
  557. possible_collision = False
  558. if os.path.exists(source_file):
  559. _, func_name = get_func_name(self.func, resolv_alias=False)
  560. num_lines = len(func_code.split('\n'))
  561. with open_py_source(source_file) as f:
  562. on_disk_func_code = f.readlines()[
  563. old_first_line - 1:old_first_line - 1 + num_lines - 1]
  564. on_disk_func_code = ''.join(on_disk_func_code)
  565. possible_collision = (on_disk_func_code.rstrip() ==
  566. old_func_code.rstrip())
  567. else:
  568. possible_collision = source_file.startswith('<doctest ')
  569. if possible_collision:
  570. warnings.warn(JobLibCollisionWarning(
  571. 'Possible name collisions between functions '
  572. "'%s' (%s:%i) and '%s' (%s:%i)" %
  573. (func_name, source_file, old_first_line,
  574. func_name, source_file, first_line)),
  575. stacklevel=stacklevel)
  576. # The function has changed, wipe the cache directory.
  577. # XXX: Should be using warnings, and giving stacklevel
  578. if self._verbose > 10:
  579. _, func_name = get_func_name(self.func, resolv_alias=False)
  580. self.warn("Function {0} (identified by {1}) has changed"
  581. ".".format(func_name, func_id))
  582. self.clear(warn=True)
  583. return False
  584. def clear(self, warn=True):
  585. """Empty the function's cache."""
  586. func_id = _build_func_identifier(self.func)
  587. if self._verbose > 0 and warn:
  588. self.warn("Clearing function cache identified by %s" % func_id)
  589. self.store_backend.clear_path([func_id, ])
  590. func_code, _, first_line = get_func_code(self.func)
  591. self._write_func_code(func_code, first_line)
  592. def call(self, *args, **kwargs):
  593. """ Force the execution of the function with the given arguments and
  594. persist the output values.
  595. """
  596. start_time = time.time()
  597. func_id, args_id = self._get_output_identifiers(*args, **kwargs)
  598. if self._verbose > 0:
  599. print(format_call(self.func, args, kwargs))
  600. output = self.func(*args, **kwargs)
  601. self.store_backend.dump_item(
  602. [func_id, args_id], output, verbose=self._verbose)
  603. duration = time.time() - start_time
  604. metadata = self._persist_input(duration, args, kwargs)
  605. if self._verbose > 0:
  606. _, name = get_func_name(self.func)
  607. msg = '%s - %s' % (name, format_time(duration))
  608. print(max(0, (80 - len(msg))) * '_' + msg)
  609. return output, metadata
  610. def _persist_input(self, duration, args, kwargs, this_duration_limit=0.5):
  611. """ Save a small summary of the call using json format in the
  612. output directory.
  613. output_dir: string
  614. directory where to write metadata.
  615. duration: float
  616. time taken by hashing input arguments, calling the wrapped
  617. function and persisting its output.
  618. args, kwargs: list and dict
  619. input arguments for wrapped function
  620. this_duration_limit: float
  621. Max execution time for this function before issuing a warning.
  622. """
  623. start_time = time.time()
  624. argument_dict = filter_args(self.func, self.ignore,
  625. args, kwargs)
  626. input_repr = dict((k, repr(v)) for k, v in argument_dict.items())
  627. # This can fail due to race-conditions with multiple
  628. # concurrent joblibs removing the file or the directory
  629. metadata = {"duration": duration, "input_args": input_repr}
  630. func_id, args_id = self._get_output_identifiers(*args, **kwargs)
  631. self.store_backend.store_metadata([func_id, args_id], metadata)
  632. this_duration = time.time() - start_time
  633. if this_duration > this_duration_limit:
  634. # This persistence should be fast. It will not be if repr() takes
  635. # time and its output is large, because json.dump will have to
  636. # write a large file. This should not be an issue with numpy arrays
  637. # for which repr() always output a short representation, but can
  638. # be with complex dictionaries. Fixing the problem should be a
  639. # matter of replacing repr() above by something smarter.
  640. warnings.warn("Persisting input arguments took %.2fs to run.\n"
  641. "If this happens often in your code, it can cause "
  642. "performance problems \n"
  643. "(results will be correct in all cases). \n"
  644. "The reason for this is probably some large input "
  645. "arguments for a wrapped\n"
  646. " function (e.g. large strings).\n"
  647. "THIS IS A JOBLIB ISSUE. If you can, kindly provide "
  648. "the joblib's team with an\n"
  649. " example so that they can fix the problem."
  650. % this_duration, stacklevel=5)
  651. return metadata
  652. # XXX: Need a method to check if results are available.
  653. # ------------------------------------------------------------------------
  654. # Private `object` interface
  655. # ------------------------------------------------------------------------
  656. def __repr__(self):
  657. return '{class_name}(func={func}, location={location})'.format(
  658. class_name=self.__class__.__name__,
  659. func=self.func,
  660. location=self.store_backend.location,)
  661. ###############################################################################
  662. # class `Memory`
  663. ###############################################################################
  664. class Memory(Logger):
  665. """ A context object for caching a function's return value each time it
  666. is called with the same input arguments.
  667. All values are cached on the filesystem, in a deep directory
  668. structure.
  669. Read more in the :ref:`User Guide <memory>`.
  670. Parameters
  671. ----------
  672. location: str or None
  673. The path of the base directory to use as a data store
  674. or None. If None is given, no caching is done and
  675. the Memory object is completely transparent. This option
  676. replaces cachedir since version 0.12.
  677. backend: str, optional
  678. Type of store backend for reading/writing cache files.
  679. Default: 'local'.
  680. The 'local' backend is using regular filesystem operations to
  681. manipulate data (open, mv, etc) in the backend.
  682. cachedir: str or None, optional
  683. .. deprecated: 0.12
  684. 'cachedir' has been deprecated in 0.12 and will be
  685. removed in 0.14. Use the 'location' parameter instead.
  686. mmap_mode: {None, 'r+', 'r', 'w+', 'c'}, optional
  687. The memmapping mode used when loading from cache
  688. numpy arrays. See numpy.load for the meaning of the
  689. arguments.
  690. compress: boolean, or integer, optional
  691. Whether to zip the stored data on disk. If an integer is
  692. given, it should be between 1 and 9, and sets the amount
  693. of compression. Note that compressed arrays cannot be
  694. read by memmapping.
  695. verbose: int, optional
  696. Verbosity flag, controls the debug messages that are issued
  697. as functions are evaluated.
  698. bytes_limit: int, optional
  699. Limit in bytes of the size of the cache.
  700. backend_options: dict, optional
  701. Contains a dictionnary of named parameters used to configure
  702. the store backend.
  703. """
  704. # ------------------------------------------------------------------------
  705. # Public interface
  706. # ------------------------------------------------------------------------
  707. def __init__(self, location=None, backend='local', cachedir=None,
  708. mmap_mode=None, compress=False, verbose=1, bytes_limit=None,
  709. backend_options=None):
  710. # XXX: Bad explanation of the None value of cachedir
  711. Logger.__init__(self)
  712. self._verbose = verbose
  713. self.mmap_mode = mmap_mode
  714. self.timestamp = time.time()
  715. self.bytes_limit = bytes_limit
  716. self.backend = backend
  717. self.compress = compress
  718. if backend_options is None:
  719. backend_options = {}
  720. self.backend_options = backend_options
  721. if compress and mmap_mode is not None:
  722. warnings.warn('Compressed results cannot be memmapped',
  723. stacklevel=2)
  724. if cachedir is not None:
  725. if location is not None:
  726. raise ValueError(
  727. 'You set both "location={0!r} and "cachedir={1!r}". '
  728. "'cachedir' has been deprecated in version "
  729. "0.12 and will be removed in version 0.14.\n"
  730. 'Please only set "location={0!r}"'.format(
  731. location, cachedir))
  732. warnings.warn(
  733. "The 'cachedir' parameter has been deprecated in version "
  734. "0.12 and will be removed in version 0.14.\n"
  735. 'You provided "cachedir={0!r}", '
  736. 'use "location={0!r}" instead.'.format(cachedir),
  737. DeprecationWarning, stacklevel=2)
  738. location = cachedir
  739. self.location = location
  740. if isinstance(location, str):
  741. location = os.path.join(location, 'joblib')
  742. self.store_backend = _store_backend_factory(
  743. backend, location, verbose=self._verbose,
  744. backend_options=dict(compress=compress, mmap_mode=mmap_mode,
  745. **backend_options))
  746. @property
  747. def cachedir(self):
  748. warnings.warn(
  749. "The 'cachedir' attribute has been deprecated in version 0.12 "
  750. "and will be removed in version 0.14.\n"
  751. "Use os.path.join(memory.location, 'joblib') attribute instead.",
  752. DeprecationWarning, stacklevel=2)
  753. if self.location is None:
  754. return None
  755. return os.path.join(self.location, 'joblib')
  756. def cache(self, func=None, ignore=None, verbose=None, mmap_mode=False):
  757. """ Decorates the given function func to only compute its return
  758. value for input arguments not cached on disk.
  759. Parameters
  760. ----------
  761. func: callable, optional
  762. The function to be decorated
  763. ignore: list of strings
  764. A list of arguments name to ignore in the hashing
  765. verbose: integer, optional
  766. The verbosity mode of the function. By default that
  767. of the memory object is used.
  768. mmap_mode: {None, 'r+', 'r', 'w+', 'c'}, optional
  769. The memmapping mode used when loading from cache
  770. numpy arrays. See numpy.load for the meaning of the
  771. arguments. By default that of the memory object is used.
  772. Returns
  773. -------
  774. decorated_func: MemorizedFunc object
  775. The returned object is a MemorizedFunc object, that is
  776. callable (behaves like a function), but offers extra
  777. methods for cache lookup and management. See the
  778. documentation for :class:`joblib.memory.MemorizedFunc`.
  779. """
  780. if func is None:
  781. # Partial application, to be able to specify extra keyword
  782. # arguments in decorators
  783. return functools.partial(self.cache, ignore=ignore,
  784. verbose=verbose, mmap_mode=mmap_mode)
  785. if self.store_backend is None:
  786. return NotMemorizedFunc(func)
  787. if verbose is None:
  788. verbose = self._verbose
  789. if mmap_mode is False:
  790. mmap_mode = self.mmap_mode
  791. if isinstance(func, MemorizedFunc):
  792. func = func.func
  793. return MemorizedFunc(func, location=self.store_backend,
  794. backend=self.backend,
  795. ignore=ignore, mmap_mode=mmap_mode,
  796. compress=self.compress,
  797. verbose=verbose, timestamp=self.timestamp)
  798. def clear(self, warn=True):
  799. """ Erase the complete cache directory.
  800. """
  801. if warn:
  802. self.warn('Flushing completely the cache')
  803. if self.store_backend is not None:
  804. self.store_backend.clear()
  805. def reduce_size(self):
  806. """Remove cache elements to make cache size fit in ``bytes_limit``."""
  807. if self.bytes_limit is not None and self.store_backend is not None:
  808. self.store_backend.reduce_store_size(self.bytes_limit)
  809. def eval(self, func, *args, **kwargs):
  810. """ Eval function func with arguments `*args` and `**kwargs`,
  811. in the context of the memory.
  812. This method works similarly to the builtin `apply`, except
  813. that the function is called only if the cache is not
  814. up to date.
  815. """
  816. if self.store_backend is None:
  817. return func(*args, **kwargs)
  818. return self.cache(func)(*args, **kwargs)
  819. # ------------------------------------------------------------------------
  820. # Private `object` interface
  821. # ------------------------------------------------------------------------
  822. def __repr__(self):
  823. return '{class_name}(location={location})'.format(
  824. class_name=self.__class__.__name__,
  825. location=(None if self.store_backend is None
  826. else self.store_backend.location))
  827. def __getstate__(self):
  828. """ We don't store the timestamp when pickling, to avoid the hash
  829. depending from it.
  830. """
  831. state = self.__dict__.copy()
  832. state['timestamp'] = None
  833. return state