hashing.py 9.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261
  1. """
  2. Fast cryptographic hash of Python objects, with a special case for fast
  3. hashing of numpy arrays.
  4. """
  5. # Author: Gael Varoquaux <gael dot varoquaux at normalesup dot org>
  6. # Copyright (c) 2009 Gael Varoquaux
  7. # License: BSD Style, 3 clauses.
  8. import pickle
  9. import hashlib
  10. import sys
  11. import types
  12. import struct
  13. import io
  14. import decimal
  15. Pickler = pickle._Pickler
  16. class _ConsistentSet(object):
  17. """ Class used to ensure the hash of Sets is preserved
  18. whatever the order of its items.
  19. """
  20. def __init__(self, set_sequence):
  21. # Forces order of elements in set to ensure consistent hash.
  22. try:
  23. # Trying first to order the set assuming the type of elements is
  24. # consistent and orderable.
  25. # This fails on python 3 when elements are unorderable
  26. # but we keep it in a try as it's faster.
  27. self._sequence = sorted(set_sequence)
  28. except (TypeError, decimal.InvalidOperation):
  29. # If elements are unorderable, sorting them using their hash.
  30. # This is slower but works in any case.
  31. self._sequence = sorted((hash(e) for e in set_sequence))
  32. class _MyHash(object):
  33. """ Class used to hash objects that won't normally pickle """
  34. def __init__(self, *args):
  35. self.args = args
  36. class Hasher(Pickler):
  37. """ A subclass of pickler, to do cryptographic hashing, rather than
  38. pickling.
  39. """
  40. def __init__(self, hash_name='md5'):
  41. self.stream = io.BytesIO()
  42. # By default we want a pickle protocol that only changes with
  43. # the major python version and not the minor one
  44. protocol = 3
  45. Pickler.__init__(self, self.stream, protocol=protocol)
  46. # Initialise the hash obj
  47. self._hash = hashlib.new(hash_name)
  48. def hash(self, obj, return_digest=True):
  49. try:
  50. self.dump(obj)
  51. except pickle.PicklingError as e:
  52. e.args += ('PicklingError while hashing %r: %r' % (obj, e),)
  53. raise
  54. dumps = self.stream.getvalue()
  55. self._hash.update(dumps)
  56. if return_digest:
  57. return self._hash.hexdigest()
  58. def save(self, obj):
  59. if isinstance(obj, (types.MethodType, type({}.pop))):
  60. # the Pickler cannot pickle instance methods; here we decompose
  61. # them into components that make them uniquely identifiable
  62. if hasattr(obj, '__func__'):
  63. func_name = obj.__func__.__name__
  64. else:
  65. func_name = obj.__name__
  66. inst = obj.__self__
  67. if type(inst) == type(pickle):
  68. obj = _MyHash(func_name, inst.__name__)
  69. elif inst is None:
  70. # type(None) or type(module) do not pickle
  71. obj = _MyHash(func_name, inst)
  72. else:
  73. cls = obj.__self__.__class__
  74. obj = _MyHash(func_name, inst, cls)
  75. Pickler.save(self, obj)
  76. def memoize(self, obj):
  77. # We want hashing to be sensitive to value instead of reference.
  78. # For example we want ['aa', 'aa'] and ['aa', 'aaZ'[:2]]
  79. # to hash to the same value and that's why we disable memoization
  80. # for strings
  81. if isinstance(obj, (bytes, str)):
  82. return
  83. Pickler.memoize(self, obj)
  84. # The dispatch table of the pickler is not accessible in Python
  85. # 3, as these lines are only bugware for IPython, we skip them.
  86. def save_global(self, obj, name=None, pack=struct.pack):
  87. # We have to override this method in order to deal with objects
  88. # defined interactively in IPython that are not injected in
  89. # __main__
  90. kwargs = dict(name=name, pack=pack)
  91. del kwargs['pack']
  92. try:
  93. Pickler.save_global(self, obj, **kwargs)
  94. except pickle.PicklingError:
  95. Pickler.save_global(self, obj, **kwargs)
  96. module = getattr(obj, "__module__", None)
  97. if module == '__main__':
  98. my_name = name
  99. if my_name is None:
  100. my_name = obj.__name__
  101. mod = sys.modules[module]
  102. if not hasattr(mod, my_name):
  103. # IPython doesn't inject the variables define
  104. # interactively in __main__
  105. setattr(mod, my_name, obj)
  106. dispatch = Pickler.dispatch.copy()
  107. # builtin
  108. dispatch[type(len)] = save_global
  109. # type
  110. dispatch[type(object)] = save_global
  111. # classobj
  112. dispatch[type(Pickler)] = save_global
  113. # function
  114. dispatch[type(pickle.dump)] = save_global
  115. def _batch_setitems(self, items):
  116. # forces order of keys in dict to ensure consistent hash.
  117. try:
  118. # Trying first to compare dict assuming the type of keys is
  119. # consistent and orderable.
  120. # This fails on python 3 when keys are unorderable
  121. # but we keep it in a try as it's faster.
  122. Pickler._batch_setitems(self, iter(sorted(items)))
  123. except TypeError:
  124. # If keys are unorderable, sorting them using their hash. This is
  125. # slower but works in any case.
  126. Pickler._batch_setitems(self, iter(sorted((hash(k), v)
  127. for k, v in items)))
  128. def save_set(self, set_items):
  129. # forces order of items in Set to ensure consistent hash
  130. Pickler.save(self, _ConsistentSet(set_items))
  131. dispatch[type(set())] = save_set
  132. class NumpyHasher(Hasher):
  133. """ Special case the hasher for when numpy is loaded.
  134. """
  135. def __init__(self, hash_name='md5', coerce_mmap=False):
  136. """
  137. Parameters
  138. ----------
  139. hash_name: string
  140. The hash algorithm to be used
  141. coerce_mmap: boolean
  142. Make no difference between np.memmap and np.ndarray
  143. objects.
  144. """
  145. self.coerce_mmap = coerce_mmap
  146. Hasher.__init__(self, hash_name=hash_name)
  147. # delayed import of numpy, to avoid tight coupling
  148. import numpy as np
  149. self.np = np
  150. if hasattr(np, 'getbuffer'):
  151. self._getbuffer = np.getbuffer
  152. else:
  153. self._getbuffer = memoryview
  154. def save(self, obj):
  155. """ Subclass the save method, to hash ndarray subclass, rather
  156. than pickling them. Off course, this is a total abuse of
  157. the Pickler class.
  158. """
  159. if isinstance(obj, self.np.ndarray) and not obj.dtype.hasobject:
  160. # Compute a hash of the object
  161. # The update function of the hash requires a c_contiguous buffer.
  162. if obj.shape == ():
  163. # 0d arrays need to be flattened because viewing them as bytes
  164. # raises a ValueError exception.
  165. obj_c_contiguous = obj.flatten()
  166. elif obj.flags.c_contiguous:
  167. obj_c_contiguous = obj
  168. elif obj.flags.f_contiguous:
  169. obj_c_contiguous = obj.T
  170. else:
  171. # Cater for non-single-segment arrays: this creates a
  172. # copy, and thus aleviates this issue.
  173. # XXX: There might be a more efficient way of doing this
  174. obj_c_contiguous = obj.flatten()
  175. # memoryview is not supported for some dtypes, e.g. datetime64, see
  176. # https://github.com/numpy/numpy/issues/4983. The
  177. # workaround is to view the array as bytes before
  178. # taking the memoryview.
  179. self._hash.update(
  180. self._getbuffer(obj_c_contiguous.view(self.np.uint8)))
  181. # We store the class, to be able to distinguish between
  182. # Objects with the same binary content, but different
  183. # classes.
  184. if self.coerce_mmap and isinstance(obj, self.np.memmap):
  185. # We don't make the difference between memmap and
  186. # normal ndarrays, to be able to reload previously
  187. # computed results with memmap.
  188. klass = self.np.ndarray
  189. else:
  190. klass = obj.__class__
  191. # We also return the dtype and the shape, to distinguish
  192. # different views on the same data with different dtypes.
  193. # The object will be pickled by the pickler hashed at the end.
  194. obj = (klass, ('HASHED', obj.dtype, obj.shape, obj.strides))
  195. elif isinstance(obj, self.np.dtype):
  196. # Atomic dtype objects are interned by their default constructor:
  197. # np.dtype('f8') is np.dtype('f8')
  198. # This interning is not maintained by a
  199. # pickle.loads + pickle.dumps cycle, because __reduce__
  200. # uses copy=True in the dtype constructor. This
  201. # non-deterministic behavior causes the internal memoizer
  202. # of the hasher to generate different hash values
  203. # depending on the history of the dtype object.
  204. # To prevent the hash from being sensitive to this, we use
  205. # .descr which is a full (and never interned) description of
  206. # the array dtype according to the numpy doc.
  207. klass = obj.__class__
  208. obj = (klass, ('HASHED', obj.descr))
  209. Hasher.save(self, obj)
  210. def hash(obj, hash_name='md5', coerce_mmap=False):
  211. """ Quick calculation of a hash to identify uniquely Python objects
  212. containing numpy arrays.
  213. Parameters
  214. -----------
  215. hash_name: 'md5' or 'sha1'
  216. Hashing algorithm used. sha1 is supposedly safer, but md5 is
  217. faster.
  218. coerce_mmap: boolean
  219. Make no difference between np.memmap and np.ndarray
  220. """
  221. valid_hash_names = ('md5', 'sha1')
  222. if hash_name not in valid_hash_names:
  223. raise ValueError("Valid options for 'hash_name' are {}. "
  224. "Got hash_name={!r} instead."
  225. .format(valid_hash_names, hash_name))
  226. if 'numpy' in sys.modules:
  227. hasher = NumpyHasher(hash_name=hash_name, coerce_mmap=coerce_mmap)
  228. else:
  229. hasher = Hasher(hash_name=hash_name)
  230. return hasher.hash(obj)