Source code for girder.utility.hash_state

#!/usr/bin/env python
# -*- coding: utf-8 -*-

###############################################################################
#  Copyright 2013 Kitware Inc.
#
#  Licensed under the Apache License, Version 2.0 ( the "License" );
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
###############################################################################

# Utility for saving and restoring the internal state of a hash object
# so that checksums can be streamed without having to remain in memory.
# Inspired by http://code.activestate.com/recipes/
# 578479-save-and-restore-sha-512-internal-state/

import hashlib
import ctypes
import binascii
import sys

_ver = sys.version_info
_HASHLIB_INLINE_EVP_STRUCT = _ver < (2, 7, 13) or (_ver >= (3,) and _ver < (3, 5, 3))


def _getHashStateDataPointer(hashObject):
    """
    Get a pointer to the internal hash state of a hash object.

    :param hashObject: A hashlib hash object.
    :return: A ctypes pointer to the internal hash state.
    :rtype: ctypes.POINTER(ctypes.c_char)
    """
    hashPointer = ctypes.cast(id(hashObject), ctypes.POINTER(ctypes.c_void_p))

    if _HASHLIB_INLINE_EVP_STRUCT:
        # From
        #   * github.com/python/cpython/blob/2.7/Modules/_hashopenssl.c#L58
        # the layout of a hashObject (_hashlib.HASH) is:
        #     typedef struct {
        #         PyObject_HEAD
        #         PyObject *name;
        #         EVP_MD_CTX ctx;
        #         ...
        #     } EVPobject;
        #
        # Using
        #   * docs.python.org/2/c-api/structures.html#c.PyObject_HEAD
        #   * github.com/openssl/openssl/blob/OpenSSL_1_0_1f/crypto/evp/evp.h#L265
        # this expands to:
        #     typedef struct {
        #         Py_ssize_t ob_refcnt;      // Word 0
        #         PyTypeObject *ob_type;     // Word 1
        #         PyObject *name;            // Word 2
        #         struct env_md_ctx_st {
        #             const EVP_MD *digest;  // Word 3
        #             ENGINE *engine;        // Word 4
        #             unsigned long flags;   // Word 5
        #             void *md_data;         // Word 6   << this guy is what we want
        #             ...
        #         } /* EVP_MD_CTX */;
        #         ...
        #     } EVPobject;
        #
        # Thus, an offset of 6 words ( sizeof(void*) is 1 word) is required
        stateDataPointer = ctypes.cast(hashPointer[6], ctypes.POINTER(ctypes.c_char))
    else:
        # In cpython 2.7.13, hashlib changed to store a pointer to the OpenSSL hash
        # object rather than inlining it in the struct, so we require an extra dereference. See
        # https://github.com/python/cpython/commit/9d9615f6782be4b1f38b47d4d56cee208c26a970
        evpStruct = ctypes.cast(hashPointer[3], ctypes.POINTER(ctypes.c_void_p))
        stateDataPointer = ctypes.cast(evpStruct[3], ctypes.POINTER(ctypes.c_char))

    assert stateDataPointer
    return stateDataPointer


_HASH_INFOS = dict()


class _HashInfo(object):
    def __init__(self, type, stateSize, initVectorFirstWord):
        self.type = type
        self.stateSize = stateSize

        # Instantiate to get the name and do a sanity check
        instance = self.type()
        self.name = instance.name

        stateDataPointer = _getHashStateDataPointer(instance)
        assert stateDataPointer[:8] == initVectorFirstWord

        _HASH_INFOS[self.name] = self


_HashInfo(
    type=hashlib.md5,
    # github.com/openssl/openssl/blob/OpenSSL_1_0_1f/crypto/md5/md5.h#L100
    stateSize=92,
    # Found empirically
    initVectorFirstWord=b'\x01\x23\x45\x67\x89\xab\xcd\xef'
)
_HashInfo(
    type=hashlib.sha1,
    # github.com/openssl/openssl/blob/OpenSSL_1_0_1f/crypto/sha/sha.h#L100
    stateSize=96,
    # Found empirically
    initVectorFirstWord=b'\x01\x23\x45\x67\x89\xab\xcd\xef'
)
_HashInfo(
    type=hashlib.sha224,
    # Uses the same struct as sha256
    stateSize=112,
    # github.com/openssl/openssl/blob/OpenSSL_1_0_1f/crypto/sha/sha256.c#L22
    initVectorFirstWord=b'\xd8\x9e\x05\xc1\x07\xd5\x7c\x36'
)
_HashInfo(
    type=hashlib.sha256,
    # github.com/openssl/openssl/blob/OpenSSL_1_0_1f/crypto/sha/sha.h#L135
    stateSize=112,
    # github.com/openssl/openssl/blob/OpenSSL_1_0_1f/crypto/sha/sha256.c#L33
    initVectorFirstWord=b'\x67\xe6\x09\x6a\x85\xae\x67\xbb'
)
_HashInfo(
    type=hashlib.sha384,
    # Uses the same struct as sha512
    stateSize=216,
    # github.com/openssl/openssl/blob/OpenSSL_1_0_1f/crypto/sha/sha512.c#L64
    initVectorFirstWord=b'\xd8\x9e\x05\xc1\x5d\x9d\xbb\xcb'
)
_HashInfo(
    type=hashlib.sha512,
    # github.com/openssl/openssl/blob/OpenSSL_1_0_1f/crypto/sha/sha.h#L182
    stateSize=216,
    # github.com/openssl/openssl/blob/OpenSSL_1_0_1f/crypto/sha/sha512.c#L80
    initVectorFirstWord=b'\x08\xc9\xbc\xf3\x67\xe6\x09\x6a'
)


[docs]def serialize(hashObject): """ Serializes the internal state of the hash object passed in. Calling restore() on the serialized value afterward will return an equivalent hash object to the one passed. :param hashObject: The hash object to serialize. :type hashObject: _hashlib.HASH :returns: A binary string representing the serialized state. """ stateDataPointer = _getHashStateDataPointer(hashObject) hashInfo = _HASH_INFOS[hashObject.name] return stateDataPointer[:hashInfo.stateSize]
[docs]def restore(oldStateData, hashName): """ Unserializes a hash object from a binary string created by the serialize() method in this module. :param oldStateData: The serialized state. :type oldStateData: binary string :param hashName: The name of the hash algorithm type. :type hashName: str :returns: A hash object in the same state as the one that serialize() was called on. """ hashInfo = _HASH_INFOS[hashName] assert len(oldStateData) == hashInfo.stateSize hashObject = hashInfo.type() stateDataPointer = _getHashStateDataPointer(hashObject) for i, byte in enumerate(oldStateData): stateDataPointer[i] = byte return hashObject
def serializeHex(hashObject): return binascii.b2a_hex(serialize(hashObject)) def restoreHex(oldHexStateData, hashName): return restore(binascii.a2b_hex(oldHexStateData), hashName)