| """Utilities to efficiently compute the SHA 256 hash of a bunch of bytes.""" |
|
|
| from typing import BinaryIO, Optional |
|
|
| from .insecure_hashlib import sha1, sha256 |
|
|
|
|
def sha_fileobj(fileobj: BinaryIO, chunk_size: Optional[int] = None) -> bytes:
    """
    Hash the content of a binary file object with sha256, reading it piece by piece.

    The stream is consumed from its current position until a read returns an empty result, so the
    whole payload never has to be held in memory at once.

    Args:
        fileobj (file-like object):
            Binary stream to hash, e.g. the result of `open(path, "rb")`.
        chunk_size (`int`, *optional*):
            How many bytes to request per `read` call. When `None`, `1024 * 1024` bytes are used.

    Returns:
        `bytes`: the raw (non-hex) sha256 digest of the data read from `fileobj`.
    """
    bytes_per_read = 1024 * 1024 if chunk_size is None else chunk_size

    hasher = sha256()
    block = fileobj.read(bytes_per_read)
    while block:
        hasher.update(block)
        block = fileobj.read(bytes_per_read)
    # The terminating (empty) read result is also handed to the hasher, so every value
    # returned by `read` goes through `update` exactly once.
    hasher.update(block)
    return hasher.digest()
|
|
|
|
| def git_hash(data: bytes) -> str: |
| """ |
| Computes the git-sha1 hash of the given bytes, using the same algorithm as git. |
| |
| This is equivalent to running `git hash-object`. See https://git-scm.com/docs/git-hash-object |
| for more details. |
| |
| Note: this method is valid for regular files. For LFS files, the proper git hash is supposed to be computed on the |
| pointer file content, not the actual file content. However, for simplicity, we directly compare the sha256 of |
| the LFS file content when we want to compare LFS files. |
| |
| Args: |
| data (`bytes`): |
| The data to compute the git-hash for. |
| |
| Returns: |
| `str`: the git-hash of `data` as an hexadecimal string. |
| |
| Example: |
| ```python |
| >>> from huggingface_hub.utils.sha import git_hash |
| >>> git_hash(b"Hello, World!") |
| 'b45ef6fec89518d314f546fd6c3025367b721684' |
| ``` |
| """ |
| |
| |
| sha = sha1() |
| sha.update(b"blob ") |
| sha.update(str(len(data)).encode()) |
| sha.update(b"\0") |
| sha.update(data) |
| return sha.hexdigest() |
|
|