not-pegasus
/

IMAGE_MODAL

Model card Files Files and versions

IMAGE_MODAL / env /lib /python3.13 /site-packages /huggingface_hub /utils /sha.py

not-pegasus's picture

Add files using upload-large-folder tool

0549051 verified 4 months ago

history blame contribute delete

2.13 kB

	"""Utilities to efficiently compute the SHA 256 hash of a bunch of bytes."""

	from typing import BinaryIO, Optional

	from .insecure_hashlib import sha1, sha256


	def sha_fileobj(fileobj: BinaryIO, chunk_size: Optional[int] = None) -> bytes:
	    """
	    Hash the content of a binary file object with sha256, reading it piece by piece.

	    The stream is consumed from its current position until `read` returns an empty
	    bytes object, so the whole content never has to be held in memory at once.

	    Args:
	        fileobj (file-like object):
	            Binary stream to hash, typically obtained with `open(path, "rb")`.
	        chunk_size (`int`, optional):
	            Number of bytes requested from `fileobj` per read. Defaults to
	            1024 * 1024 bytes (1 MiB) when left as `None`.

	    Returns:
	        `bytes`: the raw (non-hexadecimal) sha256 digest of the data read from `fileobj`.
	    """
	    read_size = 1024 * 1024 if chunk_size is None else chunk_size

	    hasher = sha256()
	    # Two-argument `iter` keeps calling `read` until it yields the empty-bytes sentinel (end of stream).
	    for block in iter(lambda: fileobj.read(read_size), b""):
	        hasher.update(block)
	    return hasher.digest()


	def git_hash(data: bytes) -> str:
	    """
	    Return the git blob identifier (sha1) of `data`, computed the way git does it.

	    The result matches what `git hash-object` prints for a file with this content:
	    the sha1 of the header `blob <length>\\0` followed by the content itself.
	    See https://git-scm.com/docs/git-hash-object for more details.

	    Note: this only holds for regular files. For LFS files, the real git hash is computed on the
	    pointer file content rather than on the file content. For simplicity, LFS files are instead
	    compared through the sha256 of their content.

	    Args:
	        data (`bytes`):
	            The content to compute the git-hash for.

	    Returns:
	        `str`: the git-hash of `data` as a hexadecimal string.

	    Example:
	    ```python
	    >>> from huggingface_hub.utils.sha import git_hash
	    >>> git_hash(b"Hello, World!")
	    'b45ef6fec89518d314f546fd6c3025367b721684'
	    ```
	    """
	    # Taken from https://gist.github.com/msabramo/763200
	    # Note: no need to optimize by reading the file in chunks as we're not supposed to hash huge files (5MB maximum).
	    header = f"blob {len(data)}\0".encode()
	    hasher = sha1()
	    # Feed header and payload separately so the payload is never copied into a new buffer.
	    for part in (header, data):
	        hasher.update(part)
	    return hasher.hexdigest()