Spaces:

charles-azam
/

deepdraft

Runtime error

deepdraft / src /deepengineer /webcrawler /utils.py

Charles Azam

clean: run linting and formatting on repo

b5fafa1 5 months ago

3.6 kB

	import re
	import unicodedata


	def sanitize_filename(
	    filename: str, replacement_char: str = "_", max_length: int = 255
	) -> str:
	    """
	    Turn an arbitrary string into a conservative, portable filename.

	    The transformation is applied in this order:
	    1. Spaces are replaced with ``replacement_char``.
	    2. The text is NFKD-normalized and non-ASCII code points are dropped, so
	       accented letters keep their base letter ('é' becomes 'e').
	    3. Every character other than ASCII letters, digits, hyphen, underscore
	       and period is replaced with ``replacement_char``.
	    4. Runs of consecutive ``replacement_char`` are collapsed to one.
	    5. Leading and trailing ``replacement_char`` occurrences are removed.
	    6. A leading period is replaced with ``replacement_char`` so the result
	       is not a hidden file on Unix-like systems.
	    7. The result is truncated to ``max_length`` characters. The extension
	       (text after the last period) is preserved when it leaves room for
	       at least one character of the stem; otherwise the name is simply cut.
	    8. If nothing is left, ``"untitled" + replacement_char + "file"`` is
	       returned.

	    Args:
	        filename: The original string to sanitize.
	        replacement_char: Text substituted for invalid characters. It is
	            always treated literally (never as a regex). It may be empty, in
	            which case invalid characters are simply removed, or longer than
	            one character, in which case it is collapsed and trimmed as a
	            whole unit. Defaults to "_".
	        max_length: Maximum length of the returned filename. Defaults to
	            255, a common OS limit.

	    Returns:
	        The sanitized filename.
	    """
	    # 1. Spaces first, so they take part in the collapse step below.
	    cleaned = filename.replace(" ", replacement_char)

	    # 2. Decompose accented characters and drop whatever is not ASCII
	    #    ('crème brûlée' -> 'creme brulee').
	    cleaned = (
	        unicodedata.normalize("NFKD", cleaned)
	        .encode("ascii", "ignore")
	        .decode("ascii")
	    )

	    # 3. Replace everything outside [a-zA-Z0-9-_.]. A callable replacement is
	    #    used so that backslashes in replacement_char are inserted literally
	    #    instead of being parsed as regex replacement escapes.
	    cleaned = re.sub(r"[^a-zA-Z0-9\-_.]", lambda _m: replacement_char, cleaned)

	    # 4 + 5. Collapse repeats and trim both ends. Skipped for an empty
	    #    replacement (there is nothing to collapse, and the pattern would be
	    #    invalid). The non-capturing group makes a multi-character
	    #    replacement repeat as a unit rather than only its last character.
	    if replacement_char:
	        run = "(?:" + re.escape(replacement_char) + ")+"
	        cleaned = re.sub(run, lambda _m: replacement_char, cleaned)
	        cleaned = re.sub(r"\A" + run + "|" + run + r"\Z", "", cleaned)

	    # 6. Avoid producing a hidden (dot-prefixed) file.
	    if cleaned.startswith("."):
	        cleaned = replacement_char + cleaned[1:]

	    # 7. Truncate, keeping the extension when there is room for it.
	    if len(cleaned) > max_length:
	        stem, dot, suffix = cleaned.rpartition(".")
	        ext = dot + suffix if dot else ""
	        if ext and len(ext) < max_length:
	            cleaned = stem[: max_length - len(ext)] + ext
	        else:
	            # No extension, or the extension alone would fill or exceed the
	            # limit: a plain cut is the only way to honor max_length.
	            cleaned = cleaned[:max_length]

	    # 8. Never return an empty filename.
	    if not cleaned:
	        return "untitled" + replacement_char + "file"

	    return cleaned