tokenizers / app /utils /validators.py
bartar's picture
Upload 26 files
d66ab65 verified
"""
Validation utilities for security and input validation
"""
import os
import re
from typing import Optional
from urllib.parse import urlparse
class ValidationError(Exception):
    """Signal that a value failed one of the validation checks in this module."""
class Validators:
    """Collection of static validation helpers for security-sensitive input.

    Every ``validate_*`` method returns ``True`` when the value is accepted
    and raises ``ValidationError`` when it is rejected; none of them ever
    returns ``False``.
    """

    # "organization/model-name" identifier: letters, digits, hyphens,
    # underscores and dots on each side of exactly one forward slash.
    HUGGINGFACE_MODEL_PATTERN = re.compile(r'^[a-zA-Z0-9_\-\.]+/[a-zA-Z0-9_\-\.]+$')
    # Conservative filename alphabet (letters, digits, hyphens, underscores,
    # dots). Not applied by validate_filename(); kept as a public constant.
    SAFE_FILENAME_PATTERN = re.compile(r'^[a-zA-Z0-9_\-\.]+$')

    @staticmethod
    def validate_model_path(model_path: str) -> bool:
        """
        Validate that a custom model path is a safe 'organization/model-name' id.

        Surrounding whitespace is ignored for the purpose of the checks; the
        caller's string itself is not modified.

        Args:
            model_path: The model path to validate

        Returns:
            bool: Always True when the path is accepted

        Raises:
            ValidationError: If the model path is empty, not a string,
                contains dangerous characters, is malformed, or is too long
        """
        if not model_path or not isinstance(model_path, str):
            raise ValidationError("Model path cannot be empty")

        # Trim whitespace
        model_path = model_path.strip()

        # Reject traversal sequences and shell metacharacters outright
        # (a single forward slash is allowed for the org/model format).
        dangerous_chars = ('..', '\\', '|', ';', '&', '$', '`', '<', '>')
        if any(token in model_path for token in dangerous_chars):
            raise ValidationError("Model path contains invalid characters")

        # Reject doubled slashes and leading/trailing slashes
        if '//' in model_path or model_path.startswith('/') or model_path.endswith('/'):
            raise ValidationError("Model path contains invalid characters")

        # Must look like a HuggingFace model path (user/model format)
        if not Validators.HUGGINGFACE_MODEL_PATTERN.fullmatch(model_path):
            raise ValidationError("Model path must follow the format 'organization/model-name'")

        # A segment starting with '.' (e.g. "./model") is a relative
        # filesystem path rather than a hub identifier, so refuse it.
        if any(segment.startswith('.') for segment in model_path.split('/')):
            raise ValidationError("Model path contains invalid characters")

        # Check length limits
        if len(model_path) > 200:
            raise ValidationError("Model path is too long")

        return True

    @staticmethod
    def validate_filename(filename: str) -> bool:
        """
        Validate that a filename is safe for upload.

        Args:
            filename: The filename to validate (a bare name, not a path)

        Returns:
            bool: Always True when the filename is accepted

        Raises:
            ValidationError: If the filename is empty, not a string, contains
                path separators, shell metacharacters or control characters,
                starts with a dot, or is longer than 255 characters
        """
        if not filename or not isinstance(filename, str):
            raise ValidationError("Filename cannot be empty")

        # Path separators, traversal sequences and shell metacharacters
        dangerous_patterns = ('..', '/', '\\', '|', ';', '&', '$', '`', '<', '>')
        if any(pattern in filename for pattern in dangerous_patterns):
            raise ValidationError("Filename contains invalid characters")

        # NUL bytes and other control characters (newlines, tabs, DEL) have no
        # place in an uploaded filename and can truncate or split it downstream.
        if any(ord(ch) < 32 or ord(ch) == 127 for ch in filename):
            raise ValidationError("Filename contains invalid characters")

        # Hidden files (leading dot) are refused
        if filename.startswith('.'):
            raise ValidationError("Hidden files are not allowed")

        # Check length
        if len(filename) > 255:
            raise ValidationError("Filename is too long")

        return True

    @staticmethod
    def validate_file_extension(filename: str, allowed_extensions: set) -> bool:
        """
        Validate that a file has an allowed extension (case-insensitive).

        Args:
            filename: The filename to check
            allowed_extensions: Allowed extensions including the leading dot
                (e.g., {'.txt', '.py'})

        Returns:
            bool: Always True when the extension is allowed

        Raises:
            ValidationError: If the filename is empty or its extension is not
                in allowed_extensions
        """
        if not filename:
            raise ValidationError("Filename cannot be empty")

        _, ext = os.path.splitext(filename.lower())

        # The filename is lower-cased above, so compare against a lower-cased
        # view of the allow-list; otherwise an entry like '.TXT' never matches.
        normalized_allowed = {allowed.lower() for allowed in allowed_extensions}
        if ext not in normalized_allowed:
            allowed_list = ', '.join(sorted(allowed_extensions))
            raise ValidationError(f"File type '{ext}' not allowed. Allowed types: {allowed_list}")

        return True

    @staticmethod
    def validate_file_size(file_size: int, max_size: int) -> bool:
        """
        Validate that a file size is within limits.

        Args:
            file_size: Size of the file in bytes
            max_size: Maximum allowed size in bytes

        Returns:
            bool: Always True when file_size does not exceed max_size

        Raises:
            ValidationError: If the file is too large
        """
        if file_size > max_size:
            # Report both figures in megabytes for a readable message
            max_mb = max_size / (1024 * 1024)
            current_mb = file_size / (1024 * 1024)
            raise ValidationError(f"File too large: {current_mb:.1f}MB (max: {max_mb:.1f}MB)")

        return True

    @staticmethod
    def validate_text_input(text: str, max_length: int = 1000000) -> bool:
        """
        Validate text input for processing.

        Args:
            text: The text to validate
            max_length: Maximum allowed length in characters

        Returns:
            bool: Always True when the text is accepted

        Raises:
            ValidationError: If text is not a string or exceeds max_length
        """
        if not isinstance(text, str):
            raise ValidationError("Text input must be a string")

        if len(text) > max_length:
            raise ValidationError(f"Text too long: {len(text)} characters (max: {max_length})")

        return True

    @staticmethod
    def sanitize_model_path(model_path: str) -> str:
        """
        Sanitize a model path by removing potentially dangerous elements.

        Surrounding whitespace, every forward slash and backslash, and every
        '..' sequence are removed.

        Args:
            model_path: The model path to sanitize

        Returns:
            str: Sanitized model path ("" for empty input)
        """
        if not model_path:
            return ""

        # Remove whitespace
        sanitized = model_path.strip()

        # Strip separators first: removing them can bring two dots together
        # (e.g. "./." -> ".."), so the '..' pass must run afterwards.
        sanitized = sanitized.replace('/', '')
        sanitized = sanitized.replace('\\', '')

        # Remove any path traversal sequences, repeating until none remain
        while '..' in sanitized:
            sanitized = sanitized.replace('..', '')

        return sanitized

    @staticmethod
    def is_safe_path(path: str, base_path: str) -> bool:
        """
        Check if a path is safe and within the expected base directory.

        The comparison is lexical: both paths are normalized with
        os.path.abspath, and symbolic links are not resolved.

        Args:
            path: The path to check
            base_path: The base directory that the path should be within

        Returns:
            bool: True if path equals base_path or lies beneath it,
                False otherwise (including when the paths cannot be compared)
        """
        try:
            # Resolve both paths to absolute, normalized paths
            abs_path = os.path.abspath(path)
            abs_base = os.path.abspath(base_path)

            # Compare whole path components. A plain string-prefix test would
            # wrongly accept siblings such as "/base_evil" for base "/base".
            return os.path.commonpath([abs_path, abs_base]) == abs_base
        except (OSError, ValueError):
            # commonpath raises ValueError e.g. for paths on different drives
            return False
# Shared module-level instance for callers that prefer `validators.<method>(...)`.
# Every method on Validators is a @staticmethod, so this object carries no state.
validators = Validators()