Spaces:

bartar
/

tokenizers

Running

App Files Files Community

tokenizers / tests /test_validators.py

bartar

Upload 26 files

d66ab65 verified 10 days ago

raw

history blame contribute delete

11 kB

	"""
	Unit tests for Validators utility
	"""
	import pytest
	from app.utils.validators import Validators, ValidationError


	class TestValidators:
	    """Test cases for the Validators utility.

	    Each test runs against a fresh ``Validators`` instance created in
	    ``setup_method``. A validator call that returns without raising counts
	    as "accepted"; a call that raises ``ValidationError`` counts as
	    "rejected". Return values are never inspected.
	    """

	    def setup_method(self):
	        """Create the ``Validators`` instance used by the test about to run."""
	        self.validators = Validators()

	    @staticmethod
	    def _is_rejected(validate, *args):
	        """Return True if ``validate(*args)`` raises ``ValidationError``.

	        Returns False when the call completes normally. Any other exception
	        type is not caught, so it propagates and fails the calling test.
	        Table-driven tests use this to check every entry and then report all
	        wrongly accepted inputs by value, instead of stopping at the first
	        one with a message that does not name it.
	        """
	        try:
	            validate(*args)
	        except ValidationError:
	            return True
	        return False

	    def test_validate_filename_valid(self):
	        """Test filename validation with valid filenames."""
	        # Valid filenames should not raise
	        self.validators.validate_filename('test.txt')
	        self.validators.validate_filename('document.md')
	        self.validators.validate_filename('script_file.py')
	        self.validators.validate_filename('My Document.txt')
	        self.validators.validate_filename('file-name.json')
	        self.validators.validate_filename('data123.csv')

	    def test_validate_filename_invalid(self):
	        """Test filename validation with invalid filenames."""
	        # Empty or None filename
	        with pytest.raises(ValidationError):
	            self.validators.validate_filename('')

	        with pytest.raises(ValidationError):
	            self.validators.validate_filename(None)

	        # Dangerous characters
	        with pytest.raises(ValidationError):
	            self.validators.validate_filename('../../../etc/passwd')

	        with pytest.raises(ValidationError):
	            self.validators.validate_filename('file\\with\\backslashes.txt')

	        # Null bytes
	        with pytest.raises(ValidationError):
	            self.validators.validate_filename('file\x00.txt')

	        # Control characters
	        with pytest.raises(ValidationError):
	            self.validators.validate_filename('file\x01\x02.txt')

	        # Reserved names on Windows
	        with pytest.raises(ValidationError):
	            self.validators.validate_filename('CON.txt')

	        with pytest.raises(ValidationError):
	            self.validators.validate_filename('PRN.txt')

	        with pytest.raises(ValidationError):
	            self.validators.validate_filename('AUX.txt')

	    def test_validate_file_extension_valid(self):
	        """Test file extension validation with valid extensions."""
	        allowed_extensions = {'.txt', '.md', '.py', '.js', '.json'}

	        # Valid extensions should not raise
	        self.validators.validate_file_extension('test.txt', allowed_extensions)
	        self.validators.validate_file_extension('document.md', allowed_extensions)
	        self.validators.validate_file_extension('script.py', allowed_extensions)
	        self.validators.validate_file_extension('data.json', allowed_extensions)

	        # Case insensitive
	        self.validators.validate_file_extension('FILE.TXT', allowed_extensions)
	        self.validators.validate_file_extension('Document.MD', allowed_extensions)

	    def test_validate_file_extension_invalid(self):
	        """Test file extension validation with invalid extensions."""
	        allowed_extensions = {'.txt', '.md', '.py'}

	        # Invalid extensions should raise
	        with pytest.raises(ValidationError):
	            self.validators.validate_file_extension('virus.exe', allowed_extensions)

	        with pytest.raises(ValidationError):
	            self.validators.validate_file_extension('archive.zip', allowed_extensions)

	        with pytest.raises(ValidationError):
	            self.validators.validate_file_extension('image.jpg', allowed_extensions)

	        # No extension
	        with pytest.raises(ValidationError):
	            self.validators.validate_file_extension('filename', allowed_extensions)

	        # Empty filename
	        with pytest.raises(ValidationError):
	            self.validators.validate_file_extension('', allowed_extensions)

	    def test_validate_model_path_valid(self):
	        """Test model path validation with valid paths."""
	        # Valid HuggingFace model paths
	        valid_paths = [
	            'microsoft/DialoGPT-medium',
	            'google/bert-base-uncased',
	            'meta-llama/Llama-2-7b-hf',
	            'mistralai/Mistral-7B-Instruct-v0.1',
	            'Qwen/Qwen2.5-72B-Instruct',
	            'THUDM/chatglm-6b',
	            'deepseek-ai/deepseek-coder-6.7b-base',
	            'unsloth/llama-2-7b-bnb-4bit',
	            'google-bert/bert-base-uncased',
	            'bartar/SPLM-2',  # User's specific case
	        ]

	        for path in valid_paths:
	            self.validators.validate_model_path(path)  # Should not raise

	    def test_validate_model_path_invalid_format(self):
	        """Test model path validation with invalid formats.

	        Every entry in the table must be rejected with ``ValidationError``.
	        """
	        invalid_paths = [
	            '',  # Empty
	            'invalid-path',  # No slash
	            'user/',  # Empty model name
	            '/model-name',  # Empty user
	            'user//model',  # Double slash
	            'user/model/extra',  # Too many parts
	            'user name/model',  # Space in user
	            # NOTE(review): the original author was unsure whether a space in
	            # the model name must be rejected; this test asserts that it is.
	            'user/model name',  # Space in model
	            'user@domain/model',  # Invalid characters
	            '../malicious/path',  # Path traversal
	            'user\\model',  # Backslash
	        ]

	        validate = self.validators.validate_model_path
	        accepted = [
	            path for path in invalid_paths
	            if not self._is_rejected(validate, path)
	        ]
	        assert not accepted, (
	            f"Expected ValidationError for model paths: {accepted!r}"
	        )

	    def test_validate_model_path_untrusted_prefix(self):
	        """Test model path validation with untrusted prefixes.

	        Every entry in the table must be rejected with ``ValidationError``.
	        """
	        untrusted_paths = [
	            'random-user/some-model',
	            'untrusted/malicious-model',
	            'hacker/backdoor-model',
	            'suspicious/model',
	        ]

	        validate = self.validators.validate_model_path
	        accepted = [
	            path for path in untrusted_paths
	            if not self._is_rejected(validate, path)
	        ]
	        assert not accepted, (
	            f"Expected ValidationError for untrusted model paths: {accepted!r}"
	        )

	    def test_validate_model_path_edge_cases(self):
	        """Test model path validation edge cases."""
	        # None input
	        with pytest.raises(ValidationError):
	            self.validators.validate_model_path(None)

	        # Very long path
	        long_path = 'microsoft/' + 'a' * 1000
	        with pytest.raises(ValidationError):
	            self.validators.validate_model_path(long_path)

	        # Dashes, underscores and dots in the model name must be accepted
	        self.validators.validate_model_path('microsoft/model-with-dashes')
	        self.validators.validate_model_path('microsoft/model_with_underscores')
	        self.validators.validate_model_path('microsoft/model.with.dots')

	    def test_validate_text_input_valid(self):
	        """Test text input validation with valid inputs."""
	        # Valid text inputs should not raise
	        self.validators.validate_text_input('Hello world!')
	        self.validators.validate_text_input('A' * 1000)  # Long but reasonable text
	        self.validators.validate_text_input('Text with\nnewlines\nand\ttabs')
	        self.validators.validate_text_input('Unicode: 你好世界 🌍')
	        self.validators.validate_text_input('')  # Empty text is expected to pass

	    def test_validate_text_input_invalid(self):
	        """Test text input validation with invalid inputs."""
	        # None input
	        with pytest.raises(ValidationError):
	            self.validators.validate_text_input(None)

	        # Extremely long text: this asserts that a size limit below 10 MiB
	        # characters is enforced.
	        very_long_text = 'A' * (10 * 1024 * 1024)
	        with pytest.raises(ValidationError):
	            self.validators.validate_text_input(very_long_text)

	    def test_validate_text_input_malicious_content(self):
	        """Test text input validation with potentially malicious content."""
	        # Null bytes
	        with pytest.raises(ValidationError):
	            self.validators.validate_text_input('text\x00with\x00nulls')

	        # Other control characters: acceptance and rejection are both
	        # tolerated here, so this only fails if an exception type other than
	        # ValidationError escapes the validator.
	        try:
	            self.validators.validate_text_input('text\x01with\x02controls')
	        except ValidationError:
	            pass

	    def test_validation_error_messages(self):
	        """Test that ValidationError contains meaningful messages.

	        Each validator is driven with a known-bad input, and the lower-cased
	        text of the raised error must mention one of two expected keywords.
	        """
	        # Test filename validation error message
	        with pytest.raises(ValidationError) as excinfo:
	            self.validators.validate_filename('../../../etc/passwd')
	        message = str(excinfo.value).lower()
	        assert 'filename' in message or 'path' in message

	        # Test file extension error message
	        with pytest.raises(ValidationError) as excinfo:
	            self.validators.validate_file_extension('virus.exe', {'.txt'})
	        message = str(excinfo.value).lower()
	        assert 'extension' in message or 'allowed' in message

	        # Test model path error message
	        with pytest.raises(ValidationError) as excinfo:
	            self.validators.validate_model_path('invalid-path')
	        message = str(excinfo.value).lower()
	        assert 'model' in message or 'path' in message

	    def test_allowed_model_prefixes_coverage(self):
	        """Test that all common model prefixes are covered."""
	        # This test ensures we have good coverage of trusted model prefixes
	        common_prefixes = [
	            'microsoft/',
	            'google/',
	            'meta-llama/',
	            'mistralai/',
	            'openai-community/',
	            'Qwen/',
	            'THUDM/',
	            'deepseek-ai/',
	            'unsloth/',
	            'google-bert/',
	        ]

	        for prefix in common_prefixes:
	            # Should be able to validate models with these prefixes
	            test_path = prefix + 'test-model'
	            try:
	                self.validators.validate_model_path(test_path)
	            except ValidationError:
	                pytest.fail(f"Trusted prefix {prefix} should be allowed")

	    def test_case_sensitivity(self):
	        """Test case sensitivity in various validations."""
	        # File extensions should be case insensitive
	        allowed_extensions = {'.txt', '.md'}
	        self.validators.validate_file_extension('FILE.TXT', allowed_extensions)
	        self.validators.validate_file_extension('Document.MD', allowed_extensions)

	        # A differently-cased spelling of a trusted prefix ('Microsoft/' with
	        # a capital M) is expected to be accepted.
	        self.validators.validate_model_path('Microsoft/model')

	        # But random capitalization in untrusted prefixes should still fail
	        with pytest.raises(ValidationError):
	            self.validators.validate_model_path('RANDOM/model')