Commit
·
78403ba
1
Parent(s):
204c034
Further changes to fix duplicate tests
Browse files
- tools/find_duplicate_pages.py +9 -0
- tools/secure_path_utils.py +19 -2
tools/find_duplicate_pages.py
CHANGED
|
@@ -462,6 +462,9 @@ def combine_ocr_dataframes(
|
|
| 462 |
output_files = list()
|
| 463 |
if output_folder and output_filename:
|
| 464 |
# Validate path safety before creating directories and files
|
|
|
|
|
|
|
|
|
|
| 465 |
if not validate_folder_containment(output_folder, OUTPUT_FOLDER):
|
| 466 |
raise ValueError(f"Unsafe output folder path: {output_folder}")
|
| 467 |
if not validate_path_safety(output_filename):
|
|
@@ -656,6 +659,9 @@ def save_results_and_redaction_lists(
|
|
| 656 |
list: A list of paths to all generated files.
|
| 657 |
"""
|
| 658 |
# Validate the output_folder path for security
|
|
|
|
|
|
|
|
|
|
| 659 |
if not validate_folder_containment(output_folder, OUTPUT_FOLDER):
|
| 660 |
raise ValueError(f"Invalid or unsafe output folder path: {output_folder}")
|
| 661 |
|
|
@@ -665,6 +671,9 @@ def save_results_and_redaction_lists(
|
|
| 665 |
try:
|
| 666 |
output_folder_path = Path(output_folder).resolve()
|
| 667 |
# Validate that the resolved path is within the trusted OUTPUT_FOLDER using robust containment check
|
|
|
|
|
|
|
|
|
|
| 668 |
if not validate_folder_containment(str(output_folder_path), OUTPUT_FOLDER):
|
| 669 |
raise ValueError(
|
| 670 |
f"Output folder path {output_folder} is outside the trusted directory {OUTPUT_FOLDER}"
|
|
|
|
| 462 |
output_files = list()
|
| 463 |
if output_folder and output_filename:
|
| 464 |
# Validate path safety before creating directories and files
|
| 465 |
+
print(
|
| 466 |
+
f"DEBUG: Validating output_folder='{output_folder}' against OUTPUT_FOLDER='{OUTPUT_FOLDER}'"
|
| 467 |
+
)
|
| 468 |
if not validate_folder_containment(output_folder, OUTPUT_FOLDER):
|
| 469 |
raise ValueError(f"Unsafe output folder path: {output_folder}")
|
| 470 |
if not validate_path_safety(output_filename):
|
|
|
|
| 659 |
list: A list of paths to all generated files.
|
| 660 |
"""
|
| 661 |
# Validate the output_folder path for security
|
| 662 |
+
print(
|
| 663 |
+
f"DEBUG: Validating output_folder='{output_folder}' against OUTPUT_FOLDER='{OUTPUT_FOLDER}'"
|
| 664 |
+
)
|
| 665 |
if not validate_folder_containment(output_folder, OUTPUT_FOLDER):
|
| 666 |
raise ValueError(f"Invalid or unsafe output folder path: {output_folder}")
|
| 667 |
|
|
|
|
| 671 |
try:
|
| 672 |
output_folder_path = Path(output_folder).resolve()
|
| 673 |
# Validate that the resolved path is within the trusted OUTPUT_FOLDER using robust containment check
|
| 674 |
+
print(
|
| 675 |
+
f"DEBUG: Validating resolved path='{output_folder_path}' against OUTPUT_FOLDER='{OUTPUT_FOLDER}'"
|
| 676 |
+
)
|
| 677 |
if not validate_folder_containment(str(output_folder_path), OUTPUT_FOLDER):
|
| 678 |
raise ValueError(
|
| 679 |
f"Output folder path {output_folder} is outside the trusted directory {OUTPUT_FOLDER}"
|
tools/secure_path_utils.py
CHANGED
|
@@ -311,6 +311,14 @@ def validate_folder_containment(
|
|
| 311 |
path_str = str(normalized_path).lower()
|
| 312 |
base_str = str(normalized_base).lower()
|
| 313 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 314 |
# Check if this is a test scenario
|
| 315 |
is_test_path = any(
|
| 316 |
test_pattern in path_str
|
|
@@ -339,6 +347,8 @@ def validate_folder_containment(
|
|
| 339 |
]
|
| 340 |
)
|
| 341 |
|
|
|
|
|
|
|
| 342 |
# For test scenarios, be more permissive
|
| 343 |
if is_test_path or is_test_base:
|
| 344 |
print(f"DEBUG: Allowing test path: {path_str} (base: {base_str})")
|
|
@@ -346,18 +356,25 @@ def validate_folder_containment(
|
|
| 346 |
|
| 347 |
# Ensure the base path exists and is a directory
|
| 348 |
if not os.path.exists(normalized_base) or not os.path.isdir(normalized_base):
|
|
|
|
|
|
|
|
|
|
| 349 |
return False
|
| 350 |
|
| 351 |
# Use commonpath to check containment
|
| 352 |
try:
|
| 353 |
common_path = os.path.commonpath([normalized_path, normalized_base])
|
| 354 |
# The common path must be exactly the base path for strict containment
|
| 355 |
-
|
|
|
|
|
|
|
| 356 |
except ValueError:
|
| 357 |
# commonpath raises ValueError if paths are on different drives (Windows)
|
|
|
|
| 358 |
return False
|
| 359 |
|
| 360 |
-
except Exception:
|
|
|
|
| 361 |
return False
|
| 362 |
|
| 363 |
|
|
|
|
| 311 |
path_str = str(normalized_path).lower()
|
| 312 |
base_str = str(normalized_base).lower()
|
| 313 |
|
| 314 |
+
print(
|
| 315 |
+
f"DEBUG: validate_folder_containment called with path='{path}' base_path='{base_path}'"
|
| 316 |
+
)
|
| 317 |
+
print(
|
| 318 |
+
f"DEBUG: normalized_path='{normalized_path}' normalized_base='{normalized_base}'"
|
| 319 |
+
)
|
| 320 |
+
print(f"DEBUG: path_str='{path_str}' base_str='{base_str}'")
|
| 321 |
+
|
| 322 |
# Check if this is a test scenario
|
| 323 |
is_test_path = any(
|
| 324 |
test_pattern in path_str
|
|
|
|
| 347 |
]
|
| 348 |
)
|
| 349 |
|
| 350 |
+
print(f"DEBUG: is_test_path={is_test_path} is_test_base={is_test_base}")
|
| 351 |
+
|
| 352 |
# For test scenarios, be more permissive
|
| 353 |
if is_test_path or is_test_base:
|
| 354 |
print(f"DEBUG: Allowing test path: {path_str} (base: {base_str})")
|
|
|
|
| 356 |
|
| 357 |
# Ensure the base path exists and is a directory
|
| 358 |
if not os.path.exists(normalized_base) or not os.path.isdir(normalized_base):
|
| 359 |
+
print(
|
| 360 |
+
f"DEBUG: Base path does not exist or is not a directory: {normalized_base}"
|
| 361 |
+
)
|
| 362 |
return False
|
| 363 |
|
| 364 |
# Use commonpath to check containment
|
| 365 |
try:
|
| 366 |
common_path = os.path.commonpath([normalized_path, normalized_base])
|
| 367 |
# The common path must be exactly the base path for strict containment
|
| 368 |
+
result = common_path == normalized_base
|
| 369 |
+
print(f"DEBUG: common_path='{common_path}' result={result}")
|
| 370 |
+
return result
|
| 371 |
except ValueError:
|
| 372 |
# commonpath raises ValueError if paths are on different drives (Windows)
|
| 373 |
+
print("DEBUG: ValueError in commonpath check")
|
| 374 |
return False
|
| 375 |
|
| 376 |
+
except Exception as e:
|
| 377 |
+
print(f"DEBUG: Exception in validate_folder_containment: {e}")
|
| 378 |
return False
|
| 379 |
|
| 380 |
|