|
|
"""Test script for metadata extraction functionality."""
|
| import os
|
| import sys
|
| from src.extract_text.extract_meta_data import PDFArtworkMetadataExtractor
|
|
|
def _find_first_pdf(base_path):
    """Return the path of the first PDF found under *base_path*, or None.

    Directory and file names are visited in sorted order so the selected
    sample is the same on every run; ``os.walk`` on its own yields entries
    in arbitrary, filesystem-dependent order.
    """
    for root, dirs, files in os.walk(base_path):
        # Sorting in place also fixes the order in which os.walk descends.
        dirs.sort()
        for name in sorted(files):
            if name.lower().endswith('.pdf'):
                return os.path.join(root, name)
    return None


def _print_top_entries(heading, entries, prefix="", suffix="", limit=3):
    """Print *heading* followed by the first *limit* items of *entries*.

    Nothing is printed when *entries* is empty. Items are listed in the
    mapping's own iteration order.
    NOTE(review): the "Top 3" headings presumably rely on the extractor
    returning entries already ordered by count - confirm against the
    extractor before trusting the ranking.
    """
    if not entries:
        return
    print(heading)
    leading = list(entries.items())[:limit]
    for rank, (key, count) in enumerate(leading, start=1):
        print(f" {rank}. {prefix}{key}{suffix}: {count} characters")


def test_metadata_extraction():
    """Run metadata extraction on one sample PDF and print a summary.

    Looks for the first ``.pdf`` file below
    ``requirements_library/client-requirements``, passes it to
    ``PDFArtworkMetadataExtractor.extract_metadata`` and prints the page
    count, text/extraction info and the leading fonts, font sizes and text
    colours found in the returned mapping.

    Returns:
        bool: True when extraction finished and the result has no
        ``'error'`` key; False when the library folder or a PDF is missing,
        the extractor reported an error, or any exception was raised.

    NOTE(review): the emoji prefixes in the messages were restored from
    mis-decoded text (UTF-8 read as ISO-8859-7). The check mark, cross and
    test-tube glyphs follow from the surviving bytes; the others are best
    guesses and should be checked against the original file.
    """
    base_path = "requirements_library/client-requirements"

    if not os.path.exists(base_path):
        print("❌ No requirements library found")
        return False

    pdf_file = _find_first_pdf(base_path)
    if not pdf_file:
        print("❌ No PDF files found in requirements library")
        return False

    print(f"📄 Testing metadata extraction on: {pdf_file}")

    # Broad catch is deliberate: this is the top-level boundary of a smoke
    # test, and any failure should be reported as a failed run.
    try:
        extractor = PDFArtworkMetadataExtractor()
        metadata = extractor.extract_metadata(pdf_file)

        # The extractor signals failure through an 'error' entry.
        if 'error' in metadata:
            print(f"❌ Error extracting metadata: {metadata['error']}")
            return False

        print("✅ Metadata extraction successful!")
        print(f"📊 Pages processed: {metadata.get('pages_processed', 0)}")
        print(f"📝 Has selectable text: {metadata.get('has_selectable_text', False)}")
        print(f"🔧 Extraction method: {metadata.get('extraction_method', 'unknown')}")

        _print_top_entries("\n🔤 Top 3 Fonts:", metadata.get('fonts', {}))
        _print_top_entries(
            "\n📏 Top 3 Font Sizes:",
            metadata.get('font_sizes', {}),
            suffix="pt",
        )
        _print_top_entries(
            "\n🎨 Top 3 Text Colors:",
            metadata.get('text_colors', {}),
            prefix="RGB",
        )

        return True

    except Exception as e:
        print(f"❌ Test failed with error: {e}")
        return False
|
|
|
if __name__ == "__main__":
    # Script entry point: run the extraction check and report the outcome
    # both on stdout and through the process exit status.
    # NOTE(review): emoji prefixes were restored from mis-decoded text
    # (UTF-8 read as ISO-8859-7); verify them against the original file.
    print("🧪 Testing Metadata Extraction")
    print("=" * 40)

    success = test_metadata_extraction()

    if success:
        print("\n✅ All tests passed! Metadata extraction is working correctly.")
    else:
        print("\n❌ Tests failed. Please check the error messages above.")
        # Non-zero status lets shells and CI detect the failure.
        sys.exit(1)