|
|
| """
|
| Test script for Google Document AI functionality.
|
| This script demonstrates the text extraction with bounding boxes and height calculation.
|
| """
|
|
|
| import os
|
| import sys
|
| from pathlib import Path
|
|
|
|
|
| sys.path.append(str(Path(__file__).parent / "src"))
|
|
|
| from extract_text.google_document_api import GoogleDocumentAPI
|
|
|
def test_google_doc_ai():
    """Run an end-to-end smoke test of ``GoogleDocumentAPI`` on a sample PDF.

    The function first checks that the credentials file and the sample PDF
    exist (both paths are relative, so they are resolved against the current
    working directory). It then initializes the API, processes the PDF and
    exercises ``get_document_text``, ``extract_text_with_bounding_boxes``,
    ``extract_text_with_markdown_table`` and ``extract_text_heights_mm``.
    Progress and samples are printed to stdout, and the full results are
    written to four ``test_results_*`` files in the current working directory.

    Returns:
        None. Returns early, after printing a message, when either input file
        is missing. Any exception raised while exercising the API is caught,
        reported together with its traceback, and not re-raised.
    """
    import traceback
    from collections import Counter

    # NOTE(review): the leading "β" / "π" glyphs in the messages below look
    # like mis-decoded emoji; they are kept verbatim here — confirm against
    # the original file's encoding before restoring them.
    credentials_path = "src/extract_text/photon-services-f0d3ec1417d0.json"
    test_pdf_path = "requirements_library/client-requirements/Kir-Kat/kitkat-f1.pdf"

    # Guard clauses: without both input files there is nothing to exercise.
    if not os.path.exists(credentials_path):
        print(f"β Credentials file not found: {credentials_path}")
        print("Please ensure the Google Cloud credentials file is in the correct location.")
        return

    if not os.path.exists(test_pdf_path):
        print(f"β Test PDF file not found: {test_pdf_path}")
        print("Please ensure the test PDF file exists.")
        return

    print("π Testing Google Document AI functionality...")
    print(f"π Using PDF: {test_pdf_path}")
    print(f"π Using credentials: {credentials_path}")
    print("-" * 80)

    try:
        print("1. Initializing Google Document API...")
        doc_api = GoogleDocumentAPI(credentials_path)
        print("β Google Document API initialized successfully")

        print("\n2. Processing document...")
        document = doc_api.process_document(test_pdf_path)
        print("β Document processed successfully")

        print("\n3. Extracting basic text...")
        basic_text = doc_api.get_document_text(document, page_number=0)
        print(f"π Basic text length: {len(basic_text)} characters")
        print(f"π First 200 characters: {basic_text[:200]}...")

        print("\n4. Extracting text with bounding boxes and height...")
        text_blocks = doc_api.extract_text_with_bounding_boxes(document)
        print(f"π Found {len(text_blocks)} text blocks")

        print("\n5. Sample text blocks with height information:")
        print("-" * 80)
        # Only the first ten blocks are previewed; the full list goes to disk below.
        for i, block in enumerate(text_blocks[:10], start=1):
            print(f"Block {i}:")
            print(f" Page: {block['page_number']}")
            print(f" Height: {block['height']:.2f} mm")
            print(f" Style: {block['style']}")
            print(f" Text: {block['text'][:100]}{'...' if len(block['text']) > 100 else ''}")
            print(f" Bounding Box: {block['bounding_box']}")
            print()

        print("\n6. Generating markdown table...")
        markdown_table = doc_api.extract_text_with_markdown_table(document)
        print("π Markdown table generated successfully")

        print("\n7. Testing extract_text_heights_mm function...")
        heights_mm = doc_api.extract_text_heights_mm(document)
        print(f"π Found {len(heights_mm)} lines with height in mm")

        print("\nπ Sample line heights (mm):")
        print("-" * 60)
        for i, (page_num, line_text, height_mm) in enumerate(heights_mm[:10], start=1):
            print(f"Line {i}: Page {page_num}, Height={height_mm}mm | Text: {line_text[:50]}...")

        print("\n8. Saving results to files...")

        with open("test_results_text_blocks.txt", "w", encoding="utf-8") as f:
            f.write("Text Blocks with Height Information:\n")
            f.write("=" * 50 + "\n\n")
            for i, block in enumerate(text_blocks, start=1):
                f.write(f"Block {i}:\n")
                f.write(f" Page: {block['page_number']}\n")
                f.write(f" Height: {block['height']:.2f} mm\n")
                f.write(f" Style: {block['style']}\n")
                f.write(f" Text: {block['text']}\n")
                f.write(f" Bounding Box: {block['bounding_box']}\n")
                f.write("-" * 40 + "\n")

        with open("test_results_markdown_table.md", "w", encoding="utf-8") as f:
            f.write("# Google Document AI Results\n\n")
            f.write("## Text Blocks with Height Information\n\n")
            f.write(markdown_table)

        with open("test_results_basic_text.txt", "w", encoding="utf-8") as f:
            f.write("Basic Extracted Text:\n")
            f.write("=" * 30 + "\n\n")
            f.write(basic_text)

        print("β Results saved to:")
        print(" - test_results_text_blocks.txt")
        print(" - test_results_markdown_table.md")
        print(" - test_results_basic_text.txt")

        with open("test_results_heights_mm.txt", "w", encoding="utf-8") as f:
            f.write("Line Heights in Millimeters:\n")
            f.write("=" * 40 + "\n\n")
            for i, (page_num, line_text, height_mm) in enumerate(heights_mm, start=1):
                f.write(f"Line {i}: Page {page_num}, Height={height_mm}mm\n")
                f.write(f"Text: {line_text}\n")
                f.write("-" * 40 + "\n")

        print(" - test_results_heights_mm.txt")

        print("\n9. Statistics:")
        print("-" * 30)
        heights = [block['height'] for block in text_blocks]
        # min()/max() and the average are undefined for an empty list, so
        # the statistics are skipped when no blocks were extracted.
        if heights:
            print("π Height statistics:")
            print(f" Min height: {min(heights):.2f} mm")
            print(f" Max height: {max(heights):.2f} mm")
            print(f" Average height: {sum(heights)/len(heights):.2f} mm")

        # most_common() lists styles by descending count; ties keep the
        # order in which each style was first seen.
        style_counts = Counter(block['style'] for block in text_blocks)

        print("\nπ¨ Style distribution:")
        for style, count in style_counts.most_common():
            print(f" {style}: {count} blocks")

        print("\nπ Test completed successfully!")

    except Exception as e:
        # Top-level boundary of a manual script: report the failure with its
        # traceback instead of propagating it.
        print(f"β Error during testing: {e}")
        traceback.print_exc()
|
|
|
def display_markdown_preview():
    """Print the contents of the generated markdown results file.

    Reads ``test_results_markdown_table.md`` from the current working
    directory and prints it under a header and an 80-character rule. When the
    file does not exist, a hint to run the test first is printed instead.
    """
    preview_file = Path("test_results_markdown_table.md")

    try:
        table_text = preview_file.read_text(encoding="utf-8")
    except FileNotFoundError:
        print("β Markdown table file not found. Run the test first.")
    else:
        print("\nπ Markdown Table Preview:")
        print("=" * 80)
        print(table_text)
|
|
|
if __name__ == "__main__":
    # Script entry point: print a banner, run the end-to-end check, then
    # show the markdown file that the check writes.
    banner_rule = "=" * 50
    print("π Google Document AI Test Script")
    print(banner_rule)

    for step in (test_google_doc_ai, display_markdown_preview):
        step()