|
|
|
|
|
from PIL import Image |
|
import pytesseract |
|
from pdf2image import convert_from_bytes |
|
import os |
|
from concurrent.futures import ThreadPoolExecutor |
|
|
|
def enhance_image_fast(image: Image.Image, threshold: int = 155) -> Image.Image:
    """Binarise ``image`` with a single global threshold.

    The image is first converted to mode ``'L'`` and each pixel value is then
    mapped through a two-level function, with mode ``'1'`` requested as the
    output of the point operation.

    Args:
        image: Page image to prepare for OCR.
        threshold: Cut-off applied to each ``'L'`` pixel value. Values
            strictly below it map to 0; all other values map to 255.
            Defaults to 155, the value this pipeline has always used.

    Returns:
        The image produced by the ``convert('L').point(..., '1')`` chain.
    """
    def binarise(value: int) -> int:
        # Strict "<" keeps a pixel exactly at the threshold on the 255 side.
        return 0 if value < threshold else 255

    return image.convert('L').point(binarise, '1')
|
|
|
def extract_text_from_image_fast(image: Image.Image) -> str:
    """Run Tesseract on ``image`` and return whatever text it recognises.

    The call is made through ``pytesseract.image_to_string`` with the fixed
    configuration string ``--oem 1 --psm 6``.

    NOTE(review): per the Tesseract manual, ``--oem 1`` selects the LSTM
    engine and ``--psm 6`` treats the page as one uniform block of text;
    confirm against the installed Tesseract version.

    Args:
        image: Image to recognise.

    Returns:
        The string returned by ``pytesseract.image_to_string``.
    """
    # The flags are forwarded to Tesseract verbatim via ``config``.
    return pytesseract.image_to_string(image, config=r'--oem 1 --psm 6')
|
|
|
def _process_single_page_fast(page_image: Image.Image) -> str:
    """OCR a single page image using the fast pipeline.

    The page is passed through ``enhance_image_fast`` and the result is
    handed straight to ``extract_text_from_image_fast``.

    Args:
        page_image: Image of one page.

    Returns:
        The text extracted from the enhanced page image.
    """
    return extract_text_from_image_fast(enhance_image_fast(page_image))
|
|
|
def process_pdf_in_parallel(
    pdf_bytes: bytes,
    *,
    dpi: int = 150,
    max_workers: int = 2,
) -> list[dict]:
    """Convert a PDF to page images and OCR them concurrently.

    The PDF is converted with ``convert_from_bytes`` and every resulting
    image is submitted to a thread pool that runs
    ``_process_single_page_fast`` on it. Progress messages are printed to
    standard output.

    Args:
        pdf_bytes: Raw bytes of the PDF document.
        dpi: Resolution passed to ``convert_from_bytes``. Defaults to 150.
        max_workers: Size of the thread pool. Defaults to 2.

    Returns:
        One dict per page, in page order, of the form
        ``{"page_number": <1-based index>, "text": <OCR text>}``.

    Raises:
        Any exception raised by ``convert_from_bytes`` or by a worker while
        processing a page propagates to the caller.
    """
    print(f"FAST MODE: Converting PDF pages at {dpi} DPI...")
    images = convert_from_bytes(pdf_bytes, dpi=dpi)
    print(f"FAST MODE: PDF has {len(images)} pages. Starting optimized parallel OCR...")

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Executor.map yields results in the order of its inputs, so the
        # enumerate() index matches the page even if workers finish out of
        # order.
        page_results = [
            {"page_number": page_number, "text": text}
            for page_number, text in enumerate(
                executor.map(_process_single_page_fast, images), start=1
            )
        ]

    print("FAST MODE: Finished all pages.")
    return page_results
|
|