# image_processor.py
import os
from concurrent.futures import ThreadPoolExecutor
from functools import partial

import pytesseract
from pdf2image import convert_from_bytes
from PIL import Image

# Defaults of the "fast" pipeline. They are the values the pipeline has always
# used; the functions below expose them as keyword-only parameters so callers
# can tune them without editing this module.
_DEFAULT_THRESHOLD = 155
_DEFAULT_DPI = 150
_DEFAULT_MAX_WORKERS = 2
# Tesseract flags: OCR engine mode 1 and page segmentation mode 6.
_FAST_TESSERACT_CONFIG = r'--oem 1 --psm 6'


def enhance_image_fast(
    image: Image.Image, *, threshold: int = _DEFAULT_THRESHOLD
) -> Image.Image:
    """Binarize an image with a fixed global threshold.

    The image is converted to 8-bit grayscale (mode ``'L'``) and then mapped
    to a 1-bit image (mode ``'1'``): gray levels below ``threshold`` become
    0, all others become 255.

    Args:
        image: The page image to enhance.
        threshold: Gray level (0-255 scale) at which a pixel switches from
            black to white. Defaults to 155.

    Returns:
        A new mode ``'1'`` image; the input image is not modified.
    """
    grayscale = image.convert('L')
    return grayscale.point(lambda level: 0 if level < threshold else 255, '1')


def extract_text_from_image_fast(image: Image.Image) -> str:
    """Run Tesseract OCR on an image using the fast-pipeline configuration.

    Args:
        image: The image to read, typically the output of
            ``enhance_image_fast``.

    Returns:
        The text reported by ``pytesseract.image_to_string``.
    """
    return pytesseract.image_to_string(image, config=_FAST_TESSERACT_CONFIG)


def _process_single_page_fast(
    page_image: Image.Image, *, threshold: int = _DEFAULT_THRESHOLD
) -> str:
    """Enhance one page image and return its OCR text.

    Args:
        page_image: The rendered page.
        threshold: Binarization threshold forwarded to
            ``enhance_image_fast``.

    Returns:
        The OCR text for the page.
    """
    enhanced_image = enhance_image_fast(page_image, threshold=threshold)
    return extract_text_from_image_fast(enhanced_image)


def process_pdf_in_parallel(
    pdf_bytes: bytes,
    *,
    dpi: int = _DEFAULT_DPI,
    max_workers: int = _DEFAULT_MAX_WORKERS,
    threshold: int = _DEFAULT_THRESHOLD,
) -> list[dict]:
    """Render a PDF to images and OCR its pages with a thread pool.

    Progress messages are printed to stdout before rendering, before OCR
    starts, and after all pages are done.

    Args:
        pdf_bytes: Raw bytes of the PDF document.
        dpi: Resolution passed to ``convert_from_bytes``. Defaults to 150.
        max_workers: Number of worker threads used for OCR. Defaults to 2.
        threshold: Binarization threshold applied to every page. Defaults
            to 155.

    Returns:
        One dict per page, in page order, of the form
        ``{"page_number": <1-based index>, "text": <OCR text>}``.
    """
    print(f"FAST MODE: Converting PDF pages at {dpi} DPI...")
    images = convert_from_bytes(pdf_bytes, dpi=dpi)
    print(f"FAST MODE: PDF has {len(images)} pages. Starting optimized parallel OCR...")

    process_page = partial(_process_single_page_fast, threshold=threshold)
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # executor.map yields results in input order, so enumerate gives the
        # correct page number even when pages finish out of order.
        page_results = [
            {"page_number": page_number, "text": text}
            for page_number, text in enumerate(
                executor.map(process_page, images), start=1
            )
        ]

    print("FAST MODE: Finished all pages.")
    return page_results