ocr-api / image_processor.py
Sugamdeol's picture
Upload 5 files
0776232 verified
# image_processor.py
from PIL import Image
import pytesseract
from pdf2image import convert_from_bytes
import os
from concurrent.futures import ThreadPoolExecutor
def enhance_image_fast(image: Image.Image, threshold: int = 155) -> Image.Image:
    """Binarize an image with a single fixed-threshold pass.

    The image is converted to 8-bit grayscale (mode ``'L'``) and then mapped
    to a 1-bit image (mode ``'1'``): grayscale levels below ``threshold``
    become 0 (black) and all other levels become 255 (white). No denoising,
    deskewing or resizing is performed, which keeps the step cheap.

    Args:
        image: The source image to prepare for OCR.
        threshold: Grayscale cut-off. Levels strictly below it map to black,
            levels at or above it map to white. Defaults to 155, the value
            that was previously hard-coded.

    Returns:
        A new black-and-white image in mode ``'1'``.
    """
    def binarize(level: int) -> int:
        # Strict "<" so a pixel exactly at the threshold is treated as white.
        return 0 if level < threshold else 255

    return image.convert('L').point(binarize, '1')
def extract_text_from_image_fast(image: Image.Image) -> str:
    """Run Tesseract on ``image`` and return the recognized text.

    Tesseract is invoked with ``--oem 1`` (OCR engine mode 1) and
    ``--psm 6`` (page segmentation mode 6), a configuration chosen here to
    favor speed.

    Args:
        image: The image to recognize.

    Returns:
        The text produced by ``pytesseract.image_to_string``.
    """
    return pytesseract.image_to_string(image, config=r'--oem 1 --psm 6')
def _process_single_page_fast(page_image: Image.Image) -> str:
    """OCR one page image: enhance it first, then extract its text.

    Args:
        page_image: The image of a single page.

    Returns:
        The text extracted from the enhanced page image.
    """
    return extract_text_from_image_fast(enhance_image_fast(page_image))
def process_pdf_in_parallel(
    pdf_bytes: bytes,
    *,
    dpi: int = 150,
    max_workers: int = 2,
) -> list[dict]:
    """Render a PDF to page images and OCR the pages in a thread pool.

    Args:
        pdf_bytes: Raw bytes of the PDF document.
        dpi: Resolution passed to ``convert_from_bytes`` when rendering the
            pages. Defaults to 150, the value that was previously hard-coded.
        max_workers: Number of worker threads used for OCR. Defaults to 2,
            the value that was previously hard-coded.

    Returns:
        One dict per page, in page order, of the form
        ``{"page_number": <1-based index>, "text": <extracted text>}``.
        An exception raised while processing a page propagates to the caller.
    """
    print(f"FAST MODE: Converting PDF pages at {dpi} DPI...")
    images = convert_from_bytes(pdf_bytes, dpi=dpi)
    print(f"FAST MODE: PDF has {len(images)} pages. Starting optimized parallel OCR...")

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Executor.map yields results in the order of its inputs, so the
        # enumeration index is the page number even if pages finish out of
        # order.
        page_results = [
            {"page_number": page_number, "text": text}
            for page_number, text in enumerate(
                executor.map(_process_single_page_fast, images), start=1
            )
        ]

    print("FAST MODE: Finished all pages.")
    return page_results