# File size: 1,575 Bytes
# 0776232
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
# image_processor.py

from PIL import Image
import pytesseract
from pdf2image import convert_from_bytes
import os
from concurrent.futures import ThreadPoolExecutor

def enhance_image_fast(image: Image.Image, *, threshold: int = 155) -> Image.Image:
    """Binarize an image with a single global threshold (speed over quality).

    The image is converted to 8-bit grayscale (mode ``'L'``) and every gray
    level is then mapped to one of two values: levels strictly below
    ``threshold`` become 0, all other levels become 255. The result is
    requested in mode ``'1'``.

    Args:
        image: Source page image.
        threshold: Gray-level cut-off. Defaults to 155, the value this
            function previously had hard-coded, so existing callers are
            unaffected.

    Returns:
        The thresholded image returned by ``point(..., '1')``.
    """
    grayscale = image.convert('L')
    # point() applies the mapping to the gray levels; the second argument
    # asks for the output in mode '1'.
    return grayscale.point(lambda level: 0 if level < threshold else 255, '1')

def extract_text_from_image_fast(image: Image.Image) -> str:
    """Run Tesseract OCR on ``image`` and return the recognized text.

    Tesseract is invoked with the flags ``--oem 1 --psm 6``. Per Tesseract's
    command-line documentation these select the LSTM engine (``--oem 1``)
    and the "single uniform block of text" page segmentation mode
    (``--psm 6``).
    """
    # The config string is forwarded unchanged to pytesseract.
    return pytesseract.image_to_string(image, config='--oem 1 --psm 6')

def _process_single_page_fast(page_image: Image.Image) -> str:
    """Run one page through the fast pipeline: enhance it, then OCR it."""
    return extract_text_from_image_fast(enhance_image_fast(page_image))

def process_pdf_in_parallel(
    pdf_bytes: bytes,
    *,
    dpi: int = 150,
    max_workers: int = 2,
) -> list[dict]:
    """Convert a PDF to page images and OCR the pages in a thread pool.

    Args:
        pdf_bytes: Raw PDF content, passed to ``convert_from_bytes``.
        dpi: Resolution passed to ``convert_from_bytes``. Defaults to 150,
            the value that was previously hard-coded.
        max_workers: Size of the thread pool used for per-page OCR.
            Defaults to 2, the value that was previously hard-coded.

    Returns:
        One dict per page, in page order, each of the form
        ``{"page_number": <1-based index>, "text": <OCR text>}``.

    Raises:
        Whatever ``convert_from_bytes`` or the per-page processing raises;
        exceptions from a worker surface while the results are collected.
    """
    print(f"FAST MODE: Converting PDF pages at {dpi} DPI...")
    images = convert_from_bytes(pdf_bytes, dpi=dpi)
    print(f"FAST MODE: PDF has {len(images)} pages. Starting optimized parallel OCR...")

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Executor.map yields results in the order of its inputs, so
        # enumerating from 1 gives the correct page number for each text.
        page_results = [
            {"page_number": page_number, "text": text}
            for page_number, text in enumerate(
                executor.map(_process_single_page_fast, images), start=1
            )
        ]

    print("FAST MODE: Finished all pages.")
    return page_results