Spaces:

Sugamdeol
/

ocr-api

Running

ocr-api / image_processor.py

Upload 5 files

0776232 verified 3 days ago

1.58 kB

	# image_processor.py

	from PIL import Image
	import pytesseract
	from pdf2image import convert_from_bytes
	import os
	from concurrent.futures import ThreadPoolExecutor

	def enhance_image_fast(image: Image.Image) -> Image.Image:
	    """Binarize an image with a fixed threshold as cheap OCR preprocessing.

	    The image is first converted to mode ``'L'`` and then mapped through a
	    point transform into mode ``'1'``: levels below 155 become 0, every
	    other level becomes 255.
	    """
	    def _to_black_or_white(level):
	        # Hard cut-off: anything darker than 155 is treated as ink.
	        if level < 155:
	            return 0
	        return 255

	    grayscale = image.convert('L')
	    return grayscale.point(_to_black_or_white, '1')

	def extract_text_from_image_fast(image: Image.Image) -> str:
	    """Run Tesseract on ``image`` with a speed-oriented configuration.

	    The config string ``--oem 1 --psm 6`` is handed to
	    ``pytesseract.image_to_string`` unchanged; per the Tesseract CLI
	    documentation these flags select the LSTM engine and the "single uniform
	    block of text" page segmentation mode.

	    Returns:
	        The recognized text exactly as returned by pytesseract.
	    """
	    return pytesseract.image_to_string(image, config=r'--oem 1 --psm 6')

	def _process_single_page_fast(page_image: Image.Image) -> str:
	    """OCR a single page image: enhance it first, then extract its text."""
	    return extract_text_from_image_fast(enhance_image_fast(page_image))

	def process_pdf_in_parallel(
	    pdf_bytes: bytes,
	    *,
	    dpi: int = 150,
	    max_workers: int = 2,
	) -> list[dict]:
	    """Render a PDF to images and OCR its pages concurrently (FAST pipeline).

	    Args:
	        pdf_bytes: Raw bytes of the PDF document.
	        dpi: Resolution passed to ``convert_from_bytes`` when rendering the
	            pages. Defaults to 150, the value this pipeline always used.
	        max_workers: Size of the thread pool that runs the per-page OCR.
	            Defaults to 2, the value this pipeline always used.

	    Returns:
	        One dict per page, in page order, with the keys ``"page_number"``
	        (1-based) and ``"text"``.

	    Any exception raised while rendering or while OCR-ing a page propagates
	    to the caller.
	    """
	    print(f"FAST MODE: Converting PDF pages at {dpi} DPI...")
	    images = convert_from_bytes(pdf_bytes, dpi=dpi)
	    print(f"FAST MODE: PDF has {len(images)} pages. Starting optimized parallel OCR...")

	    with ThreadPoolExecutor(max_workers=max_workers) as executor:
	        # Executor.map yields results in the order of its inputs, so
	        # enumerating from 1 gives the correct page numbers.
	        page_results = [
	            {"page_number": page_number, "text": text}
	            for page_number, text in enumerate(
	                executor.map(_process_single_page_fast, images), start=1
	            )
	        ]

	    print("FAST MODE: Finished all pages.")
	    return page_results