Spaces:

Vachudev
/

mcp_ocr_tuner

Sleeping

mcp_ocr_tuner / ocr_engine_json.py

Initial Commit

dc79584 verified 8 days ago

1.55 kB

	import pytesseract
	from pdf2image import convert_from_path
	from PIL import Image
	import os
	import logging
	# Module-level logger registered under the name "ocr_engine"; the OCR
	# routine in this file reports its failures through it.
	logger = logging.getLogger("ocr_engine")
	def extract_text_from_file(file_path: str) -> str:
	    """Extract text from a PDF or image file using Tesseract OCR.

	    The file type is chosen from the case-insensitive file-name suffix.
	    PDFs are converted to one image per page with ``convert_from_path``
	    and every page's text is preceded by a ``--- Page N ---`` header.
	    Images (.png, .jpg, .jpeg, .tiff, .bmp) are opened with PIL and passed
	    to Tesseract directly.

	    This function reports problems through its return value rather than
	    by raising, so callers always receive a string.

	    Args:
	        file_path: Location of the file to read. Plain strings and
	            ``os.PathLike`` objects (e.g. ``pathlib.Path``) are accepted.

	    Returns:
	        The recognised text with leading/trailing whitespace stripped, or:

	        * ``""`` when ``file_path`` does not exist;
	        * ``"Unsupported file format. Please upload PDF or Image."`` for
	          any other suffix;
	        * ``"Error reading PDF: ..."`` / ``"Error reading image: ..."``
	          when conversion or OCR of that file type fails;
	        * ``"OCR Failed: ..."`` for any other unexpected error.
	    """
	    if not os.path.exists(file_path):
	        return ""

	    text_content = ""

	    try:
	        # Normalise once so path-like arguments work with the string
	        # suffix checks below; a str is returned unchanged.
	        path = os.fspath(file_path)
	        lowered = path.lower()

	        if lowered.endswith(".pdf"):
	            try:
	                # One PIL image per PDF page, OCR'd in page order.
	                pages = convert_from_path(path)
	                text_content = "".join(
	                    f"--- Page {number} ---\n"
	                    f"{pytesseract.image_to_string(page)}\n"
	                    for number, page in enumerate(pages, start=1)
	                )
	            except Exception as e:
	                logger.exception("Error converting PDF: %s", e)
	                return f"Error reading PDF: {e}"

	        elif lowered.endswith((".png", ".jpg", ".jpeg", ".tiff", ".bmp")):
	            try:
	                # The context manager closes the underlying file handle
	                # once OCR is done, including when OCR raises.
	                with Image.open(path) as image:
	                    text_content = pytesseract.image_to_string(image)
	            except Exception as e:
	                logger.exception("Error reading image: %s", e)
	                return f"Error reading image: {e}"

	        else:
	            return "Unsupported file format. Please upload PDF or Image."

	    except Exception as e:
	        # Safety net for anything outside the per-format handlers, such
	        # as an argument that cannot be interpreted as a path.
	        logger.exception("OCR Critical Error: %s", e)
	        return f"OCR Failed: {e}"

	    return text_content.strip()