# mcp_ocr_tuner / ocr_engine_json.py
# Vachudev's picture
# Initial Commit
# dc79584 verified
import pytesseract
from pdf2image import convert_from_path
from PIL import Image
import os
import logging
# Module-level logger registered under the fixed name "ocr_engine"; the OCR
# code in this module reports conversion and read failures through it.
logger = logging.getLogger("ocr_engine")
def extract_text_from_file(file_path: str) -> str:
    """Extract text from a PDF or image file using Tesseract OCR.

    PDFs are converted to one image per page with ``convert_from_path`` and
    every page is OCR'd separately; the page results are concatenated, each
    preceded by a ``--- Page N ---`` header (N is 1-based). Image files are
    opened with Pillow and OCR'd in a single pass.

    Failures are reported through the return value rather than raised, so
    callers always receive a string.

    Args:
        file_path: Path of the file to read. The handler is chosen from the
            file extension, case-insensitively: ``.pdf``, or one of
            ``.png``, ``.jpg``, ``.jpeg``, ``.tiff``, ``.bmp``.

    Returns:
        The recognised text with surrounding whitespace stripped, or:

        * ``""`` when ``file_path`` does not exist;
        * ``"Unsupported file format. Please upload PDF or Image."`` for
          any other extension;
        * ``"Error reading PDF: ..."`` / ``"Error reading image: ..."``
          when conversion or OCR fails (the failure is also logged).
    """
    if not os.path.exists(file_path):
        return ""

    # Lower-case once so both extension checks are case-insensitive.
    lowered_path = file_path.lower()

    if lowered_path.endswith(".pdf"):
        try:
            pages = convert_from_path(file_path)
            # Collect per-page chunks and join once instead of growing a
            # string with += on every page.
            chunks = [
                f"--- Page {number} ---\n"
                f"{pytesseract.image_to_string(page)}\n"
                for number, page in enumerate(pages, start=1)
            ]
        except Exception as e:
            # Deliberately broad: this function is the error boundary and
            # converts any conversion/OCR failure into a message string.
            logger.exception("Error converting PDF: %s", e)
            return f"Error reading PDF: {e}"
        return "".join(chunks).strip()

    if lowered_path.endswith((".png", ".jpg", ".jpeg", ".tiff", ".bmp")):
        try:
            # The context manager closes the underlying file handle even
            # when OCR raises; a bare Image.open() would leave it open.
            with Image.open(file_path) as image:
                text_content = pytesseract.image_to_string(image)
        except Exception as e:
            # Same boundary policy as the PDF branch above.
            logger.exception("Error reading image: %s", e)
            return f"Error reading image: {e}"
        return text_content.strip()

    return "Unsupported file format. Please upload PDF or Image."