Spaces:
Sleeping
Sleeping
| import pytesseract | |
| from pdf2image import convert_from_path | |
| from PIL import Image | |
| import os | |
| import logging | |
# Named logger shared by the OCR helpers in this module.
logger = logging.getLogger("ocr_engine")

# File-name endings that are sent straight to image OCR (matched
# case-insensitively against the lowered path).
_IMAGE_EXTENSIONS = ('.png', '.jpg', '.jpeg', '.tiff', '.bmp')


def extract_text_from_file(file_path: str) -> str:
    """Extract text from a PDF or image file using Tesseract.

    PDFs are rasterised page by page with ``convert_from_path`` and each
    page is prefixed with a ``--- Page N ---`` header; image files are
    opened with PIL and passed to Tesseract directly.

    Args:
        file_path: Path to the file. ``str`` is the documented type;
            ``bytes`` and ``os.PathLike`` values are also accepted.

    Returns:
        The recognised text with surrounding whitespace stripped.
        An empty string if ``file_path`` does not exist.
        A human-readable message (not an exception) when the format is
        unsupported or when conversion / OCR fails -- callers that need to
        tell text from failure must inspect the returned string.
    """
    if not os.path.exists(file_path):
        return ""

    try:
        # Normalise str / bytes / PathLike to str so the suffix checks below
        # work for every path type that os.path.exists() accepted.
        path = os.fsdecode(file_path)
        lowered = path.lower()

        # Handle PDF
        if lowered.endswith('.pdf'):
            try:
                # Convert PDF pages to images, then OCR each one.
                pages = convert_from_path(path)
                sections = [
                    f"--- Page {number} ---\n"
                    f"{pytesseract.image_to_string(page)}\n"
                    for number, page in enumerate(pages, start=1)
                ]
            except Exception as e:
                logger.exception("Error converting PDF: %s", e)
                return f"Error reading PDF: {e}"
            return "".join(sections).strip()

        # Handle Images (JPG, PNG, etc.)
        if lowered.endswith(_IMAGE_EXTENSIONS):
            try:
                # The context manager closes the underlying file handle,
                # which a bare Image.open() would leave open.
                with Image.open(path) as image:
                    text = pytesseract.image_to_string(image)
            except Exception as e:
                logger.exception("Error reading image: %s", e)
                return f"Error reading image: {e}"
            return text.strip()

        return "Unsupported file format. Please upload PDF or Image."
    except Exception as e:
        # Last-resort boundary: this function reports failures as strings
        # rather than raising, so anything unexpected is logged and returned.
        logger.exception("OCR Critical Error: %s", e)
        return f"OCR Failed: {e}"