Spaces:
Sleeping
Sleeping
| import fitz # PyMuPDF | |
| import easyocr | |
| from pdf2image import convert_from_path | |
| from typing import Optional | |
| import tempfile | |
| import os | |
def extract_text_from_pdf(
    file_path: str,
    *,
    min_text_chars: int = 100,
    min_confidence: float = 0.5,
) -> str:
    """Extract text from a PDF, falling back to OCR for scanned documents.

    The embedded text layer is read with PyMuPDF first. If that fails, or
    yields no more than ``min_text_chars`` characters after stripping, every
    page is rendered to an image and run through EasyOCR (English + Hindi).

    Args:
        file_path: Path of the PDF file to read.
        min_text_chars: The PyMuPDF result is returned only when its stripped
            length is strictly greater than this value.
        min_confidence: OCR fragments are kept only when their confidence is
            strictly greater than this value.

    Returns:
        The stripped extracted text. May be empty when OCR finds nothing
        above the confidence threshold.

    Raises:
        RuntimeError: If the OCR fallback fails. The original error is
            attached as ``__cause__``.
    """
    # Step 1: searchable PDFs. A failure here is reported but deliberately
    # not fatal, because OCR may still succeed.
    try:
        native_text = _extract_native_text(file_path).strip()
    except Exception as e:
        print(f"PyMuPDF extraction failed: {e}")
    else:
        if len(native_text) > min_text_chars:
            return native_text

    # Step 2: scanned PDFs (or PDFs with a negligible text layer).
    try:
        return _extract_ocr_text(file_path, min_confidence)
    except Exception as e:
        print(f"OCR extraction failed: {e}")
        raise RuntimeError(f"Failed to extract text from PDF: {e}") from e


def _extract_native_text(file_path: str) -> str:
    """Return the text layer of all pages, concatenated in page order."""
    # The context manager closes the document even if a page fails to parse.
    with fitz.open(file_path) as doc:
        return "".join(page.get_text() for page in doc)


def _extract_ocr_text(file_path: str, min_confidence: float) -> str:
    """OCR every page of the PDF and return the confident fragments joined by spaces."""
    # Imported locally so this block does not depend on a file-level import;
    # numpy is already a dependency of EasyOCR.
    import numpy as np

    page_images = convert_from_path(file_path)
    # Initialize EasyOCR for English and Hindi.
    reader = easyocr.Reader(['en', 'hi'])

    fragments = []
    for page_image in page_images:
        # pdf2image yields PIL images; EasyOCR's documented inputs are a
        # file path, URL, bytes or numpy array, so convert before reading.
        results = reader.readtext(np.array(page_image))
        fragments.extend(
            fragment
            for _bbox, fragment, confidence in results
            if confidence > min_confidence
        )
    return " ".join(fragments).strip()