Spaces:
Sleeping
Sleeping
| import fitz # PyMuPDF | |
| import pytesseract | |
| from PIL import Image | |
| import io | |
def extract_text_from_pdf(pdf_stream: io.BytesIO) -> str:
    """Extract the text of every page of a PDF, using OCR for image-only pages.

    Each page contributes one section to the result. Pages with an embedded
    text layer are headed ``--- Page N ---``; pages whose text layer is empty
    are rendered at 300 DPI, run through Tesseract, and headed
    ``--- Page N (OCR) ---``. Sections are separated by a blank line.

    Args:
        pdf_stream: Binary stream holding the PDF. It is read from its
            current position to the end.

    Returns:
        The concatenated page sections. If anything fails while opening or
        processing the document, a string starting with
        ``"Error occurred while processing the PDF: "`` is returned instead
        of raising, so callers must inspect the result to detect failure.
    """
    try:
        # The context manager closes the document on every exit path,
        # including an exception raised while a page is being processed.
        with fitz.open(stream=pdf_stream.read(), filetype="pdf") as doc:
            sections = []
            for page_number, page in enumerate(doc, start=1):
                text = page.get_text().strip()
                if text:
                    sections.append(f"--- Page {page_number} ---\n{text}")
                    continue

                # No text layer: rasterize the page and OCR it. RGB without
                # alpha is requested explicitly so the sample buffer matches
                # the "RGB" mode handed to Pillow below.
                pix = page.get_pixmap(dpi=300, colorspace=fitz.csRGB, alpha=False)
                img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
                ocr_text = pytesseract.image_to_string(img).strip()
                sections.append(f"--- Page {page_number} (OCR) ---\n{ocr_text}")
            return "\n\n".join(sections)
    except Exception as e:
        # Deliberate best-effort boundary: report the failure as text.
        return f"Error occurred while processing the PDF: {e}"