Gangadhar123's picture
Update utils.py
a5debe0 verified
raw
history blame contribute delete
964 Bytes
import fitz # PyMuPDF
import pytesseract
from PIL import Image
import io
def extract_text_from_pdf(pdf_stream: io.BytesIO) -> str:
    """Extract the text of every page in a PDF, falling back to OCR.

    Each page's embedded text is read with PyMuPDF. When a page yields no
    text after stripping whitespace, the page is rendered to an image at
    300 DPI and passed to Tesseract instead.

    Args:
        pdf_stream: Binary stream holding the PDF; it is read from its
            current position to the end.

    Returns:
        The per-page sections joined by blank lines. Each section starts
        with a ``--- Page N ---`` header (``--- Page N (OCR) ---`` for pages
        that went through OCR). If anything fails, no exception propagates:
        a string starting with ``"Error occurred while processing the PDF: "``
        is returned instead, so callers must inspect the result.
    """
    try:
        # The context manager closes the document on success and on failure,
        # so the underlying handle is never leaked.
        with fitz.open(stream=pdf_stream.read(), filetype="pdf") as doc:
            sections = []
            for page_number, page in enumerate(doc, start=1):
                text = page.get_text().strip()
                if text:
                    sections.append(f"--- Page {page_number} ---\n{text}")
                    continue

                # No text layer on this page: rasterize it and run OCR.
                # Request RGB without alpha explicitly so the sample buffer
                # is 3 bytes per pixel, matching the "RGB" mode given to PIL.
                pix = page.get_pixmap(dpi=300, colorspace=fitz.csRGB, alpha=False)
                img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
                ocr_text = pytesseract.image_to_string(img).strip()
                sections.append(f"--- Page {page_number} (OCR) ---\n{ocr_text}")
            return "\n\n".join(sections)
    except Exception as e:
        # Deliberate best-effort boundary: report the failure as text rather
        # than raising, which is the contract existing callers rely on.
        return f"Error occurred while processing the PDF: {e}"