Spaces:

Gangadhar123
/

pdf_ocr_extraction_1

Sleeping

pdf_ocr_extraction_1 / utils.py

Update utils.py

a5debe0 verified 6 months ago

964 Bytes

	import fitz # PyMuPDF
	import pytesseract
	from PIL import Image
	import io

	def extract_text_from_pdf(pdf_stream: io.BytesIO, *, ocr_dpi: int = 300) -> str:
	    """Extract the text of every page of a PDF, using OCR for pages without text.

	    Each page is read with PyMuPDF's ``page.get_text()``. When that yields
	    only whitespace, the page is rendered to an image and passed to
	    Tesseract via ``pytesseract.image_to_string``.

	    Args:
	        pdf_stream: Binary stream holding the PDF; it is read in full from
	            its current position.
	        ocr_dpi: Resolution used when rendering a page for OCR.

	    Returns:
	        The page texts joined by blank lines. Every page is preceded by a
	        ``--- Page N ---`` header (``--- Page N (OCR) ---`` when the text
	        came from OCR), with N starting at 1. If anything fails, no
	        exception is raised; instead a message starting with
	        ``"Error occurred while processing the PDF: "`` is returned.
	    """
	    try:
	        # Open the document in a ``with`` block so it is closed both on
	        # success and when a page fails part-way through.
	        with fitz.open(stream=pdf_stream.read(), filetype="pdf") as doc:
	            sections = []
	            for page_num, page in enumerate(doc, start=1):
	                text = page.get_text().strip()
	                if text:
	                    sections.append(f"--- Page {page_num} ---\n{text}")
	                    continue

	                # Nothing extractable on this page: render it and run OCR.
	                # alpha=False keeps the sample buffer at three bytes per
	                # pixel, which is what the "RGB" mode below expects.
	                pix = page.get_pixmap(dpi=ocr_dpi, alpha=False)
	                img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
	                ocr_text = pytesseract.image_to_string(img).strip()
	                sections.append(f"--- Page {page_num} (OCR) ---\n{ocr_text}")

	            return "\n\n".join(sections)
	    except Exception as e:
	        # Callers receive failures as text rather than as an exception;
	        # that contract is kept deliberately.
	        return f"Error occurred while processing the PDF: {e}"