Spaces:

neerajkalyank
/

pdf1excel

Runtime error

pdf1excel / toshiba.py

Update toshiba.py

2196fad verified about 1 month ago

936 Bytes

	import pdfplumber
	from PIL import Image
	import pytesseract
	import os

	def extract_text_with_ocr(pdf_file) -> None:
	    """Run OCR on every page of a PDF and print the recognized text.

	    Each page is rendered to an image with pdfplumber and passed to
	    Tesseract via ``pytesseract.image_to_string``. The result for each
	    page is printed to stdout, followed by a 40-dash separator line.

	    Args:
	        pdf_file: Path to the PDF to process.

	    Returns:
	        None. If ``pdf_file`` does not exist, an error message is printed
	        and the function returns without opening anything.
	    """
	    # Guard clause: report a missing file instead of letting the open fail.
	    if not os.path.exists(pdf_file):
	        print(f"Error: The file '{pdf_file}' does not exist.")
	        return
	    separator = '-' * 40
	    with pdfplumber.open(pdf_file) as pdf:
	        # Page numbers shown to the user are 1-based.
	        for page_num, page in enumerate(pdf.pages, start=1):
	            # Render the page at 300 DPI; `.original` is the image object
	            # handed to the OCR engine.
	            image = page.to_image(resolution=300).original
	            text = pytesseract.image_to_string(image)
	            # OCR output for a blank page usually still contains whitespace
	            # (e.g. a trailing form feed), so test the stripped text;
	            # otherwise the "no extractable text" branch is never reached.
	            if text.strip():
	                print(f"Page {page_num} OCR Content:\n{text}\n{separator}\n")
	            else:
	                print(f"Page {page_num} has no extractable text even with OCR.\n{separator}\n")

	# Usage example: OCR a hard-coded PDF and print the text found on each page.
	# extract_text_with_ocr prints an error and returns if the path does not exist.
	# NOTE(review): these statements run whenever this file is executed or imported;
	# consider an `if __name__ == "__main__":` guard — confirm how the Space loads it.
	file_path = '/mnt/data/Toshiba PO.pdf' # Make sure this is the correct path to your PDF file
	extract_text_with_ocr(file_path)