Spaces:
Runtime error
Runtime error
import pdfplumber | |
from PIL import Image | |
import pytesseract | |
import os | |
def extract_text_with_ocr(pdf_file):
    """Print the OCR-recognized text of every page in a PDF.

    Each page is rendered to an image (``resolution=300``) via pdfplumber
    and handed to ``pytesseract.image_to_string``. The result for each page
    is printed to standard output, followed by a 40-dash separator line.

    Args:
        pdf_file: Path of the PDF file to process.

    Returns:
        None. If ``pdf_file`` does not exist, an error message is printed
        and the function returns without opening anything.
    """
    # Check if the file exists before opening
    if not os.path.exists(pdf_file):
        print(f"Error: The file '{pdf_file}' does not exist.")
        return

    separator = '-' * 40
    with pdfplumber.open(pdf_file) as pdf:
        # Pages are reported to the user 1-based.
        for page_number, page in enumerate(pdf.pages, start=1):
            # Convert the page to an image
            image = page.to_image(resolution=300).original
            # Use OCR to extract text from the image
            text = pytesseract.image_to_string(image)
            # OCR output for a page with nothing recognizable is typically
            # whitespace only (e.g. a newline or trailing form feed), which
            # is still a truthy string -- so test the stripped text.
            if text.strip():
                print(f"Page {page_number} OCR Content:\n{text}\n{separator}\n")
            else:
                print(
                    f"Page {page_number} has no extractable text even with OCR."
                    f"\n{separator}\n"
                )
# Example run: OCR the PDF below and print each page's text.
# Adjust file_path so it points at the actual location of your PDF.
file_path = '/mnt/data/Toshiba PO.pdf'
extract_text_with_ocr(file_path)