Spaces:
Runtime error
Runtime error
File size: 936 Bytes
f7f8ecb e5ed1d6 2196fad f7f8ecb e5ed1d6 2196fad f7f8ecb e20c41e e5ed1d6 e20c41e e5ed1d6 e20c41e e5ed1d6 e20c41e 906860c 2196fad e5ed1d6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 |
import pdfplumber
from PIL import Image
import pytesseract
import os
def extract_text_with_ocr(pdf_file):
    """Run OCR over every page of a PDF and print the recognized text.

    Each page is rendered to an image via pdfplumber (``resolution=300``)
    and that image is handed to ``pytesseract.image_to_string``. One
    section per page is written to standard output; nothing is returned.

    Args:
        pdf_file: Path to the PDF file to process.

    Returns:
        None. If ``pdf_file`` does not exist, an error message is printed
        and the function returns without opening anything.
    """
    # Check if the file exists before opening
    if not os.path.exists(pdf_file):
        print(f"Error: The file '{pdf_file}' does not exist.")
        return

    separator = "-" * 40
    with pdfplumber.open(pdf_file) as pdf:
        for page_num, page in enumerate(pdf.pages, start=1):
            # Convert the page to an image
            image = page.to_image(resolution=300).original
            # Use OCR to extract text from the image
            text = pytesseract.image_to_string(image)
            # OCR output for a blank page may consist solely of whitespace
            # (e.g. a trailing newline or form feed), which is still a truthy
            # string; strip before deciding whether anything was recognized.
            if text.strip():
                print(f"Page {page_num} OCR Content:\n{text}\n{separator}\n")
            else:
                print(
                    f"Page {page_num} has no extractable text even with OCR."
                    f"\n{separator}\n"
                )
# Usage example
file_path = '/mnt/data/Toshiba PO.pdf'  # Make sure this is the correct path to your PDF file

# Run the OCR only when this file is executed as a script, so that importing
# the module does not trigger PDF processing as a side effect.
if __name__ == "__main__":
    extract_text_with_ocr(file_path)
|