Spaces:

neerajkalyank
/

pdf1excel

Runtime error

File size: 936 Bytes

f7f8ecb
e5ed1d6
 
2196fad
f7f8ecb
e5ed1d6
2196fad
 
 
 
 
f7f8ecb
e20c41e
e5ed1d6
 
 
 
e20c41e
e5ed1d6
e20c41e
e5ed1d6
e20c41e
906860c
2196fad
e5ed1d6

import pdfplumber
from PIL import Image
import pytesseract
import os

def extract_text_with_ocr(pdf_file):
    """Print the OCR text of every page of a PDF.

    Each page is rendered to an image with pdfplumber (resolution=300) and
    the image is passed to pytesseract. Output goes to stdout, one section
    per page followed by a 40-dash separator line.

    Args:
        pdf_file: Path to the PDF file to process.

    Returns:
        None. If ``pdf_file`` does not exist or is not a regular file, an
        error message is printed and nothing is opened.
    """
    if not os.path.exists(pdf_file):
        print(f"Error: The file '{pdf_file}' does not exist.")
        return
    # A directory passes the existence check but cannot be opened as a PDF;
    # report it here instead of failing inside pdfplumber.
    if not os.path.isfile(pdf_file):
        print(f"Error: '{pdf_file}' is not a file.")
        return

    separator = "-" * 40
    with pdfplumber.open(pdf_file) as pdf:
        for page_num, page in enumerate(pdf.pages, start=1):
            # Render the page and hand the underlying image to the OCR engine.
            image = page.to_image(resolution=300).original
            text = pytesseract.image_to_string(image)
            # OCR output for a blank page may be whitespace only (e.g. a
            # newline or form feed) rather than "", so strip before testing;
            # the text itself is still printed unmodified.
            if text.strip():
                print(f"Page {page_num} OCR Content:\n{text}\n{separator}\n")
            else:
                print(f"Page {page_num} has no extractable text even with OCR.\n{separator}\n")

# Example invocation: run OCR over a sample PDF and print each page's text.
# Point file_path at the PDF you want to process.
file_path = '/mnt/data/Toshiba PO.pdf'
extract_text_with_ocr(pdf_file=file_path)