File size: 936 Bytes
f7f8ecb
e5ed1d6
 
2196fad
f7f8ecb
e5ed1d6
2196fad
 
 
 
 
f7f8ecb
e20c41e
e5ed1d6
 
 
 
e20c41e
e5ed1d6
e20c41e
e5ed1d6
e20c41e
906860c
2196fad
e5ed1d6
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
import pdfplumber
from PIL import Image
import pytesseract
import os

def extract_text_with_ocr(pdf_file, resolution=300):
    """Print the OCR text of every page in a PDF file.

    Each page is rendered to an image via pdfplumber and handed to
    pytesseract. For every page either the recognized text or a notice that
    nothing was recognized is printed, followed by a 40-dash separator.

    Args:
        pdf_file: Path to the PDF file to process.
        resolution: Rendering resolution passed to ``page.to_image``.
            Defaults to 300, the value that was previously hard-coded.

    Returns:
        None. When ``pdf_file`` does not exist, an error message is printed
        and the function returns without attempting to open the file.
    """
    # Check if the file exists before opening
    if not os.path.exists(pdf_file):
        print(f"Error: The file '{pdf_file}' does not exist.")
        return

    separator = "-" * 40
    with pdfplumber.open(pdf_file) as pdf:
        for page_num, page in enumerate(pdf.pages, start=1):
            # Convert the page to an image
            image = page.to_image(resolution=resolution).original
            # Use OCR to extract text from the image
            text = pytesseract.image_to_string(image)
            # OCR output for a blank page may consist solely of whitespace
            # (newlines / form feed), which is truthy as a plain string;
            # strip before testing so such pages are reported as empty.
            if text.strip():
                print(f"Page {page_num} OCR Content:\n{text}\n{separator}\n")
            else:
                print(f"Page {page_num} has no extractable text even with OCR.\n{separator}\n")

# Usage example: the OCR run is guarded so that it only happens when this
# file is executed as a script, not when the module is imported.
file_path = '/mnt/data/Toshiba PO.pdf'  # Make sure this is the correct path to your PDF file

if __name__ == "__main__":
    extract_text_with_ocr(file_path)