neerajkalyank committed on
Commit
2196fad
·
verified ·
1 Parent(s): cc095af

Update toshiba.py

Browse files
Files changed (1) hide show
  1. toshiba.py +7 -1
toshiba.py CHANGED
@@ -1,8 +1,14 @@
1
  import pdfplumber
2
  from PIL import Image
3
  import pytesseract
 
4
 
5
  def extract_text_with_ocr(pdf_file):
 
 
 
 
 
6
  with pdfplumber.open(pdf_file) as pdf:
7
  for page_num, page in enumerate(pdf.pages):
8
  # Convert the page to an image
@@ -15,5 +21,5 @@ def extract_text_with_ocr(pdf_file):
15
  print(f"Page {page_num + 1} has no extractable text even with OCR.\n{'-' * 40}\n")
16
 
17
  # Usage example
18
- file_path = 'Toshiba PO.pdf' # Make sure this path points to your PDF file
19
  extract_text_with_ocr(file_path)
 
1
  import pdfplumber
2
  from PIL import Image
3
  import pytesseract
4
+ import os
5
 
6
  def extract_text_with_ocr(pdf_file):
7
+ # Check if the file exists before opening
8
+ if not os.path.exists(pdf_file):
9
+ print(f"Error: The file '{pdf_file}' does not exist.")
10
+ return
11
+
12
  with pdfplumber.open(pdf_file) as pdf:
13
  for page_num, page in enumerate(pdf.pages):
14
  # Convert the page to an image
 
21
  print(f"Page {page_num + 1} has no extractable text even with OCR.\n{'-' * 40}\n")
22
 
23
  # Usage example
24
+ file_path = '/mnt/data/Toshiba PO.pdf' # Make sure this is the correct path to your PDF file
25
  extract_text_with_ocr(file_path)