Samarth991 committed on
Commit
e433400
1 Parent(s): 951cf22

Update read_photodocument.py

Browse files
Files changed (1) hide show
  1. read_photodocument.py +15 -0
read_photodocument.py CHANGED
@@ -19,6 +19,8 @@ from spellchecker import SpellChecker
19
  from tqdm.auto import tqdm
20
  import nltk
21
  import contextlib
 
 
22
  nltk.download("stopwords") # TODO=find where this requirement originates from
23
 
24
 
@@ -28,6 +30,19 @@ def simple_rename(filepath, target_ext=".txt"):
28
  return f"OCR_{basename}_{target_ext}"
29
 
30
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
  def rm_local_text_files(name_contains="RESULT_"):
32
  """
33
  rm_local_text_files - remove local text files
 
19
  from tqdm.auto import tqdm
20
  import nltk
21
  import contextlib
22
+ import img2pdf
23
+ from PIL import Image
24
  nltk.download("stopwords") # TODO=find where this requirement originates from
25
 
26
 
 
30
  return f"OCR_{basename}_{target_ext}"
31
 
32
 
33
def convert_image_to_pdf(image_path, model=None):
    """
    convert_image_to_pdf - wrap an image file in a PDF, then run convert_PDF_to_Text on it

    The PDF is written next to the source image, using the same base name
    with a ``.pdf`` extension.

    :param image_path: path of the image file to convert
    :param model: forwarded to convert_PDF_to_Text as ``ocr_model``
    :return: the value returned by convert_PDF_to_Text for the generated PDF
    """
    import os  # local import: the file-level import block is only partly visible here

    # Replace only the final extension. A plain str.replace('.jpg', '.pdf')
    # leaves non-".jpg" paths (".png", ".jpeg", ".JPG") unchanged, so the PDF
    # would be written over the source image; it also rewrites ".jpg" wherever
    # it appears earlier in the path (e.g. in a directory name).
    pdf_path = os.path.splitext(image_path)[0] + ".pdf"

    # Context managers guarantee both handles are released even when the
    # conversion or the write raises.
    with Image.open(image_path) as image:
        pdf_bytes = img2pdf.convert(image.filename)

    with open(pdf_path, "wb") as pdf_file:
        pdf_file.write(pdf_bytes)

    return convert_PDF_to_Text(PDF_file=pdf_path, ocr_model=model)
42
+
43
+
44
+
45
+
46
  def rm_local_text_files(name_contains="RESULT_"):
47
  """
48
  rm_local_text_files - remove local text files