BhagatSurya committed on
Commit
f9aff1d
1 Parent(s): 6e63c71

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +12 -11
app.py CHANGED
@@ -12,7 +12,7 @@ from pdf2image.exceptions import (
12
  PDFSyntaxError
13
  )
14
  import fitz # PyMuPDF
15
- from PIL import Image
16
  import io
17
  import base64
18
 
@@ -39,12 +39,14 @@ def pdf_to_text(file):
39
  image_list = page.get_images(full=True)
40
  for img in image_list:
41
  xref, name, ext, color_space, width, height, bpc, image_data, image_mask, smask_data = img
42
- # Ensure image_data is a bytes object before opening it
43
- if isinstance(image_data, str):
44
- image_data = image_data.encode() # Convert string to bytes if necessary
45
- image = Image.open(io.BytesIO(image_data))
46
- latex_code = image_to_latex(image)
47
- page_text += "\n" + latex_code # Add LaTeX code to page text
 
 
48
 
49
  page_text = clean_text(page_text)
50
  if len(page_text.split()) > 5:
@@ -57,13 +59,12 @@ def pdf_to_text(file):
57
  with open(output_file_name, 'w') as f:
58
  f.write(full_text)
59
 
60
- return f"{output_file_name}, {page_number}"
 
61
 
62
  iface = gr.Interface(fn=pdf_to_text,
63
  inputs=gr.inputs.File(label="Your PDF"),
64
- outputs="text",
65
  title="PDF to TXT",
66
  description="Convert your PDF files to clean text")
67
  iface.launch()
68
-
69
-
 
12
  PDFSyntaxError
13
  )
14
  import fitz # PyMuPDF
15
+ from PIL import Image, UnidentifiedImageError
16
  import io
17
  import base64
18
 
 
39
  image_list = page.get_images(full=True)
40
  for img in image_list:
41
  xref, name, ext, color_space, width, height, bpc, image_data, image_mask, smask_data = img
42
+ # Decode image_data from base64 before opening it
43
+ image_data = base64.b64decode(image_data)
44
+ try:
45
+ image = Image.open(io.BytesIO(image_data))
46
+ latex_code = image_to_latex(image)
47
+ page_text += "\n" + latex_code # Add LaTeX code to page text
48
+ except UnidentifiedImageError:
49
+ print(f"Could not identify image on page {i+1}")
50
 
51
  page_text = clean_text(page_text)
52
  if len(page_text.split()) > 5:
 
59
  with open(output_file_name, 'w') as f:
60
  f.write(full_text)
61
 
62
+ return output_file_name, page_number
63
+
64
 
65
  iface = gr.Interface(fn=pdf_to_text,
66
  inputs=gr.inputs.File(label="Your PDF"),
67
+ outputs=[gr.outputs.File(label="Download TXT"), gr.outputs.Textbox(label="Last Page Processed")],
68
  title="PDF to TXT",
69
  description="Convert your PDF files to clean text")
70
  iface.launch()