Spaces:

BhagatSurya
/

convet_pdf_to_txt

Runtime error

BhagatSurya committed on Jun 20, 2023

Commit

f9aff1d

•

1 Parent(s): 6e63c71

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -12,7 +12,7 @@ from pdf2image.exceptions import (
     PDFSyntaxError
 )
 import fitz  # PyMuPDF
-from PIL import Image
 import io
 import base64
@@ -39,12 +39,14 @@ def pdf_to_text(file):
         image_list = page.get_images(full=True)
         for img in image_list:
             xref, name, ext, color_space, width, height, bpc, image_data, image_mask, smask_data = img
-            # Ensure image_data is a bytes object before opening it
-            if isinstance(image_data, str):
-                image_data = image_data.encode()  # Convert string to bytes if necessary
-            image = Image.open(io.BytesIO(image_data))
-            latex_code = image_to_latex(image)
-            page_text += "\n" + latex_code  # Add LaTeX code to page text
         page_text = clean_text(page_text)
         if len(page_text.split()) > 5:
@@ -57,13 +59,12 @@ def pdf_to_text(file):
     with open(output_file_name, 'w') as f:
         f.write(full_text)
-    return f"{output_file_name}, {page_number}"
 iface = gr.Interface(fn=pdf_to_text,
                      inputs=gr.inputs.File(label="Your PDF"),
-                     outputs="text",
                      title="PDF to TXT",
                      description="Convert your PDF files to clean text")
 iface.launch()

     PDFSyntaxError
 )
 import fitz  # PyMuPDF
+from PIL import Image, UnidentifiedImageError
 import io
 import base64
         image_list = page.get_images(full=True)
         for img in image_list:
             xref, name, ext, color_space, width, height, bpc, image_data, image_mask, smask_data = img
+            # Decode image_data from base64 before opening it
+            image_data = base64.b64decode(image_data)
+            try:
+                image = Image.open(io.BytesIO(image_data))
+                latex_code = image_to_latex(image)
+                page_text += "\n" + latex_code  # Add LaTeX code to page text
+            except UnidentifiedImageError:
+                print(f"Could not identify image on page {i+1}")
         page_text = clean_text(page_text)
         if len(page_text.split()) > 5:
     with open(output_file_name, 'w') as f:
         f.write(full_text)
+    return output_file_name, page_number
 iface = gr.Interface(fn=pdf_to_text,
                      inputs=gr.inputs.File(label="Your PDF"),
+                     outputs=[gr.outputs.File(label="Download TXT"), gr.outputs.Textbox(label="Last Page Processed")],
                      title="PDF to TXT",
                      description="Convert your PDF files to clean text")
 iface.launch()