Spaces:
Runtime error
Runtime error
BhagatSurya
committed on
Commit
•
f9aff1d
1
Parent(s):
6e63c71
Update app.py
Browse files
app.py
CHANGED
@@ -12,7 +12,7 @@ from pdf2image.exceptions import (
|
|
12 |
PDFSyntaxError
|
13 |
)
|
14 |
import fitz # PyMuPDF
|
15 |
-
from PIL import Image
|
16 |
import io
|
17 |
import base64
|
18 |
|
@@ -39,12 +39,14 @@ def pdf_to_text(file):
|
|
39 |
image_list = page.get_images(full=True)
|
40 |
for img in image_list:
|
41 |
xref, name, ext, color_space, width, height, bpc, image_data, image_mask, smask_data = img
|
42 |
-
#
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
|
|
|
|
48 |
|
49 |
page_text = clean_text(page_text)
|
50 |
if len(page_text.split()) > 5:
|
@@ -57,13 +59,12 @@ def pdf_to_text(file):
|
|
57 |
with open(output_file_name, 'w') as f:
|
58 |
f.write(full_text)
|
59 |
|
60 |
-
return
|
|
|
61 |
|
62 |
iface = gr.Interface(fn=pdf_to_text,
|
63 |
inputs=gr.inputs.File(label="Your PDF"),
|
64 |
-
outputs="
|
65 |
title="PDF to TXT",
|
66 |
description="Convert your PDF files to clean text")
|
67 |
iface.launch()
|
68 |
-
|
69 |
-
|
|
|
12 |
PDFSyntaxError
|
13 |
)
|
14 |
import fitz # PyMuPDF
|
15 |
+
from PIL import Image, UnidentifiedImageError
|
16 |
import io
|
17 |
import base64
|
18 |
|
|
|
39 |
image_list = page.get_images(full=True)
|
40 |
for img in image_list:
|
41 |
xref, name, ext, color_space, width, height, bpc, image_data, image_mask, smask_data = img
|
42 |
+
# Decode image_data from base64 before opening it
|
43 |
+
image_data = base64.b64decode(image_data)
|
44 |
+
try:
|
45 |
+
image = Image.open(io.BytesIO(image_data))
|
46 |
+
latex_code = image_to_latex(image)
|
47 |
+
page_text += "\n" + latex_code # Add LaTeX code to page text
|
48 |
+
except UnidentifiedImageError:
|
49 |
+
print(f"Could not identify image on page {i+1}")
|
50 |
|
51 |
page_text = clean_text(page_text)
|
52 |
if len(page_text.split()) > 5:
|
|
|
59 |
with open(output_file_name, 'w') as f:
|
60 |
f.write(full_text)
|
61 |
|
62 |
+
return output_file_name, page_number
|
63 |
+
|
64 |
|
65 |
iface = gr.Interface(fn=pdf_to_text,
|
66 |
inputs=gr.inputs.File(label="Your PDF"),
|
67 |
+
outputs=[gr.outputs.File(label="Download TXT"), gr.outputs.Textbox(label="Last Page Processed")],
|
68 |
title="PDF to TXT",
|
69 |
description="Convert your PDF files to clean text")
|
70 |
iface.launch()
|
|
|
|